chart-rvr-3b / trainer_state.json
sanchit97's picture
Upload folder using huggingface_hub
c9db2fa verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.6666666666666665,
"eval_steps": 500,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 642.14,
"completions/max_terminated_length": 570.34,
"completions/mean_length": 420.1125,
"completions/mean_terminated_length": 398.3580334472656,
"completions/min_length": 250.16,
"completions/min_terminated_length": 250.16,
"epoch": 0.03333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 4.710774621900829,
"learning_rate": 9.918333333333334e-07,
"loss": 0.0113,
"num_tokens": 461361.0,
"reward": 6.184720268249512,
"reward_std": 1.6460176765918733,
"rewards/accuracy_reward/mean": 0.31,
"rewards/accuracy_reward/std": 0.393804452419281,
"rewards/chart_type_reward/mean": 0.83,
"rewards/chart_type_reward/std": 0.20554168462753297,
"rewards/format_reward/mean": 1.31,
"rewards/format_reward/std": 0.8051172530651093,
"rewards/length_think_reward/mean": 1.116875,
"rewards/length_think_reward/std": 0.35811117276549337,
"rewards/num_token_reward/mean": 0.645,
"rewards/num_token_reward/std": 0.4071964037418365,
"rewards/process_style_reward/mean": 0.7380022585391999,
"rewards/process_style_reward/std": 0.20786408737301826,
"rewards/table_style_reward/mean": 1.2348430597782134,
"rewards/table_style_reward/std": 0.6372630500793457,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.055,
"completions/max_length": 627.98,
"completions/max_terminated_length": 567.12,
"completions/mean_length": 419.4975,
"completions/mean_terminated_length": 399.8742425537109,
"completions/min_length": 271.46,
"completions/min_terminated_length": 271.46,
"epoch": 0.06666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.617454327420967,
"learning_rate": 9.835e-07,
"loss": 0.033,
"num_tokens": 922132.0,
"reward": 7.684444198608398,
"reward_std": 1.0128395134210586,
"rewards/accuracy_reward/mean": 0.3675,
"rewards/accuracy_reward/std": 0.39272902250289915,
"rewards/chart_type_reward/mean": 0.92,
"rewards/chart_type_reward/std": 0.10311741709709167,
"rewards/format_reward/mean": 1.81,
"rewards/format_reward/std": 0.39947535157203673,
"rewards/length_think_reward/mean": 1.36,
"rewards/length_think_reward/std": 0.18411283910274506,
"rewards/num_token_reward/mean": 0.8975,
"rewards/num_token_reward/std": 0.21008866012096405,
"rewards/process_style_reward/mean": 0.8522701609134674,
"rewards/process_style_reward/std": 0.2383432410657406,
"rewards/table_style_reward/mean": 1.4771740126609803,
"rewards/table_style_reward/std": 0.538226346373558,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.055,
"completions/max_length": 663.8,
"completions/max_terminated_length": 622.22,
"completions/mean_length": 444.205,
"completions/mean_terminated_length": 427.1011260986328,
"completions/min_length": 294.06,
"completions/min_terminated_length": 294.06,
"epoch": 0.1,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.298847294243072,
"learning_rate": 9.751666666666666e-07,
"loss": 0.0272,
"num_tokens": 1393874.0,
"reward": 8.196055917739868,
"reward_std": 0.8822205343842506,
"rewards/accuracy_reward/mean": 0.405,
"rewards/accuracy_reward/std": 0.4406168383359909,
"rewards/chart_type_reward/mean": 0.91,
"rewards/chart_type_reward/std": 0.10892393231391907,
"rewards/format_reward/mean": 1.85,
"rewards/format_reward/std": 0.3395550119876862,
"rewards/length_think_reward/mean": 1.48125,
"rewards/length_think_reward/std": 0.04037815436720848,
"rewards/num_token_reward/mean": 0.9275,
"rewards/num_token_reward/std": 0.16868472278118132,
"rewards/process_style_reward/mean": 1.0046203970909118,
"rewards/process_style_reward/std": 0.28071307986974714,
"rewards/table_style_reward/mean": 1.617685569524765,
"rewards/table_style_reward/std": 0.43435441348701714,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0375,
"completions/max_length": 590.62,
"completions/max_terminated_length": 560.86,
"completions/mean_length": 419.71,
"completions/mean_terminated_length": 407.2615026855469,
"completions/min_length": 300.28,
"completions/min_terminated_length": 300.28,
"epoch": 0.13333333333333333,
"frac_reward_zero_std": 0.04,
"grad_norm": 5.01614642751583,
"learning_rate": 9.668333333333332e-07,
"loss": 0.018,
"num_tokens": 1854582.0,
"reward": 8.454336786270142,
"reward_std": 0.7186576825380325,
"rewards/accuracy_reward/mean": 0.445,
"rewards/accuracy_reward/std": 0.37084252953529356,
"rewards/chart_type_reward/mean": 0.8825,
"rewards/chart_type_reward/std": 0.13139761447906495,
"rewards/format_reward/mean": 1.875,
"rewards/format_reward/std": 0.25387449383735655,
"rewards/length_think_reward/mean": 1.491875,
"rewards/length_think_reward/std": 0.016481903940439226,
"rewards/num_token_reward/mean": 0.935,
"rewards/num_token_reward/std": 0.12912438035011292,
"rewards/process_style_reward/mean": 1.1697046744823456,
"rewards/process_style_reward/std": 0.3359800568223,
"rewards/table_style_reward/mean": 1.6552570796012878,
"rewards/table_style_reward/std": 0.4633850826323032,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0125,
"completions/max_length": 520.38,
"completions/max_terminated_length": 496.48,
"completions/mean_length": 381.645,
"completions/mean_terminated_length": 376.8010729980469,
"completions/min_length": 291.64,
"completions/min_terminated_length": 291.64,
"epoch": 0.16666666666666666,
"frac_reward_zero_std": 0.03,
"grad_norm": 3.8768085436240014,
"learning_rate": 9.585e-07,
"loss": 0.0049,
"num_tokens": 2300628.0,
"reward": 8.882754230499268,
"reward_std": 0.7584551328420639,
"rewards/accuracy_reward/mean": 0.6175,
"rewards/accuracy_reward/std": 0.41537604093551633,
"rewards/chart_type_reward/mean": 0.905,
"rewards/chart_type_reward/std": 0.11220384895801544,
"rewards/format_reward/mean": 1.93,
"rewards/format_reward/std": 0.17845415830612182,
"rewards/length_think_reward/mean": 1.49375,
"rewards/length_think_reward/std": 0.01767767071723938,
"rewards/num_token_reward/mean": 0.965,
"rewards/num_token_reward/std": 0.08922707915306091,
"rewards/process_style_reward/mean": 1.2509180808067322,
"rewards/process_style_reward/std": 0.35428043991327285,
"rewards/table_style_reward/mean": 1.7205862057209016,
"rewards/table_style_reward/std": 0.3823927499353886,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 476.06,
"completions/max_terminated_length": 464.94,
"completions/mean_length": 359.98,
"completions/mean_terminated_length": 357.9764294433594,
"completions/min_length": 277.3,
"completions/min_terminated_length": 277.3,
"epoch": 0.2,
"frac_reward_zero_std": 0.03,
"grad_norm": 4.04444430042266,
"learning_rate": 9.501666666666667e-07,
"loss": -0.0013,
"num_tokens": 2738260.0,
"reward": 9.100077533721924,
"reward_std": 0.5911612424254418,
"rewards/accuracy_reward/mean": 0.64,
"rewards/accuracy_reward/std": 0.38784352123737337,
"rewards/chart_type_reward/mean": 0.9475,
"rewards/chart_type_reward/std": 0.06052331507205963,
"rewards/format_reward/mean": 1.96,
"rewards/format_reward/std": 0.10336921453475952,
"rewards/length_think_reward/mean": 1.489375,
"rewards/length_think_reward/std": 0.026885675489902495,
"rewards/num_token_reward/mean": 0.98,
"rewards/num_token_reward/std": 0.05168460726737976,
"rewards/process_style_reward/mean": 1.3212374341487885,
"rewards/process_style_reward/std": 0.343754044175148,
"rewards/table_style_reward/mean": 1.7619650173187256,
"rewards/table_style_reward/std": 0.41162539228796957,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0025,
"completions/max_length": 433.68,
"completions/max_terminated_length": 429.78,
"completions/mean_length": 335.8,
"completions/mean_terminated_length": 334.99607177734373,
"completions/min_length": 257.7,
"completions/min_terminated_length": 257.7,
"epoch": 0.23333333333333334,
"frac_reward_zero_std": 0.07,
"grad_norm": 1.820258234118004,
"learning_rate": 9.418333333333332e-07,
"loss": -0.0019,
"num_tokens": 3165744.0,
"reward": 9.284861507415771,
"reward_std": 0.4822974817454815,
"rewards/accuracy_reward/mean": 0.715,
"rewards/accuracy_reward/std": 0.3768920677900314,
"rewards/chart_type_reward/mean": 0.955,
"rewards/chart_type_reward/std": 0.05690393328666687,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.028284270763397217,
"rewards/length_think_reward/mean": 1.49125,
"rewards/length_think_reward/std": 0.01931762829422951,
"rewards/num_token_reward/mean": 0.9925,
"rewards/num_token_reward/std": 0.021213203072547912,
"rewards/process_style_reward/mean": 1.3195704579353333,
"rewards/process_style_reward/std": 0.3678068408370018,
"rewards/table_style_reward/mean": 1.821540994644165,
"rewards/table_style_reward/std": 0.3932980696856976,
"step": 350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 433.7,
"completions/max_terminated_length": 419.2,
"completions/mean_length": 331.6575,
"completions/mean_terminated_length": 329.5703582763672,
"completions/min_length": 261.88,
"completions/min_terminated_length": 261.88,
"epoch": 0.26666666666666666,
"frac_reward_zero_std": 0.04,
"grad_norm": 3.996548131664497,
"learning_rate": 9.334999999999999e-07,
"loss": 0.0016,
"num_tokens": 3591319.0,
"reward": 9.078651866912843,
"reward_std": 0.5775595012307168,
"rewards/accuracy_reward/mean": 0.6775,
"rewards/accuracy_reward/std": 0.360657674074173,
"rewards/chart_type_reward/mean": 0.8875,
"rewards/chart_type_reward/std": 0.10328511297702789,
"rewards/format_reward/mean": 1.95,
"rewards/format_reward/std": 0.13165348529815674,
"rewards/length_think_reward/mean": 1.48875,
"rewards/length_think_reward/std": 0.021778347939252853,
"rewards/num_token_reward/mean": 0.975,
"rewards/num_token_reward/std": 0.06582674264907837,
"rewards/process_style_reward/mean": 1.3236911845207215,
"rewards/process_style_reward/std": 0.39317745611071586,
"rewards/table_style_reward/mean": 1.776210721731186,
"rewards/table_style_reward/std": 0.3345204618573189,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 460.6,
"completions/max_terminated_length": 458.46,
"completions/mean_length": 363.545,
"completions/mean_terminated_length": 361.9766668701172,
"completions/min_length": 284.26,
"completions/min_terminated_length": 284.26,
"epoch": 0.3,
"frac_reward_zero_std": 0.08,
"grad_norm": 7.628999451616129,
"learning_rate": 9.251666666666666e-07,
"loss": 0.01,
"num_tokens": 4030253.0,
"reward": 9.122887582778931,
"reward_std": 0.406713061761111,
"rewards/accuracy_reward/mean": 0.74,
"rewards/accuracy_reward/std": 0.33945011138916015,
"rewards/chart_type_reward/mean": 0.865,
"rewards/chart_type_reward/std": 0.13173707962036132,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.01851640224456787,
"rewards/length_think_reward/mean": 1.49375,
"rewards/length_think_reward/std": 0.01767766922712326,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.009258201122283935,
"rewards/process_style_reward/mean": 1.2878882658481599,
"rewards/process_style_reward/std": 0.34421144127845765,
"rewards/table_style_reward/mean": 1.751249282360077,
"rewards/table_style_reward/std": 0.3605493099242449,
"step": 450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 431.84,
"completions/max_terminated_length": 431.84,
"completions/mean_length": 340.835,
"completions/mean_terminated_length": 340.835,
"completions/min_length": 259.66,
"completions/min_terminated_length": 259.66,
"epoch": 0.3333333333333333,
"frac_reward_zero_std": 0.03,
"grad_norm": 3.667965991041236,
"learning_rate": 9.168333333333333e-07,
"loss": 0.0008,
"num_tokens": 4459843.0,
"reward": 9.339048709869385,
"reward_std": 0.4699262708425522,
"rewards/accuracy_reward/mean": 0.6875,
"rewards/accuracy_reward/std": 0.3332241028547287,
"rewards/chart_type_reward/mean": 0.925,
"rewards/chart_type_reward/std": 0.09570688426494599,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.3655757594108582,
"rewards/process_style_reward/std": 0.40603267412632704,
"rewards/table_style_reward/mean": 1.860972990989685,
"rewards/table_style_reward/std": 0.4130638699233532,
"step": 500
},
{
"epoch": 0.3333333333333333,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 476.36,
"eval_completions/max_terminated_length": 476.36,
"eval_completions/mean_length": 301.6075,
"eval_completions/mean_terminated_length": 301.6075,
"eval_completions/min_length": 198.44,
"eval_completions/min_terminated_length": 198.44,
"eval_frac_reward_zero_std": 0.495,
"eval_loss": 0.0033517335541546345,
"eval_num_tokens": 4459843.0,
"eval_reward": 7.484729690551758,
"eval_reward_std": 0.2337803066149354,
"eval_rewards/accuracy_reward/mean": 0.80875,
"eval_rewards/accuracy_reward/std": 0.3655660229921341,
"eval_rewards/chart_type_reward/mean": 0.6275,
"eval_rewards/chart_type_reward/std": 0.46189448714256287,
"eval_rewards/format_reward/mean": 1.9775,
"eval_rewards/format_reward/std": 0.09993488192558289,
"eval_rewards/length_think_reward/mean": 1.5,
"eval_rewards/length_think_reward/std": 0.0,
"eval_rewards/num_token_reward/mean": 0.98875,
"eval_rewards/num_token_reward/std": 0.049967440962791446,
"eval_rewards/process_style_reward/mean": 0.8432698702812195,
"eval_rewards/process_style_reward/std": 0.27860497415065766,
"eval_rewards/table_style_reward/mean": 0.7389598202705383,
"eval_rewards/table_style_reward/std": 0.0863262665271759,
"eval_runtime": 357.3984,
"eval_samples_per_second": 0.56,
"eval_steps_per_second": 0.02,
"step": 500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 381.68,
"completions/max_terminated_length": 381.68,
"completions/mean_length": 302.93,
"completions/mean_terminated_length": 302.93,
"completions/min_length": 236.18,
"completions/min_terminated_length": 236.18,
"epoch": 0.36666666666666664,
"frac_reward_zero_std": 0.04,
"grad_norm": 4.242303838921506,
"learning_rate": 9.085e-07,
"loss": -0.0004,
"num_tokens": 4874635.0,
"reward": 9.339553604125976,
"reward_std": 0.3770983973145485,
"rewards/accuracy_reward/mean": 0.74,
"rewards/accuracy_reward/std": 0.3310369694232941,
"rewards/chart_type_reward/mean": 0.9175,
"rewards/chart_type_reward/std": 0.09259466350078582,
"rewards/format_reward/mean": 1.995,
"rewards/format_reward/std": 0.014142135381698609,
"rewards/length_think_reward/mean": 1.49875,
"rewards/length_think_reward/std": 0.003535533845424652,
"rewards/num_token_reward/mean": 0.9975,
"rewards/num_token_reward/std": 0.007071067690849304,
"rewards/process_style_reward/mean": 1.3991042220592498,
"rewards/process_style_reward/std": 0.37223585724830627,
"rewards/table_style_reward/mean": 1.791699321269989,
"rewards/table_style_reward/std": 0.3688546184077859,
"step": 550
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 382.36,
"completions/max_terminated_length": 382.36,
"completions/mean_length": 313.265,
"completions/mean_terminated_length": 313.265,
"completions/min_length": 249.0,
"completions/min_terminated_length": 249.0,
"epoch": 0.4,
"frac_reward_zero_std": 0.06,
"grad_norm": 2.6386198633761295,
"learning_rate": 9.001666666666667e-07,
"loss": -0.0002,
"num_tokens": 5292769.0,
"reward": 9.358322076797485,
"reward_std": 0.45063421681523325,
"rewards/accuracy_reward/mean": 0.7475,
"rewards/accuracy_reward/std": 0.32666426956653594,
"rewards/chart_type_reward/mean": 0.9275,
"rewards/chart_type_reward/std": 0.08190421402454376,
"rewards/format_reward/mean": 1.985,
"rewards/format_reward/std": 0.03265853762626648,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.9925,
"rewards/num_token_reward/std": 0.01632926881313324,
"rewards/process_style_reward/mean": 1.4022967970371247,
"rewards/process_style_reward/std": 0.3624314972758293,
"rewards/table_style_reward/mean": 1.8035252857208253,
"rewards/table_style_reward/std": 0.3688652907311916,
"step": 600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0475,
"completions/max_length": 502.48,
"completions/max_terminated_length": 479.38,
"completions/mean_length": 390.065,
"completions/mean_terminated_length": 376.88559692382813,
"completions/min_length": 295.92,
"completions/min_terminated_length": 295.92,
"epoch": 0.43333333333333335,
"frac_reward_zero_std": 0.08,
"grad_norm": 1.6276070180776299,
"learning_rate": 8.918333333333333e-07,
"loss": 0.0067,
"num_tokens": 5742743.0,
"reward": 8.926296434402467,
"reward_std": 0.6193729147315026,
"rewards/accuracy_reward/mean": 0.7025,
"rewards/accuracy_reward/std": 0.33726297795772553,
"rewards/chart_type_reward/mean": 0.87,
"rewards/chart_type_reward/std": 0.13173707962036132,
"rewards/format_reward/mean": 1.88,
"rewards/format_reward/std": 0.18836578965187073,
"rewards/length_think_reward/mean": 1.476875,
"rewards/length_think_reward/std": 0.04876347452402115,
"rewards/num_token_reward/mean": 0.9325,
"rewards/num_token_reward/std": 0.09855559468269348,
"rewards/process_style_reward/mean": 1.3070782232284546,
"rewards/process_style_reward/std": 0.3663827758282423,
"rewards/table_style_reward/mean": 1.7573432433605194,
"rewards/table_style_reward/std": 0.41278132781386373,
"step": 650
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04,
"completions/max_length": 582.46,
"completions/max_terminated_length": 561.88,
"completions/mean_length": 474.75,
"completions/mean_terminated_length": 462.6513354492188,
"completions/min_length": 382.0,
"completions/min_terminated_length": 382.0,
"epoch": 0.4666666666666667,
"frac_reward_zero_std": 0.03,
"grad_norm": 3.113389277396801,
"learning_rate": 8.834999999999999e-07,
"loss": 0.0051,
"num_tokens": 6226015.0,
"reward": 9.087711658477783,
"reward_std": 0.6245314812660218,
"rewards/accuracy_reward/mean": 0.7075,
"rewards/accuracy_reward/std": 0.3379362678527832,
"rewards/chart_type_reward/mean": 0.875,
"rewards/chart_type_reward/std": 0.1228942984342575,
"rewards/format_reward/mean": 1.915,
"rewards/format_reward/std": 0.14443274736404418,
"rewards/length_think_reward/mean": 1.434375,
"rewards/length_think_reward/std": 0.12082115039229394,
"rewards/num_token_reward/mean": 0.9425,
"rewards/num_token_reward/std": 0.11464277982711792,
"rewards/process_style_reward/mean": 1.4090502178668975,
"rewards/process_style_reward/std": 0.4188565620034933,
"rewards/table_style_reward/mean": 1.8042864727973937,
"rewards/table_style_reward/std": 0.367302486859262,
"step": 700
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08,
"completions/max_length": 669.88,
"completions/max_terminated_length": 636.52,
"completions/mean_length": 543.3925,
"completions/mean_terminated_length": 523.3314343261719,
"completions/min_length": 434.66,
"completions/min_terminated_length": 434.66,
"epoch": 0.5,
"frac_reward_zero_std": 0.07,
"grad_norm": 2.705954822615204,
"learning_rate": 8.751666666666666e-07,
"loss": 0.0099,
"num_tokens": 6735800.0,
"reward": 9.080023832321167,
"reward_std": 0.6411656188964844,
"rewards/accuracy_reward/mean": 0.695,
"rewards/accuracy_reward/std": 0.3173846417665482,
"rewards/chart_type_reward/mean": 0.9,
"rewards/chart_type_reward/std": 0.10690449476242066,
"rewards/format_reward/mean": 1.84,
"rewards/format_reward/std": 0.2742329239845276,
"rewards/length_think_reward/mean": 1.445625,
"rewards/length_think_reward/std": 0.09453705742955208,
"rewards/num_token_reward/mean": 0.9175,
"rewards/num_token_reward/std": 0.1441875296831131,
"rewards/process_style_reward/mean": 1.434592843055725,
"rewards/process_style_reward/std": 0.4128647920489311,
"rewards/table_style_reward/mean": 1.8473060631752014,
"rewards/table_style_reward/std": 0.384682634845376,
"step": 750
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0425,
"completions/max_length": 555.72,
"completions/max_terminated_length": 536.3,
"completions/mean_length": 459.7675,
"completions/mean_terminated_length": 450.3732165527344,
"completions/min_length": 376.74,
"completions/min_terminated_length": 376.74,
"epoch": 0.5333333333333333,
"frac_reward_zero_std": 0.1,
"grad_norm": 2.744641896369063,
"learning_rate": 8.668333333333333e-07,
"loss": 0.0041,
"num_tokens": 7212935.0,
"reward": 9.342824554443359,
"reward_std": 0.41833467945456504,
"rewards/accuracy_reward/mean": 0.7575,
"rewards/accuracy_reward/std": 0.27658932030200956,
"rewards/chart_type_reward/mean": 0.9025,
"rewards/chart_type_reward/std": 0.10656502962112427,
"rewards/format_reward/mean": 1.92,
"rewards/format_reward/std": 0.11177420973777771,
"rewards/length_think_reward/mean": 1.48875,
"rewards/length_think_reward/std": 0.018290950953960418,
"rewards/num_token_reward/mean": 0.9575,
"rewards/num_token_reward/std": 0.06295817255973817,
"rewards/process_style_reward/mean": 1.4902155125141143,
"rewards/process_style_reward/std": 0.3751195715367794,
"rewards/table_style_reward/mean": 1.8263590598106385,
"rewards/table_style_reward/std": 0.3030733197927475,
"step": 800
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02,
"completions/max_length": 507.02,
"completions/max_terminated_length": 496.5,
"completions/mean_length": 406.635,
"completions/mean_terminated_length": 400.53512084960937,
"completions/min_length": 317.5,
"completions/min_terminated_length": 317.5,
"epoch": 0.5666666666666667,
"frac_reward_zero_std": 0.07,
"grad_norm": 4.8860738140378,
"learning_rate": 8.585e-07,
"loss": 0.0018,
"num_tokens": 7668721.0,
"reward": 9.412762422561645,
"reward_std": 0.4385050618648529,
"rewards/accuracy_reward/mean": 0.705,
"rewards/accuracy_reward/std": 0.333148148059845,
"rewards/chart_type_reward/mean": 0.92,
"rewards/chart_type_reward/std": 0.09823348283767701,
"rewards/format_reward/mean": 1.955,
"rewards/format_reward/std": 0.08232370734214783,
"rewards/length_think_reward/mean": 1.490625,
"rewards/length_think_reward/std": 0.018341146558523178,
"rewards/num_token_reward/mean": 0.9775,
"rewards/num_token_reward/std": 0.04116185367107392,
"rewards/process_style_reward/mean": 1.4954944217205048,
"rewards/process_style_reward/std": 0.4515664022415876,
"rewards/table_style_reward/mean": 1.8691429781913758,
"rewards/table_style_reward/std": 0.2963829467073083,
"step": 850
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 464.54,
"completions/max_terminated_length": 464.54,
"completions/mean_length": 374.935,
"completions/mean_terminated_length": 374.935,
"completions/min_length": 296.92,
"completions/min_terminated_length": 296.92,
"epoch": 0.6,
"frac_reward_zero_std": 0.09,
"grad_norm": 1.8355684342913006,
"learning_rate": 8.501666666666666e-07,
"loss": 0.0025,
"num_tokens": 8111331.0,
"reward": 9.84747314453125,
"reward_std": 0.36430649772286416,
"rewards/accuracy_reward/mean": 0.7225,
"rewards/accuracy_reward/std": 0.2612554532289505,
"rewards/chart_type_reward/mean": 0.9475,
"rewards/chart_type_reward/std": 0.0391424161195755,
"rewards/format_reward/mean": 1.995,
"rewards/format_reward/std": 0.014142135381698609,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.9975,
"rewards/num_token_reward/std": 0.007071067690849304,
"rewards/process_style_reward/mean": 1.6478522050380706,
"rewards/process_style_reward/std": 0.39793143898248673,
"rewards/table_style_reward/mean": 2.0371209740638734,
"rewards/table_style_reward/std": 0.3056399393081665,
"step": 900
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0075,
"completions/max_length": 482.28,
"completions/max_terminated_length": 480.06,
"completions/mean_length": 378.5375,
"completions/mean_terminated_length": 376.3790002441406,
"completions/min_length": 300.54,
"completions/min_terminated_length": 300.54,
"epoch": 0.6333333333333333,
"frac_reward_zero_std": 0.1,
"grad_norm": 1.3347305735332344,
"learning_rate": 8.418333333333333e-07,
"loss": 0.008,
"num_tokens": 8555522.0,
"reward": 9.62767692565918,
"reward_std": 0.3843289668299258,
"rewards/accuracy_reward/mean": 0.7425,
"rewards/accuracy_reward/std": 0.31665275037288665,
"rewards/chart_type_reward/mean": 0.93,
"rewards/chart_type_reward/std": 0.07483314633369446,
"rewards/format_reward/mean": 1.985,
"rewards/format_reward/std": 0.02070196866989136,
"rewards/length_think_reward/mean": 1.496875,
"rewards/length_think_reward/std": 0.004580627083778381,
"rewards/num_token_reward/mean": 0.9925,
"rewards/num_token_reward/std": 0.01035098433494568,
"rewards/process_style_reward/mean": 1.6256941604614257,
"rewards/process_style_reward/std": 0.46362122789025306,
"rewards/table_style_reward/mean": 1.8551077818870545,
"rewards/table_style_reward/std": 0.3706050312891602,
"step": 950
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 462.0,
"completions/max_terminated_length": 455.9,
"completions/mean_length": 378.345,
"completions/mean_terminated_length": 376.565,
"completions/min_length": 300.4,
"completions/min_terminated_length": 300.4,
"epoch": 0.6666666666666666,
"frac_reward_zero_std": 0.06,
"grad_norm": 2.6667420842164096,
"learning_rate": 8.334999999999999e-07,
"loss": 0.0022,
"num_tokens": 9000848.0,
"reward": 9.415374546051025,
"reward_std": 0.4214027213305235,
"rewards/accuracy_reward/mean": 0.7225,
"rewards/accuracy_reward/std": 0.2530916023254395,
"rewards/chart_type_reward/mean": 0.895,
"rewards/chart_type_reward/std": 0.12104663014411926,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.01851640224456787,
"rewards/length_think_reward/mean": 1.49375,
"rewards/length_think_reward/std": 0.011572750806808472,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.009258201122283935,
"rewards/process_style_reward/mean": 1.488259848356247,
"rewards/process_style_reward/std": 0.3761888966709375,
"rewards/table_style_reward/mean": 1.8308646750450135,
"rewards/table_style_reward/std": 0.3701925078779459,
"step": 1000
},
{
"epoch": 0.6666666666666666,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.005,
"eval_completions/max_length": 588.16,
"eval_completions/max_terminated_length": 586.0,
"eval_completions/mean_length": 377.64875,
"eval_completions/mean_terminated_length": 375.7753918457031,
"eval_completions/min_length": 247.76,
"eval_completions/min_terminated_length": 247.76,
"eval_frac_reward_zero_std": 0.645,
"eval_loss": 0.001064616721123457,
"eval_num_tokens": 9000848.0,
"eval_reward": 7.492321758270264,
"eval_reward_std": 0.13835911433212458,
"eval_rewards/accuracy_reward/mean": 0.79625,
"eval_rewards/accuracy_reward/std": 0.3533613955974579,
"eval_rewards/chart_type_reward/mean": 0.61375,
"eval_rewards/chart_type_reward/std": 0.46422410249710083,
"eval_rewards/format_reward/mean": 1.9875,
"eval_rewards/format_reward/std": 0.05197583675384521,
"eval_rewards/length_think_reward/mean": 1.49625,
"eval_rewards/length_think_reward/std": 0.019564387649297715,
"eval_rewards/num_token_reward/mean": 0.9925,
"eval_rewards/num_token_reward/std": 0.03305898606777191,
"eval_rewards/process_style_reward/mean": 0.8522592496871948,
"eval_rewards/process_style_reward/std": 0.25637542963027954,
"eval_rewards/table_style_reward/mean": 0.7538124990463256,
"eval_rewards/table_style_reward/std": 0.035973691046237946,
"eval_runtime": 425.3173,
"eval_samples_per_second": 0.47,
"eval_steps_per_second": 0.016,
"step": 1000
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01,
"completions/max_length": 489.34,
"completions/max_terminated_length": 481.02,
"completions/mean_length": 393.5375,
"completions/mean_terminated_length": 389.2625,
"completions/min_length": 313.12,
"completions/min_terminated_length": 313.12,
"epoch": 0.7,
"frac_reward_zero_std": 0.06,
"grad_norm": 2.333315545051143,
"learning_rate": 8.251666666666667e-07,
"loss": 0.0042,
"num_tokens": 9451267.0,
"reward": 9.452465152740478,
"reward_std": 0.3454449198395014,
"rewards/accuracy_reward/mean": 0.695,
"rewards/accuracy_reward/std": 0.2591600608825684,
"rewards/chart_type_reward/mean": 0.8875,
"rewards/chart_type_reward/std": 0.08190421402454376,
"rewards/format_reward/mean": 1.98,
"rewards/format_reward/std": 0.021380898952484132,
"rewards/length_think_reward/mean": 1.49125,
"rewards/length_think_reward/std": 0.010938137769699097,
"rewards/num_token_reward/mean": 0.99,
"rewards/num_token_reward/std": 0.010690449476242066,
"rewards/process_style_reward/mean": 1.5581933534145356,
"rewards/process_style_reward/std": 0.46272379651665685,
"rewards/table_style_reward/mean": 1.8505217921733856,
"rewards/table_style_reward/std": 0.37696966528892517,
"step": 1050
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 496.34,
"completions/max_terminated_length": 496.34,
"completions/mean_length": 406.19,
"completions/mean_terminated_length": 406.19,
"completions/min_length": 335.7,
"completions/min_terminated_length": 335.7,
"epoch": 0.7333333333333333,
"frac_reward_zero_std": 0.09,
"grad_norm": 3.6502729861830896,
"learning_rate": 8.168333333333333e-07,
"loss": 0.0033,
"num_tokens": 9907007.0,
"reward": 9.746276054382324,
"reward_std": 0.30796022541588175,
"rewards/accuracy_reward/mean": 0.795,
"rewards/accuracy_reward/std": 0.25512682616710664,
"rewards/chart_type_reward/mean": 0.94,
"rewards/chart_type_reward/std": 0.06127820014953613,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5490337193012238,
"rewards/process_style_reward/std": 0.4507550221681595,
"rewards/table_style_reward/mean": 1.962242330312729,
"rewards/table_style_reward/std": 0.3748661072552204,
"step": 1100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 463.02,
"completions/max_terminated_length": 463.02,
"completions/mean_length": 380.8375,
"completions/mean_terminated_length": 380.8375,
"completions/min_length": 315.86,
"completions/min_terminated_length": 315.86,
"epoch": 0.7666666666666667,
"frac_reward_zero_std": 0.08,
"grad_norm": 1.6467571445573885,
"learning_rate": 8.085e-07,
"loss": 0.0081,
"num_tokens": 10352734.0,
"reward": 9.6218558883667,
"reward_std": 0.378871104568243,
"rewards/accuracy_reward/mean": 0.75,
"rewards/accuracy_reward/std": 0.27910871148109434,
"rewards/chart_type_reward/mean": 0.92,
"rewards/chart_type_reward/std": 0.08552359580993653,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.496875,
"rewards/length_think_reward/std": 0.00883883535861969,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.014142135381698609,
"rewards/process_style_reward/mean": 1.5185503327846528,
"rewards/process_style_reward/std": 0.408857840411365,
"rewards/table_style_reward/mean": 1.9414305424690246,
"rewards/table_style_reward/std": 0.4009686389937997,
"step": 1150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015,
"completions/max_length": 441.24,
"completions/max_terminated_length": 433.9,
"completions/mean_length": 362.795,
"completions/mean_terminated_length": 357.56416687011716,
"completions/min_length": 293.56,
"completions/min_terminated_length": 293.56,
"epoch": 0.8,
"frac_reward_zero_std": 0.06,
"grad_norm": 3.0683170001047517,
"learning_rate": 8.001666666666667e-07,
"loss": -0.0007,
"num_tokens": 10790708.0,
"reward": 9.64956069946289,
"reward_std": 0.394139921143651,
"rewards/accuracy_reward/mean": 0.72,
"rewards/accuracy_reward/std": 0.3199311000108719,
"rewards/chart_type_reward/mean": 0.91,
"rewards/chart_type_reward/std": 0.09427463591098785,
"rewards/format_reward/mean": 1.97,
"rewards/format_reward/std": 0.039897301197052,
"rewards/length_think_reward/mean": 1.4975,
"rewards/length_think_reward/std": 0.004629100561141968,
"rewards/num_token_reward/mean": 0.985,
"rewards/num_token_reward/std": 0.019948650598526,
"rewards/process_style_reward/mean": 1.6004662060737609,
"rewards/process_style_reward/std": 0.36032520439475774,
"rewards/table_style_reward/mean": 1.966594467163086,
"rewards/table_style_reward/std": 0.3408663283288479,
"step": 1200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0275,
"completions/max_length": 510.16,
"completions/max_terminated_length": 501.4,
"completions/mean_length": 409.38,
"completions/mean_terminated_length": 402.9113104248047,
"completions/min_length": 333.52,
"completions/min_terminated_length": 333.52,
"epoch": 0.8333333333333334,
"frac_reward_zero_std": 0.13,
"grad_norm": 4.893266426475079,
"learning_rate": 7.918333333333333e-07,
"loss": -0.0018,
"num_tokens": 11247124.0,
"reward": 9.399707975387573,
"reward_std": 0.4224707083776593,
"rewards/accuracy_reward/mean": 0.705,
"rewards/accuracy_reward/std": 0.3079745310544968,
"rewards/chart_type_reward/mean": 0.91,
"rewards/chart_type_reward/std": 0.09621404528617859,
"rewards/format_reward/mean": 1.945,
"rewards/format_reward/std": 0.09107224106788635,
"rewards/length_think_reward/mean": 1.486875,
"rewards/length_think_reward/std": 0.025552249252796172,
"rewards/num_token_reward/mean": 0.9725,
"rewards/num_token_reward/std": 0.045536120533943174,
"rewards/process_style_reward/mean": 1.527708315849304,
"rewards/process_style_reward/std": 0.4601225584745407,
"rewards/table_style_reward/mean": 1.8526247000694276,
"rewards/table_style_reward/std": 0.363852179646492,
"step": 1250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0325,
"completions/max_length": 504.7,
"completions/max_terminated_length": 489.56,
"completions/mean_length": 413.09,
"completions/mean_terminated_length": 403.423857421875,
"completions/min_length": 329.26,
"completions/min_terminated_length": 329.26,
"epoch": 0.8666666666666667,
"frac_reward_zero_std": 0.08,
"grad_norm": 4.064899633242494,
"learning_rate": 7.834999999999999e-07,
"loss": 0.0022,
"num_tokens": 11706160.0,
"reward": 9.2975998878479,
"reward_std": 0.42783591762185097,
"rewards/accuracy_reward/mean": 0.68,
"rewards/accuracy_reward/std": 0.3520024484395981,
"rewards/chart_type_reward/mean": 0.9075,
"rewards/chart_type_reward/std": 0.10328511297702789,
"rewards/format_reward/mean": 1.935,
"rewards/format_reward/std": 0.09544337391853333,
"rewards/length_think_reward/mean": 1.488125,
"rewards/length_think_reward/std": 0.019788713902235033,
"rewards/num_token_reward/mean": 0.9675,
"rewards/num_token_reward/std": 0.047721686959266665,
"rewards/process_style_reward/mean": 1.474567185640335,
"rewards/process_style_reward/std": 0.4061433684825897,
"rewards/table_style_reward/mean": 1.8449076747894286,
"rewards/table_style_reward/std": 0.3341277042776346,
"step": 1300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 530.7,
"completions/max_terminated_length": 514.46,
"completions/mean_length": 428.025,
"completions/mean_terminated_length": 420.09643005371095,
"completions/min_length": 341.4,
"completions/min_terminated_length": 341.4,
"epoch": 0.9,
"frac_reward_zero_std": 0.07,
"grad_norm": 2.5562933049105356,
"learning_rate": 7.751666666666666e-07,
"loss": 0.0079,
"num_tokens": 12170214.0,
"reward": 9.505103206634521,
"reward_std": 0.33932688400149347,
"rewards/accuracy_reward/mean": 0.7425,
"rewards/accuracy_reward/std": 0.28146197378635407,
"rewards/chart_type_reward/mean": 0.895,
"rewards/chart_type_reward/std": 0.11616269588470458,
"rewards/format_reward/mean": 1.945,
"rewards/format_reward/std": 0.1008401095867157,
"rewards/length_think_reward/mean": 1.495,
"rewards/length_think_reward/std": 0.007967560291290284,
"rewards/num_token_reward/mean": 0.9725,
"rewards/num_token_reward/std": 0.05042005479335785,
"rewards/process_style_reward/mean": 1.5487450003623962,
"rewards/process_style_reward/std": 0.460056491792202,
"rewards/table_style_reward/mean": 1.9063581895828248,
"rewards/table_style_reward/std": 0.34742515232414006,
"step": 1350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0075,
"completions/max_length": 482.48,
"completions/max_terminated_length": 480.16,
"completions/mean_length": 397.7925,
"completions/mean_terminated_length": 396.51416748046876,
"completions/min_length": 326.3,
"completions/min_terminated_length": 326.3,
"epoch": 0.9333333333333333,
"frac_reward_zero_std": 0.14,
"grad_norm": 2.5170689556650028,
"learning_rate": 7.668333333333333e-07,
"loss": 0.0022,
"num_tokens": 12621947.0,
"reward": 9.625234622955322,
"reward_std": 0.38073745464906095,
"rewards/accuracy_reward/mean": 0.775,
"rewards/accuracy_reward/std": 0.24745278298854828,
"rewards/chart_type_reward/mean": 0.9275,
"rewards/chart_type_reward/std": 0.08190421402454376,
"rewards/format_reward/mean": 1.975,
"rewards/format_reward/std": 0.0609428083896637,
"rewards/length_think_reward/mean": 1.499375,
"rewards/length_think_reward/std": 0.001767766922712326,
"rewards/num_token_reward/mean": 0.9875,
"rewards/num_token_reward/std": 0.03047140419483185,
"rewards/process_style_reward/mean": 1.5226418220996856,
"rewards/process_style_reward/std": 0.409429362565279,
"rewards/table_style_reward/mean": 1.9382178807258605,
"rewards/table_style_reward/std": 0.3078057858347893,
"step": 1400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01,
"completions/max_length": 460.66,
"completions/max_terminated_length": 455.88,
"completions/mean_length": 374.6425,
"completions/mean_terminated_length": 372.0279779052734,
"completions/min_length": 303.96,
"completions/min_terminated_length": 303.96,
"epoch": 0.9666666666666667,
"frac_reward_zero_std": 0.11,
"grad_norm": 3.2253024096985468,
"learning_rate": 7.584999999999999e-07,
"loss": 0.0018,
"num_tokens": 13064872.0,
"reward": 9.732238264083863,
"reward_std": 0.35529854300431907,
"rewards/accuracy_reward/mean": 0.71,
"rewards/accuracy_reward/std": 0.2705294406414032,
"rewards/chart_type_reward/mean": 0.945,
"rewards/chart_type_reward/std": 0.06271044850349426,
"rewards/format_reward/mean": 1.97,
"rewards/format_reward/std": 0.07508494377136231,
"rewards/length_think_reward/mean": 1.495,
"rewards/length_think_reward/std": 0.014142136126756667,
"rewards/num_token_reward/mean": 0.985,
"rewards/num_token_reward/std": 0.037542471885681154,
"rewards/process_style_reward/mean": 1.6616894614696502,
"rewards/process_style_reward/std": 0.4416278822161257,
"rewards/table_style_reward/mean": 1.9655487489700318,
"rewards/table_style_reward/std": 0.31115186443552373,
"step": 1450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01,
"completions/max_length": 430.54,
"completions/max_terminated_length": 430.2,
"completions/mean_length": 355.42,
"completions/mean_terminated_length": 352.83666748046875,
"completions/min_length": 287.88,
"completions/min_terminated_length": 287.88,
"epoch": 1.0,
"frac_reward_zero_std": 0.1,
"grad_norm": 2.660840181002644,
"learning_rate": 7.501666666666666e-07,
"loss": 0.0016,
"num_tokens": 13500408.0,
"reward": 9.479499158859253,
"reward_std": 0.3713982145488262,
"rewards/accuracy_reward/mean": 0.76,
"rewards/accuracy_reward/std": 0.2704376995563507,
"rewards/chart_type_reward/mean": 0.905,
"rewards/chart_type_reward/std": 0.10294564783573151,
"rewards/format_reward/mean": 1.98,
"rewards/format_reward/std": 0.03703280448913574,
"rewards/length_think_reward/mean": 1.4975,
"rewards/length_think_reward/std": 0.007071067690849304,
"rewards/num_token_reward/mean": 0.99,
"rewards/num_token_reward/std": 0.01851640224456787,
"rewards/process_style_reward/mean": 1.5098418951034547,
"rewards/process_style_reward/std": 0.43727574720978735,
"rewards/table_style_reward/mean": 1.8371572828292846,
"rewards/table_style_reward/std": 0.34460590325295926,
"step": 1500
},
{
"epoch": 1.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 529.84,
"eval_completions/max_terminated_length": 529.84,
"eval_completions/mean_length": 342.17125,
"eval_completions/mean_terminated_length": 342.17125,
"eval_completions/min_length": 239.0,
"eval_completions/min_terminated_length": 239.0,
"eval_frac_reward_zero_std": 0.655,
"eval_loss": 0.0062618558295071125,
"eval_num_tokens": 13500408.0,
"eval_reward": 7.520802612304688,
"eval_reward_std": 0.08726703974418343,
"eval_rewards/accuracy_reward/mean": 0.795,
"eval_rewards/accuracy_reward/std": 0.36195871770381927,
"eval_rewards/chart_type_reward/mean": 0.605,
"eval_rewards/chart_type_reward/std": 0.4631432008743286,
"eval_rewards/format_reward/mean": 2.0,
"eval_rewards/format_reward/std": 0.0,
"eval_rewards/length_think_reward/mean": 1.5,
"eval_rewards/length_think_reward/std": 0.0,
"eval_rewards/num_token_reward/mean": 1.0,
"eval_rewards/num_token_reward/std": 0.0,
"eval_rewards/process_style_reward/mean": 0.8638026165962219,
"eval_rewards/process_style_reward/std": 0.26793761402368543,
"eval_rewards/table_style_reward/mean": 0.7570000004768371,
"eval_rewards/table_style_reward/std": 0.022176709175109863,
"eval_runtime": 388.0338,
"eval_samples_per_second": 0.515,
"eval_steps_per_second": 0.018,
"step": 1500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01,
"completions/max_length": 406.2,
"completions/max_terminated_length": 398.56,
"completions/mean_length": 341.1325,
"completions/mean_terminated_length": 337.2075,
"completions/min_length": 283.22,
"completions/min_terminated_length": 283.22,
"epoch": 1.0333333333333334,
"frac_reward_zero_std": 0.1,
"grad_norm": 3.8348164536534792,
"learning_rate": 7.418333333333333e-07,
"loss": -0.002,
"num_tokens": 13929829.0,
"reward": 9.53626503944397,
"reward_std": 0.3728026695176959,
"rewards/accuracy_reward/mean": 0.7,
"rewards/accuracy_reward/std": 0.3113518291711807,
"rewards/chart_type_reward/mean": 0.8775,
"rewards/chart_type_reward/std": 0.13535646140575408,
"rewards/format_reward/mean": 1.975,
"rewards/format_reward/std": 0.03552303433418274,
"rewards/length_think_reward/mean": 1.495625,
"rewards/length_think_reward/std": 0.0061997933685779575,
"rewards/num_token_reward/mean": 0.9875,
"rewards/num_token_reward/std": 0.01776151716709137,
"rewards/process_style_reward/mean": 1.5947026538848876,
"rewards/process_style_reward/std": 0.43727443397045135,
"rewards/table_style_reward/mean": 1.905937442779541,
"rewards/table_style_reward/std": 0.3321183892339468,
"step": 1550
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.86,
"completions/max_terminated_length": 399.86,
"completions/mean_length": 318.575,
"completions/mean_terminated_length": 318.575,
"completions/min_length": 249.92,
"completions/min_terminated_length": 249.92,
"epoch": 1.0666666666666667,
"frac_reward_zero_std": 0.08,
"grad_norm": 7.001967911074153,
"learning_rate": 7.335e-07,
"loss": -0.0028,
"num_tokens": 14350287.0,
"reward": 9.724173564910888,
"reward_std": 0.34998391315340993,
"rewards/accuracy_reward/mean": 0.8025,
"rewards/accuracy_reward/std": 0.2594077849388123,
"rewards/chart_type_reward/mean": 0.93,
"rewards/chart_type_reward/std": 0.07483314633369446,
"rewards/format_reward/mean": 1.995,
"rewards/format_reward/std": 0.014142135381698609,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.9975,
"rewards/num_token_reward/std": 0.007071067690849304,
"rewards/process_style_reward/mean": 1.4950222504138946,
"rewards/process_style_reward/std": 0.4426997843384743,
"rewards/table_style_reward/mean": 2.004151337146759,
"rewards/table_style_reward/std": 0.32994183532893656,
"step": 1600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0025,
"completions/max_length": 388.28,
"completions/max_terminated_length": 388.08,
"completions/mean_length": 315.705,
"completions/mean_terminated_length": 314.9003576660156,
"completions/min_length": 250.5,
"completions/min_terminated_length": 250.5,
"epoch": 1.1,
"frac_reward_zero_std": 0.1,
"grad_norm": 9.088691176190808,
"learning_rate": 7.251666666666665e-07,
"loss": -0.0016,
"num_tokens": 14770033.0,
"reward": 9.393507385253907,
"reward_std": 0.37204572021961213,
"rewards/accuracy_reward/mean": 0.6925,
"rewards/accuracy_reward/std": 0.31723429918289187,
"rewards/chart_type_reward/mean": 0.91,
"rewards/chart_type_reward/std": 0.09621404528617859,
"rewards/format_reward/mean": 1.995,
"rewards/format_reward/std": 0.014142135381698609,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.9975,
"rewards/num_token_reward/std": 0.007071067690849304,
"rewards/process_style_reward/mean": 1.388883638381958,
"rewards/process_style_reward/std": 0.37191372729837896,
"rewards/table_style_reward/mean": 1.909623656272888,
"rewards/table_style_reward/std": 0.4070011004060507,
"step": 1650
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.28,
"completions/max_terminated_length": 399.28,
"completions/mean_length": 331.9925,
"completions/mean_terminated_length": 331.9925,
"completions/min_length": 274.02,
"completions/min_terminated_length": 274.02,
"epoch": 1.1333333333333333,
"frac_reward_zero_std": 0.09,
"grad_norm": 2.910463682760087,
"learning_rate": 7.168333333333333e-07,
"loss": -0.0014,
"num_tokens": 15195754.0,
"reward": 9.631321449279785,
"reward_std": 0.3087148568034172,
"rewards/accuracy_reward/mean": 0.75,
"rewards/accuracy_reward/std": 0.22186374604701997,
"rewards/chart_type_reward/mean": 0.91,
"rewards/chart_type_reward/std": 0.09621404528617859,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5510436356067658,
"rewards/process_style_reward/std": 0.4377661471068859,
"rewards/table_style_reward/mean": 1.9202778291702272,
"rewards/table_style_reward/std": 0.3158441584557295,
"step": 1700
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 403.14,
"completions/max_terminated_length": 403.14,
"completions/mean_length": 334.7425,
"completions/mean_terminated_length": 334.7425,
"completions/min_length": 271.28,
"completions/min_terminated_length": 271.28,
"epoch": 1.1666666666666667,
"frac_reward_zero_std": 0.08,
"grad_norm": 2.309058895100409,
"learning_rate": 7.085e-07,
"loss": -0.002,
"num_tokens": 15622895.0,
"reward": 9.747770833969117,
"reward_std": 0.3141957564931363,
"rewards/accuracy_reward/mean": 0.835,
"rewards/accuracy_reward/std": 0.1784460115432739,
"rewards/chart_type_reward/mean": 0.93,
"rewards/chart_type_reward/std": 0.07483314633369446,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5280090761184693,
"rewards/process_style_reward/std": 0.40883125707507134,
"rewards/table_style_reward/mean": 1.954761769771576,
"rewards/table_style_reward/std": 0.2725959676504135,
"step": 1750
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 411.5,
"completions/max_terminated_length": 407.24,
"completions/mean_length": 346.24,
"completions/mean_terminated_length": 344.6991668701172,
"completions/min_length": 293.46,
"completions/min_terminated_length": 293.46,
"epoch": 1.2,
"frac_reward_zero_std": 0.15,
"grad_norm": 2.902060665970371,
"learning_rate": 7.001666666666667e-07,
"loss": 0.0016,
"num_tokens": 16054907.0,
"reward": 9.750760135650635,
"reward_std": 0.31334371890872714,
"rewards/accuracy_reward/mean": 0.79,
"rewards/accuracy_reward/std": 0.23819708824157715,
"rewards/chart_type_reward/mean": 0.95,
"rewards/chart_type_reward/std": 0.05345224738121033,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.01851640224456787,
"rewards/length_think_reward/mean": 1.49875,
"rewards/length_think_reward/std": 0.002314550280570984,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.009258201122283935,
"rewards/process_style_reward/mean": 1.5837038934230805,
"rewards/process_style_reward/std": 0.40160174869000914,
"rewards/table_style_reward/mean": 1.9433062982559204,
"rewards/table_style_reward/std": 0.311205018684268,
"step": 1800
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 413.98,
"completions/max_terminated_length": 413.98,
"completions/mean_length": 346.11,
"completions/mean_terminated_length": 346.11,
"completions/min_length": 287.76,
"completions/min_terminated_length": 287.76,
"epoch": 1.2333333333333334,
"frac_reward_zero_std": 0.06,
"grad_norm": 3.4920562160852544,
"learning_rate": 6.918333333333333e-07,
"loss": 0.0001,
"num_tokens": 16486455.0,
"reward": 9.697437267303467,
"reward_std": 0.3302338109910488,
"rewards/accuracy_reward/mean": 0.775,
"rewards/accuracy_reward/std": 0.26244561791419985,
"rewards/chart_type_reward/mean": 0.89,
"rewards/chart_type_reward/std": 0.12432654678821564,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6095642995834352,
"rewards/process_style_reward/std": 0.4244466606155038,
"rewards/table_style_reward/mean": 1.922872955799103,
"rewards/table_style_reward/std": 0.3481115462630987,
"step": 1850
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 435.06,
"completions/max_terminated_length": 435.04,
"completions/mean_length": 364.25,
"completions/mean_terminated_length": 363.41583374023435,
"completions/min_length": 306.08,
"completions/min_terminated_length": 306.08,
"epoch": 1.2666666666666666,
"frac_reward_zero_std": 0.12,
"grad_norm": 3.4554706037204777,
"learning_rate": 6.835e-07,
"loss": -0.0042,
"num_tokens": 16925067.0,
"reward": 9.758917856216431,
"reward_std": 0.3286038258485496,
"rewards/accuracy_reward/mean": 0.7975,
"rewards/accuracy_reward/std": 0.2225467497110367,
"rewards/chart_type_reward/mean": 0.8925,
"rewards/chart_type_reward/std": 0.08190421402454376,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.01851640224456787,
"rewards/length_think_reward/mean": 1.499375,
"rewards/length_think_reward/std": 0.001767766922712326,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.009258201122283935,
"rewards/process_style_reward/mean": 1.619047586917877,
"rewards/process_style_reward/std": 0.4610636526346207,
"rewards/table_style_reward/mean": 1.9654952633380889,
"rewards/table_style_reward/std": 0.279680118188262,
"step": 1900
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 470.18,
"completions/max_terminated_length": 470.18,
"completions/mean_length": 388.0075,
"completions/mean_terminated_length": 388.0075,
"completions/min_length": 316.4,
"completions/min_terminated_length": 316.4,
"epoch": 1.3,
"frac_reward_zero_std": 0.15,
"grad_norm": 2.4363619566126182,
"learning_rate": 6.751666666666667e-07,
"loss": 0.0043,
"num_tokens": 17373038.0,
"reward": 9.906033611297607,
"reward_std": 0.30710218355059626,
"rewards/accuracy_reward/mean": 0.81,
"rewards/accuracy_reward/std": 0.2447742748260498,
"rewards/chart_type_reward/mean": 0.935,
"rewards/chart_type_reward/std": 0.07340089797973633,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6697776758670806,
"rewards/process_style_reward/std": 0.4667322512716055,
"rewards/table_style_reward/mean": 1.9912559032440185,
"rewards/table_style_reward/std": 0.33009951261803505,
"step": 1950
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 484.88,
"completions/max_terminated_length": 484.88,
"completions/mean_length": 407.5,
"completions/mean_terminated_length": 407.5,
"completions/min_length": 341.34,
"completions/min_terminated_length": 341.34,
"epoch": 1.3333333333333333,
"frac_reward_zero_std": 0.26,
"grad_norm": 4.715118687872951,
"learning_rate": 6.668333333333332e-07,
"loss": 0.0029,
"num_tokens": 17829218.0,
"reward": 9.806926441192626,
"reward_std": 0.23523858685046434,
"rewards/accuracy_reward/mean": 0.805,
"rewards/accuracy_reward/std": 0.2573123925924301,
"rewards/chart_type_reward/mean": 0.965,
"rewards/chart_type_reward/std": 0.0462134838104248,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.586816476583481,
"rewards/process_style_reward/std": 0.42196598000824453,
"rewards/table_style_reward/mean": 1.9501098704338073,
"rewards/table_style_reward/std": 0.2974155292659998,
"step": 2000
},
{
"epoch": 1.3333333333333333,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.00125,
"eval_completions/max_length": 584.64,
"eval_completions/max_terminated_length": 584.32,
"eval_completions/mean_length": 393.495,
"eval_completions/mean_terminated_length": 393.06229736328123,
"eval_completions/min_length": 277.64,
"eval_completions/min_terminated_length": 277.64,
"eval_frac_reward_zero_std": 0.65,
"eval_loss": 0.005326578393578529,
"eval_num_tokens": 17829218.0,
"eval_reward": 7.547686309814453,
"eval_reward_std": 0.10336805308703333,
"eval_rewards/accuracy_reward/mean": 0.83125,
"eval_rewards/accuracy_reward/std": 0.3227571386098862,
"eval_rewards/chart_type_reward/mean": 0.61625,
"eval_rewards/chart_type_reward/std": 0.46573135137557986,
"eval_rewards/format_reward/mean": 1.995,
"eval_rewards/format_reward/std": 0.019674774408340454,
"eval_rewards/length_think_reward/mean": 1.5,
"eval_rewards/length_think_reward/std": 0.0,
"eval_rewards/num_token_reward/mean": 0.9975,
"eval_rewards/num_token_reward/std": 0.009837387204170227,
"eval_rewards/process_style_reward/mean": 0.8522800421714782,
"eval_rewards/process_style_reward/std": 0.25847422659397123,
"eval_rewards/table_style_reward/mean": 0.7554062509536743,
"eval_rewards/table_style_reward/std": 0.032950252890586854,
"eval_runtime": 422.8962,
"eval_samples_per_second": 0.473,
"eval_steps_per_second": 0.017,
"step": 2000
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 453.4,
"completions/max_terminated_length": 453.4,
"completions/mean_length": 380.1175,
"completions/mean_terminated_length": 380.1175,
"completions/min_length": 314.92,
"completions/min_terminated_length": 314.92,
"epoch": 1.3666666666666667,
"frac_reward_zero_std": 0.14,
"grad_norm": 1.3071829991089465,
"learning_rate": 6.584999999999999e-07,
"loss": 0.0006,
"num_tokens": 18274577.0,
"reward": 9.592507076263427,
"reward_std": 0.2941449248045683,
"rewards/accuracy_reward/mean": 0.71,
"rewards/accuracy_reward/std": 0.34862515032291413,
"rewards/chart_type_reward/mean": 0.95,
"rewards/chart_type_reward/std": 0.05345224738121033,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5190071046352387,
"rewards/process_style_reward/std": 0.406856027841568,
"rewards/table_style_reward/mean": 1.9135000681877137,
"rewards/table_style_reward/std": 0.33413831643760206,
"step": 2050
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0025,
"completions/max_length": 449.9,
"completions/max_terminated_length": 449.66,
"completions/mean_length": 374.58,
"completions/mean_terminated_length": 373.85250061035157,
"completions/min_length": 307.08,
"completions/min_terminated_length": 307.08,
"epoch": 1.4,
"frac_reward_zero_std": 0.12,
"grad_norm": 17.23901734959147,
"learning_rate": 6.501666666666666e-07,
"loss": 0.0013,
"num_tokens": 18717565.0,
"reward": 9.62658073425293,
"reward_std": 0.41098684968426824,
"rewards/accuracy_reward/mean": 0.7425,
"rewards/accuracy_reward/std": 0.3153866308927536,
"rewards/chart_type_reward/mean": 0.915,
"rewards/chart_type_reward/std": 0.09478179693222046,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.028284270763397217,
"rewards/length_think_reward/mean": 1.499375,
"rewards/length_think_reward/std": 0.001767766922712326,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.014142135381698609,
"rewards/process_style_reward/mean": 1.5444307351112365,
"rewards/process_style_reward/std": 0.40891373321413993,
"rewards/table_style_reward/mean": 1.9402749633789063,
"rewards/table_style_reward/std": 0.3305619989708066,
"step": 2100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 408.34,
"completions/max_terminated_length": 408.34,
"completions/mean_length": 335.9,
"completions/mean_terminated_length": 335.9,
"completions/min_length": 273.7,
"completions/min_terminated_length": 273.7,
"epoch": 1.4333333333333333,
"frac_reward_zero_std": 0.11,
"grad_norm": 3.329962314867653,
"learning_rate": 6.418333333333333e-07,
"loss": -0.0013,
"num_tokens": 19145669.0,
"reward": 9.794802322387696,
"reward_std": 0.2854239001870155,
"rewards/accuracy_reward/mean": 0.7575,
"rewards/accuracy_reward/std": 0.28735102355480197,
"rewards/chart_type_reward/mean": 0.95,
"rewards/chart_type_reward/std": 0.05345224738121033,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5570359444618225,
"rewards/process_style_reward/std": 0.418922475874424,
"rewards/table_style_reward/mean": 2.0302664685249328,
"rewards/table_style_reward/std": 0.30267744371667504,
"step": 2150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 395.74,
"completions/max_terminated_length": 395.74,
"completions/mean_length": 329.2375,
"completions/mean_terminated_length": 329.2375,
"completions/min_length": 270.2,
"completions/min_terminated_length": 270.2,
"epoch": 1.4666666666666668,
"frac_reward_zero_std": 0.1,
"grad_norm": 6.069260839474777,
"learning_rate": 6.335e-07,
"loss": -0.002,
"num_tokens": 19570032.0,
"reward": 9.73668830871582,
"reward_std": 0.34456304393708703,
"rewards/accuracy_reward/mean": 0.765,
"rewards/accuracy_reward/std": 0.1955270314216614,
"rewards/chart_type_reward/mean": 0.92,
"rewards/chart_type_reward/std": 0.08552359580993653,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.028284270763397217,
"rewards/length_think_reward/mean": 1.49625,
"rewards/length_think_reward/std": 0.010606602281332016,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.014142135381698609,
"rewards/process_style_reward/mean": 1.5674074435234069,
"rewards/process_style_reward/std": 0.415806692391634,
"rewards/table_style_reward/mean": 2.00303085565567,
"rewards/table_style_reward/std": 0.2655815637484193,
"step": 2200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 425.58,
"completions/max_terminated_length": 425.58,
"completions/mean_length": 349.7875,
"completions/mean_terminated_length": 349.7875,
"completions/min_length": 279.34,
"completions/min_terminated_length": 279.34,
"epoch": 1.5,
"frac_reward_zero_std": 0.13,
"grad_norm": 2.1846728325533844,
"learning_rate": 6.251666666666667e-07,
"loss": 0.0,
"num_tokens": 20002643.0,
"reward": 9.64163824081421,
"reward_std": 0.32703839337453244,
"rewards/accuracy_reward/mean": 0.74,
"rewards/accuracy_reward/std": 0.3088252305984497,
"rewards/chart_type_reward/mean": 0.93,
"rewards/chart_type_reward/std": 0.07483314633369446,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.584349147081375,
"rewards/process_style_reward/std": 0.3927363380789757,
"rewards/table_style_reward/mean": 1.887289083003998,
"rewards/table_style_reward/std": 0.3619528949260712,
"step": 2250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 402.0,
"completions/max_terminated_length": 402.0,
"completions/mean_length": 343.5925,
"completions/mean_terminated_length": 343.5925,
"completions/min_length": 286.94,
"completions/min_terminated_length": 286.94,
"epoch": 1.5333333333333332,
"frac_reward_zero_std": 0.12,
"grad_norm": 6.91609600549848,
"learning_rate": 6.168333333333333e-07,
"loss": -0.0033,
"num_tokens": 20433580.0,
"reward": 9.802964372634888,
"reward_std": 0.2991816225461662,
"rewards/accuracy_reward/mean": 0.7675,
"rewards/accuracy_reward/std": 0.261347194314003,
"rewards/chart_type_reward/mean": 0.94,
"rewards/chart_type_reward/std": 0.06414269685745239,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6691261100769044,
"rewards/process_style_reward/std": 0.37101606719195845,
"rewards/table_style_reward/mean": 1.9263382363319397,
"rewards/table_style_reward/std": 0.32372167307883504,
"step": 2300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01,
"completions/max_length": 431.12,
"completions/max_terminated_length": 424.64,
"completions/mean_length": 358.315,
"completions/mean_terminated_length": 354.0025,
"completions/min_length": 294.78,
"completions/min_terminated_length": 294.78,
"epoch": 1.5666666666666667,
"frac_reward_zero_std": 0.24,
"grad_norm": 0.0,
"learning_rate": 6.085e-07,
"loss": 0.0007,
"num_tokens": 20870814.0,
"reward": 9.584474143981934,
"reward_std": 0.24904431821312756,
"rewards/accuracy_reward/mean": 0.705,
"rewards/accuracy_reward/std": 0.27204171717166903,
"rewards/chart_type_reward/mean": 0.94,
"rewards/chart_type_reward/std": 0.06414269685745239,
"rewards/format_reward/mean": 1.98,
"rewards/format_reward/std": 0.021380898952484132,
"rewards/length_think_reward/mean": 1.499375,
"rewards/length_think_reward/std": 0.001767766922712326,
"rewards/num_token_reward/mean": 0.99,
"rewards/num_token_reward/std": 0.010690449476242066,
"rewards/process_style_reward/mean": 1.5493565130233764,
"rewards/process_style_reward/std": 0.4589155162498355,
"rewards/table_style_reward/mean": 1.920742678642273,
"rewards/table_style_reward/std": 0.3032746136933565,
"step": 2350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 391.84,
"completions/max_terminated_length": 391.84,
"completions/mean_length": 330.47,
"completions/mean_terminated_length": 330.47,
"completions/min_length": 274.26,
"completions/min_terminated_length": 274.26,
"epoch": 1.6,
"frac_reward_zero_std": 0.18,
"grad_norm": 2.026677468691953,
"learning_rate": 6.001666666666666e-07,
"loss": -0.0019,
"num_tokens": 21297010.0,
"reward": 9.688363914489747,
"reward_std": 0.27519391929730774,
"rewards/accuracy_reward/mean": 0.76,
"rewards/accuracy_reward/std": 0.2853331530094147,
"rewards/chart_type_reward/mean": 0.94,
"rewards/chart_type_reward/std": 0.06414269685745239,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6027523398399353,
"rewards/process_style_reward/std": 0.3893206799030304,
"rewards/table_style_reward/mean": 1.8856116580963134,
"rewards/table_style_reward/std": 0.3420239106938243,
"step": 2400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 380.06,
"completions/max_terminated_length": 380.06,
"completions/mean_length": 313.7925,
"completions/mean_terminated_length": 313.7925,
"completions/min_length": 256.48,
"completions/min_terminated_length": 256.48,
"epoch": 1.6333333333333333,
"frac_reward_zero_std": 0.13,
"grad_norm": 5.555080099940961,
"learning_rate": 5.918333333333333e-07,
"loss": -0.0004,
"num_tokens": 21715431.0,
"reward": 9.615154209136962,
"reward_std": 0.2913367236009799,
"rewards/accuracy_reward/mean": 0.765,
"rewards/accuracy_reward/std": 0.2847459638118744,
"rewards/chart_type_reward/mean": 0.92,
"rewards/chart_type_reward/std": 0.08552359580993653,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5297248589992523,
"rewards/process_style_reward/std": 0.3912577797472477,
"rewards/table_style_reward/mean": 1.9004293370246887,
"rewards/table_style_reward/std": 0.3435662076622248,
"step": 2450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 381.04,
"completions/max_terminated_length": 381.04,
"completions/mean_length": 315.7675,
"completions/mean_terminated_length": 315.7675,
"completions/min_length": 256.08,
"completions/min_terminated_length": 256.08,
"epoch": 1.6666666666666665,
"frac_reward_zero_std": 0.15,
"grad_norm": 15.55412501444908,
"learning_rate": 5.835e-07,
"loss": -0.0012,
"num_tokens": 22134838.0,
"reward": 9.620207805633544,
"reward_std": 0.3084538695216179,
"rewards/accuracy_reward/mean": 0.75,
"rewards/accuracy_reward/std": 0.26951261222362516,
"rewards/chart_type_reward/mean": 0.8775,
"rewards/chart_type_reward/std": 0.1279459285736084,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.01851640224456787,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.009258201122283935,
"rewards/process_style_reward/mean": 1.5722642850875854,
"rewards/process_style_reward/std": 0.4096125695109367,
"rewards/table_style_reward/mean": 1.9354435563087464,
"rewards/table_style_reward/std": 0.25229784632101654,
"step": 2500
},
{
"epoch": 1.6666666666666665,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 444.2,
"eval_completions/max_terminated_length": 444.2,
"eval_completions/mean_length": 306.56375,
"eval_completions/mean_terminated_length": 306.56375,
"eval_completions/min_length": 212.04,
"eval_completions/min_terminated_length": 212.04,
"eval_frac_reward_zero_std": 0.69,
"eval_loss": 0.0007887376705184579,
"eval_num_tokens": 22134838.0,
"eval_reward": 7.574729671478272,
"eval_reward_std": 0.06697057218523696,
"eval_rewards/accuracy_reward/mean": 0.84625,
"eval_rewards/accuracy_reward/std": 0.3025382542610168,
"eval_rewards/chart_type_reward/mean": 0.625,
"eval_rewards/chart_type_reward/std": 0.463611079454422,
"eval_rewards/format_reward/mean": 2.0,
"eval_rewards/format_reward/std": 0.0,
"eval_rewards/length_think_reward/mean": 1.5,
"eval_rewards/length_think_reward/std": 0.0,
"eval_rewards/num_token_reward/mean": 1.0,
"eval_rewards/num_token_reward/std": 0.0,
"eval_rewards/process_style_reward/mean": 0.8455421495437622,
"eval_rewards/process_style_reward/std": 0.2721589285135269,
"eval_rewards/table_style_reward/mean": 0.7579375004768372,
"eval_rewards/table_style_reward/std": 0.020825568437576294,
"eval_runtime": 332.6249,
"eval_samples_per_second": 0.601,
"eval_steps_per_second": 0.021,
"step": 2500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 370.08,
"completions/max_terminated_length": 370.08,
"completions/mean_length": 310.395,
"completions/mean_terminated_length": 310.395,
"completions/min_length": 252.64,
"completions/min_terminated_length": 252.64,
"epoch": 1.7,
"frac_reward_zero_std": 0.18,
"grad_norm": 1.9862931479259052,
"learning_rate": 5.751666666666667e-07,
"loss": -0.0015,
"num_tokens": 22551672.0,
"reward": 9.761021976470948,
"reward_std": 0.2495887253805995,
"rewards/accuracy_reward/mean": 0.715,
"rewards/accuracy_reward/std": 0.2910652804374695,
"rewards/chart_type_reward/mean": 0.92,
"rewards/chart_type_reward/std": 0.08552359580993653,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6147488832473755,
"rewards/process_style_reward/std": 0.4376735435426235,
"rewards/table_style_reward/mean": 2.0112730717658995,
"rewards/table_style_reward/std": 0.25577211238443853,
"step": 2550
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 378.76,
"completions/max_terminated_length": 378.76,
"completions/mean_length": 305.165,
"completions/mean_terminated_length": 305.165,
"completions/min_length": 235.9,
"completions/min_terminated_length": 235.9,
"epoch": 1.7333333333333334,
"frac_reward_zero_std": 0.18,
"grad_norm": 3.8761139220743552,
"learning_rate": 5.668333333333333e-07,
"loss": -0.0004,
"num_tokens": 22966918.0,
"reward": 9.719212608337402,
"reward_std": 0.2860183835402131,
"rewards/accuracy_reward/mean": 0.72,
"rewards/accuracy_reward/std": 0.27398112654685974,
"rewards/chart_type_reward/mean": 0.875,
"rewards/chart_type_reward/std": 0.09478179693222046,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5869954061508178,
"rewards/process_style_reward/std": 0.3870758730173111,
"rewards/table_style_reward/mean": 2.037217149734497,
"rewards/table_style_reward/std": 0.29504469502717257,
"step": 2600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 404.16,
"completions/max_terminated_length": 404.16,
"completions/mean_length": 327.3175,
"completions/mean_terminated_length": 327.3175,
"completions/min_length": 258.78,
"completions/min_terminated_length": 258.78,
"epoch": 1.7666666666666666,
"frac_reward_zero_std": 0.12,
"grad_norm": 4.5948173272275,
"learning_rate": 5.584999999999999e-07,
"loss": -0.0004,
"num_tokens": 23391041.0,
"reward": 9.734212398529053,
"reward_std": 0.3367015665024519,
"rewards/accuracy_reward/mean": 0.7325,
"rewards/accuracy_reward/std": 0.3094827342033386,
"rewards/chart_type_reward/mean": 0.9,
"rewards/chart_type_reward/std": 0.10690449476242066,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.649822280406952,
"rewards/process_style_reward/std": 0.43425638109445575,
"rewards/table_style_reward/mean": 1.9518901491165161,
"rewards/table_style_reward/std": 0.2947008777409792,
"step": 2650
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 373.46,
"completions/max_terminated_length": 373.46,
"completions/mean_length": 316.095,
"completions/mean_terminated_length": 316.095,
"completions/min_length": 264.2,
"completions/min_terminated_length": 264.2,
"epoch": 1.8,
"frac_reward_zero_std": 0.12,
"grad_norm": 2.8524341866405294,
"learning_rate": 5.501666666666666e-07,
"loss": -0.0016,
"num_tokens": 23809963.0,
"reward": 9.839047288894653,
"reward_std": 0.2964446726441383,
"rewards/accuracy_reward/mean": 0.7825,
"rewards/accuracy_reward/std": 0.22414826095104218,
"rewards/chart_type_reward/mean": 0.88,
"rewards/chart_type_reward/std": 0.12828539371490477,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6252618777751922,
"rewards/process_style_reward/std": 0.43019559178501365,
"rewards/table_style_reward/mean": 2.0512854528427122,
"rewards/table_style_reward/std": 0.3453987674787641,
"step": 2700
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 405.08,
"completions/max_terminated_length": 405.02,
"completions/mean_length": 330.52,
"completions/mean_terminated_length": 328.8041668701172,
"completions/min_length": 260.76,
"completions/min_terminated_length": 260.76,
"epoch": 1.8333333333333335,
"frac_reward_zero_std": 0.21,
"grad_norm": 3.086957169930822,
"learning_rate": 5.418333333333332e-07,
"loss": 0.0012,
"num_tokens": 24235339.0,
"reward": 9.4709245967865,
"reward_std": 0.3313306954503059,
"rewards/accuracy_reward/mean": 0.6475,
"rewards/accuracy_reward/std": 0.315896298289299,
"rewards/chart_type_reward/mean": 0.92,
"rewards/chart_type_reward/std": 0.08552359580993653,
"rewards/format_reward/mean": 1.985,
"rewards/format_reward/std": 0.03265853762626648,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.9925,
"rewards/num_token_reward/std": 0.01632926881313324,
"rewards/process_style_reward/mean": 1.5009058022499084,
"rewards/process_style_reward/std": 0.41441123938187957,
"rewards/table_style_reward/mean": 1.9250187492370605,
"rewards/table_style_reward/std": 0.3126064923405647,
"step": 2750
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 386.7,
"completions/max_terminated_length": 386.7,
"completions/mean_length": 320.0475,
"completions/mean_terminated_length": 320.0475,
"completions/min_length": 260.54,
"completions/min_terminated_length": 260.54,
"epoch": 1.8666666666666667,
"frac_reward_zero_std": 0.17,
"grad_norm": 12.10983335214368,
"learning_rate": 5.335e-07,
"loss": 0.0026,
"num_tokens": 24656314.0,
"reward": 9.687625350952148,
"reward_std": 0.2753653322113678,
"rewards/accuracy_reward/mean": 0.7375,
"rewards/accuracy_reward/std": 0.27269922077655795,
"rewards/chart_type_reward/mean": 0.9,
"rewards/chart_type_reward/std": 0.10690449476242066,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6192821896076202,
"rewards/process_style_reward/std": 0.4189461704902351,
"rewards/table_style_reward/mean": 1.9308432340621948,
"rewards/table_style_reward/std": 0.31184935322031376,
"step": 2800
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 415.46,
"completions/max_terminated_length": 415.46,
"completions/mean_length": 346.0825,
"completions/mean_terminated_length": 346.0825,
"completions/min_length": 281.58,
"completions/min_terminated_length": 281.58,
"epoch": 1.9,
"frac_reward_zero_std": 0.14,
"grad_norm": 1.9872501325277399,
"learning_rate": 5.251666666666667e-07,
"loss": 0.0008,
"num_tokens": 25088451.0,
"reward": 9.661721420288085,
"reward_std": 0.27916239865124226,
"rewards/accuracy_reward/mean": 0.75,
"rewards/accuracy_reward/std": 0.2755652844905853,
"rewards/chart_type_reward/mean": 0.8975,
"rewards/chart_type_reward/std": 0.11397556245326995,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5982041406631469,
"rewards/process_style_reward/std": 0.3970548979192972,
"rewards/table_style_reward/mean": 1.9160172486305236,
"rewards/table_style_reward/std": 0.2998810928501189,
"step": 2850
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 395.2,
"completions/max_terminated_length": 395.2,
"completions/mean_length": 320.93,
"completions/mean_terminated_length": 320.93,
"completions/min_length": 261.28,
"completions/min_terminated_length": 261.28,
"epoch": 1.9333333333333333,
"frac_reward_zero_std": 0.14,
"grad_norm": 2.9893443101936286,
"learning_rate": 5.168333333333334e-07,
"loss": 0.0022,
"num_tokens": 25510639.0,
"reward": 9.695433053970337,
"reward_std": 0.26316976999863984,
"rewards/accuracy_reward/mean": 0.78,
"rewards/accuracy_reward/std": 0.30066137969493867,
"rewards/chart_type_reward/mean": 0.91,
"rewards/chart_type_reward/std": 0.05345224738121033,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5607620286941528,
"rewards/process_style_reward/std": 0.42447459913790225,
"rewards/table_style_reward/mean": 1.9446710109710694,
"rewards/table_style_reward/std": 0.30673831250518563,
"step": 2900
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 391.78,
"completions/max_terminated_length": 391.78,
"completions/mean_length": 321.075,
"completions/mean_terminated_length": 321.075,
"completions/min_length": 255.9,
"completions/min_terminated_length": 255.9,
"epoch": 1.9666666666666668,
"frac_reward_zero_std": 0.14,
"grad_norm": 1.9919505127604515,
"learning_rate": 5.085e-07,
"loss": -0.0031,
"num_tokens": 25932085.0,
"reward": 9.60394895553589,
"reward_std": 0.3279293935373426,
"rewards/accuracy_reward/mean": 0.69,
"rewards/accuracy_reward/std": 0.31488103687763214,
"rewards/chart_type_reward/mean": 0.9,
"rewards/chart_type_reward/std": 0.08552359580993653,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5492809236049652,
"rewards/process_style_reward/std": 0.43841719649732114,
"rewards/table_style_reward/mean": 1.9646680784225463,
"rewards/table_style_reward/std": 0.2879634938389063,
"step": 2950
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 395.38,
"completions/max_terminated_length": 395.38,
"completions/mean_length": 328.125,
"completions/mean_terminated_length": 328.125,
"completions/min_length": 269.92,
"completions/min_terminated_length": 269.92,
"epoch": 2.0,
"frac_reward_zero_std": 0.15,
"grad_norm": 2.558583506193219,
"learning_rate": 5.001666666666666e-07,
"loss": 0.0015,
"num_tokens": 26356915.0,
"reward": 9.858510990142822,
"reward_std": 0.2589301385357976,
"rewards/accuracy_reward/mean": 0.8175,
"rewards/accuracy_reward/std": 0.23172805666923524,
"rewards/chart_type_reward/mean": 0.8875,
"rewards/chart_type_reward/std": 0.12651368021965026,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6852520942687987,
"rewards/process_style_reward/std": 0.3970717826485634,
"rewards/table_style_reward/mean": 1.968258823156357,
"rewards/table_style_reward/std": 0.29191130749881267,
"step": 3000
},
{
"epoch": 2.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 442.64,
"eval_completions/max_terminated_length": 442.64,
"eval_completions/mean_length": 303.75625,
"eval_completions/mean_terminated_length": 303.75625,
"eval_completions/min_length": 209.52,
"eval_completions/min_terminated_length": 209.52,
"eval_frac_reward_zero_std": 0.695,
"eval_loss": 0.0015344952698796988,
"eval_num_tokens": 26356915.0,
"eval_reward": 7.581516437530517,
"eval_reward_std": 0.062049417817033825,
"eval_rewards/accuracy_reward/mean": 0.85375,
"eval_rewards/accuracy_reward/std": 0.3139296269416809,
"eval_rewards/chart_type_reward/mean": 0.62875,
"eval_rewards/chart_type_reward/std": 0.4629242014884949,
"eval_rewards/format_reward/mean": 2.0,
"eval_rewards/format_reward/std": 0.0,
"eval_rewards/length_think_reward/mean": 1.5,
"eval_rewards/length_think_reward/std": 0.0,
"eval_rewards/num_token_reward/mean": 1.0,
"eval_rewards/num_token_reward/std": 0.0,
"eval_rewards/process_style_reward/mean": 0.840136501789093,
"eval_rewards/process_style_reward/std": 0.2791509646177292,
"eval_rewards/table_style_reward/mean": 0.7588799333572388,
"eval_rewards/table_style_reward/std": 0.02718144789338112,
"eval_runtime": 333.1256,
"eval_samples_per_second": 0.6,
"eval_steps_per_second": 0.021,
"step": 3000
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 375.1,
"completions/max_terminated_length": 375.1,
"completions/mean_length": 312.9625,
"completions/mean_terminated_length": 312.9625,
"completions/min_length": 254.74,
"completions/min_terminated_length": 254.74,
"epoch": 2.033333333333333,
"frac_reward_zero_std": 0.21,
"grad_norm": 2.2475136992847617,
"learning_rate": 4.918333333333333e-07,
"loss": -0.0057,
"num_tokens": 26775592.0,
"reward": 9.69863618850708,
"reward_std": 0.28390467911958694,
"rewards/accuracy_reward/mean": 0.7375,
"rewards/accuracy_reward/std": 0.2776748961210251,
"rewards/chart_type_reward/mean": 0.8975,
"rewards/chart_type_reward/std": 0.10656502962112427,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6072617268562317,
"rewards/process_style_reward/std": 0.3515561890602112,
"rewards/table_style_reward/mean": 1.956374499797821,
"rewards/table_style_reward/std": 0.3190466545522213,
"step": 3050
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 388.64,
"completions/max_terminated_length": 388.64,
"completions/mean_length": 331.075,
"completions/mean_terminated_length": 331.075,
"completions/min_length": 278.64,
"completions/min_terminated_length": 278.64,
"epoch": 2.066666666666667,
"frac_reward_zero_std": 0.21,
"grad_norm": 1.6861060018846619,
"learning_rate": 4.835e-07,
"loss": 0.0028,
"num_tokens": 27200726.0,
"reward": 9.909198265075684,
"reward_std": 0.26314487379044293,
"rewards/accuracy_reward/mean": 0.7575,
"rewards/accuracy_reward/std": 0.24443480968475342,
"rewards/chart_type_reward/mean": 0.92,
"rewards/chart_type_reward/std": 0.08552359580993653,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.714371042251587,
"rewards/process_style_reward/std": 0.40186877727508546,
"rewards/table_style_reward/mean": 2.017327206134796,
"rewards/table_style_reward/std": 0.3402495227381587,
"step": 3100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0025,
"completions/max_length": 411.52,
"completions/max_terminated_length": 411.02,
"completions/mean_length": 335.7075,
"completions/mean_terminated_length": 335.0121435546875,
"completions/min_length": 269.54,
"completions/min_terminated_length": 269.54,
"epoch": 2.1,
"frac_reward_zero_std": 0.18,
"grad_norm": 1.7582515763042907,
"learning_rate": 4.7516666666666667e-07,
"loss": -0.0049,
"num_tokens": 27628089.0,
"reward": 9.542483215332032,
"reward_std": 0.29263645596802235,
"rewards/accuracy_reward/mean": 0.7425,
"rewards/accuracy_reward/std": 0.29206632256507875,
"rewards/chart_type_reward/mean": 0.88,
"rewards/chart_type_reward/std": 0.10690449476242066,
"rewards/format_reward/mean": 1.995,
"rewards/format_reward/std": 0.014142135381698609,
"rewards/length_think_reward/mean": 1.498125,
"rewards/length_think_reward/std": 0.005303300619125366,
"rewards/num_token_reward/mean": 0.9975,
"rewards/num_token_reward/std": 0.007071067690849304,
"rewards/process_style_reward/mean": 1.4842811024188995,
"rewards/process_style_reward/std": 0.40244101256132125,
"rewards/table_style_reward/mean": 1.9450771474838258,
"rewards/table_style_reward/std": 0.3524085796624422,
"step": 3150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.78,
"completions/max_terminated_length": 389.78,
"completions/mean_length": 324.99,
"completions/mean_terminated_length": 324.99,
"completions/min_length": 267.42,
"completions/min_terminated_length": 267.42,
"epoch": 2.1333333333333333,
"frac_reward_zero_std": 0.23,
"grad_norm": 5.2003103180671655,
"learning_rate": 4.668333333333333e-07,
"loss": -0.0014,
"num_tokens": 28050929.0,
"reward": 9.795648279190063,
"reward_std": 0.27396415136754515,
"rewards/accuracy_reward/mean": 0.745,
"rewards/accuracy_reward/std": 0.26900545120239255,
"rewards/chart_type_reward/mean": 0.96,
"rewards/chart_type_reward/std": 0.042761797904968264,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6426741647720338,
"rewards/process_style_reward/std": 0.4128600428253412,
"rewards/table_style_reward/mean": 1.947974135875702,
"rewards/table_style_reward/std": 0.27850259508937597,
"step": 3200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0075,
"completions/max_length": 378.94,
"completions/max_terminated_length": 378.38,
"completions/mean_length": 312.8675,
"completions/mean_terminated_length": 310.1270001220703,
"completions/min_length": 253.22,
"completions/min_terminated_length": 253.22,
"epoch": 2.1666666666666665,
"frac_reward_zero_std": 0.27,
"grad_norm": 8.03823522353211,
"learning_rate": 4.585e-07,
"loss": 0.003,
"num_tokens": 28468656.0,
"reward": 9.798268947601319,
"reward_std": 0.23171548346057536,
"rewards/accuracy_reward/mean": 0.8225,
"rewards/accuracy_reward/std": 0.21219169199466706,
"rewards/chart_type_reward/mean": 0.895,
"rewards/chart_type_reward/std": 0.11616269588470458,
"rewards/format_reward/mean": 1.985,
"rewards/format_reward/std": 0.02070196866989136,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.9925,
"rewards/num_token_reward/std": 0.01035098433494568,
"rewards/process_style_reward/mean": 1.56619358420372,
"rewards/process_style_reward/std": 0.4133440139889717,
"rewards/table_style_reward/mean": 2.037075364589691,
"rewards/table_style_reward/std": 0.29497910317033527,
"step": 3250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 354.38,
"completions/max_terminated_length": 354.38,
"completions/mean_length": 292.085,
"completions/mean_terminated_length": 292.085,
"completions/min_length": 233.4,
"completions/min_terminated_length": 233.4,
"epoch": 2.2,
"frac_reward_zero_std": 0.26,
"grad_norm": 3.7658961156032484,
"learning_rate": 4.5016666666666664e-07,
"loss": -0.0009,
"num_tokens": 28878814.0,
"reward": 9.694805870056152,
"reward_std": 0.183432172909379,
"rewards/accuracy_reward/mean": 0.85,
"rewards/accuracy_reward/std": 0.13963742017745973,
"rewards/chart_type_reward/mean": 0.9125,
"rewards/chart_type_reward/std": 0.07306143283843994,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5138389551639557,
"rewards/process_style_reward/std": 0.44110236927866936,
"rewards/table_style_reward/mean": 1.9184669041633606,
"rewards/table_style_reward/std": 0.3392397094517946,
"step": 3300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 384.02,
"completions/max_terminated_length": 384.02,
"completions/mean_length": 319.64,
"completions/mean_terminated_length": 319.64,
"completions/min_length": 260.52,
"completions/min_terminated_length": 260.52,
"epoch": 2.2333333333333334,
"frac_reward_zero_std": 0.26,
"grad_norm": 2.7631543444157165,
"learning_rate": 4.4183333333333335e-07,
"loss": -0.0016,
"num_tokens": 29299838.0,
"reward": 9.804728012084961,
"reward_std": 0.2311769995908253,
"rewards/accuracy_reward/mean": 0.8175,
"rewards/accuracy_reward/std": 0.22684662878513337,
"rewards/chart_type_reward/mean": 0.91,
"rewards/chart_type_reward/std": 0.09621404528617859,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.630505404472351,
"rewards/process_style_reward/std": 0.40631200328469275,
"rewards/table_style_reward/mean": 1.9467226195335388,
"rewards/table_style_reward/std": 0.25668422447517514,
"step": 3350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 390.78,
"completions/max_terminated_length": 390.78,
"completions/mean_length": 311.04,
"completions/mean_terminated_length": 311.04,
"completions/min_length": 244.4,
"completions/min_terminated_length": 244.4,
"epoch": 2.2666666666666666,
"frac_reward_zero_std": 0.12,
"grad_norm": 3.7039274908559348,
"learning_rate": 4.3349999999999996e-07,
"loss": -0.0003,
"num_tokens": 29716978.0,
"reward": 9.710132846832275,
"reward_std": 0.27550872176885605,
"rewards/accuracy_reward/mean": 0.78,
"rewards/accuracy_reward/std": 0.258406742811203,
"rewards/chart_type_reward/mean": 0.95,
"rewards/chart_type_reward/std": 0.05345224738121033,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5488644635677338,
"rewards/process_style_reward/std": 0.4340578323602676,
"rewards/table_style_reward/mean": 1.9312683844566345,
"rewards/table_style_reward/std": 0.29763940557837487,
"step": 3400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 365.52,
"completions/max_terminated_length": 365.52,
"completions/mean_length": 307.235,
"completions/mean_terminated_length": 307.235,
"completions/min_length": 259.8,
"completions/min_terminated_length": 259.8,
"epoch": 2.3,
"frac_reward_zero_std": 0.19,
"grad_norm": 3.2607533668304947,
"learning_rate": 4.2516666666666667e-07,
"loss": -0.0013,
"num_tokens": 30133056.0,
"reward": 9.773553447723389,
"reward_std": 0.30064396366477014,
"rewards/accuracy_reward/mean": 0.7575,
"rewards/accuracy_reward/std": 0.2782620853185654,
"rewards/chart_type_reward/mean": 0.9225,
"rewards/chart_type_reward/std": 0.08518413066864014,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6084586906433105,
"rewards/process_style_reward/std": 0.41134398311376574,
"rewards/table_style_reward/mean": 1.9850947809219361,
"rewards/table_style_reward/std": 0.31916536355391145,
"step": 3450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 375.18,
"completions/max_terminated_length": 374.2,
"completions/mean_length": 316.03,
"completions/mean_terminated_length": 313.87,
"completions/min_length": 264.38,
"completions/min_terminated_length": 264.38,
"epoch": 2.3333333333333335,
"frac_reward_zero_std": 0.19,
"grad_norm": 1.990920818414865,
"learning_rate": 4.1683333333333333e-07,
"loss": 0.0034,
"num_tokens": 30552900.0,
"reward": 9.88123031616211,
"reward_std": 0.2634401721076574,
"rewards/accuracy_reward/mean": 0.78,
"rewards/accuracy_reward/std": 0.2654762434959412,
"rewards/chart_type_reward/mean": 0.9525,
"rewards/chart_type_reward/std": 0.05311278223991394,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.01851640224456787,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.009258201122283935,
"rewards/process_style_reward/mean": 1.6290400648117065,
"rewards/process_style_reward/std": 0.41983593456447127,
"rewards/table_style_reward/mean": 2.0346901965141297,
"rewards/table_style_reward/std": 0.28429963132366537,
"step": 3500
},
{
"epoch": 2.3333333333333335,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 434.36,
"eval_completions/max_terminated_length": 434.36,
"eval_completions/mean_length": 297.61625,
"eval_completions/mean_terminated_length": 297.61625,
"eval_completions/min_length": 204.8,
"eval_completions/min_terminated_length": 204.8,
"eval_frac_reward_zero_std": 0.665,
"eval_loss": 0.0004025402304250747,
"eval_num_tokens": 30552900.0,
"eval_reward": 7.552074928283691,
"eval_reward_std": 0.0732093141740188,
"eval_rewards/accuracy_reward/mean": 0.855,
"eval_rewards/accuracy_reward/std": 0.3025389724969864,
"eval_rewards/chart_type_reward/mean": 0.6025,
"eval_rewards/chart_type_reward/std": 0.4713040769100189,
"eval_rewards/format_reward/mean": 2.0,
"eval_rewards/format_reward/std": 0.0,
"eval_rewards/length_think_reward/mean": 1.5,
"eval_rewards/length_think_reward/std": 0.0,
"eval_rewards/num_token_reward/mean": 1.0,
"eval_rewards/num_token_reward/std": 0.0,
"eval_rewards/process_style_reward/mean": 0.8350749087333679,
"eval_rewards/process_style_reward/std": 0.28666666328907014,
"eval_rewards/table_style_reward/mean": 0.7595000004768372,
"eval_rewards/table_style_reward/std": 0.023632641285657882,
"eval_runtime": 327.7189,
"eval_samples_per_second": 0.61,
"eval_steps_per_second": 0.021,
"step": 3500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 368.26,
"completions/max_terminated_length": 368.26,
"completions/mean_length": 310.085,
"completions/mean_terminated_length": 310.085,
"completions/min_length": 255.86,
"completions/min_terminated_length": 255.86,
"epoch": 2.3666666666666667,
"frac_reward_zero_std": 0.15,
"grad_norm": 2.559054191104846,
"learning_rate": 4.0849999999999993e-07,
"loss": -0.0016,
"num_tokens": 30970094.0,
"reward": 9.84461862564087,
"reward_std": 0.31480611886829135,
"rewards/accuracy_reward/mean": 0.7875,
"rewards/accuracy_reward/std": 0.24949051320552826,
"rewards/chart_type_reward/mean": 0.9325,
"rewards/chart_type_reward/std": 0.08047196567058564,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6563871276378632,
"rewards/process_style_reward/std": 0.47597916625440123,
"rewards/table_style_reward/mean": 1.9682315516471862,
"rewards/table_style_reward/std": 0.31802774131298067,
"step": 3550
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 392.88,
"completions/max_terminated_length": 392.72,
"completions/mean_length": 328.53,
"completions/mean_terminated_length": 327.8175,
"completions/min_length": 269.42,
"completions/min_terminated_length": 269.42,
"epoch": 2.4,
"frac_reward_zero_std": 0.2,
"grad_norm": 1.3441577493975225,
"learning_rate": 4.0016666666666664e-07,
"loss": 0.0021,
"num_tokens": 31394678.0,
"reward": 9.768132333755494,
"reward_std": 0.2865133846178651,
"rewards/accuracy_reward/mean": 0.7325,
"rewards/accuracy_reward/std": 0.3207777261734009,
"rewards/chart_type_reward/mean": 0.8875,
"rewards/chart_type_reward/std": 0.11254331409931183,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.01851640224456787,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.009258201122283935,
"rewards/process_style_reward/mean": 1.6195254147052764,
"rewards/process_style_reward/std": 0.4645379837602377,
"rewards/table_style_reward/mean": 2.0436069059371946,
"rewards/table_style_reward/std": 0.28304173408076166,
"step": 3600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 357.54,
"completions/max_terminated_length": 357.54,
"completions/mean_length": 299.01,
"completions/mean_terminated_length": 299.01,
"completions/min_length": 244.34,
"completions/min_terminated_length": 244.34,
"epoch": 2.4333333333333336,
"frac_reward_zero_std": 0.21,
"grad_norm": 2.849518244313829,
"learning_rate": 3.918333333333333e-07,
"loss": 0.0003,
"num_tokens": 31807526.0,
"reward": 9.800979099273682,
"reward_std": 0.25777424886822703,
"rewards/accuracy_reward/mean": 0.7925,
"rewards/accuracy_reward/std": 0.24965977609157564,
"rewards/chart_type_reward/mean": 0.89,
"rewards/chart_type_reward/std": 0.11759494423866272,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5430667841434478,
"rewards/process_style_reward/std": 0.4256218123435974,
"rewards/table_style_reward/mean": 2.075412368774414,
"rewards/table_style_reward/std": 0.3282872153446078,
"step": 3650
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 370.08,
"completions/max_terminated_length": 370.08,
"completions/mean_length": 309.9025,
"completions/mean_terminated_length": 309.9025,
"completions/min_length": 254.94,
"completions/min_terminated_length": 254.94,
"epoch": 2.466666666666667,
"frac_reward_zero_std": 0.22,
"grad_norm": 2.1631571281627955,
"learning_rate": 3.835e-07,
"loss": -0.0024,
"num_tokens": 32224583.0,
"reward": 9.830465087890625,
"reward_std": 0.23616183903533966,
"rewards/accuracy_reward/mean": 0.755,
"rewards/accuracy_reward/std": 0.27062118172645566,
"rewards/chart_type_reward/mean": 0.925,
"rewards/chart_type_reward/std": 0.08409134745597839,
"rewards/format_reward/mean": 1.995,
"rewards/format_reward/std": 0.014142135381698609,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 0.9975,
"rewards/num_token_reward/std": 0.007071067690849304,
"rewards/process_style_reward/mean": 1.6658795142173768,
"rewards/process_style_reward/std": 0.4328185883164406,
"rewards/table_style_reward/mean": 1.992085530757904,
"rewards/table_style_reward/std": 0.30545895665884015,
"step": 3700
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.005,
"completions/max_length": 374.04,
"completions/max_terminated_length": 374.02,
"completions/mean_length": 316.38,
"completions/mean_terminated_length": 315.13416748046876,
"completions/min_length": 261.26,
"completions/min_terminated_length": 261.26,
"epoch": 2.5,
"frac_reward_zero_std": 0.18,
"grad_norm": 3.743160685463559,
"learning_rate": 3.751666666666666e-07,
"loss": -0.0015,
"num_tokens": 32644799.0,
"reward": 9.782908029556275,
"reward_std": 0.2735483956709504,
"rewards/accuracy_reward/mean": 0.825,
"rewards/accuracy_reward/std": 0.2005802285671234,
"rewards/chart_type_reward/mean": 0.91,
"rewards/chart_type_reward/std": 0.07483314633369446,
"rewards/format_reward/mean": 1.99,
"rewards/format_reward/std": 0.01851640224456787,
"rewards/length_think_reward/mean": 1.499375,
"rewards/length_think_reward/std": 0.001767766922712326,
"rewards/num_token_reward/mean": 0.995,
"rewards/num_token_reward/std": 0.009258201122283935,
"rewards/process_style_reward/mean": 1.5454747760295868,
"rewards/process_style_reward/std": 0.431058616489172,
"rewards/table_style_reward/mean": 2.0180582451820372,
"rewards/table_style_reward/std": 0.3305081824213266,
"step": 3750
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 400.0,
"completions/max_terminated_length": 400.0,
"completions/mean_length": 333.855,
"completions/mean_terminated_length": 333.855,
"completions/min_length": 273.8,
"completions/min_terminated_length": 273.8,
"epoch": 2.533333333333333,
"frac_reward_zero_std": 0.2,
"grad_norm": 5.8673809839930975,
"learning_rate": 3.6683333333333333e-07,
"loss": -0.0002,
"num_tokens": 33072297.0,
"reward": 9.738230361938477,
"reward_std": 0.2538567354902625,
"rewards/accuracy_reward/mean": 0.7525,
"rewards/accuracy_reward/std": 0.27658367991447447,
"rewards/chart_type_reward/mean": 0.9,
"rewards/chart_type_reward/std": 0.10690449476242066,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.6415416550636293,
"rewards/process_style_reward/std": 0.433953458070755,
"rewards/table_style_reward/mean": 1.9441887497901917,
"rewards/table_style_reward/std": 0.32067165344953535,
"step": 3800
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 385.32,
"completions/max_terminated_length": 385.32,
"completions/mean_length": 327.8975,
"completions/mean_terminated_length": 327.8975,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"epoch": 2.5666666666666664,
"frac_reward_zero_std": 0.19,
"grad_norm": 3.653425622305845,
"learning_rate": 3.585e-07,
"loss": -0.0016,
"num_tokens": 33496748.0,
"reward": 9.902756881713866,
"reward_std": 0.27108980235410857,
"rewards/accuracy_reward/mean": 0.8125,
"rewards/accuracy_reward/std": 0.20225769460201262,
"rewards/chart_type_reward/mean": 0.89,
"rewards/chart_type_reward/std": 0.11759494423866272,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.700443229675293,
"rewards/process_style_reward/std": 0.37070401668548586,
"rewards/table_style_reward/mean": 1.9998136401176452,
"rewards/table_style_reward/std": 0.26890223439782857,
"step": 3850
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 405.18,
"completions/max_terminated_length": 405.18,
"completions/mean_length": 334.3775,
"completions/mean_terminated_length": 334.3775,
"completions/min_length": 270.76,
"completions/min_terminated_length": 270.76,
"epoch": 2.6,
"frac_reward_zero_std": 0.21,
"grad_norm": 0.0,
"learning_rate": 3.5016666666666665e-07,
"loss": 0.0018,
"num_tokens": 33923707.0,
"reward": 9.698110904693603,
"reward_std": 0.2213594539882615,
"rewards/accuracy_reward/mean": 0.7375,
"rewards/accuracy_reward/std": 0.27844556748867033,
"rewards/chart_type_reward/mean": 0.91,
"rewards/chart_type_reward/std": 0.1040399980545044,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.4740183496475219,
"rewards/process_style_reward/std": 0.416292352899909,
"rewards/table_style_reward/mean": 2.076592493057251,
"rewards/table_style_reward/std": 0.3206081053614616,
"step": 3900
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 381.28,
"completions/max_terminated_length": 381.28,
"completions/mean_length": 318.2575,
"completions/mean_terminated_length": 318.2575,
"completions/min_length": 256.24,
"completions/min_terminated_length": 256.24,
"epoch": 2.6333333333333333,
"frac_reward_zero_std": 0.23,
"grad_norm": 0.0,
"learning_rate": 3.418333333333333e-07,
"loss": -0.0002,
"num_tokens": 34343654.0,
"reward": 9.784586238861085,
"reward_std": 0.21556221422739327,
"rewards/accuracy_reward/mean": 0.77,
"rewards/accuracy_reward/std": 0.24662194311618804,
"rewards/chart_type_reward/mean": 0.95,
"rewards/chart_type_reward/std": 0.03207134842872619,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.499375,
"rewards/length_think_reward/std": 0.001767766922712326,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5625994729995727,
"rewards/process_style_reward/std": 0.4333411505073309,
"rewards/table_style_reward/mean": 2.002611801624298,
"rewards/table_style_reward/std": 0.2887386105395853,
"step": 3950
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 385.1,
"completions/max_terminated_length": 385.1,
"completions/mean_length": 332.6775,
"completions/mean_terminated_length": 332.6775,
"completions/min_length": 280.72,
"completions/min_terminated_length": 280.72,
"epoch": 2.6666666666666665,
"frac_reward_zero_std": 0.2,
"grad_norm": 3.284695127062566,
"learning_rate": 3.335e-07,
"loss": -0.0007,
"num_tokens": 34769621.0,
"reward": 9.657586326599121,
"reward_std": 0.25254386219428854,
"rewards/accuracy_reward/mean": 0.73,
"rewards/accuracy_reward/std": 0.30427919447422025,
"rewards/chart_type_reward/mean": 0.93,
"rewards/chart_type_reward/std": 0.07483314633369446,
"rewards/format_reward/mean": 2.0,
"rewards/format_reward/std": 0.0,
"rewards/length_think_reward/mean": 1.5,
"rewards/length_think_reward/std": 0.0,
"rewards/num_token_reward/mean": 1.0,
"rewards/num_token_reward/std": 0.0,
"rewards/process_style_reward/mean": 1.5347757422924042,
"rewards/process_style_reward/std": 0.3837696108222008,
"rewards/table_style_reward/mean": 1.9628105711936952,
"rewards/table_style_reward/std": 0.36233584862202406,
"step": 4000
},
{
"epoch": 2.6666666666666665,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 440.6,
"eval_completions/max_terminated_length": 440.6,
"eval_completions/mean_length": 307.24125,
"eval_completions/mean_terminated_length": 307.24125,
"eval_completions/min_length": 217.76,
"eval_completions/min_terminated_length": 217.76,
"eval_frac_reward_zero_std": 0.71,
"eval_loss": -0.0005738374311476946,
"eval_num_tokens": 34769621.0,
"eval_reward": 7.5656030654907225,
"eval_reward_std": 0.0531126305134967,
"eval_rewards/accuracy_reward/mean": 0.8525,
"eval_rewards/accuracy_reward/std": 0.2918598711490631,
"eval_rewards/chart_type_reward/mean": 0.61375,
"eval_rewards/chart_type_reward/std": 0.466957231760025,
"eval_rewards/format_reward/mean": 2.0,
"eval_rewards/format_reward/std": 0.0,
"eval_rewards/length_think_reward/mean": 1.5,
"eval_rewards/length_think_reward/std": 0.0,
"eval_rewards/num_token_reward/mean": 1.0,
"eval_rewards/num_token_reward/std": 0.0,
"eval_rewards/process_style_reward/mean": 0.8417280244827271,
"eval_rewards/process_style_reward/std": 0.2765642327070236,
"eval_rewards/table_style_reward/mean": 0.7576250004768371,
"eval_rewards/table_style_reward/std": 0.030703708827495575,
"eval_runtime": 331.3654,
"eval_samples_per_second": 0.604,
"eval_steps_per_second": 0.021,
"step": 4000
}
],
"logging_steps": 50,
"max_steps": 6000,
"num_input_tokens_seen": 34769621,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}