{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2,
  "eval_steps": 500,
  "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 361.75,
      "epoch": 0.004,
      "grad_norm": 0.0693558007478714,
      "kl": 0.0,
      "learning_rate": 0.0,
      "loss": -0.0,
      "reward": 0.02345000021159649,
      "reward_std": 0.04690000042319298,
      "rewards/pot_combined_reward": 0.02345000021159649,
      "step": 1
    },
    {
      "completion_length": 371.375,
      "epoch": 0.008,
      "grad_norm": 0.08951307833194733,
      "kl": 0.0,
      "learning_rate": 5.000000000000001e-07,
      "loss": -0.0,
      "reward": 0.026133334264159203,
      "reward_std": 0.052266668528318405,
      "rewards/pot_combined_reward": 0.026133334264159203,
      "step": 2
    },
    {
      "completion_length": 374.0,
      "epoch": 0.012,
      "grad_norm": 0.001663331058807671,
      "kl": 0.0005528016190510243,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 3
    },
    {
      "completion_length": 374.0,
      "epoch": 0.016,
      "grad_norm": 0.0016158577054738998,
      "kl": 0.0005013404006604105,
      "learning_rate": 1.5e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 4
    },
    {
      "completion_length": 373.375,
      "epoch": 0.02,
      "grad_norm": 0.0030017346143722534,
      "kl": 0.0005660907772835344,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 5
    },
    {
      "completion_length": 373.5625,
      "epoch": 0.024,
      "grad_norm": 0.0015043334569782019,
      "kl": 0.0005426810312201269,
      "learning_rate": 2.5e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 6
    },
    {
      "completion_length": 364.5,
      "epoch": 0.028,
      "grad_norm": 0.08007726073265076,
      "kl": 0.0005026786457165144,
      "learning_rate": 3e-06,
      "loss": 0.0001,
      "reward": 0.03146666660904884,
      "reward_std": 0.06293333321809769,
      "rewards/pot_combined_reward": 0.03146666660904884,
      "step": 7
    },
    {
      "completion_length": 374.0,
      "epoch": 0.032,
      "grad_norm": 0.0016093035228550434,
      "kl": 0.0005015511706005782,
      "learning_rate": 3.5e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 8
    },
    {
      "completion_length": 353.0625,
      "epoch": 0.036,
      "grad_norm": 0.001784446300007403,
      "kl": 0.0003549655375536531,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 9
    },
    {
      "completion_length": 374.0,
      "epoch": 0.04,
      "grad_norm": 0.0020515809301286936,
      "kl": 0.0005762250584666617,
      "learning_rate": 4.5e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 10
    },
    {
      "completion_length": 374.0,
      "epoch": 0.044,
      "grad_norm": 0.06735244393348694,
      "kl": 0.0005008808220736682,
      "learning_rate": 5e-06,
      "loss": 0.0001,
      "reward": 0.01808333396911621,
      "reward_std": 0.03616666793823242,
      "rewards/pot_combined_reward": 0.01808333396911621,
      "step": 11
    },
    {
      "completion_length": 342.5,
      "epoch": 0.048,
      "grad_norm": 0.0024033007211983204,
      "kl": 0.00046441886661341414,
      "learning_rate": 4.99847706754774e-06,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 12
    },
    {
      "completion_length": 374.0,
      "epoch": 0.052,
      "grad_norm": 0.0014815045287832618,
      "kl": 0.0004604290661518462,
      "learning_rate": 4.993910125649561e-06,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 13
    },
    {
      "completion_length": 374.0,
      "epoch": 0.056,
      "grad_norm": 0.001548771746456623,
      "kl": 0.0004989306180505082,
      "learning_rate": 4.986304738420684e-06,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 14
    },
    {
      "completion_length": 360.8125,
      "epoch": 0.06,
      "grad_norm": 0.001822226564399898,
      "kl": 0.00046228048086049967,
      "learning_rate": 4.975670171853926e-06,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 15
    },
    {
      "completion_length": 374.0,
      "epoch": 0.064,
      "grad_norm": 0.09072617441415787,
      "kl": 0.000476861278002616,
      "learning_rate": 4.962019382530521e-06,
      "loss": 0.0,
      "reward": 0.012600000016391277,
      "reward_std": 0.025200000032782555,
      "rewards/pot_combined_reward": 0.012600000016391277,
      "step": 16
    },
    {
      "completion_length": 374.0,
      "epoch": 0.068,
      "grad_norm": 0.0015437575057148933,
      "kl": 0.0005299622716847807,
      "learning_rate": 4.9453690018345144e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 17
    },
    {
      "completion_length": 374.0,
      "epoch": 0.072,
      "grad_norm": 0.0016528957057744265,
      "kl": 0.0005346549514797516,
      "learning_rate": 4.925739315689991e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 18
    },
    {
      "completion_length": 374.0,
      "epoch": 0.076,
      "grad_norm": 0.0017183530144393444,
      "kl": 0.0004841076224693097,
      "learning_rate": 4.903154239845798e-06,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 19
    },
    {
      "completion_length": 373.5625,
      "epoch": 0.08,
      "grad_norm": 0.08476348221302032,
      "kl": 0.000539450986252632,
      "learning_rate": 4.8776412907378845e-06,
      "loss": 0.0001,
      "reward": 0.026249999180436134,
      "reward_std": 0.05249999836087227,
      "rewards/pot_combined_reward": 0.026249999180436134,
      "step": 20
    },
    {
      "completion_length": 374.0,
      "epoch": 0.084,
      "grad_norm": 0.07820821553468704,
      "kl": 0.0005957721295999363,
      "learning_rate": 4.849231551964771e-06,
      "loss": 0.0001,
      "reward": 0.07116249948740005,
      "reward_std": 0.1423249989748001,
      "rewards/pot_combined_reward": 0.07116249948740005,
      "step": 21
    },
    {
      "completion_length": 373.5625,
      "epoch": 0.088,
      "grad_norm": 0.0016008545644581318,
      "kl": 0.000533243379322812,
      "learning_rate": 4.817959636416969e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 22
    },
    {
      "completion_length": 374.0,
      "epoch": 0.092,
      "grad_norm": 0.0018651616992428899,
      "kl": 0.0005620143201667815,
      "learning_rate": 4.783863644106502e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 23
    },
    {
      "completion_length": 374.0,
      "epoch": 0.096,
      "grad_norm": 0.001895356923341751,
      "kl": 0.00047788477240828797,
      "learning_rate": 4.746985115747918e-06,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 24
    },
    {
      "completion_length": 369.75,
      "epoch": 0.1,
      "grad_norm": 0.11337540298700333,
      "kl": 0.00047047801490407437,
      "learning_rate": 4.707368982147318e-06,
      "loss": 0.0,
      "reward": 0.04736666567623615,
      "reward_std": 0.0947333313524723,
      "rewards/pot_combined_reward": 0.04736666567623615,
      "step": 25
    },
    {
      "completion_length": 368.625,
      "epoch": 0.104,
      "grad_norm": 0.01079186424612999,
      "kl": 0.0006513141634059139,
      "learning_rate": 4.665063509461098e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 26
    },
    {
      "completion_length": 374.0,
      "epoch": 0.108,
      "grad_norm": 0.0016566955018788576,
      "kl": 0.0005965056043351069,
      "learning_rate": 4.620120240391065e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 27
    },
    {
      "completion_length": 374.0,
      "epoch": 0.112,
      "grad_norm": 0.001792517607100308,
      "kl": 0.0005393773099058308,
      "learning_rate": 4.572593931387604e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 28
    },
    {
      "completion_length": 374.0,
      "epoch": 0.116,
      "grad_norm": 0.0017438618233427405,
      "kl": 0.0005095232809253503,
      "learning_rate": 4.522542485937369e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 29
    },
    {
      "completion_length": 374.0,
      "epoch": 0.12,
      "grad_norm": 0.0015684061218053102,
      "kl": 0.00047055614413693547,
      "learning_rate": 4.470026884016805e-06,
      "loss": 0.0,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 30
    },
    {
      "completion_length": 374.0,
      "epoch": 0.124,
      "grad_norm": 0.0019608919974416494,
      "kl": 0.0005961552087683231,
      "learning_rate": 4.415111107797445e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 31
    },
    {
      "completion_length": 372.0625,
      "epoch": 0.128,
      "grad_norm": 0.0015237935585901141,
      "kl": 0.0005326158570824191,
      "learning_rate": 4.357862063693486e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 32
    },
    {
      "completion_length": 374.0,
      "epoch": 0.132,
      "grad_norm": 0.002203061943873763,
      "kl": 0.0006071907628211193,
      "learning_rate": 4.2983495008466285e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 33
    },
    {
      "completion_length": 370.5,
      "epoch": 0.136,
      "grad_norm": 0.10126212984323502,
      "kl": 0.0006130525580374524,
      "learning_rate": 4.236645926147493e-06,
      "loss": 0.0001,
      "reward": 0.061249999329447746,
      "reward_std": 0.12249999865889549,
      "rewards/pot_combined_reward": 0.061249999329447746,
      "step": 34
    },
    {
      "completion_length": 374.0,
      "epoch": 0.14,
      "grad_norm": 0.08621055632829666,
      "kl": 0.0004927485424559563,
      "learning_rate": 4.172826515897146e-06,
      "loss": 0.0,
      "reward": 0.024966666474938393,
      "reward_std": 0.049933332949876785,
      "rewards/pot_combined_reward": 0.024966666474938393,
      "step": 35
    },
    {
      "completion_length": 374.0,
      "epoch": 0.144,
      "grad_norm": 0.0017127083847299218,
      "kl": 0.0005035524372942746,
      "learning_rate": 4.106969024216348e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 36
    },
    {
      "completion_length": 374.0,
      "epoch": 0.148,
      "grad_norm": 0.0018064226023852825,
      "kl": 0.0005530964990612119,
      "learning_rate": 4.039153688314146e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 37
    },
    {
      "completion_length": 373.8125,
      "epoch": 0.152,
      "grad_norm": 0.0019408023217692971,
      "kl": 0.0006417437689378858,
      "learning_rate": 3.969463130731183e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 38
    },
    {
      "completion_length": 374.0,
      "epoch": 0.156,
      "grad_norm": 0.08421237021684647,
      "kl": 0.0006241849769139662,
      "learning_rate": 3.897982258676867e-06,
      "loss": 0.0001,
      "reward": 0.07303333282470703,
      "reward_std": 0.057444244623184204,
      "rewards/pot_combined_reward": 0.07303333282470703,
      "step": 39
    },
    {
      "completion_length": 374.0,
      "epoch": 0.16,
      "grad_norm": 0.08112610131502151,
      "kl": 0.0005679467285517603,
      "learning_rate": 3.824798160583012e-06,
      "loss": 0.0001,
      "reward": 0.014233333058655262,
      "reward_std": 0.028466666117310524,
      "rewards/pot_combined_reward": 0.014233333058655262,
      "step": 40
    },
    {
      "completion_length": 374.0,
      "epoch": 0.164,
      "grad_norm": 0.001669050194323063,
      "kl": 0.0005617116403300315,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 41
    },
    {
      "completion_length": 374.0,
      "epoch": 0.168,
      "grad_norm": 0.002355287317186594,
      "kl": 0.0006023006426403299,
      "learning_rate": 3.6736789069647273e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 42
    },
    {
      "completion_length": 328.8125,
      "epoch": 0.172,
      "grad_norm": 0.0023054229095578194,
      "kl": 0.0005200250307098031,
      "learning_rate": 3.595927866972694e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 43
    },
    {
      "completion_length": 374.0,
      "epoch": 0.176,
      "grad_norm": 0.11641528457403183,
      "kl": 0.0005873750924365595,
      "learning_rate": 3.516841607689501e-06,
      "loss": 0.0001,
      "reward": 0.07468749955296516,
      "reward_std": 0.14937499910593033,
      "rewards/pot_combined_reward": 0.07468749955296516,
      "step": 44
    },
    {
      "completion_length": 374.0,
      "epoch": 0.18,
      "grad_norm": 0.001737726735882461,
      "kl": 0.0005044558856752701,
      "learning_rate": 3.436516483539781e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 45
    },
    {
      "completion_length": 370.9375,
      "epoch": 0.184,
      "grad_norm": 0.0019563438836485147,
      "kl": 0.000592117925407365,
      "learning_rate": 3.3550503583141726e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 46
    },
    {
      "completion_length": 372.4375,
      "epoch": 0.188,
      "grad_norm": 0.0026125519070774317,
      "kl": 0.0005959889385849237,
      "learning_rate": 3.272542485937369e-06,
      "loss": 0.0001,
      "reward": 0.0,
      "reward_std": 0.0,
      "rewards/pot_combined_reward": 0.0,
      "step": 47
    },
    {
      "completion_length": 372.75,
      "epoch": 0.192,
      "grad_norm": 0.0820600688457489,
      "kl": 0.0005973696243017912,
      "learning_rate": 3.189093389542498e-06,
      "loss": 0.0001,
      "reward": 0.03968749940395355,
      "reward_std": 0.0793749988079071,
      "rewards/pot_combined_reward": 0.03968749940395355,
      "step": 48
    },
    {
      "completion_length": 374.0,
      "epoch": 0.196,
      "grad_norm": 0.06782057881355286,
      "kl": 0.0007923852826934308,
      "learning_rate": 3.1048047389991693e-06,
      "loss": 0.0001,
      "reward": 0.026249999180436134,
      "reward_std": 0.05249999836087227,
      "rewards/pot_combined_reward": 0.026249999180436134,
      "step": 49
    },
    {
      "completion_length": 374.0,
      "epoch": 0.2,
      "grad_norm": 0.10189752280712128,
      "kl": 0.0004997247888240963,
      "learning_rate": 3.019779227044398e-06,
      "loss": 0.0,
      "reward": 0.06999999843537807,
      "reward_std": 0.13999999687075615,
      "rewards/pot_combined_reward": 0.06999999843537807,
      "step": 50
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}