{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.924444444444444, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.35555555555555557, "grad_norm": 4.476593494415283, "learning_rate": 0.00016, "logits/chosen": -0.5491379499435425, "logits/rejected": -0.3598092198371887, "logps/chosen": -121.235107421875, "logps/rejected": -177.16122436523438, "loss": 0.6271, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.012747581116855145, "rewards/margins": 0.18439342081546783, "rewards/rejected": -0.171645849943161, "step": 10 }, { "epoch": 0.7111111111111111, "grad_norm": null, "learning_rate": 0.00019416666666666668, "logits/chosen": -0.8914409875869751, "logits/rejected": -0.8619183301925659, "logps/chosen": -170.39581298828125, "logps/rejected": -245.456787109375, "loss": 0.3518, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.338006019592285, "rewards/margins": 3.272592544555664, "rewards/rejected": -7.610598087310791, "step": 20 }, { "epoch": 1.0355555555555556, "grad_norm": 18.060880661010742, "learning_rate": 0.0001866666666666667, "logits/chosen": -0.23407170176506042, "logits/rejected": -0.19450390338897705, "logps/chosen": -234.69595336914062, "logps/rejected": -360.298828125, "loss": 0.1096, "rewards/accuracies": 0.9178082346916199, "rewards/chosen": -10.76107120513916, "rewards/margins": 8.50648021697998, "rewards/rejected": -19.267549514770508, "step": 30 }, { "epoch": 1.3911111111111112, "grad_norm": 27.05306625366211, "learning_rate": 0.00017833333333333335, "logits/chosen": 0.11382684856653214, "logits/rejected": 0.12602348625659943, "logps/chosen": -233.735107421875, "logps/rejected": -393.0685119628906, "loss": 0.0432, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.631088256835938, "rewards/margins": 12.194381713867188, "rewards/rejected": 
-22.825469970703125, "step": 40 }, { "epoch": 1.7466666666666666, "grad_norm": 10.061967849731445, "learning_rate": 0.00017, "logits/chosen": 0.10496137291193008, "logits/rejected": 0.022805940359830856, "logps/chosen": -221.74880981445312, "logps/rejected": -430.38739013671875, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -9.394009590148926, "rewards/margins": 15.525640487670898, "rewards/rejected": -24.919647216796875, "step": 50 }, { "epoch": 2.071111111111111, "grad_norm": 0.023419547826051712, "learning_rate": 0.00016166666666666668, "logits/chosen": 0.4416600167751312, "logits/rejected": 0.45946231484413147, "logps/chosen": -211.32363891601562, "logps/rejected": -403.36846923828125, "loss": 0.0294, "rewards/accuracies": 0.9726027250289917, "rewards/chosen": -9.822237014770508, "rewards/margins": 14.4923677444458, "rewards/rejected": -24.314605712890625, "step": 60 }, { "epoch": 2.4266666666666667, "grad_norm": 0.028293505311012268, "learning_rate": 0.00015333333333333334, "logits/chosen": -0.061886321753263474, "logits/rejected": -0.13684093952178955, "logps/chosen": -199.42222595214844, "logps/rejected": -398.5884704589844, "loss": 0.0174, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.217587471008301, "rewards/margins": 15.358650207519531, "rewards/rejected": -22.57623863220215, "step": 70 }, { "epoch": 2.7822222222222224, "grad_norm": 0.026505785062909126, "learning_rate": 0.000145, "logits/chosen": -0.11573155969381332, "logits/rejected": -0.0978168398141861, "logps/chosen": -196.46902465820312, "logps/rejected": -407.5648193359375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -6.986049652099609, "rewards/margins": 16.664913177490234, "rewards/rejected": -23.65096092224121, "step": 80 }, { "epoch": 3.1066666666666665, "grad_norm": 0.002059513470157981, "learning_rate": 0.00013666666666666666, "logits/chosen": -0.06037778779864311, "logits/rejected": 0.009283392690122128, "logps/chosen": -223.2586669921875, 
"logps/rejected": -426.4629821777344, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -9.722103118896484, "rewards/margins": 15.552120208740234, "rewards/rejected": -25.27422332763672, "step": 90 }, { "epoch": 3.462222222222222, "grad_norm": 0.0005333780427463353, "learning_rate": 0.00012833333333333335, "logits/chosen": 0.32319241762161255, "logits/rejected": 0.33545833826065063, "logps/chosen": -225.3436737060547, "logps/rejected": -463.82855224609375, "loss": 0.0174, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.732784271240234, "rewards/margins": 19.125728607177734, "rewards/rejected": -29.8585147857666, "step": 100 }, { "epoch": 3.8177777777777777, "grad_norm": 0.0003525021020323038, "learning_rate": 0.00012, "logits/chosen": 0.1985320746898651, "logits/rejected": 0.16121362149715424, "logps/chosen": -254.75979614257812, "logps/rejected": -500.808837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.534481048583984, "rewards/margins": 20.094514846801758, "rewards/rejected": -32.628997802734375, "step": 110 }, { "epoch": 4.142222222222222, "grad_norm": 0.0025702340062707663, "learning_rate": 0.00011166666666666668, "logits/chosen": 0.2640947699546814, "logits/rejected": 0.28702497482299805, "logps/chosen": -245.55862426757812, "logps/rejected": -473.2377014160156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.008322715759277, "rewards/margins": 19.013830184936523, "rewards/rejected": -31.022153854370117, "step": 120 }, { "epoch": 4.497777777777777, "grad_norm": 8.971437637228519e-06, "learning_rate": 0.00010333333333333334, "logits/chosen": 0.33078843355178833, "logits/rejected": 0.3624512553215027, "logps/chosen": -248.3406982421875, "logps/rejected": -493.0819396972656, "loss": 0.0173, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -12.471681594848633, "rewards/margins": 19.82614517211914, "rewards/rejected": -32.297828674316406, "step": 130 }, { "epoch": 4.8533333333333335, 
"grad_norm": 0.01190439984202385, "learning_rate": 9.5e-05, "logits/chosen": 0.27441588044166565, "logits/rejected": 0.2612026333808899, "logps/chosen": -253.7766571044922, "logps/rejected": -513.3739624023438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.685572624206543, "rewards/margins": 21.006938934326172, "rewards/rejected": -33.692508697509766, "step": 140 }, { "epoch": 5.177777777777778, "grad_norm": 0.00018138311861548573, "learning_rate": 8.666666666666667e-05, "logits/chosen": 0.2858441174030304, "logits/rejected": 0.2530161440372467, "logps/chosen": -237.8019561767578, "logps/rejected": -490.26483154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.838016510009766, "rewards/margins": 20.136157989501953, "rewards/rejected": -31.97417449951172, "step": 150 }, { "epoch": 5.533333333333333, "grad_norm": 9.341749682789668e-05, "learning_rate": 7.833333333333333e-05, "logits/chosen": 0.23903337121009827, "logits/rejected": 0.22848956286907196, "logps/chosen": -234.5439453125, "logps/rejected": -483.54998779296875, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.003255844116211, "rewards/margins": 20.23318099975586, "rewards/rejected": -31.236434936523438, "step": 160 }, { "epoch": 5.888888888888889, "grad_norm": 0.014991430565714836, "learning_rate": 7e-05, "logits/chosen": 0.2805466949939728, "logits/rejected": 0.32509756088256836, "logps/chosen": -250.9308624267578, "logps/rejected": -486.75579833984375, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -12.33309555053711, "rewards/margins": 19.815921783447266, "rewards/rejected": -32.149017333984375, "step": 170 }, { "epoch": 6.213333333333333, "grad_norm": 0.0005387673154473305, "learning_rate": 6.166666666666667e-05, "logits/chosen": 0.20652510225772858, "logits/rejected": 0.25150975584983826, "logps/chosen": -248.03903198242188, "logps/rejected": -490.83819580078125, "loss": 0.0087, "rewards/accuracies": 
0.9863013625144958, "rewards/chosen": -11.940282821655273, "rewards/margins": 19.73668098449707, "rewards/rejected": -31.67696189880371, "step": 180 }, { "epoch": 6.568888888888889, "grad_norm": 0.0032984877470880747, "learning_rate": 5.333333333333333e-05, "logits/chosen": 0.24716417491436005, "logits/rejected": 0.18740002810955048, "logps/chosen": -253.20565795898438, "logps/rejected": -522.0203857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.635278701782227, "rewards/margins": 21.732524871826172, "rewards/rejected": -34.36780548095703, "step": 190 }, { "epoch": 6.924444444444444, "grad_norm": 0.0017882203683257103, "learning_rate": 4.5e-05, "logits/chosen": 0.19444510340690613, "logits/rejected": 0.21992090344429016, "logps/chosen": -245.88601684570312, "logps/rejected": -480.85516357421875, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -11.796374320983887, "rewards/margins": 19.393239974975586, "rewards/rejected": -31.18961524963379, "step": 200 } ], "logging_steps": 10, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }