{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 46, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10869565217391304, "grad_norm": 145.4520263671875, "learning_rate": 4.000000000000001e-06, "logits/chosen": -7.522208213806152, "logits/rejected": -7.187244415283203, "logps/chosen": -42.41267776489258, "logps/rejected": -228.89913940429688, "loss": 5.0182, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": 3.0174379348754883, "rewards/margins": -4.805842399597168, "rewards/rejected": 7.823281288146973, "step": 5 }, { "epoch": 0.21739130434782608, "grad_norm": 67.39005279541016, "learning_rate": 4.883490980137327e-06, "logits/chosen": -7.832457542419434, "logits/rejected": -7.450900077819824, "logps/chosen": -37.844947814941406, "logps/rejected": -241.001220703125, "loss": 2.3347, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": 2.7626261711120605, "rewards/margins": -1.523572564125061, "rewards/rejected": 4.28619909286499, "step": 10 }, { "epoch": 0.32608695652173914, "grad_norm": 11.523139953613281, "learning_rate": 4.428722949554858e-06, "logits/chosen": -8.29098892211914, "logits/rejected": -7.737236022949219, "logps/chosen": -36.03478240966797, "logps/rejected": -299.16802978515625, "loss": 0.3164, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.3383092880249023, "rewards/margins": 3.7309823036193848, "rewards/rejected": -1.3926727771759033, "step": 15 }, { "epoch": 0.43478260869565216, "grad_norm": 1.9360767602920532, "learning_rate": 3.6942995462806574e-06, "logits/chosen": -8.896014213562012, "logits/rejected": -8.395783424377441, "logps/chosen": -44.91072463989258, "logps/rejected": -343.45086669921875, "loss": 0.0385, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.546364426612854, "rewards/margins": 9.299250602722168, "rewards/rejected": -7.7528862953186035, "step": 20 }, { "epoch": 0.5434782608695652, "grad_norm": 0.7242340445518494, "learning_rate": 2.786708563496002e-06, "logits/chosen": -9.124911308288574, "logits/rejected": -8.544282913208008, "logps/chosen": -53.40107345581055, "logps/rejected": -400.3040466308594, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 1.0653327703475952, "rewards/margins": 12.580676078796387, "rewards/rejected": -11.515342712402344, "step": 25 }, { "epoch": 0.6521739130434783, "grad_norm": 0.914761483669281, "learning_rate": 1.8375462445083464e-06, "logits/chosen": -9.442420959472656, "logits/rejected": -8.751354217529297, "logps/chosen": -58.87113571166992, "logps/rejected": -390.2606506347656, "loss": 0.035, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6872259378433228, "rewards/margins": 14.051335334777832, "rewards/rejected": -13.364108085632324, "step": 30 }, { "epoch": 0.7608695652173914, "grad_norm": 2.3408379554748535, "learning_rate": 9.844364725834058e-07, "logits/chosen": -9.394502639770508, "logits/rejected": -8.917421340942383, "logps/chosen": -61.983360290527344, "logps/rejected": -399.78619384765625, "loss": 0.0784, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.25255903601646423, "rewards/margins": 15.496400833129883, "rewards/rejected": -15.243843078613281, "step": 35 }, { "epoch": 0.8695652173913043, "grad_norm": 0.053074195981025696, "learning_rate": 3.510759825319976e-07, "logits/chosen": -9.617226600646973, "logits/rejected": -8.915563583374023, "logps/chosen": -60.818382263183594, "logps/rejected": -431.60931396484375, "loss": 0.061, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.3885515630245209, "rewards/margins": 15.846685409545898, "rewards/rejected": -15.45813274383545, "step": 40 }, { "epoch": 0.9782608695652174, "grad_norm": 2.3728623390197754, "learning_rate": 2.9298940549128962e-08, "logits/chosen": -9.523667335510254, "logits/rejected": -8.924112319946289, "logps/chosen": -63.81079864501953, "logps/rejected": -414.37872314453125, "loss": 0.0305, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3749346137046814, "rewards/margins": 15.598576545715332, "rewards/rejected": -15.223642349243164, "step": 45 } ], "logging_steps": 5, "max_steps": 46, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }