{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.888888888888889, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5925925925925926, "grad_norm": 5.219141483306885, "learning_rate": 0.00018, "logits/chosen": -0.2618408203125, "logits/rejected": -0.405269056558609, "logps/chosen": -146.83187866210938, "logps/rejected": -130.77682495117188, "loss": 0.6931, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.024159394204616547, "rewards/margins": 0.020402083173394203, "rewards/rejected": 0.0037573135923594236, "step": 10 }, { "epoch": 1.1777777777777778, "grad_norm": 6.610161781311035, "learning_rate": 0.00019333333333333333, "logits/chosen": 0.21812255680561066, "logits/rejected": 0.1329454481601715, "logps/chosen": -144.95765686035156, "logps/rejected": -133.71145629882812, "loss": 0.682, "rewards/accuracies": 0.594936728477478, "rewards/chosen": -0.0013131718151271343, "rewards/margins": 0.18159937858581543, "rewards/rejected": -0.18291252851486206, "step": 20 }, { "epoch": 1.7703703703703704, "grad_norm": 2.0350332260131836, "learning_rate": 0.00018500000000000002, "logits/chosen": 0.40491190552711487, "logits/rejected": 0.2921258509159088, "logps/chosen": -140.00352478027344, "logps/rejected": -134.99171447753906, "loss": 0.3351, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.19238564372062683, "rewards/margins": 1.1973682641983032, "rewards/rejected": -1.004982590675354, "step": 30 }, { "epoch": 2.3555555555555556, "grad_norm": 2.579374313354492, "learning_rate": 0.00017666666666666666, "logits/chosen": 0.11085856705904007, "logits/rejected": 0.032480403780937195, "logps/chosen": -143.1613006591797, "logps/rejected": -159.1355438232422, "loss": 0.1806, "rewards/accuracies": 0.9620253443717957, "rewards/chosen": 0.3689553439617157, "rewards/margins": 2.494295358657837, "rewards/rejected": -2.125339984893799, "step": 40 }, { "epoch": 2.948148148148148, "grad_norm": 0.8431211709976196, "learning_rate": 0.00016833333333333335, "logits/chosen": -0.39661893248558044, "logits/rejected": -0.4452442228794098, "logps/chosen": -146.19427490234375, "logps/rejected": -168.6130828857422, "loss": 0.1049, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.37566089630126953, "rewards/margins": 3.8498573303222656, "rewards/rejected": -4.225518226623535, "step": 50 }, { "epoch": 3.533333333333333, "grad_norm": 1.5647461414337158, "learning_rate": 0.00016, "logits/chosen": -0.9004085659980774, "logits/rejected": -0.9173569679260254, "logps/chosen": -164.2067413330078, "logps/rejected": -209.5588836669922, "loss": 0.0286, "rewards/accuracies": 0.9873417615890503, "rewards/chosen": -1.5753328800201416, "rewards/margins": 6.148852825164795, "rewards/rejected": -7.724185943603516, "step": 60 }, { "epoch": 4.118518518518519, "grad_norm": 0.05377896502614021, "learning_rate": 0.00015166666666666668, "logits/chosen": -1.1217529773712158, "logits/rejected": -1.1465003490447998, "logps/chosen": -170.9961395263672, "logps/rejected": -241.1328125, "loss": 0.0204, "rewards/accuracies": 0.9873417615890503, "rewards/chosen": -3.3284900188446045, "rewards/margins": 7.842370986938477, "rewards/rejected": -11.170860290527344, "step": 70 }, { "epoch": 4.711111111111111, "grad_norm": 0.05912935361266136, "learning_rate": 0.00014333333333333334, "logits/chosen": -1.3752106428146362, "logits/rejected": -1.3843052387237549, "logps/chosen": -200.71902465820312, "logps/rejected": -283.8058166503906, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.450322151184082, "rewards/margins": 9.867142677307129, "rewards/rejected": -15.317463874816895, "step": 80 }, { "epoch": 5.296296296296296, "grad_norm": 0.08618709444999695, "learning_rate": 0.00013500000000000003, "logits/chosen": -1.3256553411483765, "logits/rejected": -1.3671971559524536, "logps/chosen": -237.59788513183594, "logps/rejected": -331.6789245605469, "loss": 0.0103, "rewards/accuracies": 0.9873417615890503, "rewards/chosen": -9.06311321258545, "rewards/margins": 10.989459991455078, "rewards/rejected": -20.052576065063477, "step": 90 }, { "epoch": 5.888888888888889, "grad_norm": 0.0007632412016391754, "learning_rate": 0.00012666666666666666, "logits/chosen": -1.4357213973999023, "logits/rejected": -1.4478198289871216, "logps/chosen": -212.84860229492188, "logps/rejected": -312.7068786621094, "loss": 0.0176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -6.595943450927734, "rewards/margins": 11.647254943847656, "rewards/rejected": -18.24319839477539, "step": 100 } ], "logging_steps": 10, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }