{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998234255444379, "eval_steps": 500, "global_step": 424, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002354326074161271, "grad_norm": 387.14608138046685, "learning_rate": 1.1627906976744186e-08, "logits/chosen": 0.9922162890434265, "logits/rejected": 2.5945053100585938, "logps/chosen": -671.7749633789062, "logps/pi_response": -362.8890380859375, "logps/ref_response": -362.8890380859375, "logps/rejected": -872.0834350585938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.023543260741612712, "grad_norm": 416.60085993891914, "learning_rate": 1.1627906976744186e-07, "logits/chosen": 0.5027725696563721, "logits/rejected": 1.1612285375595093, "logps/chosen": -615.636474609375, "logps/pi_response": -373.8717346191406, "logps/ref_response": -371.5905456542969, "logps/rejected": -773.8812255859375, "loss": 0.68, "rewards/accuracies": 0.5185185074806213, "rewards/chosen": 0.0004265241150278598, "rewards/margins": 0.02714625373482704, "rewards/rejected": -0.026719728484749794, "step": 10 }, { "epoch": 0.047086521483225424, "grad_norm": 146.4782737758625, "learning_rate": 2.3255813953488372e-07, "logits/chosen": 1.0533249378204346, "logits/rejected": 1.2860088348388672, "logps/chosen": -583.3096923828125, "logps/pi_response": -664.9867553710938, "logps/ref_response": -384.66943359375, "logps/rejected": -1088.1510009765625, "loss": 0.5167, "rewards/accuracies": 0.8500000834465027, "rewards/chosen": -0.05585538223385811, "rewards/margins": 2.6645126342773438, "rewards/rejected": -2.7203681468963623, "step": 20 }, { "epoch": 0.07062978222483814, "grad_norm": 193.36310820542772, "learning_rate": 3.4883720930232557e-07, "logits/chosen": 1.5033495426177979, "logits/rejected": 2.0693583488464355, "logps/chosen": -675.2313232421875, "logps/pi_response": -1286.485107421875, "logps/ref_response": -382.0600280761719, "logps/rejected": -1842.0443115234375, "loss": 0.3477, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -1.2492175102233887, "rewards/margins": 8.74349594116211, "rewards/rejected": -9.99271297454834, "step": 30 }, { "epoch": 0.09417304296645085, "grad_norm": 161.8114972801858, "learning_rate": 4.6511627906976743e-07, "logits/chosen": 2.231173515319824, "logits/rejected": 2.243076801300049, "logps/chosen": -888.1588745117188, "logps/pi_response": -1944.935546875, "logps/ref_response": -385.018798828125, "logps/rejected": -2694.44189453125, "loss": 0.5111, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -3.3236842155456543, "rewards/margins": 15.406656265258789, "rewards/rejected": -18.7303409576416, "step": 40 }, { "epoch": 0.11771630370806356, "grad_norm": 73.27022369826706, "learning_rate": 4.995836722963699e-07, "logits/chosen": 1.7006012201309204, "logits/rejected": 2.5465452671051025, "logps/chosen": -736.0626220703125, "logps/pi_response": -1658.806640625, "logps/ref_response": -418.013671875, "logps/rejected": -2036.8658447265625, "loss": 0.3367, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.4946272373199463, "rewards/margins": 11.121105194091797, "rewards/rejected": -12.615732192993164, "step": 50 }, { "epoch": 0.14125956444967627, "grad_norm": 197.275331081517, "learning_rate": 4.975478535699678e-07, "logits/chosen": 2.0557973384857178, "logits/rejected": 2.7715961933135986, "logps/chosen": -786.7219848632812, "logps/pi_response": -1516.2132568359375, "logps/ref_response": -400.8934326171875, "logps/rejected": -1846.4228515625, "loss": 0.3929, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6042267084121704, "rewards/margins": 9.362323760986328, "rewards/rejected": -10.966550827026367, "step": 60 }, { "epoch": 0.164802825191289, "grad_norm": 125.32869675723494, "learning_rate": 4.938298919762906e-07, "logits/chosen": 2.2630465030670166, "logits/rejected": 2.313655138015747, "logps/chosen": -874.7604370117188, "logps/pi_response": -1775.0299072265625, "logps/ref_response": -406.5614013671875, "logps/rejected": -2185.577392578125, "loss": 0.3204, "rewards/accuracies": 0.875, "rewards/chosen": -2.6973066329956055, "rewards/margins": 11.22050666809082, "rewards/rejected": -13.917811393737793, "step": 70 }, { "epoch": 0.1883460859329017, "grad_norm": 183.2640323450364, "learning_rate": 4.884550518934592e-07, "logits/chosen": 2.544085741043091, "logits/rejected": 2.2697596549987793, "logps/chosen": -814.3087158203125, "logps/pi_response": -1254.173583984375, "logps/ref_response": -385.601318359375, "logps/rejected": -1756.57421875, "loss": 0.4552, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.785598039627075, "rewards/margins": 6.5690412521362305, "rewards/rejected": -9.354639053344727, "step": 80 }, { "epoch": 0.21188934667451442, "grad_norm": 117.0357298014428, "learning_rate": 4.814598565584062e-07, "logits/chosen": 3.02203369140625, "logits/rejected": 3.5366241931915283, "logps/chosen": -908.7738037109375, "logps/pi_response": -1195.170166015625, "logps/ref_response": -383.25390625, "logps/rejected": -1815.823974609375, "loss": 0.3526, "rewards/accuracies": 0.85833340883255, "rewards/chosen": -3.5764949321746826, "rewards/margins": 6.11873722076416, "rewards/rejected": -9.695232391357422, "step": 90 }, { "epoch": 0.23543260741612712, "grad_norm": 398.94556345700823, "learning_rate": 4.7289183988333603e-07, "logits/chosen": 2.4340739250183105, "logits/rejected": 3.0718626976013184, "logps/chosen": -979.5059814453125, "logps/pi_response": -1823.2095947265625, "logps/ref_response": -389.8714294433594, "logps/rejected": -2347.225341796875, "loss": 0.5772, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -4.193310260772705, "rewards/margins": 10.396108627319336, "rewards/rejected": -14.5894193649292, "step": 100 }, { "epoch": 0.2589758681577399, "grad_norm": 122.8282681969783, "learning_rate": 4.6280922345219255e-07, "logits/chosen": 2.0978169441223145, "logits/rejected": 2.4143967628479004, "logps/chosen": -942.2001953125, "logps/pi_response": -1763.4075927734375, "logps/ref_response": -404.2528381347656, "logps/rejected": -2334.311279296875, "loss": 0.2675, "rewards/accuracies": 0.8916667103767395, "rewards/chosen": -3.5824508666992188, "rewards/margins": 11.474252700805664, "rewards/rejected": -15.056703567504883, "step": 110 }, { "epoch": 0.28251912889935255, "grad_norm": 114.42881059618138, "learning_rate": 4.512805208920118e-07, "logits/chosen": 1.876939058303833, "logits/rejected": 1.8963384628295898, "logps/chosen": -1116.2161865234375, "logps/pi_response": -2194.660888671875, "logps/ref_response": -382.1685485839844, "logps/rejected": -2665.172607421875, "loss": 0.3802, "rewards/accuracies": 0.8833333849906921, "rewards/chosen": -5.01099157333374, "rewards/margins": 14.220196723937988, "rewards/rejected": -19.231189727783203, "step": 120 }, { "epoch": 0.30606238964096527, "grad_norm": 289.6702886782038, "learning_rate": 4.383840723075488e-07, "logits/chosen": 0.6433286070823669, "logits/rejected": 1.8898694515228271, "logps/chosen": -1148.931884765625, "logps/pi_response": -1483.739990234375, "logps/ref_response": -384.0348815917969, "logps/rejected": -2114.025634765625, "loss": 0.5347, "rewards/accuracies": 0.875, "rewards/chosen": -5.955471515655518, "rewards/margins": 6.998837947845459, "rewards/rejected": -12.954309463500977, "step": 130 }, { "epoch": 0.329605650382578, "grad_norm": 68.08348369771358, "learning_rate": 4.2420751194279604e-07, "logits/chosen": -0.47634777426719666, "logits/rejected": 0.15769393742084503, "logps/chosen": -1069.734619140625, "logps/pi_response": -1077.26025390625, "logps/ref_response": -385.5350646972656, "logps/rejected": -1771.5843505859375, "loss": 0.2586, "rewards/accuracies": 0.9083333015441895, "rewards/chosen": -4.997823238372803, "rewards/margins": 4.333145618438721, "rewards/rejected": -9.330968856811523, "step": 140 }, { "epoch": 0.3531489111241907, "grad_norm": 163.9369240446969, "learning_rate": 4.0884717268675306e-07, "logits/chosen": -0.20242293179035187, "logits/rejected": 0.2356901615858078, "logps/chosen": -1135.7840576171875, "logps/pi_response": -1698.245361328125, "logps/ref_response": -393.395263671875, "logps/rejected": -2191.13671875, "loss": 0.2882, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.520671367645264, "rewards/margins": 8.618199348449707, "rewards/rejected": -14.138870239257812, "step": 150 }, { "epoch": 0.3766921718658034, "grad_norm": 150.38185986907246, "learning_rate": 3.9240743146996427e-07, "logits/chosen": -0.22249138355255127, "logits/rejected": 0.3604862689971924, "logps/chosen": -981.8104248046875, "logps/pi_response": -1485.242919921875, "logps/ref_response": -374.290283203125, "logps/rejected": -2042.9921875, "loss": 0.3641, "rewards/accuracies": 0.8250001072883606, "rewards/chosen": -3.9730944633483887, "rewards/margins": 8.374366760253906, "rewards/rejected": -12.347461700439453, "step": 160 }, { "epoch": 0.4002354326074161, "grad_norm": 80.54150444296407, "learning_rate": 3.75e-07, "logits/chosen": -0.11508075892925262, "logits/rejected": 0.29167068004608154, "logps/chosen": -1000.4583740234375, "logps/pi_response": -1468.6123046875, "logps/ref_response": -418.64288330078125, "logps/rejected": -2010.297607421875, "loss": 0.3769, "rewards/accuracies": 0.875, "rewards/chosen": -4.033980369567871, "rewards/margins": 7.807894229888916, "rewards/rejected": -11.841875076293945, "step": 170 }, { "epoch": 0.42377869334902885, "grad_norm": 157.0982901424276, "learning_rate": 3.5674316565549227e-07, "logits/chosen": 0.3534625172615051, "logits/rejected": 0.6805320382118225, "logps/chosen": -1109.6680908203125, "logps/pi_response": -1332.754638671875, "logps/ref_response": -410.44232177734375, "logps/rejected": -1955.057861328125, "loss": 0.2771, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -5.245041370391846, "rewards/margins": 6.281018257141113, "rewards/rejected": -11.526060104370117, "step": 180 }, { "epoch": 0.44732195409064157, "grad_norm": 138.9366989622839, "learning_rate": 3.377609876970194e-07, "logits/chosen": -0.06883997470140457, "logits/rejected": 0.5686254501342773, "logps/chosen": -1120.6253662109375, "logps/pi_response": -1725.830810546875, "logps/ref_response": -391.46697998046875, "logps/rejected": -2330.28564453125, "loss": 0.3201, "rewards/accuracies": 0.85833340883255, "rewards/chosen": -5.111655235290527, "rewards/margins": 10.163948059082031, "rewards/rejected": -15.275602340698242, "step": 190 }, { "epoch": 0.47086521483225424, "grad_norm": 74.34016252556022, "learning_rate": 3.1818245425676556e-07, "logits/chosen": 0.038237761706113815, "logits/rejected": 0.7730900049209595, "logps/chosen": -1035.312744140625, "logps/pi_response": -1930.384765625, "logps/ref_response": -400.8553771972656, "logps/rejected": -2589.25439453125, "loss": 0.3094, "rewards/accuracies": 0.9250000715255737, "rewards/chosen": -4.300232410430908, "rewards/margins": 12.978002548217773, "rewards/rejected": -17.278234481811523, "step": 200 }, { "epoch": 0.49440847557386697, "grad_norm": 135.89742493179222, "learning_rate": 2.981406058353988e-07, "logits/chosen": 1.6664092540740967, "logits/rejected": 1.5270541906356812, "logps/chosen": -1253.89013671875, "logps/pi_response": -3124.22021484375, "logps/ref_response": -380.658447265625, "logps/rejected": -3518.0625, "loss": 0.31, "rewards/accuracies": 0.8916667103767395, "rewards/chosen": -6.066841125488281, "rewards/margins": 21.756336212158203, "rewards/rejected": -27.823177337646484, "step": 210 }, { "epoch": 0.5179517363154797, "grad_norm": 367.00195352780696, "learning_rate": 2.7777163126220113e-07, "logits/chosen": 1.1289093494415283, "logits/rejected": 1.753485918045044, "logps/chosen": -1161.3385009765625, "logps/pi_response": -2382.952880859375, "logps/ref_response": -397.78643798828125, "logps/rejected": -2924.198974609375, "loss": 0.3399, "rewards/accuracies": 0.8416668176651001, "rewards/chosen": -5.906828880310059, "rewards/margins": 15.091941833496094, "rewards/rejected": -20.998775482177734, "step": 220 }, { "epoch": 0.5414949970570924, "grad_norm": 429.21214818334255, "learning_rate": 2.5721394226160456e-07, "logits/chosen": 0.7405129671096802, "logits/rejected": 1.092500925064087, "logps/chosen": -1127.665771484375, "logps/pi_response": -3084.66748046875, "logps/ref_response": -385.5183410644531, "logps/rejected": -3698.11083984375, "loss": 0.3142, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.245604515075684, "rewards/margins": 23.772628784179688, "rewards/rejected": -29.018238067626953, "step": 230 }, { "epoch": 0.5650382577987051, "grad_norm": 143.78300847624018, "learning_rate": 2.3660723291465753e-07, "logits/chosen": 0.8486520648002625, "logits/rejected": 1.15015709400177, "logps/chosen": -1044.281982421875, "logps/pi_response": -2387.38623046875, "logps/ref_response": -387.73004150390625, "logps/rejected": -2920.59375, "loss": 0.2255, "rewards/accuracies": 0.9583333730697632, "rewards/chosen": -4.885871887207031, "rewards/margins": 15.968719482421875, "rewards/rejected": -20.854591369628906, "step": 240 }, { "epoch": 0.5885815185403178, "grad_norm": 125.17444301250116, "learning_rate": 2.1609153040659357e-07, "logits/chosen": 1.349198341369629, "logits/rejected": 1.477452278137207, "logps/chosen": -1202.511962890625, "logps/pi_response": -4804.65576171875, "logps/ref_response": -403.5562438964844, "logps/rejected": -4992.322265625, "loss": 0.2998, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -5.66055154800415, "rewards/margins": 36.13991928100586, "rewards/rejected": -41.800472259521484, "step": 250 }, { "epoch": 0.6121247792819305, "grad_norm": 75.45331010998015, "learning_rate": 1.9580624351088174e-07, "logits/chosen": 1.8510982990264893, "logits/rejected": 1.8215391635894775, "logps/chosen": -1237.7900390625, "logps/pi_response": -4366.1884765625, "logps/ref_response": -402.7124328613281, "logps/rejected": -4625.4326171875, "loss": 0.2256, "rewards/accuracies": 0.908333420753479, "rewards/chosen": -6.537631034851074, "rewards/margins": 31.707489013671875, "rewards/rejected": -38.245121002197266, "step": 260 }, { "epoch": 0.6356680400235433, "grad_norm": 121.70692272560186, "learning_rate": 1.7588921527552315e-07, "logits/chosen": 1.641945242881775, "logits/rejected": 1.5603419542312622, "logps/chosen": -1264.4483642578125, "logps/pi_response": -4920.81396484375, "logps/ref_response": -420.87274169921875, "logps/rejected": -5146.81005859375, "loss": 0.7912, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": -6.559077262878418, "rewards/margins": 36.41227722167969, "rewards/rejected": -42.97135925292969, "step": 270 }, { "epoch": 0.659211300765156, "grad_norm": 121.24373782517053, "learning_rate": 1.564757863488017e-07, "logits/chosen": 1.3223627805709839, "logits/rejected": 2.0051231384277344, "logps/chosen": -1246.6629638671875, "logps/pi_response": -4004.585205078125, "logps/ref_response": -386.71160888671875, "logps/rejected": -4359.3427734375, "loss": 0.1897, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -6.822081089019775, "rewards/margins": 27.98373794555664, "rewards/rejected": -34.80581283569336, "step": 280 }, { "epoch": 0.6827545615067687, "grad_norm": 830.0337931185638, "learning_rate": 1.3769787530939818e-07, "logits/chosen": 1.528591275215149, "logits/rejected": 2.117349147796631, "logps/chosen": -1309.528564453125, "logps/pi_response": -5731.9384765625, "logps/ref_response": -398.12322998046875, "logps/rejected": -5667.63916015625, "loss": 0.297, "rewards/accuracies": 0.9166668057441711, "rewards/chosen": -7.466167449951172, "rewards/margins": 41.12438201904297, "rewards/rejected": -48.590553283691406, "step": 290 }, { "epoch": 0.7062978222483814, "grad_norm": 152.99540073883847, "learning_rate": 1.19683082250231e-07, "logits/chosen": 1.3115044832229614, "logits/rejected": 1.6584991216659546, "logps/chosen": -1225.9017333984375, "logps/pi_response": -4866.2646484375, "logps/ref_response": -388.7275085449219, "logps/rejected": -4971.64306640625, "loss": 0.2874, "rewards/accuracies": 0.8833333253860474, "rewards/chosen": -6.632820129394531, "rewards/margins": 34.09852600097656, "rewards/rejected": -40.731346130371094, "step": 300 }, { "epoch": 0.7298410829899941, "grad_norm": 119.56456897387905, "learning_rate": 1.0255382170737015e-07, "logits/chosen": 1.3735281229019165, "logits/rejected": 1.32486891746521, "logps/chosen": -1208.898193359375, "logps/pi_response": -5530.7060546875, "logps/ref_response": -391.98046875, "logps/rejected": -6051.8076171875, "loss": 0.591, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -6.081249713897705, "rewards/margins": 46.203948974609375, "rewards/rejected": -52.28520584106445, "step": 310 }, { "epoch": 0.7533843437316068, "grad_norm": 108.85771777397696, "learning_rate": 8.642649082596692e-08, "logits/chosen": 1.5552794933319092, "logits/rejected": 1.771057367324829, "logps/chosen": -1233.845947265625, "logps/pi_response": -5965.13623046875, "logps/ref_response": -382.88055419921875, "logps/rejected": -6220.9912109375, "loss": 0.4376, "rewards/accuracies": 0.8916667699813843, "rewards/chosen": -6.246255397796631, "rewards/margins": 48.06293487548828, "rewards/rejected": -54.30919647216797, "step": 320 }, { "epoch": 0.7769276044732195, "grad_norm": 64.98595778509755, "learning_rate": 7.141067841569633e-08, "logits/chosen": 1.0594632625579834, "logits/rejected": 1.5874229669570923, "logps/chosen": -1108.4306640625, "logps/pi_response": -3888.35400390625, "logps/ref_response": -377.4826354980469, "logps/rejected": -4457.27685546875, "loss": 0.6749, "rewards/accuracies": 0.908333420753479, "rewards/chosen": -5.92837381362915, "rewards/margins": 30.60651206970215, "rewards/rejected": -36.534889221191406, "step": 330 }, { "epoch": 0.8004708652148322, "grad_norm": 114.35171172829054, "learning_rate": 5.7608420270357614e-08, "logits/chosen": 1.8669030666351318, "logits/rejected": 2.0596258640289307, "logps/chosen": -1213.849609375, "logps/pi_response": -4912.9462890625, "logps/ref_response": -396.6376037597656, "logps/rejected": -4539.1298828125, "loss": 0.435, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": -6.412614345550537, "rewards/margins": 30.942264556884766, "rewards/rejected": -37.354881286621094, "step": 340 }, { "epoch": 0.824014125956445, "grad_norm": 179.13336254225945, "learning_rate": 4.511350581190129e-08, "logits/chosen": 1.3449863195419312, "logits/rejected": 1.5248844623565674, "logps/chosen": -1232.9034423828125, "logps/pi_response": -6418.89697265625, "logps/ref_response": -406.46099853515625, "logps/rejected": -5906.890625, "loss": 0.2574, "rewards/accuracies": 0.85833340883255, "rewards/chosen": -6.55679178237915, "rewards/margins": 44.44318771362305, "rewards/rejected": -50.99998092651367, "step": 350 }, { "epoch": 0.8475573866980577, "grad_norm": 100.07236020794019, "learning_rate": 3.401084077039293e-08, "logits/chosen": 1.0567926168441772, "logits/rejected": 1.0804252624511719, "logps/chosen": -1290.2373046875, "logps/pi_response": -5767.3994140625, "logps/ref_response": -370.0479431152344, "logps/rejected": -6238.93115234375, "loss": 0.182, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -6.525579929351807, "rewards/margins": 48.529579162597656, "rewards/rejected": -55.0551643371582, "step": 360 }, { "epoch": 0.8711006474396704, "grad_norm": 154.01389939852425, "learning_rate": 2.4375870230643413e-08, "logits/chosen": 1.2523785829544067, "logits/rejected": 0.6351985931396484, "logps/chosen": -1255.387451171875, "logps/pi_response": -7876.88427734375, "logps/ref_response": -419.7919006347656, "logps/rejected": -7547.12353515625, "loss": 0.8119, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.398593902587891, "rewards/margins": 61.322898864746094, "rewards/rejected": -67.72149658203125, "step": 370 }, { "epoch": 0.8946439081812831, "grad_norm": 78.3832326659616, "learning_rate": 1.627406596603359e-08, "logits/chosen": 1.103989839553833, "logits/rejected": 1.358469843864441, "logps/chosen": -1181.985595703125, "logps/pi_response": -6574.1669921875, "logps/ref_response": -373.62188720703125, "logps/rejected": -7001.06103515625, "loss": 0.1925, "rewards/accuracies": 0.89166659116745, "rewards/chosen": -6.272666931152344, "rewards/margins": 55.82216262817383, "rewards/rejected": -62.094825744628906, "step": 380 }, { "epoch": 0.9181871689228959, "grad_norm": 127.93583130286875, "learning_rate": 9.760481543214128e-09, "logits/chosen": 1.224596619606018, "logits/rejected": 1.153881311416626, "logps/chosen": -1271.9290771484375, "logps/pi_response": -6838.2158203125, "logps/ref_response": -396.5231628417969, "logps/rejected": -7279.1904296875, "loss": 0.8634, "rewards/accuracies": 0.85833340883255, "rewards/chosen": -6.516213417053223, "rewards/margins": 58.329864501953125, "rewards/rejected": -64.84608459472656, "step": 390 }, { "epoch": 0.9417304296645085, "grad_norm": 70.22000102721056, "learning_rate": 4.879378220843666e-09, "logits/chosen": 0.9867501258850098, "logits/rejected": 1.4171621799468994, "logps/chosen": -1219.076416015625, "logps/pi_response": -6917.1083984375, "logps/ref_response": -392.57037353515625, "logps/rejected": -6101.5478515625, "loss": 0.225, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -6.3670525550842285, "rewards/margins": 46.422969818115234, "rewards/rejected": -52.7900276184082, "step": 400 }, { "epoch": 0.9652736904061212, "grad_norm": 138.07733003631637, "learning_rate": 1.6639241844659535e-09, "logits/chosen": 1.2070242166519165, "logits/rejected": 1.469334363937378, "logps/chosen": -1205.38525390625, "logps/pi_response": -5451.77392578125, "logps/ref_response": -356.38177490234375, "logps/rejected": -5381.77587890625, "loss": 0.2645, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.463148593902588, "rewards/margins": 39.27529525756836, "rewards/rejected": -45.73843765258789, "step": 410 }, { "epoch": 0.9888169511477339, "grad_norm": 52.54825181887789, "learning_rate": 1.359691612926872e-10, "logits/chosen": 1.1104196310043335, "logits/rejected": 0.860076904296875, "logps/chosen": -1356.6370849609375, "logps/pi_response": -6781.0595703125, "logps/ref_response": -390.4653015136719, "logps/rejected": -7239.3955078125, "loss": 0.3053, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -7.076499938964844, "rewards/margins": 57.4049072265625, "rewards/rejected": -64.48140716552734, "step": 420 }, { "epoch": 0.998234255444379, "step": 424, "total_flos": 0.0, "train_loss": 0.387972928302468, "train_runtime": 9484.4674, "train_samples_per_second": 1.611, "train_steps_per_second": 0.045 } ], "logging_steps": 10, "max_steps": 424, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }