{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001955512099731117, "grad_norm": 95.17212677001953, "learning_rate": 0.0, "loss": 1.5299, "step": 1, "train/lambda_m_mean": 0.0445800784509629, "train/mu_mean": 0.47853364050388336, "train/mu_std": 0.22444287687540054, "train/rewards_chosen_mean": -0.5414648056030273, "train/rewards_rejected_mean": -0.42338430881500244, "train/tau_mean": 3.074050724506378, "train/tau_std": 0.8532908968627453 }, { "epoch": 0.003911024199462234, "grad_norm": 100.23924255371094, "learning_rate": 6.493506493506495e-08, "loss": 1.6573, "step": 2, "train/lambda_m_mean": 0.04340820340439677, "train/mu_mean": 0.46908850595355034, "train/mu_std": 0.23344899527728558, "train/rewards_chosen_mean": -0.6072074174880981, "train/rewards_rejected_mean": -0.4483951926231384, "train/tau_mean": 3.109097898006439, "train/tau_std": 0.8925016522407532 }, { "epoch": 0.005866536299193351, "grad_norm": 93.4162368774414, "learning_rate": 1.298701298701299e-07, "loss": 1.676, "step": 3, "train/lambda_m_mean": 0.04877929762005806, "train/mu_mean": 0.4478048272430897, "train/mu_std": 0.22298597358167171, "train/rewards_chosen_mean": -0.5867786407470703, "train/rewards_rejected_mean": -0.3154585361480713, "train/tau_mean": 2.991130143404007, "train/tau_std": 0.796511672437191 }, { "epoch": 0.007822048398924468, "grad_norm": 65.9186019897461, "learning_rate": 1.948051948051948e-07, "loss": 1.2661, "step": 4, "train/lambda_m_mean": 0.044873046688735485, "train/mu_mean": 0.5390936322510242, "train/mu_std": 0.2391140628606081, "train/rewards_chosen_mean": -0.42839503288269043, "train/rewards_rejected_mean": -0.6403279304504395, "train/tau_mean": 3.05698624253273, "train/tau_std": 0.8332416042685509 }, { "epoch": 0.009777560498655585, "grad_norm": 76.08716583251953, "learning_rate": 2.597402597402598e-07, "loss": 1.3863, "step": 5, "train/lambda_m_mean": 0.04135742271319032, "train/mu_mean": 0.5024140924215317, "train/mu_std": 0.24771693535149097, "train/rewards_chosen_mean": -0.42700958251953125, "train/rewards_rejected_mean": -0.42455577850341797, "train/tau_mean": 3.1636240780353546, "train/tau_std": 0.9708356857299805 }, { "epoch": 0.011733072598386702, "grad_norm": 114.66706085205078, "learning_rate": 3.2467532467532465e-07, "loss": 1.6792, "step": 6, "train/lambda_m_mean": 0.04541015764698386, "train/mu_mean": 0.4785664677619934, "train/mu_std": 0.2564187329262495, "train/rewards_chosen_mean": -0.448394775390625, "train/rewards_rejected_mean": -0.33034324645996094, "train/tau_mean": 3.14547860622406, "train/tau_std": 0.9047347605228424 }, { "epoch": 0.013688584698117819, "grad_norm": 165.28195190429688, "learning_rate": 3.896103896103896e-07, "loss": 1.6903, "step": 7, "train/lambda_m_mean": 0.047851563431322575, "train/mu_mean": 0.48986775055527687, "train/mu_std": 0.2096750270575285, "train/rewards_chosen_mean": -0.45418453216552734, "train/rewards_rejected_mean": -0.3939361572265625, "train/tau_mean": 2.882607191801071, "train/tau_std": 0.8586537763476372 }, { "epoch": 0.015644096797848936, "grad_norm": 108.8331069946289, "learning_rate": 4.5454545454545457e-07, "loss": 1.5582, "step": 8, "train/lambda_m_mean": 0.04404296958819032, "train/mu_mean": 0.48507898300886154, "train/mu_std": 0.21850931271910667, "train/rewards_chosen_mean": -0.7446575164794922, "train/rewards_rejected_mean": -0.6509332656860352, "train/tau_mean": 3.149608850479126, "train/tau_std": 0.7185487374663353 }, { "epoch": 0.017599608897580055, "grad_norm": 117.35000610351562, "learning_rate": 5.194805194805196e-07, "loss": 1.3869, "step": 9, "train/lambda_m_mean": 0.04384765774011612, "train/mu_mean": 0.5211376547813416, "train/mu_std": 0.22750909999012947, "train/rewards_chosen_mean": -0.5970687866210938, "train/rewards_rejected_mean": -0.7060623168945312, "train/tau_mean": 3.1719210743904114, "train/tau_std": 1.0047409757971764 }, { "epoch": 0.01955512099731117, "grad_norm": 135.2972412109375, "learning_rate": 5.844155844155845e-07, "loss": 1.6108, "step": 10, "train/lambda_m_mean": 0.04780273512005806, "train/mu_mean": 0.5210565850138664, "train/mu_std": 0.24844475649297237, "train/rewards_chosen_mean": -0.3793525695800781, "train/rewards_rejected_mean": -0.5190708637237549, "train/tau_mean": 3.330864131450653, "train/tau_std": 0.9139663204550743 }, { "epoch": 0.02151063309704229, "grad_norm": 56.733524322509766, "learning_rate": 6.493506493506493e-07, "loss": 1.1962, "step": 11, "train/lambda_m_mean": 0.04272461077198386, "train/mu_mean": 0.5131960734724998, "train/mu_std": 0.2219289317727089, "train/rewards_chosen_mean": -0.354351282119751, "train/rewards_rejected_mean": -0.4249083399772644, "train/tau_mean": 3.35987651348114, "train/tau_std": 1.004284344613552 }, { "epoch": 0.023466145196773404, "grad_norm": 90.56576538085938, "learning_rate": 7.142857142857143e-07, "loss": 1.5188, "step": 12, "train/lambda_m_mean": 0.04384765727445483, "train/mu_mean": 0.4553135745227337, "train/mu_std": 0.23072262667119503, "train/rewards_chosen_mean": -0.4625082015991211, "train/rewards_rejected_mean": -0.22656917572021484, "train/tau_mean": 3.492409825325012, "train/tau_std": 0.9224353134632111 }, { "epoch": 0.025421657296504523, "grad_norm": 96.55609130859375, "learning_rate": 7.792207792207792e-07, "loss": 1.2983, "step": 13, "train/lambda_m_mean": 0.04267578246071935, "train/mu_mean": 0.5332097485661507, "train/mu_std": 0.232064601033926, "train/rewards_chosen_mean": -0.30588388442993164, "train/rewards_rejected_mean": -0.4701223373413086, "train/tau_mean": 3.4865702986717224, "train/tau_std": 0.9381129741668701 }, { "epoch": 0.027377169396235638, "grad_norm": 51.4261360168457, "learning_rate": 8.441558441558442e-07, "loss": 1.0457, "step": 14, "train/lambda_m_mean": 0.04399414174258709, "train/mu_mean": 0.557446613907814, "train/mu_std": 0.23736833408474922, "train/rewards_chosen_mean": -0.4162168502807617, "train/rewards_rejected_mean": -0.7529544830322266, "train/tau_mean": 3.7284185886383057, "train/tau_std": 1.0614818558096886 }, { "epoch": 0.029332681495966757, "grad_norm": 39.170860290527344, "learning_rate": 9.090909090909091e-07, "loss": 1.0351, "step": 15, "train/lambda_m_mean": 0.0422851569019258, "train/mu_mean": 0.5305526405572891, "train/mu_std": 0.2073553316295147, "train/rewards_chosen_mean": -0.34066176414489746, "train/rewards_rejected_mean": -0.5117988586425781, "train/tau_mean": 3.8408084213733673, "train/tau_std": 1.0344671085476875 }, { "epoch": 0.03128819359569787, "grad_norm": 48.2778205871582, "learning_rate": 9.740259740259742e-07, "loss": 1.0611, "step": 16, "train/lambda_m_mean": 0.04506836039945483, "train/mu_mean": 0.5402821898460388, "train/mu_std": 0.22962470538914204, "train/rewards_chosen_mean": -0.3020339012145996, "train/rewards_rejected_mean": -0.5330740809440613, "train/tau_mean": 3.989381194114685, "train/tau_std": 1.0647742599248886 }, { "epoch": 0.03324370569542899, "grad_norm": 33.73464584350586, "learning_rate": 1.0389610389610392e-06, "loss": 0.9978, "step": 17, "train/lambda_m_mean": 0.04682617262005806, "train/mu_mean": 0.5142267197370529, "train/mu_std": 0.21431373059749603, "train/rewards_chosen_mean": -0.32087233662605286, "train/rewards_rejected_mean": -0.42900562286376953, "train/tau_mean": 4.226923286914825, "train/tau_std": 1.116330862045288 }, { "epoch": 0.03519921779516011, "grad_norm": 37.785545349121094, "learning_rate": 1.103896103896104e-06, "loss": 1.0514, "step": 18, "train/lambda_m_mean": 0.04711914202198386, "train/mu_mean": 0.5138016119599342, "train/mu_std": 0.21796306408941746, "train/rewards_chosen_mean": -0.1608428955078125, "train/rewards_rejected_mean": -0.22646665573120117, "train/tau_mean": 5.011099636554718, "train/tau_std": 1.2226325422525406 }, { "epoch": 0.03715472989489123, "grad_norm": 36.42053985595703, "learning_rate": 1.168831168831169e-06, "loss": 0.9397, "step": 19, "train/lambda_m_mean": 0.04438476776704192, "train/mu_mean": 0.5350121408700943, "train/mu_std": 0.19170945137739182, "train/rewards_chosen_mean": -0.20148026943206787, "train/rewards_rejected_mean": -0.3586522042751312, "train/tau_mean": 5.677576780319214, "train/tau_std": 1.2567478269338608 }, { "epoch": 0.03911024199462234, "grad_norm": 28.05086898803711, "learning_rate": 1.2337662337662338e-06, "loss": 0.9331, "step": 20, "train/lambda_m_mean": 0.04531250195577741, "train/mu_mean": 0.5111776143312454, "train/mu_std": 0.19518684968352318, "train/rewards_chosen_mean": -0.45126163959503174, "train/rewards_rejected_mean": -0.5160496234893799, "train/tau_mean": 6.5802993178367615, "train/tau_std": 1.5758563727140427 }, { "epoch": 0.04106575409435346, "grad_norm": 21.623167037963867, "learning_rate": 1.2987012987012986e-06, "loss": 0.8863, "step": 21, "train/lambda_m_mean": 0.04633789137005806, "train/mu_mean": 0.5272432528436184, "train/mu_std": 0.1958116516470909, "train/rewards_chosen_mean": -0.2475348711013794, "train/rewards_rejected_mean": -0.37477874755859375, "train/tau_mean": 6.884631395339966, "train/tau_std": 1.5031830072402954 }, { "epoch": 0.04302126619408458, "grad_norm": 20.81596565246582, "learning_rate": 1.3636363636363636e-06, "loss": 0.8179, "step": 22, "train/lambda_m_mean": 0.04208984412252903, "train/mu_mean": 0.5424778833985329, "train/mu_std": 0.20673364773392677, "train/rewards_chosen_mean": -0.09710693359375, "train/rewards_rejected_mean": -0.35781896114349365, "train/tau_mean": 7.184713780879974, "train/tau_std": 1.4980812519788742 }, { "epoch": 0.044976778293815696, "grad_norm": 26.434799194335938, "learning_rate": 1.4285714285714286e-06, "loss": 0.9633, "step": 23, "train/lambda_m_mean": 0.0438964853528887, "train/mu_mean": 0.5028824210166931, "train/mu_std": 0.2052743099629879, "train/rewards_chosen_mean": -0.4462369680404663, "train/rewards_rejected_mean": -0.4713835120201111, "train/tau_mean": 8.008618116378784, "train/tau_std": 1.8708870112895966 }, { "epoch": 0.04693229039354681, "grad_norm": 20.73443031311035, "learning_rate": 1.4935064935064936e-06, "loss": 0.8611, "step": 24, "train/lambda_m_mean": 0.04135742271319032, "train/mu_mean": 0.5236128568649292, "train/mu_std": 0.19875688664615154, "train/rewards_chosen_mean": -0.06740951538085938, "train/rewards_rejected_mean": -0.1970057487487793, "train/tau_mean": 9.129055142402649, "train/tau_std": 1.7175871580839157 }, { "epoch": 0.048887802493277926, "grad_norm": 15.331561088562012, "learning_rate": 1.5584415584415584e-06, "loss": 0.7254, "step": 25, "train/lambda_m_mean": 0.04267578152939677, "train/mu_mean": 0.5647916346788406, "train/mu_std": 0.19775481708347797, "train/rewards_chosen_mean": -0.26649951934814453, "train/rewards_rejected_mean": -0.5966510772705078, "train/tau_mean": 11.327203512191772, "train/tau_std": 2.1647128760814667 }, { "epoch": 0.050843314593009045, "grad_norm": 14.445000648498535, "learning_rate": 1.6233766233766235e-06, "loss": 0.7372, "step": 26, "train/lambda_m_mean": 0.043164064176380634, "train/mu_mean": 0.553157389163971, "train/mu_std": 0.18509593047201633, "train/rewards_chosen_mean": -0.056172847747802734, "train/rewards_rejected_mean": -0.3375493213534355, "train/tau_mean": 12.879811406135559, "train/tau_std": 2.531935378909111 }, { "epoch": 0.052798826692740164, "grad_norm": 13.768389701843262, "learning_rate": 1.6883116883116885e-06, "loss": 0.6819, "step": 27, "train/lambda_m_mean": 0.04340820387005806, "train/mu_mean": 0.5840543955564499, "train/mu_std": 0.20106658712029457, "train/rewards_chosen_mean": -0.0220792293548584, "train/rewards_rejected_mean": -0.49053192138671875, "train/tau_mean": 14.565759301185608, "train/tau_std": 2.4207835644483566 }, { "epoch": 0.054754338792471276, "grad_norm": 14.82371711730957, "learning_rate": 1.7532467532467535e-06, "loss": 0.7558, "step": 28, "train/lambda_m_mean": 0.04350586188957095, "train/mu_mean": 0.563632421195507, "train/mu_std": 0.2179623283445835, "train/rewards_chosen_mean": 0.018302738666534424, "train/rewards_rejected_mean": -0.3334224224090576, "train/tau_mean": 15.263247966766357, "train/tau_std": 2.7966881692409515 }, { "epoch": 0.056709850892202394, "grad_norm": 13.294106483459473, "learning_rate": 1.8181818181818183e-06, "loss": 0.7042, "step": 29, "train/lambda_m_mean": 0.04750976664945483, "train/mu_mean": 0.5882321186363697, "train/mu_std": 0.2179530169814825, "train/rewards_chosen_mean": 0.06147170066833496, "train/rewards_rejected_mean": -0.43368101119995117, "train/tau_mean": 16.49312424659729, "train/tau_std": 2.953476220369339 }, { "epoch": 0.05866536299193351, "grad_norm": 12.295063972473145, "learning_rate": 1.8831168831168833e-06, "loss": 0.7298, "step": 30, "train/lambda_m_mean": 0.04648437676951289, "train/mu_mean": 0.5769536420702934, "train/mu_std": 0.2308455929160118, "train/rewards_chosen_mean": -0.08927536010742188, "train/rewards_rejected_mean": -0.5783100128173828, "train/tau_mean": 17.51951003074646, "train/tau_std": 3.4313797652721405 }, { "epoch": 0.06062087509166463, "grad_norm": 14.880653381347656, "learning_rate": 1.9480519480519483e-06, "loss": 0.7194, "step": 31, "train/lambda_m_mean": 0.03735351748764515, "train/mu_mean": 0.5725050270557404, "train/mu_std": 0.21940569579601288, "train/rewards_chosen_mean": 0.006748318672180176, "train/rewards_rejected_mean": -0.43611574172973633, "train/tau_mean": 18.715001106262207, "train/tau_std": 3.2498193085193634 }, { "epoch": 0.06257638719139574, "grad_norm": 15.36508846282959, "learning_rate": 2.012987012987013e-06, "loss": 0.6672, "step": 32, "train/lambda_m_mean": 0.04272460984066129, "train/mu_mean": 0.6268533989787102, "train/mu_std": 0.24227406084537506, "train/rewards_chosen_mean": -0.03775382041931152, "train/rewards_rejected_mean": -0.9092319011688232, "train/tau_mean": 18.655263900756836, "train/tau_std": 3.6640569865703583 }, { "epoch": 0.06453189929112686, "grad_norm": 10.689703941345215, "learning_rate": 2.0779220779220784e-06, "loss": 0.6171, "step": 33, "train/lambda_m_mean": 0.044824219308793545, "train/mu_mean": 0.6247024536132812, "train/mu_std": 0.21311992965638638, "train/rewards_chosen_mean": -0.10646414756774902, "train/rewards_rejected_mean": -0.9025228023529053, "train/tau_mean": 19.59191370010376, "train/tau_std": 3.8157286643981934 }, { "epoch": 0.06648741139085798, "grad_norm": 10.791482925415039, "learning_rate": 2.1428571428571427e-06, "loss": 0.5842, "step": 34, "train/lambda_m_mean": 0.04667968861758709, "train/mu_mean": 0.6498581096529961, "train/mu_std": 0.23202074691653252, "train/rewards_chosen_mean": -0.15554046630859375, "train/rewards_rejected_mean": -1.2147979736328125, "train/tau_mean": 20.52672505378723, "train/tau_std": 3.534943401813507 }, { "epoch": 0.0684429234905891, "grad_norm": 14.760125160217285, "learning_rate": 2.207792207792208e-06, "loss": 0.6896, "step": 35, "train/lambda_m_mean": 0.04257812676951289, "train/mu_mean": 0.6139236316084862, "train/mu_std": 0.24570108391344547, "train/rewards_chosen_mean": -0.09945487976074219, "train/rewards_rejected_mean": -0.9782981872558594, "train/tau_mean": 20.58795738220215, "train/tau_std": 4.010409086942673 }, { "epoch": 0.07039843559032022, "grad_norm": 12.312627792358398, "learning_rate": 2.2727272727272728e-06, "loss": 0.6572, "step": 36, "train/lambda_m_mean": 0.050830078311264515, "train/mu_mean": 0.6312298104166985, "train/mu_std": 0.2516651675105095, "train/rewards_chosen_mean": -0.26575732231140137, "train/rewards_rejected_mean": -1.2848320007324219, "train/tau_mean": 21.458248376846313, "train/tau_std": 4.364693701267242 }, { "epoch": 0.07235394769005134, "grad_norm": 11.037529945373535, "learning_rate": 2.337662337662338e-06, "loss": 0.6661, "step": 37, "train/lambda_m_mean": 0.04775390727445483, "train/mu_mean": 0.6195650584995747, "train/mu_std": 0.2383068110793829, "train/rewards_chosen_mean": -0.2998695373535156, "train/rewards_rejected_mean": -1.2654705047607422, "train/tau_mean": 22.759637117385864, "train/tau_std": 4.054660052061081 }, { "epoch": 0.07430945978978246, "grad_norm": 11.09421443939209, "learning_rate": 2.402597402597403e-06, "loss": 0.6795, "step": 38, "train/lambda_m_mean": 0.04331054771319032, "train/mu_mean": 0.6009834185242653, "train/mu_std": 0.22626401111483574, "train/rewards_chosen_mean": -0.6398301124572754, "train/rewards_rejected_mean": -1.4142265319824219, "train/tau_mean": 24.025842666625977, "train/tau_std": 4.3340844810009 }, { "epoch": 0.07626497188951356, "grad_norm": 11.861464500427246, "learning_rate": 2.4675324675324676e-06, "loss": 0.7025, "step": 39, "train/lambda_m_mean": 0.043212891556322575, "train/mu_mean": 0.5880651026964188, "train/mu_std": 0.22144061885774136, "train/rewards_chosen_mean": -0.8082389831542969, "train/rewards_rejected_mean": -1.5454044342041016, "train/tau_mean": 25.337387323379517, "train/tau_std": 4.282187849283218 }, { "epoch": 0.07822048398924468, "grad_norm": 10.995736122131348, "learning_rate": 2.5324675324675324e-06, "loss": 0.6416, "step": 40, "train/lambda_m_mean": 0.04716796986758709, "train/mu_mean": 0.6253503113985062, "train/mu_std": 0.2362539954483509, "train/rewards_chosen_mean": -0.7260780334472656, "train/rewards_rejected_mean": -1.7524394989013672, "train/tau_mean": 26.473713636398315, "train/tau_std": 4.301453709602356 }, { "epoch": 0.0801759960889758, "grad_norm": 10.0005464553833, "learning_rate": 2.597402597402597e-06, "loss": 0.6554, "step": 41, "train/lambda_m_mean": 0.0435058600269258, "train/mu_mean": 0.6079395338892937, "train/mu_std": 0.23113984055817127, "train/rewards_chosen_mean": -0.7400884628295898, "train/rewards_rejected_mean": -1.5556392669677734, "train/tau_mean": 27.38258457183838, "train/tau_std": 4.558844119310379 }, { "epoch": 0.08213150818870692, "grad_norm": 9.037772178649902, "learning_rate": 2.6623376623376624e-06, "loss": 0.6353, "step": 42, "train/lambda_m_mean": 0.04287109524011612, "train/mu_mean": 0.6204890757799149, "train/mu_std": 0.22184200771152973, "train/rewards_chosen_mean": -0.6896929740905762, "train/rewards_rejected_mean": -1.5864990800619125, "train/tau_mean": 28.261298179626465, "train/tau_std": 3.9605740308761597 }, { "epoch": 0.08408702028843804, "grad_norm": 9.340326309204102, "learning_rate": 2.7272727272727272e-06, "loss": 0.6614, "step": 43, "train/lambda_m_mean": 0.03754882933571935, "train/mu_mean": 0.6014960631728172, "train/mu_std": 0.22495279647409916, "train/rewards_chosen_mean": -0.7585411071777344, "train/rewards_rejected_mean": -1.4893542528152466, "train/tau_mean": 29.572723388671875, "train/tau_std": 4.2449445724487305 }, { "epoch": 0.08604253238816915, "grad_norm": 8.074227333068848, "learning_rate": 2.7922077922077925e-06, "loss": 0.6161, "step": 44, "train/lambda_m_mean": 0.04531250102445483, "train/mu_mean": 0.614116869866848, "train/mu_std": 0.19932269677519798, "train/rewards_chosen_mean": -0.7604668140411377, "train/rewards_rejected_mean": -1.4446372985839844, "train/tau_mean": 28.88953447341919, "train/tau_std": 4.299824059009552 }, { "epoch": 0.08799804448790027, "grad_norm": 7.5323262214660645, "learning_rate": 2.8571428571428573e-06, "loss": 0.6333, "step": 45, "train/lambda_m_mean": 0.04213867289945483, "train/mu_mean": 0.5916094034910202, "train/mu_std": 0.18896847777068615, "train/rewards_chosen_mean": -0.7433090209960938, "train/rewards_rejected_mean": -1.2735090255737305, "train/tau_mean": 30.47125744819641, "train/tau_std": 3.7107060253620148 }, { "epoch": 0.08995355658763139, "grad_norm": 7.142524719238281, "learning_rate": 2.922077922077922e-06, "loss": 0.6035, "step": 46, "train/lambda_m_mean": 0.0458984375, "train/mu_mean": 0.6228935644030571, "train/mu_std": 0.1999011766165495, "train/rewards_chosen_mean": -0.8165435791015625, "train/rewards_rejected_mean": -1.5453072786331177, "train/tau_mean": 29.79115319252014, "train/tau_std": 4.029196888208389 }, { "epoch": 0.0919090686873625, "grad_norm": 8.299596786499023, "learning_rate": 2.9870129870129873e-06, "loss": 0.6479, "step": 47, "train/lambda_m_mean": 0.04799804696813226, "train/mu_mean": 0.5954819172620773, "train/mu_std": 0.20341892912983894, "train/rewards_chosen_mean": -0.7268295288085938, "train/rewards_rejected_mean": -1.3185501098632812, "train/tau_mean": 29.902886867523193, "train/tau_std": 4.611438691616058 }, { "epoch": 0.09386458078709362, "grad_norm": 8.237100601196289, "learning_rate": 3.051948051948052e-06, "loss": 0.6717, "step": 48, "train/lambda_m_mean": 0.047851563431322575, "train/mu_mean": 0.5838227868080139, "train/mu_std": 0.2002648450434208, "train/rewards_chosen_mean": -0.8803176879882812, "train/rewards_rejected_mean": -1.3788833618164062, "train/tau_mean": 30.583752155303955, "train/tau_std": 3.4549425542354584 }, { "epoch": 0.09582009288682473, "grad_norm": 7.825159072875977, "learning_rate": 3.116883116883117e-06, "loss": 0.6388, "step": 49, "train/lambda_m_mean": 0.0415527350269258, "train/mu_mean": 0.6016329079866409, "train/mu_std": 0.2144486289471388, "train/rewards_chosen_mean": -0.897273063659668, "train/rewards_rejected_mean": -1.5427722930908203, "train/tau_mean": 30.51124382019043, "train/tau_std": 4.160054802894592 }, { "epoch": 0.09777560498655585, "grad_norm": 6.487326622009277, "learning_rate": 3.181818181818182e-06, "loss": 0.609, "step": 50, "train/lambda_m_mean": 0.04096679808571935, "train/mu_mean": 0.6133948713541031, "train/mu_std": 0.20535002276301384, "train/rewards_chosen_mean": -0.9062652587890625, "train/rewards_rejected_mean": -1.6813125610351562, "train/tau_mean": 30.85771679878235, "train/tau_std": 3.603198766708374 }, { "epoch": 0.09973111708628697, "grad_norm": 11.08548641204834, "learning_rate": 3.246753246753247e-06, "loss": 0.6229, "step": 51, "train/lambda_m_mean": 0.04931640764698386, "train/mu_mean": 0.6212383061647415, "train/mu_std": 0.2206958383321762, "train/rewards_chosen_mean": -0.6872901916503906, "train/rewards_rejected_mean": -1.5563201904296875, "train/tau_mean": 30.867952823638916, "train/tau_std": 3.475619286298752 }, { "epoch": 0.10168662918601809, "grad_norm": 6.194069862365723, "learning_rate": 3.311688311688312e-06, "loss": 0.6072, "step": 52, "train/lambda_m_mean": 0.0435058600269258, "train/mu_mean": 0.6162678226828575, "train/mu_std": 0.20314601249992847, "train/rewards_chosen_mean": -0.7195510864257812, "train/rewards_rejected_mean": -1.5282649993896484, "train/tau_mean": 30.888718605041504, "train/tau_std": 3.6661965250968933 }, { "epoch": 0.10364214128574921, "grad_norm": 5.674131393432617, "learning_rate": 3.376623376623377e-06, "loss": 0.6485, "step": 53, "train/lambda_m_mean": 0.04267578339204192, "train/mu_mean": 0.5929751023650169, "train/mu_std": 0.19740562327206135, "train/rewards_chosen_mean": -0.9406890869140625, "train/rewards_rejected_mean": -1.6580162048339844, "train/tau_mean": 31.39568328857422, "train/tau_std": 2.5480915158987045 }, { "epoch": 0.10559765338548033, "grad_norm": 4.587912082672119, "learning_rate": 3.4415584415584418e-06, "loss": 0.6254, "step": 54, "train/lambda_m_mean": 0.043945313431322575, "train/mu_mean": 0.6023534387350082, "train/mu_std": 0.1940193297341466, "train/rewards_chosen_mean": -0.8936901092529297, "train/rewards_rejected_mean": -1.6843953132629395, "train/tau_mean": 30.969727039337158, "train/tau_std": 2.527120128273964 }, { "epoch": 0.10755316548521145, "grad_norm": 4.888927936553955, "learning_rate": 3.506493506493507e-06, "loss": 0.5971, "step": 55, "train/lambda_m_mean": 0.04267578339204192, "train/mu_mean": 0.6300938948988914, "train/mu_std": 0.2223819252103567, "train/rewards_chosen_mean": -0.6322140693664551, "train/rewards_rejected_mean": -1.6219626665115356, "train/tau_mean": 30.554571390151978, "train/tau_std": 3.3082843720912933 }, { "epoch": 0.10950867758494255, "grad_norm": 4.272146224975586, "learning_rate": 3.5714285714285718e-06, "loss": 0.5677, "step": 56, "train/lambda_m_mean": 0.04072265746071935, "train/mu_mean": 0.6468537151813507, "train/mu_std": 0.21450292877852917, "train/rewards_chosen_mean": -0.5278587341308594, "train/rewards_rejected_mean": -1.688905954360962, "train/tau_mean": 30.028329610824585, "train/tau_std": 3.2394866943359375 }, { "epoch": 0.11146418968467367, "grad_norm": 4.460328102111816, "learning_rate": 3.6363636363636366e-06, "loss": 0.6016, "step": 57, "train/lambda_m_mean": 0.041113281855359674, "train/mu_mean": 0.6214812994003296, "train/mu_std": 0.2139104437083006, "train/rewards_chosen_mean": -0.4083409309387207, "train/rewards_rejected_mean": -1.3396458625793457, "train/tau_mean": 30.264485597610474, "train/tau_std": 2.5173448026180267 }, { "epoch": 0.11341970178440479, "grad_norm": 4.194515705108643, "learning_rate": 3.701298701298702e-06, "loss": 0.5647, "step": 58, "train/lambda_m_mean": 0.04472656361758709, "train/mu_mean": 0.650407001376152, "train/mu_std": 0.2254655584692955, "train/rewards_chosen_mean": -0.08184814453125, "train/rewards_rejected_mean": -1.154287576675415, "train/tau_mean": 29.973021268844604, "train/tau_std": 2.8849972635507584 }, { "epoch": 0.11537521388413591, "grad_norm": 4.031130790710449, "learning_rate": 3.7662337662337666e-06, "loss": 0.5737, "step": 59, "train/lambda_m_mean": 0.04667968908324838, "train/mu_mean": 0.643591970205307, "train/mu_std": 0.2186010740697384, "train/rewards_chosen_mean": 0.049160003662109375, "train/rewards_rejected_mean": -0.9377269744873047, "train/tau_mean": 29.516502141952515, "train/tau_std": 2.6134540885686874 }, { "epoch": 0.11733072598386703, "grad_norm": 4.311197280883789, "learning_rate": 3.831168831168831e-06, "loss": 0.6282, "step": 60, "train/lambda_m_mean": 0.03872070275247097, "train/mu_mean": 0.5966866165399551, "train/mu_std": 0.18948550336062908, "train/rewards_chosen_mean": 0.15267109870910645, "train/rewards_rejected_mean": -0.5045528411865234, "train/tau_mean": 29.672966718673706, "train/tau_std": 2.043314166367054 }, { "epoch": 0.11928623808359815, "grad_norm": 4.9903082847595215, "learning_rate": 3.896103896103897e-06, "loss": 0.6142, "step": 61, "train/lambda_m_mean": 0.042480469681322575, "train/mu_mean": 0.616503044962883, "train/mu_std": 0.20781102031469345, "train/rewards_chosen_mean": 0.46181347966194153, "train/rewards_rejected_mean": -0.32892322540283203, "train/tau_mean": 29.217654705047607, "train/tau_std": 2.710300251841545 }, { "epoch": 0.12124175018332926, "grad_norm": 4.989065647125244, "learning_rate": 3.961038961038962e-06, "loss": 0.5743, "step": 62, "train/lambda_m_mean": 0.039062500931322575, "train/mu_mean": 0.6374526768922806, "train/mu_std": 0.21073496155440807, "train/rewards_chosen_mean": 0.9538218379020691, "train/rewards_rejected_mean": -0.04493023455142975, "train/tau_mean": 29.0721378326416, "train/tau_std": 2.227001056075096 }, { "epoch": 0.12319726228306038, "grad_norm": 3.4931235313415527, "learning_rate": 4.025974025974026e-06, "loss": 0.5731, "step": 63, "train/lambda_m_mean": 0.048730471171438694, "train/mu_mean": 0.6393754705786705, "train/mu_std": 0.20952653512358665, "train/rewards_chosen_mean": 0.9150928258895874, "train/rewards_rejected_mean": -0.016547203063964844, "train/tau_mean": 28.48568344116211, "train/tau_std": 2.3636923283338547 }, { "epoch": 0.1251527743827915, "grad_norm": 4.08262825012207, "learning_rate": 4.0909090909090915e-06, "loss": 0.6169, "step": 64, "train/lambda_m_mean": 0.0439453125, "train/mu_mean": 0.6090252548456192, "train/mu_std": 0.20946052856743336, "train/rewards_chosen_mean": 0.6326950788497925, "train/rewards_rejected_mean": -0.0984957218170166, "train/tau_mean": 28.83350133895874, "train/tau_std": 2.0194475278258324 }, { "epoch": 0.12710828648252262, "grad_norm": 3.4461913108825684, "learning_rate": 4.155844155844157e-06, "loss": 0.5968, "step": 65, "train/lambda_m_mean": 0.04233398521319032, "train/mu_mean": 0.6152602806687355, "train/mu_std": 0.185240738093853, "train/rewards_chosen_mean": 0.5067528486251831, "train/rewards_rejected_mean": -0.22580525279045105, "train/tau_mean": 28.378421783447266, "train/tau_std": 2.2168256789445877 }, { "epoch": 0.12906379858225372, "grad_norm": 4.2102861404418945, "learning_rate": 4.220779220779221e-06, "loss": 0.5565, "step": 66, "train/lambda_m_mean": 0.04570312541909516, "train/mu_mean": 0.6471520885825157, "train/mu_std": 0.21790389344096184, "train/rewards_chosen_mean": 0.3817000389099121, "train/rewards_rejected_mean": -0.5869216918945312, "train/tau_mean": 27.170931816101074, "train/tau_std": 2.472757488489151 }, { "epoch": 0.13101931068198486, "grad_norm": 4.013350963592529, "learning_rate": 4.2857142857142855e-06, "loss": 0.5943, "step": 67, "train/lambda_m_mean": 0.038281249813735485, "train/mu_mean": 0.6248749345541, "train/mu_std": 0.2151691671460867, "train/rewards_chosen_mean": 0.43183159828186035, "train/rewards_rejected_mean": -0.4625290036201477, "train/tau_mean": 27.39804482460022, "train/tau_std": 2.328546941280365 }, { "epoch": 0.13297482278171596, "grad_norm": 3.6336123943328857, "learning_rate": 4.350649350649351e-06, "loss": 0.5817, "step": 68, "train/lambda_m_mean": 0.04707031324505806, "train/mu_mean": 0.6359781324863434, "train/mu_std": 0.21463118493556976, "train/rewards_chosen_mean": 0.5206625461578369, "train/rewards_rejected_mean": -0.47002750635147095, "train/tau_mean": 26.95927143096924, "train/tau_std": 2.1428261399269104 }, { "epoch": 0.13493033488144707, "grad_norm": 4.254031181335449, "learning_rate": 4.415584415584416e-06, "loss": 0.5797, "step": 69, "train/lambda_m_mean": 0.04824218899011612, "train/mu_mean": 0.6496964693069458, "train/mu_std": 0.23419176042079926, "train/rewards_chosen_mean": 0.7880805730819702, "train/rewards_rejected_mean": -0.3801145553588867, "train/tau_mean": 27.087117671966553, "train/tau_std": 2.0923111140727997 }, { "epoch": 0.1368858469811782, "grad_norm": 3.8196699619293213, "learning_rate": 4.48051948051948e-06, "loss": 0.5595, "step": 70, "train/lambda_m_mean": 0.04116211086511612, "train/mu_mean": 0.647213414311409, "train/mu_std": 0.21895992383360863, "train/rewards_chosen_mean": 1.0294266939163208, "train/rewards_rejected_mean": -0.04196453094482422, "train/tau_mean": 26.876323699951172, "train/tau_std": 2.0181122571229935 }, { "epoch": 0.1388413590809093, "grad_norm": 5.081212043762207, "learning_rate": 4.5454545454545455e-06, "loss": 0.6524, "step": 71, "train/lambda_m_mean": 0.04926757933571935, "train/mu_mean": 0.5996471494436264, "train/mu_std": 0.2160791028290987, "train/rewards_chosen_mean": 0.8534164428710938, "train/rewards_rejected_mean": 0.12749910354614258, "train/tau_mean": 26.465622901916504, "train/tau_std": 2.2421123683452606 }, { "epoch": 0.14079687118064044, "grad_norm": 3.906294107437134, "learning_rate": 4.610389610389611e-06, "loss": 0.6038, "step": 72, "train/lambda_m_mean": 0.04770507989451289, "train/mu_mean": 0.6267451271414757, "train/mu_std": 0.2166711613535881, "train/rewards_chosen_mean": 1.1183719635009766, "train/rewards_rejected_mean": 0.16130447387695312, "train/tau_mean": 26.270981550216675, "train/tau_std": 1.6877407431602478 }, { "epoch": 0.14275238328037154, "grad_norm": 4.608340740203857, "learning_rate": 4.675324675324676e-06, "loss": 0.6113, "step": 73, "train/lambda_m_mean": 0.04526367364451289, "train/mu_mean": 0.6247369647026062, "train/mu_std": 0.2224284838885069, "train/rewards_chosen_mean": 1.1732778549194336, "train/rewards_rejected_mean": 0.3101520538330078, "train/tau_mean": 26.331557512283325, "train/tau_std": 2.0155224800109863 }, { "epoch": 0.14470789538010267, "grad_norm": 3.5148048400878906, "learning_rate": 4.74025974025974e-06, "loss": 0.5836, "step": 74, "train/lambda_m_mean": 0.0466796881519258, "train/mu_mean": 0.6270397529006004, "train/mu_std": 0.18881584936752915, "train/rewards_chosen_mean": 1.4767885208129883, "train/rewards_rejected_mean": 0.618828296661377, "train/tau_mean": 25.97057294845581, "train/tau_std": 1.9978251904249191 }, { "epoch": 0.14666340747983378, "grad_norm": 5.078949451446533, "learning_rate": 4.805194805194806e-06, "loss": 0.5892, "step": 75, "train/lambda_m_mean": 0.0464355475269258, "train/mu_mean": 0.6390694230794907, "train/mu_std": 0.22257071919739246, "train/rewards_chosen_mean": 1.6564569473266602, "train/rewards_rejected_mean": 0.6927604675292969, "train/tau_mean": 25.93649983406067, "train/tau_std": 2.285871386528015 }, { "epoch": 0.1486189195795649, "grad_norm": 3.289538860321045, "learning_rate": 4.870129870129871e-06, "loss": 0.5585, "step": 76, "train/lambda_m_mean": 0.04130859486758709, "train/mu_mean": 0.6414681226015091, "train/mu_std": 0.2045713234692812, "train/rewards_chosen_mean": 1.843221664428711, "train/rewards_rejected_mean": 0.9311723709106445, "train/tau_mean": 25.896389722824097, "train/tau_std": 2.288887605071068 }, { "epoch": 0.15057443167929602, "grad_norm": 3.149043560028076, "learning_rate": 4.935064935064935e-06, "loss": 0.5869, "step": 77, "train/lambda_m_mean": 0.04584961012005806, "train/mu_mean": 0.6292314454913139, "train/mu_std": 0.20318584889173508, "train/rewards_chosen_mean": 1.848531723022461, "train/rewards_rejected_mean": 0.9397974014282227, "train/tau_mean": 25.857347011566162, "train/tau_std": 1.8229808956384659 }, { "epoch": 0.15252994377902712, "grad_norm": 3.2672510147094727, "learning_rate": 5e-06, "loss": 0.6367, "step": 78, "train/lambda_m_mean": 0.04492187639698386, "train/mu_mean": 0.5941630452871323, "train/mu_std": 0.1929073054343462, "train/rewards_chosen_mean": 2.005786895751953, "train/rewards_rejected_mean": 1.2814464569091797, "train/tau_mean": 25.906984090805054, "train/tau_std": 1.922466367483139 }, { "epoch": 0.15448545587875825, "grad_norm": 3.5790038108825684, "learning_rate": 4.996572995202194e-06, "loss": 0.5991, "step": 79, "train/lambda_m_mean": 0.05219726683571935, "train/mu_mean": 0.6382706016302109, "train/mu_std": 0.2235070075839758, "train/rewards_chosen_mean": 1.8992944955825806, "train/rewards_rejected_mean": 0.8262176513671875, "train/tau_mean": 25.69573712348938, "train/tau_std": 1.8932983577251434 }, { "epoch": 0.15644096797848936, "grad_norm": 3.188812017440796, "learning_rate": 4.993145990404387e-06, "loss": 0.5971, "step": 80, "train/lambda_m_mean": 0.03857421944849193, "train/mu_mean": 0.6185019090771675, "train/mu_std": 0.20552214048802853, "train/rewards_chosen_mean": 1.7819404602050781, "train/rewards_rejected_mean": 0.9447412490844727, "train/tau_mean": 25.693042755126953, "train/tau_std": 1.8551257997751236 }, { "epoch": 0.1583964800782205, "grad_norm": 2.6125597953796387, "learning_rate": 4.98971898560658e-06, "loss": 0.5436, "step": 81, "train/lambda_m_mean": 0.04658203246071935, "train/mu_mean": 0.6571422219276428, "train/mu_std": 0.21014288626611233, "train/rewards_chosen_mean": 1.4870600700378418, "train/rewards_rejected_mean": 0.43126964569091797, "train/tau_mean": 25.167107582092285, "train/tau_std": 2.1866741329431534 }, { "epoch": 0.1603519921779516, "grad_norm": 3.0073978900909424, "learning_rate": 4.9862919808087735e-06, "loss": 0.5966, "step": 82, "train/lambda_m_mean": 0.03999023465439677, "train/mu_mean": 0.6161477193236351, "train/mu_std": 0.19628822803497314, "train/rewards_chosen_mean": 1.3961410522460938, "train/rewards_rejected_mean": 0.5893821716308594, "train/tau_mean": 25.4437472820282, "train/tau_std": 1.7953009605407715 }, { "epoch": 0.16230750427768273, "grad_norm": 3.565066337585449, "learning_rate": 4.982864976010967e-06, "loss": 0.5949, "step": 83, "train/lambda_m_mean": 0.04746093903668225, "train/mu_mean": 0.6349377036094666, "train/mu_std": 0.21205419301986694, "train/rewards_chosen_mean": 1.3471221923828125, "train/rewards_rejected_mean": 0.3750801086425781, "train/tau_mean": 25.254260301589966, "train/tau_std": 1.7454497069120407 }, { "epoch": 0.16426301637741383, "grad_norm": 3.651881217956543, "learning_rate": 4.97943797121316e-06, "loss": 0.6403, "step": 84, "train/lambda_m_mean": 0.044970703311264515, "train/mu_mean": 0.5923184454441071, "train/mu_std": 0.19546333700418472, "train/rewards_chosen_mean": 1.4448258876800537, "train/rewards_rejected_mean": 0.7783660888671875, "train/tau_mean": 24.93083667755127, "train/tau_std": 1.8768924176692963 }, { "epoch": 0.16621852847714494, "grad_norm": 3.467643976211548, "learning_rate": 4.976010966415353e-06, "loss": 0.6201, "step": 85, "train/lambda_m_mean": 0.043554688803851604, "train/mu_mean": 0.6048147603869438, "train/mu_std": 0.20238203555345535, "train/rewards_chosen_mean": 1.2943401336669922, "train/rewards_rejected_mean": 0.5782952308654785, "train/tau_mean": 24.90724015235901, "train/tau_std": 1.9766643196344376 }, { "epoch": 0.16817404057687607, "grad_norm": 2.76979923248291, "learning_rate": 4.9725839616175465e-06, "loss": 0.6035, "step": 86, "train/lambda_m_mean": 0.04218750074505806, "train/mu_mean": 0.6126594170928001, "train/mu_std": 0.20106958225369453, "train/rewards_chosen_mean": 1.3185844421386719, "train/rewards_rejected_mean": 0.5280187129974365, "train/tau_mean": 24.698895692825317, "train/tau_std": 2.056335136294365 }, { "epoch": 0.17012955267660718, "grad_norm": 2.4915764331817627, "learning_rate": 4.96915695681974e-06, "loss": 0.5699, "step": 87, "train/lambda_m_mean": 0.042187501676380634, "train/mu_mean": 0.6383110210299492, "train/mu_std": 0.2043277658522129, "train/rewards_chosen_mean": 1.528191089630127, "train/rewards_rejected_mean": 0.5332975387573242, "train/tau_mean": 24.880345821380615, "train/tau_std": 2.0618558526039124 }, { "epoch": 0.1720850647763383, "grad_norm": 2.901869773864746, "learning_rate": 4.965729952021933e-06, "loss": 0.6132, "step": 88, "train/lambda_m_mean": 0.041503907181322575, "train/mu_mean": 0.6109554618597031, "train/mu_std": 0.20407630316913128, "train/rewards_chosen_mean": 1.429330825805664, "train/rewards_rejected_mean": 0.6269989013671875, "train/tau_mean": 24.782063484191895, "train/tau_std": 2.1295868903398514 }, { "epoch": 0.1740405768760694, "grad_norm": 3.1541569232940674, "learning_rate": 4.962302947224126e-06, "loss": 0.6112, "step": 89, "train/lambda_m_mean": 0.0399902353528887, "train/mu_mean": 0.6134695485234261, "train/mu_std": 0.21276569925248623, "train/rewards_chosen_mean": 1.585958480834961, "train/rewards_rejected_mean": 0.7851147651672363, "train/tau_mean": 25.04143476486206, "train/tau_std": 1.9875654131174088 }, { "epoch": 0.17599608897580055, "grad_norm": 2.835092306137085, "learning_rate": 4.9588759424263196e-06, "loss": 0.5911, "step": 90, "train/lambda_m_mean": 0.05136718833819032, "train/mu_mean": 0.6318660005927086, "train/mu_std": 0.2062278836965561, "train/rewards_chosen_mean": 1.2122783660888672, "train/rewards_rejected_mean": 0.2856874465942383, "train/tau_mean": 24.508662700653076, "train/tau_std": 2.0933686941862106 }, { "epoch": 0.17795160107553165, "grad_norm": 2.9451212882995605, "learning_rate": 4.955448937628513e-06, "loss": 0.617, "step": 91, "train/lambda_m_mean": 0.04604492196813226, "train/mu_mean": 0.612684391438961, "train/mu_std": 0.20704650320112705, "train/rewards_chosen_mean": 0.6381077766418457, "train/rewards_rejected_mean": -0.19230079650878906, "train/tau_mean": 24.62921118736267, "train/tau_std": 2.1130027174949646 }, { "epoch": 0.17990711317526278, "grad_norm": 2.6974692344665527, "learning_rate": 4.952021932830706e-06, "loss": 0.5775, "step": 92, "train/lambda_m_mean": 0.044189452892169356, "train/mu_mean": 0.6388618275523186, "train/mu_std": 0.2132438775151968, "train/rewards_chosen_mean": 0.36050915718078613, "train/rewards_rejected_mean": -0.5853263139724731, "train/tau_mean": 24.759086847305298, "train/tau_std": 2.066805437207222 }, { "epoch": 0.1818626252749939, "grad_norm": 2.8913583755493164, "learning_rate": 4.9485949280329e-06, "loss": 0.5816, "step": 93, "train/lambda_m_mean": 0.04799804696813226, "train/mu_mean": 0.6349750235676765, "train/mu_std": 0.20661992020905018, "train/rewards_chosen_mean": 0.07923769950866699, "train/rewards_rejected_mean": -0.8646920919418335, "train/tau_mean": 24.32356548309326, "train/tau_std": 2.4148282557725906 }, { "epoch": 0.183818137374725, "grad_norm": 2.4111862182617188, "learning_rate": 4.945167923235093e-06, "loss": 0.6015, "step": 94, "train/lambda_m_mean": 0.04350585979409516, "train/mu_mean": 0.616554506123066, "train/mu_std": 0.18699581921100616, "train/rewards_chosen_mean": -0.4596061706542969, "train/rewards_rejected_mean": -1.2588005065917969, "train/tau_mean": 24.457905054092407, "train/tau_std": 1.9728475362062454 }, { "epoch": 0.18577364947445613, "grad_norm": 2.3960251808166504, "learning_rate": 4.941740918437286e-06, "loss": 0.5746, "step": 95, "train/lambda_m_mean": 0.044628906762227416, "train/mu_mean": 0.6315751001238823, "train/mu_std": 0.200370779260993, "train/rewards_chosen_mean": -0.6302642822265625, "train/rewards_rejected_mean": -1.5096187889575958, "train/tau_mean": 24.20305037498474, "train/tau_std": 2.1215746998786926 }, { "epoch": 0.18772916157418723, "grad_norm": 2.8553154468536377, "learning_rate": 4.93831391363948e-06, "loss": 0.5853, "step": 96, "train/lambda_m_mean": 0.044433594681322575, "train/mu_mean": 0.6301626488566399, "train/mu_std": 0.208343006670475, "train/rewards_chosen_mean": -0.876983642578125, "train/rewards_rejected_mean": -1.7948741912841797, "train/tau_mean": 24.114664316177368, "train/tau_std": 2.177145630121231 }, { "epoch": 0.18968467367391836, "grad_norm": 2.8842661380767822, "learning_rate": 4.934886908841673e-06, "loss": 0.6163, "step": 97, "train/lambda_m_mean": 0.04458007775247097, "train/mu_mean": 0.6163120269775391, "train/mu_std": 0.20963509008288383, "train/rewards_chosen_mean": -1.2767333984375, "train/rewards_rejected_mean": -2.1307373046875, "train/tau_mean": 24.35628652572632, "train/tau_std": 1.9794245213270187 }, { "epoch": 0.19164018577364947, "grad_norm": 2.3437910079956055, "learning_rate": 4.931459904043866e-06, "loss": 0.5586, "step": 98, "train/lambda_m_mean": 0.0461914069019258, "train/mu_mean": 0.6497498005628586, "train/mu_std": 0.21293544210493565, "train/rewards_chosen_mean": -1.385284423828125, "train/rewards_rejected_mean": -2.4285888671875, "train/tau_mean": 24.266958475112915, "train/tau_std": 2.3243183940649033 }, { "epoch": 0.1935956978733806, "grad_norm": 3.1281299591064453, "learning_rate": 4.928032899246059e-06, "loss": 0.596, "step": 99, "train/lambda_m_mean": 0.043554688803851604, "train/mu_mean": 0.6289903521537781, "train/mu_std": 0.21112661063671112, "train/rewards_chosen_mean": -1.658782958984375, "train/rewards_rejected_mean": -2.486713409423828, "train/tau_mean": 23.79543709754944, "train/tau_std": 2.07799232006073 }, { "epoch": 0.1955512099731117, "grad_norm": 2.31160831451416, "learning_rate": 4.924605894448253e-06, "loss": 0.5842, "step": 100, "train/lambda_m_mean": 0.04067382914945483, "train/mu_mean": 0.6253582760691643, "train/mu_std": 0.21116743609309196, "train/rewards_chosen_mean": -1.4107666015625, "train/rewards_rejected_mean": -2.2704620361328125, "train/tau_mean": 23.711310625076294, "train/tau_std": 2.3933853656053543 }, { "epoch": 0.19750672207284284, "grad_norm": 1.7066861391067505, "learning_rate": 4.9211788896504455e-06, "loss": 0.5802, "step": 101, "train/lambda_m_mean": 0.04487304715439677, "train/mu_mean": 0.6290720254182816, "train/mu_std": 0.2078013624995947, "train/rewards_chosen_mean": -1.38299560546875, "train/rewards_rejected_mean": -2.2540283203125, "train/tau_mean": 23.366575002670288, "train/tau_std": 2.3724118918180466 }, { "epoch": 0.19946223417257394, "grad_norm": 2.0934622287750244, "learning_rate": 4.917751884852639e-06, "loss": 0.5933, "step": 102, "train/lambda_m_mean": 0.03906250139698386, "train/mu_mean": 0.6174563020467758, "train/mu_std": 0.19972298108041286, "train/rewards_chosen_mean": -0.9839744567871094, "train/rewards_rejected_mean": -1.8029556274414062, "train/tau_mean": 23.881397485733032, "train/tau_std": 2.1675764322280884 }, { "epoch": 0.20141774627230505, "grad_norm": 2.5239086151123047, "learning_rate": 4.914324880054833e-06, "loss": 0.5961, "step": 103, "train/lambda_m_mean": 0.044531251303851604, "train/mu_mean": 0.628866657614708, "train/mu_std": 0.22418470866978168, "train/rewards_chosen_mean": -0.7528743743896484, "train/rewards_rejected_mean": -1.75634765625, "train/tau_mean": 23.876564502716064, "train/tau_std": 2.5330270677804947 }, { "epoch": 0.20337325837203618, "grad_norm": 2.2057158946990967, "learning_rate": 4.910897875257026e-06, "loss": 0.6062, "step": 104, "train/lambda_m_mean": 0.0442382819019258, "train/mu_mean": 0.6127707585692406, "train/mu_std": 0.19288966432213783, "train/rewards_chosen_mean": -0.6462497711181641, "train/rewards_rejected_mean": -1.5092353820800781, "train/tau_mean": 23.63750982284546, "train/tau_std": 2.578868329524994 }, { "epoch": 0.20532877047176729, "grad_norm": 2.4346351623535156, "learning_rate": 4.9074708704592185e-06, "loss": 0.5824, "step": 105, "train/lambda_m_mean": 0.04379882896319032, "train/mu_mean": 0.6323531121015549, "train/mu_std": 0.21500298753380775, "train/rewards_chosen_mean": -0.3213534355163574, "train/rewards_rejected_mean": -1.3083314895629883, "train/tau_mean": 24.025922536849976, "train/tau_std": 2.641270935535431 }, { "epoch": 0.20728428257149842, "grad_norm": 2.4963927268981934, "learning_rate": 4.904043865661413e-06, "loss": 0.5953, "step": 106, "train/lambda_m_mean": 0.043261718936264515, "train/mu_mean": 0.6202164068818092, "train/mu_std": 0.20497540198266506, "train/rewards_chosen_mean": -0.05702781677246094, "train/rewards_rejected_mean": -0.9136719703674316, "train/tau_mean": 24.081847190856934, "train/tau_std": 2.3635547012090683 }, { "epoch": 0.20923979467122952, "grad_norm": 2.63403058052063, "learning_rate": 4.900616860863606e-06, "loss": 0.5952, "step": 107, "train/lambda_m_mean": 0.04111328208819032, "train/mu_mean": 0.6199616864323616, "train/mu_std": 0.2012209165841341, "train/rewards_chosen_mean": 0.1580992341041565, "train/rewards_rejected_mean": -0.6992754936218262, "train/tau_mean": 24.247599840164185, "train/tau_std": 2.2931104749441147 }, { "epoch": 0.21119530677096066, "grad_norm": 2.4994828701019287, "learning_rate": 4.897189856065799e-06, "loss": 0.5723, "step": 108, "train/lambda_m_mean": 0.039208985632285476, "train/mu_mean": 0.6342691406607628, "train/mu_std": 0.21239744871854782, "train/rewards_chosen_mean": 0.3768758773803711, "train/rewards_rejected_mean": -0.5364894866943359, "train/tau_mean": 24.452682495117188, "train/tau_std": 2.333046555519104 }, { "epoch": 0.21315081887069176, "grad_norm": 1.9223628044128418, "learning_rate": 4.893762851267992e-06, "loss": 0.5487, "step": 109, "train/lambda_m_mean": 0.039355470798909664, "train/mu_mean": 0.6492720618844032, "train/mu_std": 0.2148063201457262, "train/rewards_chosen_mean": 0.3517448529601097, "train/rewards_rejected_mean": -0.7147108912467957, "train/tau_mean": 24.630282163619995, "train/tau_std": 2.4668149054050446 }, { "epoch": 0.2151063309704229, "grad_norm": 2.2066640853881836, "learning_rate": 4.890335846470186e-06, "loss": 0.6079, "step": 110, "train/lambda_m_mean": 0.04628906352445483, "train/mu_mean": 0.6112203299999237, "train/mu_std": 0.19621816091239452, "train/rewards_chosen_mean": 0.14479076862335205, "train/rewards_rejected_mean": -0.5925147533416748, "train/tau_mean": 24.07844305038452, "train/tau_std": 2.4213491082191467 }, { "epoch": 0.217061843070154, "grad_norm": 2.73237943649292, "learning_rate": 4.886908841672379e-06, "loss": 0.5569, "step": 111, "train/lambda_m_mean": 0.04682617262005806, "train/mu_mean": 0.6451166868209839, "train/mu_std": 0.19585498981177807, "train/rewards_chosen_mean": 0.40224742889404297, "train/rewards_rejected_mean": -0.5874028205871582, "train/tau_mean": 25.032257318496704, "train/tau_std": 2.690868616104126 }, { "epoch": 0.2190173551698851, "grad_norm": 3.2309014797210693, "learning_rate": 4.883481836874572e-06, "loss": 0.594, "step": 112, "train/lambda_m_mean": 0.04062500083819032, "train/mu_mean": 0.6251253709197044, "train/mu_std": 0.21068532951176167, "train/rewards_chosen_mean": 0.3913900852203369, "train/rewards_rejected_mean": -0.5379667282104492, "train/tau_mean": 25.7231662273407, "train/tau_std": 2.562523305416107 }, { "epoch": 0.22097286726961624, "grad_norm": 3.6422348022460938, "learning_rate": 4.8800548320767655e-06, "loss": 0.5925, "step": 113, "train/lambda_m_mean": 0.04189453087747097, "train/mu_mean": 0.6352258548140526, "train/mu_std": 0.22300923988223076, "train/rewards_chosen_mean": 0.49973368644714355, "train/rewards_rejected_mean": -0.5322012901306152, "train/tau_mean": 25.70559549331665, "train/tau_std": 2.5831172466278076 }, { "epoch": 0.22292837936934734, "grad_norm": 3.322655200958252, "learning_rate": 4.876627827278959e-06, "loss": 0.6, "step": 114, "train/lambda_m_mean": 0.047656252048909664, "train/mu_mean": 0.6241191402077675, "train/mu_std": 0.20632282085716724, "train/rewards_chosen_mean": 0.44190406799316406, "train/rewards_rejected_mean": -0.49608850479125977, "train/tau_mean": 25.595744132995605, "train/tau_std": 2.6930888146162033 }, { "epoch": 0.22488389146907847, "grad_norm": 3.1697452068328857, "learning_rate": 4.873200822481152e-06, "loss": 0.6264, "step": 115, "train/lambda_m_mean": 0.04257812676951289, "train/mu_mean": 0.6020521968603134, "train/mu_std": 0.20214700978249311, "train/rewards_chosen_mean": 0.26317495107650757, "train/rewards_rejected_mean": -0.45170629024505615, "train/tau_mean": 25.662132263183594, "train/tau_std": 2.685509741306305 }, { "epoch": 0.22683940356880958, "grad_norm": 2.640098810195923, "learning_rate": 4.869773817683345e-06, "loss": 0.5547, "step": 116, "train/lambda_m_mean": 0.04643554799258709, "train/mu_mean": 0.6507590413093567, "train/mu_std": 0.21786135621368885, "train/rewards_chosen_mean": 0.2100362777709961, "train/rewards_rejected_mean": -0.7654316425323486, "train/tau_mean": 25.55742073059082, "train/tau_std": 2.681734561920166 }, { "epoch": 0.2287949156685407, "grad_norm": 1.802557349205017, "learning_rate": 4.8663468128855385e-06, "loss": 0.585, "step": 117, "train/lambda_m_mean": 0.04648437676951289, "train/mu_mean": 0.6235530525445938, "train/mu_std": 0.19762133806943893, "train/rewards_chosen_mean": -0.17572975158691406, "train/rewards_rejected_mean": -0.9928297996520996, "train/tau_mean": 25.202665090560913, "train/tau_std": 2.396011531352997 }, { "epoch": 0.23075042776827182, "grad_norm": 1.6736953258514404, "learning_rate": 4.862919808087732e-06, "loss": 0.5968, "step": 118, "train/lambda_m_mean": 0.04755859589204192, "train/mu_mean": 0.6154100894927979, "train/mu_std": 0.1909904759377241, "train/rewards_chosen_mean": -0.3342266082763672, "train/rewards_rejected_mean": -1.0984973907470703, "train/tau_mean": 25.22707962989807, "train/tau_std": 2.794050306081772 }, { "epoch": 0.23270593986800292, "grad_norm": 1.6944215297698975, "learning_rate": 4.859492803289925e-06, "loss": 0.5862, "step": 119, "train/lambda_m_mean": 0.0437500006519258, "train/mu_mean": 0.6279813051223755, "train/mu_std": 0.2040298953652382, "train/rewards_chosen_mean": -0.4481620788574219, "train/rewards_rejected_mean": -1.322580337524414, "train/tau_mean": 25.63265085220337, "train/tau_std": 2.410101145505905 }, { "epoch": 0.23466145196773405, "grad_norm": 1.797908902168274, "learning_rate": 4.856065798492118e-06, "loss": 0.5827, "step": 120, "train/lambda_m_mean": 0.045703125186264515, "train/mu_mean": 0.6312213093042374, "train/mu_std": 0.1996159227564931, "train/rewards_chosen_mean": -0.26151275634765625, "train/rewards_rejected_mean": -1.2221755981445312, "train/tau_mean": 25.450315952301025, "train/tau_std": 2.564430817961693 }, { "epoch": 0.23661696406746516, "grad_norm": 1.9030839204788208, "learning_rate": 4.8526387936943116e-06, "loss": 0.5385, "step": 121, "train/lambda_m_mean": 0.046777345007285476, "train/mu_mean": 0.6691904291510582, "train/mu_std": 0.22994600608944893, "train/rewards_chosen_mean": -0.23442840576171875, "train/rewards_rejected_mean": -1.577855110168457, "train/tau_mean": 25.89699411392212, "train/tau_std": 2.9117511212825775 }, { "epoch": 0.2385724761671963, "grad_norm": 2.0009665489196777, "learning_rate": 4.849211788896505e-06, "loss": 0.6109, "step": 122, "train/lambda_m_mean": 0.04345703241415322, "train/mu_mean": 0.6112360432744026, "train/mu_std": 0.20013593509793282, "train/rewards_chosen_mean": -0.5345396995544434, "train/rewards_rejected_mean": -1.3594163656234741, "train/tau_mean": 25.524922132492065, "train/tau_std": 2.4939612448215485 }, { "epoch": 0.2405279882669274, "grad_norm": 1.781427025794983, "learning_rate": 4.845784784098698e-06, "loss": 0.5498, "step": 123, "train/lambda_m_mean": 0.04389648558571935, "train/mu_mean": 0.6543920934200287, "train/mu_std": 0.22215928882360458, "train/rewards_chosen_mean": -0.53265380859375, "train/rewards_rejected_mean": -1.615509033203125, "train/tau_mean": 25.508813619613647, "train/tau_std": 2.6306516230106354 }, { "epoch": 0.24248350036665853, "grad_norm": 1.3510771989822388, "learning_rate": 4.842357779300891e-06, "loss": 0.538, "step": 124, "train/lambda_m_mean": 0.04765625111758709, "train/mu_mean": 0.6615689396858215, "train/mu_std": 0.19643156696110964, "train/rewards_chosen_mean": -0.551384449005127, "train/rewards_rejected_mean": -1.6623992919921875, "train/tau_mean": 25.250513315200806, "train/tau_std": 2.5747044682502747 }, { "epoch": 0.24443901246638963, "grad_norm": 1.4695203304290771, "learning_rate": 4.838930774503085e-06, "loss": 0.5927, "step": 125, "train/lambda_m_mean": 0.04599609365686774, "train/mu_mean": 0.6164591237902641, "train/mu_std": 0.1941315159201622, "train/rewards_chosen_mean": -0.7096099853515625, "train/rewards_rejected_mean": -1.4743194580078125, "train/tau_mean": 24.698763370513916, "train/tau_std": 2.8026885092258453 }, { "epoch": 0.24639452456612077, "grad_norm": 1.7320611476898193, "learning_rate": 4.835503769705278e-06, "loss": 0.5692, "step": 126, "train/lambda_m_mean": 0.03964843926951289, "train/mu_mean": 0.6332984939217567, "train/mu_std": 0.20701955072581768, "train/rewards_chosen_mean": -0.622894287109375, "train/rewards_rejected_mean": -1.547760009765625, "train/tau_mean": 25.246586561203003, "train/tau_std": 2.70344814658165 }, { "epoch": 0.24835003666585187, "grad_norm": 1.9018486738204956, "learning_rate": 4.832076764907471e-06, "loss": 0.6512, "step": 127, "train/lambda_m_mean": 0.04389648512005806, "train/mu_mean": 0.5818619728088379, "train/mu_std": 0.17527224030345678, "train/rewards_chosen_mean": -0.9288177490234375, "train/rewards_rejected_mean": -1.5170135498046875, "train/tau_mean": 24.987332582473755, "train/tau_std": 2.425224855542183 }, { "epoch": 0.250305548765583, "grad_norm": 1.4879480600357056, "learning_rate": 4.828649760109664e-06, "loss": 0.5785, "step": 128, "train/lambda_m_mean": 0.03916015778668225, "train/mu_mean": 0.6273605301976204, "train/mu_std": 0.20895227044820786, "train/rewards_chosen_mean": -0.7417831420898438, "train/rewards_rejected_mean": -1.674041748046875, "train/tau_mean": 25.658961296081543, "train/tau_std": 2.6481430530548096 }, { "epoch": 0.2522610608653141, "grad_norm": 1.7023433446884155, "learning_rate": 4.825222755311858e-06, "loss": 0.6369, "step": 129, "train/lambda_m_mean": 0.03950195387005806, "train/mu_mean": 0.5839325934648514, "train/mu_std": 0.1785400714725256, "train/rewards_chosen_mean": -0.9156951904296875, "train/rewards_rejected_mean": -1.5158500671386719, "train/tau_mean": 25.560078620910645, "train/tau_std": 2.4249509125947952 }, { "epoch": 0.25421657296504524, "grad_norm": 1.6528823375701904, "learning_rate": 4.821795750514051e-06, "loss": 0.5706, "step": 130, "train/lambda_m_mean": 0.04345703078433871, "train/mu_mean": 0.6376466602087021, "train/mu_std": 0.20822152495384216, "train/rewards_chosen_mean": -0.72198486328125, "train/rewards_rejected_mean": -1.7181549072265625, "train/tau_mean": 25.8989474773407, "train/tau_std": 2.720160275697708 }, { "epoch": 0.25617208506477634, "grad_norm": 1.9284757375717163, "learning_rate": 4.818368745716244e-06, "loss": 0.5456, "step": 131, "train/lambda_m_mean": 0.03906250139698386, "train/mu_mean": 0.6510007977485657, "train/mu_std": 0.217148557305336, "train/rewards_chosen_mean": -0.7251739501953125, "train/rewards_rejected_mean": -1.863616943359375, "train/tau_mean": 26.47706699371338, "train/tau_std": 2.521327465772629 }, { "epoch": 0.25812759716450745, "grad_norm": 1.9019614458084106, "learning_rate": 4.814941740918438e-06, "loss": 0.5968, "step": 132, "train/lambda_m_mean": 0.04350586095824838, "train/mu_mean": 0.6157266944646835, "train/mu_std": 0.19423697516322136, "train/rewards_chosen_mean": -0.8131179809570312, "train/rewards_rejected_mean": -1.642547607421875, "train/tau_mean": 26.444839239120483, "train/tau_std": 2.7267803251743317 }, { "epoch": 0.26008310926423855, "grad_norm": 2.91790771484375, "learning_rate": 4.811514736120631e-06, "loss": 0.6408, "step": 133, "train/lambda_m_mean": 0.04033203236758709, "train/mu_mean": 0.5869394615292549, "train/mu_std": 0.19094444625079632, "train/rewards_chosen_mean": -0.903533935546875, "train/rewards_rejected_mean": -1.5605010986328125, "train/tau_mean": 26.73390245437622, "train/tau_std": 2.9640394747257233 }, { "epoch": 0.2620386213639697, "grad_norm": 2.9907772541046143, "learning_rate": 4.808087731322824e-06, "loss": 0.599, "step": 134, "train/lambda_m_mean": 0.038330079056322575, "train/mu_mean": 0.6109799444675446, "train/mu_std": 0.18937359005212784, "train/rewards_chosen_mean": -0.9138069152832031, "train/rewards_rejected_mean": -1.6602516174316406, "train/tau_mean": 27.06642174720764, "train/tau_std": 2.4956187307834625 }, { "epoch": 0.2639941334637008, "grad_norm": 3.262267827987671, "learning_rate": 4.804660726525018e-06, "loss": 0.5703, "step": 135, "train/lambda_m_mean": 0.04545898595824838, "train/mu_mean": 0.6393492221832275, "train/mu_std": 0.20671778917312622, "train/rewards_chosen_mean": -0.7681799829006195, "train/rewards_rejected_mean": -1.7361125946044922, "train/tau_mean": 27.01417303085327, "train/tau_std": 2.7850188314914703 }, { "epoch": 0.2659496455634319, "grad_norm": 4.241480350494385, "learning_rate": 4.8012337217272105e-06, "loss": 0.6239, "step": 136, "train/lambda_m_mean": 0.0410644537769258, "train/mu_mean": 0.6020227745175362, "train/mu_std": 0.20124712958931923, "train/rewards_chosen_mean": -0.7487945556640625, "train/rewards_rejected_mean": -1.4692821502685547, "train/tau_mean": 27.584481716156006, "train/tau_std": 2.683669000864029 }, { "epoch": 0.26790515766316303, "grad_norm": 3.938140630722046, "learning_rate": 4.797806716929404e-06, "loss": 0.5712, "step": 137, "train/lambda_m_mean": 0.04580078087747097, "train/mu_mean": 0.6440932676196098, "train/mu_std": 0.21739943139255047, "train/rewards_chosen_mean": -0.6624603271484375, "train/rewards_rejected_mean": -1.626887321472168, "train/tau_mean": 27.423840284347534, "train/tau_std": 3.124852776527405 }, { "epoch": 0.26986066976289413, "grad_norm": 4.5492024421691895, "learning_rate": 4.794379712131597e-06, "loss": 0.6133, "step": 138, "train/lambda_m_mean": 0.04345703171566129, "train/mu_mean": 0.6133077442646027, "train/mu_std": 0.208742156624794, "train/rewards_chosen_mean": -0.7626953721046448, "train/rewards_rejected_mean": -1.5533819198608398, "train/tau_mean": 27.641611337661743, "train/tau_std": 2.8817501962184906 }, { "epoch": 0.2718161818626253, "grad_norm": 3.9008657932281494, "learning_rate": 4.790952707333791e-06, "loss": 0.5893, "step": 139, "train/lambda_m_mean": 0.04165039071813226, "train/mu_mean": 0.6215837150812149, "train/mu_std": 0.20284290425479412, "train/rewards_chosen_mean": -0.6360616683959961, "train/rewards_rejected_mean": -1.4015636444091797, "train/tau_mean": 27.820321798324585, "train/tau_std": 2.704389214515686 }, { "epoch": 0.2737716939623564, "grad_norm": 2.988138198852539, "learning_rate": 4.7875257025359836e-06, "loss": 0.5654, "step": 140, "train/lambda_m_mean": 0.04980468889698386, "train/mu_mean": 0.6478567346930504, "train/mu_std": 0.21891424991190434, "train/rewards_chosen_mean": -0.7120685577392578, "train/rewards_rejected_mean": -1.7090377807617188, "train/tau_mean": 26.7368745803833, "train/tau_std": 2.9164544194936752 }, { "epoch": 0.2757272060620875, "grad_norm": 3.2865772247314453, "learning_rate": 4.784098697738177e-06, "loss": 0.5478, "step": 141, "train/lambda_m_mean": 0.04282226739451289, "train/mu_mean": 0.6556858941912651, "train/mu_std": 0.21322820149362087, "train/rewards_chosen_mean": -0.696044921875, "train/rewards_rejected_mean": -1.7076244354248047, "train/tau_mean": 26.99809193611145, "train/tau_std": 2.7171912491321564 }, { "epoch": 0.2776827181618186, "grad_norm": 3.4433727264404297, "learning_rate": 4.780671692940371e-06, "loss": 0.6204, "step": 142, "train/lambda_m_mean": 0.043847656808793545, "train/mu_mean": 0.5988790094852448, "train/mu_std": 0.1822637002915144, "train/rewards_chosen_mean": -0.8330001831054688, "train/rewards_rejected_mean": -1.452833890914917, "train/tau_mean": 26.876967430114746, "train/tau_std": 2.868125230073929 }, { "epoch": 0.27963823026154977, "grad_norm": 2.348418712615967, "learning_rate": 4.777244688142564e-06, "loss": 0.5669, "step": 143, "train/lambda_m_mean": 0.04887695470824838, "train/mu_mean": 0.6424141973257065, "train/mu_std": 0.2145171444863081, "train/rewards_chosen_mean": -0.5233707427978516, "train/rewards_rejected_mean": -1.4725818634033203, "train/tau_mean": 26.327656745910645, "train/tau_std": 2.751773327589035 }, { "epoch": 0.2815937423612809, "grad_norm": 2.8349766731262207, "learning_rate": 4.773817683344757e-06, "loss": 0.5377, "step": 144, "train/lambda_m_mean": 0.04506836086511612, "train/mu_mean": 0.6625234186649323, "train/mu_std": 0.20729079470038414, "train/rewards_chosen_mean": -0.485201358795166, "train/rewards_rejected_mean": -1.6046819686889648, "train/tau_mean": 26.652358293533325, "train/tau_std": 2.6364504396915436 }, { "epoch": 0.283549254461012, "grad_norm": 3.5538110733032227, "learning_rate": 4.770390678546951e-06, "loss": 0.5732, "step": 145, "train/lambda_m_mean": 0.04604492336511612, "train/mu_mean": 0.6403373256325722, "train/mu_std": 0.21305623836815357, "train/rewards_chosen_mean": -0.39670658111572266, "train/rewards_rejected_mean": -1.4084968566894531, "train/tau_mean": 26.899917602539062, "train/tau_std": 2.792020946741104 }, { "epoch": 0.2855047665607431, "grad_norm": 3.996368408203125, "learning_rate": 4.766963673749144e-06, "loss": 0.5625, "step": 146, "train/lambda_m_mean": 0.04321289015933871, "train/mu_mean": 0.652526967227459, "train/mu_std": 0.22363007254898548, "train/rewards_chosen_mean": -0.29175758361816406, "train/rewards_rejected_mean": -1.5410418510437012, "train/tau_mean": 27.010756015777588, "train/tau_std": 3.0341694355010986 }, { "epoch": 0.2874602786604742, "grad_norm": 4.9726786613464355, "learning_rate": 4.763536668951336e-06, "loss": 0.5711, "step": 147, "train/lambda_m_mean": 0.046728517627343535, "train/mu_mean": 0.6522921547293663, "train/mu_std": 0.2203656192868948, "train/rewards_chosen_mean": -0.334078311920166, "train/rewards_rejected_mean": -1.4618741273880005, "train/tau_mean": 26.614062786102295, "train/tau_std": 3.080471783876419 }, { "epoch": 0.28941579076020535, "grad_norm": 3.452028751373291, "learning_rate": 4.76010966415353e-06, "loss": 0.5825, "step": 148, "train/lambda_m_mean": 0.04360351711511612, "train/mu_mean": 0.6290916800498962, "train/mu_std": 0.2092552911490202, "train/rewards_chosen_mean": -0.5915489196777344, "train/rewards_rejected_mean": -1.5141372680664062, "train/tau_mean": 26.400896310806274, "train/tau_std": 2.602171242237091 }, { "epoch": 0.29137130285993645, "grad_norm": 3.579153537750244, "learning_rate": 4.756682659355724e-06, "loss": 0.5514, "step": 149, "train/lambda_m_mean": 0.04243164183571935, "train/mu_mean": 0.6575019955635071, "train/mu_std": 0.22844257950782776, "train/rewards_chosen_mean": -0.8090381622314453, "train/rewards_rejected_mean": -1.9757652282714844, "train/tau_mean": 26.852332592010498, "train/tau_std": 2.9408280551433563 }, { "epoch": 0.29332681495966756, "grad_norm": 3.9159436225891113, "learning_rate": 4.753255654557917e-06, "loss": 0.5919, "step": 150, "train/lambda_m_mean": 0.042480469681322575, "train/mu_mean": 0.6268450319766998, "train/mu_std": 0.21570908837020397, "train/rewards_chosen_mean": -1.1468772888183594, "train/rewards_rejected_mean": -2.048686981201172, "train/tau_mean": 26.299822092056274, "train/tau_std": 2.4468893259763718 }, { "epoch": 0.29528232705939866, "grad_norm": 3.465712070465088, "learning_rate": 4.7498286497601095e-06, "loss": 0.5374, "step": 151, "train/lambda_m_mean": 0.04311523539945483, "train/mu_mean": 0.6493482664227486, "train/mu_std": 0.19876410253345966, "train/rewards_chosen_mean": -1.223175048828125, "train/rewards_rejected_mean": -2.1281890869140625, "train/tau_mean": 25.67338538169861, "train/tau_std": 2.5265377163887024 }, { "epoch": 0.2972378391591298, "grad_norm": 2.9857399463653564, "learning_rate": 4.7464016449623036e-06, "loss": 0.5651, "step": 152, "train/lambda_m_mean": 0.0447265631519258, "train/mu_mean": 0.6336534842848778, "train/mu_std": 0.1947175618261099, "train/rewards_chosen_mean": -1.534423828125, "train/rewards_rejected_mean": -2.3770294189453125, "train/tau_mean": 25.28322958946228, "train/tau_std": 2.578656882047653 }, { "epoch": 0.29919335125886093, "grad_norm": 2.6458816528320312, "learning_rate": 4.742974640164497e-06, "loss": 0.5455, "step": 153, "train/lambda_m_mean": 0.0444824225269258, "train/mu_mean": 0.6537522971630096, "train/mu_std": 0.20989143289625645, "train/rewards_chosen_mean": -1.273406982421875, "train/rewards_rejected_mean": -2.280609130859375, "train/tau_mean": 25.42849612236023, "train/tau_std": 2.324952557682991 }, { "epoch": 0.30114886335859203, "grad_norm": 4.316255569458008, "learning_rate": 4.73954763536669e-06, "loss": 0.5388, "step": 154, "train/lambda_m_mean": 0.04218750074505806, "train/mu_mean": 0.6548971831798553, "train/mu_std": 0.21381761692464352, "train/rewards_chosen_mean": -1.4001178741455078, "train/rewards_rejected_mean": -2.4821624755859375, "train/tau_mean": 26.173844575881958, "train/tau_std": 2.49683478474617 }, { "epoch": 0.30310437545832314, "grad_norm": 4.156078338623047, "learning_rate": 4.736120630568883e-06, "loss": 0.5325, "step": 155, "train/lambda_m_mean": 0.044580078683793545, "train/mu_mean": 0.6703333407640457, "train/mu_std": 0.21581625565886497, "train/rewards_chosen_mean": -1.312154360115528, "train/rewards_rejected_mean": -2.5652542114257812, "train/tau_mean": 26.588244199752808, "train/tau_std": 2.7051835358142853 }, { "epoch": 0.30505988755805424, "grad_norm": 6.308278560638428, "learning_rate": 4.732693625771077e-06, "loss": 0.459, "step": 156, "train/lambda_m_mean": 0.04125976609066129, "train/mu_mean": 0.7337503209710121, "train/mu_std": 0.2357003167271614, "train/rewards_chosen_mean": -1.1069869995117188, "train/rewards_rejected_mean": -2.9962310791015625, "train/tau_mean": 27.564462900161743, "train/tau_std": 2.7115143686532974 }, { "epoch": 0.3070153996577854, "grad_norm": 6.217889785766602, "learning_rate": 4.72926662097327e-06, "loss": 0.5094, "step": 157, "train/lambda_m_mean": 0.043066407553851604, "train/mu_mean": 0.6922696456313133, "train/mu_std": 0.23090296797454357, "train/rewards_chosen_mean": -1.4465408325195312, "train/rewards_rejected_mean": -2.886515736579895, "train/tau_mean": 27.869638204574585, "train/tau_std": 2.5347485691308975 }, { "epoch": 0.3089709117575165, "grad_norm": 6.817811489105225, "learning_rate": 4.725839616175463e-06, "loss": 0.55, "step": 158, "train/lambda_m_mean": 0.04453125037252903, "train/mu_mean": 0.6745952889323235, "train/mu_std": 0.2427505198866129, "train/rewards_chosen_mean": -1.1239662170410156, "train/rewards_rejected_mean": -2.526763916015625, "train/tau_mean": 27.771986961364746, "train/tau_std": 2.5851361751556396 }, { "epoch": 0.3109264238572476, "grad_norm": 22.235116958618164, "learning_rate": 4.722412611377656e-06, "loss": 0.6702, "step": 159, "train/lambda_m_mean": 0.04262695414945483, "train/mu_mean": 0.6507194712758064, "train/mu_std": 0.23523969389498234, "train/rewards_chosen_mean": -1.106278896331787, "train/rewards_rejected_mean": -2.2313404083251953, "train/tau_mean": 28.088875770568848, "train/tau_std": 2.8221145272254944 }, { "epoch": 0.3128819359569787, "grad_norm": 5.920804977416992, "learning_rate": 4.71898560657985e-06, "loss": 0.5292, "step": 160, "train/lambda_m_mean": 0.04218750121071935, "train/mu_mean": 0.6689004004001617, "train/mu_std": 0.22622194327414036, "train/rewards_chosen_mean": -0.8507051467895508, "train/rewards_rejected_mean": -1.9559226036071777, "train/tau_mean": 27.77491021156311, "train/tau_std": 2.3832489401102066 }, { "epoch": 0.3148374480567098, "grad_norm": 5.338804721832275, "learning_rate": 4.715558601782043e-06, "loss": 0.4828, "step": 161, "train/lambda_m_mean": 0.04238281352445483, "train/mu_mean": 0.6949725225567818, "train/mu_std": 0.2131840456277132, "train/rewards_chosen_mean": -0.717595100402832, "train/rewards_rejected_mean": -1.9265213012695312, "train/tau_mean": 27.184593439102173, "train/tau_std": 2.481859266757965 }, { "epoch": 0.316792960156441, "grad_norm": 3.7878310680389404, "learning_rate": 4.712131596984236e-06, "loss": 0.5343, "step": 162, "train/lambda_m_mean": 0.04638672014698386, "train/mu_mean": 0.6574986055493355, "train/mu_std": 0.20435345359146595, "train/rewards_chosen_mean": -0.8173980712890625, "train/rewards_rejected_mean": -1.7280540466308594, "train/tau_mean": 26.58936834335327, "train/tau_std": 2.480402961373329 }, { "epoch": 0.3187484722561721, "grad_norm": 3.529370069503784, "learning_rate": 4.7087045921864295e-06, "loss": 0.5513, "step": 163, "train/lambda_m_mean": 0.04487304715439677, "train/mu_mean": 0.6453143283724785, "train/mu_std": 0.2044514250010252, "train/rewards_chosen_mean": -0.7511367797851562, "train/rewards_rejected_mean": -1.5949506759643555, "train/tau_mean": 26.72022795677185, "train/tau_std": 2.432288557291031 }, { "epoch": 0.3207039843559032, "grad_norm": 3.4380404949188232, "learning_rate": 4.705277587388623e-06, "loss": 0.5789, "step": 164, "train/lambda_m_mean": 0.03950195340439677, "train/mu_mean": 0.6235409304499626, "train/mu_std": 0.187337726354599, "train/rewards_chosen_mean": -0.7996063232421875, "train/rewards_rejected_mean": -1.5145854949951172, "train/tau_mean": 27.023935794830322, "train/tau_std": 2.1921055763959885 }, { "epoch": 0.3226594964556343, "grad_norm": 4.033701419830322, "learning_rate": 4.701850582590816e-06, "loss": 0.5471, "step": 165, "train/lambda_m_mean": 0.0416992197278887, "train/mu_mean": 0.647043913602829, "train/mu_std": 0.20416252315044403, "train/rewards_chosen_mean": -0.9169654846191406, "train/rewards_rejected_mean": -1.827253818511963, "train/tau_mean": 27.17676615715027, "train/tau_std": 2.1125795543193817 }, { "epoch": 0.32461500855536546, "grad_norm": 3.5270373821258545, "learning_rate": 4.698423577793009e-06, "loss": 0.5096, "step": 166, "train/lambda_m_mean": 0.04433593852445483, "train/mu_mean": 0.6816716343164444, "train/mu_std": 0.21421218663454056, "train/rewards_chosen_mean": -0.5885658264160156, "train/rewards_rejected_mean": -1.7731297016143799, "train/tau_mean": 27.09552788734436, "train/tau_std": 2.416838437318802 }, { "epoch": 0.32657052065509656, "grad_norm": 4.363593578338623, "learning_rate": 4.6949965729952025e-06, "loss": 0.4978, "step": 167, "train/lambda_m_mean": 0.04321289202198386, "train/mu_mean": 0.6976286619901657, "train/mu_std": 0.23409458063542843, "train/rewards_chosen_mean": -0.14594900608062744, "train/rewards_rejected_mean": -1.5383243560791016, "train/tau_mean": 27.44638419151306, "train/tau_std": 2.503202885389328 }, { "epoch": 0.32852603275482767, "grad_norm": 5.598028182983398, "learning_rate": 4.691569568197396e-06, "loss": 0.49, "step": 168, "train/lambda_m_mean": 0.04335937602445483, "train/mu_mean": 0.701655238866806, "train/mu_std": 0.22828513011336327, "train/rewards_chosen_mean": 0.16555309295654297, "train/rewards_rejected_mean": -1.2426729202270508, "train/tau_mean": 27.792003631591797, "train/tau_std": 2.419291600584984 }, { "epoch": 0.3304815448545588, "grad_norm": 5.037827968597412, "learning_rate": 4.688142563399589e-06, "loss": 0.5518, "step": 169, "train/lambda_m_mean": 0.05180664220824838, "train/mu_mean": 0.6728912740945816, "train/mu_std": 0.23422261141240597, "train/rewards_chosen_mean": -0.012360811233520508, "train/rewards_rejected_mean": -1.246551275253296, "train/tau_mean": 27.30469846725464, "train/tau_std": 2.6190616339445114 }, { "epoch": 0.3324370569542899, "grad_norm": 4.286274433135986, "learning_rate": 4.684715558601782e-06, "loss": 0.5646, "step": 170, "train/lambda_m_mean": 0.03867187676951289, "train/mu_mean": 0.6411762982606888, "train/mu_std": 0.20965873077511787, "train/rewards_chosen_mean": 0.08492469787597656, "train/rewards_rejected_mean": -0.9141902923583984, "train/tau_mean": 27.872077226638794, "train/tau_std": 2.2575145810842514 }, { "epoch": 0.33439256905402104, "grad_norm": 3.784738063812256, "learning_rate": 4.6812885538039756e-06, "loss": 0.4999, "step": 171, "train/lambda_m_mean": 0.0439453125, "train/mu_mean": 0.6930751278996468, "train/mu_std": 0.2219990212470293, "train/rewards_chosen_mean": 0.3808533549308777, "train/rewards_rejected_mean": -1.0197031497955322, "train/tau_mean": 27.16309690475464, "train/tau_std": 2.568499833345413 }, { "epoch": 0.33634808115375214, "grad_norm": 5.221006870269775, "learning_rate": 4.677861549006169e-06, "loss": 0.5316, "step": 172, "train/lambda_m_mean": 0.0404296878259629, "train/mu_mean": 0.6696095615625381, "train/mu_std": 0.2191164344549179, "train/rewards_chosen_mean": 0.16169267892837524, "train/rewards_rejected_mean": -1.0308135747909546, "train/tau_mean": 27.242687225341797, "train/tau_std": 2.329259291291237 }, { "epoch": 0.33830359325348325, "grad_norm": 3.4633145332336426, "learning_rate": 4.674434544208362e-06, "loss": 0.5464, "step": 173, "train/lambda_m_mean": 0.04370117140933871, "train/mu_mean": 0.6551927551627159, "train/mu_std": 0.21159927546977997, "train/rewards_chosen_mean": 0.27629709243774414, "train/rewards_rejected_mean": -0.7917595505714417, "train/tau_mean": 26.954389095306396, "train/tau_std": 2.599943056702614 }, { "epoch": 0.34025910535321435, "grad_norm": 4.10396146774292, "learning_rate": 4.671007539410555e-06, "loss": 0.5602, "step": 174, "train/lambda_m_mean": 0.04423828236758709, "train/mu_mean": 0.6473362296819687, "train/mu_std": 0.20944981276988983, "train/rewards_chosen_mean": -0.08007097244262695, "train/rewards_rejected_mean": -1.0359458923339844, "train/tau_mean": 26.925304412841797, "train/tau_std": 2.5828793942928314 }, { "epoch": 0.3422146174529455, "grad_norm": 4.146261692047119, "learning_rate": 4.667580534612749e-06, "loss": 0.5536, "step": 175, "train/lambda_m_mean": 0.04311523446813226, "train/mu_mean": 0.6499398946762085, "train/mu_std": 0.21646969951689243, "train/rewards_chosen_mean": -0.30796098709106445, "train/rewards_rejected_mean": -1.292719841003418, "train/tau_mean": 27.11280345916748, "train/tau_std": 2.495930388569832 }, { "epoch": 0.3441701295526766, "grad_norm": 7.123760223388672, "learning_rate": 4.664153529814942e-06, "loss": 0.5167, "step": 176, "train/lambda_m_mean": 0.044873048551380634, "train/mu_mean": 0.6921719014644623, "train/mu_std": 0.24706633388996124, "train/rewards_chosen_mean": -0.13526630401611328, "train/rewards_rejected_mean": -1.4703474044799805, "train/tau_mean": 27.242202281951904, "train/tau_std": 3.053193509578705 }, { "epoch": 0.3461256416524077, "grad_norm": 6.633544445037842, "learning_rate": 4.660726525017135e-06, "loss": 0.4946, "step": 177, "train/lambda_m_mean": 0.043212891556322575, "train/mu_mean": 0.6990130618214607, "train/mu_std": 0.22048906050622463, "train/rewards_chosen_mean": -0.2824134826660156, "train/rewards_rejected_mean": -1.6082468032836914, "train/tau_mean": 27.583017826080322, "train/tau_std": 2.9070863723754883 }, { "epoch": 0.3480811537521388, "grad_norm": 5.729059219360352, "learning_rate": 4.657299520219329e-06, "loss": 0.5203, "step": 178, "train/lambda_m_mean": 0.046142579056322575, "train/mu_mean": 0.6805417016148567, "train/mu_std": 0.22628968209028244, "train/rewards_chosen_mean": -0.27236461639404297, "train/rewards_rejected_mean": -1.4839916229248047, "train/tau_mean": 27.499523639678955, "train/tau_std": 3.225174218416214 }, { "epoch": 0.35003666585186993, "grad_norm": 5.3893141746521, "learning_rate": 4.653872515421522e-06, "loss": 0.5192, "step": 179, "train/lambda_m_mean": 0.046923830173909664, "train/mu_mean": 0.6851552054286003, "train/mu_std": 0.22787057608366013, "train/rewards_chosen_mean": -0.49927592277526855, "train/rewards_rejected_mean": -1.7505817413330078, "train/tau_mean": 27.39947271347046, "train/tau_std": 3.158607989549637 }, { "epoch": 0.3519921779516011, "grad_norm": 8.328786849975586, "learning_rate": 4.650445510623715e-06, "loss": 0.5961, "step": 180, "train/lambda_m_mean": 0.04311523586511612, "train/mu_mean": 0.653047002851963, "train/mu_std": 0.23271936364471912, "train/rewards_chosen_mean": -0.6515828371047974, "train/rewards_rejected_mean": -1.6485986709594727, "train/tau_mean": 27.90407085418701, "train/tau_std": 3.2240538895130157 }, { "epoch": 0.3539476900513322, "grad_norm": 5.4269585609436035, "learning_rate": 4.647018505825909e-06, "loss": 0.4743, "step": 181, "train/lambda_m_mean": 0.03935546870343387, "train/mu_mean": 0.7115359976887703, "train/mu_std": 0.23587557673454285, "train/rewards_chosen_mean": -0.5297203063964844, "train/rewards_rejected_mean": -1.9179878234863281, "train/tau_mean": 28.08790135383606, "train/tau_std": 3.233437240123749 }, { "epoch": 0.3559032021510633, "grad_norm": 4.750287055969238, "learning_rate": 4.6435915010281015e-06, "loss": 0.4812, "step": 182, "train/lambda_m_mean": 0.04580078227445483, "train/mu_mean": 0.6978375464677811, "train/mu_std": 0.2168635744601488, "train/rewards_chosen_mean": -0.5354537963867188, "train/rewards_rejected_mean": -1.764617919921875, "train/tau_mean": 27.33058214187622, "train/tau_std": 3.4407248497009277 }, { "epoch": 0.3578587142507944, "grad_norm": 4.7358174324035645, "learning_rate": 4.640164496230295e-06, "loss": 0.5436, "step": 183, "train/lambda_m_mean": 0.03994140774011612, "train/mu_mean": 0.6487820744514465, "train/mu_std": 0.20179563201963902, "train/rewards_chosen_mean": -0.8763408660888672, "train/rewards_rejected_mean": -1.81744384765625, "train/tau_mean": 27.66797947883606, "train/tau_std": 3.048469662666321 }, { "epoch": 0.35981422635052557, "grad_norm": 4.5828328132629395, "learning_rate": 4.636737491432489e-06, "loss": 0.5414, "step": 184, "train/lambda_m_mean": 0.04892578162252903, "train/mu_mean": 0.6517798975110054, "train/mu_std": 0.18471317179501057, "train/rewards_chosen_mean": -0.697751522064209, "train/rewards_rejected_mean": -1.599043846130371, "train/tau_mean": 27.391613721847534, "train/tau_std": 3.239463299512863 }, { "epoch": 0.3617697384502567, "grad_norm": 4.052616596221924, "learning_rate": 4.633310486634682e-06, "loss": 0.4932, "step": 185, "train/lambda_m_mean": 0.04272461077198386, "train/mu_mean": 0.7049626931548119, "train/mu_std": 0.22676853463053703, "train/rewards_chosen_mean": -0.46086692810058594, "train/rewards_rejected_mean": -1.830862283706665, "train/tau_mean": 27.620617389678955, "train/tau_std": 3.048994392156601 }, { "epoch": 0.3637252505499878, "grad_norm": 4.329556941986084, "learning_rate": 4.6298834818368745e-06, "loss": 0.5512, "step": 186, "train/lambda_m_mean": 0.042480469681322575, "train/mu_mean": 0.6575634330511093, "train/mu_std": 0.21996956132352352, "train/rewards_chosen_mean": -0.4403853416442871, "train/rewards_rejected_mean": -1.4836616516113281, "train/tau_mean": 28.116710901260376, "train/tau_std": 3.4869944155216217 }, { "epoch": 0.3656807626497189, "grad_norm": 4.163434028625488, "learning_rate": 4.626456477039068e-06, "loss": 0.4638, "step": 187, "train/lambda_m_mean": 0.04643554845824838, "train/mu_mean": 0.7148786559700966, "train/mu_std": 0.21793462336063385, "train/rewards_chosen_mean": -0.2940521240234375, "train/rewards_rejected_mean": -1.7219657897949219, "train/tau_mean": 27.896260499954224, "train/tau_std": 3.528663694858551 }, { "epoch": 0.36763627474945, "grad_norm": 4.55344820022583, "learning_rate": 4.623029472241262e-06, "loss": 0.5178, "step": 188, "train/lambda_m_mean": 0.04472656361758709, "train/mu_mean": 0.6749962791800499, "train/mu_std": 0.21849100664258003, "train/rewards_chosen_mean": -0.05375099182128906, "train/rewards_rejected_mean": -1.2340888977050781, "train/tau_mean": 28.629404306411743, "train/tau_std": 3.669487088918686 }, { "epoch": 0.36959178684918115, "grad_norm": 3.638319253921509, "learning_rate": 4.619602467443455e-06, "loss": 0.4847, "step": 189, "train/lambda_m_mean": 0.04379882896319032, "train/mu_mean": 0.7043685466051102, "train/mu_std": 0.2322601992636919, "train/rewards_chosen_mean": 0.09563976526260376, "train/rewards_rejected_mean": -1.4107999801635742, "train/tau_mean": 28.45850682258606, "train/tau_std": 3.587068557739258 }, { "epoch": 0.37154729894891225, "grad_norm": 4.731318950653076, "learning_rate": 4.6161754626456476e-06, "loss": 0.5516, "step": 190, "train/lambda_m_mean": 0.045605470426380634, "train/mu_mean": 0.6613958328962326, "train/mu_std": 0.2275866325944662, "train/rewards_chosen_mean": 0.044704437255859375, "train/rewards_rejected_mean": -1.1424388885498047, "train/tau_mean": 28.537117958068848, "train/tau_std": 3.3650819957256317 }, { "epoch": 0.37350281104864336, "grad_norm": 5.677102088928223, "learning_rate": 4.612748457847842e-06, "loss": 0.5148, "step": 191, "train/lambda_m_mean": 0.044775391230359674, "train/mu_mean": 0.691802091896534, "train/mu_std": 0.2369480151683092, "train/rewards_chosen_mean": -0.05701875686645508, "train/rewards_rejected_mean": -1.5000495910644531, "train/tau_mean": 28.026867389678955, "train/tau_std": 3.35860612988472 }, { "epoch": 0.37545832314837446, "grad_norm": 3.735384941101074, "learning_rate": 4.609321453050035e-06, "loss": 0.4411, "step": 192, "train/lambda_m_mean": 0.04091796884313226, "train/mu_mean": 0.7256805598735809, "train/mu_std": 0.21606630645692348, "train/rewards_chosen_mean": 0.267792209982872, "train/rewards_rejected_mean": -1.3536217212677002, "train/tau_mean": 27.833999633789062, "train/tau_std": 3.545042186975479 }, { "epoch": 0.3774138352481056, "grad_norm": 5.077584266662598, "learning_rate": 4.605894448252227e-06, "loss": 0.4936, "step": 193, "train/lambda_m_mean": 0.04316406277939677, "train/mu_mean": 0.6934581026434898, "train/mu_std": 0.20876295678317547, "train/rewards_chosen_mean": 0.21542465686798096, "train/rewards_rejected_mean": -1.0487413555383682, "train/tau_mean": 27.16701030731201, "train/tau_std": 3.4074134528636932 }, { "epoch": 0.3793693473478367, "grad_norm": 4.347666263580322, "learning_rate": 4.6024674434544215e-06, "loss": 0.5067, "step": 194, "train/lambda_m_mean": 0.048388672061264515, "train/mu_mean": 0.6994314640760422, "train/mu_std": 0.23578637093305588, "train/rewards_chosen_mean": 0.5222845077514648, "train/rewards_rejected_mean": -0.8947978019714355, "train/tau_mean": 26.215844869613647, "train/tau_std": 3.230974406003952 }, { "epoch": 0.38132485944756783, "grad_norm": 4.155826091766357, "learning_rate": 4.599040438656615e-06, "loss": 0.4643, "step": 195, "train/lambda_m_mean": 0.04130859486758709, "train/mu_mean": 0.7090180143713951, "train/mu_std": 0.21955249458551407, "train/rewards_chosen_mean": 0.6928396224975586, "train/rewards_rejected_mean": -0.7083015441894531, "train/tau_mean": 26.19582438468933, "train/tau_std": 3.0268510580062866 }, { "epoch": 0.38328037154729894, "grad_norm": 4.102189540863037, "learning_rate": 4.595613433858808e-06, "loss": 0.5643, "step": 196, "train/lambda_m_mean": 0.047363282181322575, "train/mu_mean": 0.6429995894432068, "train/mu_std": 0.20651816949248314, "train/rewards_chosen_mean": 0.08650577068328857, "train/rewards_rejected_mean": -0.8734283447265625, "train/tau_mean": 25.496127605438232, "train/tau_std": 3.326316177845001 }, { "epoch": 0.38523588364703004, "grad_norm": 4.4853363037109375, "learning_rate": 4.592186429061001e-06, "loss": 0.5313, "step": 197, "train/lambda_m_mean": 0.04536132887005806, "train/mu_mean": 0.6681367233395576, "train/mu_std": 0.21727186627686024, "train/rewards_chosen_mean": 0.23483800888061523, "train/rewards_rejected_mean": -0.8794336318969727, "train/tau_mean": 25.406779766082764, "train/tau_std": 3.500446707010269 }, { "epoch": 0.3871913957467612, "grad_norm": 6.067953109741211, "learning_rate": 4.5887594242631945e-06, "loss": 0.4435, "step": 198, "train/lambda_m_mean": 0.03994140727445483, "train/mu_mean": 0.7287672013044357, "train/mu_std": 0.21321318298578262, "train/rewards_chosen_mean": 0.19407367706298828, "train/rewards_rejected_mean": -1.4167098999023438, "train/tau_mean": 25.980491161346436, "train/tau_std": 2.998370796442032 }, { "epoch": 0.3891469078464923, "grad_norm": 5.502928256988525, "learning_rate": 4.585332419465388e-06, "loss": 0.5136, "step": 199, "train/lambda_m_mean": 0.04418945359066129, "train/mu_mean": 0.6898880824446678, "train/mu_std": 0.23693865351378918, "train/rewards_chosen_mean": -0.4099769592285156, "train/rewards_rejected_mean": -1.7147235870361328, "train/tau_mean": 25.834996461868286, "train/tau_std": 3.3786391764879227 }, { "epoch": 0.3911024199462234, "grad_norm": 5.54163932800293, "learning_rate": 4.581905414667581e-06, "loss": 0.4797, "step": 200, "train/lambda_m_mean": 0.04379882896319032, "train/mu_mean": 0.6970921978354454, "train/mu_std": 0.21134730242192745, "train/rewards_chosen_mean": -0.3426203727722168, "train/rewards_rejected_mean": -1.6324539184570312, "train/tau_mean": 25.907254219055176, "train/tau_std": 3.277559995651245 }, { "epoch": 0.3930579320459545, "grad_norm": 4.418539524078369, "learning_rate": 4.578478409869774e-06, "loss": 0.4419, "step": 201, "train/lambda_m_mean": 0.03837890736758709, "train/mu_mean": 0.7242706790566444, "train/mu_std": 0.21896855160593987, "train/rewards_chosen_mean": -0.15001869201660156, "train/rewards_rejected_mean": -1.674713134765625, "train/tau_mean": 26.405312299728394, "train/tau_std": 3.945722222328186 }, { "epoch": 0.3950134441456857, "grad_norm": 6.934731960296631, "learning_rate": 4.5750514050719676e-06, "loss": 0.5208, "step": 202, "train/lambda_m_mean": 0.04575195396319032, "train/mu_mean": 0.6978574693202972, "train/mu_std": 0.24137023463845253, "train/rewards_chosen_mean": -0.3169832229614258, "train/rewards_rejected_mean": -1.7207374572753906, "train/tau_mean": 26.474154233932495, "train/tau_std": 3.9550038278102875 }, { "epoch": 0.3969689562454168, "grad_norm": 5.087118148803711, "learning_rate": 4.571624400274161e-06, "loss": 0.463, "step": 203, "train/lambda_m_mean": 0.042285157134756446, "train/mu_mean": 0.7246439903974533, "train/mu_std": 0.23877288773655891, "train/rewards_chosen_mean": -0.32915592193603516, "train/rewards_rejected_mean": -1.8981389999389648, "train/tau_mean": 26.78811025619507, "train/tau_std": 3.5354280173778534 }, { "epoch": 0.3989244683451479, "grad_norm": 5.373819828033447, "learning_rate": 4.568197395476354e-06, "loss": 0.4337, "step": 204, "train/lambda_m_mean": 0.04628906329162419, "train/mu_mean": 0.7406701073050499, "train/mu_std": 0.21401755791157484, "train/rewards_chosen_mean": -0.5043182373046875, "train/rewards_rejected_mean": -2.0903987884521484, "train/tau_mean": 25.984412908554077, "train/tau_std": 3.7696470618247986 }, { "epoch": 0.400879980444879, "grad_norm": 5.851926326751709, "learning_rate": 4.564770390678547e-06, "loss": 0.4892, "step": 205, "train/lambda_m_mean": 0.04106445307843387, "train/mu_mean": 0.6955394446849823, "train/mu_std": 0.22226860001683235, "train/rewards_chosen_mean": -0.15546321868896484, "train/rewards_rejected_mean": -1.475494384765625, "train/tau_mean": 25.918489694595337, "train/tau_std": 3.7495423555374146 }, { "epoch": 0.4028354925446101, "grad_norm": 5.955161094665527, "learning_rate": 4.561343385880741e-06, "loss": 0.4619, "step": 206, "train/lambda_m_mean": 0.04477539146319032, "train/mu_mean": 0.7184754088521004, "train/mu_std": 0.21960615552961826, "train/rewards_chosen_mean": 0.1877422332763672, "train/rewards_rejected_mean": -1.2527599334716797, "train/tau_mean": 25.212211847305298, "train/tau_std": 4.048062682151794 }, { "epoch": 0.40479100464434126, "grad_norm": 5.985856533050537, "learning_rate": 4.557916381082934e-06, "loss": 0.4833, "step": 207, "train/lambda_m_mean": 0.050585937686264515, "train/mu_mean": 0.7054828554391861, "train/mu_std": 0.2127443291246891, "train/rewards_chosen_mean": 0.2272457480430603, "train/rewards_rejected_mean": -1.0516777038574219, "train/tau_mean": 24.297685146331787, "train/tau_std": 3.928411692380905 }, { "epoch": 0.40674651674407236, "grad_norm": 5.686359882354736, "learning_rate": 4.554489376285127e-06, "loss": 0.4927, "step": 208, "train/lambda_m_mean": 0.044531251303851604, "train/mu_mean": 0.7044408693909645, "train/mu_std": 0.2306167036294937, "train/rewards_chosen_mean": 0.3568439483642578, "train/rewards_rejected_mean": -1.0376251935958862, "train/tau_mean": 23.589203596115112, "train/tau_std": 3.515152543783188 }, { "epoch": 0.40870202884380347, "grad_norm": 4.9577155113220215, "learning_rate": 4.55106237148732e-06, "loss": 0.5054, "step": 209, "train/lambda_m_mean": 0.040429688757285476, "train/mu_mean": 0.6804002001881599, "train/mu_std": 0.21731316857039928, "train/rewards_chosen_mean": 0.30510270595550537, "train/rewards_rejected_mean": -0.7891731262207031, "train/tau_mean": 23.53329110145569, "train/tau_std": 3.4189796149730682 }, { "epoch": 0.41065754094353457, "grad_norm": 4.830063819885254, "learning_rate": 4.547635366689514e-06, "loss": 0.4872, "step": 210, "train/lambda_m_mean": 0.04257812537252903, "train/mu_mean": 0.692152313888073, "train/mu_std": 0.21602053567767143, "train/rewards_chosen_mean": 0.07470571994781494, "train/rewards_rejected_mean": -1.1690597534179688, "train/tau_mean": 22.39052414894104, "train/tau_std": 3.0116081833839417 }, { "epoch": 0.41261305304326573, "grad_norm": 4.863333702087402, "learning_rate": 4.544208361891707e-06, "loss": 0.5353, "step": 211, "train/lambda_m_mean": 0.04052734561264515, "train/mu_mean": 0.6651906594634056, "train/mu_std": 0.22143908590078354, "train/rewards_chosen_mean": -0.0131072998046875, "train/rewards_rejected_mean": -1.1114826202392578, "train/tau_mean": 22.883171796798706, "train/tau_std": 3.2562261819839478 }, { "epoch": 0.41456856514299684, "grad_norm": 6.4995036125183105, "learning_rate": 4.5407813570939e-06, "loss": 0.5424, "step": 212, "train/lambda_m_mean": 0.04013671958819032, "train/mu_mean": 0.6633161082863808, "train/mu_std": 0.22457788698375225, "train/rewards_chosen_mean": -0.01996135711669922, "train/rewards_rejected_mean": -1.1710867881774902, "train/tau_mean": 23.667588710784912, "train/tau_std": 4.001411288976669 }, { "epoch": 0.41652407724272794, "grad_norm": 5.2705583572387695, "learning_rate": 4.5373543522960935e-06, "loss": 0.4105, "step": 213, "train/lambda_m_mean": 0.04482421884313226, "train/mu_mean": 0.7500946745276451, "train/mu_std": 0.2200093325227499, "train/rewards_chosen_mean": -0.018758296966552734, "train/rewards_rejected_mean": -1.8138036727905273, "train/tau_mean": 23.239863872528076, "train/tau_std": 3.731255739927292 }, { "epoch": 0.41847958934245905, "grad_norm": 8.033626556396484, "learning_rate": 4.533927347498287e-06, "loss": 0.5094, "step": 214, "train/lambda_m_mean": 0.04555664258077741, "train/mu_mean": 0.6963394433259964, "train/mu_std": 0.22968903928995132, "train/rewards_chosen_mean": -0.1854724884033203, "train/rewards_rejected_mean": -1.4969573020935059, "train/tau_mean": 24.364575147628784, "train/tau_std": 3.792660713195801 }, { "epoch": 0.42043510144219015, "grad_norm": 5.245666980743408, "learning_rate": 4.53050034270048e-06, "loss": 0.5003, "step": 215, "train/lambda_m_mean": 0.04565429827198386, "train/mu_mean": 0.7028246894478798, "train/mu_std": 0.2285747118294239, "train/rewards_chosen_mean": -0.21773147583007812, "train/rewards_rejected_mean": -1.6529598236083984, "train/tau_mean": 23.840677976608276, "train/tau_std": 4.151129454374313 }, { "epoch": 0.4223906135419213, "grad_norm": 5.021378040313721, "learning_rate": 4.527073337902673e-06, "loss": 0.4774, "step": 216, "train/lambda_m_mean": 0.0449707037769258, "train/mu_mean": 0.7143959850072861, "train/mu_std": 0.22939271107316017, "train/rewards_chosen_mean": -0.6449222564697266, "train/rewards_rejected_mean": -2.1251220703125, "train/tau_mean": 24.359707832336426, "train/tau_std": 4.128442138433456 }, { "epoch": 0.4243461256416524, "grad_norm": 5.065217018127441, "learning_rate": 4.5236463331048665e-06, "loss": 0.5042, "step": 217, "train/lambda_m_mean": 0.04565429827198386, "train/mu_mean": 0.692287415266037, "train/mu_std": 0.21802975609898567, "train/rewards_chosen_mean": -0.36150407791137695, "train/rewards_rejected_mean": -1.6920318603515625, "train/tau_mean": 24.236663579940796, "train/tau_std": 4.266337752342224 }, { "epoch": 0.4263016377413835, "grad_norm": 8.361032485961914, "learning_rate": 4.52021932830706e-06, "loss": 0.5183, "step": 218, "train/lambda_m_mean": 0.04257812537252903, "train/mu_mean": 0.6927370503544807, "train/mu_std": 0.21814381331205368, "train/rewards_chosen_mean": -0.2156963348388672, "train/rewards_rejected_mean": -1.5062108039855957, "train/tau_mean": 24.802807569503784, "train/tau_std": 4.263110548257828 }, { "epoch": 0.4282571498411146, "grad_norm": 6.157194137573242, "learning_rate": 4.516792323509253e-06, "loss": 0.5615, "step": 219, "train/lambda_m_mean": 0.044531250605359674, "train/mu_mean": 0.6544767469167709, "train/mu_std": 0.21879151836037636, "train/rewards_chosen_mean": -0.4162445068359375, "train/rewards_rejected_mean": -1.370901107788086, "train/tau_mean": 24.325040578842163, "train/tau_std": 3.992752492427826 }, { "epoch": 0.4302126619408458, "grad_norm": 4.1716084480285645, "learning_rate": 4.513365318711447e-06, "loss": 0.4776, "step": 220, "train/lambda_m_mean": 0.04443359514698386, "train/mu_mean": 0.6861350536346436, "train/mu_std": 0.19293856993317604, "train/rewards_chosen_mean": -0.4931807518005371, "train/rewards_rejected_mean": -1.5428924560546875, "train/tau_mean": 23.799657344818115, "train/tau_std": 3.906118094921112 }, { "epoch": 0.4321681740405769, "grad_norm": 3.9541211128234863, "learning_rate": 4.5099383139136396e-06, "loss": 0.5208, "step": 221, "train/lambda_m_mean": 0.0414550791028887, "train/mu_mean": 0.6543739661574364, "train/mu_std": 0.18418716359883547, "train/rewards_chosen_mean": -0.6926765441894531, "train/rewards_rejected_mean": -1.5781946182250977, "train/tau_mean": 24.266437768936157, "train/tau_std": 3.6456915736198425 }, { "epoch": 0.434123686140308, "grad_norm": 3.4052956104278564, "learning_rate": 4.506511309115833e-06, "loss": 0.4838, "step": 222, "train/lambda_m_mean": 0.045214843936264515, "train/mu_mean": 0.694826751947403, "train/mu_std": 0.21705754101276398, "train/rewards_chosen_mean": -0.6725387573242188, "train/rewards_rejected_mean": -1.9204168319702148, "train/tau_mean": 24.71149468421936, "train/tau_std": 3.9400371313095093 }, { "epoch": 0.4360791982400391, "grad_norm": 4.939428806304932, "learning_rate": 4.503084304318027e-06, "loss": 0.4815, "step": 223, "train/lambda_m_mean": 0.04799804789945483, "train/mu_mean": 0.7082423865795135, "train/mu_std": 0.21462086774408817, "train/rewards_chosen_mean": -0.4982571005821228, "train/rewards_rejected_mean": -1.9037818908691406, "train/tau_mean": 25.7847158908844, "train/tau_std": 4.648459285497665 }, { "epoch": 0.4380347103397702, "grad_norm": 5.128222465515137, "learning_rate": 4.49965729952022e-06, "loss": 0.4693, "step": 224, "train/lambda_m_mean": 0.04228515760041773, "train/mu_mean": 0.7146727964282036, "train/mu_std": 0.21979346126317978, "train/rewards_chosen_mean": -0.6391000747680664, "train/rewards_rejected_mean": -2.1726484298706055, "train/tau_mean": 27.704631567001343, "train/tau_std": 4.909660279750824 }, { "epoch": 0.43999022243950137, "grad_norm": 8.491108894348145, "learning_rate": 4.496230294722413e-06, "loss": 0.5289, "step": 225, "train/lambda_m_mean": 0.04516601748764515, "train/mu_mean": 0.711324580013752, "train/mu_std": 0.2520925607532263, "train/rewards_chosen_mean": -0.7908210754394531, "train/rewards_rejected_mean": -2.4561843872070312, "train/tau_mean": 28.04738998413086, "train/tau_std": 5.0243494510650635 }, { "epoch": 0.44194573453923247, "grad_norm": 30.173460006713867, "learning_rate": 4.492803289924606e-06, "loss": 0.5957, "step": 226, "train/lambda_m_mean": 0.040917969308793545, "train/mu_mean": 0.7035473808646202, "train/mu_std": 0.25819525495171547, "train/rewards_chosen_mean": -0.8384838104248047, "train/rewards_rejected_mean": -2.5109786987304688, "train/tau_mean": 29.82960557937622, "train/tau_std": 5.163435935974121 }, { "epoch": 0.4439012466389636, "grad_norm": 5.168795108795166, "learning_rate": 4.4893762851268e-06, "loss": 0.4704, "step": 227, "train/lambda_m_mean": 0.049121095798909664, "train/mu_mean": 0.7284074947237968, "train/mu_std": 0.234023779630661, "train/rewards_chosen_mean": -0.9144468307495117, "train/rewards_rejected_mean": -2.5167646408081055, "train/tau_mean": 28.401885271072388, "train/tau_std": 5.220977365970612 }, { "epoch": 0.4458567587386947, "grad_norm": 6.491413593292236, "learning_rate": 4.485949280328992e-06, "loss": 0.4889, "step": 228, "train/lambda_m_mean": 0.0421386722009629, "train/mu_mean": 0.7185509577393532, "train/mu_std": 0.23466470651328564, "train/rewards_chosen_mean": -0.9730854034423828, "train/rewards_rejected_mean": -2.526153564453125, "train/tau_mean": 29.65773057937622, "train/tau_std": 4.6361793875694275 }, { "epoch": 0.4478122708384258, "grad_norm": 5.265679836273193, "learning_rate": 4.482522275531186e-06, "loss": 0.4671, "step": 229, "train/lambda_m_mean": 0.0422851569019258, "train/mu_mean": 0.705484963953495, "train/mu_std": 0.20681489072740078, "train/rewards_chosen_mean": -0.5579776763916016, "train/rewards_rejected_mean": -1.8791961669921875, "train/tau_mean": 28.821801900863647, "train/tau_std": 4.64475080370903 }, { "epoch": 0.44976778293815695, "grad_norm": 5.219050407409668, "learning_rate": 4.47909527073338e-06, "loss": 0.478, "step": 230, "train/lambda_m_mean": 0.04433593899011612, "train/mu_mean": 0.7028394415974617, "train/mu_std": 0.2176648285239935, "train/rewards_chosen_mean": -0.5616874694824219, "train/rewards_rejected_mean": -1.8412742614746094, "train/tau_mean": 28.842796087265015, "train/tau_std": 5.4313987493515015 }, { "epoch": 0.45172329503788805, "grad_norm": 4.006079196929932, "learning_rate": 4.475668265935573e-06, "loss": 0.4852, "step": 231, "train/lambda_m_mean": 0.04965820349752903, "train/mu_mean": 0.7001035436987877, "train/mu_std": 0.22118423506617546, "train/rewards_chosen_mean": -0.3725013732910156, "train/rewards_rejected_mean": -1.6459155082702637, "train/tau_mean": 27.748071670532227, "train/tau_std": 4.843711733818054 }, { "epoch": 0.45367880713761916, "grad_norm": 5.666781425476074, "learning_rate": 4.4722412611377655e-06, "loss": 0.4587, "step": 232, "train/lambda_m_mean": 0.040722658624872565, "train/mu_mean": 0.705392099916935, "train/mu_std": 0.19438090175390244, "train/rewards_chosen_mean": -0.24765539169311523, "train/rewards_rejected_mean": -1.4933302402496338, "train/tau_mean": 29.58693289756775, "train/tau_std": 4.96659791469574 }, { "epoch": 0.45563431923735026, "grad_norm": 5.625906467437744, "learning_rate": 4.4688142563399596e-06, "loss": 0.425, "step": 233, "train/lambda_m_mean": 0.04155273479409516, "train/mu_mean": 0.7407201156020164, "train/mu_std": 0.22075261920690536, "train/rewards_chosen_mean": -0.3014564514160156, "train/rewards_rejected_mean": -1.8569984436035156, "train/tau_mean": 30.13331651687622, "train/tau_std": 5.061522305011749 }, { "epoch": 0.4575898313370814, "grad_norm": 8.587593078613281, "learning_rate": 4.465387251542153e-06, "loss": 0.4934, "step": 234, "train/lambda_m_mean": 0.047216798178851604, "train/mu_mean": 0.7166515365242958, "train/mu_std": 0.22887198440730572, "train/rewards_chosen_mean": -0.49826300144195557, "train/rewards_rejected_mean": -1.9588782787322998, "train/tau_mean": 30.198742389678955, "train/tau_std": 5.525870323181152 }, { "epoch": 0.4595453434368125, "grad_norm": 7.739892482757568, "learning_rate": 4.461960246744346e-06, "loss": 0.4796, "step": 235, "train/lambda_m_mean": 0.047070314176380634, "train/mu_mean": 0.7229553088545799, "train/mu_std": 0.23893493972718716, "train/rewards_chosen_mean": -0.15334129333496094, "train/rewards_rejected_mean": -1.6773910522460938, "train/tau_mean": 30.181166172027588, "train/tau_std": 6.025844097137451 }, { "epoch": 0.46150085553654363, "grad_norm": 8.08480453491211, "learning_rate": 4.458533241946539e-06, "loss": 0.47, "step": 236, "train/lambda_m_mean": 0.04624023614451289, "train/mu_mean": 0.736327551305294, "train/mu_std": 0.22981903702020645, "train/rewards_chosen_mean": -0.324993371963501, "train/rewards_rejected_mean": -1.9767773151397705, "train/tau_mean": 30.42091417312622, "train/tau_std": 5.996791750192642 }, { "epoch": 0.46345636763627474, "grad_norm": 6.715762138366699, "learning_rate": 4.455106237148733e-06, "loss": 0.3809, "step": 237, "train/lambda_m_mean": 0.04238281259313226, "train/mu_mean": 0.7836352363228798, "train/mu_std": 0.2083203848451376, "train/rewards_chosen_mean": -0.46768760681152344, "train/rewards_rejected_mean": -2.3563528060913086, "train/tau_mean": 30.60743737220764, "train/tau_std": 5.60025030374527 }, { "epoch": 0.46541187973600584, "grad_norm": 6.061365604400635, "learning_rate": 4.451679232350926e-06, "loss": 0.4428, "step": 238, "train/lambda_m_mean": 0.04360351664945483, "train/mu_mean": 0.7214186862111092, "train/mu_std": 0.20495163090527058, "train/rewards_chosen_mean": -0.4765758514404297, "train/rewards_rejected_mean": -1.9147834777832031, "train/tau_mean": 30.201676607131958, "train/tau_std": 5.786230504512787 }, { "epoch": 0.467367391835737, "grad_norm": 6.218095302581787, "learning_rate": 4.448252227553118e-06, "loss": 0.4683, "step": 239, "train/lambda_m_mean": 0.041259766556322575, "train/mu_mean": 0.7199559137225151, "train/mu_std": 0.21726946718990803, "train/rewards_chosen_mean": -0.2625770568847656, "train/rewards_rejected_mean": -1.6876449584960938, "train/tau_mean": 30.89699077606201, "train/tau_std": 5.667397916316986 }, { "epoch": 0.4693229039354681, "grad_norm": 5.834134578704834, "learning_rate": 4.444825222755312e-06, "loss": 0.4459, "step": 240, "train/lambda_m_mean": 0.044287110678851604, "train/mu_mean": 0.7237103208899498, "train/mu_std": 0.20936212316155434, "train/rewards_chosen_mean": -0.46453094482421875, "train/rewards_rejected_mean": -1.8323860168457031, "train/tau_mean": 29.980491399765015, "train/tau_std": 6.332732319831848 }, { "epoch": 0.4712784160351992, "grad_norm": 5.255515098571777, "learning_rate": 4.441398217957506e-06, "loss": 0.4249, "step": 241, "train/lambda_m_mean": 0.03789062635041773, "train/mu_mean": 0.735970988869667, "train/mu_std": 0.22138473205268383, "train/rewards_chosen_mean": -0.1919078826904297, "train/rewards_rejected_mean": -1.7058429718017578, "train/tau_mean": 30.427268028259277, "train/tau_std": 5.932991683483124 }, { "epoch": 0.4732339281349303, "grad_norm": 5.602799892425537, "learning_rate": 4.437971213159699e-06, "loss": 0.4499, "step": 242, "train/lambda_m_mean": 0.04218750074505806, "train/mu_mean": 0.7282037883996964, "train/mu_std": 0.22413733042776585, "train/rewards_chosen_mean": -0.0825510025024414, "train/rewards_rejected_mean": -1.6137667894363403, "train/tau_mean": 30.362806797027588, "train/tau_std": 5.809271991252899 }, { "epoch": 0.4751894402346615, "grad_norm": 4.744879245758057, "learning_rate": 4.434544208361892e-06, "loss": 0.3995, "step": 243, "train/lambda_m_mean": 0.04848633008077741, "train/mu_mean": 0.7560381963849068, "train/mu_std": 0.19422081671655178, "train/rewards_chosen_mean": 0.22248601913452148, "train/rewards_rejected_mean": -1.4326813220977783, "train/tau_mean": 29.320824146270752, "train/tau_std": 5.903051674365997 }, { "epoch": 0.4771449523343926, "grad_norm": 6.187487602233887, "learning_rate": 4.4311172035640855e-06, "loss": 0.4004, "step": 244, "train/lambda_m_mean": 0.043212891556322575, "train/mu_mean": 0.7719411179423332, "train/mu_std": 0.21277941949665546, "train/rewards_chosen_mean": 0.6047192811965942, "train/rewards_rejected_mean": -1.2814311981201172, "train/tau_mean": 30.1508948802948, "train/tau_std": 6.044589042663574 }, { "epoch": 0.4791004644341237, "grad_norm": 5.81087589263916, "learning_rate": 4.427690198766279e-06, "loss": 0.4233, "step": 245, "train/lambda_m_mean": 0.047753906808793545, "train/mu_mean": 0.7558844015002251, "train/mu_std": 0.21847889386117458, "train/rewards_chosen_mean": 0.3495206832885742, "train/rewards_rejected_mean": -1.4383220672607422, "train/tau_mean": 29.817420721054077, "train/tau_std": 6.503582000732422 }, { "epoch": 0.4810559765338548, "grad_norm": 14.15267562866211, "learning_rate": 4.424263193968472e-06, "loss": 0.5526, "step": 246, "train/lambda_m_mean": 0.04414062574505806, "train/mu_mean": 0.7220788151025772, "train/mu_std": 0.25387347862124443, "train/rewards_chosen_mean": 0.19212913513183594, "train/rewards_rejected_mean": -1.428999900817871, "train/tau_mean": 30.264670610427856, "train/tau_std": 6.752505004405975 }, { "epoch": 0.4830114886335859, "grad_norm": 8.901935577392578, "learning_rate": 4.420836189170665e-06, "loss": 0.4382, "step": 247, "train/lambda_m_mean": 0.045117189176380634, "train/mu_mean": 0.762278750538826, "train/mu_std": 0.23376190289855003, "train/rewards_chosen_mean": 0.47223150730133057, "train/rewards_rejected_mean": -1.468940258026123, "train/tau_mean": 28.789581060409546, "train/tau_std": 5.832692801952362 }, { "epoch": 0.48496700073331706, "grad_norm": 5.311846733093262, "learning_rate": 4.4174091843728585e-06, "loss": 0.3822, "step": 248, "train/lambda_m_mean": 0.04057617159560323, "train/mu_mean": 0.7637762576341629, "train/mu_std": 0.21227459609508514, "train/rewards_chosen_mean": 0.4354366064071655, "train/rewards_rejected_mean": -1.3164243698120117, "train/tau_mean": 29.513941764831543, "train/tau_std": 5.515278995037079 }, { "epoch": 0.48692251283304816, "grad_norm": 5.524046421051025, "learning_rate": 4.413982179575052e-06, "loss": 0.4559, "step": 249, "train/lambda_m_mean": 0.04169921949505806, "train/mu_mean": 0.7226117700338364, "train/mu_std": 0.2212195135653019, "train/rewards_chosen_mean": 0.5158309936523438, "train/rewards_rejected_mean": -0.9300885796546936, "train/tau_mean": 28.70266556739807, "train/tau_std": 5.618339121341705 }, { "epoch": 0.48887802493277926, "grad_norm": 6.051169395446777, "learning_rate": 4.410555174777245e-06, "loss": 0.4429, "step": 250, "train/lambda_m_mean": 0.04443359561264515, "train/mu_mean": 0.7364515885710716, "train/mu_std": 0.23540376499295235, "train/rewards_chosen_mean": 0.631434440612793, "train/rewards_rejected_mean": -0.8969020843505859, "train/tau_mean": 28.26224970817566, "train/tau_std": 6.204244554042816 }, { "epoch": 0.49083353703251037, "grad_norm": 5.195281505584717, "learning_rate": 4.407128169979438e-06, "loss": 0.4294, "step": 251, "train/lambda_m_mean": 0.04482422024011612, "train/mu_mean": 0.7332597523927689, "train/mu_std": 0.21017743088304996, "train/rewards_chosen_mean": 0.723564624786377, "train/rewards_rejected_mean": -0.8116835504770279, "train/tau_mean": 27.56181263923645, "train/tau_std": 5.840022146701813 }, { "epoch": 0.49278904913224153, "grad_norm": 16.761974334716797, "learning_rate": 4.4037011651816316e-06, "loss": 0.5018, "step": 252, "train/lambda_m_mean": 0.04326171986758709, "train/mu_mean": 0.7334266304969788, "train/mu_std": 0.22859810292720795, "train/rewards_chosen_mean": 0.3915386199951172, "train/rewards_rejected_mean": -1.1079152822494507, "train/tau_mean": 27.747366666793823, "train/tau_std": 5.710576772689819 }, { "epoch": 0.49474456123197263, "grad_norm": 4.616611003875732, "learning_rate": 4.400274160383825e-06, "loss": 0.402, "step": 253, "train/lambda_m_mean": 0.04902343871071935, "train/mu_mean": 0.756364494562149, "train/mu_std": 0.20640752837061882, "train/rewards_chosen_mean": 0.6683473587036133, "train/rewards_rejected_mean": -0.8975648880004883, "train/tau_mean": 25.98980951309204, "train/tau_std": 5.2415578961372375 }, { "epoch": 0.49670007333170374, "grad_norm": 6.141413688659668, "learning_rate": 4.396847155586018e-06, "loss": 0.4265, "step": 254, "train/lambda_m_mean": 0.04453125107102096, "train/mu_mean": 0.7357022166252136, "train/mu_std": 0.205985389649868, "train/rewards_chosen_mean": 0.4639925956726074, "train/rewards_rejected_mean": -0.9912843704223633, "train/tau_mean": 26.156083583831787, "train/tau_std": 5.1656559109687805 }, { "epoch": 0.49865558543143484, "grad_norm": 5.712916851043701, "learning_rate": 4.393420150788211e-06, "loss": 0.4242, "step": 255, "train/lambda_m_mean": 0.04272460984066129, "train/mu_mean": 0.7370390072464943, "train/mu_std": 0.2053385954350233, "train/rewards_chosen_mean": 0.4100642204284668, "train/rewards_rejected_mean": -1.0864465236663818, "train/tau_mean": 27.210254907608032, "train/tau_std": 5.360902309417725 }, { "epoch": 0.500611097531166, "grad_norm": 7.349051475524902, "learning_rate": 4.389993145990405e-06, "loss": 0.4379, "step": 256, "train/lambda_m_mean": 0.04462890746071935, "train/mu_mean": 0.7378807440400124, "train/mu_std": 0.22112156637012959, "train/rewards_chosen_mean": 0.09430670738220215, "train/rewards_rejected_mean": -1.531661033630371, "train/tau_mean": 27.515692234039307, "train/tau_std": 5.851078510284424 }, { "epoch": 0.502566609630897, "grad_norm": 8.984638214111328, "learning_rate": 4.386566141192598e-06, "loss": 0.4678, "step": 257, "train/lambda_m_mean": 0.0395019541028887, "train/mu_mean": 0.7540078088641167, "train/mu_std": 0.25193779543042183, "train/rewards_chosen_mean": 0.20731735229492188, "train/rewards_rejected_mean": -1.6613597869873047, "train/tau_mean": 28.82816791534424, "train/tau_std": 6.146906733512878 }, { "epoch": 0.5045221217306282, "grad_norm": 6.733091831207275, "learning_rate": 4.383139136394791e-06, "loss": 0.3568, "step": 258, "train/lambda_m_mean": 0.04082031361758709, "train/mu_mean": 0.7925853952765465, "train/mu_std": 0.20020144898444414, "train/rewards_chosen_mean": -0.019969940185546875, "train/rewards_rejected_mean": -2.0932979583740234, "train/tau_mean": 28.858426094055176, "train/tau_std": 5.815879464149475 }, { "epoch": 0.5064776338303594, "grad_norm": 6.788168907165527, "learning_rate": 4.379712131596984e-06, "loss": 0.4512, "step": 259, "train/lambda_m_mean": 0.04882812546566129, "train/mu_mean": 0.7455856949090958, "train/mu_std": 0.23419118486344814, "train/rewards_chosen_mean": -0.1556844711303711, "train/rewards_rejected_mean": -1.9008102416992188, "train/tau_mean": 27.277170658111572, "train/tau_std": 5.966843545436859 }, { "epoch": 0.5084331459300905, "grad_norm": 6.841333866119385, "learning_rate": 4.376285126799178e-06, "loss": 0.4635, "step": 260, "train/lambda_m_mean": 0.04208984458819032, "train/mu_mean": 0.7158076837658882, "train/mu_std": 0.22510316781699657, "train/rewards_chosen_mean": -0.8103065490722656, "train/rewards_rejected_mean": -2.2802019119262695, "train/tau_mean": 28.517377138137817, "train/tau_std": 6.165718078613281 }, { "epoch": 0.5103886580298216, "grad_norm": 7.29754638671875, "learning_rate": 4.372858122001371e-06, "loss": 0.4559, "step": 261, "train/lambda_m_mean": 0.04750976664945483, "train/mu_mean": 0.7369512394070625, "train/mu_std": 0.2367011420428753, "train/rewards_chosen_mean": -0.8879451751708984, "train/rewards_rejected_mean": -2.5298728942871094, "train/tau_mean": 27.133354663848877, "train/tau_std": 5.644711971282959 }, { "epoch": 0.5123441701295527, "grad_norm": 6.158336162567139, "learning_rate": 4.369431117203564e-06, "loss": 0.4536, "step": 262, "train/lambda_m_mean": 0.0440917972009629, "train/mu_mean": 0.7241365909576416, "train/mu_std": 0.22037671692669392, "train/rewards_chosen_mean": -1.0897178649902344, "train/rewards_rejected_mean": -2.64605712890625, "train/tau_mean": 28.26177477836609, "train/tau_std": 6.424594581127167 }, { "epoch": 0.5142996822292838, "grad_norm": 6.065747261047363, "learning_rate": 4.3660041124057575e-06, "loss": 0.4405, "step": 263, "train/lambda_m_mean": 0.05000000027939677, "train/mu_mean": 0.7452285513281822, "train/mu_std": 0.2217242605984211, "train/rewards_chosen_mean": -0.9825897216796875, "train/rewards_rejected_mean": -2.5431747436523438, "train/tau_mean": 26.46369695663452, "train/tau_std": 5.833741545677185 }, { "epoch": 0.5162551943290149, "grad_norm": 6.26869010925293, "learning_rate": 4.362577107607951e-06, "loss": 0.4079, "step": 264, "train/lambda_m_mean": 0.04072265699505806, "train/mu_mean": 0.7371132522821426, "train/mu_std": 0.20054891705513, "train/rewards_chosen_mean": -1.0231475830078125, "train/rewards_rejected_mean": -2.451080322265625, "train/tau_mean": 27.877986192703247, "train/tau_std": 5.833207488059998 }, { "epoch": 0.518210706428746, "grad_norm": 9.166556358337402, "learning_rate": 4.359150102810144e-06, "loss": 0.4308, "step": 265, "train/lambda_m_mean": 0.041015625, "train/mu_mean": 0.7524077743291855, "train/mu_std": 0.2280119750648737, "train/rewards_chosen_mean": -0.5077075958251953, "train/rewards_rejected_mean": -2.1358556747436523, "train/tau_mean": 28.337929010391235, "train/tau_std": 6.067145466804504 }, { "epoch": 0.5201662185284771, "grad_norm": 9.462897300720215, "learning_rate": 4.355723098012338e-06, "loss": 0.4713, "step": 266, "train/lambda_m_mean": 0.041259765857830644, "train/mu_mean": 0.7420961782336235, "train/mu_std": 0.23387214913964272, "train/rewards_chosen_mean": -0.3138141632080078, "train/rewards_rejected_mean": -1.9199371337890625, "train/tau_mean": 28.78909730911255, "train/tau_std": 6.2958380579948425 }, { "epoch": 0.5221217306282082, "grad_norm": 6.1783671379089355, "learning_rate": 4.3522960932145305e-06, "loss": 0.4032, "step": 267, "train/lambda_m_mean": 0.0461914069019258, "train/mu_mean": 0.7508198842406273, "train/mu_std": 0.20642411522567272, "train/rewards_chosen_mean": -0.23871994018554688, "train/rewards_rejected_mean": -1.8469581604003906, "train/tau_mean": 27.55596423149109, "train/tau_std": 6.37752765417099 }, { "epoch": 0.5240772427279394, "grad_norm": 6.599634170532227, "learning_rate": 4.348869088416724e-06, "loss": 0.4499, "step": 268, "train/lambda_m_mean": 0.0422851569019258, "train/mu_mean": 0.7382588610053062, "train/mu_std": 0.23869690857827663, "train/rewards_chosen_mean": -0.26471424102783203, "train/rewards_rejected_mean": -1.8638916015625, "train/tau_mean": 29.055710554122925, "train/tau_std": 6.917707741260529 }, { "epoch": 0.5260327548276705, "grad_norm": 7.387856483459473, "learning_rate": 4.345442083618918e-06, "loss": 0.4297, "step": 269, "train/lambda_m_mean": 0.04477539099752903, "train/mu_mean": 0.7547021135687828, "train/mu_std": 0.22532370500266552, "train/rewards_chosen_mean": -0.28115415573120117, "train/rewards_rejected_mean": -1.9993209838867188, "train/tau_mean": 28.543760299682617, "train/tau_std": 6.607559084892273 }, { "epoch": 0.5279882669274016, "grad_norm": 7.765687942504883, "learning_rate": 4.342015078821111e-06, "loss": 0.4374, "step": 270, "train/lambda_m_mean": 0.04721679771319032, "train/mu_mean": 0.7517466098070145, "train/mu_std": 0.22965611703693867, "train/rewards_chosen_mean": -0.4254570007324219, "train/rewards_rejected_mean": -2.0901947021484375, "train/tau_mean": 28.59747886657715, "train/tau_std": 7.172805190086365 }, { "epoch": 0.5299437790271327, "grad_norm": 8.250661849975586, "learning_rate": 4.3385880740233036e-06, "loss": 0.4358, "step": 271, "train/lambda_m_mean": 0.044335938058793545, "train/mu_mean": 0.7527678087353706, "train/mu_std": 0.2309953309595585, "train/rewards_chosen_mean": -0.32831764221191406, "train/rewards_rejected_mean": -2.061595916748047, "train/tau_mean": 28.695605039596558, "train/tau_std": 6.393873751163483 }, { "epoch": 0.5318992911268638, "grad_norm": 4.253989219665527, "learning_rate": 4.335161069225498e-06, "loss": 0.3911, "step": 272, "train/lambda_m_mean": 0.04340820387005806, "train/mu_mean": 0.763125479221344, "train/mu_std": 0.21134262159466743, "train/rewards_chosen_mean": -0.043956756591796875, "train/rewards_rejected_mean": -1.733205795288086, "train/tau_mean": 28.805957078933716, "train/tau_std": 6.868643403053284 }, { "epoch": 0.533854803226595, "grad_norm": 6.3404083251953125, "learning_rate": 4.331734064427691e-06, "loss": 0.4635, "step": 273, "train/lambda_m_mean": 0.047753906808793545, "train/mu_mean": 0.7236996293067932, "train/mu_std": 0.20639752224087715, "train/rewards_chosen_mean": 0.1688976287841797, "train/rewards_rejected_mean": -1.2043075561523438, "train/tau_mean": 27.947582483291626, "train/tau_std": 6.745997369289398 }, { "epoch": 0.5358103153263261, "grad_norm": 4.335659503936768, "learning_rate": 4.328307059629883e-06, "loss": 0.3509, "step": 274, "train/lambda_m_mean": 0.04008789104409516, "train/mu_mean": 0.7841997146606445, "train/mu_std": 0.20186378248035908, "train/rewards_chosen_mean": 0.360569603741169, "train/rewards_rejected_mean": -1.533803939819336, "train/tau_mean": 28.675593376159668, "train/tau_std": 6.598908841609955 }, { "epoch": 0.5377658274260572, "grad_norm": 5.117324352264404, "learning_rate": 4.324880054832077e-06, "loss": 0.4158, "step": 275, "train/lambda_m_mean": 0.04951172089204192, "train/mu_mean": 0.7549240738153458, "train/mu_std": 0.22429987601935863, "train/rewards_chosen_mean": 0.1737222671508789, "train/rewards_rejected_mean": -1.5019350051879883, "train/tau_mean": 27.61531710624695, "train/tau_std": 7.3141868114471436 }, { "epoch": 0.5397213395257883, "grad_norm": 4.733757495880127, "learning_rate": 4.321453050034271e-06, "loss": 0.3621, "step": 276, "train/lambda_m_mean": 0.039111329009756446, "train/mu_mean": 0.7904274985194206, "train/mu_std": 0.20627489313483238, "train/rewards_chosen_mean": 0.4241905212402344, "train/rewards_rejected_mean": -1.5725760459899902, "train/tau_mean": 29.596738576889038, "train/tau_std": 7.525236368179321 }, { "epoch": 0.5416768516255195, "grad_norm": 7.170386791229248, "learning_rate": 4.318026045236464e-06, "loss": 0.3843, "step": 277, "train/lambda_m_mean": 0.04204101720824838, "train/mu_mean": 0.7765891551971436, "train/mu_std": 0.211967752315104, "train/rewards_chosen_mean": 0.7483177185058594, "train/rewards_rejected_mean": -1.2386102676391602, "train/tau_mean": 29.90462589263916, "train/tau_std": 8.098510563373566 }, { "epoch": 0.5436323637252506, "grad_norm": 9.99436092376709, "learning_rate": 4.314599040438656e-06, "loss": 0.5082, "step": 278, "train/lambda_m_mean": 0.04794921865686774, "train/mu_mean": 0.7273902967572212, "train/mu_std": 0.24707748740911484, "train/rewards_chosen_mean": 0.40753793716430664, "train/rewards_rejected_mean": -1.2512931823730469, "train/tau_mean": 29.99103093147278, "train/tau_std": 7.85931134223938 }, { "epoch": 0.5455878758249817, "grad_norm": 6.0565290451049805, "learning_rate": 4.3111720356408505e-06, "loss": 0.3692, "step": 279, "train/lambda_m_mean": 0.040136720053851604, "train/mu_mean": 0.7855003923177719, "train/mu_std": 0.21548471227288246, "train/rewards_chosen_mean": 0.6569406390190125, "train/rewards_rejected_mean": -1.4612672328948975, "train/tau_mean": 31.199019193649292, "train/tau_std": 7.737323701381683 }, { "epoch": 0.5475433879247128, "grad_norm": 7.578216075897217, "learning_rate": 4.307745030843044e-06, "loss": 0.4308, "step": 280, "train/lambda_m_mean": 0.0479003912769258, "train/mu_mean": 0.7394828274846077, "train/mu_std": 0.2111878376454115, "train/rewards_chosen_mean": 0.13004016876220703, "train/rewards_rejected_mean": -1.528365135192871, "train/tau_mean": 29.25786828994751, "train/tau_std": 7.769271194934845 }, { "epoch": 0.5494989000244439, "grad_norm": 7.148359775543213, "learning_rate": 4.304318026045237e-06, "loss": 0.3605, "step": 281, "train/lambda_m_mean": 0.03979492327198386, "train/mu_mean": 0.7883207872509956, "train/mu_std": 0.19612705893814564, "train/rewards_chosen_mean": 0.34247589111328125, "train/rewards_rejected_mean": -1.6405391693115234, "train/tau_mean": 31.135294675827026, "train/tau_std": 7.689535677433014 }, { "epoch": 0.551454412124175, "grad_norm": 6.330029487609863, "learning_rate": 4.30089102124743e-06, "loss": 0.4456, "step": 282, "train/lambda_m_mean": 0.043115234933793545, "train/mu_mean": 0.7289849668741226, "train/mu_std": 0.2124986257404089, "train/rewards_chosen_mean": -0.018269211053848267, "train/rewards_rejected_mean": -1.4977664947509766, "train/tau_mean": 30.301316499710083, "train/tau_std": 7.214077711105347 }, { "epoch": 0.5534099242239061, "grad_norm": 4.956923484802246, "learning_rate": 4.2974640164496236e-06, "loss": 0.3643, "step": 283, "train/lambda_m_mean": 0.042626953683793545, "train/mu_mean": 0.7860643044114113, "train/mu_std": 0.19944997690618038, "train/rewards_chosen_mean": 0.429740846157074, "train/rewards_rejected_mean": -1.497889518737793, "train/tau_mean": 29.67241883277893, "train/tau_std": 7.745015799999237 }, { "epoch": 0.5553654363236372, "grad_norm": 6.400308609008789, "learning_rate": 4.294037011651817e-06, "loss": 0.4298, "step": 284, "train/lambda_m_mean": 0.04345703241415322, "train/mu_mean": 0.7513935044407845, "train/mu_std": 0.22490961104631424, "train/rewards_chosen_mean": 0.26735877990722656, "train/rewards_rejected_mean": -1.4101266860961914, "train/tau_mean": 30.35945224761963, "train/tau_std": 8.065200865268707 }, { "epoch": 0.5573209484233683, "grad_norm": 35.57832336425781, "learning_rate": 4.29061000685401e-06, "loss": 0.5648, "step": 285, "train/lambda_m_mean": 0.04511718824505806, "train/mu_mean": 0.7260271608829498, "train/mu_std": 0.23982056230306625, "train/rewards_chosen_mean": 0.3138847351074219, "train/rewards_rejected_mean": -1.259359359741211, "train/tau_mean": 30.425344944000244, "train/tau_std": 8.12936794757843 }, { "epoch": 0.5592764605230995, "grad_norm": 5.933576583862305, "learning_rate": 4.287183002056203e-06, "loss": 0.4633, "step": 286, "train/lambda_m_mean": 0.04267578246071935, "train/mu_mean": 0.7068930044770241, "train/mu_std": 0.20653913728892803, "train/rewards_chosen_mean": 0.14633941650390625, "train/rewards_rejected_mean": -1.050504207611084, "train/tau_mean": 28.920000314712524, "train/tau_std": 7.1794668436050415 }, { "epoch": 0.5612319726228306, "grad_norm": 5.0625901222229, "learning_rate": 4.283755997258397e-06, "loss": 0.4274, "step": 287, "train/lambda_m_mean": 0.04277343861758709, "train/mu_mean": 0.7211701646447182, "train/mu_std": 0.1907486990094185, "train/rewards_chosen_mean": 0.0647326111793518, "train/rewards_rejected_mean": -1.2017979621887207, "train/tau_mean": 28.156821966171265, "train/tau_std": 7.14837908744812 }, { "epoch": 0.5631874847225617, "grad_norm": 5.411472320556641, "learning_rate": 4.28032899246059e-06, "loss": 0.4258, "step": 288, "train/lambda_m_mean": 0.04726562695577741, "train/mu_mean": 0.7519188299775124, "train/mu_std": 0.2169828787446022, "train/rewards_chosen_mean": 0.34401655197143555, "train/rewards_rejected_mean": -1.2590429782867432, "train/tau_mean": 27.042585134506226, "train/tau_std": 6.4639981389045715 }, { "epoch": 0.5651429968222929, "grad_norm": 6.870601177215576, "learning_rate": 4.276901987662783e-06, "loss": 0.4192, "step": 289, "train/lambda_m_mean": 0.043066407553851604, "train/mu_mean": 0.7615228220820427, "train/mu_std": 0.22777693159878254, "train/rewards_chosen_mean": 0.4851341247558594, "train/rewards_rejected_mean": -1.2456779479980469, "train/tau_mean": 28.229068279266357, "train/tau_std": 6.451570928096771 }, { "epoch": 0.567098508922024, "grad_norm": 5.99153470993042, "learning_rate": 4.273474982864976e-06, "loss": 0.3922, "step": 290, "train/lambda_m_mean": 0.04316406394354999, "train/mu_mean": 0.7753304019570351, "train/mu_std": 0.22467278316617012, "train/rewards_chosen_mean": 0.6768045425415039, "train/rewards_rejected_mean": -1.1612098217010498, "train/tau_mean": 28.20782732963562, "train/tau_std": 6.994142115116119 }, { "epoch": 0.5690540210217551, "grad_norm": 7.082943916320801, "learning_rate": 4.27004797806717e-06, "loss": 0.4438, "step": 291, "train/lambda_m_mean": 0.04545898595824838, "train/mu_mean": 0.7424382343888283, "train/mu_std": 0.22800949215888977, "train/rewards_chosen_mean": 0.35071176290512085, "train/rewards_rejected_mean": -1.2513017654418945, "train/tau_mean": 29.370194911956787, "train/tau_std": 7.441291511058807 }, { "epoch": 0.5710095331214862, "grad_norm": 9.11310863494873, "learning_rate": 4.266620973269363e-06, "loss": 0.4403, "step": 292, "train/lambda_m_mean": 0.04306640708819032, "train/mu_mean": 0.7519855722784996, "train/mu_std": 0.24250587821006775, "train/rewards_chosen_mean": 0.6618413925170898, "train/rewards_rejected_mean": -1.0593934059143066, "train/tau_mean": 28.55061912536621, "train/tau_std": 6.921593070030212 }, { "epoch": 0.5729650452212173, "grad_norm": 5.491240978240967, "learning_rate": 4.263193968471556e-06, "loss": 0.37, "step": 293, "train/lambda_m_mean": 0.04243164183571935, "train/mu_mean": 0.7812106907367706, "train/mu_std": 0.2129066437482834, "train/rewards_chosen_mean": 0.800755500793457, "train/rewards_rejected_mean": -1.1108150482177734, "train/tau_mean": 28.36775803565979, "train/tau_std": 7.38813978433609 }, { "epoch": 0.5749205573209484, "grad_norm": 5.356720924377441, "learning_rate": 4.2597669636737495e-06, "loss": 0.3971, "step": 294, "train/lambda_m_mean": 0.04599609551951289, "train/mu_mean": 0.7654501274228096, "train/mu_std": 0.21868669800460339, "train/rewards_chosen_mean": 0.49978911876678467, "train/rewards_rejected_mean": -1.2941017150878906, "train/tau_mean": 27.93252468109131, "train/tau_std": 7.186241626739502 }, { "epoch": 0.5768760694206796, "grad_norm": 4.765036582946777, "learning_rate": 4.256339958875943e-06, "loss": 0.3752, "step": 295, "train/lambda_m_mean": 0.0393554694019258, "train/mu_mean": 0.7830226048827171, "train/mu_std": 0.22307436726987362, "train/rewards_chosen_mean": 0.8973636627197266, "train/rewards_rejected_mean": -1.1424083709716797, "train/tau_mean": 28.27546501159668, "train/tau_std": 6.889392375946045 }, { "epoch": 0.5788315815204107, "grad_norm": 5.922994613647461, "learning_rate": 4.252912954078136e-06, "loss": 0.388, "step": 296, "train/lambda_m_mean": 0.04580078274011612, "train/mu_mean": 0.7730091512203217, "train/mu_std": 0.22253376059234142, "train/rewards_chosen_mean": 0.6175291538238525, "train/rewards_rejected_mean": -1.2946114540100098, "train/tau_mean": 27.223010063171387, "train/tau_std": 6.996976852416992 }, { "epoch": 0.5807870936201418, "grad_norm": 5.81480073928833, "learning_rate": 4.249485949280329e-06, "loss": 0.4004, "step": 297, "train/lambda_m_mean": 0.040771485306322575, "train/mu_mean": 0.7529489323496819, "train/mu_std": 0.21117877028882504, "train/rewards_chosen_mean": 0.18976569175720215, "train/rewards_rejected_mean": -1.4672966003417969, "train/tau_mean": 28.181732654571533, "train/tau_std": 7.150680720806122 }, { "epoch": 0.5827426057198729, "grad_norm": 6.050095081329346, "learning_rate": 4.2460589444825225e-06, "loss": 0.3548, "step": 298, "train/lambda_m_mean": 0.04365234449505806, "train/mu_mean": 0.8001967966556549, "train/mu_std": 0.204166017472744, "train/rewards_chosen_mean": 0.22191238403320312, "train/rewards_rejected_mean": -1.8333394527435303, "train/tau_mean": 27.019200086593628, "train/tau_std": 7.281996369361877 }, { "epoch": 0.584698117819604, "grad_norm": 6.092633247375488, "learning_rate": 4.242631939684716e-06, "loss": 0.3662, "step": 299, "train/lambda_m_mean": 0.0430175787769258, "train/mu_mean": 0.7782067060470581, "train/mu_std": 0.20501103810966015, "train/rewards_chosen_mean": 0.18923091888427734, "train/rewards_rejected_mean": -1.7306327819824219, "train/tau_mean": 27.380771160125732, "train/tau_std": 7.570125341415405 }, { "epoch": 0.5866536299193351, "grad_norm": 5.853192329406738, "learning_rate": 4.239204934886909e-06, "loss": 0.4136, "step": 300, "train/lambda_m_mean": 0.04575195396319032, "train/mu_mean": 0.7622772976756096, "train/mu_std": 0.21857184171676636, "train/rewards_chosen_mean": 0.012991666793823242, "train/rewards_rejected_mean": -1.7588696479797363, "train/tau_mean": 26.985267162322998, "train/tau_std": 7.304921984672546 }, { "epoch": 0.5886091420190662, "grad_norm": 5.5234761238098145, "learning_rate": 4.235777930089102e-06, "loss": 0.3581, "step": 301, "train/lambda_m_mean": 0.044531250139698386, "train/mu_mean": 0.7951503843069077, "train/mu_std": 0.20941604115068913, "train/rewards_chosen_mean": -0.026514053344726562, "train/rewards_rejected_mean": -2.164727210998535, "train/tau_mean": 26.987501621246338, "train/tau_std": 7.203462243080139 }, { "epoch": 0.5905646541187973, "grad_norm": 6.329101085662842, "learning_rate": 4.2323509252912956e-06, "loss": 0.4022, "step": 302, "train/lambda_m_mean": 0.04643554845824838, "train/mu_mean": 0.7696058750152588, "train/mu_std": 0.22515124827623367, "train/rewards_chosen_mean": 0.013214111328125, "train/rewards_rejected_mean": -1.899118423461914, "train/tau_mean": 26.676194429397583, "train/tau_std": 7.502895295619965 }, { "epoch": 0.5925201662185284, "grad_norm": 6.018845081329346, "learning_rate": 4.228923920493489e-06, "loss": 0.3985, "step": 303, "train/lambda_m_mean": 0.04501953208819032, "train/mu_mean": 0.7607549279928207, "train/mu_std": 0.21570195443928242, "train/rewards_chosen_mean": 0.12571096420288086, "train/rewards_rejected_mean": -1.6193523406982422, "train/tau_mean": 26.32824420928955, "train/tau_std": 6.6075374484062195 }, { "epoch": 0.5944756783182596, "grad_norm": 7.138452053070068, "learning_rate": 4.225496915695682e-06, "loss": 0.4099, "step": 304, "train/lambda_m_mean": 0.04287109477445483, "train/mu_mean": 0.7582952380180359, "train/mu_std": 0.21866465918719769, "train/rewards_chosen_mean": -0.13624095916748047, "train/rewards_rejected_mean": -1.9708399772644043, "train/tau_mean": 27.919086694717407, "train/tau_std": 7.771985471248627 }, { "epoch": 0.5964311904179908, "grad_norm": 5.9142632484436035, "learning_rate": 4.222069910897875e-06, "loss": 0.3726, "step": 305, "train/lambda_m_mean": 0.04257812676951289, "train/mu_mean": 0.7769686803221703, "train/mu_std": 0.205616838298738, "train/rewards_chosen_mean": -0.30028486251831055, "train/rewards_rejected_mean": -2.1442108154296875, "train/tau_mean": 29.367331743240356, "train/tau_std": 8.436933636665344 }, { "epoch": 0.5983867025177219, "grad_norm": 6.06231164932251, "learning_rate": 4.218642906100069e-06, "loss": 0.3682, "step": 306, "train/lambda_m_mean": 0.043750002048909664, "train/mu_mean": 0.7838400155305862, "train/mu_std": 0.21332641504704952, "train/rewards_chosen_mean": -0.07407331466674805, "train/rewards_rejected_mean": -1.9671854972839355, "train/tau_mean": 29.150070190429688, "train/tau_std": 8.226987659931183 }, { "epoch": 0.600342214617453, "grad_norm": 8.118069648742676, "learning_rate": 4.215215901302262e-06, "loss": 0.4575, "step": 307, "train/lambda_m_mean": 0.049560547806322575, "train/mu_mean": 0.7535210102796555, "train/mu_std": 0.23240177892148495, "train/rewards_chosen_mean": -0.5346450805664062, "train/rewards_rejected_mean": -2.2760543823242188, "train/tau_mean": 28.259894847869873, "train/tau_std": 8.301488876342773 }, { "epoch": 0.6022977267171841, "grad_norm": 4.79331636428833, "learning_rate": 4.211788896504456e-06, "loss": 0.3412, "step": 308, "train/lambda_m_mean": 0.04438476590439677, "train/mu_mean": 0.8024649247527122, "train/mu_std": 0.19561655540019274, "train/rewards_chosen_mean": -0.12965679168701172, "train/rewards_rejected_mean": -2.2890281677246094, "train/tau_mean": 30.011047840118408, "train/tau_std": 7.905382215976715 }, { "epoch": 0.6042532388169152, "grad_norm": 6.0390753746032715, "learning_rate": 4.208361891706648e-06, "loss": 0.3475, "step": 309, "train/lambda_m_mean": 0.03872070414945483, "train/mu_mean": 0.7943525314331055, "train/mu_std": 0.21245216019451618, "train/rewards_chosen_mean": -0.002254486083984375, "train/rewards_rejected_mean": -2.0233211517333984, "train/tau_mean": 31.678788900375366, "train/tau_std": 8.558496415615082 }, { "epoch": 0.6062087509166463, "grad_norm": 6.3541460037231445, "learning_rate": 4.204934886908842e-06, "loss": 0.3551, "step": 310, "train/lambda_m_mean": 0.04726562788709998, "train/mu_mean": 0.8039631769061089, "train/mu_std": 0.2073785699903965, "train/rewards_chosen_mean": 0.3163958787918091, "train/rewards_rejected_mean": -1.7765493392944336, "train/tau_mean": 29.326242923736572, "train/tau_std": 8.006892085075378 }, { "epoch": 0.6081642630163774, "grad_norm": 6.6144561767578125, "learning_rate": 4.201507882111036e-06, "loss": 0.415, "step": 311, "train/lambda_m_mean": 0.03906249953433871, "train/mu_mean": 0.7577035278081894, "train/mu_std": 0.23972088098526, "train/rewards_chosen_mean": -0.2613677978515625, "train/rewards_rejected_mean": -2.1260862350463867, "train/tau_mean": 32.81280708312988, "train/tau_std": 8.40091186761856 }, { "epoch": 0.6101197751161085, "grad_norm": 5.3669023513793945, "learning_rate": 4.198080877313229e-06, "loss": 0.3847, "step": 312, "train/lambda_m_mean": 0.03798828204162419, "train/mu_mean": 0.7819917425513268, "train/mu_std": 0.22606678679585457, "train/rewards_chosen_mean": 0.0235443115234375, "train/rewards_rejected_mean": -1.9588098526000977, "train/tau_mean": 31.49127459526062, "train/tau_std": 8.509172916412354 }, { "epoch": 0.6120752872158397, "grad_norm": 4.832149982452393, "learning_rate": 4.1946538725154215e-06, "loss": 0.3876, "step": 313, "train/lambda_m_mean": 0.03891601646319032, "train/mu_mean": 0.7731204107403755, "train/mu_std": 0.22778584621846676, "train/rewards_chosen_mean": -0.11818695068359375, "train/rewards_rejected_mean": -1.983633041381836, "train/tau_mean": 31.211769580841064, "train/tau_std": 9.01009315252304 }, { "epoch": 0.6140307993155708, "grad_norm": 5.499162673950195, "learning_rate": 4.191226867717615e-06, "loss": 0.3876, "step": 314, "train/lambda_m_mean": 0.04453125037252903, "train/mu_mean": 0.7701233923435211, "train/mu_std": 0.21912478283047676, "train/rewards_chosen_mean": 0.018209457397460938, "train/rewards_rejected_mean": -1.7519207000732422, "train/tau_mean": 30.83285880088806, "train/tau_std": 9.174290955066681 }, { "epoch": 0.6159863114153019, "grad_norm": 4.36789608001709, "learning_rate": 4.187799862919809e-06, "loss": 0.403, "step": 315, "train/lambda_m_mean": 0.04379882896319032, "train/mu_mean": 0.7526691630482674, "train/mu_std": 0.21105354093015194, "train/rewards_chosen_mean": -0.47170257568359375, "train/rewards_rejected_mean": -2.138195037841797, "train/tau_mean": 31.365097999572754, "train/tau_std": 9.849554181098938 }, { "epoch": 0.617941823515033, "grad_norm": 8.30946159362793, "learning_rate": 4.184372858122001e-06, "loss": 0.4335, "step": 316, "train/lambda_m_mean": 0.04721679771319032, "train/mu_mean": 0.7506323605775833, "train/mu_std": 0.21839086711406708, "train/rewards_chosen_mean": -0.6735553741455078, "train/rewards_rejected_mean": -2.316340446472168, "train/tau_mean": 31.610243320465088, "train/tau_std": 9.886292517185211 }, { "epoch": 0.6198973356147641, "grad_norm": 8.177659034729004, "learning_rate": 4.1809458533241945e-06, "loss": 0.4195, "step": 317, "train/lambda_m_mean": 0.041650391183793545, "train/mu_mean": 0.764789804816246, "train/mu_std": 0.22819996066391468, "train/rewards_chosen_mean": -0.5501241683959961, "train/rewards_rejected_mean": -2.4437551498413086, "train/tau_mean": 31.54524040222168, "train/tau_std": 8.593962550163269 }, { "epoch": 0.6218528477144952, "grad_norm": 5.47844934463501, "learning_rate": 4.177518848526389e-06, "loss": 0.4378, "step": 318, "train/lambda_m_mean": 0.04545898409560323, "train/mu_mean": 0.7453504800796509, "train/mu_std": 0.23542053997516632, "train/rewards_chosen_mean": -0.38290780782699585, "train/rewards_rejected_mean": -2.028522491455078, "train/tau_mean": 30.444460153579712, "train/tau_std": 9.583301782608032 }, { "epoch": 0.6238083598142263, "grad_norm": 3.8195271492004395, "learning_rate": 4.174091843728582e-06, "loss": 0.3424, "step": 319, "train/lambda_m_mean": 0.04301757947541773, "train/mu_mean": 0.7874171286821365, "train/mu_std": 0.19080077856779099, "train/rewards_chosen_mean": 0.01014089584350586, "train/rewards_rejected_mean": -1.8635673522949219, "train/tau_mean": 30.40273356437683, "train/tau_std": 8.765032887458801 }, { "epoch": 0.6257638719139574, "grad_norm": 5.132515907287598, "learning_rate": 4.170664838930774e-06, "loss": 0.3647, "step": 320, "train/lambda_m_mean": 0.04204101720824838, "train/mu_mean": 0.7854139357805252, "train/mu_std": 0.19657788425683975, "train/rewards_chosen_mean": -0.18784713745117188, "train/rewards_rejected_mean": -2.146638870239258, "train/tau_mean": 32.380539417266846, "train/tau_std": 10.166955053806305 }, { "epoch": 0.6277193840136885, "grad_norm": 6.169029235839844, "learning_rate": 4.167237834132968e-06, "loss": 0.3668, "step": 321, "train/lambda_m_mean": 0.03969726711511612, "train/mu_mean": 0.7891190946102142, "train/mu_std": 0.2025968711823225, "train/rewards_chosen_mean": -0.43701744079589844, "train/rewards_rejected_mean": -2.637554168701172, "train/tau_mean": 32.11608386039734, "train/tau_std": 10.1884326338768 }, { "epoch": 0.6296748961134196, "grad_norm": 18.50957489013672, "learning_rate": 4.163810829335162e-06, "loss": 0.48, "step": 322, "train/lambda_m_mean": 0.04003906389698386, "train/mu_mean": 0.769158385694027, "train/mu_std": 0.2538728639483452, "train/rewards_chosen_mean": -0.40852928161621094, "train/rewards_rejected_mean": -2.508127212524414, "train/tau_mean": 35.00670623779297, "train/tau_std": 11.095792591571808 }, { "epoch": 0.6316304082131509, "grad_norm": 7.556807518005371, "learning_rate": 4.160383824537355e-06, "loss": 0.3822, "step": 323, "train/lambda_m_mean": 0.034570313058793545, "train/mu_mean": 0.790294773876667, "train/mu_std": 0.22933636233210564, "train/rewards_chosen_mean": -0.8389163017272949, "train/rewards_rejected_mean": -3.18353271484375, "train/tau_mean": 35.22740292549133, "train/tau_std": 11.321607530117035 }, { "epoch": 0.633585920312882, "grad_norm": 5.635590553283691, "learning_rate": 4.156956819739548e-06, "loss": 0.366, "step": 324, "train/lambda_m_mean": 0.04458007961511612, "train/mu_mean": 0.7955590561032295, "train/mu_std": 0.2128606829792261, "train/rewards_chosen_mean": -0.5276432037353516, "train/rewards_rejected_mean": -2.5612106323242188, "train/tau_mean": 31.611531734466553, "train/tau_std": 11.381885290145874 }, { "epoch": 0.6355414324126131, "grad_norm": 4.604992866516113, "learning_rate": 4.1535298149417415e-06, "loss": 0.3761, "step": 325, "train/lambda_m_mean": 0.04331054771319032, "train/mu_mean": 0.778004102408886, "train/mu_std": 0.21599550731480122, "train/rewards_chosen_mean": -0.6258258819580078, "train/rewards_rejected_mean": -2.4921112060546875, "train/tau_mean": 31.21306586265564, "train/tau_std": 10.551878929138184 }, { "epoch": 0.6374969445123442, "grad_norm": 5.018922805786133, "learning_rate": 4.150102810143935e-06, "loss": 0.3859, "step": 326, "train/lambda_m_mean": 0.05014648661017418, "train/mu_mean": 0.753914974629879, "train/mu_std": 0.18409314006567, "train/rewards_chosen_mean": -0.6699657440185547, "train/rewards_rejected_mean": -2.1886749267578125, "train/tau_mean": 29.331053256988525, "train/tau_std": 11.183663487434387 }, { "epoch": 0.6394524566120753, "grad_norm": 4.842251777648926, "learning_rate": 4.146675805346128e-06, "loss": 0.3909, "step": 327, "train/lambda_m_mean": 0.0458496103528887, "train/mu_mean": 0.7646671831607819, "train/mu_std": 0.21317458525300026, "train/rewards_chosen_mean": -0.2345561981201172, "train/rewards_rejected_mean": -1.9800338745117188, "train/tau_mean": 32.098628520965576, "train/tau_std": 12.428023219108582 }, { "epoch": 0.6414079687118064, "grad_norm": 5.5486555099487305, "learning_rate": 4.143248800548321e-06, "loss": 0.3905, "step": 328, "train/lambda_m_mean": 0.04433593852445483, "train/mu_mean": 0.7605325356125832, "train/mu_std": 0.19891776889562607, "train/rewards_chosen_mean": -0.040225982666015625, "train/rewards_rejected_mean": -1.7724876403808594, "train/tau_mean": 30.667142629623413, "train/tau_std": 10.097985208034515 }, { "epoch": 0.6433634808115375, "grad_norm": 7.350486755371094, "learning_rate": 4.1398217957505145e-06, "loss": 0.4363, "step": 329, "train/lambda_m_mean": 0.04189453204162419, "train/mu_mean": 0.7544024139642715, "train/mu_std": 0.23597269877791405, "train/rewards_chosen_mean": 0.0733489990234375, "train/rewards_rejected_mean": -1.710245132446289, "train/tau_mean": 32.87781858444214, "train/tau_std": 11.140480399131775 }, { "epoch": 0.6453189929112686, "grad_norm": 7.135697841644287, "learning_rate": 4.136394790952708e-06, "loss": 0.4233, "step": 330, "train/lambda_m_mean": 0.04536132887005806, "train/mu_mean": 0.7741181999444962, "train/mu_std": 0.23084967024624348, "train/rewards_chosen_mean": 0.8102331161499023, "train/rewards_rejected_mean": -1.1907234191894531, "train/tau_mean": 32.23362970352173, "train/tau_std": 12.249086260795593 }, { "epoch": 0.6472745050109997, "grad_norm": 5.971845626831055, "learning_rate": 4.132967786154901e-06, "loss": 0.4092, "step": 331, "train/lambda_m_mean": 0.04374999995343387, "train/mu_mean": 0.7666233703494072, "train/mu_std": 0.22298135049641132, "train/rewards_chosen_mean": 0.4881572723388672, "train/rewards_rejected_mean": -1.3315999507904053, "train/tau_mean": 31.671157121658325, "train/tau_std": 11.718724608421326 }, { "epoch": 0.6492300171107309, "grad_norm": 5.109337329864502, "learning_rate": 4.129540781357094e-06, "loss": 0.3304, "step": 332, "train/lambda_m_mean": 0.04360351571813226, "train/mu_mean": 0.8135948479175568, "train/mu_std": 0.19913469441235065, "train/rewards_chosen_mean": 0.7449111938476562, "train/rewards_rejected_mean": -1.3973076343536377, "train/tau_mean": 32.00653624534607, "train/tau_std": 11.61254894733429 }, { "epoch": 0.651185529210462, "grad_norm": 6.331543922424316, "learning_rate": 4.1261137765592876e-06, "loss": 0.3827, "step": 333, "train/lambda_m_mean": 0.03681640699505806, "train/mu_mean": 0.7779244780540466, "train/mu_std": 0.22700375691056252, "train/rewards_chosen_mean": 0.19059371948242188, "train/rewards_rejected_mean": -1.7281742095947266, "train/tau_mean": 34.4146683216095, "train/tau_std": 12.073024153709412 }, { "epoch": 0.6531410413101931, "grad_norm": 6.172979354858398, "learning_rate": 4.122686771761481e-06, "loss": 0.3986, "step": 334, "train/lambda_m_mean": 0.04335937672294676, "train/mu_mean": 0.7605570554733276, "train/mu_std": 0.2115964200347662, "train/rewards_chosen_mean": 0.1567058563232422, "train/rewards_rejected_mean": -1.6120176315307617, "train/tau_mean": 32.28338074684143, "train/tau_std": 10.992487907409668 }, { "epoch": 0.6550965534099242, "grad_norm": 6.246697902679443, "learning_rate": 4.119259766963674e-06, "loss": 0.369, "step": 335, "train/lambda_m_mean": 0.038330078125, "train/mu_mean": 0.777490496635437, "train/mu_std": 0.21415580995380878, "train/rewards_chosen_mean": 0.008985519409179688, "train/rewards_rejected_mean": -1.8084359169006348, "train/tau_mean": 33.635191679000854, "train/tau_std": 11.81436562538147 }, { "epoch": 0.6570520655096553, "grad_norm": 6.744095325469971, "learning_rate": 4.115832762165867e-06, "loss": 0.3779, "step": 336, "train/lambda_m_mean": 0.04541015811264515, "train/mu_mean": 0.7642999663949013, "train/mu_std": 0.19282242469489574, "train/rewards_chosen_mean": -0.4506702423095703, "train/rewards_rejected_mean": -2.168278217315674, "train/tau_mean": 31.672369956970215, "train/tau_std": 11.86206042766571 }, { "epoch": 0.6590075776093864, "grad_norm": 6.686132907867432, "learning_rate": 4.112405757368061e-06, "loss": 0.4018, "step": 337, "train/lambda_m_mean": 0.041406252421438694, "train/mu_mean": 0.7652333602309227, "train/mu_std": 0.22630933858454227, "train/rewards_chosen_mean": -0.01178741455078125, "train/rewards_rejected_mean": -1.816558837890625, "train/tau_mean": 31.693177223205566, "train/tau_std": 11.524611234664917 }, { "epoch": 0.6609630897091175, "grad_norm": 5.594915390014648, "learning_rate": 4.108978752570254e-06, "loss": 0.3649, "step": 338, "train/lambda_m_mean": 0.04370117327198386, "train/mu_mean": 0.783181682229042, "train/mu_std": 0.19967390038073063, "train/rewards_chosen_mean": -0.1278700828552246, "train/rewards_rejected_mean": -2.0521488189697266, "train/tau_mean": 31.65675115585327, "train/tau_std": 11.602554082870483 }, { "epoch": 0.6629186018088487, "grad_norm": 13.47895622253418, "learning_rate": 4.105551747772447e-06, "loss": 0.4277, "step": 339, "train/lambda_m_mean": 0.04550781357102096, "train/mu_mean": 0.7962566614151001, "train/mu_std": 0.1985413134098053, "train/rewards_chosen_mean": 0.23981475830078125, "train/rewards_rejected_mean": -1.9178423881530762, "train/tau_mean": 32.9497230052948, "train/tau_std": 13.129249811172485 }, { "epoch": 0.6648741139085798, "grad_norm": 5.259345531463623, "learning_rate": 4.10212474297464e-06, "loss": 0.3815, "step": 340, "train/lambda_m_mean": 0.05048828339204192, "train/mu_mean": 0.7838718742132187, "train/mu_std": 0.19936152547597885, "train/rewards_chosen_mean": 0.11957740783691406, "train/rewards_rejected_mean": -1.7474318742752075, "train/tau_mean": 31.569368600845337, "train/tau_std": 12.192358016967773 }, { "epoch": 0.666829626008311, "grad_norm": 8.301177978515625, "learning_rate": 4.098697738176834e-06, "loss": 0.3932, "step": 341, "train/lambda_m_mean": 0.04926757887005806, "train/mu_mean": 0.7916114702820778, "train/mu_std": 0.2181376777589321, "train/rewards_chosen_mean": 0.23589253425598145, "train/rewards_rejected_mean": -1.7293176651000977, "train/tau_mean": 30.259812831878662, "train/tau_std": 11.093867540359497 }, { "epoch": 0.6687851381080421, "grad_norm": 4.553437232971191, "learning_rate": 4.095270733379027e-06, "loss": 0.3535, "step": 342, "train/lambda_m_mean": 0.04477539146319032, "train/mu_mean": 0.791391059756279, "train/mu_std": 0.19936157297343016, "train/rewards_chosen_mean": 0.0673980712890625, "train/rewards_rejected_mean": -1.8602581024169922, "train/tau_mean": 28.589577436447144, "train/tau_std": 10.95235288143158 }, { "epoch": 0.6707406502077732, "grad_norm": 4.928591251373291, "learning_rate": 4.09184372858122e-06, "loss": 0.3949, "step": 343, "train/lambda_m_mean": 0.047949220053851604, "train/mu_mean": 0.7803432047367096, "train/mu_std": 0.2130506057292223, "train/rewards_chosen_mean": -0.19418907165527344, "train/rewards_rejected_mean": -2.0144500732421875, "train/tau_mean": 28.55855631828308, "train/tau_std": 11.02649438381195 }, { "epoch": 0.6726961623075043, "grad_norm": 5.328364849090576, "learning_rate": 4.0884167237834135e-06, "loss": 0.3959, "step": 344, "train/lambda_m_mean": 0.045458986423909664, "train/mu_mean": 0.7527226954698563, "train/mu_std": 0.19420157745480537, "train/rewards_chosen_mean": -0.6882839202880859, "train/rewards_rejected_mean": -2.1817092895507812, "train/tau_mean": 28.688835382461548, "train/tau_std": 11.469707489013672 }, { "epoch": 0.6746516744072354, "grad_norm": 5.509541034698486, "learning_rate": 4.084989718985607e-06, "loss": 0.3972, "step": 345, "train/lambda_m_mean": 0.04106445494107902, "train/mu_mean": 0.755455732345581, "train/mu_std": 0.2159725073724985, "train/rewards_chosen_mean": -0.6196002960205078, "train/rewards_rejected_mean": -2.2501907348632812, "train/tau_mean": 29.876640796661377, "train/tau_std": 11.225894093513489 }, { "epoch": 0.6766071865069665, "grad_norm": 4.559186935424805, "learning_rate": 4.0815627141878e-06, "loss": 0.438, "step": 346, "train/lambda_m_mean": 0.048974609933793545, "train/mu_mean": 0.7322620004415512, "train/mu_std": 0.20933939330279827, "train/rewards_chosen_mean": -0.9296340942382812, "train/rewards_rejected_mean": -2.3906497955322266, "train/tau_mean": 28.031959533691406, "train/tau_std": 11.129688501358032 }, { "epoch": 0.6785626986066976, "grad_norm": 5.323885440826416, "learning_rate": 4.078135709389994e-06, "loss": 0.3723, "step": 347, "train/lambda_m_mean": 0.049414063803851604, "train/mu_mean": 0.7816778346896172, "train/mu_std": 0.20743129961192608, "train/rewards_chosen_mean": -0.5380516052246094, "train/rewards_rejected_mean": -2.3887405395507812, "train/tau_mean": 28.42229437828064, "train/tau_std": 11.644850373268127 }, { "epoch": 0.6805182107064287, "grad_norm": 5.485777854919434, "learning_rate": 4.0747087045921865e-06, "loss": 0.3723, "step": 348, "train/lambda_m_mean": 0.0440429684240371, "train/mu_mean": 0.78459682315588, "train/mu_std": 0.21215505711734295, "train/rewards_chosen_mean": -0.19977760314941406, "train/rewards_rejected_mean": -2.1953258514404297, "train/tau_mean": 30.88076138496399, "train/tau_std": 11.381887197494507 }, { "epoch": 0.6824737228061598, "grad_norm": 5.976443290710449, "learning_rate": 4.07128169979438e-06, "loss": 0.3807, "step": 349, "train/lambda_m_mean": 0.04731445387005806, "train/mu_mean": 0.7838690504431725, "train/mu_std": 0.21845452301204205, "train/rewards_chosen_mean": -0.07810020446777344, "train/rewards_rejected_mean": -2.120615005493164, "train/tau_mean": 31.01642346382141, "train/tau_std": 12.962032556533813 }, { "epoch": 0.684429234905891, "grad_norm": 8.760558128356934, "learning_rate": 4.067854694996574e-06, "loss": 0.4095, "step": 350, "train/lambda_m_mean": 0.04584961058571935, "train/mu_mean": 0.771177388727665, "train/mu_std": 0.21650788187980652, "train/rewards_chosen_mean": -0.0793466567993164, "train/rewards_rejected_mean": -2.0096359252929688, "train/tau_mean": 29.73142170906067, "train/tau_std": 11.270426988601685 }, { "epoch": 0.6863847470056221, "grad_norm": 10.756919860839844, "learning_rate": 4.064427690198766e-06, "loss": 0.394, "step": 351, "train/lambda_m_mean": 0.047460938803851604, "train/mu_mean": 0.7965559586882591, "train/mu_std": 0.21620879881083965, "train/rewards_chosen_mean": 0.4536008834838867, "train/rewards_rejected_mean": -1.6484432220458984, "train/tau_mean": 29.4520103931427, "train/tau_std": 12.72091019153595 }, { "epoch": 0.6883402591053532, "grad_norm": 7.192263126373291, "learning_rate": 4.0610006854009596e-06, "loss": 0.3939, "step": 352, "train/lambda_m_mean": 0.04204101720824838, "train/mu_mean": 0.7735478803515434, "train/mu_std": 0.2155851423740387, "train/rewards_chosen_mean": 0.4789586067199707, "train/rewards_rejected_mean": -1.4101386070251465, "train/tau_mean": 29.471003770828247, "train/tau_std": 11.943026542663574 }, { "epoch": 0.6902957712050843, "grad_norm": 5.627167701721191, "learning_rate": 4.057573680603153e-06, "loss": 0.3465, "step": 353, "train/lambda_m_mean": 0.04345703357830644, "train/mu_mean": 0.8034969419240952, "train/mu_std": 0.19780482351779938, "train/rewards_chosen_mean": 0.6863489151000977, "train/rewards_rejected_mean": -1.3169169425964355, "train/tau_mean": 27.979764223098755, "train/tau_std": 10.88703715801239 }, { "epoch": 0.6922512833048154, "grad_norm": 5.399471759796143, "learning_rate": 4.054146675805347e-06, "loss": 0.3855, "step": 354, "train/lambda_m_mean": 0.04609375121071935, "train/mu_mean": 0.7634414881467819, "train/mu_std": 0.2036236748099327, "train/rewards_chosen_mean": 0.4392585754394531, "train/rewards_rejected_mean": -1.21234130859375, "train/tau_mean": 25.644909381866455, "train/tau_std": 10.103271305561066 }, { "epoch": 0.6942067954045466, "grad_norm": 4.551477909088135, "learning_rate": 4.050719671007539e-06, "loss": 0.3575, "step": 355, "train/lambda_m_mean": 0.03745117178186774, "train/mu_mean": 0.7854263186454773, "train/mu_std": 0.203654695302248, "train/rewards_chosen_mean": 0.09257125854492188, "train/rewards_rejected_mean": -1.704279899597168, "train/tau_mean": 29.557100534439087, "train/tau_std": 10.609154462814331 }, { "epoch": 0.6961623075042777, "grad_norm": 6.307047367095947, "learning_rate": 4.047292666209733e-06, "loss": 0.4253, "step": 356, "train/lambda_m_mean": 0.03964843833819032, "train/mu_mean": 0.7489712759852409, "train/mu_std": 0.22815564461052418, "train/rewards_chosen_mean": -0.1790604591369629, "train/rewards_rejected_mean": -1.8179779052734375, "train/tau_mean": 30.093255281448364, "train/tau_std": 11.600051283836365 }, { "epoch": 0.6981178196040088, "grad_norm": 7.2402238845825195, "learning_rate": 4.043865661411927e-06, "loss": 0.3544, "step": 357, "train/lambda_m_mean": 0.03872070484794676, "train/mu_mean": 0.7856538370251656, "train/mu_std": 0.20592390187084675, "train/rewards_chosen_mean": -0.5254478454589844, "train/rewards_rejected_mean": -2.3985939025878906, "train/tau_mean": 30.42170739173889, "train/tau_std": 10.544421672821045 }, { "epoch": 0.7000733317037399, "grad_norm": 6.601073265075684, "learning_rate": 4.04043865661412e-06, "loss": 0.427, "step": 358, "train/lambda_m_mean": 0.04628906399011612, "train/mu_mean": 0.7456270307302475, "train/mu_std": 0.2220831010490656, "train/rewards_chosen_mean": -0.9815177917480469, "train/rewards_rejected_mean": -2.6553802490234375, "train/tau_mean": 29.612167358398438, "train/tau_std": 11.614918112754822 }, { "epoch": 0.7020288438034711, "grad_norm": 4.905711650848389, "learning_rate": 4.037011651816312e-06, "loss": 0.3514, "step": 359, "train/lambda_m_mean": 0.04589843796566129, "train/mu_mean": 0.7978499978780746, "train/mu_std": 0.2054575253278017, "train/rewards_chosen_mean": -0.5994796752929688, "train/rewards_rejected_mean": -2.732046127319336, "train/tau_mean": 30.958841562271118, "train/tau_std": 12.087960481643677 }, { "epoch": 0.7039843559032022, "grad_norm": 6.217625141143799, "learning_rate": 4.0335846470185065e-06, "loss": 0.3654, "step": 360, "train/lambda_m_mean": 0.05151367234066129, "train/mu_mean": 0.8010006844997406, "train/mu_std": 0.19918020069599152, "train/rewards_chosen_mean": -0.8149271011352539, "train/rewards_rejected_mean": -2.8457860946655273, "train/tau_mean": 31.00564432144165, "train/tau_std": 12.114604353904724 }, { "epoch": 0.7059398680029333, "grad_norm": 7.094750881195068, "learning_rate": 4.0301576422207e-06, "loss": 0.3709, "step": 361, "train/lambda_m_mean": 0.04677734477445483, "train/mu_mean": 0.8053304478526115, "train/mu_std": 0.20440784096717834, "train/rewards_chosen_mean": -0.5522408485412598, "train/rewards_rejected_mean": -2.6938323974609375, "train/tau_mean": 33.0712411403656, "train/tau_std": 12.731422424316406 }, { "epoch": 0.7078953801026644, "grad_norm": 3.3796417713165283, "learning_rate": 4.026730637422892e-06, "loss": 0.2843, "step": 362, "train/lambda_m_mean": 0.04262695321813226, "train/mu_mean": 0.8452496528625488, "train/mu_std": 0.18547238782048225, "train/rewards_chosen_mean": -0.39382266998291016, "train/rewards_rejected_mean": -2.89508056640625, "train/tau_mean": 35.15808057785034, "train/tau_std": 13.617368817329407 }, { "epoch": 0.7098508922023955, "grad_norm": 8.561649322509766, "learning_rate": 4.023303632625086e-06, "loss": 0.4297, "step": 363, "train/lambda_m_mean": 0.042333984980359674, "train/mu_mean": 0.7668076679110527, "train/mu_std": 0.2227313555777073, "train/rewards_chosen_mean": -0.08548259735107422, "train/rewards_rejected_mean": -2.042398452758789, "train/tau_mean": 35.959657192230225, "train/tau_std": 13.346782803535461 }, { "epoch": 0.7118064043021266, "grad_norm": 5.765273571014404, "learning_rate": 4.0198766278272796e-06, "loss": 0.3582, "step": 364, "train/lambda_m_mean": 0.04682617262005806, "train/mu_mean": 0.7963268831372261, "train/mu_std": 0.20309429336339235, "train/rewards_chosen_mean": 0.10619163513183594, "train/rewards_rejected_mean": -1.9201202392578125, "train/tau_mean": 33.14381670951843, "train/tau_std": 13.260381579399109 }, { "epoch": 0.7137619164018577, "grad_norm": 6.604729652404785, "learning_rate": 4.016449623029473e-06, "loss": 0.4045, "step": 365, "train/lambda_m_mean": 0.04741211049258709, "train/mu_mean": 0.7635313495993614, "train/mu_std": 0.22064510732889175, "train/rewards_chosen_mean": 0.16513705253601074, "train/rewards_rejected_mean": -1.613062858581543, "train/tau_mean": 32.16968750953674, "train/tau_std": 11.849596381187439 }, { "epoch": 0.7157174285015888, "grad_norm": 6.93689489364624, "learning_rate": 4.013022618231665e-06, "loss": 0.4193, "step": 366, "train/lambda_m_mean": 0.045068359933793545, "train/mu_mean": 0.7550241127610207, "train/mu_std": 0.21835463866591454, "train/rewards_chosen_mean": 0.1523299217224121, "train/rewards_rejected_mean": -1.5684852600097656, "train/tau_mean": 33.800243854522705, "train/tau_std": 12.829034209251404 }, { "epoch": 0.7176729406013199, "grad_norm": 4.694178581237793, "learning_rate": 4.009595613433859e-06, "loss": 0.3218, "step": 367, "train/lambda_m_mean": 0.04316406324505806, "train/mu_mean": 0.8011778816580772, "train/mu_std": 0.17873391136527061, "train/rewards_chosen_mean": 0.2982778549194336, "train/rewards_rejected_mean": -1.6343669891357422, "train/tau_mean": 36.35483264923096, "train/tau_std": 13.64227545261383 }, { "epoch": 0.7196284527010511, "grad_norm": 4.288336753845215, "learning_rate": 4.006168608636053e-06, "loss": 0.3569, "step": 368, "train/lambda_m_mean": 0.05068359477445483, "train/mu_mean": 0.7911434769630432, "train/mu_std": 0.19910231605172157, "train/rewards_chosen_mean": 0.05866813659667969, "train/rewards_rejected_mean": -1.9309660196304321, "train/tau_mean": 32.988266944885254, "train/tau_std": 13.288143038749695 }, { "epoch": 0.7215839648007822, "grad_norm": 6.923356533050537, "learning_rate": 4.002741603838246e-06, "loss": 0.3362, "step": 369, "train/lambda_m_mean": 0.03491211007349193, "train/mu_mean": 0.8134711161255836, "train/mu_std": 0.21384836360812187, "train/rewards_chosen_mean": -0.8445086479187012, "train/rewards_rejected_mean": -3.3350601196289062, "train/tau_mean": 41.010852336883545, "train/tau_std": 13.889739632606506 }, { "epoch": 0.7235394769005133, "grad_norm": 8.11633586883545, "learning_rate": 3.999314599040439e-06, "loss": 0.436, "step": 370, "train/lambda_m_mean": 0.042333985678851604, "train/mu_mean": 0.7605349868535995, "train/mu_std": 0.2458302415907383, "train/rewards_chosen_mean": -0.7137336730957031, "train/rewards_rejected_mean": -2.738006591796875, "train/tau_mean": 39.67570161819458, "train/tau_std": 14.995908260345459 }, { "epoch": 0.7254949890002445, "grad_norm": 6.084746837615967, "learning_rate": 3.995887594242632e-06, "loss": 0.372, "step": 371, "train/lambda_m_mean": 0.04604492196813226, "train/mu_mean": 0.7838908731937408, "train/mu_std": 0.21247492544353008, "train/rewards_chosen_mean": -1.214930534362793, "train/rewards_rejected_mean": -3.1064071655273438, "train/tau_mean": 36.65717673301697, "train/tau_std": 15.145748734474182 }, { "epoch": 0.7274505010999756, "grad_norm": 4.83130407333374, "learning_rate": 3.992460589444826e-06, "loss": 0.3721, "step": 372, "train/lambda_m_mean": 0.045605468563735485, "train/mu_mean": 0.7783629149198532, "train/mu_std": 0.20826442912220955, "train/rewards_chosen_mean": -1.6409077644348145, "train/rewards_rejected_mean": -3.5743942260742188, "train/tau_mean": 36.5238778591156, "train/tau_std": 14.396725416183472 }, { "epoch": 0.7294060131997067, "grad_norm": 5.57642936706543, "learning_rate": 3.989033584647019e-06, "loss": 0.3774, "step": 373, "train/lambda_m_mean": 0.045751954428851604, "train/mu_mean": 0.7722861766815186, "train/mu_std": 0.21422808058559895, "train/rewards_chosen_mean": -1.4084978103637695, "train/rewards_rejected_mean": -3.2912521362304688, "train/tau_mean": 34.901718616485596, "train/tau_std": 14.9901442527771 }, { "epoch": 0.7313615252994378, "grad_norm": 4.457095623016357, "learning_rate": 3.985606579849212e-06, "loss": 0.3246, "step": 374, "train/lambda_m_mean": 0.045263673178851604, "train/mu_mean": 0.8117682114243507, "train/mu_std": 0.1868651658296585, "train/rewards_chosen_mean": -1.9990234375, "train/rewards_rejected_mean": -4.07598876953125, "train/tau_mean": 34.31154727935791, "train/tau_std": 13.533478379249573 }, { "epoch": 0.7333170373991689, "grad_norm": 5.841901779174805, "learning_rate": 3.9821795750514055e-06, "loss": 0.4373, "step": 375, "train/lambda_m_mean": 0.04462890746071935, "train/mu_mean": 0.7417844533920288, "train/mu_std": 0.23327958770096302, "train/rewards_chosen_mean": -1.8201427459716797, "train/rewards_rejected_mean": -3.486236572265625, "train/tau_mean": 36.21882772445679, "train/tau_std": 16.051984906196594 }, { "epoch": 0.7352725494989, "grad_norm": 7.648315906524658, "learning_rate": 3.978752570253599e-06, "loss": 0.3602, "step": 376, "train/lambda_m_mean": 0.04536132933571935, "train/mu_mean": 0.8209062293171883, "train/mu_std": 0.21869874093681574, "train/rewards_chosen_mean": -0.9906997680664062, "train/rewards_rejected_mean": -3.31817626953125, "train/tau_mean": 34.96469330787659, "train/tau_std": 15.382327198982239 }, { "epoch": 0.7372280615986312, "grad_norm": 6.8951802253723145, "learning_rate": 3.975325565455792e-06, "loss": 0.369, "step": 377, "train/lambda_m_mean": 0.04169921879656613, "train/mu_mean": 0.7941224351525307, "train/mu_std": 0.21946285106241703, "train/rewards_chosen_mean": -1.357879638671875, "train/rewards_rejected_mean": -3.435953140258789, "train/tau_mean": 37.592652320861816, "train/tau_std": 15.05434238910675 }, { "epoch": 0.7391835736983623, "grad_norm": 7.045261859893799, "learning_rate": 3.971898560657985e-06, "loss": 0.3763, "step": 378, "train/lambda_m_mean": 0.04238281352445483, "train/mu_mean": 0.792046345770359, "train/mu_std": 0.21209781244397163, "train/rewards_chosen_mean": -1.1026959419250488, "train/rewards_rejected_mean": -3.2418365478515625, "train/tau_mean": 35.317002296447754, "train/tau_std": 14.30333161354065 }, { "epoch": 0.7411390857980934, "grad_norm": 5.044384002685547, "learning_rate": 3.9684715558601785e-06, "loss": 0.3809, "step": 379, "train/lambda_m_mean": 0.042236329056322575, "train/mu_mean": 0.772498719394207, "train/mu_std": 0.20745998434722424, "train/rewards_chosen_mean": -1.2101478576660156, "train/rewards_rejected_mean": -3.006732940673828, "train/tau_mean": 33.69096064567566, "train/tau_std": 13.590723514556885 }, { "epoch": 0.7430945978978245, "grad_norm": 6.042482852935791, "learning_rate": 3.965044551062372e-06, "loss": 0.3697, "step": 380, "train/lambda_m_mean": 0.042089845752343535, "train/mu_mean": 0.7836411520838737, "train/mu_std": 0.2002158034592867, "train/rewards_chosen_mean": -1.2427635192871094, "train/rewards_rejected_mean": -3.111663818359375, "train/tau_mean": 33.65243124961853, "train/tau_std": 13.787930011749268 }, { "epoch": 0.7450501099975556, "grad_norm": 5.737424373626709, "learning_rate": 3.961617546264565e-06, "loss": 0.4042, "step": 381, "train/lambda_m_mean": 0.04619140736758709, "train/mu_mean": 0.7750113978981972, "train/mu_std": 0.21629426162689924, "train/rewards_chosen_mean": -1.0355830192565918, "train/rewards_rejected_mean": -2.917504668235779, "train/tau_mean": 32.432570457458496, "train/tau_std": 14.695632934570312 }, { "epoch": 0.7470056220972867, "grad_norm": 5.266127586364746, "learning_rate": 3.958190541466758e-06, "loss": 0.375, "step": 382, "train/lambda_m_mean": 0.04790039174258709, "train/mu_mean": 0.7766255214810371, "train/mu_std": 0.20009410195052624, "train/rewards_chosen_mean": -0.80487060546875, "train/rewards_rejected_mean": -2.6237411499023438, "train/tau_mean": 30.509130239486694, "train/tau_std": 13.663369178771973 }, { "epoch": 0.7489611341970178, "grad_norm": 5.556458950042725, "learning_rate": 3.9547635366689516e-06, "loss": 0.3587, "step": 383, "train/lambda_m_mean": 0.03891601646319032, "train/mu_mean": 0.7904663309454918, "train/mu_std": 0.21459759585559368, "train/rewards_chosen_mean": -1.021517276763916, "train/rewards_rejected_mean": -2.9787750244140625, "train/tau_mean": 34.32772421836853, "train/tau_std": 13.222581267356873 }, { "epoch": 0.7509166462967489, "grad_norm": 5.331334590911865, "learning_rate": 3.951336531871145e-06, "loss": 0.3851, "step": 384, "train/lambda_m_mean": 0.04028320359066129, "train/mu_mean": 0.7654768973588943, "train/mu_std": 0.2165120579302311, "train/rewards_chosen_mean": -0.6404159069061279, "train/rewards_rejected_mean": -2.4920032024383545, "train/tau_mean": 32.99752688407898, "train/tau_std": 13.541016578674316 }, { "epoch": 0.75287215839648, "grad_norm": 7.543819904327393, "learning_rate": 3.947909527073338e-06, "loss": 0.3242, "step": 385, "train/lambda_m_mean": 0.04184570489451289, "train/mu_mean": 0.8106869757175446, "train/mu_std": 0.19233975559473038, "train/rewards_chosen_mean": -0.2499542236328125, "train/rewards_rejected_mean": -2.420436382293701, "train/tau_mean": 33.53919529914856, "train/tau_std": 14.518796801567078 }, { "epoch": 0.7548276704962112, "grad_norm": 6.64152717590332, "learning_rate": 3.944482522275531e-06, "loss": 0.3865, "step": 386, "train/lambda_m_mean": 0.04614257859066129, "train/mu_mean": 0.7752031534910202, "train/mu_std": 0.21790580451488495, "train/rewards_chosen_mean": -0.24648284912109375, "train/rewards_rejected_mean": -2.1494197845458984, "train/tau_mean": 32.94740056991577, "train/tau_std": 14.28356122970581 }, { "epoch": 0.7567831825959423, "grad_norm": 7.941102027893066, "learning_rate": 3.941055517477725e-06, "loss": 0.4596, "step": 387, "train/lambda_m_mean": 0.041259765625, "train/mu_mean": 0.7430495396256447, "train/mu_std": 0.240564638748765, "train/rewards_chosen_mean": -0.8324284553527832, "train/rewards_rejected_mean": -2.5662269592285156, "train/tau_mean": 34.983593463897705, "train/tau_std": 14.348434567451477 }, { "epoch": 0.7587386946956735, "grad_norm": 4.809168815612793, "learning_rate": 3.937628512679918e-06, "loss": 0.3824, "step": 388, "train/lambda_m_mean": 0.044433594681322575, "train/mu_mean": 0.779906339943409, "train/mu_std": 0.22706972807645798, "train/rewards_chosen_mean": -0.6674976348876953, "train/rewards_rejected_mean": -2.6273365020751953, "train/tau_mean": 33.987287521362305, "train/tau_std": 14.228681683540344 }, { "epoch": 0.7606942067954046, "grad_norm": 6.457339763641357, "learning_rate": 3.934201507882111e-06, "loss": 0.3355, "step": 389, "train/lambda_m_mean": 0.04409179883077741, "train/mu_mean": 0.8194610550999641, "train/mu_std": 0.1945861279964447, "train/rewards_chosen_mean": 0.14970874786376953, "train/rewards_rejected_mean": -2.0713329315185547, "train/tau_mean": 33.6846718788147, "train/tau_std": 14.098705649375916 }, { "epoch": 0.7626497188951357, "grad_norm": 6.213330268859863, "learning_rate": 3.930774503084304e-06, "loss": 0.3837, "step": 390, "train/lambda_m_mean": 0.04633789206854999, "train/mu_mean": 0.773614376783371, "train/mu_std": 0.20645080506801605, "train/rewards_chosen_mean": 0.29464802145957947, "train/rewards_rejected_mean": -1.5167512893676758, "train/tau_mean": 32.77578043937683, "train/tau_std": 14.271580457687378 }, { "epoch": 0.7646052309948668, "grad_norm": 6.120866298675537, "learning_rate": 3.927347498286498e-06, "loss": 0.3263, "step": 391, "train/lambda_m_mean": 0.0397460947278887, "train/mu_mean": 0.8066305667161942, "train/mu_std": 0.20307224243879318, "train/rewards_chosen_mean": 0.25275325775146484, "train/rewards_rejected_mean": -1.8883533477783203, "train/tau_mean": 33.267460346221924, "train/tau_std": 12.645367741584778 }, { "epoch": 0.7665607430945979, "grad_norm": 6.1970295906066895, "learning_rate": 3.923920493488691e-06, "loss": 0.3905, "step": 392, "train/lambda_m_mean": 0.052685548551380634, "train/mu_mean": 0.7868192121386528, "train/mu_std": 0.2234182469546795, "train/rewards_chosen_mean": 0.45441722869873047, "train/rewards_rejected_mean": -1.445299506187439, "train/tau_mean": 27.474360704421997, "train/tau_std": 11.927778244018555 }, { "epoch": 0.768516255194329, "grad_norm": 4.2422943115234375, "learning_rate": 3.920493488690885e-06, "loss": 0.3635, "step": 393, "train/lambda_m_mean": 0.043066407553851604, "train/mu_mean": 0.7782260328531265, "train/mu_std": 0.20381170511245728, "train/rewards_chosen_mean": 0.8319282531738281, "train/rewards_rejected_mean": -1.0280828475952148, "train/tau_mean": 30.133925437927246, "train/tau_std": 12.330523133277893 }, { "epoch": 0.7704717672940601, "grad_norm": 4.970379829406738, "learning_rate": 3.9170664838930775e-06, "loss": 0.3707, "step": 394, "train/lambda_m_mean": 0.04077148553915322, "train/mu_mean": 0.7790161594748497, "train/mu_std": 0.21462309919297695, "train/rewards_chosen_mean": 0.7084197998046875, "train/rewards_rejected_mean": -1.1995716094970703, "train/tau_mean": 31.34269618988037, "train/tau_std": 12.588685154914856 }, { "epoch": 0.7724272793937913, "grad_norm": 4.761236190795898, "learning_rate": 3.913639479095271e-06, "loss": 0.3581, "step": 395, "train/lambda_m_mean": 0.043212891556322575, "train/mu_mean": 0.791584849357605, "train/mu_std": 0.20991995558142662, "train/rewards_chosen_mean": 0.9354305267333984, "train/rewards_rejected_mean": -1.0757150650024414, "train/tau_mean": 30.013816595077515, "train/tau_std": 12.82782232761383 }, { "epoch": 0.7743827914935224, "grad_norm": 7.445184707641602, "learning_rate": 3.910212474297465e-06, "loss": 0.3808, "step": 396, "train/lambda_m_mean": 0.04121093824505806, "train/mu_mean": 0.7844840064644814, "train/mu_std": 0.21098599582910538, "train/rewards_chosen_mean": 0.7791604995727539, "train/rewards_rejected_mean": -1.2655277252197266, "train/tau_mean": 30.593286752700806, "train/tau_std": 11.620597004890442 }, { "epoch": 0.7763383035932535, "grad_norm": 4.772597789764404, "learning_rate": 3.906785469499657e-06, "loss": 0.3685, "step": 397, "train/lambda_m_mean": 0.04135742271319032, "train/mu_mean": 0.7797719985246658, "train/mu_std": 0.2174062505364418, "train/rewards_chosen_mean": 0.9188610911369324, "train/rewards_rejected_mean": -1.1085891723632812, "train/tau_mean": 31.020086526870728, "train/tau_std": 12.57812213897705 }, { "epoch": 0.7782938156929846, "grad_norm": 5.05748987197876, "learning_rate": 3.9033584647018505e-06, "loss": 0.3432, "step": 398, "train/lambda_m_mean": 0.04687500046566129, "train/mu_mean": 0.800984688103199, "train/mu_std": 0.19496066495776176, "train/rewards_chosen_mean": 0.7621936798095703, "train/rewards_rejected_mean": -1.3830914497375488, "train/tau_mean": 28.77737593650818, "train/tau_std": 12.46458876132965 }, { "epoch": 0.7802493277927157, "grad_norm": 7.8397016525268555, "learning_rate": 3.899931459904045e-06, "loss": 0.3914, "step": 399, "train/lambda_m_mean": 0.044531250605359674, "train/mu_mean": 0.7765432596206665, "train/mu_std": 0.2150105983018875, "train/rewards_chosen_mean": 0.5749149322509766, "train/rewards_rejected_mean": -1.341881275177002, "train/tau_mean": 30.85931706428528, "train/tau_std": 13.084937334060669 }, { "epoch": 0.7822048398924468, "grad_norm": 6.0910468101501465, "learning_rate": 3.896504455106238e-06, "loss": 0.3765, "step": 400, "train/lambda_m_mean": 0.04438476590439677, "train/mu_mean": 0.7803557589650154, "train/mu_std": 0.21944256499409676, "train/rewards_chosen_mean": 0.012529373168945312, "train/rewards_rejected_mean": -1.9995031356811523, "train/tau_mean": 30.51682186126709, "train/tau_std": 12.487597584724426 }, { "epoch": 0.7841603519921779, "grad_norm": 14.726369857788086, "learning_rate": 3.89307745030843e-06, "loss": 0.3652, "step": 401, "train/lambda_m_mean": 0.04536132933571935, "train/mu_mean": 0.789098471403122, "train/mu_std": 0.20769991725683212, "train/rewards_chosen_mean": 0.26518726348876953, "train/rewards_rejected_mean": -1.7481822967529297, "train/tau_mean": 31.75766396522522, "train/tau_std": 12.006651163101196 }, { "epoch": 0.786115864091909, "grad_norm": 5.68077278137207, "learning_rate": 3.8896504455106236e-06, "loss": 0.3622, "step": 402, "train/lambda_m_mean": 0.04091796884313226, "train/mu_mean": 0.8020050451159477, "train/mu_std": 0.22406684793531895, "train/rewards_chosen_mean": 0.20064711570739746, "train/rewards_rejected_mean": -2.01438570022583, "train/tau_mean": 33.12691402435303, "train/tau_std": 13.106442332267761 }, { "epoch": 0.7880713761916401, "grad_norm": 8.225995063781738, "learning_rate": 3.886223440712818e-06, "loss": 0.4209, "step": 403, "train/lambda_m_mean": 0.04467773577198386, "train/mu_mean": 0.7601565569639206, "train/mu_std": 0.2295110672712326, "train/rewards_chosen_mean": 0.0009522363543510437, "train/rewards_rejected_mean": -1.8639183044433594, "train/tau_mean": 31.869686365127563, "train/tau_std": 13.89377212524414 }, { "epoch": 0.7900268882913714, "grad_norm": 8.484590530395508, "learning_rate": 3.882796435915011e-06, "loss": 0.3612, "step": 404, "train/lambda_m_mean": 0.04370117234066129, "train/mu_mean": 0.7887224406003952, "train/mu_std": 0.2035960592329502, "train/rewards_chosen_mean": -0.1881847381591797, "train/rewards_rejected_mean": -2.0937492847442627, "train/tau_mean": 32.30269646644592, "train/tau_std": 12.831909239292145 }, { "epoch": 0.7919824003911025, "grad_norm": 5.308572769165039, "learning_rate": 3.879369431117203e-06, "loss": 0.3989, "step": 405, "train/lambda_m_mean": 0.05073242308571935, "train/mu_mean": 0.768599733710289, "train/mu_std": 0.20869660004973412, "train/rewards_chosen_mean": -0.16675496101379395, "train/rewards_rejected_mean": -1.887969970703125, "train/tau_mean": 30.096994638442993, "train/tau_std": 13.278002500534058 }, { "epoch": 0.7939379124908336, "grad_norm": 7.162442207336426, "learning_rate": 3.8759424263193975e-06, "loss": 0.3991, "step": 406, "train/lambda_m_mean": 0.038427735678851604, "train/mu_mean": 0.7789144292473793, "train/mu_std": 0.23082554154098034, "train/rewards_chosen_mean": -0.32143592834472656, "train/rewards_rejected_mean": -2.186635971069336, "train/tau_mean": 33.734142780303955, "train/tau_std": 13.05987012386322 }, { "epoch": 0.7958934245905647, "grad_norm": 5.459771633148193, "learning_rate": 3.872515421521591e-06, "loss": 0.3913, "step": 407, "train/lambda_m_mean": 0.04736328311264515, "train/mu_mean": 0.7759930044412613, "train/mu_std": 0.22245442867279053, "train/rewards_chosen_mean": 0.19547510147094727, "train/rewards_rejected_mean": -1.600325584411621, "train/tau_mean": 31.035184383392334, "train/tau_std": 12.424644231796265 }, { "epoch": 0.7978489366902958, "grad_norm": 7.065202713012695, "learning_rate": 3.869088416723783e-06, "loss": 0.3446, "step": 408, "train/lambda_m_mean": 0.042187499813735485, "train/mu_mean": 0.7863094881176949, "train/mu_std": 0.19193805195391178, "train/rewards_chosen_mean": 0.12986183166503906, "train/rewards_rejected_mean": -1.664252758026123, "train/tau_mean": 33.74985885620117, "train/tau_std": 12.294116139411926 }, { "epoch": 0.7998044487900269, "grad_norm": 3.4043986797332764, "learning_rate": 3.865661411925977e-06, "loss": 0.3228, "step": 409, "train/lambda_m_mean": 0.04482422024011612, "train/mu_mean": 0.8168949112296104, "train/mu_std": 0.1962652187794447, "train/rewards_chosen_mean": 0.24892139434814453, "train/rewards_rejected_mean": -1.9198799133300781, "train/tau_mean": 33.111419677734375, "train/tau_std": 13.595550537109375 }, { "epoch": 0.801759960889758, "grad_norm": 5.310075759887695, "learning_rate": 3.8622344071281705e-06, "loss": 0.3655, "step": 410, "train/lambda_m_mean": 0.04072265652939677, "train/mu_mean": 0.79520433396101, "train/mu_std": 0.203642463311553, "train/rewards_chosen_mean": 0.0040950775146484375, "train/rewards_rejected_mean": -1.971531629562378, "train/tau_mean": 37.92457389831543, "train/tau_std": 14.89251458644867 }, { "epoch": 0.8037154729894891, "grad_norm": 6.75141716003418, "learning_rate": 3.858807402330364e-06, "loss": 0.3399, "step": 411, "train/lambda_m_mean": 0.038525390438735485, "train/mu_mean": 0.8121644854545593, "train/mu_std": 0.20618969202041626, "train/rewards_chosen_mean": 0.4281940460205078, "train/rewards_rejected_mean": -1.9028491973876953, "train/tau_mean": 39.19871997833252, "train/tau_std": 15.250921368598938 }, { "epoch": 0.8056709850892202, "grad_norm": 13.136767387390137, "learning_rate": 3.855380397532557e-06, "loss": 0.3493, "step": 412, "train/lambda_m_mean": 0.04394531436264515, "train/mu_mean": 0.8150704279541969, "train/mu_std": 0.21210715733468533, "train/rewards_chosen_mean": 0.37890005111694336, "train/rewards_rejected_mean": -1.8597116470336914, "train/tau_mean": 37.169126987457275, "train/tau_std": 14.874300718307495 }, { "epoch": 0.8076264971889514, "grad_norm": 6.913757801055908, "learning_rate": 3.85195339273475e-06, "loss": 0.3835, "step": 413, "train/lambda_m_mean": 0.04140625102445483, "train/mu_mean": 0.7784413173794746, "train/mu_std": 0.21107205376029015, "train/rewards_chosen_mean": 0.1484241485595703, "train/rewards_rejected_mean": -1.980574607849121, "train/tau_mean": 40.46813106536865, "train/tau_std": 16.43422508239746 }, { "epoch": 0.8095820092886825, "grad_norm": 6.941732883453369, "learning_rate": 3.8485263879369436e-06, "loss": 0.3844, "step": 414, "train/lambda_m_mean": 0.039404297014698386, "train/mu_mean": 0.7796073779463768, "train/mu_std": 0.21793726831674576, "train/rewards_chosen_mean": 0.032294511795043945, "train/rewards_rejected_mean": -2.0117645263671875, "train/tau_mean": 42.1831955909729, "train/tau_std": 16.453607320785522 }, { "epoch": 0.8115375213884136, "grad_norm": 5.061097621917725, "learning_rate": 3.845099383139137e-06, "loss": 0.2902, "step": 415, "train/lambda_m_mean": 0.04287109477445483, "train/mu_mean": 0.8345183059573174, "train/mu_std": 0.17929074726998806, "train/rewards_chosen_mean": -0.19310379028320312, "train/rewards_rejected_mean": -2.5810699462890625, "train/tau_mean": 38.1891450881958, "train/tau_std": 14.591024994850159 }, { "epoch": 0.8134930334881447, "grad_norm": 7.34982967376709, "learning_rate": 3.84167237834133e-06, "loss": 0.3894, "step": 416, "train/lambda_m_mean": 0.04433593852445483, "train/mu_mean": 0.7848017141222954, "train/mu_std": 0.22892152704298496, "train/rewards_chosen_mean": -0.3428192138671875, "train/rewards_rejected_mean": -2.425952911376953, "train/tau_mean": 38.542945861816406, "train/tau_std": 15.36043393611908 }, { "epoch": 0.8154485455878758, "grad_norm": 6.823072910308838, "learning_rate": 3.838245373543523e-06, "loss": 0.3915, "step": 417, "train/lambda_m_mean": 0.04428711021319032, "train/mu_mean": 0.7996720522642136, "train/mu_std": 0.23728038929402828, "train/rewards_chosen_mean": -0.6492366790771484, "train/rewards_rejected_mean": -2.8893814086914062, "train/tau_mean": 39.70675039291382, "train/tau_std": 16.51280641555786 }, { "epoch": 0.8174040576876069, "grad_norm": 9.130431175231934, "learning_rate": 3.834818368745717e-06, "loss": 0.3831, "step": 418, "train/lambda_m_mean": 0.04663086053915322, "train/mu_mean": 0.790415458381176, "train/mu_std": 0.22659973800182343, "train/rewards_chosen_mean": -0.9352188110351562, "train/rewards_rejected_mean": -2.9623985290527344, "train/tau_mean": 37.13279056549072, "train/tau_std": 15.42805027961731 }, { "epoch": 0.819359569787338, "grad_norm": 5.803879737854004, "learning_rate": 3.83139136394791e-06, "loss": 0.3698, "step": 419, "train/lambda_m_mean": 0.0377929697278887, "train/mu_mean": 0.7844818904995918, "train/mu_std": 0.217170724645257, "train/rewards_chosen_mean": -1.1262550354003906, "train/rewards_rejected_mean": -3.037872314453125, "train/tau_mean": 39.09750318527222, "train/tau_std": 15.23183012008667 }, { "epoch": 0.8213150818870691, "grad_norm": 4.1565375328063965, "learning_rate": 3.827964359150103e-06, "loss": 0.3344, "step": 420, "train/lambda_m_mean": 0.04345703264698386, "train/mu_mean": 0.7975466474890709, "train/mu_std": 0.19118394516408443, "train/rewards_chosen_mean": -0.8772850036621094, "train/rewards_rejected_mean": -2.8002853393554688, "train/tau_mean": 35.96763062477112, "train/tau_std": 15.6293625831604 }, { "epoch": 0.8232705939868002, "grad_norm": 5.857281684875488, "learning_rate": 3.824537354352296e-06, "loss": 0.3163, "step": 421, "train/lambda_m_mean": 0.042089845053851604, "train/mu_mean": 0.8005281090736389, "train/mu_std": 0.1736741280183196, "train/rewards_chosen_mean": -0.8676271438598633, "train/rewards_rejected_mean": -2.7408151626586914, "train/tau_mean": 35.001832246780396, "train/tau_std": 15.730444312095642 }, { "epoch": 0.8252261060865315, "grad_norm": 4.413699626922607, "learning_rate": 3.82111034955449e-06, "loss": 0.3957, "step": 422, "train/lambda_m_mean": 0.043017578311264515, "train/mu_mean": 0.7577839717268944, "train/mu_std": 0.2143744956701994, "train/rewards_chosen_mean": -1.1904869079589844, "train/rewards_rejected_mean": -2.8843460083007812, "train/tau_mean": 34.95238661766052, "train/tau_std": 15.361993670463562 }, { "epoch": 0.8271816181862626, "grad_norm": 5.889071464538574, "learning_rate": 3.817683344756683e-06, "loss": 0.3453, "step": 423, "train/lambda_m_mean": 0.04877929715439677, "train/mu_mean": 0.8035086318850517, "train/mu_std": 0.19899465329945087, "train/rewards_chosen_mean": -0.5297145843505859, "train/rewards_rejected_mean": -2.5839500427246094, "train/tau_mean": 35.299599409103394, "train/tau_std": 16.78606379032135 }, { "epoch": 0.8291371302859937, "grad_norm": 7.44284725189209, "learning_rate": 3.814256339958876e-06, "loss": 0.3437, "step": 424, "train/lambda_m_mean": 0.04663086123764515, "train/mu_mean": 0.8198232278227806, "train/mu_std": 0.20521944761276245, "train/rewards_chosen_mean": -0.5081901550292969, "train/rewards_rejected_mean": -2.874034881591797, "train/tau_mean": 36.38177847862244, "train/tau_std": 17.248546481132507 }, { "epoch": 0.8310926423857248, "grad_norm": 6.410627365112305, "learning_rate": 3.81082933516107e-06, "loss": 0.3441, "step": 425, "train/lambda_m_mean": 0.04614257952198386, "train/mu_mean": 0.8289163336157799, "train/mu_std": 0.20043959096074104, "train/rewards_chosen_mean": -0.43172645568847656, "train/rewards_rejected_mean": -2.8835220336914062, "train/tau_mean": 36.54154324531555, "train/tau_std": 17.60421335697174 }, { "epoch": 0.8330481544854559, "grad_norm": 5.252377033233643, "learning_rate": 3.8074023303632627e-06, "loss": 0.2792, "step": 426, "train/lambda_m_mean": 0.039453126257285476, "train/mu_mean": 0.8485879376530647, "train/mu_std": 0.1702867206186056, "train/rewards_chosen_mean": 0.3322172164916992, "train/rewards_rejected_mean": -2.1333866119384766, "train/tau_mean": 36.26581597328186, "train/tau_std": 15.368647813796997 }, { "epoch": 0.835003666585187, "grad_norm": 4.865020275115967, "learning_rate": 3.803975325565456e-06, "loss": 0.3552, "step": 427, "train/lambda_m_mean": 0.03603515774011612, "train/mu_mean": 0.7894307300448418, "train/mu_std": 0.21978303231298923, "train/rewards_chosen_mean": 0.012866497039794922, "train/rewards_rejected_mean": -2.072535812854767, "train/tau_mean": 37.41776371002197, "train/tau_std": 15.445401787757874 }, { "epoch": 0.8369591786849181, "grad_norm": 5.648532390594482, "learning_rate": 3.8005483207676493e-06, "loss": 0.3333, "step": 428, "train/lambda_m_mean": 0.04150390764698386, "train/mu_mean": 0.7934764251112938, "train/mu_std": 0.19620280899107456, "train/rewards_chosen_mean": -0.19657325744628906, "train/rewards_rejected_mean": -2.1492555141448975, "train/tau_mean": 33.34157848358154, "train/tau_std": 15.162728786468506 }, { "epoch": 0.8389146907846492, "grad_norm": 6.10992431640625, "learning_rate": 3.797121315969843e-06, "loss": 0.41, "step": 429, "train/lambda_m_mean": 0.044091798132285476, "train/mu_mean": 0.7515411749482155, "train/mu_std": 0.21824349649250507, "train/rewards_chosen_mean": 0.09674835205078125, "train/rewards_rejected_mean": -1.5005035400390625, "train/tau_mean": 33.046077728271484, "train/tau_std": 15.580010533332825 }, { "epoch": 0.8408702028843803, "grad_norm": 4.797167778015137, "learning_rate": 3.7936943111720358e-06, "loss": 0.3852, "step": 430, "train/lambda_m_mean": 0.04023437644354999, "train/mu_mean": 0.7747792974114418, "train/mu_std": 0.22015376761555672, "train/rewards_chosen_mean": 0.29875850677490234, "train/rewards_rejected_mean": -1.5150508880615234, "train/tau_mean": 34.4517822265625, "train/tau_std": 15.0193452835083 }, { "epoch": 0.8428257149841115, "grad_norm": 5.508096218109131, "learning_rate": 3.790267306374229e-06, "loss": 0.3666, "step": 431, "train/lambda_m_mean": 0.047607422806322575, "train/mu_mean": 0.7745489701628685, "train/mu_std": 0.19506299681961536, "train/rewards_chosen_mean": 0.16300392150878906, "train/rewards_rejected_mean": -1.5062103271484375, "train/tau_mean": 30.432461261749268, "train/tau_std": 15.0137779712677 }, { "epoch": 0.8447812270838426, "grad_norm": 4.085626125335693, "learning_rate": 3.7868403015764227e-06, "loss": 0.3599, "step": 432, "train/lambda_m_mean": 0.04633789183571935, "train/mu_mean": 0.7819857224822044, "train/mu_std": 0.20288470946252346, "train/rewards_chosen_mean": 0.18773138523101807, "train/rewards_rejected_mean": -1.6245660781860352, "train/tau_mean": 31.584663152694702, "train/tau_std": 15.584208369255066 }, { "epoch": 0.8467367391835737, "grad_norm": 3.9480981826782227, "learning_rate": 3.783413296778616e-06, "loss": 0.3073, "step": 433, "train/lambda_m_mean": 0.04174804827198386, "train/mu_mean": 0.8178218826651573, "train/mu_std": 0.19347169436514378, "train/rewards_chosen_mean": 0.05948531627655029, "train/rewards_rejected_mean": -2.2074737548828125, "train/tau_mean": 35.30659770965576, "train/tau_std": 15.874517917633057 }, { "epoch": 0.8486922512833048, "grad_norm": 5.568853855133057, "learning_rate": 3.779986291980809e-06, "loss": 0.3858, "step": 434, "train/lambda_m_mean": 0.04321289202198386, "train/mu_mean": 0.7813986390829086, "train/mu_std": 0.2199656367301941, "train/rewards_chosen_mean": 0.014223575592041016, "train/rewards_rejected_mean": -2.112916946411133, "train/tau_mean": 38.28239870071411, "train/tau_std": 17.046275973320007 }, { "epoch": 0.8506477633830359, "grad_norm": 8.038228034973145, "learning_rate": 3.7765592871830025e-06, "loss": 0.3516, "step": 435, "train/lambda_m_mean": 0.04643554845824838, "train/mu_mean": 0.8357514441013336, "train/mu_std": 0.19574689120054245, "train/rewards_chosen_mean": -0.2673063278198242, "train/rewards_rejected_mean": -3.0577077865600586, "train/tau_mean": 39.44698619842529, "train/tau_std": 19.70859980583191 }, { "epoch": 0.852603275482767, "grad_norm": 15.15886116027832, "learning_rate": 3.7731322823851958e-06, "loss": 0.3915, "step": 436, "train/lambda_m_mean": 0.05058593908324838, "train/mu_mean": 0.8167149573564529, "train/mu_std": 0.20780878886580467, "train/rewards_chosen_mean": -0.28360340744256973, "train/rewards_rejected_mean": -2.700547754764557, "train/tau_mean": 38.66068124771118, "train/tau_std": 19.338010549545288 }, { "epoch": 0.8545587875824981, "grad_norm": 8.72911262512207, "learning_rate": 3.7697052775873886e-06, "loss": 0.3438, "step": 437, "train/lambda_m_mean": 0.04824218945577741, "train/mu_mean": 0.8223785161972046, "train/mu_std": 0.2121107466518879, "train/rewards_chosen_mean": -0.16294479370117188, "train/rewards_rejected_mean": -2.5047261714935303, "train/tau_mean": 36.89808940887451, "train/tau_std": 18.82061219215393 }, { "epoch": 0.8565142996822293, "grad_norm": 6.1376752853393555, "learning_rate": 3.7662782727895823e-06, "loss": 0.3775, "step": 438, "train/lambda_m_mean": 0.04340820433571935, "train/mu_mean": 0.782023549079895, "train/mu_std": 0.2082646507769823, "train/rewards_chosen_mean": -0.32704639434814453, "train/rewards_rejected_mean": -2.1754226684570312, "train/tau_mean": 37.913565158843994, "train/tau_std": 17.84208035469055 }, { "epoch": 0.8584698117819604, "grad_norm": 6.53562593460083, "learning_rate": 3.7628512679917756e-06, "loss": 0.34, "step": 439, "train/lambda_m_mean": 0.044335939921438694, "train/mu_mean": 0.8039670214056969, "train/mu_std": 0.20234501268714666, "train/rewards_chosen_mean": -0.1671750545501709, "train/rewards_rejected_mean": -2.208648681640625, "train/tau_mean": 37.76345181465149, "train/tau_std": 16.975391626358032 }, { "epoch": 0.8604253238816916, "grad_norm": 4.627742767333984, "learning_rate": 3.759424263193969e-06, "loss": 0.3752, "step": 440, "train/lambda_m_mean": 0.04340820456854999, "train/mu_mean": 0.7683567851781845, "train/mu_std": 0.1981712207198143, "train/rewards_chosen_mean": -0.18227946758270264, "train/rewards_rejected_mean": -1.8279390335083008, "train/tau_mean": 35.288336753845215, "train/tau_std": 15.820780396461487 }, { "epoch": 0.8623808359814227, "grad_norm": 3.888085126876831, "learning_rate": 3.7559972583961617e-06, "loss": 0.3685, "step": 441, "train/lambda_m_mean": 0.042626953683793545, "train/mu_mean": 0.7719527408480644, "train/mu_std": 0.192414166405797, "train/rewards_chosen_mean": -0.3638172149658203, "train/rewards_rejected_mean": -2.0545501708984375, "train/tau_mean": 37.24873995780945, "train/tau_std": 17.21127712726593 }, { "epoch": 0.8643363480811538, "grad_norm": 3.6816108226776123, "learning_rate": 3.7525702535983554e-06, "loss": 0.3056, "step": 442, "train/lambda_m_mean": 0.04472656222060323, "train/mu_mean": 0.8103335350751877, "train/mu_std": 0.17088115215301514, "train/rewards_chosen_mean": -0.46765613555908203, "train/rewards_rejected_mean": -2.365306854248047, "train/tau_mean": 37.0160596370697, "train/tau_std": 17.41852831840515 }, { "epoch": 0.8662918601808849, "grad_norm": 4.014649868011475, "learning_rate": 3.7491432488005486e-06, "loss": 0.3497, "step": 443, "train/lambda_m_mean": 0.044238281436264515, "train/mu_mean": 0.7889593839645386, "train/mu_std": 0.20004352740943432, "train/rewards_chosen_mean": -0.368685245513916, "train/rewards_rejected_mean": -2.3172683715820312, "train/tau_mean": 37.766839027404785, "train/tau_std": 18.155914545059204 }, { "epoch": 0.868247372280616, "grad_norm": 4.108330726623535, "learning_rate": 3.745716244002742e-06, "loss": 0.3187, "step": 444, "train/lambda_m_mean": 0.04453125083819032, "train/mu_mean": 0.8206296563148499, "train/mu_std": 0.19773303903639317, "train/rewards_chosen_mean": -0.012017250061035156, "train/rewards_rejected_mean": -2.334181785583496, "train/tau_mean": 39.02755784988403, "train/tau_std": 19.469990253448486 }, { "epoch": 0.8702028843803471, "grad_norm": 5.7628655433654785, "learning_rate": 3.7422892392049356e-06, "loss": 0.3353, "step": 445, "train/lambda_m_mean": 0.04633789090439677, "train/mu_mean": 0.8252735137939453, "train/mu_std": 0.19373109843581915, "train/rewards_chosen_mean": -0.13121604919433594, "train/rewards_rejected_mean": -2.621044635772705, "train/tau_mean": 41.648776054382324, "train/tau_std": 20.201514959335327 }, { "epoch": 0.8721583964800782, "grad_norm": 25.50374984741211, "learning_rate": 3.7388622344071284e-06, "loss": 0.4607, "step": 446, "train/lambda_m_mean": 0.040283204056322575, "train/mu_mean": 0.8186333030462265, "train/mu_std": 0.24101285636425018, "train/rewards_chosen_mean": 0.1491943895816803, "train/rewards_rejected_mean": -2.730009078979492, "train/tau_mean": 45.75091505050659, "train/tau_std": 19.863487005233765 }, { "epoch": 0.8741139085798093, "grad_norm": 13.57422161102295, "learning_rate": 3.7354352296093217e-06, "loss": 0.4266, "step": 447, "train/lambda_m_mean": 0.038085938431322575, "train/mu_mean": 0.8149630203843117, "train/mu_std": 0.23035919107496738, "train/rewards_chosen_mean": 0.3025684356689453, "train/rewards_rejected_mean": -2.4010733366012573, "train/tau_mean": 44.74773693084717, "train/tau_std": 18.38207447528839 }, { "epoch": 0.8760694206795404, "grad_norm": 9.251944541931152, "learning_rate": 3.7320082248115154e-06, "loss": 0.3582, "step": 448, "train/lambda_m_mean": 0.039697266183793545, "train/mu_mean": 0.8120851740241051, "train/mu_std": 0.22084325924515724, "train/rewards_chosen_mean": 0.15410804748535156, "train/rewards_rejected_mean": -2.3048088252544403, "train/tau_mean": 42.81124925613403, "train/tau_std": 18.20044207572937 }, { "epoch": 0.8780249327792715, "grad_norm": 11.584592819213867, "learning_rate": 3.728581220013708e-06, "loss": 0.3815, "step": 449, "train/lambda_m_mean": 0.045312502421438694, "train/mu_mean": 0.7847301885485649, "train/mu_std": 0.2238634191453457, "train/rewards_chosen_mean": 0.22947311401367188, "train/rewards_rejected_mean": -1.8173885345458984, "train/tau_mean": 37.91602802276611, "train/tau_std": 18.664105772972107 }, { "epoch": 0.8799804448790027, "grad_norm": 5.184962749481201, "learning_rate": 3.7251542152159015e-06, "loss": 0.3496, "step": 450, "train/lambda_m_mean": 0.048828125931322575, "train/mu_mean": 0.7913220003247261, "train/mu_std": 0.1914385911077261, "train/rewards_chosen_mean": 0.5733718872070312, "train/rewards_rejected_mean": -1.3215827941894531, "train/tau_mean": 35.105135679244995, "train/tau_std": 18.316518902778625 }, { "epoch": 0.8819359569787338, "grad_norm": 8.172103881835938, "learning_rate": 3.721727210418095e-06, "loss": 0.3922, "step": 451, "train/lambda_m_mean": 0.045556641183793545, "train/mu_mean": 0.7685257866978645, "train/mu_std": 0.21414003148674965, "train/rewards_chosen_mean": 0.9137248992919922, "train/rewards_rejected_mean": -0.8672847151756287, "train/tau_mean": 34.66932153701782, "train/tau_std": 16.501229286193848 }, { "epoch": 0.8838914690784649, "grad_norm": 5.037144184112549, "learning_rate": 3.7183002056202884e-06, "loss": 0.3785, "step": 452, "train/lambda_m_mean": 0.04184570303186774, "train/mu_mean": 0.7650704905390739, "train/mu_std": 0.20871605165302753, "train/rewards_chosen_mean": 0.43547534942626953, "train/rewards_rejected_mean": -1.2735633850097656, "train/tau_mean": 34.72698974609375, "train/tau_std": 15.43503201007843 }, { "epoch": 0.885846981178196, "grad_norm": 3.7037694454193115, "learning_rate": 3.7148732008224813e-06, "loss": 0.3595, "step": 453, "train/lambda_m_mean": 0.04560546902939677, "train/mu_mean": 0.7768047973513603, "train/mu_std": 0.19374884106218815, "train/rewards_chosen_mean": 0.41369175910949707, "train/rewards_rejected_mean": -1.3325212001800537, "train/tau_mean": 34.46933126449585, "train/tau_std": 16.599753975868225 }, { "epoch": 0.8878024932779272, "grad_norm": 4.253098487854004, "learning_rate": 3.7114461960246745e-06, "loss": 0.3516, "step": 454, "train/lambda_m_mean": 0.04233398474752903, "train/mu_mean": 0.7805278673768044, "train/mu_std": 0.1927888374775648, "train/rewards_chosen_mean": 0.5050578117370605, "train/rewards_rejected_mean": -1.2539877891540527, "train/tau_mean": 35.852084159851074, "train/tau_std": 16.366141080856323 }, { "epoch": 0.8897580053776583, "grad_norm": 4.354964256286621, "learning_rate": 3.708019191226868e-06, "loss": 0.3188, "step": 455, "train/lambda_m_mean": 0.04472656361758709, "train/mu_mean": 0.8221408724784851, "train/mu_std": 0.1961651723831892, "train/rewards_chosen_mean": 0.311248779296875, "train/rewards_rejected_mean": -1.8042497634887695, "train/tau_mean": 36.342650413513184, "train/tau_std": 17.811518669128418 }, { "epoch": 0.8917135174773894, "grad_norm": 5.169813632965088, "learning_rate": 3.7045921864290615e-06, "loss": 0.3407, "step": 456, "train/lambda_m_mean": 0.043994140811264515, "train/mu_mean": 0.813373863697052, "train/mu_std": 0.21138911321759224, "train/rewards_chosen_mean": 0.3176918029785156, "train/rewards_rejected_mean": -1.9607796669006348, "train/tau_mean": 36.00144100189209, "train/tau_std": 16.412105560302734 }, { "epoch": 0.8936690295771205, "grad_norm": 4.263603687286377, "learning_rate": 3.7011651816312543e-06, "loss": 0.317, "step": 457, "train/lambda_m_mean": 0.04619140736758709, "train/mu_mean": 0.8264894038438797, "train/mu_std": 0.19872461818158627, "train/rewards_chosen_mean": 0.15939712524414062, "train/rewards_rejected_mean": -2.226688861846924, "train/tau_mean": 36.9932746887207, "train/tau_std": 18.95650601387024 }, { "epoch": 0.8956245416768516, "grad_norm": 7.652323246002197, "learning_rate": 3.697738176833448e-06, "loss": 0.3979, "step": 458, "train/lambda_m_mean": 0.045068359933793545, "train/mu_mean": 0.8083246275782585, "train/mu_std": 0.23422775976359844, "train/rewards_chosen_mean": 0.21402835845947266, "train/rewards_rejected_mean": -2.155986785888672, "train/tau_mean": 38.61276960372925, "train/tau_std": 17.5495468378067 }, { "epoch": 0.8975800537765828, "grad_norm": 7.745325088500977, "learning_rate": 3.6943111720356413e-06, "loss": 0.377, "step": 459, "train/lambda_m_mean": 0.04956054920330644, "train/mu_mean": 0.8112797141075134, "train/mu_std": 0.21607884392142296, "train/rewards_chosen_mean": 0.24382400512695312, "train/rewards_rejected_mean": -2.1042659282684326, "train/tau_mean": 36.494972229003906, "train/tau_std": 17.55349111557007 }, { "epoch": 0.8995355658763139, "grad_norm": 9.80916976928711, "learning_rate": 3.690884167237834e-06, "loss": 0.3648, "step": 460, "train/lambda_m_mean": 0.04306640708819032, "train/mu_mean": 0.7983039021492004, "train/mu_std": 0.22016626968979836, "train/rewards_chosen_mean": 0.36676692962646484, "train/rewards_rejected_mean": -1.7855571508407593, "train/tau_mean": 39.5732102394104, "train/tau_std": 17.763830423355103 }, { "epoch": 0.901491077976045, "grad_norm": 5.84537935256958, "learning_rate": 3.6874571624400278e-06, "loss": 0.3608, "step": 461, "train/lambda_m_mean": 0.044677735306322575, "train/mu_mean": 0.7798709571361542, "train/mu_std": 0.19973759725689888, "train/rewards_chosen_mean": 0.13501334190368652, "train/rewards_rejected_mean": -1.7013548612594604, "train/tau_mean": 38.130717277526855, "train/tau_std": 17.055372714996338 }, { "epoch": 0.9034465900757761, "grad_norm": 4.840860843658447, "learning_rate": 3.684030157642221e-06, "loss": 0.3361, "step": 462, "train/lambda_m_mean": 0.04301757924258709, "train/mu_mean": 0.8129298612475395, "train/mu_std": 0.19728111661970615, "train/rewards_chosen_mean": 0.3986250162124634, "train/rewards_rejected_mean": -1.7708821296691895, "train/tau_mean": 38.35251045227051, "train/tau_std": 17.426863074302673 }, { "epoch": 0.9054021021755072, "grad_norm": 4.550471782684326, "learning_rate": 3.6806031528444143e-06, "loss": 0.3455, "step": 463, "train/lambda_m_mean": 0.04912109486758709, "train/mu_mean": 0.8071611672639847, "train/mu_std": 0.20544232986867428, "train/rewards_chosen_mean": 0.37793493270874023, "train/rewards_rejected_mean": -1.7069000601768494, "train/tau_mean": 34.69684171676636, "train/tau_std": 17.68169605731964 }, { "epoch": 0.9073576142752383, "grad_norm": 5.936154365539551, "learning_rate": 3.677176148046608e-06, "loss": 0.338, "step": 464, "train/lambda_m_mean": 0.04467773484066129, "train/mu_mean": 0.7937967255711555, "train/mu_std": 0.18612239137291908, "train/rewards_chosen_mean": 0.4670224189758301, "train/rewards_rejected_mean": -1.4398273825645447, "train/tau_mean": 38.180203437805176, "train/tau_std": 17.39568066596985 }, { "epoch": 0.9093131263749694, "grad_norm": 5.303564071655273, "learning_rate": 3.673749143248801e-06, "loss": 0.3541, "step": 465, "train/lambda_m_mean": 0.03950195387005806, "train/mu_mean": 0.7849394306540489, "train/mu_std": 0.20911014638841152, "train/rewards_chosen_mean": 0.7022589445114136, "train/rewards_rejected_mean": -1.2846603393554688, "train/tau_mean": 43.08836793899536, "train/tau_std": 19.59549856185913 }, { "epoch": 0.9112686384747005, "grad_norm": 4.091871738433838, "learning_rate": 3.670322138450994e-06, "loss": 0.343, "step": 466, "train/lambda_m_mean": 0.04614257859066129, "train/mu_mean": 0.801065668463707, "train/mu_std": 0.19730531051754951, "train/rewards_chosen_mean": 0.5504845380783081, "train/rewards_rejected_mean": -1.5363141596317291, "train/tau_mean": 39.22069692611694, "train/tau_std": 18.897924780845642 }, { "epoch": 0.9132241505744316, "grad_norm": 5.1252617835998535, "learning_rate": 3.6668951336531874e-06, "loss": 0.3633, "step": 467, "train/lambda_m_mean": 0.04472656361758709, "train/mu_mean": 0.7947822511196136, "train/mu_std": 0.2062288038432598, "train/rewards_chosen_mean": 0.8171253204345703, "train/rewards_rejected_mean": -1.292520523071289, "train/tau_mean": 40.58057165145874, "train/tau_std": 18.446468114852905 }, { "epoch": 0.9151796626741628, "grad_norm": 4.841407299041748, "learning_rate": 3.663468128855381e-06, "loss": 0.3142, "step": 468, "train/lambda_m_mean": 0.049511720426380634, "train/mu_mean": 0.8195517063140869, "train/mu_std": 0.18021339923143387, "train/rewards_chosen_mean": 0.47997140884399414, "train/rewards_rejected_mean": -1.6657190322875977, "train/tau_mean": 38.184980154037476, "train/tau_std": 19.249030351638794 }, { "epoch": 0.917135174773894, "grad_norm": 8.398624420166016, "learning_rate": 3.660041124057574e-06, "loss": 0.3339, "step": 469, "train/lambda_m_mean": 0.04741211188957095, "train/mu_mean": 0.8308653831481934, "train/mu_std": 0.19248445006087422, "train/rewards_chosen_mean": 1.0766630172729492, "train/rewards_rejected_mean": -1.3436355590820312, "train/tau_mean": 38.267066955566406, "train/tau_std": 19.677135705947876 }, { "epoch": 0.919090686873625, "grad_norm": 5.778276443481445, "learning_rate": 3.656614119259767e-06, "loss": 0.3702, "step": 470, "train/lambda_m_mean": 0.0413085944019258, "train/mu_mean": 0.806391678750515, "train/mu_std": 0.22944622300565243, "train/rewards_chosen_mean": 1.006521224975586, "train/rewards_rejected_mean": -1.3236961364746094, "train/tau_mean": 43.15369701385498, "train/tau_std": 19.823938369750977 }, { "epoch": 0.9210461989733562, "grad_norm": 5.643806457519531, "learning_rate": 3.653187114461961e-06, "loss": 0.2924, "step": 471, "train/lambda_m_mean": 0.039697266183793545, "train/mu_mean": 0.8365089148283005, "train/mu_std": 0.19949527084827423, "train/rewards_chosen_mean": 1.2700138092041016, "train/rewards_rejected_mean": -1.2708425521850586, "train/tau_mean": 40.888529777526855, "train/tau_std": 20.313745498657227 }, { "epoch": 0.9230017110730873, "grad_norm": 3.750397205352783, "learning_rate": 3.6497601096641537e-06, "loss": 0.3364, "step": 472, "train/lambda_m_mean": 0.04414062644354999, "train/mu_mean": 0.8048004731535912, "train/mu_std": 0.20144745707511902, "train/rewards_chosen_mean": 0.9875507354736328, "train/rewards_rejected_mean": -1.1712369918823242, "train/tau_mean": 39.84860372543335, "train/tau_std": 18.731982350349426 }, { "epoch": 0.9249572231728184, "grad_norm": 5.3757548332214355, "learning_rate": 3.646333104866347e-06, "loss": 0.3439, "step": 473, "train/lambda_m_mean": 0.043505859561264515, "train/mu_mean": 0.7947958707809448, "train/mu_std": 0.19719650223851204, "train/rewards_chosen_mean": 0.4993748664855957, "train/rewards_rejected_mean": -1.5445491075515747, "train/tau_mean": 38.32470512390137, "train/tau_std": 18.45390748977661 }, { "epoch": 0.9269127352725495, "grad_norm": 3.991332530975342, "learning_rate": 3.6429061000685406e-06, "loss": 0.3168, "step": 474, "train/lambda_m_mean": 0.04189453274011612, "train/mu_mean": 0.8134840130805969, "train/mu_std": 0.19578777998685837, "train/rewards_chosen_mean": 0.5121684074401855, "train/rewards_rejected_mean": -1.6283364295959473, "train/tau_mean": 38.03148412704468, "train/tau_std": 18.10262680053711 }, { "epoch": 0.9288682473722806, "grad_norm": 6.063412189483643, "learning_rate": 3.639479095270734e-06, "loss": 0.4213, "step": 475, "train/lambda_m_mean": 0.047460938803851604, "train/mu_mean": 0.7493914216756821, "train/mu_std": 0.224253935739398, "train/rewards_chosen_mean": 0.386061429977417, "train/rewards_rejected_mean": -1.3179163932800293, "train/tau_mean": 38.09343910217285, "train/tau_std": 20.297006845474243 }, { "epoch": 0.9308237594720117, "grad_norm": 14.998209953308105, "learning_rate": 3.6360520904729267e-06, "loss": 0.3293, "step": 476, "train/lambda_m_mean": 0.04536132887005806, "train/mu_mean": 0.8168286606669426, "train/mu_std": 0.2003151085227728, "train/rewards_chosen_mean": 0.44748497009277344, "train/rewards_rejected_mean": -1.84324049949646, "train/tau_mean": 36.94851613044739, "train/tau_std": 19.30376410484314 }, { "epoch": 0.9327792715717429, "grad_norm": 6.53387975692749, "learning_rate": 3.6326250856751204e-06, "loss": 0.3307, "step": 477, "train/lambda_m_mean": 0.042236329056322575, "train/mu_mean": 0.8110843449831009, "train/mu_std": 0.20257257483899593, "train/rewards_chosen_mean": 0.3439760208129883, "train/rewards_rejected_mean": -1.92340087890625, "train/tau_mean": 41.05791616439819, "train/tau_std": 19.35396456718445 }, { "epoch": 0.934734783671474, "grad_norm": 3.2387099266052246, "learning_rate": 3.6291980808773137e-06, "loss": 0.2835, "step": 478, "train/lambda_m_mean": 0.04248047014698386, "train/mu_mean": 0.8438010364770889, "train/mu_std": 0.18306062277406454, "train/rewards_chosen_mean": 0.39785099029541016, "train/rewards_rejected_mean": -2.1662559509277344, "train/tau_mean": 41.44697046279907, "train/tau_std": 20.383713006973267 }, { "epoch": 0.9366902957712051, "grad_norm": 5.3913044929504395, "learning_rate": 3.625771076079507e-06, "loss": 0.3458, "step": 479, "train/lambda_m_mean": 0.043017580173909664, "train/mu_mean": 0.8028832599520683, "train/mu_std": 0.20389832742512226, "train/rewards_chosen_mean": 0.17252731323242188, "train/rewards_rejected_mean": -2.0263214111328125, "train/tau_mean": 41.8539605140686, "train/tau_std": 19.056846380233765 }, { "epoch": 0.9386458078709362, "grad_norm": 10.087727546691895, "learning_rate": 3.6223440712816998e-06, "loss": 0.4248, "step": 480, "train/lambda_m_mean": 0.04130859533324838, "train/mu_mean": 0.780384324491024, "train/mu_std": 0.22529771737754345, "train/rewards_chosen_mean": 0.26221609115600586, "train/rewards_rejected_mean": -1.8777408599853516, "train/tau_mean": 43.21238708496094, "train/tau_std": 22.10599184036255 }, { "epoch": 0.9406013199706673, "grad_norm": 12.654647827148438, "learning_rate": 3.6189170664838935e-06, "loss": 0.4328, "step": 481, "train/lambda_m_mean": 0.04453125176951289, "train/mu_mean": 0.7936356216669083, "train/mu_std": 0.23844346590340137, "train/rewards_chosen_mean": 0.0672452449798584, "train/rewards_rejected_mean": -2.0955543518066406, "train/tau_mean": 40.78932762145996, "train/tau_std": 22.237476587295532 }, { "epoch": 0.9425568320703984, "grad_norm": 4.494231700897217, "learning_rate": 3.6154900616860867e-06, "loss": 0.3282, "step": 482, "train/lambda_m_mean": 0.0444824225269258, "train/mu_mean": 0.8077403083443642, "train/mu_std": 0.19157173298299313, "train/rewards_chosen_mean": 0.10262107849121094, "train/rewards_rejected_mean": -1.9303359985351562, "train/tau_mean": 41.05617570877075, "train/tau_std": 21.49762749671936 }, { "epoch": 0.9445123441701295, "grad_norm": 5.322830677032471, "learning_rate": 3.6120630568882796e-06, "loss": 0.3139, "step": 483, "train/lambda_m_mean": 0.038183594355359674, "train/mu_mean": 0.8180460184812546, "train/mu_std": 0.19815881364047527, "train/rewards_chosen_mean": 0.07110351324081421, "train/rewards_rejected_mean": -2.187145233154297, "train/tau_mean": 43.461612701416016, "train/tau_std": 19.877142906188965 }, { "epoch": 0.9464678562698606, "grad_norm": 5.071075916290283, "learning_rate": 3.6086360520904733e-06, "loss": 0.4003, "step": 484, "train/lambda_m_mean": 0.04702148586511612, "train/mu_mean": 0.7740271762013435, "train/mu_std": 0.20939024351537228, "train/rewards_chosen_mean": -0.20299148559570312, "train/rewards_rejected_mean": -1.9901991784572601, "train/tau_mean": 39.365256786346436, "train/tau_std": 21.224119186401367 }, { "epoch": 0.9484233683695917, "grad_norm": 5.6981706619262695, "learning_rate": 3.6052090472926665e-06, "loss": 0.3713, "step": 485, "train/lambda_m_mean": 0.04663085984066129, "train/mu_mean": 0.7880885601043701, "train/mu_std": 0.21013422682881355, "train/rewards_chosen_mean": -0.2221965789794922, "train/rewards_rejected_mean": -2.1726551055908203, "train/tau_mean": 37.46779441833496, "train/tau_std": 18.78421115875244 }, { "epoch": 0.950378880469323, "grad_norm": 5.8219523429870605, "learning_rate": 3.6017820424948598e-06, "loss": 0.3244, "step": 486, "train/lambda_m_mean": 0.042480469681322575, "train/mu_mean": 0.8153344988822937, "train/mu_std": 0.1979879904538393, "train/rewards_chosen_mean": -0.34853267669677734, "train/rewards_rejected_mean": -2.4765968322753906, "train/tau_mean": 41.81238770484924, "train/tau_std": 20.58939027786255 }, { "epoch": 0.9523343925690541, "grad_norm": 4.7076945304870605, "learning_rate": 3.5983550376970535e-06, "loss": 0.3777, "step": 487, "train/lambda_m_mean": 0.04736328357830644, "train/mu_mean": 0.780088797211647, "train/mu_std": 0.21765496768057346, "train/rewards_chosen_mean": 0.10890436172485352, "train/rewards_rejected_mean": -1.855860710144043, "train/tau_mean": 39.624459743499756, "train/tau_std": 21.933829069137573 }, { "epoch": 0.9542899046687852, "grad_norm": 9.124286651611328, "learning_rate": 3.5949280328992463e-06, "loss": 0.4122, "step": 488, "train/lambda_m_mean": 0.04741211049258709, "train/mu_mean": 0.7630868181586266, "train/mu_std": 0.23399281315505505, "train/rewards_chosen_mean": -0.00792694091796875, "train/rewards_rejected_mean": -1.7783088684082031, "train/tau_mean": 39.91012954711914, "train/tau_std": 21.994359731674194 }, { "epoch": 0.9562454167685163, "grad_norm": 7.721490859985352, "learning_rate": 3.5915010281014396e-06, "loss": 0.3974, "step": 489, "train/lambda_m_mean": 0.04521484533324838, "train/mu_mean": 0.7871584370732307, "train/mu_std": 0.23694059997797012, "train/rewards_chosen_mean": 0.13677978515625, "train/rewards_rejected_mean": -1.9100379943847656, "train/tau_mean": 39.37200880050659, "train/tau_std": 20.68173313140869 }, { "epoch": 0.9582009288682474, "grad_norm": 4.717765808105469, "learning_rate": 3.5880740233036333e-06, "loss": 0.273, "step": 490, "train/lambda_m_mean": 0.044238283298909664, "train/mu_mean": 0.8455358892679214, "train/mu_std": 0.1666601886972785, "train/rewards_chosen_mean": 0.06423664093017578, "train/rewards_rejected_mean": -2.3085784912109375, "train/tau_mean": 38.46659231185913, "train/tau_std": 19.70527482032776 }, { "epoch": 0.9601564409679785, "grad_norm": 3.485783100128174, "learning_rate": 3.584647018505826e-06, "loss": 0.3307, "step": 491, "train/lambda_m_mean": 0.0478515625, "train/mu_mean": 0.8077580705285072, "train/mu_std": 0.18753743544220924, "train/rewards_chosen_mean": -0.11069488525390625, "train/rewards_rejected_mean": -2.202381134033203, "train/tau_mean": 38.55243492126465, "train/tau_std": 19.648804426193237 }, { "epoch": 0.9621119530677096, "grad_norm": 3.2215824127197266, "learning_rate": 3.5812200137080194e-06, "loss": 0.2886, "step": 492, "train/lambda_m_mean": 0.04575195349752903, "train/mu_mean": 0.8466953709721565, "train/mu_std": 0.18308041617274284, "train/rewards_chosen_mean": 0.36334657669067383, "train/rewards_rejected_mean": -2.2154197692871094, "train/tau_mean": 39.47740983963013, "train/tau_std": 19.955599308013916 }, { "epoch": 0.9640674651674407, "grad_norm": 7.632595539093018, "learning_rate": 3.5777930089102126e-06, "loss": 0.4176, "step": 493, "train/lambda_m_mean": 0.04184570396319032, "train/mu_mean": 0.7783474698662758, "train/mu_std": 0.23597870022058487, "train/rewards_chosen_mean": 0.05827522277832031, "train/rewards_rejected_mean": -1.9204902648925781, "train/tau_mean": 43.54044771194458, "train/tau_std": 21.519464015960693 }, { "epoch": 0.9660229772671718, "grad_norm": 6.7310943603515625, "learning_rate": 3.5743660041124063e-06, "loss": 0.347, "step": 494, "train/lambda_m_mean": 0.04697265708819032, "train/mu_mean": 0.8050830215215683, "train/mu_std": 0.21094011887907982, "train/rewards_chosen_mean": 0.09329605102539062, "train/rewards_rejected_mean": -2.0667524337768555, "train/tau_mean": 39.220288038253784, "train/tau_std": 20.640413284301758 }, { "epoch": 0.967978489366903, "grad_norm": 7.979755878448486, "learning_rate": 3.570938999314599e-06, "loss": 0.3134, "step": 495, "train/lambda_m_mean": 0.04199218889698386, "train/mu_mean": 0.8304036483168602, "train/mu_std": 0.20876771956682205, "train/rewards_chosen_mean": 0.19663357734680176, "train/rewards_rejected_mean": -2.2834479212760925, "train/tau_mean": 41.27014398574829, "train/tau_std": 21.96702218055725 }, { "epoch": 0.9699340014666341, "grad_norm": 6.781925201416016, "learning_rate": 3.5675119945167924e-06, "loss": 0.3147, "step": 496, "train/lambda_m_mean": 0.042724610306322575, "train/mu_mean": 0.8167672008275986, "train/mu_std": 0.19245580956339836, "train/rewards_chosen_mean": 0.24750089645385742, "train/rewards_rejected_mean": -1.9578800201416016, "train/tau_mean": 41.22984504699707, "train/tau_std": 21.059587955474854 }, { "epoch": 0.9718895135663652, "grad_norm": 5.418050289154053, "learning_rate": 3.564084989718986e-06, "loss": 0.3669, "step": 497, "train/lambda_m_mean": 0.04057617345824838, "train/mu_mean": 0.7966624200344086, "train/mu_std": 0.21463802736252546, "train/rewards_chosen_mean": 0.4049243927001953, "train/rewards_rejected_mean": -1.6810083389282227, "train/tau_mean": 39.66510629653931, "train/tau_std": 19.694324612617493 }, { "epoch": 0.9738450256660963, "grad_norm": 8.100720405578613, "learning_rate": 3.5606579849211794e-06, "loss": 0.3767, "step": 498, "train/lambda_m_mean": 0.045898438431322575, "train/mu_mean": 0.781494490802288, "train/mu_std": 0.22326932474970818, "train/rewards_chosen_mean": 0.5389032661914825, "train/rewards_rejected_mean": -1.4121296405792236, "train/tau_mean": 39.053916931152344, "train/tau_std": 21.850138664245605 }, { "epoch": 0.9758005377658274, "grad_norm": 5.567352771759033, "learning_rate": 3.557230980123372e-06, "loss": 0.3827, "step": 499, "train/lambda_m_mean": 0.04550781287252903, "train/mu_mean": 0.7728244066238403, "train/mu_std": 0.21370028145611286, "train/rewards_chosen_mean": 0.3679487407207489, "train/rewards_rejected_mean": -1.4936760663986206, "train/tau_mean": 37.376458168029785, "train/tau_std": 19.44704246520996 }, { "epoch": 0.9777560498655585, "grad_norm": 4.178323268890381, "learning_rate": 3.553803975325566e-06, "loss": 0.3101, "step": 500, "train/lambda_m_mean": 0.04189453134313226, "train/mu_mean": 0.815443642437458, "train/mu_std": 0.18389382306486368, "train/rewards_chosen_mean": 0.38417911529541016, "train/rewards_rejected_mean": -1.744734764099121, "train/tau_mean": 40.21849203109741, "train/tau_std": 19.07002818584442 }, { "epoch": 0.9797115619652896, "grad_norm": 5.942978382110596, "learning_rate": 3.550376970527759e-06, "loss": 0.3905, "step": 501, "train/lambda_m_mean": 0.047509766183793545, "train/mu_mean": 0.7818047106266022, "train/mu_std": 0.20927602611482143, "train/rewards_chosen_mean": 0.30657291412353516, "train/rewards_rejected_mean": -1.6687912940979004, "train/tau_mean": 38.66069841384888, "train/tau_std": 21.483507871627808 }, { "epoch": 0.9816670740650207, "grad_norm": 17.166275024414062, "learning_rate": 3.5469499657299524e-06, "loss": 0.4208, "step": 502, "train/lambda_m_mean": 0.046777344308793545, "train/mu_mean": 0.8064995855093002, "train/mu_std": 0.21446431800723076, "train/rewards_chosen_mean": 0.595888614654541, "train/rewards_rejected_mean": -1.7256269454956055, "train/tau_mean": 40.55337691307068, "train/tau_std": 21.804219007492065 }, { "epoch": 0.9836225861647518, "grad_norm": 8.930787086486816, "learning_rate": 3.5435229609321457e-06, "loss": 0.3718, "step": 503, "train/lambda_m_mean": 0.04770508036017418, "train/mu_mean": 0.8047753348946571, "train/mu_std": 0.2174141053110361, "train/rewards_chosen_mean": 0.29480552673339844, "train/rewards_rejected_mean": -2.0428543090820312, "train/tau_mean": 38.404600620269775, "train/tau_std": 21.104411602020264 }, { "epoch": 0.9855780982644831, "grad_norm": 5.422333717346191, "learning_rate": 3.540095956134339e-06, "loss": 0.329, "step": 504, "train/lambda_m_mean": 0.04829101776704192, "train/mu_mean": 0.8067228198051453, "train/mu_std": 0.18968386575579643, "train/rewards_chosen_mean": 0.048435211181640625, "train/rewards_rejected_mean": -2.0099610686302185, "train/tau_mean": 39.85560154914856, "train/tau_std": 22.50836968421936 }, { "epoch": 0.9875336103642142, "grad_norm": 3.9332644939422607, "learning_rate": 3.536668951336532e-06, "loss": 0.2919, "step": 505, "train/lambda_m_mean": 0.04721679771319032, "train/mu_mean": 0.8344307467341423, "train/mu_std": 0.181368637830019, "train/rewards_chosen_mean": -0.17637348175048828, "train/rewards_rejected_mean": -2.4606056213378906, "train/tau_mean": 37.95946455001831, "train/tau_std": 19.185182571411133 }, { "epoch": 0.9894891224639453, "grad_norm": 6.99998140335083, "learning_rate": 3.533241946538725e-06, "loss": 0.3987, "step": 506, "train/lambda_m_mean": 0.04868164239451289, "train/mu_mean": 0.7779305800795555, "train/mu_std": 0.2198953591287136, "train/rewards_chosen_mean": -0.0581812858581543, "train/rewards_rejected_mean": -2.0092368125915527, "train/tau_mean": 39.953431129455566, "train/tau_std": 20.442979097366333 }, { "epoch": 0.9914446345636764, "grad_norm": 6.654542446136475, "learning_rate": 3.5298149417409187e-06, "loss": 0.3767, "step": 507, "train/lambda_m_mean": 0.041064453311264515, "train/mu_mean": 0.7809784784913063, "train/mu_std": 0.21361875347793102, "train/rewards_chosen_mean": -0.3048287630081177, "train/rewards_rejected_mean": -2.231318235397339, "train/tau_mean": 43.282451152801514, "train/tau_std": 19.690260648727417 }, { "epoch": 0.9934001466634075, "grad_norm": 5.081980228424072, "learning_rate": 3.526387936943112e-06, "loss": 0.3385, "step": 508, "train/lambda_m_mean": 0.046728516928851604, "train/mu_mean": 0.8095248639583588, "train/mu_std": 0.1926268506795168, "train/rewards_chosen_mean": -0.03839588165283203, "train/rewards_rejected_mean": -2.1689453125, "train/tau_mean": 40.23344421386719, "train/tau_std": 21.25540065765381 }, { "epoch": 0.9953556587631386, "grad_norm": 4.67738676071167, "learning_rate": 3.5229609321453053e-06, "loss": 0.3632, "step": 509, "train/lambda_m_mean": 0.04545898595824838, "train/mu_mean": 0.7929603829979897, "train/mu_std": 0.2146533988416195, "train/rewards_chosen_mean": -0.05233120918273926, "train/rewards_rejected_mean": -2.149465560913086, "train/tau_mean": 42.35365581512451, "train/tau_std": 21.67964482307434 }, { "epoch": 0.9973111708628697, "grad_norm": 5.067123889923096, "learning_rate": 3.519533927347499e-06, "loss": 0.3546, "step": 510, "train/lambda_m_mean": 0.04360351664945483, "train/mu_mean": 0.7982501685619354, "train/mu_std": 0.20910649001598358, "train/rewards_chosen_mean": -0.1885356903076172, "train/rewards_rejected_mean": -2.2818374633789062, "train/tau_mean": 41.694032192230225, "train/tau_std": 21.257568836212158 }, { "epoch": 0.9992666829626008, "grad_norm": 8.434648513793945, "learning_rate": 3.5161069225496918e-06, "loss": 0.3687, "step": 511, "train/lambda_m_mean": 0.04301757924258709, "train/mu_mean": 0.8061369135975838, "train/mu_std": 0.2070859894156456, "train/rewards_chosen_mean": -0.1971454620361328, "train/rewards_rejected_mean": -2.291593551635742, "train/tau_mean": 44.411585330963135, "train/tau_std": 21.159096002578735 }, { "epoch": 1.0, "grad_norm": 8.33646297454834, "learning_rate": 3.512679917751885e-06, "loss": 0.3491, "step": 512, "train/lambda_m_mean": 0.053776043156782784, "train/mu_mean": 0.7931918303171793, "train/mu_std": 0.19381321469942728, "train/rewards_chosen_mean": -0.051183064778645836, "train/rewards_rejected_mean": -1.976043701171875, "train/tau_mean": 38.31310017903646, "train/tau_std": 20.840969721476238 } ], "logging_steps": 1, "max_steps": 1536, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }