{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7036059806508356,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.003518029903254178,
      "grad_norm": 1.4669007062911987,
      "learning_rate": 0.0001,
      "loss": 2.9718,
      "step": 1
    },
    {
      "epoch": 0.007036059806508356,
      "grad_norm": 1.5240416526794434,
      "learning_rate": 9.949748743718594e-05,
      "loss": 3.0249,
      "step": 2
    },
    {
      "epoch": 0.010554089709762533,
      "grad_norm": 1.3310328722000122,
      "learning_rate": 9.899497487437186e-05,
      "loss": 2.7545,
      "step": 3
    },
    {
      "epoch": 0.014072119613016711,
      "grad_norm": 1.4892698526382446,
      "learning_rate": 9.84924623115578e-05,
      "loss": 2.6703,
      "step": 4
    },
    {
      "epoch": 0.01759014951627089,
      "grad_norm": 1.4727792739868164,
      "learning_rate": 9.798994974874372e-05,
      "loss": 2.4731,
      "step": 5
    },
    {
      "epoch": 0.021108179419525065,
      "grad_norm": 1.4451979398727417,
      "learning_rate": 9.748743718592965e-05,
      "loss": 2.2243,
      "step": 6
    },
    {
      "epoch": 0.024626209322779244,
      "grad_norm": 1.3103245496749878,
      "learning_rate": 9.698492462311559e-05,
      "loss": 2.0194,
      "step": 7
    },
    {
      "epoch": 0.028144239226033423,
      "grad_norm": 1.4852089881896973,
      "learning_rate": 9.64824120603015e-05,
      "loss": 1.9349,
      "step": 8
    },
    {
      "epoch": 0.0316622691292876,
      "grad_norm": 1.5170249938964844,
      "learning_rate": 9.597989949748745e-05,
      "loss": 1.7582,
      "step": 9
    },
    {
      "epoch": 0.03518029903254178,
      "grad_norm": 1.3428442478179932,
      "learning_rate": 9.547738693467337e-05,
      "loss": 1.6313,
      "step": 10
    },
    {
      "epoch": 0.03869832893579595,
      "grad_norm": 1.0400348901748657,
      "learning_rate": 9.49748743718593e-05,
      "loss": 1.4358,
      "step": 11
    },
    {
      "epoch": 0.04221635883905013,
      "grad_norm": 0.9891974329948425,
      "learning_rate": 9.447236180904523e-05,
      "loss": 1.3738,
      "step": 12
    },
    {
      "epoch": 0.04573438874230431,
      "grad_norm": 0.6980912685394287,
      "learning_rate": 9.396984924623115e-05,
      "loss": 1.425,
      "step": 13
    },
    {
      "epoch": 0.04925241864555849,
      "grad_norm": 0.6836680769920349,
      "learning_rate": 9.34673366834171e-05,
      "loss": 1.4575,
      "step": 14
    },
    {
      "epoch": 0.052770448548812667,
      "grad_norm": 0.9314870238304138,
      "learning_rate": 9.296482412060302e-05,
      "loss": 1.3206,
      "step": 15
    },
    {
      "epoch": 0.056288478452066845,
      "grad_norm": 0.6797922253608704,
      "learning_rate": 9.246231155778895e-05,
      "loss": 1.3724,
      "step": 16
    },
    {
      "epoch": 0.05980650835532102,
      "grad_norm": 0.6958814263343811,
      "learning_rate": 9.195979899497488e-05,
      "loss": 1.3661,
      "step": 17
    },
    {
      "epoch": 0.0633245382585752,
      "grad_norm": 0.7188398241996765,
      "learning_rate": 9.14572864321608e-05,
      "loss": 1.3242,
      "step": 18
    },
    {
      "epoch": 0.06684256816182937,
      "grad_norm": 0.8997742533683777,
      "learning_rate": 9.095477386934675e-05,
      "loss": 1.4049,
      "step": 19
    },
    {
      "epoch": 0.07036059806508356,
      "grad_norm": 0.8283623456954956,
      "learning_rate": 9.045226130653267e-05,
      "loss": 1.3121,
      "step": 20
    },
    {
      "epoch": 0.07387862796833773,
      "grad_norm": 0.8064684867858887,
      "learning_rate": 8.99497487437186e-05,
      "loss": 1.3451,
      "step": 21
    },
    {
      "epoch": 0.0773966578715919,
      "grad_norm": 0.8180544972419739,
      "learning_rate": 8.944723618090453e-05,
      "loss": 1.2111,
      "step": 22
    },
    {
      "epoch": 0.08091468777484609,
      "grad_norm": 0.8000004887580872,
      "learning_rate": 8.894472361809045e-05,
      "loss": 1.2933,
      "step": 23
    },
    {
      "epoch": 0.08443271767810026,
      "grad_norm": 0.8804137706756592,
      "learning_rate": 8.84422110552764e-05,
      "loss": 1.3173,
      "step": 24
    },
    {
      "epoch": 0.08795074758135445,
      "grad_norm": 0.8556327819824219,
      "learning_rate": 8.793969849246232e-05,
      "loss": 1.321,
      "step": 25
    },
    {
      "epoch": 0.09146877748460862,
      "grad_norm": 0.827410876750946,
      "learning_rate": 8.743718592964825e-05,
      "loss": 1.2195,
      "step": 26
    },
    {
      "epoch": 0.09498680738786279,
      "grad_norm": 0.9081262946128845,
      "learning_rate": 8.693467336683418e-05,
      "loss": 1.2451,
      "step": 27
    },
    {
      "epoch": 0.09850483729111698,
      "grad_norm": 0.9331269860267639,
      "learning_rate": 8.64321608040201e-05,
      "loss": 1.2204,
      "step": 28
    },
    {
      "epoch": 0.10202286719437115,
      "grad_norm": 1.0290558338165283,
      "learning_rate": 8.592964824120603e-05,
      "loss": 1.2379,
      "step": 29
    },
    {
      "epoch": 0.10554089709762533,
      "grad_norm": 1.1296031475067139,
      "learning_rate": 8.542713567839196e-05,
      "loss": 1.2412,
      "step": 30
    },
    {
      "epoch": 0.1090589270008795,
      "grad_norm": 1.1690081357955933,
      "learning_rate": 8.49246231155779e-05,
      "loss": 1.1888,
      "step": 31
    },
    {
      "epoch": 0.11257695690413369,
      "grad_norm": 1.1313647031784058,
      "learning_rate": 8.442211055276383e-05,
      "loss": 1.2961,
      "step": 32
    },
    {
      "epoch": 0.11609498680738786,
      "grad_norm": 1.1976656913757324,
      "learning_rate": 8.391959798994975e-05,
      "loss": 1.2387,
      "step": 33
    },
    {
      "epoch": 0.11961301671064203,
      "grad_norm": 1.20232355594635,
      "learning_rate": 8.341708542713568e-05,
      "loss": 1.3125,
      "step": 34
    },
    {
      "epoch": 0.12313104661389622,
      "grad_norm": 1.2482579946517944,
      "learning_rate": 8.291457286432161e-05,
      "loss": 1.322,
      "step": 35
    },
    {
      "epoch": 0.1266490765171504,
      "grad_norm": 1.0197736024856567,
      "learning_rate": 8.241206030150754e-05,
      "loss": 1.1192,
      "step": 36
    },
    {
      "epoch": 0.13016710642040458,
      "grad_norm": 0.9190375208854675,
      "learning_rate": 8.190954773869348e-05,
      "loss": 1.2522,
      "step": 37
    },
    {
      "epoch": 0.13368513632365875,
      "grad_norm": 0.7511453032493591,
      "learning_rate": 8.14070351758794e-05,
      "loss": 1.0525,
      "step": 38
    },
    {
      "epoch": 0.13720316622691292,
      "grad_norm": 0.7151877880096436,
      "learning_rate": 8.090452261306533e-05,
      "loss": 1.1839,
      "step": 39
    },
    {
      "epoch": 0.14072119613016712,
      "grad_norm": 0.6375951766967773,
      "learning_rate": 8.040201005025126e-05,
      "loss": 1.2203,
      "step": 40
    },
    {
      "epoch": 0.1442392260334213,
      "grad_norm": 0.6267354488372803,
      "learning_rate": 7.989949748743719e-05,
      "loss": 1.1996,
      "step": 41
    },
    {
      "epoch": 0.14775725593667546,
      "grad_norm": 0.5620112419128418,
      "learning_rate": 7.939698492462313e-05,
      "loss": 1.1745,
      "step": 42
    },
    {
      "epoch": 0.15127528583992964,
      "grad_norm": 0.6898969411849976,
      "learning_rate": 7.889447236180904e-05,
      "loss": 1.2377,
      "step": 43
    },
    {
      "epoch": 0.1547933157431838,
      "grad_norm": 0.5548388957977295,
      "learning_rate": 7.839195979899498e-05,
      "loss": 1.1654,
      "step": 44
    },
    {
      "epoch": 0.158311345646438,
      "grad_norm": 0.5869529843330383,
      "learning_rate": 7.788944723618091e-05,
      "loss": 1.1669,
      "step": 45
    },
    {
      "epoch": 0.16182937554969218,
      "grad_norm": 0.6272417902946472,
      "learning_rate": 7.738693467336684e-05,
      "loss": 1.132,
      "step": 46
    },
    {
      "epoch": 0.16534740545294635,
      "grad_norm": 0.6158267855644226,
      "learning_rate": 7.688442211055277e-05,
      "loss": 1.0767,
      "step": 47
    },
    {
      "epoch": 0.16886543535620052,
      "grad_norm": 0.661561906337738,
      "learning_rate": 7.638190954773869e-05,
      "loss": 1.1867,
      "step": 48
    },
    {
      "epoch": 0.1723834652594547,
      "grad_norm": 0.5605206489562988,
      "learning_rate": 7.587939698492463e-05,
      "loss": 1.1243,
      "step": 49
    },
    {
      "epoch": 0.1759014951627089,
      "grad_norm": 0.6338799595832825,
      "learning_rate": 7.537688442211056e-05,
      "loss": 1.1635,
      "step": 50
    },
    {
      "epoch": 0.17941952506596306,
      "grad_norm": 0.7251884937286377,
      "learning_rate": 7.487437185929649e-05,
      "loss": 1.1462,
      "step": 51
    },
    {
      "epoch": 0.18293755496921724,
      "grad_norm": 0.5688169598579407,
      "learning_rate": 7.437185929648241e-05,
      "loss": 1.1351,
      "step": 52
    },
    {
      "epoch": 0.1864555848724714,
      "grad_norm": 0.6056070923805237,
      "learning_rate": 7.386934673366834e-05,
      "loss": 1.1352,
      "step": 53
    },
    {
      "epoch": 0.18997361477572558,
      "grad_norm": 0.8283679485321045,
      "learning_rate": 7.336683417085427e-05,
      "loss": 1.2222,
      "step": 54
    },
    {
      "epoch": 0.19349164467897978,
      "grad_norm": 0.6316900253295898,
      "learning_rate": 7.28643216080402e-05,
      "loss": 1.2023,
      "step": 55
    },
    {
      "epoch": 0.19700967458223395,
      "grad_norm": 0.6092143058776855,
      "learning_rate": 7.236180904522614e-05,
      "loss": 1.0762,
      "step": 56
    },
    {
      "epoch": 0.20052770448548812,
      "grad_norm": 0.5600019097328186,
      "learning_rate": 7.185929648241206e-05,
      "loss": 1.0127,
      "step": 57
    },
    {
      "epoch": 0.2040457343887423,
      "grad_norm": 0.6157863736152649,
      "learning_rate": 7.135678391959799e-05,
      "loss": 1.1016,
      "step": 58
    },
    {
      "epoch": 0.2075637642919965,
      "grad_norm": 0.6391822099685669,
      "learning_rate": 7.085427135678392e-05,
      "loss": 1.2009,
      "step": 59
    },
    {
      "epoch": 0.21108179419525067,
      "grad_norm": 0.5637600421905518,
      "learning_rate": 7.035175879396985e-05,
      "loss": 1.1419,
      "step": 60
    },
    {
      "epoch": 0.21459982409850484,
      "grad_norm": 0.6826542019844055,
      "learning_rate": 6.984924623115579e-05,
      "loss": 1.1084,
      "step": 61
    },
    {
      "epoch": 0.218117854001759,
      "grad_norm": 0.6475107073783875,
      "learning_rate": 6.93467336683417e-05,
      "loss": 1.2033,
      "step": 62
    },
    {
      "epoch": 0.22163588390501318,
      "grad_norm": 0.5701493620872498,
      "learning_rate": 6.884422110552764e-05,
      "loss": 1.1425,
      "step": 63
    },
    {
      "epoch": 0.22515391380826738,
      "grad_norm": 0.5416231155395508,
      "learning_rate": 6.834170854271357e-05,
      "loss": 1.0869,
      "step": 64
    },
    {
      "epoch": 0.22867194371152155,
      "grad_norm": 0.611254870891571,
      "learning_rate": 6.78391959798995e-05,
      "loss": 1.1344,
      "step": 65
    },
    {
      "epoch": 0.23218997361477572,
      "grad_norm": 0.5644116401672363,
      "learning_rate": 6.733668341708544e-05,
      "loss": 1.0655,
      "step": 66
    },
    {
      "epoch": 0.2357080035180299,
      "grad_norm": 0.5953249931335449,
      "learning_rate": 6.683417085427135e-05,
      "loss": 1.1267,
      "step": 67
    },
    {
      "epoch": 0.23922603342128407,
      "grad_norm": 0.5902895331382751,
      "learning_rate": 6.633165829145729e-05,
      "loss": 1.1207,
      "step": 68
    },
    {
      "epoch": 0.24274406332453827,
      "grad_norm": 0.571882426738739,
      "learning_rate": 6.582914572864322e-05,
      "loss": 1.0945,
      "step": 69
    },
    {
      "epoch": 0.24626209322779244,
      "grad_norm": 0.6372458934783936,
      "learning_rate": 6.532663316582915e-05,
      "loss": 1.1933,
      "step": 70
    },
    {
      "epoch": 0.2497801231310466,
      "grad_norm": 0.6739147901535034,
      "learning_rate": 6.482412060301508e-05,
      "loss": 1.1202,
      "step": 71
    },
    {
      "epoch": 0.2532981530343008,
      "grad_norm": 0.6515147686004639,
      "learning_rate": 6.4321608040201e-05,
      "loss": 1.1685,
      "step": 72
    },
    {
      "epoch": 0.256816182937555,
      "grad_norm": 0.5706716775894165,
      "learning_rate": 6.381909547738694e-05,
      "loss": 1.1084,
      "step": 73
    },
    {
      "epoch": 0.26033421284080915,
      "grad_norm": 0.595585286617279,
      "learning_rate": 6.331658291457287e-05,
      "loss": 1.1218,
      "step": 74
    },
    {
      "epoch": 0.2638522427440633,
      "grad_norm": 0.6020475625991821,
      "learning_rate": 6.28140703517588e-05,
      "loss": 1.1282,
      "step": 75
    },
    {
      "epoch": 0.2673702726473175,
      "grad_norm": 0.628376305103302,
      "learning_rate": 6.231155778894473e-05,
      "loss": 1.1067,
      "step": 76
    },
    {
      "epoch": 0.27088830255057167,
      "grad_norm": 0.6371076107025146,
      "learning_rate": 6.180904522613065e-05,
      "loss": 1.1466,
      "step": 77
    },
    {
      "epoch": 0.27440633245382584,
      "grad_norm": 0.6206318140029907,
      "learning_rate": 6.130653266331658e-05,
      "loss": 1.0801,
      "step": 78
    },
    {
      "epoch": 0.27792436235708,
      "grad_norm": 0.6293841600418091,
      "learning_rate": 6.080402010050251e-05,
      "loss": 1.1644,
      "step": 79
    },
    {
      "epoch": 0.28144239226033424,
      "grad_norm": 0.6434080600738525,
      "learning_rate": 6.030150753768844e-05,
      "loss": 1.0589,
      "step": 80
    },
    {
      "epoch": 0.2849604221635884,
      "grad_norm": 0.5857638120651245,
      "learning_rate": 5.979899497487438e-05,
      "loss": 1.1711,
      "step": 81
    },
    {
      "epoch": 0.2884784520668426,
      "grad_norm": 0.6163449883460999,
      "learning_rate": 5.929648241206031e-05,
      "loss": 1.1627,
      "step": 82
    },
    {
      "epoch": 0.29199648197009676,
      "grad_norm": 0.6543634533882141,
      "learning_rate": 5.879396984924623e-05,
      "loss": 1.0909,
      "step": 83
    },
    {
      "epoch": 0.2955145118733509,
      "grad_norm": 0.6609559059143066,
      "learning_rate": 5.829145728643216e-05,
      "loss": 1.1505,
      "step": 84
    },
    {
      "epoch": 0.2990325417766051,
      "grad_norm": 0.5798302292823792,
      "learning_rate": 5.778894472361809e-05,
      "loss": 1.0834,
      "step": 85
    },
    {
      "epoch": 0.30255057167985927,
      "grad_norm": 0.6974066495895386,
      "learning_rate": 5.728643216080403e-05,
      "loss": 1.0965,
      "step": 86
    },
    {
      "epoch": 0.30606860158311344,
      "grad_norm": 0.67149817943573,
      "learning_rate": 5.6783919597989955e-05,
      "loss": 1.09,
      "step": 87
    },
    {
      "epoch": 0.3095866314863676,
      "grad_norm": 0.5761735439300537,
      "learning_rate": 5.628140703517588e-05,
      "loss": 1.1436,
      "step": 88
    },
    {
      "epoch": 0.3131046613896218,
      "grad_norm": 0.6142584681510925,
      "learning_rate": 5.577889447236181e-05,
      "loss": 1.0489,
      "step": 89
    },
    {
      "epoch": 0.316622691292876,
      "grad_norm": 0.6407614946365356,
      "learning_rate": 5.527638190954774e-05,
      "loss": 1.1449,
      "step": 90
    },
    {
      "epoch": 0.3201407211961302,
      "grad_norm": 0.6835021376609802,
      "learning_rate": 5.477386934673368e-05,
      "loss": 1.1332,
      "step": 91
    },
    {
      "epoch": 0.32365875109938436,
      "grad_norm": 0.5755856037139893,
      "learning_rate": 5.4271356783919604e-05,
      "loss": 1.1195,
      "step": 92
    },
    {
      "epoch": 0.32717678100263853,
      "grad_norm": 0.6232398748397827,
      "learning_rate": 5.376884422110553e-05,
      "loss": 1.1696,
      "step": 93
    },
    {
      "epoch": 0.3306948109058927,
      "grad_norm": 0.6193405389785767,
      "learning_rate": 5.3266331658291455e-05,
      "loss": 1.1106,
      "step": 94
    },
    {
      "epoch": 0.33421284080914687,
      "grad_norm": 0.6834057569503784,
      "learning_rate": 5.276381909547739e-05,
      "loss": 1.1349,
      "step": 95
    },
    {
      "epoch": 0.33773087071240104,
      "grad_norm": 0.7168384790420532,
      "learning_rate": 5.226130653266332e-05,
      "loss": 1.2054,
      "step": 96
    },
    {
      "epoch": 0.3412489006156552,
      "grad_norm": 0.6553971767425537,
      "learning_rate": 5.175879396984925e-05,
      "loss": 1.0975,
      "step": 97
    },
    {
      "epoch": 0.3447669305189094,
      "grad_norm": 0.6329600811004639,
      "learning_rate": 5.125628140703518e-05,
      "loss": 1.1212,
      "step": 98
    },
    {
      "epoch": 0.3482849604221636,
      "grad_norm": 0.6656339764595032,
      "learning_rate": 5.0753768844221104e-05,
      "loss": 1.1451,
      "step": 99
    },
    {
      "epoch": 0.3518029903254178,
      "grad_norm": 0.6817747950553894,
      "learning_rate": 5.0251256281407036e-05,
      "loss": 1.084,
      "step": 100
    },
    {
      "epoch": 0.35532102022867196,
      "grad_norm": 0.6384849548339844,
      "learning_rate": 4.974874371859297e-05,
      "loss": 1.047,
      "step": 101
    },
    {
      "epoch": 0.35883905013192613,
      "grad_norm": 0.6342082023620605,
      "learning_rate": 4.92462311557789e-05,
      "loss": 1.1122,
      "step": 102
    },
    {
      "epoch": 0.3623570800351803,
      "grad_norm": 0.6114000082015991,
      "learning_rate": 4.874371859296483e-05,
      "loss": 1.1094,
      "step": 103
    },
    {
      "epoch": 0.3658751099384345,
      "grad_norm": 0.6310352683067322,
      "learning_rate": 4.824120603015075e-05,
      "loss": 1.1508,
      "step": 104
    },
    {
      "epoch": 0.36939313984168864,
      "grad_norm": 0.6773234605789185,
      "learning_rate": 4.7738693467336685e-05,
      "loss": 1.0511,
      "step": 105
    },
    {
      "epoch": 0.3729111697449428,
      "grad_norm": 0.6625077724456787,
      "learning_rate": 4.723618090452262e-05,
      "loss": 1.1422,
      "step": 106
    },
    {
      "epoch": 0.376429199648197,
      "grad_norm": 0.6125949025154114,
      "learning_rate": 4.673366834170855e-05,
      "loss": 1.1189,
      "step": 107
    },
    {
      "epoch": 0.37994722955145116,
      "grad_norm": 0.684280514717102,
      "learning_rate": 4.6231155778894475e-05,
      "loss": 1.2249,
      "step": 108
    },
    {
      "epoch": 0.3834652594547054,
      "grad_norm": 0.8305927515029907,
      "learning_rate": 4.57286432160804e-05,
      "loss": 1.1758,
      "step": 109
    },
    {
      "epoch": 0.38698328935795956,
      "grad_norm": 0.6081312894821167,
      "learning_rate": 4.522613065326633e-05,
      "loss": 1.0853,
      "step": 110
    },
    {
      "epoch": 0.39050131926121373,
      "grad_norm": 0.716929018497467,
      "learning_rate": 4.4723618090452266e-05,
      "loss": 1.1903,
      "step": 111
    },
    {
      "epoch": 0.3940193491644679,
      "grad_norm": 0.5968315005302429,
      "learning_rate": 4.42211055276382e-05,
      "loss": 1.0717,
      "step": 112
    },
    {
      "epoch": 0.3975373790677221,
      "grad_norm": 0.6502510905265808,
      "learning_rate": 4.3718592964824124e-05,
      "loss": 1.0629,
      "step": 113
    },
    {
      "epoch": 0.40105540897097625,
      "grad_norm": 0.6408775448799133,
      "learning_rate": 4.321608040201005e-05,
      "loss": 1.0937,
      "step": 114
    },
    {
      "epoch": 0.4045734388742304,
      "grad_norm": 0.6137213110923767,
      "learning_rate": 4.271356783919598e-05,
      "loss": 1.0853,
      "step": 115
    },
    {
      "epoch": 0.4080914687774846,
      "grad_norm": 0.6401947736740112,
      "learning_rate": 4.2211055276381914e-05,
      "loss": 1.1542,
      "step": 116
    },
    {
      "epoch": 0.41160949868073876,
      "grad_norm": 0.6332412362098694,
      "learning_rate": 4.170854271356784e-05,
      "loss": 1.0731,
      "step": 117
    },
    {
      "epoch": 0.415127528583993,
      "grad_norm": 0.6274076700210571,
      "learning_rate": 4.120603015075377e-05,
      "loss": 1.0707,
      "step": 118
    },
    {
      "epoch": 0.41864555848724716,
      "grad_norm": 0.632633626461029,
      "learning_rate": 4.07035175879397e-05,
      "loss": 1.108,
      "step": 119
    },
    {
      "epoch": 0.42216358839050133,
      "grad_norm": 0.6979479193687439,
      "learning_rate": 4.020100502512563e-05,
      "loss": 1.1483,
      "step": 120
    },
    {
      "epoch": 0.4256816182937555,
      "grad_norm": 0.7355033755302429,
      "learning_rate": 3.969849246231156e-05,
      "loss": 1.1358,
      "step": 121
    },
    {
      "epoch": 0.4291996481970097,
      "grad_norm": 0.6254828572273254,
      "learning_rate": 3.919597989949749e-05,
      "loss": 1.1753,
      "step": 122
    },
    {
      "epoch": 0.43271767810026385,
      "grad_norm": 0.6851824522018433,
      "learning_rate": 3.869346733668342e-05,
      "loss": 1.0128,
      "step": 123
    },
    {
      "epoch": 0.436235708003518,
      "grad_norm": 0.6097928285598755,
      "learning_rate": 3.8190954773869346e-05,
      "loss": 1.1235,
      "step": 124
    },
    {
      "epoch": 0.4397537379067722,
      "grad_norm": 0.6748325824737549,
      "learning_rate": 3.768844221105528e-05,
      "loss": 1.0452,
      "step": 125
    },
    {
      "epoch": 0.44327176781002636,
      "grad_norm": 0.6666128039360046,
      "learning_rate": 3.7185929648241204e-05,
      "loss": 1.1075,
      "step": 126
    },
    {
      "epoch": 0.4467897977132806,
      "grad_norm": 0.7474984526634216,
      "learning_rate": 3.668341708542714e-05,
      "loss": 1.0695,
      "step": 127
    },
    {
      "epoch": 0.45030782761653476,
      "grad_norm": 0.6925339698791504,
      "learning_rate": 3.618090452261307e-05,
      "loss": 1.1024,
      "step": 128
    },
    {
      "epoch": 0.45382585751978893,
      "grad_norm": 0.6140123009681702,
      "learning_rate": 3.5678391959798995e-05,
      "loss": 1.0788,
      "step": 129
    },
    {
      "epoch": 0.4573438874230431,
      "grad_norm": 0.6771907806396484,
      "learning_rate": 3.517587939698493e-05,
      "loss": 1.0913,
      "step": 130
    },
    {
      "epoch": 0.4608619173262973,
      "grad_norm": 0.6700430512428284,
      "learning_rate": 3.467336683417085e-05,
      "loss": 1.0566,
      "step": 131
    },
    {
      "epoch": 0.46437994722955145,
      "grad_norm": 0.6931480169296265,
      "learning_rate": 3.4170854271356785e-05,
      "loss": 1.059,
      "step": 132
    },
    {
      "epoch": 0.4678979771328056,
      "grad_norm": 0.6608771085739136,
      "learning_rate": 3.366834170854272e-05,
      "loss": 1.119,
      "step": 133
    },
    {
      "epoch": 0.4714160070360598,
      "grad_norm": 0.6470663547515869,
      "learning_rate": 3.3165829145728643e-05,
      "loss": 1.0662,
      "step": 134
    },
    {
      "epoch": 0.47493403693931396,
      "grad_norm": 0.5729122757911682,
      "learning_rate": 3.2663316582914576e-05,
      "loss": 0.9999,
      "step": 135
    },
    {
      "epoch": 0.47845206684256814,
      "grad_norm": 0.6993862390518188,
      "learning_rate": 3.21608040201005e-05,
      "loss": 1.1819,
      "step": 136
    },
    {
      "epoch": 0.48197009674582236,
      "grad_norm": 0.6929494738578796,
      "learning_rate": 3.1658291457286434e-05,
      "loss": 1.1719,
      "step": 137
    },
    {
      "epoch": 0.48548812664907653,
      "grad_norm": 0.6951282620429993,
      "learning_rate": 3.1155778894472366e-05,
      "loss": 1.0716,
      "step": 138
    },
    {
      "epoch": 0.4890061565523307,
      "grad_norm": 0.6766693592071533,
      "learning_rate": 3.065326633165829e-05,
      "loss": 1.1589,
      "step": 139
    },
    {
      "epoch": 0.4925241864555849,
      "grad_norm": 0.6500269174575806,
      "learning_rate": 3.015075376884422e-05,
      "loss": 1.1122,
      "step": 140
    },
    {
      "epoch": 0.49604221635883905,
      "grad_norm": 0.7741857171058655,
      "learning_rate": 2.9648241206030153e-05,
      "loss": 1.1594,
      "step": 141
    },
    {
      "epoch": 0.4995602462620932,
      "grad_norm": 0.6630749106407166,
      "learning_rate": 2.914572864321608e-05,
      "loss": 1.0615,
      "step": 142
    },
    {
      "epoch": 0.5030782761653474,
      "grad_norm": 0.7230671048164368,
      "learning_rate": 2.8643216080402015e-05,
      "loss": 1.1521,
      "step": 143
    },
    {
      "epoch": 0.5065963060686016,
      "grad_norm": 0.6624138355255127,
      "learning_rate": 2.814070351758794e-05,
      "loss": 1.0347,
      "step": 144
    },
    {
      "epoch": 0.5101143359718557,
      "grad_norm": 0.6560067534446716,
      "learning_rate": 2.763819095477387e-05,
      "loss": 1.1214,
      "step": 145
    },
    {
      "epoch": 0.51363236587511,
      "grad_norm": 0.6742956638336182,
      "learning_rate": 2.7135678391959802e-05,
      "loss": 1.0956,
      "step": 146
    },
    {
      "epoch": 0.5171503957783641,
      "grad_norm": 0.706284761428833,
      "learning_rate": 2.6633165829145728e-05,
      "loss": 1.1058,
      "step": 147
    },
    {
      "epoch": 0.5206684256816183,
      "grad_norm": 0.6924006938934326,
      "learning_rate": 2.613065326633166e-05,
      "loss": 1.186,
      "step": 148
    },
    {
      "epoch": 0.5241864555848724,
      "grad_norm": 0.6287305951118469,
      "learning_rate": 2.562814070351759e-05,
      "loss": 1.0422,
      "step": 149
    },
    {
      "epoch": 0.5277044854881267,
      "grad_norm": 0.6957104206085205,
      "learning_rate": 2.5125628140703518e-05,
      "loss": 1.0896,
      "step": 150
    },
    {
      "epoch": 0.5312225153913809,
      "grad_norm": 0.7039506435394287,
      "learning_rate": 2.462311557788945e-05,
      "loss": 1.0818,
      "step": 151
    },
    {
      "epoch": 0.534740545294635,
      "grad_norm": 0.6502148509025574,
      "learning_rate": 2.4120603015075376e-05,
      "loss": 1.112,
      "step": 152
    },
    {
      "epoch": 0.5382585751978892,
      "grad_norm": 0.6823992133140564,
      "learning_rate": 2.361809045226131e-05,
      "loss": 1.0298,
      "step": 153
    },
    {
      "epoch": 0.5417766051011433,
      "grad_norm": 0.7539629936218262,
      "learning_rate": 2.3115577889447238e-05,
      "loss": 1.0618,
      "step": 154
    },
    {
      "epoch": 0.5452946350043976,
      "grad_norm": 0.6974697113037109,
      "learning_rate": 2.2613065326633167e-05,
      "loss": 1.1702,
      "step": 155
    },
    {
      "epoch": 0.5488126649076517,
      "grad_norm": 0.7035180330276489,
      "learning_rate": 2.21105527638191e-05,
      "loss": 1.0714,
      "step": 156
    },
    {
      "epoch": 0.5523306948109059,
      "grad_norm": 0.9007865786552429,
      "learning_rate": 2.1608040201005025e-05,
      "loss": 1.0565,
      "step": 157
    },
    {
      "epoch": 0.55584872471416,
      "grad_norm": 0.7083996534347534,
      "learning_rate": 2.1105527638190957e-05,
      "loss": 1.1425,
      "step": 158
    },
    {
      "epoch": 0.5593667546174143,
      "grad_norm": 0.7241733074188232,
      "learning_rate": 2.0603015075376886e-05,
      "loss": 1.1211,
      "step": 159
    },
    {
      "epoch": 0.5628847845206685,
      "grad_norm": 0.7474963068962097,
      "learning_rate": 2.0100502512562815e-05,
      "loss": 1.0546,
      "step": 160
    },
    {
      "epoch": 0.5664028144239226,
      "grad_norm": 0.7051181793212891,
      "learning_rate": 1.9597989949748744e-05,
      "loss": 0.9878,
      "step": 161
    },
    {
      "epoch": 0.5699208443271768,
      "grad_norm": 0.7359694242477417,
      "learning_rate": 1.9095477386934673e-05,
      "loss": 1.1283,
      "step": 162
    },
    {
      "epoch": 0.5734388742304309,
      "grad_norm": 0.6908060908317566,
      "learning_rate": 1.8592964824120602e-05,
      "loss": 1.1287,
      "step": 163
    },
    {
      "epoch": 0.5769569041336852,
      "grad_norm": 0.7220682501792908,
      "learning_rate": 1.8090452261306535e-05,
      "loss": 1.0424,
      "step": 164
    },
    {
      "epoch": 0.5804749340369393,
      "grad_norm": 0.7415404319763184,
      "learning_rate": 1.7587939698492464e-05,
      "loss": 1.0749,
      "step": 165
    },
    {
      "epoch": 0.5839929639401935,
      "grad_norm": 0.7168678641319275,
      "learning_rate": 1.7085427135678393e-05,
      "loss": 1.1308,
      "step": 166
    },
    {
      "epoch": 0.5875109938434476,
      "grad_norm": 0.653301477432251,
      "learning_rate": 1.6582914572864322e-05,
      "loss": 1.0777,
      "step": 167
    },
    {
      "epoch": 0.5910290237467019,
      "grad_norm": 0.7567819952964783,
      "learning_rate": 1.608040201005025e-05,
      "loss": 1.1476,
      "step": 168
    },
    {
      "epoch": 0.594547053649956,
      "grad_norm": 0.7353144288063049,
      "learning_rate": 1.5577889447236183e-05,
      "loss": 1.0961,
      "step": 169
    },
    {
      "epoch": 0.5980650835532102,
      "grad_norm": 0.6990388035774231,
      "learning_rate": 1.507537688442211e-05,
      "loss": 1.1619,
      "step": 170
    },
    {
      "epoch": 0.6015831134564644,
      "grad_norm": 0.7032533288002014,
      "learning_rate": 1.457286432160804e-05,
      "loss": 1.0619,
      "step": 171
    },
    {
      "epoch": 0.6051011433597185,
      "grad_norm": 0.6197975873947144,
      "learning_rate": 1.407035175879397e-05,
      "loss": 1.0953,
      "step": 172
    },
    {
      "epoch": 0.6086191732629728,
      "grad_norm": 0.746258556842804,
      "learning_rate": 1.3567839195979901e-05,
      "loss": 1.1201,
      "step": 173
    },
    {
      "epoch": 0.6121372031662269,
      "grad_norm": 0.6444905996322632,
      "learning_rate": 1.306532663316583e-05,
      "loss": 1.0241,
      "step": 174
    },
    {
      "epoch": 0.6156552330694811,
      "grad_norm": 0.7037890553474426,
      "learning_rate": 1.2562814070351759e-05,
      "loss": 1.0739,
      "step": 175
    },
    {
      "epoch": 0.6191732629727352,
      "grad_norm": 0.7138697504997253,
      "learning_rate": 1.2060301507537688e-05,
      "loss": 1.1102,
      "step": 176
    },
    {
      "epoch": 0.6226912928759895,
      "grad_norm": 0.7358911037445068,
      "learning_rate": 1.1557788944723619e-05,
      "loss": 1.1945,
      "step": 177
    },
    {
      "epoch": 0.6262093227792436,
      "grad_norm": 0.7306352853775024,
      "learning_rate": 1.105527638190955e-05,
      "loss": 1.0887,
      "step": 178
    },
    {
      "epoch": 0.6297273526824978,
      "grad_norm": 0.7626399993896484,
      "learning_rate": 1.0552763819095479e-05,
      "loss": 1.0918,
      "step": 179
    },
    {
      "epoch": 0.633245382585752,
      "grad_norm": 0.7157562375068665,
      "learning_rate": 1.0050251256281408e-05,
      "loss": 1.0794,
      "step": 180
    },
    {
      "epoch": 0.6367634124890061,
      "grad_norm": 0.674655556678772,
      "learning_rate": 9.547738693467337e-06,
      "loss": 1.1632,
      "step": 181
    },
    {
      "epoch": 0.6402814423922604,
      "grad_norm": 0.7276845574378967,
      "learning_rate": 9.045226130653267e-06,
      "loss": 1.0664,
      "step": 182
    },
    {
      "epoch": 0.6437994722955145,
      "grad_norm": 0.7614260315895081,
      "learning_rate": 8.542713567839196e-06,
      "loss": 1.1185,
      "step": 183
    },
    {
      "epoch": 0.6473175021987687,
      "grad_norm": 0.691209614276886,
      "learning_rate": 8.040201005025125e-06,
      "loss": 1.0648,
      "step": 184
    },
    {
      "epoch": 0.6508355321020228,
      "grad_norm": 0.6736161708831787,
      "learning_rate": 7.537688442211055e-06,
      "loss": 1.11,
      "step": 185
    },
    {
      "epoch": 0.6543535620052771,
      "grad_norm": 0.6875973343849182,
      "learning_rate": 7.035175879396985e-06,
      "loss": 1.1085,
      "step": 186
    },
    {
      "epoch": 0.6578715919085312,
      "grad_norm": 0.6715053915977478,
      "learning_rate": 6.532663316582915e-06,
      "loss": 1.1391,
      "step": 187
    },
    {
      "epoch": 0.6613896218117854,
      "grad_norm": 0.7241913080215454,
      "learning_rate": 6.030150753768844e-06,
      "loss": 1.193,
      "step": 188
    },
    {
      "epoch": 0.6649076517150396,
      "grad_norm": 0.722939133644104,
      "learning_rate": 5.527638190954775e-06,
      "loss": 1.1218,
      "step": 189
    },
    {
      "epoch": 0.6684256816182937,
      "grad_norm": 0.7348630428314209,
      "learning_rate": 5.025125628140704e-06,
      "loss": 1.0771,
      "step": 190
    },
    {
      "epoch": 0.671943711521548,
      "grad_norm": 0.72852623462677,
      "learning_rate": 4.522613065326634e-06,
      "loss": 1.1196,
      "step": 191
    },
    {
      "epoch": 0.6754617414248021,
      "grad_norm": 0.7617117762565613,
      "learning_rate": 4.020100502512563e-06,
      "loss": 1.1313,
      "step": 192
    },
    {
      "epoch": 0.6789797713280563,
      "grad_norm": 0.8029654622077942,
      "learning_rate": 3.5175879396984926e-06,
      "loss": 1.1405,
      "step": 193
    },
    {
      "epoch": 0.6824978012313104,
      "grad_norm": 0.6885625123977661,
      "learning_rate": 3.015075376884422e-06,
      "loss": 1.0565,
      "step": 194
    },
    {
      "epoch": 0.6860158311345647,
      "grad_norm": 0.7057883143424988,
      "learning_rate": 2.512562814070352e-06,
      "loss": 1.1625,
      "step": 195
    },
    {
      "epoch": 0.6895338610378188,
      "grad_norm": 0.7429342269897461,
      "learning_rate": 2.0100502512562813e-06,
      "loss": 1.044,
      "step": 196
    },
    {
      "epoch": 0.693051890941073,
      "grad_norm": 0.7036694884300232,
      "learning_rate": 1.507537688442211e-06,
      "loss": 1.0991,
      "step": 197
    },
    {
      "epoch": 0.6965699208443272,
      "grad_norm": 0.6950182318687439,
      "learning_rate": 1.0050251256281407e-06,
      "loss": 1.1014,
      "step": 198
    },
    {
      "epoch": 0.7000879507475813,
      "grad_norm": 0.7009806632995605,
      "learning_rate": 5.025125628140703e-07,
      "loss": 1.1108,
      "step": 199
    },
    {
      "epoch": 0.7036059806508356,
      "grad_norm": 0.6382765769958496,
      "learning_rate": 0.0,
      "loss": 1.0479,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5191482454605824e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}