{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3813791623960146, "eval_steps": 500, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.7672395299501825e-05, "grad_norm": 18.2243709564209, "learning_rate": 1e-05, "loss": 4.1865, "step": 1 }, { "epoch": 9.534479059900365e-05, "grad_norm": 5.30253791809082, "learning_rate": 2e-05, "loss": 1.7704, "step": 2 }, { "epoch": 0.00014301718589850547, "grad_norm": 4.576740264892578, "learning_rate": 1.999999922855149e-05, "loss": 2.3123, "step": 3 }, { "epoch": 0.0001906895811980073, "grad_norm": 14.161197662353516, "learning_rate": 1.9999996914206083e-05, "loss": 2.3138, "step": 4 }, { "epoch": 0.0002383619764975091, "grad_norm": 4.212440490722656, "learning_rate": 1.9999993056964127e-05, "loss": 0.7261, "step": 5 }, { "epoch": 0.00028603437179701094, "grad_norm": 30.711898803710938, "learning_rate": 1.9999987656826223e-05, "loss": 2.9592, "step": 6 }, { "epoch": 0.00033370676709651274, "grad_norm": 17.282690048217773, "learning_rate": 1.9999980713793205e-05, "loss": 1.661, "step": 7 }, { "epoch": 0.0003813791623960146, "grad_norm": 18.53205108642578, "learning_rate": 1.9999972227866142e-05, "loss": 1.9224, "step": 8 }, { "epoch": 0.0004290515576955164, "grad_norm": 37.968788146972656, "learning_rate": 1.9999962199046343e-05, "loss": 1.9465, "step": 9 }, { "epoch": 0.0004767239529950182, "grad_norm": 2.2596421241760254, "learning_rate": 1.9999950627335357e-05, "loss": 1.1301, "step": 10 }, { "epoch": 0.0005243963482945201, "grad_norm": 4.727980136871338, "learning_rate": 1.9999937512734968e-05, "loss": 1.2272, "step": 11 }, { "epoch": 0.0005720687435940219, "grad_norm": 2.9042630195617676, "learning_rate": 1.9999922855247203e-05, "loss": 1.3935, "step": 12 }, { "epoch": 0.0006197411388935237, "grad_norm": 13.45541000366211, "learning_rate": 1.999990665487432e-05, "loss": 1.6829, "step": 13 }, { "epoch": 0.0006674135341930255, "grad_norm": 5.85912561416626, "learning_rate": 1.9999888911618815e-05, "loss": 0.889, "step": 14 }, { "epoch": 0.0007150859294925274, "grad_norm": 3.5429627895355225, "learning_rate": 1.9999869625483433e-05, "loss": 1.1139, "step": 15 }, { "epoch": 0.0007627583247920292, "grad_norm": 6.764256477355957, "learning_rate": 1.9999848796471148e-05, "loss": 1.4913, "step": 16 }, { "epoch": 0.000810430720091531, "grad_norm": 22.398008346557617, "learning_rate": 1.999982642458517e-05, "loss": 2.3193, "step": 17 }, { "epoch": 0.0008581031153910328, "grad_norm": 22.866546630859375, "learning_rate": 1.9999802509828955e-05, "loss": 1.3897, "step": 18 }, { "epoch": 0.0009057755106905346, "grad_norm": 5.639863967895508, "learning_rate": 1.999977705220619e-05, "loss": 1.4171, "step": 19 }, { "epoch": 0.0009534479059900364, "grad_norm": 3.3981564044952393, "learning_rate": 1.9999750051720802e-05, "loss": 1.0695, "step": 20 }, { "epoch": 0.0010011203012895383, "grad_norm": 12.594033241271973, "learning_rate": 1.9999721508376962e-05, "loss": 1.9061, "step": 21 }, { "epoch": 0.0010487926965890401, "grad_norm": 4.31193733215332, "learning_rate": 1.9999691422179066e-05, "loss": 1.7075, "step": 22 }, { "epoch": 0.001096465091888542, "grad_norm": 5.237526893615723, "learning_rate": 1.9999659793131764e-05, "loss": 1.7333, "step": 23 }, { "epoch": 0.0011441374871880437, "grad_norm": 5.530689716339111, "learning_rate": 1.9999626621239932e-05, "loss": 2.1986, "step": 24 }, { "epoch": 0.0011918098824875456, "grad_norm": 9.018251419067383, "learning_rate": 1.9999591906508686e-05, "loss": 0.506, "step": 25 }, { "epoch": 0.0012394822777870474, "grad_norm": 2.500774383544922, "learning_rate": 1.9999555648943387e-05, "loss": 0.8808, "step": 26 }, { "epoch": 0.0012871546730865492, "grad_norm": 4.035057544708252, "learning_rate": 1.9999517848549628e-05, "loss": 1.2847, "step": 27 }, { "epoch": 0.001334827068386051, "grad_norm": 4.671477794647217, "learning_rate": 1.9999478505333236e-05, "loss": 1.183, "step": 28 }, { "epoch": 0.001382499463685553, "grad_norm": 4.124985694885254, "learning_rate": 1.999943761930029e-05, "loss": 1.4332, "step": 29 }, { "epoch": 0.0014301718589850548, "grad_norm": 42.83099365234375, "learning_rate": 1.9999395190457093e-05, "loss": 2.6162, "step": 30 }, { "epoch": 0.0014778442542845566, "grad_norm": 15.408764839172363, "learning_rate": 1.999935121881019e-05, "loss": 1.5032, "step": 31 }, { "epoch": 0.0015255166495840584, "grad_norm": 4.75313663482666, "learning_rate": 1.999930570436637e-05, "loss": 1.1455, "step": 32 }, { "epoch": 0.0015731890448835602, "grad_norm": 2.505239486694336, "learning_rate": 1.9999258647132645e-05, "loss": 1.2099, "step": 33 }, { "epoch": 0.001620861440183062, "grad_norm": 2.3381943702697754, "learning_rate": 1.999921004711629e-05, "loss": 0.977, "step": 34 }, { "epoch": 0.0016685338354825638, "grad_norm": 4.889766693115234, "learning_rate": 1.9999159904324793e-05, "loss": 1.7548, "step": 35 }, { "epoch": 0.0017162062307820656, "grad_norm": 2.6877353191375732, "learning_rate": 1.9999108218765898e-05, "loss": 1.0806, "step": 36 }, { "epoch": 0.0017638786260815674, "grad_norm": 6.849803924560547, "learning_rate": 1.9999054990447576e-05, "loss": 1.5614, "step": 37 }, { "epoch": 0.0018115510213810692, "grad_norm": 2.4336485862731934, "learning_rate": 1.9999000219378036e-05, "loss": 0.7156, "step": 38 }, { "epoch": 0.001859223416680571, "grad_norm": 8.13830280303955, "learning_rate": 1.9998943905565733e-05, "loss": 1.1239, "step": 39 }, { "epoch": 0.0019068958119800728, "grad_norm": 1.631527066230774, "learning_rate": 1.9998886049019356e-05, "loss": 0.3686, "step": 40 }, { "epoch": 0.001954568207279575, "grad_norm": 39.150115966796875, "learning_rate": 1.999882664974783e-05, "loss": 1.166, "step": 41 }, { "epoch": 0.0020022406025790767, "grad_norm": 3.3509507179260254, "learning_rate": 1.999876570776032e-05, "loss": 0.5439, "step": 42 }, { "epoch": 0.0020499129978785785, "grad_norm": 6.455533981323242, "learning_rate": 1.999870322306623e-05, "loss": 1.2737, "step": 43 }, { "epoch": 0.0020975853931780803, "grad_norm": 4.2040205001831055, "learning_rate": 1.9998639195675197e-05, "loss": 0.9851, "step": 44 }, { "epoch": 0.002145257788477582, "grad_norm": 6.704083442687988, "learning_rate": 1.99985736255971e-05, "loss": 1.5998, "step": 45 }, { "epoch": 0.002192930183777084, "grad_norm": 2.616666078567505, "learning_rate": 1.9998506512842063e-05, "loss": 1.0957, "step": 46 }, { "epoch": 0.0022406025790765857, "grad_norm": 3.079622745513916, "learning_rate": 1.999843785742043e-05, "loss": 1.069, "step": 47 }, { "epoch": 0.0022882749743760875, "grad_norm": 1.947128415107727, "learning_rate": 1.9998367659342804e-05, "loss": 0.7664, "step": 48 }, { "epoch": 0.0023359473696755893, "grad_norm": 6.57976770401001, "learning_rate": 1.999829591862001e-05, "loss": 1.2904, "step": 49 }, { "epoch": 0.002383619764975091, "grad_norm": 4.031615257263184, "learning_rate": 1.9998222635263118e-05, "loss": 1.4492, "step": 50 }, { "epoch": 0.002431292160274593, "grad_norm": 4.04160213470459, "learning_rate": 1.9998147809283436e-05, "loss": 1.1642, "step": 51 }, { "epoch": 0.0024789645555740947, "grad_norm": 3.6480441093444824, "learning_rate": 1.9998071440692508e-05, "loss": 1.5053, "step": 52 }, { "epoch": 0.0025266369508735965, "grad_norm": 2.6463139057159424, "learning_rate": 1.9997993529502116e-05, "loss": 0.7771, "step": 53 }, { "epoch": 0.0025743093461730983, "grad_norm": 1.7695711851119995, "learning_rate": 1.9997914075724283e-05, "loss": 0.7755, "step": 54 }, { "epoch": 0.0026219817414726, "grad_norm": 2.9251327514648438, "learning_rate": 1.9997833079371263e-05, "loss": 1.116, "step": 55 }, { "epoch": 0.002669654136772102, "grad_norm": 1.9323079586029053, "learning_rate": 1.9997750540455562e-05, "loss": 0.4896, "step": 56 }, { "epoch": 0.0027173265320716037, "grad_norm": 1.6657203435897827, "learning_rate": 1.999766645898991e-05, "loss": 0.8084, "step": 57 }, { "epoch": 0.002764998927371106, "grad_norm": 3.26259183883667, "learning_rate": 1.9997580834987277e-05, "loss": 1.2315, "step": 58 }, { "epoch": 0.0028126713226706078, "grad_norm": 1.8272356986999512, "learning_rate": 1.9997493668460876e-05, "loss": 0.8441, "step": 59 }, { "epoch": 0.0028603437179701096, "grad_norm": 1.9205732345581055, "learning_rate": 1.9997404959424153e-05, "loss": 1.1089, "step": 60 }, { "epoch": 0.0029080161132696114, "grad_norm": 2.3295199871063232, "learning_rate": 1.9997314707890802e-05, "loss": 0.7738, "step": 61 }, { "epoch": 0.002955688508569113, "grad_norm": 3.4693026542663574, "learning_rate": 1.9997222913874745e-05, "loss": 1.2245, "step": 62 }, { "epoch": 0.003003360903868615, "grad_norm": 1.4701534509658813, "learning_rate": 1.999712957739014e-05, "loss": 0.829, "step": 63 }, { "epoch": 0.003051033299168117, "grad_norm": 3.402221918106079, "learning_rate": 1.9997034698451396e-05, "loss": 1.3311, "step": 64 }, { "epoch": 0.0030987056944676186, "grad_norm": 2.006373167037964, "learning_rate": 1.9996938277073146e-05, "loss": 0.8882, "step": 65 }, { "epoch": 0.0031463780897671204, "grad_norm": 1.5337969064712524, "learning_rate": 1.9996840313270268e-05, "loss": 0.8829, "step": 66 }, { "epoch": 0.003194050485066622, "grad_norm": 2.446826457977295, "learning_rate": 1.999674080705788e-05, "loss": 0.9319, "step": 67 }, { "epoch": 0.003241722880366124, "grad_norm": 5.346660614013672, "learning_rate": 1.9996639758451323e-05, "loss": 1.5478, "step": 68 }, { "epoch": 0.003289395275665626, "grad_norm": 6.304278373718262, "learning_rate": 1.9996537167466205e-05, "loss": 0.9192, "step": 69 }, { "epoch": 0.0033370676709651276, "grad_norm": 60.73661422729492, "learning_rate": 1.9996433034118342e-05, "loss": 1.0926, "step": 70 }, { "epoch": 0.0033847400662646294, "grad_norm": 5.363586902618408, "learning_rate": 1.9996327358423812e-05, "loss": 1.2102, "step": 71 }, { "epoch": 0.0034324124615641312, "grad_norm": 16.80270004272461, "learning_rate": 1.9996220140398907e-05, "loss": 0.9231, "step": 72 }, { "epoch": 0.003480084856863633, "grad_norm": 2.3622937202453613, "learning_rate": 1.9996111380060177e-05, "loss": 0.7771, "step": 73 }, { "epoch": 0.003527757252163135, "grad_norm": 3.1270358562469482, "learning_rate": 1.99960010774244e-05, "loss": 0.9786, "step": 74 }, { "epoch": 0.0035754296474626367, "grad_norm": 4.343189239501953, "learning_rate": 1.9995889232508595e-05, "loss": 0.8568, "step": 75 }, { "epoch": 0.0036231020427621385, "grad_norm": 1.7904366254806519, "learning_rate": 1.9995775845330022e-05, "loss": 0.6424, "step": 76 }, { "epoch": 0.0036707744380616403, "grad_norm": 2.8427789211273193, "learning_rate": 1.999566091590617e-05, "loss": 0.8684, "step": 77 }, { "epoch": 0.003718446833361142, "grad_norm": 1.8012349605560303, "learning_rate": 1.9995544444254777e-05, "loss": 0.8509, "step": 78 }, { "epoch": 0.003766119228660644, "grad_norm": 2.595200777053833, "learning_rate": 1.9995426430393808e-05, "loss": 0.9471, "step": 79 }, { "epoch": 0.0038137916239601457, "grad_norm": 4.616239547729492, "learning_rate": 1.9995306874341477e-05, "loss": 1.3471, "step": 80 }, { "epoch": 0.003861464019259648, "grad_norm": 6.4607343673706055, "learning_rate": 1.9995185776116225e-05, "loss": 1.5601, "step": 81 }, { "epoch": 0.00390913641455915, "grad_norm": 4.7140936851501465, "learning_rate": 1.9995063135736735e-05, "loss": 0.3913, "step": 82 }, { "epoch": 0.0039568088098586515, "grad_norm": 6.646099090576172, "learning_rate": 1.999493895322194e-05, "loss": 1.1003, "step": 83 }, { "epoch": 0.004004481205158153, "grad_norm": 2.548227310180664, "learning_rate": 1.9994813228590986e-05, "loss": 0.8912, "step": 84 }, { "epoch": 0.004052153600457655, "grad_norm": 4.441709995269775, "learning_rate": 1.999468596186328e-05, "loss": 0.8949, "step": 85 }, { "epoch": 0.004099825995757157, "grad_norm": 90.19632720947266, "learning_rate": 1.9994557153058456e-05, "loss": 3.2277, "step": 86 }, { "epoch": 0.004147498391056659, "grad_norm": 1.6170494556427002, "learning_rate": 1.9994426802196384e-05, "loss": 0.7079, "step": 87 }, { "epoch": 0.0041951707863561605, "grad_norm": 1.2660447359085083, "learning_rate": 1.999429490929718e-05, "loss": 0.2052, "step": 88 }, { "epoch": 0.004242843181655662, "grad_norm": 7.228525638580322, "learning_rate": 1.9994161474381198e-05, "loss": 1.056, "step": 89 }, { "epoch": 0.004290515576955164, "grad_norm": 2.0865042209625244, "learning_rate": 1.9994026497469016e-05, "loss": 0.9977, "step": 90 }, { "epoch": 0.004338187972254666, "grad_norm": 16.161222457885742, "learning_rate": 1.9993889978581462e-05, "loss": 0.9176, "step": 91 }, { "epoch": 0.004385860367554168, "grad_norm": 1.9300395250320435, "learning_rate": 1.9993751917739606e-05, "loss": 0.9029, "step": 92 }, { "epoch": 0.00443353276285367, "grad_norm": 2.299837350845337, "learning_rate": 1.999361231496474e-05, "loss": 1.1272, "step": 93 }, { "epoch": 0.004481205158153171, "grad_norm": 1.8956444263458252, "learning_rate": 1.9993471170278415e-05, "loss": 0.8191, "step": 94 }, { "epoch": 0.004528877553452673, "grad_norm": 1.5688307285308838, "learning_rate": 1.9993328483702393e-05, "loss": 0.867, "step": 95 }, { "epoch": 0.004576549948752175, "grad_norm": 2.408003807067871, "learning_rate": 1.9993184255258705e-05, "loss": 0.9292, "step": 96 }, { "epoch": 0.004624222344051677, "grad_norm": 3.870917797088623, "learning_rate": 1.9993038484969592e-05, "loss": 0.6028, "step": 97 }, { "epoch": 0.004671894739351179, "grad_norm": 1.854192852973938, "learning_rate": 1.9992891172857552e-05, "loss": 1.0358, "step": 98 }, { "epoch": 0.00471956713465068, "grad_norm": 4.5664591789245605, "learning_rate": 1.9992742318945307e-05, "loss": 0.6993, "step": 99 }, { "epoch": 0.004767239529950182, "grad_norm": 1.7928482294082642, "learning_rate": 1.999259192325583e-05, "loss": 0.7773, "step": 100 }, { "epoch": 0.004814911925249684, "grad_norm": 4.041717052459717, "learning_rate": 1.999243998581232e-05, "loss": 0.7349, "step": 101 }, { "epoch": 0.004862584320549186, "grad_norm": 2.914194107055664, "learning_rate": 1.9992286506638226e-05, "loss": 1.0398, "step": 102 }, { "epoch": 0.004910256715848688, "grad_norm": 1.4151759147644043, "learning_rate": 1.9992131485757223e-05, "loss": 0.7231, "step": 103 }, { "epoch": 0.004957929111148189, "grad_norm": 1.6251767873764038, "learning_rate": 1.9991974923193234e-05, "loss": 0.8153, "step": 104 }, { "epoch": 0.005005601506447691, "grad_norm": 2.348764181137085, "learning_rate": 1.9991816818970408e-05, "loss": 0.8527, "step": 105 }, { "epoch": 0.005053273901747193, "grad_norm": 2.0763232707977295, "learning_rate": 1.9991657173113144e-05, "loss": 0.8315, "step": 106 }, { "epoch": 0.005100946297046695, "grad_norm": 2.3932998180389404, "learning_rate": 1.999149598564607e-05, "loss": 0.6754, "step": 107 }, { "epoch": 0.005148618692346197, "grad_norm": 3.879709482192993, "learning_rate": 1.9991333256594062e-05, "loss": 1.2123, "step": 108 }, { "epoch": 0.0051962910876456984, "grad_norm": 1.5737781524658203, "learning_rate": 1.9991168985982223e-05, "loss": 0.6245, "step": 109 }, { "epoch": 0.0052439634829452, "grad_norm": 1.3460618257522583, "learning_rate": 1.9991003173835898e-05, "loss": 0.5251, "step": 110 }, { "epoch": 0.005291635878244702, "grad_norm": 1.8699207305908203, "learning_rate": 1.9990835820180665e-05, "loss": 0.632, "step": 111 }, { "epoch": 0.005339308273544204, "grad_norm": 3.712545394897461, "learning_rate": 1.9990666925042356e-05, "loss": 0.8691, "step": 112 }, { "epoch": 0.005386980668843706, "grad_norm": 4.666072845458984, "learning_rate": 1.9990496488447024e-05, "loss": 0.9637, "step": 113 }, { "epoch": 0.0054346530641432075, "grad_norm": 30.670461654663086, "learning_rate": 1.9990324510420966e-05, "loss": 0.3979, "step": 114 }, { "epoch": 0.005482325459442709, "grad_norm": 2.244295597076416, "learning_rate": 1.9990150990990717e-05, "loss": 0.7061, "step": 115 }, { "epoch": 0.005529997854742212, "grad_norm": 1.6453338861465454, "learning_rate": 1.998997593018305e-05, "loss": 0.7932, "step": 116 }, { "epoch": 0.005577670250041714, "grad_norm": 1.4517056941986084, "learning_rate": 1.998979932802497e-05, "loss": 0.7247, "step": 117 }, { "epoch": 0.0056253426453412156, "grad_norm": 1.9602091312408447, "learning_rate": 1.998962118454373e-05, "loss": 0.7311, "step": 118 }, { "epoch": 0.005673015040640717, "grad_norm": 2.30653715133667, "learning_rate": 1.9989441499766814e-05, "loss": 0.7201, "step": 119 }, { "epoch": 0.005720687435940219, "grad_norm": 1.9459872245788574, "learning_rate": 1.998926027372195e-05, "loss": 1.0758, "step": 120 }, { "epoch": 0.005768359831239721, "grad_norm": 6.566103458404541, "learning_rate": 1.998907750643709e-05, "loss": 1.4042, "step": 121 }, { "epoch": 0.005816032226539223, "grad_norm": 5.204738616943359, "learning_rate": 1.998889319794044e-05, "loss": 0.598, "step": 122 }, { "epoch": 0.005863704621838725, "grad_norm": 2.0690536499023438, "learning_rate": 1.998870734826044e-05, "loss": 1.0878, "step": 123 }, { "epoch": 0.005911377017138226, "grad_norm": 6.710436820983887, "learning_rate": 1.9988519957425754e-05, "loss": 1.5044, "step": 124 }, { "epoch": 0.005959049412437728, "grad_norm": 1.4392977952957153, "learning_rate": 1.9988331025465298e-05, "loss": 0.8179, "step": 125 }, { "epoch": 0.00600672180773723, "grad_norm": 3.523577928543091, "learning_rate": 1.998814055240823e-05, "loss": 1.1138, "step": 126 }, { "epoch": 0.006054394203036732, "grad_norm": 2.1186182498931885, "learning_rate": 1.9987948538283932e-05, "loss": 1.0725, "step": 127 }, { "epoch": 0.006102066598336234, "grad_norm": 2.1686816215515137, "learning_rate": 1.998775498312203e-05, "loss": 1.1928, "step": 128 }, { "epoch": 0.006149738993635735, "grad_norm": 1.6701370477676392, "learning_rate": 1.998755988695239e-05, "loss": 0.6307, "step": 129 }, { "epoch": 0.006197411388935237, "grad_norm": 1.5671544075012207, "learning_rate": 1.998736324980511e-05, "loss": 0.4937, "step": 130 }, { "epoch": 0.006245083784234739, "grad_norm": 1.2592086791992188, "learning_rate": 1.998716507171053e-05, "loss": 0.5423, "step": 131 }, { "epoch": 0.006292756179534241, "grad_norm": 2.133965253829956, "learning_rate": 1.9986965352699225e-05, "loss": 0.9762, "step": 132 }, { "epoch": 0.006340428574833743, "grad_norm": 2.673516273498535, "learning_rate": 1.9986764092802015e-05, "loss": 0.8452, "step": 133 }, { "epoch": 0.006388100970133244, "grad_norm": 2.2198140621185303, "learning_rate": 1.998656129204995e-05, "loss": 1.1588, "step": 134 }, { "epoch": 0.006435773365432746, "grad_norm": 2.097494125366211, "learning_rate": 1.998635695047432e-05, "loss": 0.7734, "step": 135 }, { "epoch": 0.006483445760732248, "grad_norm": 81.89967346191406, "learning_rate": 1.998615106810665e-05, "loss": 1.4706, "step": 136 }, { "epoch": 0.00653111815603175, "grad_norm": 4.292656898498535, "learning_rate": 1.9985943644978705e-05, "loss": 0.9319, "step": 137 }, { "epoch": 0.006578790551331252, "grad_norm": 2.725637912750244, "learning_rate": 1.9985734681122494e-05, "loss": 0.8047, "step": 138 }, { "epoch": 0.0066264629466307534, "grad_norm": 4.857945442199707, "learning_rate": 1.9985524176570255e-05, "loss": 1.1442, "step": 139 }, { "epoch": 0.006674135341930255, "grad_norm": 14.465811729431152, "learning_rate": 1.9985312131354467e-05, "loss": 0.9426, "step": 140 }, { "epoch": 0.006721807737229757, "grad_norm": 2.2471907138824463, "learning_rate": 1.9985098545507843e-05, "loss": 0.6077, "step": 141 }, { "epoch": 0.006769480132529259, "grad_norm": 3.9241859912872314, "learning_rate": 1.9984883419063343e-05, "loss": 1.3541, "step": 142 }, { "epoch": 0.006817152527828761, "grad_norm": 2.1206214427948, "learning_rate": 1.9984666752054152e-05, "loss": 0.7002, "step": 143 }, { "epoch": 0.0068648249231282625, "grad_norm": 1.9665372371673584, "learning_rate": 1.998444854451371e-05, "loss": 0.9509, "step": 144 }, { "epoch": 0.006912497318427764, "grad_norm": 2.6428842544555664, "learning_rate": 1.9984228796475672e-05, "loss": 1.2551, "step": 145 }, { "epoch": 0.006960169713727266, "grad_norm": 1.7541123628616333, "learning_rate": 1.9984007507973952e-05, "loss": 0.7907, "step": 146 }, { "epoch": 0.007007842109026768, "grad_norm": 1.826343059539795, "learning_rate": 1.9983784679042685e-05, "loss": 1.2216, "step": 147 }, { "epoch": 0.00705551450432627, "grad_norm": 1.8894635438919067, "learning_rate": 1.998356030971626e-05, "loss": 0.9068, "step": 148 }, { "epoch": 0.0071031868996257715, "grad_norm": 5.640469074249268, "learning_rate": 1.9983334400029285e-05, "loss": 0.9861, "step": 149 }, { "epoch": 0.007150859294925273, "grad_norm": 2.9244861602783203, "learning_rate": 1.998310695001662e-05, "loss": 0.9534, "step": 150 }, { "epoch": 0.007198531690224775, "grad_norm": 1.9021021127700806, "learning_rate": 1.9982877959713366e-05, "loss": 0.8217, "step": 151 }, { "epoch": 0.007246204085524277, "grad_norm": 1.8560668230056763, "learning_rate": 1.9982647429154843e-05, "loss": 0.8384, "step": 152 }, { "epoch": 0.007293876480823779, "grad_norm": 1.9364761114120483, "learning_rate": 1.9982415358376623e-05, "loss": 1.0554, "step": 153 }, { "epoch": 0.0073415488761232805, "grad_norm": 5.366004467010498, "learning_rate": 1.9982181747414508e-05, "loss": 1.3124, "step": 154 }, { "epoch": 0.007389221271422782, "grad_norm": 2.3039839267730713, "learning_rate": 1.998194659630455e-05, "loss": 0.9447, "step": 155 }, { "epoch": 0.007436893666722284, "grad_norm": 3.2765309810638428, "learning_rate": 1.9981709905083026e-05, "loss": 1.0979, "step": 156 }, { "epoch": 0.007484566062021786, "grad_norm": 2.7673983573913574, "learning_rate": 1.998147167378645e-05, "loss": 0.9274, "step": 157 }, { "epoch": 0.007532238457321288, "grad_norm": 1.6349414587020874, "learning_rate": 1.9981231902451595e-05, "loss": 0.8723, "step": 158 }, { "epoch": 0.0075799108526207895, "grad_norm": 1.4526450634002686, "learning_rate": 1.9980990591115437e-05, "loss": 0.8086, "step": 159 }, { "epoch": 0.007627583247920291, "grad_norm": 2.0796725749969482, "learning_rate": 1.9980747739815217e-05, "loss": 0.7309, "step": 160 }, { "epoch": 0.007675255643219793, "grad_norm": 1.8629250526428223, "learning_rate": 1.99805033485884e-05, "loss": 0.943, "step": 161 }, { "epoch": 0.007722928038519296, "grad_norm": 3.6882741451263428, "learning_rate": 1.99802574174727e-05, "loss": 1.3126, "step": 162 }, { "epoch": 0.007770600433818798, "grad_norm": 1.8233232498168945, "learning_rate": 1.9980009946506053e-05, "loss": 0.9655, "step": 163 }, { "epoch": 0.0078182728291183, "grad_norm": 1.6087697744369507, "learning_rate": 1.9979760935726647e-05, "loss": 0.7855, "step": 164 }, { "epoch": 0.007865945224417801, "grad_norm": 2.790602684020996, "learning_rate": 1.99795103851729e-05, "loss": 1.0484, "step": 165 }, { "epoch": 0.007913617619717303, "grad_norm": 1.838362455368042, "learning_rate": 1.997925829488347e-05, "loss": 0.7105, "step": 166 }, { "epoch": 0.007961290015016805, "grad_norm": 9.126262664794922, "learning_rate": 1.9979004664897252e-05, "loss": 1.3776, "step": 167 }, { "epoch": 0.008008962410316307, "grad_norm": 2.9251487255096436, "learning_rate": 1.9978749495253378e-05, "loss": 0.9252, "step": 168 }, { "epoch": 0.008056634805615808, "grad_norm": 7.816552639007568, "learning_rate": 1.9978492785991216e-05, "loss": 1.0545, "step": 169 }, { "epoch": 0.00810430720091531, "grad_norm": 3.071329355239868, "learning_rate": 1.997823453715038e-05, "loss": 1.0586, "step": 170 }, { "epoch": 0.008151979596214812, "grad_norm": 2.9898781776428223, "learning_rate": 1.9977974748770708e-05, "loss": 0.8904, "step": 171 }, { "epoch": 0.008199651991514314, "grad_norm": 1.7111653089523315, "learning_rate": 1.9977713420892287e-05, "loss": 0.6194, "step": 172 }, { "epoch": 0.008247324386813816, "grad_norm": 2.2094221115112305, "learning_rate": 1.9977450553555434e-05, "loss": 1.0304, "step": 173 }, { "epoch": 0.008294996782113317, "grad_norm": 2.236921548843384, "learning_rate": 1.9977186146800707e-05, "loss": 1.0642, "step": 174 }, { "epoch": 0.00834266917741282, "grad_norm": 1.5356533527374268, "learning_rate": 1.997692020066891e-05, "loss": 0.7384, "step": 175 }, { "epoch": 0.008390341572712321, "grad_norm": 1.6347790956497192, "learning_rate": 1.997665271520106e-05, "loss": 0.916, "step": 176 }, { "epoch": 0.008438013968011823, "grad_norm": 2.9392073154449463, "learning_rate": 1.997638369043844e-05, "loss": 1.1304, "step": 177 }, { "epoch": 0.008485686363311325, "grad_norm": 3.1310737133026123, "learning_rate": 1.9976113126422553e-05, "loss": 1.297, "step": 178 }, { "epoch": 0.008533358758610827, "grad_norm": 1.5780115127563477, "learning_rate": 1.997584102319514e-05, "loss": 0.694, "step": 179 }, { "epoch": 0.008581031153910328, "grad_norm": 3.4661989212036133, "learning_rate": 1.9975567380798195e-05, "loss": 1.4794, "step": 180 }, { "epoch": 0.00862870354920983, "grad_norm": 1.9178662300109863, "learning_rate": 1.997529219927393e-05, "loss": 0.6866, "step": 181 }, { "epoch": 0.008676375944509332, "grad_norm": 2.1521716117858887, "learning_rate": 1.9975015478664802e-05, "loss": 0.9348, "step": 182 }, { "epoch": 0.008724048339808834, "grad_norm": 2.1723456382751465, "learning_rate": 1.9974737219013513e-05, "loss": 1.0957, "step": 183 }, { "epoch": 0.008771720735108336, "grad_norm": 2.081552505493164, "learning_rate": 1.9974457420362986e-05, "loss": 0.66, "step": 184 }, { "epoch": 0.008819393130407837, "grad_norm": 0.943188488483429, "learning_rate": 1.9974176082756397e-05, "loss": 0.3902, "step": 185 }, { "epoch": 0.00886706552570734, "grad_norm": 1.5033038854599, "learning_rate": 1.9973893206237154e-05, "loss": 0.3994, "step": 186 }, { "epoch": 0.008914737921006841, "grad_norm": 1.846114158630371, "learning_rate": 1.99736087908489e-05, "loss": 0.9327, "step": 187 }, { "epoch": 0.008962410316306343, "grad_norm": 1.4994962215423584, "learning_rate": 1.9973322836635517e-05, "loss": 0.6214, "step": 188 }, { "epoch": 0.009010082711605845, "grad_norm": 2.1606719493865967, "learning_rate": 1.9973035343641127e-05, "loss": 0.7063, "step": 189 }, { "epoch": 0.009057755106905346, "grad_norm": 1.8727173805236816, "learning_rate": 1.9972746311910086e-05, "loss": 0.8753, "step": 190 }, { "epoch": 0.009105427502204848, "grad_norm": 3.3258538246154785, "learning_rate": 1.997245574148699e-05, "loss": 0.8881, "step": 191 }, { "epoch": 0.00915309989750435, "grad_norm": 1.8385637998580933, "learning_rate": 1.9972163632416666e-05, "loss": 0.9557, "step": 192 }, { "epoch": 0.009200772292803852, "grad_norm": 3.061753034591675, "learning_rate": 1.997186998474419e-05, "loss": 1.3106, "step": 193 }, { "epoch": 0.009248444688103354, "grad_norm": 3.6829543113708496, "learning_rate": 1.9971574798514862e-05, "loss": 0.7096, "step": 194 }, { "epoch": 0.009296117083402855, "grad_norm": 3.9840047359466553, "learning_rate": 1.997127807377423e-05, "loss": 0.6233, "step": 195 }, { "epoch": 0.009343789478702357, "grad_norm": 1.8962153196334839, "learning_rate": 1.9970979810568082e-05, "loss": 0.7132, "step": 196 }, { "epoch": 0.009391461874001859, "grad_norm": 1.8558872938156128, "learning_rate": 1.9970680008942425e-05, "loss": 0.8061, "step": 197 }, { "epoch": 0.00943913426930136, "grad_norm": 3.1907379627227783, "learning_rate": 1.9970378668943522e-05, "loss": 0.7062, "step": 198 }, { "epoch": 0.009486806664600863, "grad_norm": 3.308406352996826, "learning_rate": 1.9970075790617865e-05, "loss": 0.8985, "step": 199 }, { "epoch": 0.009534479059900364, "grad_norm": 1.7010345458984375, "learning_rate": 1.9969771374012186e-05, "loss": 0.8402, "step": 200 }, { "epoch": 0.009582151455199866, "grad_norm": 1.7002482414245605, "learning_rate": 1.996946541917345e-05, "loss": 0.7318, "step": 201 }, { "epoch": 0.009629823850499368, "grad_norm": 2.6917359828948975, "learning_rate": 1.996915792614887e-05, "loss": 1.0584, "step": 202 }, { "epoch": 0.00967749624579887, "grad_norm": 1.425291657447815, "learning_rate": 1.9968848894985884e-05, "loss": 0.5775, "step": 203 }, { "epoch": 0.009725168641098372, "grad_norm": 2.461945056915283, "learning_rate": 1.996853832573217e-05, "loss": 0.7251, "step": 204 }, { "epoch": 0.009772841036397873, "grad_norm": 3.177412748336792, "learning_rate": 1.996822621843565e-05, "loss": 0.8469, "step": 205 }, { "epoch": 0.009820513431697375, "grad_norm": 2.5107016563415527, "learning_rate": 1.9967912573144476e-05, "loss": 1.1968, "step": 206 }, { "epoch": 0.009868185826996877, "grad_norm": 3.7646501064300537, "learning_rate": 1.9967597389907043e-05, "loss": 1.5301, "step": 207 }, { "epoch": 0.009915858222296379, "grad_norm": 2.7186832427978516, "learning_rate": 1.9967280668771977e-05, "loss": 0.8036, "step": 208 }, { "epoch": 0.00996353061759588, "grad_norm": 1.9227368831634521, "learning_rate": 1.996696240978815e-05, "loss": 1.1097, "step": 209 }, { "epoch": 0.010011203012895382, "grad_norm": 1.5649751424789429, "learning_rate": 1.9966642613004664e-05, "loss": 0.8102, "step": 210 }, { "epoch": 0.010058875408194884, "grad_norm": 4.642465591430664, "learning_rate": 1.9966321278470856e-05, "loss": 0.4425, "step": 211 }, { "epoch": 0.010106547803494386, "grad_norm": 15.997318267822266, "learning_rate": 1.9965998406236306e-05, "loss": 0.9282, "step": 212 }, { "epoch": 0.010154220198793888, "grad_norm": 3.695793867111206, "learning_rate": 1.9965673996350836e-05, "loss": 1.3906, "step": 213 }, { "epoch": 0.01020189259409339, "grad_norm": 2.3274667263031006, "learning_rate": 1.9965348048864495e-05, "loss": 0.4954, "step": 214 }, { "epoch": 0.010249564989392891, "grad_norm": 5.78277063369751, "learning_rate": 1.9965020563827574e-05, "loss": 0.9774, "step": 215 }, { "epoch": 0.010297237384692393, "grad_norm": 1.8216116428375244, "learning_rate": 1.99646915412906e-05, "loss": 1.106, "step": 216 }, { "epoch": 0.010344909779991895, "grad_norm": 1.2086372375488281, "learning_rate": 1.996436098130433e-05, "loss": 0.6089, "step": 217 }, { "epoch": 0.010392582175291397, "grad_norm": 1.2094674110412598, "learning_rate": 1.9964028883919783e-05, "loss": 0.6924, "step": 218 }, { "epoch": 0.010440254570590899, "grad_norm": 1.7790417671203613, "learning_rate": 1.9963695249188185e-05, "loss": 0.7085, "step": 219 }, { "epoch": 0.0104879269658904, "grad_norm": 1.6460531949996948, "learning_rate": 1.9963360077161015e-05, "loss": 0.7923, "step": 220 }, { "epoch": 0.010535599361189902, "grad_norm": 3.221555471420288, "learning_rate": 1.996302336788999e-05, "loss": 1.0185, "step": 221 }, { "epoch": 0.010583271756489404, "grad_norm": 3.7141506671905518, "learning_rate": 1.9962685121427055e-05, "loss": 0.5976, "step": 222 }, { "epoch": 0.010630944151788906, "grad_norm": 2.564969301223755, "learning_rate": 1.9962345337824404e-05, "loss": 0.5263, "step": 223 }, { "epoch": 0.010678616547088408, "grad_norm": 3.3699798583984375, "learning_rate": 1.996200401713446e-05, "loss": 0.5765, "step": 224 }, { "epoch": 0.01072628894238791, "grad_norm": 1.953573226928711, "learning_rate": 1.9961661159409885e-05, "loss": 1.0328, "step": 225 }, { "epoch": 0.010773961337687411, "grad_norm": 2.079005479812622, "learning_rate": 1.9961316764703583e-05, "loss": 0.9661, "step": 226 }, { "epoch": 0.010821633732986913, "grad_norm": 1.4127886295318604, "learning_rate": 1.996097083306868e-05, "loss": 0.8102, "step": 227 }, { "epoch": 0.010869306128286415, "grad_norm": 5.974330902099609, "learning_rate": 1.9960623364558555e-05, "loss": 0.6324, "step": 228 }, { "epoch": 0.010916978523585917, "grad_norm": 6.547445297241211, "learning_rate": 1.9960274359226824e-05, "loss": 0.3014, "step": 229 }, { "epoch": 0.010964650918885419, "grad_norm": 3.30557918548584, "learning_rate": 1.9959923817127326e-05, "loss": 0.681, "step": 230 }, { "epoch": 0.011012323314184922, "grad_norm": 4.872275352478027, "learning_rate": 1.9959571738314153e-05, "loss": 1.187, "step": 231 }, { "epoch": 0.011059995709484424, "grad_norm": 3.176126480102539, "learning_rate": 1.9959218122841624e-05, "loss": 0.5215, "step": 232 }, { "epoch": 0.011107668104783926, "grad_norm": 15.39186954498291, "learning_rate": 1.99588629707643e-05, "loss": 1.0125, "step": 233 }, { "epoch": 0.011155340500083427, "grad_norm": 5.650630950927734, "learning_rate": 1.995850628213697e-05, "loss": 1.4208, "step": 234 }, { "epoch": 0.01120301289538293, "grad_norm": 5.125373363494873, "learning_rate": 1.995814805701468e-05, "loss": 0.4974, "step": 235 }, { "epoch": 0.011250685290682431, "grad_norm": 3.3534553050994873, "learning_rate": 1.9957788295452693e-05, "loss": 0.7318, "step": 236 }, { "epoch": 0.011298357685981933, "grad_norm": 1.9568144083023071, "learning_rate": 1.9957426997506518e-05, "loss": 0.8864, "step": 237 }, { "epoch": 0.011346030081281435, "grad_norm": 2.102306842803955, "learning_rate": 1.9957064163231896e-05, "loss": 0.8255, "step": 238 }, { "epoch": 0.011393702476580937, "grad_norm": 1.5542311668395996, "learning_rate": 1.9956699792684812e-05, "loss": 0.8405, "step": 239 }, { "epoch": 0.011441374871880438, "grad_norm": 2.799365758895874, "learning_rate": 1.9956333885921488e-05, "loss": 0.5947, "step": 240 }, { "epoch": 0.01148904726717994, "grad_norm": 5.494631290435791, "learning_rate": 1.995596644299837e-05, "loss": 1.1096, "step": 241 }, { "epoch": 0.011536719662479442, "grad_norm": 3.8105666637420654, "learning_rate": 1.9955597463972157e-05, "loss": 1.1307, "step": 242 }, { "epoch": 0.011584392057778944, "grad_norm": 1.5918257236480713, "learning_rate": 1.9955226948899782e-05, "loss": 0.6992, "step": 243 }, { "epoch": 0.011632064453078446, "grad_norm": 2.063843250274658, "learning_rate": 1.995485489783841e-05, "loss": 0.6998, "step": 244 }, { "epoch": 0.011679736848377947, "grad_norm": 1.2544373273849487, "learning_rate": 1.9954481310845437e-05, "loss": 0.6251, "step": 245 }, { "epoch": 0.01172740924367745, "grad_norm": 3.872133731842041, "learning_rate": 1.9954106187978507e-05, "loss": 1.2245, "step": 246 }, { "epoch": 0.011775081638976951, "grad_norm": 5.4492106437683105, "learning_rate": 1.9953729529295504e-05, "loss": 0.8282, "step": 247 }, { "epoch": 0.011822754034276453, "grad_norm": 5.580592155456543, "learning_rate": 1.9953351334854537e-05, "loss": 1.1402, "step": 248 }, { "epoch": 0.011870426429575955, "grad_norm": 1.815596580505371, "learning_rate": 1.9952971604713963e-05, "loss": 1.3938, "step": 249 }, { "epoch": 0.011918098824875456, "grad_norm": 19.696008682250977, "learning_rate": 1.995259033893236e-05, "loss": 0.5092, "step": 250 }, { "epoch": 0.011965771220174958, "grad_norm": 1.3136402368545532, "learning_rate": 1.9952207537568563e-05, "loss": 0.3227, "step": 251 }, { "epoch": 0.01201344361547446, "grad_norm": 4.327956199645996, "learning_rate": 1.9951823200681628e-05, "loss": 1.2593, "step": 252 }, { "epoch": 0.012061116010773962, "grad_norm": 1.2282180786132812, "learning_rate": 1.995143732833086e-05, "loss": 0.3331, "step": 253 }, { "epoch": 0.012108788406073464, "grad_norm": 2.6325550079345703, "learning_rate": 1.995104992057579e-05, "loss": 0.5151, "step": 254 }, { "epoch": 0.012156460801372965, "grad_norm": 2.4884750843048096, "learning_rate": 1.9950660977476196e-05, "loss": 0.6286, "step": 255 }, { "epoch": 0.012204133196672467, "grad_norm": 6.9264655113220215, "learning_rate": 1.9950270499092083e-05, "loss": 0.7175, "step": 256 }, { "epoch": 0.012251805591971969, "grad_norm": 2.0482723712921143, "learning_rate": 1.99498784854837e-05, "loss": 0.8469, "step": 257 }, { "epoch": 0.01229947798727147, "grad_norm": 3.3197004795074463, "learning_rate": 1.994948493671153e-05, "loss": 0.6618, "step": 258 }, { "epoch": 0.012347150382570973, "grad_norm": 1.6784948110580444, "learning_rate": 1.9949089852836297e-05, "loss": 0.9257, "step": 259 }, { "epoch": 0.012394822777870474, "grad_norm": 3.88934326171875, "learning_rate": 1.994869323391895e-05, "loss": 0.8041, "step": 260 }, { "epoch": 0.012442495173169976, "grad_norm": 14.89253044128418, "learning_rate": 1.9948295080020696e-05, "loss": 1.1327, "step": 261 }, { "epoch": 0.012490167568469478, "grad_norm": 2.403461217880249, "learning_rate": 1.9947895391202955e-05, "loss": 0.9128, "step": 262 }, { "epoch": 0.01253783996376898, "grad_norm": 3.1274027824401855, "learning_rate": 1.9947494167527398e-05, "loss": 0.9623, "step": 263 }, { "epoch": 0.012585512359068482, "grad_norm": 2.862778425216675, "learning_rate": 1.9947091409055933e-05, "loss": 1.8018, "step": 264 }, { "epoch": 0.012633184754367983, "grad_norm": 2.613210678100586, "learning_rate": 1.9946687115850696e-05, "loss": 1.1379, "step": 265 }, { "epoch": 0.012680857149667485, "grad_norm": 2.2426257133483887, "learning_rate": 1.994628128797407e-05, "loss": 0.5249, "step": 266 }, { "epoch": 0.012728529544966987, "grad_norm": 1.4983394145965576, "learning_rate": 1.9945873925488667e-05, "loss": 0.9067, "step": 267 }, { "epoch": 0.012776201940266489, "grad_norm": 1.6490086317062378, "learning_rate": 1.9945465028457337e-05, "loss": 0.9039, "step": 268 }, { "epoch": 0.01282387433556599, "grad_norm": 1.835891604423523, "learning_rate": 1.9945054596943177e-05, "loss": 0.8388, "step": 269 }, { "epoch": 0.012871546730865492, "grad_norm": 1.9135104417800903, "learning_rate": 1.9944642631009507e-05, "loss": 0.7224, "step": 270 }, { "epoch": 0.012919219126164994, "grad_norm": 1.6289864778518677, "learning_rate": 1.9944229130719885e-05, "loss": 0.778, "step": 271 }, { "epoch": 0.012966891521464496, "grad_norm": 1.39491868019104, "learning_rate": 1.9943814096138116e-05, "loss": 0.7036, "step": 272 }, { "epoch": 0.013014563916763998, "grad_norm": 2.2777199745178223, "learning_rate": 1.9943397527328233e-05, "loss": 0.6181, "step": 273 }, { "epoch": 0.0130622363120635, "grad_norm": 3.851339340209961, "learning_rate": 1.9942979424354506e-05, "loss": 0.4939, "step": 274 }, { "epoch": 0.013109908707363001, "grad_norm": 2.2557308673858643, "learning_rate": 1.9942559787281453e-05, "loss": 0.518, "step": 275 }, { "epoch": 0.013157581102662503, "grad_norm": 2.261568307876587, "learning_rate": 1.994213861617381e-05, "loss": 0.9162, "step": 276 }, { "epoch": 0.013205253497962005, "grad_norm": 1.7167255878448486, "learning_rate": 1.9941715911096563e-05, "loss": 0.7062, "step": 277 }, { "epoch": 0.013252925893261507, "grad_norm": 4.64394474029541, "learning_rate": 1.9941291672114928e-05, "loss": 1.1303, "step": 278 }, { "epoch": 0.013300598288561009, "grad_norm": 1.3798185586929321, "learning_rate": 1.9940865899294367e-05, "loss": 0.4313, "step": 279 }, { "epoch": 0.01334827068386051, "grad_norm": 1.8215328454971313, "learning_rate": 1.9940438592700568e-05, "loss": 0.7731, "step": 280 }, { "epoch": 0.013395943079160012, "grad_norm": 2.367222309112549, "learning_rate": 1.9940009752399462e-05, "loss": 0.358, "step": 281 }, { "epoch": 0.013443615474459514, "grad_norm": 3.4352400302886963, "learning_rate": 1.993957937845721e-05, "loss": 0.5845, "step": 282 }, { "epoch": 0.013491287869759016, "grad_norm": 3.778900384902954, "learning_rate": 1.993914747094022e-05, "loss": 0.9492, "step": 283 }, { "epoch": 0.013538960265058518, "grad_norm": 1.7573721408843994, "learning_rate": 1.9938714029915128e-05, "loss": 0.8815, "step": 284 }, { "epoch": 0.01358663266035802, "grad_norm": 2.7110109329223633, "learning_rate": 1.9938279055448814e-05, "loss": 1.0121, "step": 285 }, { "epoch": 0.013634305055657521, "grad_norm": 1.6014918088912964, "learning_rate": 1.993784254760838e-05, "loss": 0.6007, "step": 286 }, { "epoch": 0.013681977450957023, "grad_norm": 2.3209445476531982, "learning_rate": 1.9937404506461187e-05, "loss": 0.7738, "step": 287 }, { "epoch": 0.013729649846256525, "grad_norm": 1.5373198986053467, "learning_rate": 1.993696493207481e-05, "loss": 0.6923, "step": 288 }, { "epoch": 0.013777322241556027, "grad_norm": 3.516077756881714, "learning_rate": 1.9936523824517074e-05, "loss": 1.1835, "step": 289 }, { "epoch": 0.013824994636855529, "grad_norm": 1.887039065361023, "learning_rate": 1.993608118385604e-05, "loss": 1.3008, "step": 290 }, { "epoch": 0.01387266703215503, "grad_norm": 1.584210991859436, "learning_rate": 1.993563701016e-05, "loss": 0.6803, "step": 291 }, { "epoch": 0.013920339427454532, "grad_norm": 2.204319477081299, "learning_rate": 1.993519130349749e-05, "loss": 0.5211, "step": 292 }, { "epoch": 0.013968011822754034, "grad_norm": 1.4902498722076416, "learning_rate": 1.9934744063937273e-05, "loss": 0.8433, "step": 293 }, { "epoch": 0.014015684218053536, "grad_norm": 1.8343721628189087, "learning_rate": 1.9934295291548357e-05, "loss": 0.7311, "step": 294 }, { "epoch": 0.014063356613353038, "grad_norm": 2.033521890640259, "learning_rate": 1.9933844986399977e-05, "loss": 0.892, "step": 295 }, { "epoch": 0.01411102900865254, "grad_norm": 1.453253149986267, "learning_rate": 1.9933393148561616e-05, "loss": 0.9422, "step": 296 }, { "epoch": 0.014158701403952041, "grad_norm": 2.0837976932525635, "learning_rate": 1.9932939778102985e-05, "loss": 0.9896, "step": 297 }, { "epoch": 0.014206373799251543, "grad_norm": 2.659848213195801, "learning_rate": 1.9932484875094036e-05, "loss": 0.9624, "step": 298 }, { "epoch": 0.014254046194551045, "grad_norm": 3.5798799991607666, "learning_rate": 1.9932028439604958e-05, "loss": 0.6397, "step": 299 }, { "epoch": 0.014301718589850547, "grad_norm": 4.200873851776123, "learning_rate": 1.993157047170617e-05, "loss": 1.1005, "step": 300 }, { "epoch": 0.014349390985150048, "grad_norm": 2.4092493057250977, "learning_rate": 1.9931110971468332e-05, "loss": 0.6148, "step": 301 }, { "epoch": 0.01439706338044955, "grad_norm": 1.8427973985671997, "learning_rate": 1.9930649938962344e-05, "loss": 0.7859, "step": 302 }, { "epoch": 0.014444735775749052, "grad_norm": 3.5022478103637695, "learning_rate": 1.9930187374259338e-05, "loss": 0.7012, "step": 303 }, { "epoch": 0.014492408171048554, "grad_norm": 2.042717218399048, "learning_rate": 1.992972327743068e-05, "loss": 0.7806, "step": 304 }, { "epoch": 0.014540080566348056, "grad_norm": 2.3085291385650635, "learning_rate": 1.9929257648547976e-05, "loss": 0.7839, "step": 305 }, { "epoch": 0.014587752961647557, "grad_norm": 2.7668402194976807, "learning_rate": 1.992879048768307e-05, "loss": 0.7127, "step": 306 }, { "epoch": 0.01463542535694706, "grad_norm": 4.952498912811279, "learning_rate": 1.9928321794908035e-05, "loss": 1.4776, "step": 307 }, { "epoch": 0.014683097752246561, "grad_norm": 3.2454190254211426, "learning_rate": 1.992785157029519e-05, "loss": 0.8308, "step": 308 }, { "epoch": 0.014730770147546063, "grad_norm": 2.0695793628692627, "learning_rate": 1.9927379813917087e-05, "loss": 0.678, "step": 309 }, { "epoch": 0.014778442542845565, "grad_norm": 26.816200256347656, "learning_rate": 1.992690652584651e-05, "loss": 1.2626, "step": 310 }, { "epoch": 0.014826114938145066, "grad_norm": 6.531989574432373, "learning_rate": 1.992643170615648e-05, "loss": 2.0021, "step": 311 }, { "epoch": 0.014873787333444568, "grad_norm": 3.469572067260742, "learning_rate": 1.9925955354920265e-05, "loss": 0.8604, "step": 312 }, { "epoch": 0.01492145972874407, "grad_norm": 2.2434329986572266, "learning_rate": 1.9925477472211356e-05, "loss": 0.8491, "step": 313 }, { "epoch": 0.014969132124043572, "grad_norm": 3.034557342529297, "learning_rate": 1.9924998058103483e-05, "loss": 1.5331, "step": 314 }, { "epoch": 0.015016804519343074, "grad_norm": 1.6433539390563965, "learning_rate": 1.9924517112670617e-05, "loss": 0.7086, "step": 315 }, { "epoch": 0.015064476914642575, "grad_norm": 1.574570894241333, "learning_rate": 1.9924034635986968e-05, "loss": 0.7385, "step": 316 }, { "epoch": 0.015112149309942077, "grad_norm": 3.6537985801696777, "learning_rate": 1.992355062812697e-05, "loss": 0.8102, "step": 317 }, { "epoch": 0.015159821705241579, "grad_norm": 2.8240246772766113, "learning_rate": 1.99230650891653e-05, "loss": 0.8773, "step": 318 }, { "epoch": 0.015207494100541081, "grad_norm": 2.016932487487793, "learning_rate": 1.9922578019176878e-05, "loss": 0.8061, "step": 319 }, { "epoch": 0.015255166495840583, "grad_norm": 1.351644515991211, "learning_rate": 1.992208941823685e-05, "loss": 0.7521, "step": 320 }, { "epoch": 0.015302838891140084, "grad_norm": 1.7155174016952515, "learning_rate": 1.99215992864206e-05, "loss": 0.8785, "step": 321 }, { "epoch": 0.015350511286439586, "grad_norm": 2.364560842514038, "learning_rate": 1.9921107623803757e-05, "loss": 0.7609, "step": 322 }, { "epoch": 0.01539818368173909, "grad_norm": 2.028265953063965, "learning_rate": 1.9920614430462173e-05, "loss": 1.0037, "step": 323 }, { "epoch": 0.015445856077038592, "grad_norm": 2.755866527557373, "learning_rate": 1.9920119706471944e-05, "loss": 0.6831, "step": 324 }, { "epoch": 0.015493528472338093, "grad_norm": 1.3015339374542236, "learning_rate": 1.9919623451909402e-05, "loss": 0.6163, "step": 325 }, { "epoch": 0.015541200867637595, "grad_norm": 2.2389609813690186, "learning_rate": 1.9919125666851115e-05, "loss": 0.4706, "step": 326 }, { "epoch": 0.015588873262937097, "grad_norm": 2.114124298095703, "learning_rate": 1.9918626351373885e-05, "loss": 0.8427, "step": 327 }, { "epoch": 0.0156365456582366, "grad_norm": 3.373286008834839, "learning_rate": 1.991812550555475e-05, "loss": 0.8408, "step": 328 }, { "epoch": 0.0156842180535361, "grad_norm": 5.1343913078308105, "learning_rate": 1.9917623129470985e-05, "loss": 1.4614, "step": 329 }, { "epoch": 0.015731890448835602, "grad_norm": 1.4291434288024902, "learning_rate": 1.99171192232001e-05, "loss": 0.5323, "step": 330 }, { "epoch": 0.015779562844135103, "grad_norm": 1.5069769620895386, "learning_rate": 1.9916613786819856e-05, "loss": 0.7634, "step": 331 }, { "epoch": 0.015827235239434606, "grad_norm": 1.5601898431777954, "learning_rate": 1.991610682040822e-05, "loss": 0.8062, "step": 332 }, { "epoch": 0.015874907634734106, "grad_norm": 2.073275327682495, "learning_rate": 1.9915598324043415e-05, "loss": 1.0942, "step": 333 }, { "epoch": 0.01592258003003361, "grad_norm": 3.8580596446990967, "learning_rate": 1.9915088297803905e-05, "loss": 1.0956, "step": 334 }, { "epoch": 0.01597025242533311, "grad_norm": 2.2373106479644775, "learning_rate": 1.9914576741768373e-05, "loss": 0.8886, "step": 335 }, { "epoch": 0.016017924820632613, "grad_norm": 1.4013243913650513, "learning_rate": 1.991406365601575e-05, "loss": 0.8137, "step": 336 }, { "epoch": 0.016065597215932113, "grad_norm": 1.8992758989334106, "learning_rate": 1.99135490406252e-05, "loss": 0.9923, "step": 337 }, { "epoch": 0.016113269611231617, "grad_norm": 2.6627261638641357, "learning_rate": 1.9913032895676126e-05, "loss": 0.5913, "step": 338 }, { "epoch": 0.016160942006531117, "grad_norm": 2.40303897857666, "learning_rate": 1.9912515221248157e-05, "loss": 0.861, "step": 339 }, { "epoch": 0.01620861440183062, "grad_norm": 1.565407633781433, "learning_rate": 1.9911996017421168e-05, "loss": 0.7945, "step": 340 }, { "epoch": 0.01625628679713012, "grad_norm": 1.8255527019500732, "learning_rate": 1.991147528427527e-05, "loss": 0.9651, "step": 341 }, { "epoch": 0.016303959192429624, "grad_norm": 3.6867518424987793, "learning_rate": 1.9910953021890802e-05, "loss": 0.7636, "step": 342 }, { "epoch": 0.016351631587729124, "grad_norm": 1.8927056789398193, "learning_rate": 1.9910429230348348e-05, "loss": 0.7816, "step": 343 }, { "epoch": 0.016399303983028628, "grad_norm": 2.944861650466919, "learning_rate": 1.9909903909728722e-05, "loss": 1.089, "step": 344 }, { "epoch": 0.016446976378328128, "grad_norm": 3.066411018371582, "learning_rate": 1.9909377060112973e-05, "loss": 1.1133, "step": 345 }, { "epoch": 0.01649464877362763, "grad_norm": 7.740360260009766, "learning_rate": 1.990884868158239e-05, "loss": 1.1003, "step": 346 }, { "epoch": 0.01654232116892713, "grad_norm": 4.239732265472412, "learning_rate": 1.9908318774218498e-05, "loss": 0.8084, "step": 347 }, { "epoch": 0.016589993564226635, "grad_norm": 2.522313117980957, "learning_rate": 1.9907787338103054e-05, "loss": 0.9969, "step": 348 }, { "epoch": 0.016637665959526135, "grad_norm": 2.228370189666748, "learning_rate": 1.9907254373318054e-05, "loss": 0.4467, "step": 349 }, { "epoch": 0.01668533835482564, "grad_norm": 1.6454826593399048, "learning_rate": 1.9906719879945733e-05, "loss": 0.9234, "step": 350 }, { "epoch": 0.01673301075012514, "grad_norm": 3.1763508319854736, "learning_rate": 1.990618385806855e-05, "loss": 0.5725, "step": 351 }, { "epoch": 0.016780683145424642, "grad_norm": 2.071532964706421, "learning_rate": 1.9905646307769212e-05, "loss": 0.6487, "step": 352 }, { "epoch": 0.016828355540724142, "grad_norm": 2.020214796066284, "learning_rate": 1.990510722913066e-05, "loss": 0.9457, "step": 353 }, { "epoch": 0.016876027936023646, "grad_norm": 2.9650723934173584, "learning_rate": 1.9904566622236064e-05, "loss": 1.0136, "step": 354 }, { "epoch": 0.016923700331323146, "grad_norm": 48.10707473754883, "learning_rate": 1.9904024487168835e-05, "loss": 0.7203, "step": 355 }, { "epoch": 0.01697137272662265, "grad_norm": 3.6566269397735596, "learning_rate": 1.9903480824012617e-05, "loss": 0.6368, "step": 356 }, { "epoch": 0.01701904512192215, "grad_norm": 2.7905960083007812, "learning_rate": 1.9902935632851296e-05, "loss": 0.6772, "step": 357 }, { "epoch": 0.017066717517221653, "grad_norm": 1.4937987327575684, "learning_rate": 1.9902388913768987e-05, "loss": 0.7707, "step": 358 }, { "epoch": 0.017114389912521153, "grad_norm": 2.0287539958953857, "learning_rate": 1.9901840666850045e-05, "loss": 1.1541, "step": 359 }, { "epoch": 0.017162062307820657, "grad_norm": 1.8322936296463013, "learning_rate": 1.9901290892179056e-05, "loss": 0.7406, "step": 360 }, { "epoch": 0.017209734703120157, "grad_norm": 2.9989206790924072, "learning_rate": 1.9900739589840846e-05, "loss": 0.6528, "step": 361 }, { "epoch": 0.01725740709841966, "grad_norm": 2.342498540878296, "learning_rate": 1.9900186759920475e-05, "loss": 0.8852, "step": 362 }, { "epoch": 0.01730507949371916, "grad_norm": 2.000173330307007, "learning_rate": 1.9899632402503242e-05, "loss": 0.7001, "step": 363 }, { "epoch": 0.017352751889018664, "grad_norm": 3.8555989265441895, "learning_rate": 1.9899076517674674e-05, "loss": 1.108, "step": 364 }, { "epoch": 0.017400424284318164, "grad_norm": 2.1533634662628174, "learning_rate": 1.9898519105520537e-05, "loss": 0.746, "step": 365 }, { "epoch": 0.017448096679617667, "grad_norm": 1.9343632459640503, "learning_rate": 1.989796016612684e-05, "loss": 0.7821, "step": 366 }, { "epoch": 0.017495769074917168, "grad_norm": 1.9814963340759277, "learning_rate": 1.989739969957982e-05, "loss": 0.9643, "step": 367 }, { "epoch": 0.01754344147021667, "grad_norm": 2.1211135387420654, "learning_rate": 1.9896837705965946e-05, "loss": 0.7709, "step": 368 }, { "epoch": 0.017591113865516175, "grad_norm": 3.451181173324585, "learning_rate": 1.9896274185371934e-05, "loss": 1.0099, "step": 369 }, { "epoch": 0.017638786260815675, "grad_norm": 1.3221015930175781, "learning_rate": 1.9895709137884727e-05, "loss": 0.8877, "step": 370 }, { "epoch": 0.017686458656115178, "grad_norm": 2.346433401107788, "learning_rate": 1.989514256359151e-05, "loss": 0.9619, "step": 371 }, { "epoch": 0.01773413105141468, "grad_norm": 2.7488558292388916, "learning_rate": 1.9894574462579688e-05, "loss": 1.2417, "step": 372 }, { "epoch": 0.017781803446714182, "grad_norm": 2.576575517654419, "learning_rate": 1.9894004834936924e-05, "loss": 1.07, "step": 373 }, { "epoch": 0.017829475842013682, "grad_norm": 1.4878088235855103, "learning_rate": 1.9893433680751105e-05, "loss": 0.6564, "step": 374 }, { "epoch": 0.017877148237313185, "grad_norm": 1.783627986907959, "learning_rate": 1.989286100011035e-05, "loss": 0.4787, "step": 375 }, { "epoch": 0.017924820632612685, "grad_norm": 1.658076524734497, "learning_rate": 1.9892286793103018e-05, "loss": 0.6887, "step": 376 }, { "epoch": 0.01797249302791219, "grad_norm": 1.9088085889816284, "learning_rate": 1.9891711059817705e-05, "loss": 1.2357, "step": 377 }, { "epoch": 0.01802016542321169, "grad_norm": 1.7909817695617676, "learning_rate": 1.9891133800343245e-05, "loss": 0.5678, "step": 378 }, { "epoch": 0.018067837818511193, "grad_norm": 2.40408992767334, "learning_rate": 1.989055501476869e-05, "loss": 0.7371, "step": 379 }, { "epoch": 0.018115510213810693, "grad_norm": 2.4356093406677246, "learning_rate": 1.9889974703183354e-05, "loss": 1.007, "step": 380 }, { "epoch": 0.018163182609110196, "grad_norm": 1.4276548624038696, "learning_rate": 1.988939286567677e-05, "loss": 0.8721, "step": 381 }, { "epoch": 0.018210855004409696, "grad_norm": 1.4086828231811523, "learning_rate": 1.9888809502338706e-05, "loss": 0.9469, "step": 382 }, { "epoch": 0.0182585273997092, "grad_norm": 3.424661636352539, "learning_rate": 1.988822461325917e-05, "loss": 0.2993, "step": 383 }, { "epoch": 0.0183061997950087, "grad_norm": 1.3863457441329956, "learning_rate": 1.988763819852841e-05, "loss": 1.0035, "step": 384 }, { "epoch": 0.018353872190308203, "grad_norm": 1.849429726600647, "learning_rate": 1.9887050258236894e-05, "loss": 0.6809, "step": 385 }, { "epoch": 0.018401544585607704, "grad_norm": 1.7745966911315918, "learning_rate": 1.988646079247534e-05, "loss": 0.6901, "step": 386 }, { "epoch": 0.018449216980907207, "grad_norm": 1.6295166015625, "learning_rate": 1.9885869801334697e-05, "loss": 0.8989, "step": 387 }, { "epoch": 0.018496889376206707, "grad_norm": 6.392057418823242, "learning_rate": 1.988527728490615e-05, "loss": 0.5829, "step": 388 }, { "epoch": 0.01854456177150621, "grad_norm": 6.590545177459717, "learning_rate": 1.9884683243281117e-05, "loss": 1.1739, "step": 389 }, { "epoch": 0.01859223416680571, "grad_norm": 1.6761903762817383, "learning_rate": 1.988408767655125e-05, "loss": 0.9709, "step": 390 }, { "epoch": 0.018639906562105214, "grad_norm": 1.8666620254516602, "learning_rate": 1.9883490584808443e-05, "loss": 0.5188, "step": 391 }, { "epoch": 0.018687578957404714, "grad_norm": 1.5060175657272339, "learning_rate": 1.9882891968144816e-05, "loss": 0.7627, "step": 392 }, { "epoch": 0.018735251352704218, "grad_norm": 2.627070903778076, "learning_rate": 1.9882291826652735e-05, "loss": 0.5643, "step": 393 }, { "epoch": 0.018782923748003718, "grad_norm": 3.0624523162841797, "learning_rate": 1.988169016042479e-05, "loss": 0.9113, "step": 394 }, { "epoch": 0.01883059614330322, "grad_norm": 1.9177439212799072, "learning_rate": 1.988108696955382e-05, "loss": 0.9923, "step": 395 }, { "epoch": 0.01887826853860272, "grad_norm": 1.3925445079803467, "learning_rate": 1.988048225413288e-05, "loss": 1.0391, "step": 396 }, { "epoch": 0.018925940933902225, "grad_norm": 1.8818647861480713, "learning_rate": 1.9879876014255283e-05, "loss": 0.8457, "step": 397 }, { "epoch": 0.018973613329201725, "grad_norm": 15.9513578414917, "learning_rate": 1.9879268250014558e-05, "loss": 1.0194, "step": 398 }, { "epoch": 0.01902128572450123, "grad_norm": 1.5039842128753662, "learning_rate": 1.987865896150448e-05, "loss": 0.3371, "step": 399 }, { "epoch": 0.01906895811980073, "grad_norm": 1.3381415605545044, "learning_rate": 1.9878048148819054e-05, "loss": 0.7649, "step": 400 }, { "epoch": 0.019116630515100232, "grad_norm": 1.379574179649353, "learning_rate": 1.9877435812052522e-05, "loss": 0.6547, "step": 401 }, { "epoch": 0.019164302910399732, "grad_norm": 1.1685925722122192, "learning_rate": 1.9876821951299362e-05, "loss": 0.2342, "step": 402 }, { "epoch": 0.019211975305699236, "grad_norm": 12.052519798278809, "learning_rate": 1.9876206566654285e-05, "loss": 0.3376, "step": 403 }, { "epoch": 0.019259647700998736, "grad_norm": 1.135711908340454, "learning_rate": 1.9875589658212244e-05, "loss": 0.4995, "step": 404 }, { "epoch": 0.01930732009629824, "grad_norm": 1.9455606937408447, "learning_rate": 1.9874971226068417e-05, "loss": 0.7776, "step": 405 }, { "epoch": 0.01935499249159774, "grad_norm": 1.3894046545028687, "learning_rate": 1.987435127031822e-05, "loss": 0.3752, "step": 406 }, { "epoch": 0.019402664886897243, "grad_norm": 2.79687237739563, "learning_rate": 1.987372979105731e-05, "loss": 0.9352, "step": 407 }, { "epoch": 0.019450337282196743, "grad_norm": 2.8667871952056885, "learning_rate": 1.987310678838157e-05, "loss": 1.3754, "step": 408 }, { "epoch": 0.019498009677496247, "grad_norm": 1.4679111242294312, "learning_rate": 1.9872482262387128e-05, "loss": 0.5814, "step": 409 }, { "epoch": 0.019545682072795747, "grad_norm": 1.8989754915237427, "learning_rate": 1.987185621317034e-05, "loss": 0.8112, "step": 410 }, { "epoch": 0.01959335446809525, "grad_norm": 2.3282973766326904, "learning_rate": 1.98712286408278e-05, "loss": 1.0023, "step": 411 }, { "epoch": 0.01964102686339475, "grad_norm": 1.5745048522949219, "learning_rate": 1.9870599545456333e-05, "loss": 0.8833, "step": 412 }, { "epoch": 0.019688699258694254, "grad_norm": 4.777143478393555, "learning_rate": 1.9869968927153005e-05, "loss": 1.5177, "step": 413 }, { "epoch": 0.019736371653993754, "grad_norm": 2.690695285797119, "learning_rate": 1.986933678601511e-05, "loss": 1.1467, "step": 414 }, { "epoch": 0.019784044049293258, "grad_norm": 2.625542402267456, "learning_rate": 1.9868703122140186e-05, "loss": 0.4268, "step": 415 }, { "epoch": 0.019831716444592758, "grad_norm": 1.2323603630065918, "learning_rate": 1.9868067935625997e-05, "loss": 0.5527, "step": 416 }, { "epoch": 0.01987938883989226, "grad_norm": 8.139171600341797, "learning_rate": 1.9867431226570546e-05, "loss": 1.183, "step": 417 }, { "epoch": 0.01992706123519176, "grad_norm": 1.7020472288131714, "learning_rate": 1.9866792995072073e-05, "loss": 0.9839, "step": 418 }, { "epoch": 0.019974733630491265, "grad_norm": 1.2943341732025146, "learning_rate": 1.986615324122905e-05, "loss": 0.4531, "step": 419 }, { "epoch": 0.020022406025790765, "grad_norm": 1.480353832244873, "learning_rate": 1.986551196514018e-05, "loss": 0.8439, "step": 420 }, { "epoch": 0.02007007842109027, "grad_norm": 2.9385576248168945, "learning_rate": 1.9864869166904412e-05, "loss": 0.7533, "step": 421 }, { "epoch": 0.02011775081638977, "grad_norm": 1.6832324266433716, "learning_rate": 1.986422484662092e-05, "loss": 0.8662, "step": 422 }, { "epoch": 0.020165423211689272, "grad_norm": 1.4676250219345093, "learning_rate": 1.9863579004389115e-05, "loss": 0.9441, "step": 423 }, { "epoch": 0.020213095606988772, "grad_norm": 1.6102615594863892, "learning_rate": 1.9862931640308648e-05, "loss": 0.651, "step": 424 }, { "epoch": 0.020260768002288276, "grad_norm": 3.2582168579101562, "learning_rate": 1.9862282754479394e-05, "loss": 1.0099, "step": 425 }, { "epoch": 0.020308440397587776, "grad_norm": 1.4158923625946045, "learning_rate": 1.9861632347001474e-05, "loss": 0.7244, "step": 426 }, { "epoch": 0.02035611279288728, "grad_norm": 2.4489970207214355, "learning_rate": 1.986098041797524e-05, "loss": 0.9127, "step": 427 }, { "epoch": 0.02040378518818678, "grad_norm": 1.574802279472351, "learning_rate": 1.986032696750127e-05, "loss": 0.9991, "step": 428 }, { "epoch": 0.020451457583486283, "grad_norm": 5.254917144775391, "learning_rate": 1.9859671995680395e-05, "loss": 0.4747, "step": 429 }, { "epoch": 0.020499129978785783, "grad_norm": 1.2564741373062134, "learning_rate": 1.9859015502613666e-05, "loss": 0.5081, "step": 430 }, { "epoch": 0.020546802374085286, "grad_norm": 1.4141570329666138, "learning_rate": 1.9858357488402374e-05, "loss": 1.0678, "step": 431 }, { "epoch": 0.020594474769384787, "grad_norm": 1.6442327499389648, "learning_rate": 1.985769795314804e-05, "loss": 0.6662, "step": 432 }, { "epoch": 0.02064214716468429, "grad_norm": 1.1709163188934326, "learning_rate": 1.985703689695243e-05, "loss": 0.635, "step": 433 }, { "epoch": 0.02068981955998379, "grad_norm": 1.7995291948318481, "learning_rate": 1.9856374319917528e-05, "loss": 0.6015, "step": 434 }, { "epoch": 0.020737491955283294, "grad_norm": 1.5249440670013428, "learning_rate": 1.9855710222145576e-05, "loss": 0.604, "step": 435 }, { "epoch": 0.020785164350582794, "grad_norm": 5.639601230621338, "learning_rate": 1.985504460373903e-05, "loss": 0.7747, "step": 436 }, { "epoch": 0.020832836745882297, "grad_norm": 1.7618529796600342, "learning_rate": 1.9854377464800586e-05, "loss": 0.8751, "step": 437 }, { "epoch": 0.020880509141181797, "grad_norm": 3.759244680404663, "learning_rate": 1.9853708805433182e-05, "loss": 1.1403, "step": 438 }, { "epoch": 0.0209281815364813, "grad_norm": 1.4865508079528809, "learning_rate": 1.985303862573998e-05, "loss": 0.4212, "step": 439 }, { "epoch": 0.0209758539317808, "grad_norm": 1.1901158094406128, "learning_rate": 1.9852366925824393e-05, "loss": 0.49, "step": 440 }, { "epoch": 0.021023526327080305, "grad_norm": 1.6764826774597168, "learning_rate": 1.985169370579004e-05, "loss": 0.8045, "step": 441 }, { "epoch": 0.021071198722379805, "grad_norm": 1.5856084823608398, "learning_rate": 1.9851018965740806e-05, "loss": 0.6927, "step": 442 }, { "epoch": 0.021118871117679308, "grad_norm": 3.426955223083496, "learning_rate": 1.9850342705780788e-05, "loss": 1.0903, "step": 443 }, { "epoch": 0.021166543512978808, "grad_norm": 1.8891679048538208, "learning_rate": 1.984966492601433e-05, "loss": 1.126, "step": 444 }, { "epoch": 0.021214215908278312, "grad_norm": 2.5558974742889404, "learning_rate": 1.984898562654601e-05, "loss": 1.1514, "step": 445 }, { "epoch": 0.021261888303577812, "grad_norm": 1.8330786228179932, "learning_rate": 1.984830480748063e-05, "loss": 0.8547, "step": 446 }, { "epoch": 0.021309560698877315, "grad_norm": 2.5390944480895996, "learning_rate": 1.9847622468923236e-05, "loss": 1.0727, "step": 447 }, { "epoch": 0.021357233094176815, "grad_norm": 8.325060844421387, "learning_rate": 1.9846938610979104e-05, "loss": 0.5672, "step": 448 }, { "epoch": 0.02140490548947632, "grad_norm": 5.6124043464660645, "learning_rate": 1.984625323375375e-05, "loss": 0.9308, "step": 449 }, { "epoch": 0.02145257788477582, "grad_norm": 2.8409430980682373, "learning_rate": 1.984556633735292e-05, "loss": 0.817, "step": 450 }, { "epoch": 0.021500250280075323, "grad_norm": 2.745298385620117, "learning_rate": 1.9844877921882593e-05, "loss": 1.0862, "step": 451 }, { "epoch": 0.021547922675374823, "grad_norm": 1.320275902748108, "learning_rate": 1.9844187987448984e-05, "loss": 0.6576, "step": 452 }, { "epoch": 0.021595595070674326, "grad_norm": 1.8817437887191772, "learning_rate": 1.9843496534158543e-05, "loss": 1.1033, "step": 453 }, { "epoch": 0.021643267465973826, "grad_norm": 1.684211015701294, "learning_rate": 1.984280356211796e-05, "loss": 1.0972, "step": 454 }, { "epoch": 0.02169093986127333, "grad_norm": 1.3767002820968628, "learning_rate": 1.9842109071434143e-05, "loss": 0.9344, "step": 455 }, { "epoch": 0.02173861225657283, "grad_norm": 2.4937679767608643, "learning_rate": 1.9841413062214253e-05, "loss": 0.764, "step": 456 }, { "epoch": 0.021786284651872333, "grad_norm": 1.7059651613235474, "learning_rate": 1.9840715534565677e-05, "loss": 0.8478, "step": 457 }, { "epoch": 0.021833957047171833, "grad_norm": 1.4310321807861328, "learning_rate": 1.984001648859603e-05, "loss": 0.7032, "step": 458 }, { "epoch": 0.021881629442471337, "grad_norm": 1.4786193370819092, "learning_rate": 1.9839315924413174e-05, "loss": 0.7041, "step": 459 }, { "epoch": 0.021929301837770837, "grad_norm": 1.9900134801864624, "learning_rate": 1.9838613842125193e-05, "loss": 0.298, "step": 460 }, { "epoch": 0.02197697423307034, "grad_norm": 1.3430676460266113, "learning_rate": 1.9837910241840418e-05, "loss": 0.459, "step": 461 }, { "epoch": 0.022024646628369844, "grad_norm": 1.3910713195800781, "learning_rate": 1.9837205123667404e-05, "loss": 0.6807, "step": 462 }, { "epoch": 0.022072319023669344, "grad_norm": 2.3101654052734375, "learning_rate": 1.983649848771494e-05, "loss": 0.4995, "step": 463 }, { "epoch": 0.022119991418968848, "grad_norm": 1.8263473510742188, "learning_rate": 1.9835790334092054e-05, "loss": 0.5541, "step": 464 }, { "epoch": 0.022167663814268348, "grad_norm": 1.215735912322998, "learning_rate": 1.9835080662908013e-05, "loss": 0.6172, "step": 465 }, { "epoch": 0.02221533620956785, "grad_norm": 7.440627574920654, "learning_rate": 1.9834369474272307e-05, "loss": 0.5222, "step": 466 }, { "epoch": 0.02226300860486735, "grad_norm": 1.4894155263900757, "learning_rate": 1.983365676829466e-05, "loss": 0.5461, "step": 467 }, { "epoch": 0.022310681000166855, "grad_norm": 1.6445199251174927, "learning_rate": 1.9832942545085047e-05, "loss": 0.6309, "step": 468 }, { "epoch": 0.022358353395466355, "grad_norm": 5.644519805908203, "learning_rate": 1.9832226804753658e-05, "loss": 0.8083, "step": 469 }, { "epoch": 0.02240602579076586, "grad_norm": 4.1564812660217285, "learning_rate": 1.9831509547410922e-05, "loss": 1.613, "step": 470 }, { "epoch": 0.02245369818606536, "grad_norm": 1.6427197456359863, "learning_rate": 1.9830790773167513e-05, "loss": 1.0581, "step": 471 }, { "epoch": 0.022501370581364862, "grad_norm": 5.313671588897705, "learning_rate": 1.983007048213432e-05, "loss": 0.5421, "step": 472 }, { "epoch": 0.022549042976664362, "grad_norm": 1.734161376953125, "learning_rate": 1.9829348674422488e-05, "loss": 0.9461, "step": 473 }, { "epoch": 0.022596715371963866, "grad_norm": 2.149139642715454, "learning_rate": 1.982862535014337e-05, "loss": 0.6638, "step": 474 }, { "epoch": 0.022644387767263366, "grad_norm": 2.1884307861328125, "learning_rate": 1.9827900509408583e-05, "loss": 0.9279, "step": 475 }, { "epoch": 0.02269206016256287, "grad_norm": 1.2274335622787476, "learning_rate": 1.9827174152329952e-05, "loss": 0.8374, "step": 476 }, { "epoch": 0.02273973255786237, "grad_norm": 1.497936725616455, "learning_rate": 1.9826446279019547e-05, "loss": 0.9778, "step": 477 }, { "epoch": 0.022787404953161873, "grad_norm": 1.2841224670410156, "learning_rate": 1.9825716889589678e-05, "loss": 0.5275, "step": 478 }, { "epoch": 0.022835077348461373, "grad_norm": 1.717750072479248, "learning_rate": 1.9824985984152877e-05, "loss": 0.687, "step": 479 }, { "epoch": 0.022882749743760877, "grad_norm": 3.095571517944336, "learning_rate": 1.9824253562821915e-05, "loss": 0.4429, "step": 480 }, { "epoch": 0.022930422139060377, "grad_norm": 1.8677228689193726, "learning_rate": 1.98235196257098e-05, "loss": 0.8407, "step": 481 }, { "epoch": 0.02297809453435988, "grad_norm": 1.6857329607009888, "learning_rate": 1.982278417292977e-05, "loss": 0.8925, "step": 482 }, { "epoch": 0.02302576692965938, "grad_norm": 1.5242455005645752, "learning_rate": 1.98220472045953e-05, "loss": 0.6419, "step": 483 }, { "epoch": 0.023073439324958884, "grad_norm": 2.109609603881836, "learning_rate": 1.9821308720820086e-05, "loss": 1.2286, "step": 484 }, { "epoch": 0.023121111720258384, "grad_norm": 1.9739404916763306, "learning_rate": 1.9820568721718082e-05, "loss": 0.9189, "step": 485 }, { "epoch": 0.023168784115557887, "grad_norm": 2.0055463314056396, "learning_rate": 1.9819827207403458e-05, "loss": 1.1078, "step": 486 }, { "epoch": 0.023216456510857388, "grad_norm": 2.384929656982422, "learning_rate": 1.9819084177990615e-05, "loss": 0.5499, "step": 487 }, { "epoch": 0.02326412890615689, "grad_norm": 4.874091625213623, "learning_rate": 1.9818339633594203e-05, "loss": 0.9564, "step": 488 }, { "epoch": 0.02331180130145639, "grad_norm": 9.882637023925781, "learning_rate": 1.9817593574329096e-05, "loss": 1.3838, "step": 489 }, { "epoch": 0.023359473696755895, "grad_norm": 1.7439748048782349, "learning_rate": 1.9816846000310403e-05, "loss": 0.8163, "step": 490 }, { "epoch": 0.023407146092055395, "grad_norm": 1.8394163846969604, "learning_rate": 1.981609691165346e-05, "loss": 0.775, "step": 491 }, { "epoch": 0.0234548184873549, "grad_norm": 2.234813928604126, "learning_rate": 1.9815346308473857e-05, "loss": 1.2283, "step": 492 }, { "epoch": 0.0235024908826544, "grad_norm": 1.298910140991211, "learning_rate": 1.9814594190887394e-05, "loss": 0.5041, "step": 493 }, { "epoch": 0.023550163277953902, "grad_norm": 2.6392388343811035, "learning_rate": 1.9813840559010116e-05, "loss": 0.8703, "step": 494 }, { "epoch": 0.023597835673253402, "grad_norm": 3.647735118865967, "learning_rate": 1.9813085412958307e-05, "loss": 0.9736, "step": 495 }, { "epoch": 0.023645508068552906, "grad_norm": 1.6532799005508423, "learning_rate": 1.9812328752848474e-05, "loss": 0.6817, "step": 496 }, { "epoch": 0.023693180463852406, "grad_norm": 1.6136335134506226, "learning_rate": 1.981157057879736e-05, "loss": 1.0334, "step": 497 }, { "epoch": 0.02374085285915191, "grad_norm": 2.0291175842285156, "learning_rate": 1.9810810890921943e-05, "loss": 0.6898, "step": 498 }, { "epoch": 0.02378852525445141, "grad_norm": 1.876394510269165, "learning_rate": 1.981004968933944e-05, "loss": 0.8983, "step": 499 }, { "epoch": 0.023836197649750913, "grad_norm": 1.8519922494888306, "learning_rate": 1.9809286974167296e-05, "loss": 0.762, "step": 500 }, { "epoch": 0.023883870045050413, "grad_norm": 1.415461778640747, "learning_rate": 1.9808522745523186e-05, "loss": 0.5235, "step": 501 }, { "epoch": 0.023931542440349916, "grad_norm": 2.0346901416778564, "learning_rate": 1.9807757003525022e-05, "loss": 0.9127, "step": 502 }, { "epoch": 0.023979214835649416, "grad_norm": 1.8818345069885254, "learning_rate": 1.9806989748290954e-05, "loss": 0.7813, "step": 503 }, { "epoch": 0.02402688723094892, "grad_norm": 4.557500839233398, "learning_rate": 1.980622097993936e-05, "loss": 0.6555, "step": 504 }, { "epoch": 0.02407455962624842, "grad_norm": 3.821648359298706, "learning_rate": 1.9805450698588856e-05, "loss": 0.8257, "step": 505 }, { "epoch": 0.024122232021547924, "grad_norm": 2.6666078567504883, "learning_rate": 1.9804678904358284e-05, "loss": 1.1717, "step": 506 }, { "epoch": 0.024169904416847424, "grad_norm": 2.618318557739258, "learning_rate": 1.9803905597366726e-05, "loss": 0.9528, "step": 507 }, { "epoch": 0.024217576812146927, "grad_norm": 2.3368048667907715, "learning_rate": 1.9803130777733494e-05, "loss": 1.1081, "step": 508 }, { "epoch": 0.024265249207446427, "grad_norm": 4.741371154785156, "learning_rate": 1.9802354445578137e-05, "loss": 1.2031, "step": 509 }, { "epoch": 0.02431292160274593, "grad_norm": 2.2617404460906982, "learning_rate": 1.9801576601020435e-05, "loss": 0.7604, "step": 510 }, { "epoch": 0.02436059399804543, "grad_norm": 6.097237586975098, "learning_rate": 1.98007972441804e-05, "loss": 0.7608, "step": 511 }, { "epoch": 0.024408266393344934, "grad_norm": 1.6063331365585327, "learning_rate": 1.9800016375178276e-05, "loss": 0.7393, "step": 512 }, { "epoch": 0.024455938788644434, "grad_norm": 1.7038710117340088, "learning_rate": 1.979923399413455e-05, "loss": 0.6155, "step": 513 }, { "epoch": 0.024503611183943938, "grad_norm": 3.836108922958374, "learning_rate": 1.9798450101169927e-05, "loss": 0.8108, "step": 514 }, { "epoch": 0.024551283579243438, "grad_norm": 1.718099594116211, "learning_rate": 1.979766469640536e-05, "loss": 0.9674, "step": 515 }, { "epoch": 0.02459895597454294, "grad_norm": 2.81150221824646, "learning_rate": 1.9796877779962026e-05, "loss": 0.4572, "step": 516 }, { "epoch": 0.02464662836984244, "grad_norm": 3.344743013381958, "learning_rate": 1.9796089351961338e-05, "loss": 1.3236, "step": 517 }, { "epoch": 0.024694300765141945, "grad_norm": 0.9414018392562866, "learning_rate": 1.9795299412524948e-05, "loss": 0.316, "step": 518 }, { "epoch": 0.024741973160441445, "grad_norm": 1.675338625907898, "learning_rate": 1.9794507961774725e-05, "loss": 0.9532, "step": 519 }, { "epoch": 0.02478964555574095, "grad_norm": 1.8848239183425903, "learning_rate": 1.979371499983279e-05, "loss": 0.6513, "step": 520 }, { "epoch": 0.02483731795104045, "grad_norm": 2.5809998512268066, "learning_rate": 1.9792920526821486e-05, "loss": 0.6065, "step": 521 }, { "epoch": 0.024884990346339952, "grad_norm": 2.36930775642395, "learning_rate": 1.9792124542863394e-05, "loss": 1.2801, "step": 522 }, { "epoch": 0.024932662741639453, "grad_norm": 1.0927647352218628, "learning_rate": 1.9791327048081322e-05, "loss": 0.5992, "step": 523 }, { "epoch": 0.024980335136938956, "grad_norm": 1.4300240278244019, "learning_rate": 1.9790528042598316e-05, "loss": 0.4546, "step": 524 }, { "epoch": 0.025028007532238456, "grad_norm": 2.574460744857788, "learning_rate": 1.978972752653766e-05, "loss": 0.2686, "step": 525 }, { "epoch": 0.02507567992753796, "grad_norm": 2.383345127105713, "learning_rate": 1.978892550002286e-05, "loss": 0.6973, "step": 526 }, { "epoch": 0.02512335232283746, "grad_norm": 15.231221199035645, "learning_rate": 1.9788121963177663e-05, "loss": 0.6649, "step": 527 }, { "epoch": 0.025171024718136963, "grad_norm": 1.9268141984939575, "learning_rate": 1.978731691612604e-05, "loss": 0.771, "step": 528 }, { "epoch": 0.025218697113436463, "grad_norm": 0.9012970328330994, "learning_rate": 1.9786510358992213e-05, "loss": 0.3601, "step": 529 }, { "epoch": 0.025266369508735967, "grad_norm": 1.5469326972961426, "learning_rate": 1.9785702291900616e-05, "loss": 0.8453, "step": 530 }, { "epoch": 0.025314041904035467, "grad_norm": 2.350041627883911, "learning_rate": 1.978489271497593e-05, "loss": 0.6276, "step": 531 }, { "epoch": 0.02536171429933497, "grad_norm": 1.523940920829773, "learning_rate": 1.978408162834306e-05, "loss": 0.6375, "step": 532 }, { "epoch": 0.02540938669463447, "grad_norm": 1.175184965133667, "learning_rate": 1.9783269032127156e-05, "loss": 0.5706, "step": 533 }, { "epoch": 0.025457059089933974, "grad_norm": 1.4187142848968506, "learning_rate": 1.9782454926453585e-05, "loss": 1.2135, "step": 534 }, { "epoch": 0.025504731485233474, "grad_norm": 1.718551754951477, "learning_rate": 1.978163931144796e-05, "loss": 0.6412, "step": 535 }, { "epoch": 0.025552403880532978, "grad_norm": 3.0916621685028076, "learning_rate": 1.978082218723612e-05, "loss": 1.0229, "step": 536 }, { "epoch": 0.025600076275832478, "grad_norm": 1.2344852685928345, "learning_rate": 1.978000355394414e-05, "loss": 0.391, "step": 537 }, { "epoch": 0.02564774867113198, "grad_norm": 2.242340087890625, "learning_rate": 1.9779183411698327e-05, "loss": 1.0096, "step": 538 }, { "epoch": 0.02569542106643148, "grad_norm": 2.2608070373535156, "learning_rate": 1.977836176062522e-05, "loss": 0.6205, "step": 539 }, { "epoch": 0.025743093461730985, "grad_norm": 1.1044822931289673, "learning_rate": 1.977753860085159e-05, "loss": 0.7183, "step": 540 }, { "epoch": 0.025790765857030485, "grad_norm": 1.8876334428787231, "learning_rate": 1.977671393250444e-05, "loss": 1.0274, "step": 541 }, { "epoch": 0.02583843825232999, "grad_norm": 1.5824873447418213, "learning_rate": 1.977588775571101e-05, "loss": 0.9702, "step": 542 }, { "epoch": 0.02588611064762949, "grad_norm": 2.3319790363311768, "learning_rate": 1.9775060070598777e-05, "loss": 0.9352, "step": 543 }, { "epoch": 0.025933783042928992, "grad_norm": 1.3221014738082886, "learning_rate": 1.977423087729544e-05, "loss": 0.5927, "step": 544 }, { "epoch": 0.025981455438228492, "grad_norm": 1.501172661781311, "learning_rate": 1.977340017592893e-05, "loss": 0.7887, "step": 545 }, { "epoch": 0.026029127833527996, "grad_norm": 3.196305751800537, "learning_rate": 1.9772567966627417e-05, "loss": 0.9204, "step": 546 }, { "epoch": 0.026076800228827496, "grad_norm": 1.732845664024353, "learning_rate": 1.9771734249519307e-05, "loss": 0.7779, "step": 547 }, { "epoch": 0.026124472624127, "grad_norm": 7.03700065612793, "learning_rate": 1.9770899024733235e-05, "loss": 0.3785, "step": 548 }, { "epoch": 0.0261721450194265, "grad_norm": 4.985541343688965, "learning_rate": 1.9770062292398062e-05, "loss": 0.3287, "step": 549 }, { "epoch": 0.026219817414726003, "grad_norm": 3.5213537216186523, "learning_rate": 1.9769224052642887e-05, "loss": 0.8526, "step": 550 }, { "epoch": 0.026267489810025503, "grad_norm": 2.2162954807281494, "learning_rate": 1.9768384305597048e-05, "loss": 1.0172, "step": 551 }, { "epoch": 0.026315162205325007, "grad_norm": 2.0130722522735596, "learning_rate": 1.9767543051390103e-05, "loss": 0.5717, "step": 552 }, { "epoch": 0.026362834600624507, "grad_norm": 2.2378265857696533, "learning_rate": 1.9766700290151853e-05, "loss": 0.7639, "step": 553 }, { "epoch": 0.02641050699592401, "grad_norm": 1.358456015586853, "learning_rate": 1.9765856022012326e-05, "loss": 0.6762, "step": 554 }, { "epoch": 0.026458179391223514, "grad_norm": 5.6245951652526855, "learning_rate": 1.9765010247101783e-05, "loss": 0.9299, "step": 555 }, { "epoch": 0.026505851786523014, "grad_norm": 1.4390608072280884, "learning_rate": 1.9764162965550718e-05, "loss": 0.7357, "step": 556 }, { "epoch": 0.026553524181822517, "grad_norm": 1.5818909406661987, "learning_rate": 1.9763314177489858e-05, "loss": 0.7468, "step": 557 }, { "epoch": 0.026601196577122017, "grad_norm": 1.5696935653686523, "learning_rate": 1.9762463883050165e-05, "loss": 0.8834, "step": 558 }, { "epoch": 0.02664886897242152, "grad_norm": 1.6481987237930298, "learning_rate": 1.9761612082362828e-05, "loss": 0.7375, "step": 559 }, { "epoch": 0.02669654136772102, "grad_norm": 5.267855167388916, "learning_rate": 1.9760758775559275e-05, "loss": 0.4963, "step": 560 }, { "epoch": 0.026744213763020525, "grad_norm": 1.7277523279190063, "learning_rate": 1.9759903962771155e-05, "loss": 0.8688, "step": 561 }, { "epoch": 0.026791886158320025, "grad_norm": 3.814375162124634, "learning_rate": 1.9759047644130362e-05, "loss": 1.1375, "step": 562 }, { "epoch": 0.026839558553619528, "grad_norm": 1.6285319328308105, "learning_rate": 1.9758189819769017e-05, "loss": 0.6798, "step": 563 }, { "epoch": 0.026887230948919028, "grad_norm": 1.6149711608886719, "learning_rate": 1.9757330489819472e-05, "loss": 1.0756, "step": 564 }, { "epoch": 0.026934903344218532, "grad_norm": 1.523033618927002, "learning_rate": 1.9756469654414316e-05, "loss": 0.6719, "step": 565 }, { "epoch": 0.026982575739518032, "grad_norm": 2.452061653137207, "learning_rate": 1.9755607313686363e-05, "loss": 0.3536, "step": 566 }, { "epoch": 0.027030248134817535, "grad_norm": 1.6935207843780518, "learning_rate": 1.9754743467768663e-05, "loss": 1.0144, "step": 567 }, { "epoch": 0.027077920530117035, "grad_norm": 1.7408450841903687, "learning_rate": 1.9753878116794504e-05, "loss": 0.9138, "step": 568 }, { "epoch": 0.02712559292541654, "grad_norm": 1.9482660293579102, "learning_rate": 1.9753011260897392e-05, "loss": 0.8797, "step": 569 }, { "epoch": 0.02717326532071604, "grad_norm": 1.166355848312378, "learning_rate": 1.9752142900211084e-05, "loss": 0.568, "step": 570 }, { "epoch": 0.027220937716015543, "grad_norm": 1.862307071685791, "learning_rate": 1.9751273034869552e-05, "loss": 0.777, "step": 571 }, { "epoch": 0.027268610111315043, "grad_norm": 1.4859095811843872, "learning_rate": 1.975040166500701e-05, "loss": 0.6208, "step": 572 }, { "epoch": 0.027316282506614546, "grad_norm": 1.7215423583984375, "learning_rate": 1.97495287907579e-05, "loss": 0.8189, "step": 573 }, { "epoch": 0.027363954901914046, "grad_norm": 3.2154273986816406, "learning_rate": 1.97486544122569e-05, "loss": 0.7525, "step": 574 }, { "epoch": 0.02741162729721355, "grad_norm": 1.9213719367980957, "learning_rate": 1.974777852963891e-05, "loss": 0.9057, "step": 575 }, { "epoch": 0.02745929969251305, "grad_norm": 11.46835994720459, "learning_rate": 1.9746901143039082e-05, "loss": 1.1341, "step": 576 }, { "epoch": 0.027506972087812553, "grad_norm": 1.9643545150756836, "learning_rate": 1.974602225259278e-05, "loss": 0.6747, "step": 577 }, { "epoch": 0.027554644483112053, "grad_norm": 3.3582406044006348, "learning_rate": 1.9745141858435607e-05, "loss": 0.5221, "step": 578 }, { "epoch": 0.027602316878411557, "grad_norm": 2.9419772624969482, "learning_rate": 1.9744259960703405e-05, "loss": 0.7216, "step": 579 }, { "epoch": 0.027649989273711057, "grad_norm": 3.037371873855591, "learning_rate": 1.9743376559532234e-05, "loss": 0.6489, "step": 580 }, { "epoch": 0.02769766166901056, "grad_norm": 1.3546128273010254, "learning_rate": 1.9742491655058396e-05, "loss": 0.679, "step": 581 }, { "epoch": 0.02774533406431006, "grad_norm": 1.8137181997299194, "learning_rate": 1.974160524741843e-05, "loss": 0.8666, "step": 582 }, { "epoch": 0.027793006459609564, "grad_norm": 0.9405362606048584, "learning_rate": 1.974071733674909e-05, "loss": 0.3082, "step": 583 }, { "epoch": 0.027840678854909064, "grad_norm": 1.7752642631530762, "learning_rate": 1.973982792318737e-05, "loss": 0.6208, "step": 584 }, { "epoch": 0.027888351250208568, "grad_norm": 2.4724225997924805, "learning_rate": 1.9738937006870507e-05, "loss": 0.7182, "step": 585 }, { "epoch": 0.027936023645508068, "grad_norm": 1.5407928228378296, "learning_rate": 1.9738044587935957e-05, "loss": 0.8679, "step": 586 }, { "epoch": 0.02798369604080757, "grad_norm": 1.9845658540725708, "learning_rate": 1.9737150666521408e-05, "loss": 0.9731, "step": 587 }, { "epoch": 0.02803136843610707, "grad_norm": 2.7778193950653076, "learning_rate": 1.9736255242764782e-05, "loss": 0.9185, "step": 588 }, { "epoch": 0.028079040831406575, "grad_norm": 2.502527952194214, "learning_rate": 1.973535831680424e-05, "loss": 0.7861, "step": 589 }, { "epoch": 0.028126713226706075, "grad_norm": 2.2385640144348145, "learning_rate": 1.973445988877816e-05, "loss": 0.6714, "step": 590 }, { "epoch": 0.02817438562200558, "grad_norm": 2.457892894744873, "learning_rate": 1.9733559958825167e-05, "loss": 0.5676, "step": 591 }, { "epoch": 0.02822205801730508, "grad_norm": 1.67270827293396, "learning_rate": 1.973265852708411e-05, "loss": 0.8599, "step": 592 }, { "epoch": 0.028269730412604582, "grad_norm": 1.2847950458526611, "learning_rate": 1.973175559369407e-05, "loss": 0.4154, "step": 593 }, { "epoch": 0.028317402807904082, "grad_norm": 2.8948774337768555, "learning_rate": 1.9730851158794358e-05, "loss": 1.462, "step": 594 }, { "epoch": 0.028365075203203586, "grad_norm": 1.836553931236267, "learning_rate": 1.972994522252452e-05, "loss": 1.0466, "step": 595 }, { "epoch": 0.028412747598503086, "grad_norm": 1.6009653806686401, "learning_rate": 1.9729037785024333e-05, "loss": 0.4853, "step": 596 }, { "epoch": 0.02846041999380259, "grad_norm": 1.9315605163574219, "learning_rate": 1.972812884643381e-05, "loss": 0.9313, "step": 597 }, { "epoch": 0.02850809238910209, "grad_norm": 1.5604279041290283, "learning_rate": 1.9727218406893177e-05, "loss": 0.7766, "step": 598 }, { "epoch": 0.028555764784401593, "grad_norm": 1.1914689540863037, "learning_rate": 1.9726306466542923e-05, "loss": 0.6603, "step": 599 }, { "epoch": 0.028603437179701093, "grad_norm": 1.6006221771240234, "learning_rate": 1.972539302552374e-05, "loss": 0.9719, "step": 600 }, { "epoch": 0.028651109575000597, "grad_norm": 1.6553188562393188, "learning_rate": 1.9724478083976565e-05, "loss": 0.7604, "step": 601 }, { "epoch": 0.028698781970300097, "grad_norm": 1.3023021221160889, "learning_rate": 1.9723561642042563e-05, "loss": 0.7185, "step": 602 }, { "epoch": 0.0287464543655996, "grad_norm": 1.863294243812561, "learning_rate": 1.9722643699863135e-05, "loss": 0.9524, "step": 603 }, { "epoch": 0.0287941267608991, "grad_norm": 3.4323480129241943, "learning_rate": 1.9721724257579907e-05, "loss": 1.1218, "step": 604 }, { "epoch": 0.028841799156198604, "grad_norm": 1.5496827363967896, "learning_rate": 1.972080331533474e-05, "loss": 0.7628, "step": 605 }, { "epoch": 0.028889471551498104, "grad_norm": 1.1948590278625488, "learning_rate": 1.971988087326973e-05, "loss": 0.6586, "step": 606 }, { "epoch": 0.028937143946797608, "grad_norm": 1.685185432434082, "learning_rate": 1.9718956931527193e-05, "loss": 0.7733, "step": 607 }, { "epoch": 0.028984816342097108, "grad_norm": 1.624538540840149, "learning_rate": 1.9718031490249688e-05, "loss": 0.7968, "step": 608 }, { "epoch": 0.02903248873739661, "grad_norm": 2.243673086166382, "learning_rate": 1.9717104549580003e-05, "loss": 0.9498, "step": 609 }, { "epoch": 0.02908016113269611, "grad_norm": 2.4444425106048584, "learning_rate": 1.9716176109661148e-05, "loss": 1.3126, "step": 610 }, { "epoch": 0.029127833527995615, "grad_norm": 1.7432293891906738, "learning_rate": 1.9715246170636383e-05, "loss": 0.7163, "step": 611 }, { "epoch": 0.029175505923295115, "grad_norm": 10.065876007080078, "learning_rate": 1.9714314732649174e-05, "loss": 1.1127, "step": 612 }, { "epoch": 0.02922317831859462, "grad_norm": 3.2807443141937256, "learning_rate": 1.9713381795843244e-05, "loss": 1.0007, "step": 613 }, { "epoch": 0.02927085071389412, "grad_norm": 2.5899031162261963, "learning_rate": 1.9712447360362534e-05, "loss": 1.2204, "step": 614 }, { "epoch": 0.029318523109193622, "grad_norm": 1.3960285186767578, "learning_rate": 1.971151142635121e-05, "loss": 0.7094, "step": 615 }, { "epoch": 0.029366195504493122, "grad_norm": 2.9793753623962402, "learning_rate": 1.9710573993953685e-05, "loss": 0.6854, "step": 616 }, { "epoch": 0.029413867899792626, "grad_norm": 1.1436901092529297, "learning_rate": 1.9709635063314592e-05, "loss": 0.6775, "step": 617 }, { "epoch": 0.029461540295092126, "grad_norm": 9.052082061767578, "learning_rate": 1.97086946345788e-05, "loss": 0.9781, "step": 618 }, { "epoch": 0.02950921269039163, "grad_norm": 1.6794508695602417, "learning_rate": 1.9707752707891404e-05, "loss": 0.8579, "step": 619 }, { "epoch": 0.02955688508569113, "grad_norm": 2.077829122543335, "learning_rate": 1.9706809283397733e-05, "loss": 0.6141, "step": 620 }, { "epoch": 0.029604557480990633, "grad_norm": 1.7477926015853882, "learning_rate": 1.9705864361243355e-05, "loss": 0.7191, "step": 621 }, { "epoch": 0.029652229876290133, "grad_norm": 1.7621954679489136, "learning_rate": 1.9704917941574053e-05, "loss": 0.9054, "step": 622 }, { "epoch": 0.029699902271589636, "grad_norm": 2.0777807235717773, "learning_rate": 1.9703970024535855e-05, "loss": 0.7892, "step": 623 }, { "epoch": 0.029747574666889137, "grad_norm": 2.7849626541137695, "learning_rate": 1.970302061027502e-05, "loss": 0.8543, "step": 624 }, { "epoch": 0.02979524706218864, "grad_norm": 2.029613494873047, "learning_rate": 1.970206969893802e-05, "loss": 0.9293, "step": 625 }, { "epoch": 0.02984291945748814, "grad_norm": 3.735873222351074, "learning_rate": 1.970111729067158e-05, "loss": 1.4074, "step": 626 }, { "epoch": 0.029890591852787644, "grad_norm": 1.1065034866333008, "learning_rate": 1.9700163385622642e-05, "loss": 0.4194, "step": 627 }, { "epoch": 0.029938264248087144, "grad_norm": 1.4932465553283691, "learning_rate": 1.969920798393839e-05, "loss": 0.8679, "step": 628 }, { "epoch": 0.029985936643386647, "grad_norm": 1.1497528553009033, "learning_rate": 1.9698251085766226e-05, "loss": 0.4865, "step": 629 }, { "epoch": 0.030033609038686147, "grad_norm": 1.2771016359329224, "learning_rate": 1.969729269125379e-05, "loss": 0.79, "step": 630 }, { "epoch": 0.03008128143398565, "grad_norm": 1.7814726829528809, "learning_rate": 1.969633280054896e-05, "loss": 0.964, "step": 631 }, { "epoch": 0.03012895382928515, "grad_norm": 1.6634962558746338, "learning_rate": 1.9695371413799825e-05, "loss": 0.8258, "step": 632 }, { "epoch": 0.030176626224584654, "grad_norm": 7.164558410644531, "learning_rate": 1.9694408531154728e-05, "loss": 0.374, "step": 633 }, { "epoch": 0.030224298619884155, "grad_norm": 1.5273154973983765, "learning_rate": 1.969344415276223e-05, "loss": 0.8219, "step": 634 }, { "epoch": 0.030271971015183658, "grad_norm": 1.0765341520309448, "learning_rate": 1.9692478278771118e-05, "loss": 0.3298, "step": 635 }, { "epoch": 0.030319643410483158, "grad_norm": 5.0884013175964355, "learning_rate": 1.969151090933042e-05, "loss": 0.9095, "step": 636 }, { "epoch": 0.03036731580578266, "grad_norm": 15.906243324279785, "learning_rate": 1.9690542044589395e-05, "loss": 0.1833, "step": 637 }, { "epoch": 0.030414988201082162, "grad_norm": 2.3113362789154053, "learning_rate": 1.9689571684697527e-05, "loss": 0.8799, "step": 638 }, { "epoch": 0.030462660596381665, "grad_norm": 1.4296796321868896, "learning_rate": 1.9688599829804528e-05, "loss": 0.9618, "step": 639 }, { "epoch": 0.030510332991681165, "grad_norm": 2.759082078933716, "learning_rate": 1.968762648006035e-05, "loss": 1.0813, "step": 640 }, { "epoch": 0.03055800538698067, "grad_norm": 1.532196044921875, "learning_rate": 1.9686651635615172e-05, "loss": 0.7161, "step": 641 }, { "epoch": 0.03060567778228017, "grad_norm": 2.626645088195801, "learning_rate": 1.9685675296619397e-05, "loss": 1.155, "step": 642 }, { "epoch": 0.030653350177579673, "grad_norm": 0.9614362120628357, "learning_rate": 1.9684697463223664e-05, "loss": 0.4707, "step": 643 }, { "epoch": 0.030701022572879173, "grad_norm": 1.7256051301956177, "learning_rate": 1.968371813557885e-05, "loss": 1.0067, "step": 644 }, { "epoch": 0.030748694968178676, "grad_norm": 1.5617140531539917, "learning_rate": 1.968273731383605e-05, "loss": 0.3811, "step": 645 }, { "epoch": 0.03079636736347818, "grad_norm": 2.0715904235839844, "learning_rate": 1.9681754998146592e-05, "loss": 0.8022, "step": 646 }, { "epoch": 0.03084403975877768, "grad_norm": 2.6158509254455566, "learning_rate": 1.9680771188662044e-05, "loss": 0.8486, "step": 647 }, { "epoch": 0.030891712154077183, "grad_norm": 4.827423572540283, "learning_rate": 1.9679785885534196e-05, "loss": 0.4986, "step": 648 }, { "epoch": 0.030939384549376683, "grad_norm": 1.2536042928695679, "learning_rate": 1.9678799088915064e-05, "loss": 0.7874, "step": 649 }, { "epoch": 0.030987056944676187, "grad_norm": 1.6905231475830078, "learning_rate": 1.9677810798956906e-05, "loss": 0.7333, "step": 650 }, { "epoch": 0.031034729339975687, "grad_norm": 3.6590423583984375, "learning_rate": 1.9676821015812203e-05, "loss": 1.2042, "step": 651 }, { "epoch": 0.03108240173527519, "grad_norm": 2.787641763687134, "learning_rate": 1.967582973963367e-05, "loss": 0.6705, "step": 652 }, { "epoch": 0.03113007413057469, "grad_norm": 1.9217485189437866, "learning_rate": 1.9674836970574253e-05, "loss": 0.7618, "step": 653 }, { "epoch": 0.031177746525874194, "grad_norm": 1.7533555030822754, "learning_rate": 1.967384270878712e-05, "loss": 0.9996, "step": 654 }, { "epoch": 0.031225418921173694, "grad_norm": 2.0206172466278076, "learning_rate": 1.967284695442568e-05, "loss": 1.3232, "step": 655 }, { "epoch": 0.0312730913164732, "grad_norm": 1.805066704750061, "learning_rate": 1.9671849707643567e-05, "loss": 0.6891, "step": 656 }, { "epoch": 0.0313207637117727, "grad_norm": 1.5261296033859253, "learning_rate": 1.9670850968594642e-05, "loss": 0.9105, "step": 657 }, { "epoch": 0.0313684361070722, "grad_norm": 1.3711668252944946, "learning_rate": 1.9669850737433002e-05, "loss": 0.8131, "step": 658 }, { "epoch": 0.031416108502371705, "grad_norm": 8.398372650146484, "learning_rate": 1.9668849014312978e-05, "loss": 0.412, "step": 659 }, { "epoch": 0.031463780897671205, "grad_norm": 3.589878797531128, "learning_rate": 1.9667845799389117e-05, "loss": 1.141, "step": 660 }, { "epoch": 0.031511453292970705, "grad_norm": 1.4862463474273682, "learning_rate": 1.9666841092816212e-05, "loss": 0.6138, "step": 661 }, { "epoch": 0.031559125688270205, "grad_norm": 1.5963002443313599, "learning_rate": 1.9665834894749275e-05, "loss": 0.7301, "step": 662 }, { "epoch": 0.03160679808356971, "grad_norm": 1.64510178565979, "learning_rate": 1.966482720534355e-05, "loss": 0.5707, "step": 663 }, { "epoch": 0.03165447047886921, "grad_norm": 1.807691216468811, "learning_rate": 1.9663818024754516e-05, "loss": 0.6151, "step": 664 }, { "epoch": 0.03170214287416871, "grad_norm": 1.6394729614257812, "learning_rate": 1.966280735313788e-05, "loss": 0.6204, "step": 665 }, { "epoch": 0.03174981526946821, "grad_norm": 1.5085763931274414, "learning_rate": 1.9661795190649578e-05, "loss": 0.7999, "step": 666 }, { "epoch": 0.03179748766476772, "grad_norm": 2.1185641288757324, "learning_rate": 1.9660781537445774e-05, "loss": 0.4124, "step": 667 }, { "epoch": 0.03184516006006722, "grad_norm": 1.159080982208252, "learning_rate": 1.9659766393682867e-05, "loss": 0.4458, "step": 668 }, { "epoch": 0.03189283245536672, "grad_norm": 1.7421625852584839, "learning_rate": 1.965874975951748e-05, "loss": 0.4852, "step": 669 }, { "epoch": 0.03194050485066622, "grad_norm": 3.620832920074463, "learning_rate": 1.965773163510647e-05, "loss": 0.2355, "step": 670 }, { "epoch": 0.03198817724596573, "grad_norm": 2.30461049079895, "learning_rate": 1.9656712020606926e-05, "loss": 1.0058, "step": 671 }, { "epoch": 0.03203584964126523, "grad_norm": 6.597954273223877, "learning_rate": 1.9655690916176164e-05, "loss": 0.4127, "step": 672 }, { "epoch": 0.03208352203656473, "grad_norm": 4.061854839324951, "learning_rate": 1.9654668321971724e-05, "loss": 0.8925, "step": 673 }, { "epoch": 0.03213119443186423, "grad_norm": 1.1764100790023804, "learning_rate": 1.965364423815139e-05, "loss": 0.4638, "step": 674 }, { "epoch": 0.032178866827163734, "grad_norm": 2.0967013835906982, "learning_rate": 1.965261866487316e-05, "loss": 0.975, "step": 675 }, { "epoch": 0.032226539222463234, "grad_norm": 9.140423774719238, "learning_rate": 1.9651591602295275e-05, "loss": 0.6945, "step": 676 }, { "epoch": 0.032274211617762734, "grad_norm": 1.62010657787323, "learning_rate": 1.9650563050576195e-05, "loss": 0.8773, "step": 677 }, { "epoch": 0.032321884013062234, "grad_norm": 1.2245630025863647, "learning_rate": 1.964953300987462e-05, "loss": 0.4385, "step": 678 }, { "epoch": 0.03236955640836174, "grad_norm": 1.9532051086425781, "learning_rate": 1.9648501480349473e-05, "loss": 0.784, "step": 679 }, { "epoch": 0.03241722880366124, "grad_norm": 4.412245750427246, "learning_rate": 1.9647468462159906e-05, "loss": 0.1273, "step": 680 }, { "epoch": 0.03246490119896074, "grad_norm": 1.2344199419021606, "learning_rate": 1.9646433955465307e-05, "loss": 0.5754, "step": 681 }, { "epoch": 0.03251257359426024, "grad_norm": 1.2929292917251587, "learning_rate": 1.9645397960425287e-05, "loss": 0.5648, "step": 682 }, { "epoch": 0.03256024598955975, "grad_norm": 2.8150668144226074, "learning_rate": 1.964436047719969e-05, "loss": 0.9021, "step": 683 }, { "epoch": 0.03260791838485925, "grad_norm": 2.1372106075286865, "learning_rate": 1.9643321505948588e-05, "loss": 0.885, "step": 684 }, { "epoch": 0.03265559078015875, "grad_norm": 2.2482917308807373, "learning_rate": 1.9642281046832287e-05, "loss": 0.9908, "step": 685 }, { "epoch": 0.03270326317545825, "grad_norm": 1.6600946187973022, "learning_rate": 1.9641239100011312e-05, "loss": 0.7879, "step": 686 }, { "epoch": 0.032750935570757755, "grad_norm": 1.259423851966858, "learning_rate": 1.9640195665646434e-05, "loss": 0.8581, "step": 687 }, { "epoch": 0.032798607966057255, "grad_norm": 1.942671298980713, "learning_rate": 1.963915074389864e-05, "loss": 0.8663, "step": 688 }, { "epoch": 0.032846280361356756, "grad_norm": 1.6967015266418457, "learning_rate": 1.9638104334929145e-05, "loss": 0.5918, "step": 689 }, { "epoch": 0.032893952756656256, "grad_norm": 2.120243549346924, "learning_rate": 1.963705643889941e-05, "loss": 1.2791, "step": 690 }, { "epoch": 0.03294162515195576, "grad_norm": 2.302832841873169, "learning_rate": 1.9636007055971106e-05, "loss": 1.1948, "step": 691 }, { "epoch": 0.03298929754725526, "grad_norm": 3.452000856399536, "learning_rate": 1.9634956186306147e-05, "loss": 0.6068, "step": 692 }, { "epoch": 0.03303696994255476, "grad_norm": 1.3084638118743896, "learning_rate": 1.963390383006667e-05, "loss": 0.3356, "step": 693 }, { "epoch": 0.03308464233785426, "grad_norm": 1.6580809354782104, "learning_rate": 1.9632849987415038e-05, "loss": 1.0911, "step": 694 }, { "epoch": 0.03313231473315377, "grad_norm": 1.4388517141342163, "learning_rate": 1.9631794658513853e-05, "loss": 0.5739, "step": 695 }, { "epoch": 0.03317998712845327, "grad_norm": 1.9334794282913208, "learning_rate": 1.9630737843525946e-05, "loss": 0.7706, "step": 696 }, { "epoch": 0.03322765952375277, "grad_norm": 4.213309288024902, "learning_rate": 1.9629679542614363e-05, "loss": 1.1035, "step": 697 }, { "epoch": 0.03327533191905227, "grad_norm": 2.349947929382324, "learning_rate": 1.962861975594239e-05, "loss": 0.9024, "step": 698 }, { "epoch": 0.03332300431435178, "grad_norm": 1.374952793121338, "learning_rate": 1.9627558483673546e-05, "loss": 0.5061, "step": 699 }, { "epoch": 0.03337067670965128, "grad_norm": 2.7832841873168945, "learning_rate": 1.962649572597158e-05, "loss": 0.7869, "step": 700 }, { "epoch": 0.03341834910495078, "grad_norm": 2.1559762954711914, "learning_rate": 1.9625431483000448e-05, "loss": 0.752, "step": 701 }, { "epoch": 0.03346602150025028, "grad_norm": 2.687613010406494, "learning_rate": 1.9624365754924364e-05, "loss": 0.7765, "step": 702 }, { "epoch": 0.033513693895549784, "grad_norm": 9.63939380645752, "learning_rate": 1.9623298541907756e-05, "loss": 1.0306, "step": 703 }, { "epoch": 0.033561366290849284, "grad_norm": 2.1271538734436035, "learning_rate": 1.9622229844115284e-05, "loss": 1.0692, "step": 704 }, { "epoch": 0.033609038686148784, "grad_norm": 4.3584442138671875, "learning_rate": 1.9621159661711834e-05, "loss": 1.4418, "step": 705 }, { "epoch": 0.033656711081448284, "grad_norm": 1.646856427192688, "learning_rate": 1.9620087994862534e-05, "loss": 0.9123, "step": 706 }, { "epoch": 0.03370438347674779, "grad_norm": 2.322556734085083, "learning_rate": 1.961901484373272e-05, "loss": 1.0779, "step": 707 }, { "epoch": 0.03375205587204729, "grad_norm": 1.4996213912963867, "learning_rate": 1.9617940208487968e-05, "loss": 0.5601, "step": 708 }, { "epoch": 0.03379972826734679, "grad_norm": 3.1180505752563477, "learning_rate": 1.9616864089294095e-05, "loss": 1.0012, "step": 709 }, { "epoch": 0.03384740066264629, "grad_norm": 4.107066631317139, "learning_rate": 1.9615786486317124e-05, "loss": 0.8514, "step": 710 }, { "epoch": 0.0338950730579458, "grad_norm": 4.024191379547119, "learning_rate": 1.9614707399723318e-05, "loss": 0.7237, "step": 711 }, { "epoch": 0.0339427454532453, "grad_norm": 3.5964674949645996, "learning_rate": 1.9613626829679176e-05, "loss": 1.4357, "step": 712 }, { "epoch": 0.0339904178485448, "grad_norm": 2.3197476863861084, "learning_rate": 1.9612544776351415e-05, "loss": 0.4594, "step": 713 }, { "epoch": 0.0340380902438443, "grad_norm": 2.2602896690368652, "learning_rate": 1.961146123990699e-05, "loss": 0.7322, "step": 714 }, { "epoch": 0.034085762639143806, "grad_norm": 2.804417610168457, "learning_rate": 1.9610376220513067e-05, "loss": 1.5153, "step": 715 }, { "epoch": 0.034133435034443306, "grad_norm": 2.9005067348480225, "learning_rate": 1.9609289718337067e-05, "loss": 0.95, "step": 716 }, { "epoch": 0.034181107429742806, "grad_norm": 7.997469902038574, "learning_rate": 1.9608201733546615e-05, "loss": 1.3378, "step": 717 }, { "epoch": 0.034228779825042306, "grad_norm": 3.4590885639190674, "learning_rate": 1.9607112266309585e-05, "loss": 1.5682, "step": 718 }, { "epoch": 0.03427645222034181, "grad_norm": 2.0878725051879883, "learning_rate": 1.9606021316794065e-05, "loss": 0.9559, "step": 719 }, { "epoch": 0.03432412461564131, "grad_norm": 1.4294086694717407, "learning_rate": 1.9604928885168376e-05, "loss": 0.6045, "step": 720 }, { "epoch": 0.03437179701094081, "grad_norm": 1.5459392070770264, "learning_rate": 1.9603834971601075e-05, "loss": 0.7154, "step": 721 }, { "epoch": 0.03441946940624031, "grad_norm": 1.3652042150497437, "learning_rate": 1.9602739576260937e-05, "loss": 0.7979, "step": 722 }, { "epoch": 0.03446714180153982, "grad_norm": 2.1262052059173584, "learning_rate": 1.9601642699316968e-05, "loss": 0.8921, "step": 723 }, { "epoch": 0.03451481419683932, "grad_norm": 2.031620740890503, "learning_rate": 1.9600544340938415e-05, "loss": 0.9675, "step": 724 }, { "epoch": 0.03456248659213882, "grad_norm": 1.8469467163085938, "learning_rate": 1.9599444501294733e-05, "loss": 0.7833, "step": 725 }, { "epoch": 0.03461015898743832, "grad_norm": 1.959503173828125, "learning_rate": 1.959834318055562e-05, "loss": 1.132, "step": 726 }, { "epoch": 0.03465783138273783, "grad_norm": 1.5650908946990967, "learning_rate": 1.9597240378891e-05, "loss": 0.7967, "step": 727 }, { "epoch": 0.03470550377803733, "grad_norm": 2.6273374557495117, "learning_rate": 1.959613609647102e-05, "loss": 1.2453, "step": 728 }, { "epoch": 0.03475317617333683, "grad_norm": 1.9722366333007812, "learning_rate": 1.959503033346606e-05, "loss": 0.8889, "step": 729 }, { "epoch": 0.03480084856863633, "grad_norm": 1.6233927011489868, "learning_rate": 1.959392309004673e-05, "loss": 0.8429, "step": 730 }, { "epoch": 0.034848520963935835, "grad_norm": 1.907275915145874, "learning_rate": 1.959281436638387e-05, "loss": 0.9834, "step": 731 }, { "epoch": 0.034896193359235335, "grad_norm": 2.6624152660369873, "learning_rate": 1.9591704162648532e-05, "loss": 0.9018, "step": 732 }, { "epoch": 0.034943865754534835, "grad_norm": 2.5180978775024414, "learning_rate": 1.9590592479012022e-05, "loss": 1.1962, "step": 733 }, { "epoch": 0.034991538149834335, "grad_norm": 2.042426824569702, "learning_rate": 1.9589479315645857e-05, "loss": 0.6018, "step": 734 }, { "epoch": 0.03503921054513384, "grad_norm": 1.856363296508789, "learning_rate": 1.9588364672721785e-05, "loss": 0.989, "step": 735 }, { "epoch": 0.03508688294043334, "grad_norm": 3.38390851020813, "learning_rate": 1.9587248550411786e-05, "loss": 0.6093, "step": 736 }, { "epoch": 0.03513455533573284, "grad_norm": 1.378178358078003, "learning_rate": 1.9586130948888064e-05, "loss": 0.5984, "step": 737 }, { "epoch": 0.03518222773103235, "grad_norm": 4.037237644195557, "learning_rate": 1.9585011868323052e-05, "loss": 0.7504, "step": 738 }, { "epoch": 0.03522990012633185, "grad_norm": 1.9365379810333252, "learning_rate": 1.958389130888942e-05, "loss": 0.3903, "step": 739 }, { "epoch": 0.03527757252163135, "grad_norm": 1.7285670042037964, "learning_rate": 1.9582769270760055e-05, "loss": 0.9453, "step": 740 }, { "epoch": 0.03532524491693085, "grad_norm": 1.756253719329834, "learning_rate": 1.958164575410807e-05, "loss": 0.8585, "step": 741 }, { "epoch": 0.035372917312230356, "grad_norm": 4.003931522369385, "learning_rate": 1.958052075910682e-05, "loss": 0.8641, "step": 742 }, { "epoch": 0.035420589707529856, "grad_norm": 3.024296760559082, "learning_rate": 1.9579394285929877e-05, "loss": 1.0505, "step": 743 }, { "epoch": 0.03546826210282936, "grad_norm": 1.138126015663147, "learning_rate": 1.9578266334751045e-05, "loss": 0.339, "step": 744 }, { "epoch": 0.03551593449812886, "grad_norm": 2.1584057807922363, "learning_rate": 1.9577136905744353e-05, "loss": 0.7884, "step": 745 }, { "epoch": 0.035563606893428364, "grad_norm": 1.7360907793045044, "learning_rate": 1.957600599908406e-05, "loss": 0.6847, "step": 746 }, { "epoch": 0.035611279288727864, "grad_norm": 1.7327625751495361, "learning_rate": 1.9574873614944657e-05, "loss": 0.3045, "step": 747 }, { "epoch": 0.035658951684027364, "grad_norm": 1.9865176677703857, "learning_rate": 1.9573739753500857e-05, "loss": 1.1515, "step": 748 }, { "epoch": 0.035706624079326864, "grad_norm": 1.4304611682891846, "learning_rate": 1.9572604414927604e-05, "loss": 0.9681, "step": 749 }, { "epoch": 0.03575429647462637, "grad_norm": 1.925475835800171, "learning_rate": 1.957146759940007e-05, "loss": 0.7573, "step": 750 }, { "epoch": 0.03580196886992587, "grad_norm": 2.9961085319519043, "learning_rate": 1.9570329307093652e-05, "loss": 1.0703, "step": 751 }, { "epoch": 0.03584964126522537, "grad_norm": 1.74911367893219, "learning_rate": 1.9569189538183978e-05, "loss": 0.5074, "step": 752 }, { "epoch": 0.03589731366052487, "grad_norm": 3.323850393295288, "learning_rate": 1.95680482928469e-05, "loss": 0.698, "step": 753 }, { "epoch": 0.03594498605582438, "grad_norm": 2.003148078918457, "learning_rate": 1.9566905571258502e-05, "loss": 1.0439, "step": 754 }, { "epoch": 0.03599265845112388, "grad_norm": 3.6319541931152344, "learning_rate": 1.9565761373595094e-05, "loss": 0.7797, "step": 755 }, { "epoch": 0.03604033084642338, "grad_norm": 1.8461650609970093, "learning_rate": 1.9564615700033215e-05, "loss": 0.788, "step": 756 }, { "epoch": 0.03608800324172288, "grad_norm": 1.9516270160675049, "learning_rate": 1.956346855074963e-05, "loss": 0.9934, "step": 757 }, { "epoch": 0.036135675637022385, "grad_norm": 2.031405448913574, "learning_rate": 1.9562319925921333e-05, "loss": 0.7105, "step": 758 }, { "epoch": 0.036183348032321885, "grad_norm": 1.166115403175354, "learning_rate": 1.9561169825725546e-05, "loss": 0.7417, "step": 759 }, { "epoch": 0.036231020427621385, "grad_norm": 2.524076461791992, "learning_rate": 1.9560018250339712e-05, "loss": 1.2142, "step": 760 }, { "epoch": 0.036278692822920885, "grad_norm": 1.542688012123108, "learning_rate": 1.9558865199941515e-05, "loss": 0.2566, "step": 761 }, { "epoch": 0.03632636521822039, "grad_norm": 3.6229541301727295, "learning_rate": 1.9557710674708853e-05, "loss": 0.5391, "step": 762 }, { "epoch": 0.03637403761351989, "grad_norm": 2.812225818634033, "learning_rate": 1.955655467481986e-05, "loss": 0.7649, "step": 763 }, { "epoch": 0.03642171000881939, "grad_norm": 1.541874647140503, "learning_rate": 1.9555397200452892e-05, "loss": 0.8504, "step": 764 }, { "epoch": 0.03646938240411889, "grad_norm": 1.5217840671539307, "learning_rate": 1.9554238251786538e-05, "loss": 0.8975, "step": 765 }, { "epoch": 0.0365170547994184, "grad_norm": 2.1495580673217773, "learning_rate": 1.9553077828999614e-05, "loss": 0.9806, "step": 766 }, { "epoch": 0.0365647271947179, "grad_norm": 1.58120596408844, "learning_rate": 1.9551915932271156e-05, "loss": 0.576, "step": 767 }, { "epoch": 0.0366123995900174, "grad_norm": 1.8964204788208008, "learning_rate": 1.9550752561780434e-05, "loss": 0.5773, "step": 768 }, { "epoch": 0.0366600719853169, "grad_norm": 2.6379222869873047, "learning_rate": 1.9549587717706952e-05, "loss": 0.6676, "step": 769 }, { "epoch": 0.03670774438061641, "grad_norm": 1.5131341218948364, "learning_rate": 1.9548421400230418e-05, "loss": 0.6419, "step": 770 }, { "epoch": 0.03675541677591591, "grad_norm": 4.6081414222717285, "learning_rate": 1.9547253609530797e-05, "loss": 0.2493, "step": 771 }, { "epoch": 0.03680308917121541, "grad_norm": 1.161028504371643, "learning_rate": 1.954608434578826e-05, "loss": 0.3664, "step": 772 }, { "epoch": 0.03685076156651491, "grad_norm": 1.630754828453064, "learning_rate": 1.9544913609183214e-05, "loss": 0.9671, "step": 773 }, { "epoch": 0.036898433961814414, "grad_norm": 6.992397308349609, "learning_rate": 1.9543741399896295e-05, "loss": 0.1969, "step": 774 }, { "epoch": 0.036946106357113914, "grad_norm": 1.7193506956100464, "learning_rate": 1.9542567718108357e-05, "loss": 0.3565, "step": 775 }, { "epoch": 0.036993778752413414, "grad_norm": 32.21617126464844, "learning_rate": 1.954139256400049e-05, "loss": 0.9591, "step": 776 }, { "epoch": 0.037041451147712914, "grad_norm": 1.2193129062652588, "learning_rate": 1.954021593775401e-05, "loss": 0.6884, "step": 777 }, { "epoch": 0.03708912354301242, "grad_norm": 1.4926356077194214, "learning_rate": 1.953903783955045e-05, "loss": 0.7923, "step": 778 }, { "epoch": 0.03713679593831192, "grad_norm": 5.782951354980469, "learning_rate": 1.953785826957159e-05, "loss": 0.8947, "step": 779 }, { "epoch": 0.03718446833361142, "grad_norm": 1.9362940788269043, "learning_rate": 1.9536677227999415e-05, "loss": 0.7882, "step": 780 }, { "epoch": 0.03723214072891092, "grad_norm": 2.3215765953063965, "learning_rate": 1.953549471501616e-05, "loss": 0.5856, "step": 781 }, { "epoch": 0.03727981312421043, "grad_norm": 1.9120577573776245, "learning_rate": 1.953431073080426e-05, "loss": 0.6924, "step": 782 }, { "epoch": 0.03732748551950993, "grad_norm": 6.096733093261719, "learning_rate": 1.95331252755464e-05, "loss": 0.4019, "step": 783 }, { "epoch": 0.03737515791480943, "grad_norm": 1.5409770011901855, "learning_rate": 1.9531938349425484e-05, "loss": 0.8029, "step": 784 }, { "epoch": 0.03742283031010893, "grad_norm": 2.147329092025757, "learning_rate": 1.953074995262464e-05, "loss": 1.0116, "step": 785 }, { "epoch": 0.037470502705408436, "grad_norm": 2.0393097400665283, "learning_rate": 1.9529560085327227e-05, "loss": 1.0883, "step": 786 }, { "epoch": 0.037518175100707936, "grad_norm": 1.857731819152832, "learning_rate": 1.9528368747716827e-05, "loss": 0.8214, "step": 787 }, { "epoch": 0.037565847496007436, "grad_norm": 7.41095495223999, "learning_rate": 1.9527175939977252e-05, "loss": 1.1811, "step": 788 }, { "epoch": 0.037613519891306936, "grad_norm": 1.9293097257614136, "learning_rate": 1.952598166229254e-05, "loss": 0.5862, "step": 789 }, { "epoch": 0.03766119228660644, "grad_norm": 6.753727436065674, "learning_rate": 1.9524785914846956e-05, "loss": 0.5971, "step": 790 }, { "epoch": 0.03770886468190594, "grad_norm": 1.517684817314148, "learning_rate": 1.9523588697824995e-05, "loss": 0.7392, "step": 791 }, { "epoch": 0.03775653707720544, "grad_norm": 1.3403767347335815, "learning_rate": 1.952239001141137e-05, "loss": 0.6231, "step": 792 }, { "epoch": 0.03780420947250494, "grad_norm": 5.754158020019531, "learning_rate": 1.9521189855791026e-05, "loss": 1.4619, "step": 793 }, { "epoch": 0.03785188186780445, "grad_norm": 1.7080199718475342, "learning_rate": 1.9519988231149142e-05, "loss": 0.84, "step": 794 }, { "epoch": 0.03789955426310395, "grad_norm": 2.198056936264038, "learning_rate": 1.9518785137671107e-05, "loss": 0.7139, "step": 795 }, { "epoch": 0.03794722665840345, "grad_norm": 1.9409888982772827, "learning_rate": 1.9517580575542546e-05, "loss": 0.8135, "step": 796 }, { "epoch": 0.03799489905370295, "grad_norm": 1.7660938501358032, "learning_rate": 1.951637454494932e-05, "loss": 0.7041, "step": 797 }, { "epoch": 0.03804257144900246, "grad_norm": 1.3157745599746704, "learning_rate": 1.95151670460775e-05, "loss": 0.6999, "step": 798 }, { "epoch": 0.03809024384430196, "grad_norm": 1.2755836248397827, "learning_rate": 1.951395807911339e-05, "loss": 0.8245, "step": 799 }, { "epoch": 0.03813791623960146, "grad_norm": 1.8460443019866943, "learning_rate": 1.9512747644243525e-05, "loss": 1.016, "step": 800 }, { "epoch": 0.03818558863490096, "grad_norm": 1.0960500240325928, "learning_rate": 1.9511535741654663e-05, "loss": 0.6018, "step": 801 }, { "epoch": 0.038233261030200465, "grad_norm": 1.4150505065917969, "learning_rate": 1.9510322371533783e-05, "loss": 0.9056, "step": 802 }, { "epoch": 0.038280933425499965, "grad_norm": 2.163686513900757, "learning_rate": 1.95091075340681e-05, "loss": 0.8344, "step": 803 }, { "epoch": 0.038328605820799465, "grad_norm": 4.431417465209961, "learning_rate": 1.950789122944505e-05, "loss": 0.1334, "step": 804 }, { "epoch": 0.038376278216098965, "grad_norm": 2.1695454120635986, "learning_rate": 1.9506673457852293e-05, "loss": 0.9056, "step": 805 }, { "epoch": 0.03842395061139847, "grad_norm": 2.3549680709838867, "learning_rate": 1.9505454219477718e-05, "loss": 1.0862, "step": 806 }, { "epoch": 0.03847162300669797, "grad_norm": 1.5020617246627808, "learning_rate": 1.950423351450945e-05, "loss": 0.8241, "step": 807 }, { "epoch": 0.03851929540199747, "grad_norm": 1.5390362739562988, "learning_rate": 1.9503011343135828e-05, "loss": 0.7707, "step": 808 }, { "epoch": 0.03856696779729697, "grad_norm": 2.2688674926757812, "learning_rate": 1.9501787705545412e-05, "loss": 0.9565, "step": 809 }, { "epoch": 0.03861464019259648, "grad_norm": 1.713032841682434, "learning_rate": 1.9500562601927003e-05, "loss": 0.6219, "step": 810 }, { "epoch": 0.03866231258789598, "grad_norm": 1.7460881471633911, "learning_rate": 1.9499336032469626e-05, "loss": 0.8585, "step": 811 }, { "epoch": 0.03870998498319548, "grad_norm": 2.7170376777648926, "learning_rate": 1.949810799736252e-05, "loss": 1.1543, "step": 812 }, { "epoch": 0.03875765737849498, "grad_norm": 1.212214469909668, "learning_rate": 1.949687849679516e-05, "loss": 0.6487, "step": 813 }, { "epoch": 0.038805329773794486, "grad_norm": 4.563180446624756, "learning_rate": 1.949564753095725e-05, "loss": 1.3672, "step": 814 }, { "epoch": 0.038853002169093986, "grad_norm": 1.9729410409927368, "learning_rate": 1.949441510003871e-05, "loss": 0.998, "step": 815 }, { "epoch": 0.038900674564393486, "grad_norm": 1.927474021911621, "learning_rate": 1.9493181204229696e-05, "loss": 0.96, "step": 816 }, { "epoch": 0.03894834695969299, "grad_norm": 1.0984028577804565, "learning_rate": 1.949194584372058e-05, "loss": 0.6064, "step": 817 }, { "epoch": 0.038996019354992494, "grad_norm": 2.086822748184204, "learning_rate": 1.9490709018701967e-05, "loss": 1.0785, "step": 818 }, { "epoch": 0.039043691750291994, "grad_norm": 2.5809688568115234, "learning_rate": 1.9489470729364694e-05, "loss": 0.7195, "step": 819 }, { "epoch": 0.039091364145591494, "grad_norm": 1.8625272512435913, "learning_rate": 1.9488230975899804e-05, "loss": 0.9258, "step": 820 }, { "epoch": 0.039139036540890994, "grad_norm": 1.8708604574203491, "learning_rate": 1.948698975849859e-05, "loss": 1.1211, "step": 821 }, { "epoch": 0.0391867089361905, "grad_norm": 1.9733126163482666, "learning_rate": 1.9485747077352547e-05, "loss": 1.0904, "step": 822 }, { "epoch": 0.03923438133149, "grad_norm": 1.703894019126892, "learning_rate": 1.948450293265342e-05, "loss": 0.6765, "step": 823 }, { "epoch": 0.0392820537267895, "grad_norm": 1.9978395700454712, "learning_rate": 1.948325732459316e-05, "loss": 0.539, "step": 824 }, { "epoch": 0.039329726122089, "grad_norm": 1.8047763109207153, "learning_rate": 1.948201025336395e-05, "loss": 0.9598, "step": 825 }, { "epoch": 0.03937739851738851, "grad_norm": 2.928663969039917, "learning_rate": 1.9480761719158208e-05, "loss": 0.7686, "step": 826 }, { "epoch": 0.03942507091268801, "grad_norm": 1.6891142129898071, "learning_rate": 1.9479511722168567e-05, "loss": 0.696, "step": 827 }, { "epoch": 0.03947274330798751, "grad_norm": 1.5037572383880615, "learning_rate": 1.947826026258788e-05, "loss": 0.9345, "step": 828 }, { "epoch": 0.03952041570328701, "grad_norm": 1.8177145719528198, "learning_rate": 1.947700734060925e-05, "loss": 0.7538, "step": 829 }, { "epoch": 0.039568088098586515, "grad_norm": 1.433455467224121, "learning_rate": 1.9475752956425978e-05, "loss": 0.7786, "step": 830 }, { "epoch": 0.039615760493886015, "grad_norm": 3.665915012359619, "learning_rate": 1.9474497110231607e-05, "loss": 1.3014, "step": 831 }, { "epoch": 0.039663432889185515, "grad_norm": 3.8191745281219482, "learning_rate": 1.94732398022199e-05, "loss": 0.9444, "step": 832 }, { "epoch": 0.03971110528448502, "grad_norm": 1.385881781578064, "learning_rate": 1.9471981032584846e-05, "loss": 0.5207, "step": 833 }, { "epoch": 0.03975877767978452, "grad_norm": 1.8778563737869263, "learning_rate": 1.9470720801520665e-05, "loss": 0.624, "step": 834 }, { "epoch": 0.03980645007508402, "grad_norm": 2.0841329097747803, "learning_rate": 1.946945910922179e-05, "loss": 0.4488, "step": 835 }, { "epoch": 0.03985412247038352, "grad_norm": 3.347536087036133, "learning_rate": 1.9468195955882892e-05, "loss": 0.8075, "step": 836 }, { "epoch": 0.03990179486568303, "grad_norm": 3.5374181270599365, "learning_rate": 1.946693134169886e-05, "loss": 0.7185, "step": 837 }, { "epoch": 0.03994946726098253, "grad_norm": 1.4362339973449707, "learning_rate": 1.9465665266864815e-05, "loss": 0.6004, "step": 838 }, { "epoch": 0.03999713965628203, "grad_norm": 2.940203905105591, "learning_rate": 1.9464397731576093e-05, "loss": 0.9549, "step": 839 }, { "epoch": 0.04004481205158153, "grad_norm": 1.480191946029663, "learning_rate": 1.946312873602827e-05, "loss": 0.6108, "step": 840 }, { "epoch": 0.04009248444688104, "grad_norm": 1.2709978818893433, "learning_rate": 1.9461858280417134e-05, "loss": 0.6637, "step": 841 }, { "epoch": 0.04014015684218054, "grad_norm": 1.8085658550262451, "learning_rate": 1.94605863649387e-05, "loss": 1.2645, "step": 842 }, { "epoch": 0.04018782923748004, "grad_norm": 1.4298356771469116, "learning_rate": 1.945931298978922e-05, "loss": 0.2238, "step": 843 }, { "epoch": 0.04023550163277954, "grad_norm": 9.08450698852539, "learning_rate": 1.9458038155165157e-05, "loss": 2.0859, "step": 844 }, { "epoch": 0.040283174028079044, "grad_norm": 2.0305638313293457, "learning_rate": 1.94567618612632e-05, "loss": 0.7403, "step": 845 }, { "epoch": 0.040330846423378544, "grad_norm": 2.2413032054901123, "learning_rate": 1.9455484108280277e-05, "loss": 0.8184, "step": 846 }, { "epoch": 0.040378518818678044, "grad_norm": 1.2897911071777344, "learning_rate": 1.945420489641353e-05, "loss": 0.4618, "step": 847 }, { "epoch": 0.040426191213977544, "grad_norm": 3.2265336513519287, "learning_rate": 1.945292422586033e-05, "loss": 1.0359, "step": 848 }, { "epoch": 0.04047386360927705, "grad_norm": 1.4464894533157349, "learning_rate": 1.9451642096818258e-05, "loss": 0.662, "step": 849 }, { "epoch": 0.04052153600457655, "grad_norm": 1.6860398054122925, "learning_rate": 1.9450358509485152e-05, "loss": 1.149, "step": 850 }, { "epoch": 0.04056920839987605, "grad_norm": 1.7164076566696167, "learning_rate": 1.9449073464059048e-05, "loss": 0.7881, "step": 851 }, { "epoch": 0.04061688079517555, "grad_norm": 1.583937644958496, "learning_rate": 1.9447786960738212e-05, "loss": 0.7219, "step": 852 }, { "epoch": 0.04066455319047506, "grad_norm": 2.1447505950927734, "learning_rate": 1.944649899972114e-05, "loss": 0.5416, "step": 853 }, { "epoch": 0.04071222558577456, "grad_norm": 2.40384840965271, "learning_rate": 1.9445209581206557e-05, "loss": 0.9789, "step": 854 }, { "epoch": 0.04075989798107406, "grad_norm": 1.6884760856628418, "learning_rate": 1.94439187053934e-05, "loss": 0.9452, "step": 855 }, { "epoch": 0.04080757037637356, "grad_norm": 1.678084373474121, "learning_rate": 1.9442626372480838e-05, "loss": 0.9266, "step": 856 }, { "epoch": 0.040855242771673066, "grad_norm": 1.510408878326416, "learning_rate": 1.944133258266827e-05, "loss": 0.5553, "step": 857 }, { "epoch": 0.040902915166972566, "grad_norm": 1.5903170108795166, "learning_rate": 1.944003733615531e-05, "loss": 0.8193, "step": 858 }, { "epoch": 0.040950587562272066, "grad_norm": 1.9291753768920898, "learning_rate": 1.9438740633141804e-05, "loss": 0.7051, "step": 859 }, { "epoch": 0.040998259957571566, "grad_norm": 1.2462096214294434, "learning_rate": 1.9437442473827818e-05, "loss": 0.4045, "step": 860 }, { "epoch": 0.04104593235287107, "grad_norm": 1.6083624362945557, "learning_rate": 1.9436142858413648e-05, "loss": 0.6697, "step": 861 }, { "epoch": 0.04109360474817057, "grad_norm": 1.9976297616958618, "learning_rate": 1.9434841787099804e-05, "loss": 0.8119, "step": 862 }, { "epoch": 0.04114127714347007, "grad_norm": 1.580064058303833, "learning_rate": 1.9433539260087033e-05, "loss": 1.0078, "step": 863 }, { "epoch": 0.04118894953876957, "grad_norm": 1.811970829963684, "learning_rate": 1.9432235277576304e-05, "loss": 0.8918, "step": 864 }, { "epoch": 0.04123662193406908, "grad_norm": 2.8056108951568604, "learning_rate": 1.9430929839768803e-05, "loss": 0.4796, "step": 865 }, { "epoch": 0.04128429432936858, "grad_norm": 3.3034982681274414, "learning_rate": 1.9429622946865946e-05, "loss": 0.9444, "step": 866 }, { "epoch": 0.04133196672466808, "grad_norm": 1.4448636770248413, "learning_rate": 1.9428314599069375e-05, "loss": 0.7058, "step": 867 }, { "epoch": 0.04137963911996758, "grad_norm": 1.519383192062378, "learning_rate": 1.9427004796580954e-05, "loss": 0.7638, "step": 868 }, { "epoch": 0.04142731151526709, "grad_norm": 4.433587074279785, "learning_rate": 1.9425693539602773e-05, "loss": 0.8145, "step": 869 }, { "epoch": 0.04147498391056659, "grad_norm": 1.7333942651748657, "learning_rate": 1.9424380828337146e-05, "loss": 0.7182, "step": 870 }, { "epoch": 0.04152265630586609, "grad_norm": 1.652495265007019, "learning_rate": 1.9423066662986607e-05, "loss": 0.3631, "step": 871 }, { "epoch": 0.04157032870116559, "grad_norm": 1.9444339275360107, "learning_rate": 1.942175104375392e-05, "loss": 1.16, "step": 872 }, { "epoch": 0.041618001096465095, "grad_norm": 2.744253396987915, "learning_rate": 1.9420433970842078e-05, "loss": 0.545, "step": 873 }, { "epoch": 0.041665673491764595, "grad_norm": 1.701715111732483, "learning_rate": 1.941911544445428e-05, "loss": 0.8751, "step": 874 }, { "epoch": 0.041713345887064095, "grad_norm": 1.4953023195266724, "learning_rate": 1.941779546479397e-05, "loss": 0.8856, "step": 875 }, { "epoch": 0.041761018282363595, "grad_norm": 1.8698402643203735, "learning_rate": 1.9416474032064803e-05, "loss": 0.6765, "step": 876 }, { "epoch": 0.0418086906776631, "grad_norm": 3.711998224258423, "learning_rate": 1.9415151146470665e-05, "loss": 0.8479, "step": 877 }, { "epoch": 0.0418563630729626, "grad_norm": 2.123957633972168, "learning_rate": 1.9413826808215665e-05, "loss": 0.6198, "step": 878 }, { "epoch": 0.0419040354682621, "grad_norm": 1.6366348266601562, "learning_rate": 1.941250101750413e-05, "loss": 0.7139, "step": 879 }, { "epoch": 0.0419517078635616, "grad_norm": 1.9562444686889648, "learning_rate": 1.9411173774540616e-05, "loss": 0.6294, "step": 880 }, { "epoch": 0.04199938025886111, "grad_norm": 3.5185775756835938, "learning_rate": 1.9409845079529907e-05, "loss": 0.9229, "step": 881 }, { "epoch": 0.04204705265416061, "grad_norm": 1.930294394493103, "learning_rate": 1.9408514932677e-05, "loss": 0.8511, "step": 882 }, { "epoch": 0.04209472504946011, "grad_norm": 1.7541227340698242, "learning_rate": 1.9407183334187132e-05, "loss": 0.8352, "step": 883 }, { "epoch": 0.04214239744475961, "grad_norm": 1.5182242393493652, "learning_rate": 1.940585028426575e-05, "loss": 0.6217, "step": 884 }, { "epoch": 0.042190069840059116, "grad_norm": 1.8734807968139648, "learning_rate": 1.9404515783118533e-05, "loss": 0.6807, "step": 885 }, { "epoch": 0.042237742235358616, "grad_norm": 2.094998598098755, "learning_rate": 1.9403179830951376e-05, "loss": 0.4722, "step": 886 }, { "epoch": 0.042285414630658116, "grad_norm": 1.5760083198547363, "learning_rate": 1.9401842427970406e-05, "loss": 0.9682, "step": 887 }, { "epoch": 0.042333087025957616, "grad_norm": 1.9804329872131348, "learning_rate": 1.940050357438197e-05, "loss": 0.2387, "step": 888 }, { "epoch": 0.04238075942125712, "grad_norm": 1.821393609046936, "learning_rate": 1.9399163270392637e-05, "loss": 0.9595, "step": 889 }, { "epoch": 0.042428431816556623, "grad_norm": 14.376471519470215, "learning_rate": 1.9397821516209207e-05, "loss": 0.4811, "step": 890 }, { "epoch": 0.042476104211856124, "grad_norm": 3.35683012008667, "learning_rate": 1.9396478312038694e-05, "loss": 1.1819, "step": 891 }, { "epoch": 0.042523776607155624, "grad_norm": 2.020669937133789, "learning_rate": 1.9395133658088344e-05, "loss": 1.2711, "step": 892 }, { "epoch": 0.04257144900245513, "grad_norm": 1.6300127506256104, "learning_rate": 1.9393787554565618e-05, "loss": 1.1336, "step": 893 }, { "epoch": 0.04261912139775463, "grad_norm": 2.556307554244995, "learning_rate": 1.9392440001678213e-05, "loss": 1.2215, "step": 894 }, { "epoch": 0.04266679379305413, "grad_norm": 1.5415953397750854, "learning_rate": 1.9391090999634038e-05, "loss": 1.1214, "step": 895 }, { "epoch": 0.04271446618835363, "grad_norm": 1.6759424209594727, "learning_rate": 1.9389740548641232e-05, "loss": 0.6674, "step": 896 }, { "epoch": 0.04276213858365314, "grad_norm": 1.8798645734786987, "learning_rate": 1.9388388648908156e-05, "loss": 0.8414, "step": 897 }, { "epoch": 0.04280981097895264, "grad_norm": 1.722214698791504, "learning_rate": 1.9387035300643392e-05, "loss": 0.8246, "step": 898 }, { "epoch": 0.04285748337425214, "grad_norm": 1.241234302520752, "learning_rate": 1.9385680504055746e-05, "loss": 0.7037, "step": 899 }, { "epoch": 0.04290515576955164, "grad_norm": 1.4477009773254395, "learning_rate": 1.9384324259354254e-05, "loss": 0.5023, "step": 900 }, { "epoch": 0.042952828164851145, "grad_norm": 1.56638503074646, "learning_rate": 1.938296656674817e-05, "loss": 0.7424, "step": 901 }, { "epoch": 0.043000500560150645, "grad_norm": 3.264298677444458, "learning_rate": 1.938160742644697e-05, "loss": 0.6612, "step": 902 }, { "epoch": 0.043048172955450145, "grad_norm": 1.6142951250076294, "learning_rate": 1.9380246838660356e-05, "loss": 0.9295, "step": 903 }, { "epoch": 0.043095845350749645, "grad_norm": 2.484809637069702, "learning_rate": 1.937888480359825e-05, "loss": 1.0406, "step": 904 }, { "epoch": 0.04314351774604915, "grad_norm": 1.3895137310028076, "learning_rate": 1.9377521321470806e-05, "loss": 1.0379, "step": 905 }, { "epoch": 0.04319119014134865, "grad_norm": 1.5292794704437256, "learning_rate": 1.937615639248839e-05, "loss": 0.7214, "step": 906 }, { "epoch": 0.04323886253664815, "grad_norm": 1.3762729167938232, "learning_rate": 1.93747900168616e-05, "loss": 0.6806, "step": 907 }, { "epoch": 0.04328653493194765, "grad_norm": 4.4749755859375, "learning_rate": 1.937342219480125e-05, "loss": 0.621, "step": 908 }, { "epoch": 0.04333420732724716, "grad_norm": 1.5729974508285522, "learning_rate": 1.9372052926518386e-05, "loss": 0.8714, "step": 909 }, { "epoch": 0.04338187972254666, "grad_norm": 1.6284842491149902, "learning_rate": 1.937068221222427e-05, "loss": 0.6602, "step": 910 }, { "epoch": 0.04342955211784616, "grad_norm": 4.40969181060791, "learning_rate": 1.936931005213038e-05, "loss": 1.0772, "step": 911 }, { "epoch": 0.04347722451314566, "grad_norm": 3.7697627544403076, "learning_rate": 1.936793644644844e-05, "loss": 1.2258, "step": 912 }, { "epoch": 0.04352489690844517, "grad_norm": 1.448448896408081, "learning_rate": 1.936656139539038e-05, "loss": 0.7728, "step": 913 }, { "epoch": 0.04357256930374467, "grad_norm": 1.5610464811325073, "learning_rate": 1.936518489916835e-05, "loss": 0.7313, "step": 914 }, { "epoch": 0.04362024169904417, "grad_norm": 7.224313735961914, "learning_rate": 1.936380695799473e-05, "loss": 0.9441, "step": 915 }, { "epoch": 0.04366791409434367, "grad_norm": 1.8962578773498535, "learning_rate": 1.936242757208213e-05, "loss": 0.9251, "step": 916 }, { "epoch": 0.043715586489643174, "grad_norm": 4.338345527648926, "learning_rate": 1.936104674164337e-05, "loss": 1.07, "step": 917 }, { "epoch": 0.043763258884942674, "grad_norm": 1.7759220600128174, "learning_rate": 1.9359664466891495e-05, "loss": 0.9401, "step": 918 }, { "epoch": 0.043810931280242174, "grad_norm": 1.3625247478485107, "learning_rate": 1.9358280748039776e-05, "loss": 0.6422, "step": 919 }, { "epoch": 0.043858603675541674, "grad_norm": 5.836887359619141, "learning_rate": 1.9356895585301715e-05, "loss": 1.2319, "step": 920 }, { "epoch": 0.04390627607084118, "grad_norm": 2.191404342651367, "learning_rate": 1.935550897889102e-05, "loss": 1.1688, "step": 921 }, { "epoch": 0.04395394846614068, "grad_norm": 1.1320335865020752, "learning_rate": 1.9354120929021633e-05, "loss": 0.8847, "step": 922 }, { "epoch": 0.04400162086144018, "grad_norm": 1.4333887100219727, "learning_rate": 1.9352731435907715e-05, "loss": 0.702, "step": 923 }, { "epoch": 0.04404929325673969, "grad_norm": 1.5401864051818848, "learning_rate": 1.9351340499763654e-05, "loss": 0.8675, "step": 924 }, { "epoch": 0.04409696565203919, "grad_norm": 1.810653805732727, "learning_rate": 1.934994812080405e-05, "loss": 0.7717, "step": 925 }, { "epoch": 0.04414463804733869, "grad_norm": 1.6668885946273804, "learning_rate": 1.9348554299243737e-05, "loss": 0.6602, "step": 926 }, { "epoch": 0.04419231044263819, "grad_norm": 0.8640025854110718, "learning_rate": 1.934715903529777e-05, "loss": 0.372, "step": 927 }, { "epoch": 0.044239982837937696, "grad_norm": 6.436641693115234, "learning_rate": 1.934576232918142e-05, "loss": 0.4044, "step": 928 }, { "epoch": 0.044287655233237196, "grad_norm": 3.303285598754883, "learning_rate": 1.9344364181110185e-05, "loss": 1.3519, "step": 929 }, { "epoch": 0.044335327628536696, "grad_norm": 2.375943183898926, "learning_rate": 1.9342964591299785e-05, "loss": 0.4268, "step": 930 }, { "epoch": 0.044383000023836196, "grad_norm": 2.2730588912963867, "learning_rate": 1.934156355996616e-05, "loss": 0.5779, "step": 931 }, { "epoch": 0.0444306724191357, "grad_norm": 3.625521183013916, "learning_rate": 1.9340161087325483e-05, "loss": 0.8949, "step": 932 }, { "epoch": 0.0444783448144352, "grad_norm": 1.8681919574737549, "learning_rate": 1.9338757173594128e-05, "loss": 1.163, "step": 933 }, { "epoch": 0.0445260172097347, "grad_norm": 1.3595960140228271, "learning_rate": 1.9337351818988718e-05, "loss": 0.6256, "step": 934 }, { "epoch": 0.0445736896050342, "grad_norm": 1.5224964618682861, "learning_rate": 1.9335945023726076e-05, "loss": 0.7347, "step": 935 }, { "epoch": 0.04462136200033371, "grad_norm": 1.216257095336914, "learning_rate": 1.933453678802326e-05, "loss": 0.537, "step": 936 }, { "epoch": 0.04466903439563321, "grad_norm": 1.4969592094421387, "learning_rate": 1.9333127112097543e-05, "loss": 0.6825, "step": 937 }, { "epoch": 0.04471670679093271, "grad_norm": 1.4242290258407593, "learning_rate": 1.9331715996166424e-05, "loss": 0.5725, "step": 938 }, { "epoch": 0.04476437918623221, "grad_norm": 1.623018503189087, "learning_rate": 1.9330303440447627e-05, "loss": 0.6613, "step": 939 }, { "epoch": 0.04481205158153172, "grad_norm": 1.5217169523239136, "learning_rate": 1.9328889445159094e-05, "loss": 0.6965, "step": 940 }, { "epoch": 0.04485972397683122, "grad_norm": 2.455193281173706, "learning_rate": 1.9327474010518983e-05, "loss": 0.7261, "step": 941 }, { "epoch": 0.04490739637213072, "grad_norm": 3.400991201400757, "learning_rate": 1.932605713674569e-05, "loss": 0.9366, "step": 942 }, { "epoch": 0.04495506876743022, "grad_norm": 3.395486354827881, "learning_rate": 1.932463882405782e-05, "loss": 0.8198, "step": 943 }, { "epoch": 0.045002741162729724, "grad_norm": 3.2462515830993652, "learning_rate": 1.9323219072674207e-05, "loss": 1.3122, "step": 944 }, { "epoch": 0.045050413558029224, "grad_norm": 2.7918636798858643, "learning_rate": 1.9321797882813903e-05, "loss": 1.2664, "step": 945 }, { "epoch": 0.045098085953328725, "grad_norm": 9.712040901184082, "learning_rate": 1.9320375254696177e-05, "loss": 0.6191, "step": 946 }, { "epoch": 0.045145758348628225, "grad_norm": 2.626828908920288, "learning_rate": 1.9318951188540534e-05, "loss": 0.3006, "step": 947 }, { "epoch": 0.04519343074392773, "grad_norm": 1.2030017375946045, "learning_rate": 1.9317525684566686e-05, "loss": 0.4254, "step": 948 }, { "epoch": 0.04524110313922723, "grad_norm": 1.4220699071884155, "learning_rate": 1.9316098742994578e-05, "loss": 0.7015, "step": 949 }, { "epoch": 0.04528877553452673, "grad_norm": 1.5147393941879272, "learning_rate": 1.9314670364044374e-05, "loss": 0.6388, "step": 950 }, { "epoch": 0.04533644792982623, "grad_norm": 1.8807706832885742, "learning_rate": 1.931324054793645e-05, "loss": 0.7108, "step": 951 }, { "epoch": 0.04538412032512574, "grad_norm": 1.460456132888794, "learning_rate": 1.9311809294891422e-05, "loss": 0.7543, "step": 952 }, { "epoch": 0.04543179272042524, "grad_norm": 1.7707245349884033, "learning_rate": 1.931037660513011e-05, "loss": 1.1482, "step": 953 }, { "epoch": 0.04547946511572474, "grad_norm": 1.3123273849487305, "learning_rate": 1.930894247887357e-05, "loss": 0.728, "step": 954 }, { "epoch": 0.04552713751102424, "grad_norm": 0.8915164470672607, "learning_rate": 1.9307506916343066e-05, "loss": 0.3218, "step": 955 }, { "epoch": 0.045574809906323746, "grad_norm": 2.099499225616455, "learning_rate": 1.930606991776009e-05, "loss": 0.9247, "step": 956 }, { "epoch": 0.045622482301623246, "grad_norm": 3.0598840713500977, "learning_rate": 1.9304631483346364e-05, "loss": 0.6489, "step": 957 }, { "epoch": 0.045670154696922746, "grad_norm": 1.5167876482009888, "learning_rate": 1.930319161332382e-05, "loss": 0.7977, "step": 958 }, { "epoch": 0.045717827092222246, "grad_norm": 6.17631721496582, "learning_rate": 1.930175030791461e-05, "loss": 1.1156, "step": 959 }, { "epoch": 0.04576549948752175, "grad_norm": 1.7657543420791626, "learning_rate": 1.9300307567341124e-05, "loss": 0.7452, "step": 960 }, { "epoch": 0.04581317188282125, "grad_norm": 1.6475632190704346, "learning_rate": 1.9298863391825954e-05, "loss": 0.4463, "step": 961 }, { "epoch": 0.04586084427812075, "grad_norm": 4.049325942993164, "learning_rate": 1.929741778159192e-05, "loss": 0.4196, "step": 962 }, { "epoch": 0.045908516673420253, "grad_norm": 4.7456865310668945, "learning_rate": 1.9295970736862063e-05, "loss": 0.7544, "step": 963 }, { "epoch": 0.04595618906871976, "grad_norm": 1.6404321193695068, "learning_rate": 1.9294522257859655e-05, "loss": 0.8724, "step": 964 }, { "epoch": 0.04600386146401926, "grad_norm": 1.412407398223877, "learning_rate": 1.929307234480818e-05, "loss": 0.9744, "step": 965 }, { "epoch": 0.04605153385931876, "grad_norm": 3.00014591217041, "learning_rate": 1.929162099793134e-05, "loss": 0.9947, "step": 966 }, { "epoch": 0.04609920625461826, "grad_norm": 1.8120754957199097, "learning_rate": 1.9290168217453066e-05, "loss": 1.1211, "step": 967 }, { "epoch": 0.04614687864991777, "grad_norm": 1.742107629776001, "learning_rate": 1.9288714003597504e-05, "loss": 1.0559, "step": 968 }, { "epoch": 0.04619455104521727, "grad_norm": 1.3778812885284424, "learning_rate": 1.928725835658903e-05, "loss": 0.5452, "step": 969 }, { "epoch": 0.04624222344051677, "grad_norm": 1.5840113162994385, "learning_rate": 1.9285801276652226e-05, "loss": 0.7015, "step": 970 }, { "epoch": 0.04628989583581627, "grad_norm": 2.1550915241241455, "learning_rate": 1.9284342764011917e-05, "loss": 0.838, "step": 971 }, { "epoch": 0.046337568231115775, "grad_norm": 3.1609108448028564, "learning_rate": 1.9282882818893126e-05, "loss": 0.9829, "step": 972 }, { "epoch": 0.046385240626415275, "grad_norm": 4.29646635055542, "learning_rate": 1.9281421441521113e-05, "loss": 1.0196, "step": 973 }, { "epoch": 0.046432913021714775, "grad_norm": 1.2184792757034302, "learning_rate": 1.927995863212135e-05, "loss": 0.5431, "step": 974 }, { "epoch": 0.046480585417014275, "grad_norm": 2.432730197906494, "learning_rate": 1.9278494390919538e-05, "loss": 0.7913, "step": 975 }, { "epoch": 0.04652825781231378, "grad_norm": 1.3183215856552124, "learning_rate": 1.927702871814159e-05, "loss": 0.5146, "step": 976 }, { "epoch": 0.04657593020761328, "grad_norm": 1.6891658306121826, "learning_rate": 1.9275561614013644e-05, "loss": 0.9708, "step": 977 }, { "epoch": 0.04662360260291278, "grad_norm": 1.7222672700881958, "learning_rate": 1.9274093078762063e-05, "loss": 0.4512, "step": 978 }, { "epoch": 0.04667127499821228, "grad_norm": 1.4556636810302734, "learning_rate": 1.9272623112613425e-05, "loss": 0.8314, "step": 979 }, { "epoch": 0.04671894739351179, "grad_norm": 1.9452040195465088, "learning_rate": 1.927115171579453e-05, "loss": 1.1497, "step": 980 }, { "epoch": 0.04676661978881129, "grad_norm": 1.5137896537780762, "learning_rate": 1.9269678888532394e-05, "loss": 0.8747, "step": 981 }, { "epoch": 0.04681429218411079, "grad_norm": 1.9581245183944702, "learning_rate": 1.926820463105427e-05, "loss": 0.5568, "step": 982 }, { "epoch": 0.04686196457941029, "grad_norm": 1.609912395477295, "learning_rate": 1.9266728943587615e-05, "loss": 0.6783, "step": 983 }, { "epoch": 0.0469096369747098, "grad_norm": 1.1952351331710815, "learning_rate": 1.926525182636011e-05, "loss": 0.5354, "step": 984 }, { "epoch": 0.0469573093700093, "grad_norm": 2.4685332775115967, "learning_rate": 1.926377327959967e-05, "loss": 0.801, "step": 985 }, { "epoch": 0.0470049817653088, "grad_norm": 1.0144199132919312, "learning_rate": 1.9262293303534403e-05, "loss": 0.3081, "step": 986 }, { "epoch": 0.0470526541606083, "grad_norm": 1.8896009922027588, "learning_rate": 1.9260811898392665e-05, "loss": 0.8822, "step": 987 }, { "epoch": 0.047100326555907804, "grad_norm": 1.4001494646072388, "learning_rate": 1.925932906440302e-05, "loss": 0.5165, "step": 988 }, { "epoch": 0.047147998951207304, "grad_norm": 2.435675621032715, "learning_rate": 1.9257844801794253e-05, "loss": 0.8977, "step": 989 }, { "epoch": 0.047195671346506804, "grad_norm": 1.3700307607650757, "learning_rate": 1.925635911079537e-05, "loss": 0.7306, "step": 990 }, { "epoch": 0.047243343741806304, "grad_norm": 2.761582851409912, "learning_rate": 1.9254871991635598e-05, "loss": 1.0047, "step": 991 }, { "epoch": 0.04729101613710581, "grad_norm": 1.5248297452926636, "learning_rate": 1.9253383444544386e-05, "loss": 0.6427, "step": 992 }, { "epoch": 0.04733868853240531, "grad_norm": 1.0941801071166992, "learning_rate": 1.9251893469751396e-05, "loss": 0.3129, "step": 993 }, { "epoch": 0.04738636092770481, "grad_norm": 1.0071507692337036, "learning_rate": 1.9250402067486523e-05, "loss": 0.3861, "step": 994 }, { "epoch": 0.04743403332300431, "grad_norm": 1.5965173244476318, "learning_rate": 1.924890923797987e-05, "loss": 1.0344, "step": 995 }, { "epoch": 0.04748170571830382, "grad_norm": 1.3915090560913086, "learning_rate": 1.9247414981461768e-05, "loss": 1.0315, "step": 996 }, { "epoch": 0.04752937811360332, "grad_norm": 1.3044167757034302, "learning_rate": 1.9245919298162763e-05, "loss": 0.6425, "step": 997 }, { "epoch": 0.04757705050890282, "grad_norm": 1.6621586084365845, "learning_rate": 1.9244422188313624e-05, "loss": 0.8937, "step": 998 }, { "epoch": 0.04762472290420232, "grad_norm": 2.1778156757354736, "learning_rate": 1.9242923652145345e-05, "loss": 1.0258, "step": 999 }, { "epoch": 0.047672395299501825, "grad_norm": 1.8101787567138672, "learning_rate": 1.9241423689889126e-05, "loss": 0.7544, "step": 1000 }, { "epoch": 0.047720067694801326, "grad_norm": 1.024300217628479, "learning_rate": 1.9239922301776404e-05, "loss": 0.4062, "step": 1001 }, { "epoch": 0.047767740090100826, "grad_norm": 1.6066406965255737, "learning_rate": 1.923841948803882e-05, "loss": 1.1287, "step": 1002 }, { "epoch": 0.047815412485400326, "grad_norm": 1.7720218896865845, "learning_rate": 1.9236915248908244e-05, "loss": 1.1275, "step": 1003 }, { "epoch": 0.04786308488069983, "grad_norm": 2.4469680786132812, "learning_rate": 1.9235409584616774e-05, "loss": 0.9508, "step": 1004 }, { "epoch": 0.04791075727599933, "grad_norm": 2.878873825073242, "learning_rate": 1.9233902495396707e-05, "loss": 0.6744, "step": 1005 }, { "epoch": 0.04795842967129883, "grad_norm": 1.3485652208328247, "learning_rate": 1.9232393981480576e-05, "loss": 0.531, "step": 1006 }, { "epoch": 0.04800610206659833, "grad_norm": 1.3326756954193115, "learning_rate": 1.923088404310113e-05, "loss": 0.3869, "step": 1007 }, { "epoch": 0.04805377446189784, "grad_norm": 2.376333475112915, "learning_rate": 1.9229372680491334e-05, "loss": 0.7226, "step": 1008 }, { "epoch": 0.04810144685719734, "grad_norm": 1.694778323173523, "learning_rate": 1.922785989388438e-05, "loss": 1.0696, "step": 1009 }, { "epoch": 0.04814911925249684, "grad_norm": 1.7380653619766235, "learning_rate": 1.922634568351367e-05, "loss": 0.3589, "step": 1010 }, { "epoch": 0.04819679164779634, "grad_norm": 1.8209391832351685, "learning_rate": 1.922483004961284e-05, "loss": 0.7724, "step": 1011 }, { "epoch": 0.04824446404309585, "grad_norm": 4.721229553222656, "learning_rate": 1.9223312992415723e-05, "loss": 0.5786, "step": 1012 }, { "epoch": 0.04829213643839535, "grad_norm": 1.639879822731018, "learning_rate": 1.9221794512156394e-05, "loss": 0.8323, "step": 1013 }, { "epoch": 0.04833980883369485, "grad_norm": 1.9955905675888062, "learning_rate": 1.9220274609069143e-05, "loss": 0.9613, "step": 1014 }, { "epoch": 0.048387481228994354, "grad_norm": 1.4991236925125122, "learning_rate": 1.921875328338847e-05, "loss": 0.6176, "step": 1015 }, { "epoch": 0.048435153624293854, "grad_norm": 1.8712117671966553, "learning_rate": 1.9217230535349097e-05, "loss": 0.7881, "step": 1016 }, { "epoch": 0.048482826019593354, "grad_norm": 1.6780242919921875, "learning_rate": 1.9215706365185973e-05, "loss": 0.9323, "step": 1017 }, { "epoch": 0.048530498414892854, "grad_norm": 1.497131586074829, "learning_rate": 1.9214180773134257e-05, "loss": 0.6449, "step": 1018 }, { "epoch": 0.04857817081019236, "grad_norm": 1.2731022834777832, "learning_rate": 1.921265375942934e-05, "loss": 0.5281, "step": 1019 }, { "epoch": 0.04862584320549186, "grad_norm": 1.69632089138031, "learning_rate": 1.9211125324306816e-05, "loss": 0.6828, "step": 1020 }, { "epoch": 0.04867351560079136, "grad_norm": 2.5650973320007324, "learning_rate": 1.9209595468002515e-05, "loss": 0.4365, "step": 1021 }, { "epoch": 0.04872118799609086, "grad_norm": 1.9558252096176147, "learning_rate": 1.920806419075247e-05, "loss": 1.0096, "step": 1022 }, { "epoch": 0.04876886039139037, "grad_norm": 1.3596681356430054, "learning_rate": 1.9206531492792945e-05, "loss": 0.3994, "step": 1023 }, { "epoch": 0.04881653278668987, "grad_norm": 0.8595488667488098, "learning_rate": 1.9204997374360423e-05, "loss": 0.3795, "step": 1024 }, { "epoch": 0.04886420518198937, "grad_norm": 2.1809303760528564, "learning_rate": 1.9203461835691596e-05, "loss": 0.5414, "step": 1025 }, { "epoch": 0.04891187757728887, "grad_norm": 3.2933461666107178, "learning_rate": 1.9201924877023388e-05, "loss": 1.125, "step": 1026 }, { "epoch": 0.048959549972588376, "grad_norm": 1.5086666345596313, "learning_rate": 1.9200386498592932e-05, "loss": 0.6402, "step": 1027 }, { "epoch": 0.049007222367887876, "grad_norm": 2.256472587585449, "learning_rate": 1.9198846700637582e-05, "loss": 0.8795, "step": 1028 }, { "epoch": 0.049054894763187376, "grad_norm": 1.6830346584320068, "learning_rate": 1.9197305483394917e-05, "loss": 0.7627, "step": 1029 }, { "epoch": 0.049102567158486876, "grad_norm": 6.12313175201416, "learning_rate": 1.9195762847102732e-05, "loss": 1.9816, "step": 1030 }, { "epoch": 0.04915023955378638, "grad_norm": 1.7161015272140503, "learning_rate": 1.9194218791999037e-05, "loss": 0.9729, "step": 1031 }, { "epoch": 0.04919791194908588, "grad_norm": 5.440549373626709, "learning_rate": 1.9192673318322062e-05, "loss": 0.4963, "step": 1032 }, { "epoch": 0.04924558434438538, "grad_norm": 1.7278692722320557, "learning_rate": 1.9191126426310264e-05, "loss": 1.2882, "step": 1033 }, { "epoch": 0.04929325673968488, "grad_norm": 1.1383099555969238, "learning_rate": 1.918957811620231e-05, "loss": 0.4966, "step": 1034 }, { "epoch": 0.04934092913498439, "grad_norm": 2.120032787322998, "learning_rate": 1.9188028388237084e-05, "loss": 0.3744, "step": 1035 }, { "epoch": 0.04938860153028389, "grad_norm": 2.3618710041046143, "learning_rate": 1.9186477242653693e-05, "loss": 0.4978, "step": 1036 }, { "epoch": 0.04943627392558339, "grad_norm": 3.8753726482391357, "learning_rate": 1.9184924679691474e-05, "loss": 0.1962, "step": 1037 }, { "epoch": 0.04948394632088289, "grad_norm": 1.73308265209198, "learning_rate": 1.9183370699589954e-05, "loss": 0.7439, "step": 1038 }, { "epoch": 0.0495316187161824, "grad_norm": 4.969413757324219, "learning_rate": 1.918181530258891e-05, "loss": 1.3968, "step": 1039 }, { "epoch": 0.0495792911114819, "grad_norm": 1.4473387002944946, "learning_rate": 1.918025848892832e-05, "loss": 1.0092, "step": 1040 }, { "epoch": 0.0496269635067814, "grad_norm": 1.799941897392273, "learning_rate": 1.9178700258848383e-05, "loss": 0.7403, "step": 1041 }, { "epoch": 0.0496746359020809, "grad_norm": 1.4860204458236694, "learning_rate": 1.9177140612589517e-05, "loss": 0.5133, "step": 1042 }, { "epoch": 0.049722308297380405, "grad_norm": 2.0992960929870605, "learning_rate": 1.9175579550392362e-05, "loss": 0.2272, "step": 1043 }, { "epoch": 0.049769980692679905, "grad_norm": 8.47647476196289, "learning_rate": 1.9174017072497773e-05, "loss": 0.5295, "step": 1044 }, { "epoch": 0.049817653087979405, "grad_norm": 1.3025389909744263, "learning_rate": 1.9172453179146822e-05, "loss": 0.5968, "step": 1045 }, { "epoch": 0.049865325483278905, "grad_norm": 2.5897867679595947, "learning_rate": 1.9170887870580806e-05, "loss": 1.284, "step": 1046 }, { "epoch": 0.04991299787857841, "grad_norm": 1.8203742504119873, "learning_rate": 1.9169321147041234e-05, "loss": 0.79, "step": 1047 }, { "epoch": 0.04996067027387791, "grad_norm": 1.2220009565353394, "learning_rate": 1.916775300876983e-05, "loss": 0.7042, "step": 1048 }, { "epoch": 0.05000834266917741, "grad_norm": 1.4266115427017212, "learning_rate": 1.916618345600855e-05, "loss": 0.7694, "step": 1049 }, { "epoch": 0.05005601506447691, "grad_norm": 1.64155113697052, "learning_rate": 1.9164612488999556e-05, "loss": 0.8167, "step": 1050 }, { "epoch": 0.05010368745977642, "grad_norm": 1.6256442070007324, "learning_rate": 1.916304010798523e-05, "loss": 0.6897, "step": 1051 }, { "epoch": 0.05015135985507592, "grad_norm": 3.0482420921325684, "learning_rate": 1.916146631320818e-05, "loss": 0.7566, "step": 1052 }, { "epoch": 0.05019903225037542, "grad_norm": 1.2535357475280762, "learning_rate": 1.915989110491122e-05, "loss": 0.3171, "step": 1053 }, { "epoch": 0.05024670464567492, "grad_norm": 1.5403010845184326, "learning_rate": 1.9158314483337394e-05, "loss": 0.669, "step": 1054 }, { "epoch": 0.050294377040974426, "grad_norm": 1.7651960849761963, "learning_rate": 1.9156736448729952e-05, "loss": 1.1459, "step": 1055 }, { "epoch": 0.05034204943627393, "grad_norm": 2.693230152130127, "learning_rate": 1.9155157001332374e-05, "loss": 0.6859, "step": 1056 }, { "epoch": 0.05038972183157343, "grad_norm": 1.4504109621047974, "learning_rate": 1.915357614138835e-05, "loss": 0.7047, "step": 1057 }, { "epoch": 0.05043739422687293, "grad_norm": 1.6796247959136963, "learning_rate": 1.915199386914179e-05, "loss": 0.7399, "step": 1058 }, { "epoch": 0.050485066622172434, "grad_norm": 3.7206671237945557, "learning_rate": 1.9150410184836826e-05, "loss": 1.3115, "step": 1059 }, { "epoch": 0.050532739017471934, "grad_norm": 2.770829916000366, "learning_rate": 1.91488250887178e-05, "loss": 1.1037, "step": 1060 }, { "epoch": 0.050580411412771434, "grad_norm": 1.524247169494629, "learning_rate": 1.9147238581029276e-05, "loss": 0.752, "step": 1061 }, { "epoch": 0.050628083808070934, "grad_norm": 2.784879684448242, "learning_rate": 1.914565066201604e-05, "loss": 0.5708, "step": 1062 }, { "epoch": 0.05067575620337044, "grad_norm": 3.14414381980896, "learning_rate": 1.9144061331923086e-05, "loss": 0.5818, "step": 1063 }, { "epoch": 0.05072342859866994, "grad_norm": 2.0844810009002686, "learning_rate": 1.9142470590995636e-05, "loss": 0.662, "step": 1064 }, { "epoch": 0.05077110099396944, "grad_norm": 1.2482565641403198, "learning_rate": 1.9140878439479123e-05, "loss": 0.5516, "step": 1065 }, { "epoch": 0.05081877338926894, "grad_norm": 1.4249321222305298, "learning_rate": 1.9139284877619196e-05, "loss": 0.4251, "step": 1066 }, { "epoch": 0.05086644578456845, "grad_norm": 1.3449572324752808, "learning_rate": 1.9137689905661733e-05, "loss": 1.008, "step": 1067 }, { "epoch": 0.05091411817986795, "grad_norm": 1.894120216369629, "learning_rate": 1.9136093523852817e-05, "loss": 0.9094, "step": 1068 }, { "epoch": 0.05096179057516745, "grad_norm": 2.2801637649536133, "learning_rate": 1.9134495732438755e-05, "loss": 1.0731, "step": 1069 }, { "epoch": 0.05100946297046695, "grad_norm": 2.6421236991882324, "learning_rate": 1.9132896531666067e-05, "loss": 1.0985, "step": 1070 }, { "epoch": 0.051057135365766455, "grad_norm": 4.324747562408447, "learning_rate": 1.9131295921781495e-05, "loss": 0.7025, "step": 1071 }, { "epoch": 0.051104807761065955, "grad_norm": 2.1336493492126465, "learning_rate": 1.9129693903031995e-05, "loss": 0.9208, "step": 1072 }, { "epoch": 0.051152480156365455, "grad_norm": 1.2150462865829468, "learning_rate": 1.9128090475664748e-05, "loss": 0.8818, "step": 1073 }, { "epoch": 0.051200152551664956, "grad_norm": 1.9461040496826172, "learning_rate": 1.9126485639927137e-05, "loss": 0.7112, "step": 1074 }, { "epoch": 0.05124782494696446, "grad_norm": 1.807349681854248, "learning_rate": 1.9124879396066778e-05, "loss": 0.9164, "step": 1075 }, { "epoch": 0.05129549734226396, "grad_norm": 3.4449281692504883, "learning_rate": 1.9123271744331494e-05, "loss": 1.3733, "step": 1076 }, { "epoch": 0.05134316973756346, "grad_norm": 1.3120534420013428, "learning_rate": 1.9121662684969337e-05, "loss": 0.883, "step": 1077 }, { "epoch": 0.05139084213286296, "grad_norm": 1.1500074863433838, "learning_rate": 1.9120052218228558e-05, "loss": 0.6904, "step": 1078 }, { "epoch": 0.05143851452816247, "grad_norm": 1.375466227531433, "learning_rate": 1.911844034435764e-05, "loss": 0.6998, "step": 1079 }, { "epoch": 0.05148618692346197, "grad_norm": 5.473826885223389, "learning_rate": 1.911682706360528e-05, "loss": 0.5018, "step": 1080 }, { "epoch": 0.05153385931876147, "grad_norm": 1.468106985092163, "learning_rate": 1.9115212376220392e-05, "loss": 0.8559, "step": 1081 }, { "epoch": 0.05158153171406097, "grad_norm": 7.54047155380249, "learning_rate": 1.91135962824521e-05, "loss": 1.0628, "step": 1082 }, { "epoch": 0.05162920410936048, "grad_norm": 2.1193039417266846, "learning_rate": 1.911197878254975e-05, "loss": 0.7618, "step": 1083 }, { "epoch": 0.05167687650465998, "grad_norm": 1.572967290878296, "learning_rate": 1.9110359876762913e-05, "loss": 0.8235, "step": 1084 }, { "epoch": 0.05172454889995948, "grad_norm": 1.452972173690796, "learning_rate": 1.9108739565341365e-05, "loss": 0.7287, "step": 1085 }, { "epoch": 0.05177222129525898, "grad_norm": 2.0626370906829834, "learning_rate": 1.9107117848535105e-05, "loss": 0.8926, "step": 1086 }, { "epoch": 0.051819893690558484, "grad_norm": 2.210892915725708, "learning_rate": 1.9105494726594344e-05, "loss": 1.0116, "step": 1087 }, { "epoch": 0.051867566085857984, "grad_norm": 2.8380939960479736, "learning_rate": 1.910387019976952e-05, "loss": 0.6746, "step": 1088 }, { "epoch": 0.051915238481157484, "grad_norm": 1.2834552526474, "learning_rate": 1.910224426831127e-05, "loss": 0.6688, "step": 1089 }, { "epoch": 0.051962910876456984, "grad_norm": 2.0969395637512207, "learning_rate": 1.910061693247047e-05, "loss": 0.7519, "step": 1090 }, { "epoch": 0.05201058327175649, "grad_norm": 2.172556161880493, "learning_rate": 1.909898819249819e-05, "loss": 0.717, "step": 1091 }, { "epoch": 0.05205825566705599, "grad_norm": 1.823603630065918, "learning_rate": 1.9097358048645732e-05, "loss": 0.4221, "step": 1092 }, { "epoch": 0.05210592806235549, "grad_norm": 2.516322612762451, "learning_rate": 1.9095726501164616e-05, "loss": 0.4237, "step": 1093 }, { "epoch": 0.05215360045765499, "grad_norm": 1.9163011312484741, "learning_rate": 1.909409355030657e-05, "loss": 1.0891, "step": 1094 }, { "epoch": 0.0522012728529545, "grad_norm": 1.3037455081939697, "learning_rate": 1.909245919632354e-05, "loss": 0.5999, "step": 1095 }, { "epoch": 0.052248945248254, "grad_norm": 1.5745213031768799, "learning_rate": 1.9090823439467686e-05, "loss": 0.5965, "step": 1096 }, { "epoch": 0.0522966176435535, "grad_norm": 1.977568507194519, "learning_rate": 1.9089186279991398e-05, "loss": 0.7052, "step": 1097 }, { "epoch": 0.052344290038853, "grad_norm": 2.6339094638824463, "learning_rate": 1.908754771814726e-05, "loss": 1.0466, "step": 1098 }, { "epoch": 0.052391962434152506, "grad_norm": 1.6164226531982422, "learning_rate": 1.90859077541881e-05, "loss": 0.6806, "step": 1099 }, { "epoch": 0.052439634829452006, "grad_norm": 1.2467166185379028, "learning_rate": 1.9084266388366937e-05, "loss": 0.5265, "step": 1100 }, { "epoch": 0.052487307224751506, "grad_norm": 1.7109839916229248, "learning_rate": 1.9082623620937023e-05, "loss": 0.8662, "step": 1101 }, { "epoch": 0.052534979620051006, "grad_norm": 1.3563034534454346, "learning_rate": 1.9080979452151813e-05, "loss": 0.9214, "step": 1102 }, { "epoch": 0.05258265201535051, "grad_norm": 2.4167439937591553, "learning_rate": 1.9079333882264994e-05, "loss": 1.1707, "step": 1103 }, { "epoch": 0.05263032441065001, "grad_norm": 1.283447265625, "learning_rate": 1.907768691153045e-05, "loss": 0.7402, "step": 1104 }, { "epoch": 0.05267799680594951, "grad_norm": 2.731553316116333, "learning_rate": 1.90760385402023e-05, "loss": 0.7584, "step": 1105 }, { "epoch": 0.05272566920124901, "grad_norm": 3.888896942138672, "learning_rate": 1.9074388768534872e-05, "loss": 1.5162, "step": 1106 }, { "epoch": 0.05277334159654852, "grad_norm": 1.4903335571289062, "learning_rate": 1.9072737596782703e-05, "loss": 0.5361, "step": 1107 }, { "epoch": 0.05282101399184802, "grad_norm": 1.0833760499954224, "learning_rate": 1.9071085025200555e-05, "loss": 0.5811, "step": 1108 }, { "epoch": 0.05286868638714752, "grad_norm": 1.0817785263061523, "learning_rate": 1.9069431054043398e-05, "loss": 0.5059, "step": 1109 }, { "epoch": 0.05291635878244703, "grad_norm": 2.820789337158203, "learning_rate": 1.9067775683566433e-05, "loss": 0.9822, "step": 1110 }, { "epoch": 0.05296403117774653, "grad_norm": 4.069901943206787, "learning_rate": 1.9066118914025054e-05, "loss": 0.8095, "step": 1111 }, { "epoch": 0.05301170357304603, "grad_norm": 2.1332006454467773, "learning_rate": 1.906446074567489e-05, "loss": 0.7816, "step": 1112 }, { "epoch": 0.05305937596834553, "grad_norm": 1.5364291667938232, "learning_rate": 1.906280117877178e-05, "loss": 0.7996, "step": 1113 }, { "epoch": 0.053107048363645035, "grad_norm": 4.230134010314941, "learning_rate": 1.9061140213571777e-05, "loss": 0.9161, "step": 1114 }, { "epoch": 0.053154720758944535, "grad_norm": 2.7443134784698486, "learning_rate": 1.905947785033115e-05, "loss": 0.7761, "step": 1115 }, { "epoch": 0.053202393154244035, "grad_norm": 1.4390854835510254, "learning_rate": 1.9057814089306388e-05, "loss": 0.7214, "step": 1116 }, { "epoch": 0.053250065549543535, "grad_norm": 3.119563341140747, "learning_rate": 1.905614893075419e-05, "loss": 0.236, "step": 1117 }, { "epoch": 0.05329773794484304, "grad_norm": 3.512956380844116, "learning_rate": 1.905448237493147e-05, "loss": 1.1971, "step": 1118 }, { "epoch": 0.05334541034014254, "grad_norm": 1.7571995258331299, "learning_rate": 1.905281442209536e-05, "loss": 0.9151, "step": 1119 }, { "epoch": 0.05339308273544204, "grad_norm": 1.2232495546340942, "learning_rate": 1.9051145072503216e-05, "loss": 0.7435, "step": 1120 }, { "epoch": 0.05344075513074154, "grad_norm": 3.606189489364624, "learning_rate": 1.9049474326412593e-05, "loss": 0.3272, "step": 1121 }, { "epoch": 0.05348842752604105, "grad_norm": 0.963822603225708, "learning_rate": 1.904780218408127e-05, "loss": 0.5257, "step": 1122 }, { "epoch": 0.05353609992134055, "grad_norm": 2.12300181388855, "learning_rate": 1.9046128645767247e-05, "loss": 0.5798, "step": 1123 }, { "epoch": 0.05358377231664005, "grad_norm": 2.374272108078003, "learning_rate": 1.9044453711728733e-05, "loss": 0.3698, "step": 1124 }, { "epoch": 0.05363144471193955, "grad_norm": 1.8397048711776733, "learning_rate": 1.904277738222415e-05, "loss": 0.7794, "step": 1125 }, { "epoch": 0.053679117107239056, "grad_norm": 3.58266282081604, "learning_rate": 1.9041099657512138e-05, "loss": 0.7956, "step": 1126 }, { "epoch": 0.053726789502538556, "grad_norm": 1.8295494318008423, "learning_rate": 1.903942053785156e-05, "loss": 0.7855, "step": 1127 }, { "epoch": 0.053774461897838056, "grad_norm": 1.737149953842163, "learning_rate": 1.9037740023501473e-05, "loss": 0.622, "step": 1128 }, { "epoch": 0.05382213429313756, "grad_norm": 1.806960105895996, "learning_rate": 1.9036058114721174e-05, "loss": 0.5765, "step": 1129 }, { "epoch": 0.053869806688437064, "grad_norm": 1.7252026796340942, "learning_rate": 1.9034374811770163e-05, "loss": 1.071, "step": 1130 }, { "epoch": 0.053917479083736564, "grad_norm": 1.6096805334091187, "learning_rate": 1.9032690114908155e-05, "loss": 0.8907, "step": 1131 }, { "epoch": 0.053965151479036064, "grad_norm": 2.1335132122039795, "learning_rate": 1.903100402439508e-05, "loss": 0.8775, "step": 1132 }, { "epoch": 0.054012823874335564, "grad_norm": 1.6958810091018677, "learning_rate": 1.902931654049108e-05, "loss": 0.731, "step": 1133 }, { "epoch": 0.05406049626963507, "grad_norm": 2.1730849742889404, "learning_rate": 1.9027627663456528e-05, "loss": 0.8711, "step": 1134 }, { "epoch": 0.05410816866493457, "grad_norm": 2.0798728466033936, "learning_rate": 1.9025937393551993e-05, "loss": 0.8086, "step": 1135 }, { "epoch": 0.05415584106023407, "grad_norm": 2.0575478076934814, "learning_rate": 1.902424573103827e-05, "loss": 0.9894, "step": 1136 }, { "epoch": 0.05420351345553357, "grad_norm": 1.999036192893982, "learning_rate": 1.9022552676176358e-05, "loss": 1.1162, "step": 1137 }, { "epoch": 0.05425118585083308, "grad_norm": 1.2748388051986694, "learning_rate": 1.9020858229227483e-05, "loss": 0.4934, "step": 1138 }, { "epoch": 0.05429885824613258, "grad_norm": 2.557769298553467, "learning_rate": 1.901916239045308e-05, "loss": 1.2284, "step": 1139 }, { "epoch": 0.05434653064143208, "grad_norm": 1.9223003387451172, "learning_rate": 1.9017465160114804e-05, "loss": 0.7192, "step": 1140 }, { "epoch": 0.05439420303673158, "grad_norm": 1.5044459104537964, "learning_rate": 1.901576653847451e-05, "loss": 0.4125, "step": 1141 }, { "epoch": 0.054441875432031085, "grad_norm": 3.2889225482940674, "learning_rate": 1.9014066525794284e-05, "loss": 0.6339, "step": 1142 }, { "epoch": 0.054489547827330585, "grad_norm": 4.039918899536133, "learning_rate": 1.9012365122336425e-05, "loss": 1.2816, "step": 1143 }, { "epoch": 0.054537220222630085, "grad_norm": 2.790966272354126, "learning_rate": 1.9010662328363435e-05, "loss": 1.1504, "step": 1144 }, { "epoch": 0.054584892617929585, "grad_norm": 1.5610294342041016, "learning_rate": 1.900895814413804e-05, "loss": 0.689, "step": 1145 }, { "epoch": 0.05463256501322909, "grad_norm": 1.8879177570343018, "learning_rate": 1.9007252569923173e-05, "loss": 0.8787, "step": 1146 }, { "epoch": 0.05468023740852859, "grad_norm": 1.4097404479980469, "learning_rate": 1.9005545605981996e-05, "loss": 0.7708, "step": 1147 }, { "epoch": 0.05472790980382809, "grad_norm": 1.7411558628082275, "learning_rate": 1.900383725257787e-05, "loss": 0.6831, "step": 1148 }, { "epoch": 0.05477558219912759, "grad_norm": 4.240265846252441, "learning_rate": 1.9002127509974376e-05, "loss": 1.2961, "step": 1149 }, { "epoch": 0.0548232545944271, "grad_norm": 2.1117870807647705, "learning_rate": 1.9000416378435312e-05, "loss": 0.8604, "step": 1150 }, { "epoch": 0.0548709269897266, "grad_norm": 2.5202038288116455, "learning_rate": 1.899870385822469e-05, "loss": 0.9414, "step": 1151 }, { "epoch": 0.0549185993850261, "grad_norm": 1.6602882146835327, "learning_rate": 1.8996989949606724e-05, "loss": 0.5965, "step": 1152 }, { "epoch": 0.0549662717803256, "grad_norm": 2.404963493347168, "learning_rate": 1.8995274652845867e-05, "loss": 0.5455, "step": 1153 }, { "epoch": 0.05501394417562511, "grad_norm": 2.229182004928589, "learning_rate": 1.8993557968206763e-05, "loss": 0.6388, "step": 1154 }, { "epoch": 0.05506161657092461, "grad_norm": 1.6714191436767578, "learning_rate": 1.8991839895954277e-05, "loss": 0.9903, "step": 1155 }, { "epoch": 0.05510928896622411, "grad_norm": 1.0693182945251465, "learning_rate": 1.8990120436353496e-05, "loss": 0.5146, "step": 1156 }, { "epoch": 0.05515696136152361, "grad_norm": 2.527358055114746, "learning_rate": 1.898839958966971e-05, "loss": 0.4123, "step": 1157 }, { "epoch": 0.055204633756823114, "grad_norm": 2.7824013233184814, "learning_rate": 1.8986677356168433e-05, "loss": 0.4801, "step": 1158 }, { "epoch": 0.055252306152122614, "grad_norm": 1.8934613466262817, "learning_rate": 1.8984953736115382e-05, "loss": 0.8082, "step": 1159 }, { "epoch": 0.055299978547422114, "grad_norm": 2.3900678157806396, "learning_rate": 1.89832287297765e-05, "loss": 0.496, "step": 1160 }, { "epoch": 0.055347650942721614, "grad_norm": 12.552268981933594, "learning_rate": 1.8981502337417933e-05, "loss": 1.0013, "step": 1161 }, { "epoch": 0.05539532333802112, "grad_norm": 1.6940616369247437, "learning_rate": 1.8979774559306046e-05, "loss": 0.9139, "step": 1162 }, { "epoch": 0.05544299573332062, "grad_norm": 1.5934605598449707, "learning_rate": 1.897804539570742e-05, "loss": 0.8557, "step": 1163 }, { "epoch": 0.05549066812862012, "grad_norm": 1.645617961883545, "learning_rate": 1.8976314846888845e-05, "loss": 0.5171, "step": 1164 }, { "epoch": 0.05553834052391962, "grad_norm": 1.3355939388275146, "learning_rate": 1.8974582913117323e-05, "loss": 0.8613, "step": 1165 }, { "epoch": 0.05558601291921913, "grad_norm": 2.7546401023864746, "learning_rate": 1.897284959466008e-05, "loss": 0.7212, "step": 1166 }, { "epoch": 0.05563368531451863, "grad_norm": 2.5125839710235596, "learning_rate": 1.897111489178455e-05, "loss": 0.6046, "step": 1167 }, { "epoch": 0.05568135770981813, "grad_norm": 3.391326665878296, "learning_rate": 1.8969378804758375e-05, "loss": 1.043, "step": 1168 }, { "epoch": 0.05572903010511763, "grad_norm": 3.3391637802124023, "learning_rate": 1.8967641333849417e-05, "loss": 0.7025, "step": 1169 }, { "epoch": 0.055776702500417136, "grad_norm": 1.606366753578186, "learning_rate": 1.896590247932575e-05, "loss": 0.6304, "step": 1170 }, { "epoch": 0.055824374895716636, "grad_norm": 3.1777889728546143, "learning_rate": 1.8964162241455662e-05, "loss": 0.4038, "step": 1171 }, { "epoch": 0.055872047291016136, "grad_norm": 1.2329596281051636, "learning_rate": 1.896242062050765e-05, "loss": 0.5807, "step": 1172 }, { "epoch": 0.055919719686315636, "grad_norm": 2.3602383136749268, "learning_rate": 1.8960677616750435e-05, "loss": 1.1764, "step": 1173 }, { "epoch": 0.05596739208161514, "grad_norm": 1.634207010269165, "learning_rate": 1.8958933230452938e-05, "loss": 0.8832, "step": 1174 }, { "epoch": 0.05601506447691464, "grad_norm": 1.593907117843628, "learning_rate": 1.8957187461884308e-05, "loss": 0.716, "step": 1175 }, { "epoch": 0.05606273687221414, "grad_norm": 1.369978666305542, "learning_rate": 1.895544031131389e-05, "loss": 0.708, "step": 1176 }, { "epoch": 0.05611040926751364, "grad_norm": 2.046199083328247, "learning_rate": 1.8953691779011255e-05, "loss": 0.6917, "step": 1177 }, { "epoch": 0.05615808166281315, "grad_norm": 2.0883688926696777, "learning_rate": 1.895194186524618e-05, "loss": 0.9998, "step": 1178 }, { "epoch": 0.05620575405811265, "grad_norm": 1.8095883131027222, "learning_rate": 1.895019057028867e-05, "loss": 0.8894, "step": 1179 }, { "epoch": 0.05625342645341215, "grad_norm": 1.895460605621338, "learning_rate": 1.894843789440892e-05, "loss": 0.7409, "step": 1180 }, { "epoch": 0.05630109884871165, "grad_norm": 1.6952366828918457, "learning_rate": 1.8946683837877354e-05, "loss": 0.6736, "step": 1181 }, { "epoch": 0.05634877124401116, "grad_norm": 1.567142128944397, "learning_rate": 1.8944928400964606e-05, "loss": 1.0353, "step": 1182 }, { "epoch": 0.05639644363931066, "grad_norm": 1.685753583908081, "learning_rate": 1.894317158394152e-05, "loss": 0.8358, "step": 1183 }, { "epoch": 0.05644411603461016, "grad_norm": 1.324472427368164, "learning_rate": 1.8941413387079156e-05, "loss": 0.5607, "step": 1184 }, { "epoch": 0.05649178842990966, "grad_norm": 5.449647426605225, "learning_rate": 1.8939653810648785e-05, "loss": 0.8004, "step": 1185 }, { "epoch": 0.056539460825209165, "grad_norm": 1.8052685260772705, "learning_rate": 1.8937892854921892e-05, "loss": 0.9804, "step": 1186 }, { "epoch": 0.056587133220508665, "grad_norm": 1.7985178232192993, "learning_rate": 1.8936130520170172e-05, "loss": 0.9373, "step": 1187 }, { "epoch": 0.056634805615808165, "grad_norm": 2.4171130657196045, "learning_rate": 1.893436680666554e-05, "loss": 0.6786, "step": 1188 }, { "epoch": 0.056682478011107665, "grad_norm": 1.7155203819274902, "learning_rate": 1.893260171468011e-05, "loss": 0.6025, "step": 1189 }, { "epoch": 0.05673015040640717, "grad_norm": 1.4676045179367065, "learning_rate": 1.8930835244486232e-05, "loss": 0.4387, "step": 1190 }, { "epoch": 0.05677782280170667, "grad_norm": 2.474015951156616, "learning_rate": 1.892906739635644e-05, "loss": 1.1813, "step": 1191 }, { "epoch": 0.05682549519700617, "grad_norm": 1.848732829093933, "learning_rate": 1.8927298170563503e-05, "loss": 1.0332, "step": 1192 }, { "epoch": 0.05687316759230567, "grad_norm": 3.2035136222839355, "learning_rate": 1.892552756738039e-05, "loss": 0.9814, "step": 1193 }, { "epoch": 0.05692083998760518, "grad_norm": 3.143311023712158, "learning_rate": 1.8923755587080288e-05, "loss": 0.8789, "step": 1194 }, { "epoch": 0.05696851238290468, "grad_norm": 1.4218758344650269, "learning_rate": 1.8921982229936597e-05, "loss": 0.7748, "step": 1195 }, { "epoch": 0.05701618477820418, "grad_norm": 1.3497434854507446, "learning_rate": 1.8920207496222924e-05, "loss": 0.7783, "step": 1196 }, { "epoch": 0.05706385717350368, "grad_norm": 2.282148838043213, "learning_rate": 1.89184313862131e-05, "loss": 0.8524, "step": 1197 }, { "epoch": 0.057111529568803186, "grad_norm": 1.9011353254318237, "learning_rate": 1.891665390018115e-05, "loss": 0.902, "step": 1198 }, { "epoch": 0.057159201964102686, "grad_norm": 2.767425060272217, "learning_rate": 1.891487503840133e-05, "loss": 1.0341, "step": 1199 }, { "epoch": 0.057206874359402186, "grad_norm": 1.7280144691467285, "learning_rate": 1.8913094801148096e-05, "loss": 0.6495, "step": 1200 }, { "epoch": 0.05725454675470169, "grad_norm": 1.4182579517364502, "learning_rate": 1.891131318869612e-05, "loss": 1.0231, "step": 1201 }, { "epoch": 0.057302219150001193, "grad_norm": 1.7430015802383423, "learning_rate": 1.8909530201320288e-05, "loss": 0.8205, "step": 1202 }, { "epoch": 0.057349891545300694, "grad_norm": 1.7929915189743042, "learning_rate": 1.89077458392957e-05, "loss": 1.0896, "step": 1203 }, { "epoch": 0.057397563940600194, "grad_norm": 2.15480637550354, "learning_rate": 1.890596010289766e-05, "loss": 0.9131, "step": 1204 }, { "epoch": 0.0574452363358997, "grad_norm": 1.5927636623382568, "learning_rate": 1.8904172992401685e-05, "loss": 0.6557, "step": 1205 }, { "epoch": 0.0574929087311992, "grad_norm": 2.2707607746124268, "learning_rate": 1.8902384508083518e-05, "loss": 0.9403, "step": 1206 }, { "epoch": 0.0575405811264987, "grad_norm": 2.514889717102051, "learning_rate": 1.8900594650219096e-05, "loss": 0.5346, "step": 1207 }, { "epoch": 0.0575882535217982, "grad_norm": 2.6210289001464844, "learning_rate": 1.8898803419084578e-05, "loss": 0.9477, "step": 1208 }, { "epoch": 0.05763592591709771, "grad_norm": 1.4959616661071777, "learning_rate": 1.889701081495633e-05, "loss": 0.5462, "step": 1209 }, { "epoch": 0.05768359831239721, "grad_norm": 4.131924152374268, "learning_rate": 1.8895216838110938e-05, "loss": 0.7199, "step": 1210 }, { "epoch": 0.05773127070769671, "grad_norm": 1.0754896402359009, "learning_rate": 1.889342148882519e-05, "loss": 0.5139, "step": 1211 }, { "epoch": 0.05777894310299621, "grad_norm": 3.7189688682556152, "learning_rate": 1.889162476737609e-05, "loss": 0.6986, "step": 1212 }, { "epoch": 0.057826615498295715, "grad_norm": 4.594300270080566, "learning_rate": 1.8889826674040855e-05, "loss": 0.514, "step": 1213 }, { "epoch": 0.057874287893595215, "grad_norm": 1.001029372215271, "learning_rate": 1.8888027209096913e-05, "loss": 0.2689, "step": 1214 }, { "epoch": 0.057921960288894715, "grad_norm": 2.0562429428100586, "learning_rate": 1.88862263728219e-05, "loss": 0.8285, "step": 1215 }, { "epoch": 0.057969632684194215, "grad_norm": 1.5957159996032715, "learning_rate": 1.888442416549367e-05, "loss": 0.6524, "step": 1216 }, { "epoch": 0.05801730507949372, "grad_norm": 1.7283976078033447, "learning_rate": 1.888262058739028e-05, "loss": 0.6009, "step": 1217 }, { "epoch": 0.05806497747479322, "grad_norm": 1.5523749589920044, "learning_rate": 1.888081563879001e-05, "loss": 0.7249, "step": 1218 }, { "epoch": 0.05811264987009272, "grad_norm": 1.2984051704406738, "learning_rate": 1.887900931997134e-05, "loss": 0.7523, "step": 1219 }, { "epoch": 0.05816032226539222, "grad_norm": 3.114650011062622, "learning_rate": 1.8877201631212966e-05, "loss": 0.354, "step": 1220 }, { "epoch": 0.05820799466069173, "grad_norm": 1.8985135555267334, "learning_rate": 1.88753925727938e-05, "loss": 0.9879, "step": 1221 }, { "epoch": 0.05825566705599123, "grad_norm": 1.6196322441101074, "learning_rate": 1.887358214499296e-05, "loss": 0.4475, "step": 1222 }, { "epoch": 0.05830333945129073, "grad_norm": 1.1466352939605713, "learning_rate": 1.8871770348089774e-05, "loss": 0.6485, "step": 1223 }, { "epoch": 0.05835101184659023, "grad_norm": 2.748375654220581, "learning_rate": 1.8869957182363784e-05, "loss": 0.9876, "step": 1224 }, { "epoch": 0.05839868424188974, "grad_norm": 2.3730061054229736, "learning_rate": 1.8868142648094745e-05, "loss": 1.2395, "step": 1225 }, { "epoch": 0.05844635663718924, "grad_norm": 2.2381269931793213, "learning_rate": 1.886632674556262e-05, "loss": 0.7707, "step": 1226 }, { "epoch": 0.05849402903248874, "grad_norm": 1.3274270296096802, "learning_rate": 1.8864509475047583e-05, "loss": 0.5131, "step": 1227 }, { "epoch": 0.05854170142778824, "grad_norm": 2.31314754486084, "learning_rate": 1.886269083683002e-05, "loss": 0.9866, "step": 1228 }, { "epoch": 0.058589373823087744, "grad_norm": 2.0299339294433594, "learning_rate": 1.886087083119053e-05, "loss": 0.8491, "step": 1229 }, { "epoch": 0.058637046218387244, "grad_norm": 3.188847541809082, "learning_rate": 1.885904945840992e-05, "loss": 0.6063, "step": 1230 }, { "epoch": 0.058684718613686744, "grad_norm": 1.6119741201400757, "learning_rate": 1.885722671876921e-05, "loss": 0.5036, "step": 1231 }, { "epoch": 0.058732391008986244, "grad_norm": 2.852628231048584, "learning_rate": 1.8855402612549624e-05, "loss": 0.5136, "step": 1232 }, { "epoch": 0.05878006340428575, "grad_norm": 1.553695559501648, "learning_rate": 1.8853577140032614e-05, "loss": 0.6988, "step": 1233 }, { "epoch": 0.05882773579958525, "grad_norm": 2.5913333892822266, "learning_rate": 1.885175030149982e-05, "loss": 1.0885, "step": 1234 }, { "epoch": 0.05887540819488475, "grad_norm": 1.3405808210372925, "learning_rate": 1.8849922097233115e-05, "loss": 0.8781, "step": 1235 }, { "epoch": 0.05892308059018425, "grad_norm": 1.288750171661377, "learning_rate": 1.8848092527514564e-05, "loss": 0.7509, "step": 1236 }, { "epoch": 0.05897075298548376, "grad_norm": 2.9218556880950928, "learning_rate": 1.8846261592626455e-05, "loss": 0.7463, "step": 1237 }, { "epoch": 0.05901842538078326, "grad_norm": 1.2847224473953247, "learning_rate": 1.8844429292851282e-05, "loss": 0.4134, "step": 1238 }, { "epoch": 0.05906609777608276, "grad_norm": 1.4831727743148804, "learning_rate": 1.8842595628471746e-05, "loss": 0.691, "step": 1239 }, { "epoch": 0.05911377017138226, "grad_norm": 2.680004835128784, "learning_rate": 1.884076059977077e-05, "loss": 0.7823, "step": 1240 }, { "epoch": 0.059161442566681766, "grad_norm": 1.819541573524475, "learning_rate": 1.8838924207031474e-05, "loss": 0.8122, "step": 1241 }, { "epoch": 0.059209114961981266, "grad_norm": 6.2091593742370605, "learning_rate": 1.8837086450537195e-05, "loss": 0.964, "step": 1242 }, { "epoch": 0.059256787357280766, "grad_norm": 1.8609195947647095, "learning_rate": 1.883524733057148e-05, "loss": 0.9592, "step": 1243 }, { "epoch": 0.059304459752580266, "grad_norm": 3.6004459857940674, "learning_rate": 1.8833406847418088e-05, "loss": 1.033, "step": 1244 }, { "epoch": 0.05935213214787977, "grad_norm": 2.253506660461426, "learning_rate": 1.8831565001360987e-05, "loss": 0.3874, "step": 1245 }, { "epoch": 0.05939980454317927, "grad_norm": 2.738685131072998, "learning_rate": 1.8829721792684353e-05, "loss": 0.6196, "step": 1246 }, { "epoch": 0.05944747693847877, "grad_norm": 1.635756254196167, "learning_rate": 1.8827877221672578e-05, "loss": 0.7863, "step": 1247 }, { "epoch": 0.05949514933377827, "grad_norm": 0.8421286344528198, "learning_rate": 1.8826031288610255e-05, "loss": 0.3319, "step": 1248 }, { "epoch": 0.05954282172907778, "grad_norm": 1.600510597229004, "learning_rate": 1.8824183993782193e-05, "loss": 0.9212, "step": 1249 }, { "epoch": 0.05959049412437728, "grad_norm": 1.8550633192062378, "learning_rate": 1.8822335337473413e-05, "loss": 0.6204, "step": 1250 }, { "epoch": 0.05963816651967678, "grad_norm": 1.3988538980484009, "learning_rate": 1.8820485319969145e-05, "loss": 0.6403, "step": 1251 }, { "epoch": 0.05968583891497628, "grad_norm": 2.387962818145752, "learning_rate": 1.881863394155482e-05, "loss": 0.7924, "step": 1252 }, { "epoch": 0.05973351131027579, "grad_norm": 1.4238137006759644, "learning_rate": 1.88167812025161e-05, "loss": 0.3732, "step": 1253 }, { "epoch": 0.05978118370557529, "grad_norm": 1.3570970296859741, "learning_rate": 1.881492710313883e-05, "loss": 0.8902, "step": 1254 }, { "epoch": 0.05982885610087479, "grad_norm": 1.7247503995895386, "learning_rate": 1.8813071643709087e-05, "loss": 0.6211, "step": 1255 }, { "epoch": 0.05987652849617429, "grad_norm": 2.3720498085021973, "learning_rate": 1.8811214824513145e-05, "loss": 1.2221, "step": 1256 }, { "epoch": 0.059924200891473794, "grad_norm": 1.7537853717803955, "learning_rate": 1.8809356645837495e-05, "loss": 0.6426, "step": 1257 }, { "epoch": 0.059971873286773295, "grad_norm": 1.4743638038635254, "learning_rate": 1.8807497107968834e-05, "loss": 0.5793, "step": 1258 }, { "epoch": 0.060019545682072795, "grad_norm": 1.7508403062820435, "learning_rate": 1.8805636211194066e-05, "loss": 0.7718, "step": 1259 }, { "epoch": 0.060067218077372295, "grad_norm": 11.860549926757812, "learning_rate": 1.8803773955800313e-05, "loss": 0.6367, "step": 1260 }, { "epoch": 0.0601148904726718, "grad_norm": 1.9697636365890503, "learning_rate": 1.88019103420749e-05, "loss": 0.9679, "step": 1261 }, { "epoch": 0.0601625628679713, "grad_norm": 5.275759220123291, "learning_rate": 1.8800045370305365e-05, "loss": 0.9563, "step": 1262 }, { "epoch": 0.0602102352632708, "grad_norm": 4.440974235534668, "learning_rate": 1.879817904077945e-05, "loss": 0.8436, "step": 1263 }, { "epoch": 0.0602579076585703, "grad_norm": 2.7934744358062744, "learning_rate": 1.879631135378511e-05, "loss": 1.5814, "step": 1264 }, { "epoch": 0.06030558005386981, "grad_norm": 1.363739013671875, "learning_rate": 1.8794442309610518e-05, "loss": 0.8923, "step": 1265 }, { "epoch": 0.06035325244916931, "grad_norm": 2.2420594692230225, "learning_rate": 1.879257190854404e-05, "loss": 0.8111, "step": 1266 }, { "epoch": 0.06040092484446881, "grad_norm": 1.7087407112121582, "learning_rate": 1.879070015087426e-05, "loss": 0.9341, "step": 1267 }, { "epoch": 0.06044859723976831, "grad_norm": 3.1180970668792725, "learning_rate": 1.8788827036889978e-05, "loss": 0.8685, "step": 1268 }, { "epoch": 0.060496269635067816, "grad_norm": 0.9729019403457642, "learning_rate": 1.8786952566880192e-05, "loss": 0.5399, "step": 1269 }, { "epoch": 0.060543942030367316, "grad_norm": 2.52308988571167, "learning_rate": 1.878507674113411e-05, "loss": 1.035, "step": 1270 }, { "epoch": 0.060591614425666816, "grad_norm": 0.9263543486595154, "learning_rate": 1.878319955994116e-05, "loss": 0.5863, "step": 1271 }, { "epoch": 0.060639286820966316, "grad_norm": 1.5016515254974365, "learning_rate": 1.8781321023590962e-05, "loss": 0.9493, "step": 1272 }, { "epoch": 0.06068695921626582, "grad_norm": 2.0149483680725098, "learning_rate": 1.877944113237336e-05, "loss": 0.8444, "step": 1273 }, { "epoch": 0.06073463161156532, "grad_norm": 1.6895701885223389, "learning_rate": 1.8777559886578407e-05, "loss": 0.6487, "step": 1274 }, { "epoch": 0.060782304006864823, "grad_norm": 1.780248761177063, "learning_rate": 1.877567728649635e-05, "loss": 1.1951, "step": 1275 }, { "epoch": 0.060829976402164324, "grad_norm": 1.427687406539917, "learning_rate": 1.8773793332417664e-05, "loss": 0.7353, "step": 1276 }, { "epoch": 0.06087764879746383, "grad_norm": 4.466861724853516, "learning_rate": 1.8771908024633017e-05, "loss": 1.0419, "step": 1277 }, { "epoch": 0.06092532119276333, "grad_norm": 3.2163162231445312, "learning_rate": 1.8770021363433295e-05, "loss": 0.4114, "step": 1278 }, { "epoch": 0.06097299358806283, "grad_norm": 1.3906358480453491, "learning_rate": 1.876813334910959e-05, "loss": 0.991, "step": 1279 }, { "epoch": 0.06102066598336233, "grad_norm": 2.0710666179656982, "learning_rate": 1.8766243981953204e-05, "loss": 0.6592, "step": 1280 }, { "epoch": 0.06106833837866184, "grad_norm": 2.256863832473755, "learning_rate": 1.876435326225565e-05, "loss": 0.7951, "step": 1281 }, { "epoch": 0.06111601077396134, "grad_norm": 1.7044591903686523, "learning_rate": 1.8762461190308637e-05, "loss": 0.8216, "step": 1282 }, { "epoch": 0.06116368316926084, "grad_norm": 3.1906909942626953, "learning_rate": 1.8760567766404102e-05, "loss": 1.1828, "step": 1283 }, { "epoch": 0.06121135556456034, "grad_norm": 2.333390951156616, "learning_rate": 1.8758672990834172e-05, "loss": 0.7787, "step": 1284 }, { "epoch": 0.061259027959859845, "grad_norm": 1.56510329246521, "learning_rate": 1.87567768638912e-05, "loss": 1.0974, "step": 1285 }, { "epoch": 0.061306700355159345, "grad_norm": 2.0056633949279785, "learning_rate": 1.8754879385867738e-05, "loss": 0.6958, "step": 1286 }, { "epoch": 0.061354372750458845, "grad_norm": 2.849614143371582, "learning_rate": 1.875298055705654e-05, "loss": 1.1496, "step": 1287 }, { "epoch": 0.061402045145758345, "grad_norm": 2.2136480808258057, "learning_rate": 1.8751080377750585e-05, "loss": 0.7184, "step": 1288 }, { "epoch": 0.06144971754105785, "grad_norm": 1.6875826120376587, "learning_rate": 1.8749178848243042e-05, "loss": 0.5162, "step": 1289 }, { "epoch": 0.06149738993635735, "grad_norm": 8.317864418029785, "learning_rate": 1.8747275968827304e-05, "loss": 0.5979, "step": 1290 }, { "epoch": 0.06154506233165685, "grad_norm": 2.540807008743286, "learning_rate": 1.8745371739796962e-05, "loss": 0.8541, "step": 1291 }, { "epoch": 0.06159273472695636, "grad_norm": 3.1549360752105713, "learning_rate": 1.8743466161445823e-05, "loss": 0.5086, "step": 1292 }, { "epoch": 0.06164040712225586, "grad_norm": 1.5895482301712036, "learning_rate": 1.8741559234067893e-05, "loss": 0.7999, "step": 1293 }, { "epoch": 0.06168807951755536, "grad_norm": 1.2056721448898315, "learning_rate": 1.8739650957957396e-05, "loss": 0.5553, "step": 1294 }, { "epoch": 0.06173575191285486, "grad_norm": 1.6600650548934937, "learning_rate": 1.8737741333408757e-05, "loss": 0.885, "step": 1295 }, { "epoch": 0.06178342430815437, "grad_norm": 2.5915048122406006, "learning_rate": 1.873583036071661e-05, "loss": 0.4749, "step": 1296 }, { "epoch": 0.06183109670345387, "grad_norm": 1.690521478652954, "learning_rate": 1.87339180401758e-05, "loss": 0.6927, "step": 1297 }, { "epoch": 0.06187876909875337, "grad_norm": 1.1942768096923828, "learning_rate": 1.873200437208138e-05, "loss": 0.6184, "step": 1298 }, { "epoch": 0.06192644149405287, "grad_norm": 4.745242118835449, "learning_rate": 1.8730089356728605e-05, "loss": 0.4487, "step": 1299 }, { "epoch": 0.061974113889352374, "grad_norm": 2.1504414081573486, "learning_rate": 1.8728172994412948e-05, "loss": 0.6687, "step": 1300 }, { "epoch": 0.062021786284651874, "grad_norm": 2.5773351192474365, "learning_rate": 1.872625528543008e-05, "loss": 0.8053, "step": 1301 }, { "epoch": 0.062069458679951374, "grad_norm": 1.972769021987915, "learning_rate": 1.8724336230075885e-05, "loss": 0.8092, "step": 1302 }, { "epoch": 0.062117131075250874, "grad_norm": 2.7558534145355225, "learning_rate": 1.872241582864645e-05, "loss": 0.6769, "step": 1303 }, { "epoch": 0.06216480347055038, "grad_norm": 4.685482025146484, "learning_rate": 1.872049408143808e-05, "loss": 1.0463, "step": 1304 }, { "epoch": 0.06221247586584988, "grad_norm": 6.796249866485596, "learning_rate": 1.871857098874727e-05, "loss": 0.2981, "step": 1305 }, { "epoch": 0.06226014826114938, "grad_norm": 1.691559910774231, "learning_rate": 1.8716646550870746e-05, "loss": 0.738, "step": 1306 }, { "epoch": 0.06230782065644888, "grad_norm": 2.390437364578247, "learning_rate": 1.8714720768105425e-05, "loss": 1.4487, "step": 1307 }, { "epoch": 0.06235549305174839, "grad_norm": 1.676360845565796, "learning_rate": 1.8712793640748433e-05, "loss": 0.6378, "step": 1308 }, { "epoch": 0.06240316544704789, "grad_norm": 1.6796358823776245, "learning_rate": 1.8710865169097102e-05, "loss": 0.667, "step": 1309 }, { "epoch": 0.06245083784234739, "grad_norm": 2.0294792652130127, "learning_rate": 1.8708935353448982e-05, "loss": 0.7727, "step": 1310 }, { "epoch": 0.06249851023764689, "grad_norm": 2.0255770683288574, "learning_rate": 1.8707004194101825e-05, "loss": 1.3701, "step": 1311 }, { "epoch": 0.0625461826329464, "grad_norm": 1.7787353992462158, "learning_rate": 1.8705071691353583e-05, "loss": 0.745, "step": 1312 }, { "epoch": 0.0625938550282459, "grad_norm": 2.5972752571105957, "learning_rate": 1.870313784550242e-05, "loss": 0.4273, "step": 1313 }, { "epoch": 0.0626415274235454, "grad_norm": 1.857809066772461, "learning_rate": 1.8701202656846717e-05, "loss": 0.641, "step": 1314 }, { "epoch": 0.0626891998188449, "grad_norm": 3.2368533611297607, "learning_rate": 1.8699266125685052e-05, "loss": 0.2691, "step": 1315 }, { "epoch": 0.0627368722141444, "grad_norm": 2.417619466781616, "learning_rate": 1.8697328252316205e-05, "loss": 0.6198, "step": 1316 }, { "epoch": 0.0627845446094439, "grad_norm": 4.069061279296875, "learning_rate": 1.8695389037039172e-05, "loss": 0.4663, "step": 1317 }, { "epoch": 0.06283221700474341, "grad_norm": 3.2209033966064453, "learning_rate": 1.869344848015316e-05, "loss": 0.5283, "step": 1318 }, { "epoch": 0.06287988940004291, "grad_norm": 6.288067817687988, "learning_rate": 1.869150658195757e-05, "loss": 0.2191, "step": 1319 }, { "epoch": 0.06292756179534241, "grad_norm": 1.6023699045181274, "learning_rate": 1.868956334275202e-05, "loss": 1.0093, "step": 1320 }, { "epoch": 0.06297523419064191, "grad_norm": 1.6245163679122925, "learning_rate": 1.8687618762836334e-05, "loss": 0.7076, "step": 1321 }, { "epoch": 0.06302290658594141, "grad_norm": 1.7596030235290527, "learning_rate": 1.8685672842510536e-05, "loss": 0.475, "step": 1322 }, { "epoch": 0.06307057898124091, "grad_norm": 1.1735817193984985, "learning_rate": 1.8683725582074862e-05, "loss": 0.6964, "step": 1323 }, { "epoch": 0.06311825137654041, "grad_norm": 1.7039296627044678, "learning_rate": 1.868177698182976e-05, "loss": 0.5079, "step": 1324 }, { "epoch": 0.06316592377183991, "grad_norm": 4.908930778503418, "learning_rate": 1.867982704207587e-05, "loss": 0.2962, "step": 1325 }, { "epoch": 0.06321359616713942, "grad_norm": 1.541994333267212, "learning_rate": 1.8677875763114054e-05, "loss": 0.6977, "step": 1326 }, { "epoch": 0.06326126856243892, "grad_norm": 2.8934872150421143, "learning_rate": 1.8675923145245373e-05, "loss": 0.8652, "step": 1327 }, { "epoch": 0.06330894095773842, "grad_norm": 0.9850158095359802, "learning_rate": 1.8673969188771094e-05, "loss": 0.5087, "step": 1328 }, { "epoch": 0.06335661335303792, "grad_norm": 1.6392902135849, "learning_rate": 1.8672013893992697e-05, "loss": 0.5989, "step": 1329 }, { "epoch": 0.06340428574833742, "grad_norm": 3.292081117630005, "learning_rate": 1.8670057261211857e-05, "loss": 0.8183, "step": 1330 }, { "epoch": 0.06345195814363692, "grad_norm": 1.6214336156845093, "learning_rate": 1.8668099290730468e-05, "loss": 0.9003, "step": 1331 }, { "epoch": 0.06349963053893642, "grad_norm": 1.6833115816116333, "learning_rate": 1.8666139982850626e-05, "loss": 0.7086, "step": 1332 }, { "epoch": 0.06354730293423592, "grad_norm": 1.3699870109558105, "learning_rate": 1.8664179337874618e-05, "loss": 0.7194, "step": 1333 }, { "epoch": 0.06359497532953544, "grad_norm": 2.6119396686553955, "learning_rate": 1.866221735610497e-05, "loss": 0.5412, "step": 1334 }, { "epoch": 0.06364264772483494, "grad_norm": 2.3003079891204834, "learning_rate": 1.866025403784439e-05, "loss": 0.9531, "step": 1335 }, { "epoch": 0.06369032012013444, "grad_norm": 1.4991408586502075, "learning_rate": 1.865828938339579e-05, "loss": 0.4751, "step": 1336 }, { "epoch": 0.06373799251543394, "grad_norm": 1.6355737447738647, "learning_rate": 1.86563233930623e-05, "loss": 0.8553, "step": 1337 }, { "epoch": 0.06378566491073344, "grad_norm": 1.9629335403442383, "learning_rate": 1.8654356067147258e-05, "loss": 1.0921, "step": 1338 }, { "epoch": 0.06383333730603294, "grad_norm": 1.5107303857803345, "learning_rate": 1.8652387405954196e-05, "loss": 0.3482, "step": 1339 }, { "epoch": 0.06388100970133244, "grad_norm": 2.5146117210388184, "learning_rate": 1.865041740978686e-05, "loss": 0.6937, "step": 1340 }, { "epoch": 0.06392868209663194, "grad_norm": 1.4010186195373535, "learning_rate": 1.86484460789492e-05, "loss": 0.9053, "step": 1341 }, { "epoch": 0.06397635449193145, "grad_norm": 2.845616102218628, "learning_rate": 1.864647341374537e-05, "loss": 1.1672, "step": 1342 }, { "epoch": 0.06402402688723095, "grad_norm": 4.536278247833252, "learning_rate": 1.8644499414479735e-05, "loss": 0.5932, "step": 1343 }, { "epoch": 0.06407169928253045, "grad_norm": 2.0457026958465576, "learning_rate": 1.864252408145686e-05, "loss": 0.9775, "step": 1344 }, { "epoch": 0.06411937167782995, "grad_norm": 1.767301082611084, "learning_rate": 1.8640547414981523e-05, "loss": 0.7896, "step": 1345 }, { "epoch": 0.06416704407312945, "grad_norm": 5.789084434509277, "learning_rate": 1.8638569415358696e-05, "loss": 0.7714, "step": 1346 }, { "epoch": 0.06421471646842895, "grad_norm": 1.4148178100585938, "learning_rate": 1.863659008289357e-05, "loss": 0.6687, "step": 1347 }, { "epoch": 0.06426238886372845, "grad_norm": 1.5837879180908203, "learning_rate": 1.8634609417891535e-05, "loss": 0.7297, "step": 1348 }, { "epoch": 0.06431006125902795, "grad_norm": 2.070500135421753, "learning_rate": 1.8632627420658184e-05, "loss": 0.5979, "step": 1349 }, { "epoch": 0.06435773365432747, "grad_norm": 1.9625242948532104, "learning_rate": 1.8630644091499322e-05, "loss": 0.5878, "step": 1350 }, { "epoch": 0.06440540604962697, "grad_norm": 4.076782703399658, "learning_rate": 1.8628659430720958e-05, "loss": 1.4679, "step": 1351 }, { "epoch": 0.06445307844492647, "grad_norm": 1.3314266204833984, "learning_rate": 1.86266734386293e-05, "loss": 0.7836, "step": 1352 }, { "epoch": 0.06450075084022597, "grad_norm": 2.396193504333496, "learning_rate": 1.8624686115530767e-05, "loss": 1.1108, "step": 1353 }, { "epoch": 0.06454842323552547, "grad_norm": 1.7645889520645142, "learning_rate": 1.8622697461731983e-05, "loss": 1.1186, "step": 1354 }, { "epoch": 0.06459609563082497, "grad_norm": 1.8896069526672363, "learning_rate": 1.8620707477539776e-05, "loss": 0.9623, "step": 1355 }, { "epoch": 0.06464376802612447, "grad_norm": 1.6902443170547485, "learning_rate": 1.8618716163261185e-05, "loss": 0.5137, "step": 1356 }, { "epoch": 0.06469144042142397, "grad_norm": 1.171393632888794, "learning_rate": 1.8616723519203445e-05, "loss": 0.7234, "step": 1357 }, { "epoch": 0.06473911281672348, "grad_norm": 1.263073444366455, "learning_rate": 1.8614729545674e-05, "loss": 0.4881, "step": 1358 }, { "epoch": 0.06478678521202298, "grad_norm": 2.569489002227783, "learning_rate": 1.86127342429805e-05, "loss": 0.9636, "step": 1359 }, { "epoch": 0.06483445760732248, "grad_norm": 1.6446890830993652, "learning_rate": 1.86107376114308e-05, "loss": 0.8164, "step": 1360 }, { "epoch": 0.06488213000262198, "grad_norm": 1.5528117418289185, "learning_rate": 1.8608739651332965e-05, "loss": 0.8567, "step": 1361 }, { "epoch": 0.06492980239792148, "grad_norm": 2.500760793685913, "learning_rate": 1.8606740362995247e-05, "loss": 0.7141, "step": 1362 }, { "epoch": 0.06497747479322098, "grad_norm": 1.1972280740737915, "learning_rate": 1.8604739746726128e-05, "loss": 0.7305, "step": 1363 }, { "epoch": 0.06502514718852048, "grad_norm": 1.8277705907821655, "learning_rate": 1.8602737802834275e-05, "loss": 0.8047, "step": 1364 }, { "epoch": 0.06507281958381998, "grad_norm": 2.029378652572632, "learning_rate": 1.8600734531628573e-05, "loss": 0.8269, "step": 1365 }, { "epoch": 0.0651204919791195, "grad_norm": 1.831621766090393, "learning_rate": 1.8598729933418102e-05, "loss": 0.7176, "step": 1366 }, { "epoch": 0.065168164374419, "grad_norm": 1.9126170873641968, "learning_rate": 1.8596724008512153e-05, "loss": 0.7685, "step": 1367 }, { "epoch": 0.0652158367697185, "grad_norm": 1.3067179918289185, "learning_rate": 1.8594716757220218e-05, "loss": 0.4599, "step": 1368 }, { "epoch": 0.065263509165018, "grad_norm": 1.5223742723464966, "learning_rate": 1.8592708179851994e-05, "loss": 0.9997, "step": 1369 }, { "epoch": 0.0653111815603175, "grad_norm": 1.269461750984192, "learning_rate": 1.8590698276717386e-05, "loss": 0.8008, "step": 1370 }, { "epoch": 0.065358853955617, "grad_norm": 1.3915373086929321, "learning_rate": 1.8588687048126503e-05, "loss": 0.5982, "step": 1371 }, { "epoch": 0.0654065263509165, "grad_norm": 2.4347476959228516, "learning_rate": 1.8586674494389653e-05, "loss": 0.4196, "step": 1372 }, { "epoch": 0.065454198746216, "grad_norm": 1.9680578708648682, "learning_rate": 1.858466061581736e-05, "loss": 0.4175, "step": 1373 }, { "epoch": 0.06550187114151551, "grad_norm": 3.6322875022888184, "learning_rate": 1.858264541272033e-05, "loss": 0.655, "step": 1374 }, { "epoch": 0.06554954353681501, "grad_norm": 1.1171382665634155, "learning_rate": 1.8580628885409502e-05, "loss": 0.2657, "step": 1375 }, { "epoch": 0.06559721593211451, "grad_norm": 2.2409839630126953, "learning_rate": 1.8578611034196e-05, "loss": 0.9376, "step": 1376 }, { "epoch": 0.06564488832741401, "grad_norm": 1.258958339691162, "learning_rate": 1.8576591859391158e-05, "loss": 0.9122, "step": 1377 }, { "epoch": 0.06569256072271351, "grad_norm": 1.190206527709961, "learning_rate": 1.857457136130651e-05, "loss": 0.7015, "step": 1378 }, { "epoch": 0.06574023311801301, "grad_norm": 1.8071290254592896, "learning_rate": 1.857254954025381e-05, "loss": 0.6204, "step": 1379 }, { "epoch": 0.06578790551331251, "grad_norm": 1.597461462020874, "learning_rate": 1.857052639654499e-05, "loss": 0.6745, "step": 1380 }, { "epoch": 0.06583557790861201, "grad_norm": 1.2859656810760498, "learning_rate": 1.8568501930492204e-05, "loss": 0.4271, "step": 1381 }, { "epoch": 0.06588325030391153, "grad_norm": 1.1040730476379395, "learning_rate": 1.8566476142407814e-05, "loss": 0.1298, "step": 1382 }, { "epoch": 0.06593092269921103, "grad_norm": 2.955308675765991, "learning_rate": 1.856444903260437e-05, "loss": 0.6103, "step": 1383 }, { "epoch": 0.06597859509451053, "grad_norm": 1.4735954999923706, "learning_rate": 1.856242060139464e-05, "loss": 1.0844, "step": 1384 }, { "epoch": 0.06602626748981003, "grad_norm": 2.774493932723999, "learning_rate": 1.8560390849091585e-05, "loss": 1.103, "step": 1385 }, { "epoch": 0.06607393988510953, "grad_norm": 1.628644585609436, "learning_rate": 1.8558359776008377e-05, "loss": 0.7914, "step": 1386 }, { "epoch": 0.06612161228040903, "grad_norm": 1.5361404418945312, "learning_rate": 1.855632738245839e-05, "loss": 0.5651, "step": 1387 }, { "epoch": 0.06616928467570853, "grad_norm": 4.0101752281188965, "learning_rate": 1.8554293668755203e-05, "loss": 0.7962, "step": 1388 }, { "epoch": 0.06621695707100804, "grad_norm": 1.46977698802948, "learning_rate": 1.855225863521259e-05, "loss": 0.8936, "step": 1389 }, { "epoch": 0.06626462946630754, "grad_norm": 2.5550119876861572, "learning_rate": 1.8550222282144544e-05, "loss": 1.2993, "step": 1390 }, { "epoch": 0.06631230186160704, "grad_norm": 1.6742734909057617, "learning_rate": 1.854818460986525e-05, "loss": 0.5964, "step": 1391 }, { "epoch": 0.06635997425690654, "grad_norm": 1.6683356761932373, "learning_rate": 1.85461456186891e-05, "loss": 0.4198, "step": 1392 }, { "epoch": 0.06640764665220604, "grad_norm": 1.9733245372772217, "learning_rate": 1.8544105308930688e-05, "loss": 0.6145, "step": 1393 }, { "epoch": 0.06645531904750554, "grad_norm": 1.9773991107940674, "learning_rate": 1.8542063680904818e-05, "loss": 1.0799, "step": 1394 }, { "epoch": 0.06650299144280504, "grad_norm": 2.01918888092041, "learning_rate": 1.8540020734926483e-05, "loss": 1.0645, "step": 1395 }, { "epoch": 0.06655066383810454, "grad_norm": 1.5858941078186035, "learning_rate": 1.85379764713109e-05, "loss": 1.0481, "step": 1396 }, { "epoch": 0.06659833623340405, "grad_norm": 2.7093989849090576, "learning_rate": 1.8535930890373467e-05, "loss": 1.0287, "step": 1397 }, { "epoch": 0.06664600862870355, "grad_norm": 4.241600036621094, "learning_rate": 1.85338839924298e-05, "loss": 0.3816, "step": 1398 }, { "epoch": 0.06669368102400305, "grad_norm": 1.649603247642517, "learning_rate": 1.853183577779572e-05, "loss": 0.81, "step": 1399 }, { "epoch": 0.06674135341930255, "grad_norm": 1.113547682762146, "learning_rate": 1.8529786246787235e-05, "loss": 0.6889, "step": 1400 }, { "epoch": 0.06678902581460205, "grad_norm": 1.7195663452148438, "learning_rate": 1.8527735399720575e-05, "loss": 0.6214, "step": 1401 }, { "epoch": 0.06683669820990155, "grad_norm": 2.9614198207855225, "learning_rate": 1.852568323691216e-05, "loss": 0.5665, "step": 1402 }, { "epoch": 0.06688437060520105, "grad_norm": 2.033257246017456, "learning_rate": 1.8523629758678618e-05, "loss": 0.4025, "step": 1403 }, { "epoch": 0.06693204300050055, "grad_norm": 1.3746601343154907, "learning_rate": 1.8521574965336783e-05, "loss": 0.3748, "step": 1404 }, { "epoch": 0.06697971539580007, "grad_norm": 2.830188274383545, "learning_rate": 1.8519518857203686e-05, "loss": 1.1046, "step": 1405 }, { "epoch": 0.06702738779109957, "grad_norm": 1.7468923330307007, "learning_rate": 1.8517461434596563e-05, "loss": 0.825, "step": 1406 }, { "epoch": 0.06707506018639907, "grad_norm": 2.2459235191345215, "learning_rate": 1.851540269783285e-05, "loss": 0.9809, "step": 1407 }, { "epoch": 0.06712273258169857, "grad_norm": 1.8404539823532104, "learning_rate": 1.8513342647230197e-05, "loss": 0.7006, "step": 1408 }, { "epoch": 0.06717040497699807, "grad_norm": 2.422340154647827, "learning_rate": 1.8511281283106442e-05, "loss": 0.7631, "step": 1409 }, { "epoch": 0.06721807737229757, "grad_norm": 1.9104775190353394, "learning_rate": 1.850921860577964e-05, "loss": 0.8258, "step": 1410 }, { "epoch": 0.06726574976759707, "grad_norm": 1.5409177541732788, "learning_rate": 1.8507154615568027e-05, "loss": 0.7888, "step": 1411 }, { "epoch": 0.06731342216289657, "grad_norm": 1.7304933071136475, "learning_rate": 1.8505089312790067e-05, "loss": 0.5328, "step": 1412 }, { "epoch": 0.06736109455819608, "grad_norm": 1.835681676864624, "learning_rate": 1.850302269776441e-05, "loss": 0.8584, "step": 1413 }, { "epoch": 0.06740876695349558, "grad_norm": 2.1688413619995117, "learning_rate": 1.8500954770809915e-05, "loss": 0.8466, "step": 1414 }, { "epoch": 0.06745643934879508, "grad_norm": 2.751159429550171, "learning_rate": 1.8498885532245643e-05, "loss": 0.6508, "step": 1415 }, { "epoch": 0.06750411174409458, "grad_norm": 1.539573311805725, "learning_rate": 1.8496814982390856e-05, "loss": 0.5461, "step": 1416 }, { "epoch": 0.06755178413939408, "grad_norm": 1.7922183275222778, "learning_rate": 1.8494743121565015e-05, "loss": 0.8011, "step": 1417 }, { "epoch": 0.06759945653469358, "grad_norm": 1.9376097917556763, "learning_rate": 1.8492669950087792e-05, "loss": 0.8742, "step": 1418 }, { "epoch": 0.06764712892999308, "grad_norm": 1.8946641683578491, "learning_rate": 1.849059546827905e-05, "loss": 0.8625, "step": 1419 }, { "epoch": 0.06769480132529258, "grad_norm": 1.5216120481491089, "learning_rate": 1.8488519676458868e-05, "loss": 0.8587, "step": 1420 }, { "epoch": 0.0677424737205921, "grad_norm": 1.8230687379837036, "learning_rate": 1.848644257494751e-05, "loss": 0.4559, "step": 1421 }, { "epoch": 0.0677901461158916, "grad_norm": 1.2247145175933838, "learning_rate": 1.8484364164065457e-05, "loss": 0.3531, "step": 1422 }, { "epoch": 0.0678378185111911, "grad_norm": 4.0727434158325195, "learning_rate": 1.8482284444133388e-05, "loss": 1.3372, "step": 1423 }, { "epoch": 0.0678854909064906, "grad_norm": 2.97822642326355, "learning_rate": 1.848020341547218e-05, "loss": 1.1882, "step": 1424 }, { "epoch": 0.0679331633017901, "grad_norm": 1.7735543251037598, "learning_rate": 1.8478121078402914e-05, "loss": 0.8361, "step": 1425 }, { "epoch": 0.0679808356970896, "grad_norm": 1.4802738428115845, "learning_rate": 1.847603743324687e-05, "loss": 0.8286, "step": 1426 }, { "epoch": 0.0680285080923891, "grad_norm": 1.2952511310577393, "learning_rate": 1.847395248032554e-05, "loss": 0.8702, "step": 1427 }, { "epoch": 0.0680761804876886, "grad_norm": 1.3495980501174927, "learning_rate": 1.8471866219960604e-05, "loss": 0.8121, "step": 1428 }, { "epoch": 0.06812385288298811, "grad_norm": 1.3693236112594604, "learning_rate": 1.8469778652473955e-05, "loss": 0.8524, "step": 1429 }, { "epoch": 0.06817152527828761, "grad_norm": 1.3611301183700562, "learning_rate": 1.8467689778187684e-05, "loss": 0.4362, "step": 1430 }, { "epoch": 0.06821919767358711, "grad_norm": 1.2123875617980957, "learning_rate": 1.8465599597424076e-05, "loss": 0.5462, "step": 1431 }, { "epoch": 0.06826687006888661, "grad_norm": 5.222366809844971, "learning_rate": 1.8463508110505635e-05, "loss": 0.7139, "step": 1432 }, { "epoch": 0.06831454246418611, "grad_norm": 1.6212717294692993, "learning_rate": 1.8461415317755046e-05, "loss": 0.8212, "step": 1433 }, { "epoch": 0.06836221485948561, "grad_norm": 1.7302656173706055, "learning_rate": 1.8459321219495207e-05, "loss": 0.7832, "step": 1434 }, { "epoch": 0.06840988725478511, "grad_norm": 2.1390864849090576, "learning_rate": 1.845722581604922e-05, "loss": 0.6001, "step": 1435 }, { "epoch": 0.06845755965008461, "grad_norm": 1.9199094772338867, "learning_rate": 1.8455129107740383e-05, "loss": 0.9711, "step": 1436 }, { "epoch": 0.06850523204538413, "grad_norm": 1.5917038917541504, "learning_rate": 1.8453031094892196e-05, "loss": 0.7969, "step": 1437 }, { "epoch": 0.06855290444068363, "grad_norm": 3.0862739086151123, "learning_rate": 1.845093177782836e-05, "loss": 1.115, "step": 1438 }, { "epoch": 0.06860057683598313, "grad_norm": 2.6472458839416504, "learning_rate": 1.844883115687278e-05, "loss": 0.8385, "step": 1439 }, { "epoch": 0.06864824923128263, "grad_norm": 1.9862874746322632, "learning_rate": 1.8446729232349557e-05, "loss": 0.6876, "step": 1440 }, { "epoch": 0.06869592162658213, "grad_norm": 2.886171817779541, "learning_rate": 1.8444626004582998e-05, "loss": 0.7345, "step": 1441 }, { "epoch": 0.06874359402188163, "grad_norm": 1.312821388244629, "learning_rate": 1.8442521473897606e-05, "loss": 0.7208, "step": 1442 }, { "epoch": 0.06879126641718113, "grad_norm": 1.1149388551712036, "learning_rate": 1.8440415640618097e-05, "loss": 0.684, "step": 1443 }, { "epoch": 0.06883893881248063, "grad_norm": 1.811888575553894, "learning_rate": 1.843830850506937e-05, "loss": 0.6486, "step": 1444 }, { "epoch": 0.06888661120778014, "grad_norm": 7.313960552215576, "learning_rate": 1.843620006757654e-05, "loss": 0.5611, "step": 1445 }, { "epoch": 0.06893428360307964, "grad_norm": 2.800572633743286, "learning_rate": 1.8434090328464916e-05, "loss": 1.0487, "step": 1446 }, { "epoch": 0.06898195599837914, "grad_norm": 0.9820416569709778, "learning_rate": 1.843197928806001e-05, "loss": 0.4441, "step": 1447 }, { "epoch": 0.06902962839367864, "grad_norm": 2.877941131591797, "learning_rate": 1.842986694668753e-05, "loss": 0.2468, "step": 1448 }, { "epoch": 0.06907730078897814, "grad_norm": 1.351880431175232, "learning_rate": 1.8427753304673395e-05, "loss": 0.8793, "step": 1449 }, { "epoch": 0.06912497318427764, "grad_norm": 2.89223313331604, "learning_rate": 1.842563836234371e-05, "loss": 0.3041, "step": 1450 }, { "epoch": 0.06917264557957714, "grad_norm": 1.8016570806503296, "learning_rate": 1.8423522120024793e-05, "loss": 0.9527, "step": 1451 }, { "epoch": 0.06922031797487664, "grad_norm": 2.361917495727539, "learning_rate": 1.842140457804316e-05, "loss": 0.9726, "step": 1452 }, { "epoch": 0.06926799037017616, "grad_norm": 2.007028341293335, "learning_rate": 1.8419285736725524e-05, "loss": 1.1446, "step": 1453 }, { "epoch": 0.06931566276547566, "grad_norm": 1.4965935945510864, "learning_rate": 1.8417165596398803e-05, "loss": 0.871, "step": 1454 }, { "epoch": 0.06936333516077516, "grad_norm": 1.5803940296173096, "learning_rate": 1.8415044157390105e-05, "loss": 0.8411, "step": 1455 }, { "epoch": 0.06941100755607466, "grad_norm": 2.941607713699341, "learning_rate": 1.8412921420026757e-05, "loss": 0.4878, "step": 1456 }, { "epoch": 0.06945867995137416, "grad_norm": 1.9873912334442139, "learning_rate": 1.8410797384636267e-05, "loss": 0.7835, "step": 1457 }, { "epoch": 0.06950635234667366, "grad_norm": 1.763690710067749, "learning_rate": 1.8408672051546355e-05, "loss": 0.7951, "step": 1458 }, { "epoch": 0.06955402474197316, "grad_norm": 6.737948417663574, "learning_rate": 1.840654542108494e-05, "loss": 0.452, "step": 1459 }, { "epoch": 0.06960169713727266, "grad_norm": 2.328399181365967, "learning_rate": 1.8404417493580138e-05, "loss": 0.8578, "step": 1460 }, { "epoch": 0.06964936953257217, "grad_norm": 1.7654873132705688, "learning_rate": 1.840228826936026e-05, "loss": 0.6632, "step": 1461 }, { "epoch": 0.06969704192787167, "grad_norm": 1.4532341957092285, "learning_rate": 1.8400157748753835e-05, "loss": 0.4032, "step": 1462 }, { "epoch": 0.06974471432317117, "grad_norm": 11.273175239562988, "learning_rate": 1.839802593208957e-05, "loss": 0.8069, "step": 1463 }, { "epoch": 0.06979238671847067, "grad_norm": 1.4662144184112549, "learning_rate": 1.839589281969639e-05, "loss": 0.3238, "step": 1464 }, { "epoch": 0.06984005911377017, "grad_norm": 2.0241737365722656, "learning_rate": 1.8393758411903406e-05, "loss": 0.7023, "step": 1465 }, { "epoch": 0.06988773150906967, "grad_norm": 1.6374173164367676, "learning_rate": 1.839162270903994e-05, "loss": 0.6782, "step": 1466 }, { "epoch": 0.06993540390436917, "grad_norm": 1.3327624797821045, "learning_rate": 1.8389485711435505e-05, "loss": 0.5533, "step": 1467 }, { "epoch": 0.06998307629966867, "grad_norm": 2.0282294750213623, "learning_rate": 1.8387347419419824e-05, "loss": 0.6456, "step": 1468 }, { "epoch": 0.07003074869496818, "grad_norm": 1.6322304010391235, "learning_rate": 1.8385207833322805e-05, "loss": 0.5039, "step": 1469 }, { "epoch": 0.07007842109026768, "grad_norm": 4.069650650024414, "learning_rate": 1.838306695347457e-05, "loss": 0.354, "step": 1470 }, { "epoch": 0.07012609348556718, "grad_norm": 1.6614806652069092, "learning_rate": 1.8380924780205434e-05, "loss": 0.6034, "step": 1471 }, { "epoch": 0.07017376588086668, "grad_norm": 1.1433995962142944, "learning_rate": 1.837878131384591e-05, "loss": 0.518, "step": 1472 }, { "epoch": 0.07022143827616618, "grad_norm": 1.8702892065048218, "learning_rate": 1.8376636554726713e-05, "loss": 0.6899, "step": 1473 }, { "epoch": 0.07026911067146568, "grad_norm": 1.6741303205490112, "learning_rate": 1.8374490503178758e-05, "loss": 0.683, "step": 1474 }, { "epoch": 0.07031678306676518, "grad_norm": 1.2368090152740479, "learning_rate": 1.837234315953316e-05, "loss": 0.9024, "step": 1475 }, { "epoch": 0.0703644554620647, "grad_norm": 2.6631646156311035, "learning_rate": 1.8370194524121232e-05, "loss": 0.9153, "step": 1476 }, { "epoch": 0.0704121278573642, "grad_norm": 2.2205801010131836, "learning_rate": 1.8368044597274483e-05, "loss": 0.811, "step": 1477 }, { "epoch": 0.0704598002526637, "grad_norm": 1.5205700397491455, "learning_rate": 1.8365893379324628e-05, "loss": 0.7697, "step": 1478 }, { "epoch": 0.0705074726479632, "grad_norm": 1.1955779790878296, "learning_rate": 1.8363740870603578e-05, "loss": 0.6193, "step": 1479 }, { "epoch": 0.0705551450432627, "grad_norm": 1.6952061653137207, "learning_rate": 1.836158707144344e-05, "loss": 0.6845, "step": 1480 }, { "epoch": 0.0706028174385622, "grad_norm": 1.358463168144226, "learning_rate": 1.8359431982176526e-05, "loss": 0.7407, "step": 1481 }, { "epoch": 0.0706504898338617, "grad_norm": 1.8397055864334106, "learning_rate": 1.835727560313534e-05, "loss": 0.6134, "step": 1482 }, { "epoch": 0.0706981622291612, "grad_norm": 0.8710074424743652, "learning_rate": 1.8355117934652593e-05, "loss": 0.4185, "step": 1483 }, { "epoch": 0.07074583462446071, "grad_norm": 1.4832335710525513, "learning_rate": 1.835295897706119e-05, "loss": 0.7756, "step": 1484 }, { "epoch": 0.07079350701976021, "grad_norm": 4.303059101104736, "learning_rate": 1.8350798730694234e-05, "loss": 1.1799, "step": 1485 }, { "epoch": 0.07084117941505971, "grad_norm": 3.442786931991577, "learning_rate": 1.8348637195885033e-05, "loss": 0.7305, "step": 1486 }, { "epoch": 0.07088885181035921, "grad_norm": 1.6812269687652588, "learning_rate": 1.8346474372967086e-05, "loss": 0.751, "step": 1487 }, { "epoch": 0.07093652420565871, "grad_norm": 1.54777193069458, "learning_rate": 1.8344310262274093e-05, "loss": 0.8818, "step": 1488 }, { "epoch": 0.07098419660095821, "grad_norm": 2.2691755294799805, "learning_rate": 1.8342144864139962e-05, "loss": 0.6995, "step": 1489 }, { "epoch": 0.07103186899625771, "grad_norm": 2.8284544944763184, "learning_rate": 1.833997817889878e-05, "loss": 1.4897, "step": 1490 }, { "epoch": 0.07107954139155721, "grad_norm": 1.894149661064148, "learning_rate": 1.8337810206884853e-05, "loss": 0.7277, "step": 1491 }, { "epoch": 0.07112721378685673, "grad_norm": 3.558352470397949, "learning_rate": 1.8335640948432675e-05, "loss": 0.19, "step": 1492 }, { "epoch": 0.07117488618215623, "grad_norm": 1.9296996593475342, "learning_rate": 1.8333470403876935e-05, "loss": 0.7115, "step": 1493 }, { "epoch": 0.07122255857745573, "grad_norm": 2.4193904399871826, "learning_rate": 1.8331298573552534e-05, "loss": 0.9175, "step": 1494 }, { "epoch": 0.07127023097275523, "grad_norm": 1.8003171682357788, "learning_rate": 1.8329125457794557e-05, "loss": 0.6669, "step": 1495 }, { "epoch": 0.07131790336805473, "grad_norm": 1.8310233354568481, "learning_rate": 1.8326951056938295e-05, "loss": 0.6499, "step": 1496 }, { "epoch": 0.07136557576335423, "grad_norm": 1.5464767217636108, "learning_rate": 1.832477537131924e-05, "loss": 0.8482, "step": 1497 }, { "epoch": 0.07141324815865373, "grad_norm": 0.9716111421585083, "learning_rate": 1.8322598401273067e-05, "loss": 0.2961, "step": 1498 }, { "epoch": 0.07146092055395323, "grad_norm": 1.4455845355987549, "learning_rate": 1.8320420147135674e-05, "loss": 0.2451, "step": 1499 }, { "epoch": 0.07150859294925274, "grad_norm": 1.3346799612045288, "learning_rate": 1.831824060924313e-05, "loss": 0.7899, "step": 1500 }, { "epoch": 0.07155626534455224, "grad_norm": 1.3627867698669434, "learning_rate": 1.8316059787931725e-05, "loss": 0.8287, "step": 1501 }, { "epoch": 0.07160393773985174, "grad_norm": 1.9677326679229736, "learning_rate": 1.831387768353793e-05, "loss": 1.0129, "step": 1502 }, { "epoch": 0.07165161013515124, "grad_norm": 1.306816577911377, "learning_rate": 1.831169429639843e-05, "loss": 0.4651, "step": 1503 }, { "epoch": 0.07169928253045074, "grad_norm": 1.2083654403686523, "learning_rate": 1.830950962685009e-05, "loss": 0.6701, "step": 1504 }, { "epoch": 0.07174695492575024, "grad_norm": 1.4216758012771606, "learning_rate": 1.8307323675229986e-05, "loss": 0.702, "step": 1505 }, { "epoch": 0.07179462732104974, "grad_norm": 1.6769384145736694, "learning_rate": 1.8305136441875388e-05, "loss": 0.8615, "step": 1506 }, { "epoch": 0.07184229971634924, "grad_norm": 1.4091429710388184, "learning_rate": 1.8302947927123767e-05, "loss": 0.6523, "step": 1507 }, { "epoch": 0.07188997211164876, "grad_norm": 0.9538880586624146, "learning_rate": 1.8300758131312778e-05, "loss": 0.5335, "step": 1508 }, { "epoch": 0.07193764450694826, "grad_norm": 1.8690299987792969, "learning_rate": 1.8298567054780295e-05, "loss": 0.7042, "step": 1509 }, { "epoch": 0.07198531690224776, "grad_norm": 1.7488499879837036, "learning_rate": 1.8296374697864376e-05, "loss": 0.5988, "step": 1510 }, { "epoch": 0.07203298929754726, "grad_norm": 2.2740328311920166, "learning_rate": 1.8294181060903275e-05, "loss": 0.7583, "step": 1511 }, { "epoch": 0.07208066169284676, "grad_norm": 2.075404405593872, "learning_rate": 1.829198614423545e-05, "loss": 0.7631, "step": 1512 }, { "epoch": 0.07212833408814626, "grad_norm": 3.8949429988861084, "learning_rate": 1.8289789948199553e-05, "loss": 0.3551, "step": 1513 }, { "epoch": 0.07217600648344576, "grad_norm": 2.0617401599884033, "learning_rate": 1.8287592473134436e-05, "loss": 0.6646, "step": 1514 }, { "epoch": 0.07222367887874526, "grad_norm": 2.184965133666992, "learning_rate": 1.8285393719379146e-05, "loss": 0.901, "step": 1515 }, { "epoch": 0.07227135127404477, "grad_norm": 1.4092891216278076, "learning_rate": 1.8283193687272927e-05, "loss": 0.8031, "step": 1516 }, { "epoch": 0.07231902366934427, "grad_norm": 2.0028107166290283, "learning_rate": 1.8280992377155224e-05, "loss": 0.7283, "step": 1517 }, { "epoch": 0.07236669606464377, "grad_norm": 2.107572555541992, "learning_rate": 1.8278789789365675e-05, "loss": 0.9222, "step": 1518 }, { "epoch": 0.07241436845994327, "grad_norm": 1.6521728038787842, "learning_rate": 1.8276585924244113e-05, "loss": 0.5648, "step": 1519 }, { "epoch": 0.07246204085524277, "grad_norm": 0.9772564768791199, "learning_rate": 1.827438078213058e-05, "loss": 0.6166, "step": 1520 }, { "epoch": 0.07250971325054227, "grad_norm": 2.127978801727295, "learning_rate": 1.82721743633653e-05, "loss": 1.1091, "step": 1521 }, { "epoch": 0.07255738564584177, "grad_norm": 4.721246242523193, "learning_rate": 1.8269966668288704e-05, "loss": 1.0732, "step": 1522 }, { "epoch": 0.07260505804114127, "grad_norm": 2.551135778427124, "learning_rate": 1.8267757697241415e-05, "loss": 1.3089, "step": 1523 }, { "epoch": 0.07265273043644078, "grad_norm": 2.385348320007324, "learning_rate": 1.826554745056425e-05, "loss": 0.7595, "step": 1524 }, { "epoch": 0.07270040283174029, "grad_norm": 1.948810338973999, "learning_rate": 1.8263335928598237e-05, "loss": 1.1947, "step": 1525 }, { "epoch": 0.07274807522703979, "grad_norm": 1.3272274732589722, "learning_rate": 1.8261123131684587e-05, "loss": 0.8178, "step": 1526 }, { "epoch": 0.07279574762233929, "grad_norm": 1.742832899093628, "learning_rate": 1.8258909060164706e-05, "loss": 0.7982, "step": 1527 }, { "epoch": 0.07284342001763879, "grad_norm": 1.7113311290740967, "learning_rate": 1.8256693714380214e-05, "loss": 0.7727, "step": 1528 }, { "epoch": 0.07289109241293829, "grad_norm": 2.825806140899658, "learning_rate": 1.8254477094672903e-05, "loss": 1.2915, "step": 1529 }, { "epoch": 0.07293876480823779, "grad_norm": 1.0234042406082153, "learning_rate": 1.8252259201384786e-05, "loss": 0.4247, "step": 1530 }, { "epoch": 0.07298643720353729, "grad_norm": 1.4936145544052124, "learning_rate": 1.825004003485805e-05, "loss": 0.7923, "step": 1531 }, { "epoch": 0.0730341095988368, "grad_norm": 3.9392523765563965, "learning_rate": 1.8247819595435102e-05, "loss": 0.4282, "step": 1532 }, { "epoch": 0.0730817819941363, "grad_norm": 1.4250996112823486, "learning_rate": 1.8245597883458524e-05, "loss": 0.7803, "step": 1533 }, { "epoch": 0.0731294543894358, "grad_norm": 1.8672455549240112, "learning_rate": 1.8243374899271103e-05, "loss": 0.8509, "step": 1534 }, { "epoch": 0.0731771267847353, "grad_norm": 1.7403994798660278, "learning_rate": 1.8241150643215828e-05, "loss": 0.4006, "step": 1535 }, { "epoch": 0.0732247991800348, "grad_norm": 1.4508147239685059, "learning_rate": 1.823892511563588e-05, "loss": 0.7365, "step": 1536 }, { "epoch": 0.0732724715753343, "grad_norm": 1.2154483795166016, "learning_rate": 1.8236698316874625e-05, "loss": 0.4724, "step": 1537 }, { "epoch": 0.0733201439706338, "grad_norm": 1.597983717918396, "learning_rate": 1.8234470247275644e-05, "loss": 0.9239, "step": 1538 }, { "epoch": 0.0733678163659333, "grad_norm": 1.3959144353866577, "learning_rate": 1.8232240907182702e-05, "loss": 0.8261, "step": 1539 }, { "epoch": 0.07341548876123281, "grad_norm": 1.3057143688201904, "learning_rate": 1.8230010296939764e-05, "loss": 0.7321, "step": 1540 }, { "epoch": 0.07346316115653231, "grad_norm": 1.770998239517212, "learning_rate": 1.822777841689099e-05, "loss": 0.7913, "step": 1541 }, { "epoch": 0.07351083355183181, "grad_norm": 7.75728178024292, "learning_rate": 1.8225545267380736e-05, "loss": 0.8328, "step": 1542 }, { "epoch": 0.07355850594713131, "grad_norm": 2.049203634262085, "learning_rate": 1.8223310848753552e-05, "loss": 0.7342, "step": 1543 }, { "epoch": 0.07360617834243081, "grad_norm": 3.39388108253479, "learning_rate": 1.822107516135419e-05, "loss": 1.4048, "step": 1544 }, { "epoch": 0.07365385073773031, "grad_norm": 2.0998194217681885, "learning_rate": 1.821883820552759e-05, "loss": 0.2866, "step": 1545 }, { "epoch": 0.07370152313302981, "grad_norm": 3.1076385974884033, "learning_rate": 1.8216599981618895e-05, "loss": 0.9114, "step": 1546 }, { "epoch": 0.07374919552832931, "grad_norm": 0.9885009527206421, "learning_rate": 1.8214360489973435e-05, "loss": 0.6137, "step": 1547 }, { "epoch": 0.07379686792362883, "grad_norm": 1.954210877418518, "learning_rate": 1.8212119730936745e-05, "loss": 0.7286, "step": 1548 }, { "epoch": 0.07384454031892833, "grad_norm": 3.475102186203003, "learning_rate": 1.8209877704854547e-05, "loss": 0.7378, "step": 1549 }, { "epoch": 0.07389221271422783, "grad_norm": 1.5269593000411987, "learning_rate": 1.8207634412072765e-05, "loss": 0.8207, "step": 1550 }, { "epoch": 0.07393988510952733, "grad_norm": 1.8889150619506836, "learning_rate": 1.8205389852937516e-05, "loss": 0.7764, "step": 1551 }, { "epoch": 0.07398755750482683, "grad_norm": 1.664652705192566, "learning_rate": 1.820314402779511e-05, "loss": 0.3929, "step": 1552 }, { "epoch": 0.07403522990012633, "grad_norm": 1.946104645729065, "learning_rate": 1.820089693699206e-05, "loss": 1.0455, "step": 1553 }, { "epoch": 0.07408290229542583, "grad_norm": 1.2063770294189453, "learning_rate": 1.8198648580875063e-05, "loss": 0.2833, "step": 1554 }, { "epoch": 0.07413057469072533, "grad_norm": 1.4298354387283325, "learning_rate": 1.8196398959791022e-05, "loss": 0.7905, "step": 1555 }, { "epoch": 0.07417824708602484, "grad_norm": 1.6317298412322998, "learning_rate": 1.8194148074087025e-05, "loss": 0.6171, "step": 1556 }, { "epoch": 0.07422591948132434, "grad_norm": 1.4596104621887207, "learning_rate": 1.8191895924110364e-05, "loss": 0.4242, "step": 1557 }, { "epoch": 0.07427359187662384, "grad_norm": 2.159130334854126, "learning_rate": 1.8189642510208525e-05, "loss": 1.1579, "step": 1558 }, { "epoch": 0.07432126427192334, "grad_norm": 1.8697502613067627, "learning_rate": 1.818738783272918e-05, "loss": 0.6715, "step": 1559 }, { "epoch": 0.07436893666722284, "grad_norm": 1.6227350234985352, "learning_rate": 1.818513189202021e-05, "loss": 0.8521, "step": 1560 }, { "epoch": 0.07441660906252234, "grad_norm": 2.830514907836914, "learning_rate": 1.8182874688429674e-05, "loss": 1.6047, "step": 1561 }, { "epoch": 0.07446428145782184, "grad_norm": 1.4403462409973145, "learning_rate": 1.8180616222305847e-05, "loss": 0.753, "step": 1562 }, { "epoch": 0.07451195385312134, "grad_norm": 1.779158353805542, "learning_rate": 1.817835649399718e-05, "loss": 0.6482, "step": 1563 }, { "epoch": 0.07455962624842086, "grad_norm": 3.1841413974761963, "learning_rate": 1.817609550385232e-05, "loss": 0.6392, "step": 1564 }, { "epoch": 0.07460729864372036, "grad_norm": 2.0441768169403076, "learning_rate": 1.817383325222013e-05, "loss": 0.5695, "step": 1565 }, { "epoch": 0.07465497103901986, "grad_norm": 1.3322478532791138, "learning_rate": 1.8171569739449642e-05, "loss": 0.6165, "step": 1566 }, { "epoch": 0.07470264343431936, "grad_norm": 28.239721298217773, "learning_rate": 1.8169304965890088e-05, "loss": 0.3622, "step": 1567 }, { "epoch": 0.07475031582961886, "grad_norm": 1.3498544692993164, "learning_rate": 1.816703893189091e-05, "loss": 0.8487, "step": 1568 }, { "epoch": 0.07479798822491836, "grad_norm": 2.1444287300109863, "learning_rate": 1.816477163780173e-05, "loss": 1.01, "step": 1569 }, { "epoch": 0.07484566062021786, "grad_norm": 1.8706082105636597, "learning_rate": 1.8162503083972365e-05, "loss": 0.5758, "step": 1570 }, { "epoch": 0.07489333301551737, "grad_norm": 1.309287190437317, "learning_rate": 1.816023327075283e-05, "loss": 0.3584, "step": 1571 }, { "epoch": 0.07494100541081687, "grad_norm": 3.1240546703338623, "learning_rate": 1.815796219849334e-05, "loss": 1.1831, "step": 1572 }, { "epoch": 0.07498867780611637, "grad_norm": 2.0304906368255615, "learning_rate": 1.815568986754429e-05, "loss": 0.8677, "step": 1573 }, { "epoch": 0.07503635020141587, "grad_norm": 3.12980318069458, "learning_rate": 1.815341627825628e-05, "loss": 1.0142, "step": 1574 }, { "epoch": 0.07508402259671537, "grad_norm": 1.3019670248031616, "learning_rate": 1.8151141430980106e-05, "loss": 1.0115, "step": 1575 }, { "epoch": 0.07513169499201487, "grad_norm": 3.11868953704834, "learning_rate": 1.814886532606675e-05, "loss": 0.958, "step": 1576 }, { "epoch": 0.07517936738731437, "grad_norm": 1.458827018737793, "learning_rate": 1.8146587963867388e-05, "loss": 0.935, "step": 1577 }, { "epoch": 0.07522703978261387, "grad_norm": 1.6715525388717651, "learning_rate": 1.8144309344733397e-05, "loss": 0.7555, "step": 1578 }, { "epoch": 0.07527471217791339, "grad_norm": 4.716001987457275, "learning_rate": 1.8142029469016345e-05, "loss": 1.0024, "step": 1579 }, { "epoch": 0.07532238457321289, "grad_norm": 3.0512728691101074, "learning_rate": 1.8139748337067993e-05, "loss": 0.5921, "step": 1580 }, { "epoch": 0.07537005696851239, "grad_norm": 1.8836324214935303, "learning_rate": 1.8137465949240294e-05, "loss": 0.9274, "step": 1581 }, { "epoch": 0.07541772936381189, "grad_norm": 1.914754867553711, "learning_rate": 1.8135182305885403e-05, "loss": 0.579, "step": 1582 }, { "epoch": 0.07546540175911139, "grad_norm": 1.2325947284698486, "learning_rate": 1.8132897407355657e-05, "loss": 0.8546, "step": 1583 }, { "epoch": 0.07551307415441089, "grad_norm": 1.4415030479431152, "learning_rate": 1.813061125400359e-05, "loss": 0.5904, "step": 1584 }, { "epoch": 0.07556074654971039, "grad_norm": 1.9776607751846313, "learning_rate": 1.812832384618194e-05, "loss": 0.9808, "step": 1585 }, { "epoch": 0.07560841894500989, "grad_norm": 1.2737736701965332, "learning_rate": 1.8126035184243623e-05, "loss": 0.7254, "step": 1586 }, { "epoch": 0.0756560913403094, "grad_norm": 1.6993833780288696, "learning_rate": 1.812374526854176e-05, "loss": 0.4322, "step": 1587 }, { "epoch": 0.0757037637356089, "grad_norm": 1.276158094406128, "learning_rate": 1.812145409942966e-05, "loss": 0.772, "step": 1588 }, { "epoch": 0.0757514361309084, "grad_norm": 2.059755325317383, "learning_rate": 1.8119161677260827e-05, "loss": 1.1274, "step": 1589 }, { "epoch": 0.0757991085262079, "grad_norm": 2.593461513519287, "learning_rate": 1.811686800238896e-05, "loss": 0.5567, "step": 1590 }, { "epoch": 0.0758467809215074, "grad_norm": 2.1723110675811768, "learning_rate": 1.8114573075167947e-05, "loss": 0.8908, "step": 1591 }, { "epoch": 0.0758944533168069, "grad_norm": 1.1874706745147705, "learning_rate": 1.8112276895951872e-05, "loss": 0.144, "step": 1592 }, { "epoch": 0.0759421257121064, "grad_norm": 2.7023580074310303, "learning_rate": 1.8109979465095014e-05, "loss": 0.9457, "step": 1593 }, { "epoch": 0.0759897981074059, "grad_norm": 2.2425551414489746, "learning_rate": 1.810768078295184e-05, "loss": 1.0159, "step": 1594 }, { "epoch": 0.07603747050270541, "grad_norm": 2.7102904319763184, "learning_rate": 1.8105380849877013e-05, "loss": 1.297, "step": 1595 }, { "epoch": 0.07608514289800491, "grad_norm": 1.3896920680999756, "learning_rate": 1.810307966622539e-05, "loss": 0.9087, "step": 1596 }, { "epoch": 0.07613281529330441, "grad_norm": 1.352295160293579, "learning_rate": 1.8100777232352022e-05, "loss": 0.8202, "step": 1597 }, { "epoch": 0.07618048768860392, "grad_norm": 1.3081729412078857, "learning_rate": 1.8098473548612146e-05, "loss": 0.8788, "step": 1598 }, { "epoch": 0.07622816008390342, "grad_norm": 1.5088319778442383, "learning_rate": 1.8096168615361203e-05, "loss": 0.6741, "step": 1599 }, { "epoch": 0.07627583247920292, "grad_norm": 2.373295307159424, "learning_rate": 1.8093862432954815e-05, "loss": 0.6484, "step": 1600 }, { "epoch": 0.07632350487450242, "grad_norm": 2.361781120300293, "learning_rate": 1.809155500174881e-05, "loss": 0.8694, "step": 1601 }, { "epoch": 0.07637117726980192, "grad_norm": 2.5788233280181885, "learning_rate": 1.8089246322099188e-05, "loss": 0.8709, "step": 1602 }, { "epoch": 0.07641884966510143, "grad_norm": 2.2018938064575195, "learning_rate": 1.8086936394362165e-05, "loss": 0.3707, "step": 1603 }, { "epoch": 0.07646652206040093, "grad_norm": 2.9525387287139893, "learning_rate": 1.808462521889413e-05, "loss": 0.8743, "step": 1604 }, { "epoch": 0.07651419445570043, "grad_norm": 9.3198823928833, "learning_rate": 1.8082312796051685e-05, "loss": 0.2377, "step": 1605 }, { "epoch": 0.07656186685099993, "grad_norm": 2.2434709072113037, "learning_rate": 1.807999912619161e-05, "loss": 0.6437, "step": 1606 }, { "epoch": 0.07660953924629943, "grad_norm": 2.3313891887664795, "learning_rate": 1.807768420967087e-05, "loss": 0.9975, "step": 1607 }, { "epoch": 0.07665721164159893, "grad_norm": 1.0797885656356812, "learning_rate": 1.8075368046846647e-05, "loss": 0.3199, "step": 1608 }, { "epoch": 0.07670488403689843, "grad_norm": 3.6646196842193604, "learning_rate": 1.807305063807629e-05, "loss": 0.9589, "step": 1609 }, { "epoch": 0.07675255643219793, "grad_norm": 1.667553424835205, "learning_rate": 1.8070731983717357e-05, "loss": 0.957, "step": 1610 }, { "epoch": 0.07680022882749744, "grad_norm": 1.9350658655166626, "learning_rate": 1.8068412084127594e-05, "loss": 0.9101, "step": 1611 }, { "epoch": 0.07684790122279694, "grad_norm": 2.238814115524292, "learning_rate": 1.8066090939664934e-05, "loss": 0.7569, "step": 1612 }, { "epoch": 0.07689557361809644, "grad_norm": 2.1624526977539062, "learning_rate": 1.8063768550687504e-05, "loss": 0.8103, "step": 1613 }, { "epoch": 0.07694324601339594, "grad_norm": 1.4392058849334717, "learning_rate": 1.806144491755363e-05, "loss": 0.8295, "step": 1614 }, { "epoch": 0.07699091840869544, "grad_norm": 1.195868968963623, "learning_rate": 1.805912004062182e-05, "loss": 0.6247, "step": 1615 }, { "epoch": 0.07703859080399494, "grad_norm": 1.4350346326828003, "learning_rate": 1.8056793920250784e-05, "loss": 1.0748, "step": 1616 }, { "epoch": 0.07708626319929444, "grad_norm": 2.500607967376709, "learning_rate": 1.805446655679941e-05, "loss": 0.4902, "step": 1617 }, { "epoch": 0.07713393559459394, "grad_norm": 1.0993422269821167, "learning_rate": 1.8052137950626795e-05, "loss": 0.3553, "step": 1618 }, { "epoch": 0.07718160798989346, "grad_norm": 2.4582574367523193, "learning_rate": 1.8049808102092213e-05, "loss": 0.8212, "step": 1619 }, { "epoch": 0.07722928038519296, "grad_norm": 1.8061506748199463, "learning_rate": 1.8047477011555142e-05, "loss": 0.9618, "step": 1620 }, { "epoch": 0.07727695278049246, "grad_norm": 1.089192509651184, "learning_rate": 1.804514467937524e-05, "loss": 0.8592, "step": 1621 }, { "epoch": 0.07732462517579196, "grad_norm": 2.998304605484009, "learning_rate": 1.804281110591236e-05, "loss": 0.725, "step": 1622 }, { "epoch": 0.07737229757109146, "grad_norm": 2.592383623123169, "learning_rate": 1.804047629152655e-05, "loss": 0.6466, "step": 1623 }, { "epoch": 0.07741996996639096, "grad_norm": 1.3715007305145264, "learning_rate": 1.8038140236578053e-05, "loss": 0.7357, "step": 1624 }, { "epoch": 0.07746764236169046, "grad_norm": 1.8576723337173462, "learning_rate": 1.803580294142729e-05, "loss": 0.8056, "step": 1625 }, { "epoch": 0.07751531475698996, "grad_norm": 2.1937034130096436, "learning_rate": 1.803346440643489e-05, "loss": 0.1831, "step": 1626 }, { "epoch": 0.07756298715228947, "grad_norm": 1.4345577955245972, "learning_rate": 1.803112463196166e-05, "loss": 0.9221, "step": 1627 }, { "epoch": 0.07761065954758897, "grad_norm": 1.2779603004455566, "learning_rate": 1.8028783618368603e-05, "loss": 0.7634, "step": 1628 }, { "epoch": 0.07765833194288847, "grad_norm": 3.2183146476745605, "learning_rate": 1.8026441366016915e-05, "loss": 0.3845, "step": 1629 }, { "epoch": 0.07770600433818797, "grad_norm": 1.1786555051803589, "learning_rate": 1.8024097875267982e-05, "loss": 0.6195, "step": 1630 }, { "epoch": 0.07775367673348747, "grad_norm": 2.7002832889556885, "learning_rate": 1.8021753146483373e-05, "loss": 0.7568, "step": 1631 }, { "epoch": 0.07780134912878697, "grad_norm": 2.2595055103302, "learning_rate": 1.8019407180024867e-05, "loss": 0.6159, "step": 1632 }, { "epoch": 0.07784902152408647, "grad_norm": 1.8262847661972046, "learning_rate": 1.8017059976254415e-05, "loss": 0.9114, "step": 1633 }, { "epoch": 0.07789669391938597, "grad_norm": 1.3823014497756958, "learning_rate": 1.801471153553417e-05, "loss": 0.8128, "step": 1634 }, { "epoch": 0.07794436631468549, "grad_norm": 1.43936026096344, "learning_rate": 1.801236185822647e-05, "loss": 0.6423, "step": 1635 }, { "epoch": 0.07799203870998499, "grad_norm": 2.8125345706939697, "learning_rate": 1.8010010944693846e-05, "loss": 0.919, "step": 1636 }, { "epoch": 0.07803971110528449, "grad_norm": 1.6097040176391602, "learning_rate": 1.8007658795299023e-05, "loss": 0.845, "step": 1637 }, { "epoch": 0.07808738350058399, "grad_norm": 2.403620719909668, "learning_rate": 1.800530541040491e-05, "loss": 1.0715, "step": 1638 }, { "epoch": 0.07813505589588349, "grad_norm": 3.5078952312469482, "learning_rate": 1.800295079037461e-05, "loss": 1.1827, "step": 1639 }, { "epoch": 0.07818272829118299, "grad_norm": 1.1118353605270386, "learning_rate": 1.8000594935571416e-05, "loss": 0.2507, "step": 1640 }, { "epoch": 0.07823040068648249, "grad_norm": 1.5571948289871216, "learning_rate": 1.7998237846358812e-05, "loss": 0.7452, "step": 1641 }, { "epoch": 0.07827807308178199, "grad_norm": 2.5637552738189697, "learning_rate": 1.7995879523100478e-05, "loss": 0.828, "step": 1642 }, { "epoch": 0.0783257454770815, "grad_norm": 2.611086368560791, "learning_rate": 1.7993519966160276e-05, "loss": 0.7582, "step": 1643 }, { "epoch": 0.078373417872381, "grad_norm": 1.3323707580566406, "learning_rate": 1.7991159175902257e-05, "loss": 0.369, "step": 1644 }, { "epoch": 0.0784210902676805, "grad_norm": 2.1701362133026123, "learning_rate": 1.798879715269067e-05, "loss": 1.1436, "step": 1645 }, { "epoch": 0.07846876266298, "grad_norm": 1.1985524892807007, "learning_rate": 1.7986433896889955e-05, "loss": 0.9122, "step": 1646 }, { "epoch": 0.0785164350582795, "grad_norm": 1.9176534414291382, "learning_rate": 1.7984069408864733e-05, "loss": 1.1291, "step": 1647 }, { "epoch": 0.078564107453579, "grad_norm": 2.425058364868164, "learning_rate": 1.798170368897982e-05, "loss": 0.4954, "step": 1648 }, { "epoch": 0.0786117798488785, "grad_norm": 2.8729782104492188, "learning_rate": 1.7979336737600225e-05, "loss": 0.3185, "step": 1649 }, { "epoch": 0.078659452244178, "grad_norm": 3.787822961807251, "learning_rate": 1.797696855509114e-05, "loss": 0.67, "step": 1650 }, { "epoch": 0.07870712463947752, "grad_norm": 1.6017532348632812, "learning_rate": 1.7974599141817953e-05, "loss": 0.4133, "step": 1651 }, { "epoch": 0.07875479703477702, "grad_norm": 1.2655279636383057, "learning_rate": 1.7972228498146243e-05, "loss": 0.6146, "step": 1652 }, { "epoch": 0.07880246943007652, "grad_norm": 1.6031553745269775, "learning_rate": 1.7969856624441778e-05, "loss": 0.8803, "step": 1653 }, { "epoch": 0.07885014182537602, "grad_norm": 1.5829217433929443, "learning_rate": 1.7967483521070502e-05, "loss": 0.9961, "step": 1654 }, { "epoch": 0.07889781422067552, "grad_norm": 1.976746916770935, "learning_rate": 1.7965109188398572e-05, "loss": 0.8139, "step": 1655 }, { "epoch": 0.07894548661597502, "grad_norm": 1.462019920349121, "learning_rate": 1.796273362679232e-05, "loss": 0.4961, "step": 1656 }, { "epoch": 0.07899315901127452, "grad_norm": 1.5143170356750488, "learning_rate": 1.7960356836618265e-05, "loss": 0.8146, "step": 1657 }, { "epoch": 0.07904083140657402, "grad_norm": 6.419618606567383, "learning_rate": 1.795797881824313e-05, "loss": 0.2693, "step": 1658 }, { "epoch": 0.07908850380187353, "grad_norm": 3.069215774536133, "learning_rate": 1.7955599572033816e-05, "loss": 0.7735, "step": 1659 }, { "epoch": 0.07913617619717303, "grad_norm": 1.1946492195129395, "learning_rate": 1.795321909835741e-05, "loss": 0.5344, "step": 1660 }, { "epoch": 0.07918384859247253, "grad_norm": 3.634439706802368, "learning_rate": 1.79508373975812e-05, "loss": 1.1054, "step": 1661 }, { "epoch": 0.07923152098777203, "grad_norm": 1.0564801692962646, "learning_rate": 1.794845447007266e-05, "loss": 0.6648, "step": 1662 }, { "epoch": 0.07927919338307153, "grad_norm": 1.1457446813583374, "learning_rate": 1.7946070316199448e-05, "loss": 0.3502, "step": 1663 }, { "epoch": 0.07932686577837103, "grad_norm": 3.404604911804199, "learning_rate": 1.794368493632942e-05, "loss": 0.3236, "step": 1664 }, { "epoch": 0.07937453817367053, "grad_norm": 1.3143208026885986, "learning_rate": 1.79412983308306e-05, "loss": 0.2308, "step": 1665 }, { "epoch": 0.07942221056897004, "grad_norm": 1.3976773023605347, "learning_rate": 1.7938910500071233e-05, "loss": 0.872, "step": 1666 }, { "epoch": 0.07946988296426954, "grad_norm": 9.12065601348877, "learning_rate": 1.793652144441973e-05, "loss": 1.4998, "step": 1667 }, { "epoch": 0.07951755535956904, "grad_norm": 2.228480577468872, "learning_rate": 1.79341311642447e-05, "loss": 0.7514, "step": 1668 }, { "epoch": 0.07956522775486854, "grad_norm": 1.416662573814392, "learning_rate": 1.7931739659914936e-05, "loss": 0.6806, "step": 1669 }, { "epoch": 0.07961290015016804, "grad_norm": 1.256575345993042, "learning_rate": 1.792934693179942e-05, "loss": 0.7686, "step": 1670 }, { "epoch": 0.07966057254546755, "grad_norm": 2.7229995727539062, "learning_rate": 1.7926952980267335e-05, "loss": 1.1292, "step": 1671 }, { "epoch": 0.07970824494076705, "grad_norm": 2.3350911140441895, "learning_rate": 1.7924557805688033e-05, "loss": 0.9161, "step": 1672 }, { "epoch": 0.07975591733606655, "grad_norm": 1.7685816287994385, "learning_rate": 1.792216140843107e-05, "loss": 0.8183, "step": 1673 }, { "epoch": 0.07980358973136606, "grad_norm": 2.5546646118164062, "learning_rate": 1.791976378886618e-05, "loss": 0.378, "step": 1674 }, { "epoch": 0.07985126212666556, "grad_norm": 1.115749716758728, "learning_rate": 1.79173649473633e-05, "loss": 0.5229, "step": 1675 }, { "epoch": 0.07989893452196506, "grad_norm": 1.6336308717727661, "learning_rate": 1.7914964884292543e-05, "loss": 0.7181, "step": 1676 }, { "epoch": 0.07994660691726456, "grad_norm": 1.73137629032135, "learning_rate": 1.7912563600024212e-05, "loss": 0.7605, "step": 1677 }, { "epoch": 0.07999427931256406, "grad_norm": 1.929141879081726, "learning_rate": 1.79101610949288e-05, "loss": 0.9823, "step": 1678 }, { "epoch": 0.08004195170786356, "grad_norm": 1.803404688835144, "learning_rate": 1.7907757369376984e-05, "loss": 0.9783, "step": 1679 }, { "epoch": 0.08008962410316306, "grad_norm": 1.7051862478256226, "learning_rate": 1.7905352423739648e-05, "loss": 0.8257, "step": 1680 }, { "epoch": 0.08013729649846256, "grad_norm": 2.1527657508850098, "learning_rate": 1.790294625838784e-05, "loss": 0.8811, "step": 1681 }, { "epoch": 0.08018496889376207, "grad_norm": 1.4188205003738403, "learning_rate": 1.790053887369281e-05, "loss": 0.6163, "step": 1682 }, { "epoch": 0.08023264128906157, "grad_norm": 1.7763410806655884, "learning_rate": 1.7898130270025992e-05, "loss": 0.8388, "step": 1683 }, { "epoch": 0.08028031368436107, "grad_norm": 2.461472511291504, "learning_rate": 1.7895720447759007e-05, "loss": 0.6545, "step": 1684 }, { "epoch": 0.08032798607966057, "grad_norm": 1.487349510192871, "learning_rate": 1.7893309407263665e-05, "loss": 0.8915, "step": 1685 }, { "epoch": 0.08037565847496007, "grad_norm": 1.9209725856781006, "learning_rate": 1.789089714891197e-05, "loss": 0.8508, "step": 1686 }, { "epoch": 0.08042333087025957, "grad_norm": 1.7194499969482422, "learning_rate": 1.7888483673076104e-05, "loss": 0.6629, "step": 1687 }, { "epoch": 0.08047100326555907, "grad_norm": 1.2119414806365967, "learning_rate": 1.7886068980128444e-05, "loss": 0.7833, "step": 1688 }, { "epoch": 0.08051867566085857, "grad_norm": 2.0298550128936768, "learning_rate": 1.7883653070441548e-05, "loss": 1.0334, "step": 1689 }, { "epoch": 0.08056634805615809, "grad_norm": 1.4614098072052002, "learning_rate": 1.7881235944388173e-05, "loss": 0.5798, "step": 1690 }, { "epoch": 0.08061402045145759, "grad_norm": 4.900351524353027, "learning_rate": 1.7878817602341252e-05, "loss": 0.3148, "step": 1691 }, { "epoch": 0.08066169284675709, "grad_norm": 8.526938438415527, "learning_rate": 1.7876398044673912e-05, "loss": 0.6285, "step": 1692 }, { "epoch": 0.08070936524205659, "grad_norm": 2.280665397644043, "learning_rate": 1.787397727175946e-05, "loss": 1.1673, "step": 1693 }, { "epoch": 0.08075703763735609, "grad_norm": 1.4818731546401978, "learning_rate": 1.7871555283971408e-05, "loss": 0.7537, "step": 1694 }, { "epoch": 0.08080471003265559, "grad_norm": 2.9122393131256104, "learning_rate": 1.786913208168343e-05, "loss": 1.2202, "step": 1695 }, { "epoch": 0.08085238242795509, "grad_norm": 1.7488834857940674, "learning_rate": 1.7866707665269413e-05, "loss": 0.6305, "step": 1696 }, { "epoch": 0.08090005482325459, "grad_norm": 2.081796646118164, "learning_rate": 1.7864282035103415e-05, "loss": 0.9135, "step": 1697 }, { "epoch": 0.0809477272185541, "grad_norm": 1.6956685781478882, "learning_rate": 1.7861855191559682e-05, "loss": 0.7385, "step": 1698 }, { "epoch": 0.0809953996138536, "grad_norm": 3.47800612449646, "learning_rate": 1.785942713501266e-05, "loss": 0.7017, "step": 1699 }, { "epoch": 0.0810430720091531, "grad_norm": 1.4851592779159546, "learning_rate": 1.785699786583696e-05, "loss": 1.1012, "step": 1700 }, { "epoch": 0.0810907444044526, "grad_norm": 2.6097171306610107, "learning_rate": 1.7854567384407407e-05, "loss": 1.1458, "step": 1701 }, { "epoch": 0.0811384167997521, "grad_norm": 2.019005537033081, "learning_rate": 1.785213569109899e-05, "loss": 0.6346, "step": 1702 }, { "epoch": 0.0811860891950516, "grad_norm": 1.9929944276809692, "learning_rate": 1.7849702786286897e-05, "loss": 0.6616, "step": 1703 }, { "epoch": 0.0812337615903511, "grad_norm": 3.2783918380737305, "learning_rate": 1.78472686703465e-05, "loss": 1.0011, "step": 1704 }, { "epoch": 0.0812814339856506, "grad_norm": 3.1229450702667236, "learning_rate": 1.784483334365336e-05, "loss": 0.7122, "step": 1705 }, { "epoch": 0.08132910638095012, "grad_norm": 1.5935554504394531, "learning_rate": 1.784239680658322e-05, "loss": 0.7817, "step": 1706 }, { "epoch": 0.08137677877624962, "grad_norm": 2.66387677192688, "learning_rate": 1.7839959059512016e-05, "loss": 0.5153, "step": 1707 }, { "epoch": 0.08142445117154912, "grad_norm": 1.887986660003662, "learning_rate": 1.7837520102815862e-05, "loss": 1.0532, "step": 1708 }, { "epoch": 0.08147212356684862, "grad_norm": 0.958836019039154, "learning_rate": 1.7835079936871068e-05, "loss": 0.4794, "step": 1709 }, { "epoch": 0.08151979596214812, "grad_norm": 4.073960781097412, "learning_rate": 1.7832638562054126e-05, "loss": 0.8522, "step": 1710 }, { "epoch": 0.08156746835744762, "grad_norm": 1.1035635471343994, "learning_rate": 1.7830195978741716e-05, "loss": 0.5152, "step": 1711 }, { "epoch": 0.08161514075274712, "grad_norm": 1.3051600456237793, "learning_rate": 1.7827752187310702e-05, "loss": 1.0563, "step": 1712 }, { "epoch": 0.08166281314804662, "grad_norm": 1.4117364883422852, "learning_rate": 1.7825307188138133e-05, "loss": 0.9645, "step": 1713 }, { "epoch": 0.08171048554334613, "grad_norm": 1.8378266096115112, "learning_rate": 1.782286098160125e-05, "loss": 1.0543, "step": 1714 }, { "epoch": 0.08175815793864563, "grad_norm": 2.191606283187866, "learning_rate": 1.7820413568077478e-05, "loss": 1.0085, "step": 1715 }, { "epoch": 0.08180583033394513, "grad_norm": 1.9256677627563477, "learning_rate": 1.7817964947944427e-05, "loss": 0.6892, "step": 1716 }, { "epoch": 0.08185350272924463, "grad_norm": 1.6263107061386108, "learning_rate": 1.7815515121579897e-05, "loss": 0.9827, "step": 1717 }, { "epoch": 0.08190117512454413, "grad_norm": 1.3421369791030884, "learning_rate": 1.7813064089361866e-05, "loss": 0.7484, "step": 1718 }, { "epoch": 0.08194884751984363, "grad_norm": 1.5854623317718506, "learning_rate": 1.7810611851668503e-05, "loss": 0.8738, "step": 1719 }, { "epoch": 0.08199651991514313, "grad_norm": 2.45235013961792, "learning_rate": 1.7808158408878167e-05, "loss": 1.0454, "step": 1720 }, { "epoch": 0.08204419231044263, "grad_norm": 1.7934895753860474, "learning_rate": 1.7805703761369398e-05, "loss": 0.7913, "step": 1721 }, { "epoch": 0.08209186470574215, "grad_norm": 1.1314424276351929, "learning_rate": 1.780324790952092e-05, "loss": 0.4605, "step": 1722 }, { "epoch": 0.08213953710104165, "grad_norm": 1.9435144662857056, "learning_rate": 1.7800790853711646e-05, "loss": 0.4018, "step": 1723 }, { "epoch": 0.08218720949634115, "grad_norm": 1.6724966764450073, "learning_rate": 1.779833259432068e-05, "loss": 0.6563, "step": 1724 }, { "epoch": 0.08223488189164065, "grad_norm": 3.376499891281128, "learning_rate": 1.77958731317273e-05, "loss": 0.6798, "step": 1725 }, { "epoch": 0.08228255428694015, "grad_norm": 2.2213170528411865, "learning_rate": 1.7793412466310974e-05, "loss": 0.7773, "step": 1726 }, { "epoch": 0.08233022668223965, "grad_norm": 1.3578178882598877, "learning_rate": 1.779095059845137e-05, "loss": 0.7446, "step": 1727 }, { "epoch": 0.08237789907753915, "grad_norm": 2.146618366241455, "learning_rate": 1.7788487528528314e-05, "loss": 0.358, "step": 1728 }, { "epoch": 0.08242557147283865, "grad_norm": 2.18349552154541, "learning_rate": 1.7786023256921835e-05, "loss": 0.995, "step": 1729 }, { "epoch": 0.08247324386813816, "grad_norm": 2.4205596446990967, "learning_rate": 1.7783557784012154e-05, "loss": 0.9726, "step": 1730 }, { "epoch": 0.08252091626343766, "grad_norm": 1.362740397453308, "learning_rate": 1.7781091110179657e-05, "loss": 0.8428, "step": 1731 }, { "epoch": 0.08256858865873716, "grad_norm": 2.353976011276245, "learning_rate": 1.7778623235804935e-05, "loss": 1.2552, "step": 1732 }, { "epoch": 0.08261626105403666, "grad_norm": 3.3170006275177, "learning_rate": 1.7776154161268753e-05, "loss": 0.545, "step": 1733 }, { "epoch": 0.08266393344933616, "grad_norm": 3.282442092895508, "learning_rate": 1.777368388695206e-05, "loss": 0.4365, "step": 1734 }, { "epoch": 0.08271160584463566, "grad_norm": 1.1182255744934082, "learning_rate": 1.7771212413235997e-05, "loss": 0.673, "step": 1735 }, { "epoch": 0.08275927823993516, "grad_norm": 1.4665732383728027, "learning_rate": 1.776873974050189e-05, "loss": 0.8061, "step": 1736 }, { "epoch": 0.08280695063523466, "grad_norm": 1.7098922729492188, "learning_rate": 1.776626586913124e-05, "loss": 1.0896, "step": 1737 }, { "epoch": 0.08285462303053417, "grad_norm": 1.328294277191162, "learning_rate": 1.7763790799505746e-05, "loss": 0.6194, "step": 1738 }, { "epoch": 0.08290229542583367, "grad_norm": 1.435356855392456, "learning_rate": 1.776131453200728e-05, "loss": 0.9297, "step": 1739 }, { "epoch": 0.08294996782113317, "grad_norm": 1.5327321290969849, "learning_rate": 1.775883706701791e-05, "loss": 0.7011, "step": 1740 }, { "epoch": 0.08299764021643267, "grad_norm": 2.847810745239258, "learning_rate": 1.775635840491988e-05, "loss": 0.5931, "step": 1741 }, { "epoch": 0.08304531261173217, "grad_norm": 1.4638142585754395, "learning_rate": 1.7753878546095625e-05, "loss": 0.9, "step": 1742 }, { "epoch": 0.08309298500703168, "grad_norm": 1.4644008874893188, "learning_rate": 1.7751397490927756e-05, "loss": 0.6017, "step": 1743 }, { "epoch": 0.08314065740233118, "grad_norm": 1.6079331636428833, "learning_rate": 1.7748915239799083e-05, "loss": 0.6664, "step": 1744 }, { "epoch": 0.08318832979763068, "grad_norm": 1.2465012073516846, "learning_rate": 1.7746431793092583e-05, "loss": 0.6426, "step": 1745 }, { "epoch": 0.08323600219293019, "grad_norm": 1.6928510665893555, "learning_rate": 1.774394715119143e-05, "loss": 0.6715, "step": 1746 }, { "epoch": 0.08328367458822969, "grad_norm": 1.229422926902771, "learning_rate": 1.7741461314478986e-05, "loss": 0.5694, "step": 1747 }, { "epoch": 0.08333134698352919, "grad_norm": 2.1797072887420654, "learning_rate": 1.773897428333878e-05, "loss": 0.8181, "step": 1748 }, { "epoch": 0.08337901937882869, "grad_norm": 1.3057845830917358, "learning_rate": 1.773648605815453e-05, "loss": 1.0441, "step": 1749 }, { "epoch": 0.08342669177412819, "grad_norm": 2.776489496231079, "learning_rate": 1.7733996639310157e-05, "loss": 0.5581, "step": 1750 }, { "epoch": 0.08347436416942769, "grad_norm": 1.3012232780456543, "learning_rate": 1.773150602718975e-05, "loss": 0.6365, "step": 1751 }, { "epoch": 0.08352203656472719, "grad_norm": 1.5112508535385132, "learning_rate": 1.772901422217758e-05, "loss": 0.3585, "step": 1752 }, { "epoch": 0.0835697089600267, "grad_norm": 1.2008789777755737, "learning_rate": 1.7726521224658106e-05, "loss": 0.4583, "step": 1753 }, { "epoch": 0.0836173813553262, "grad_norm": 1.3291388750076294, "learning_rate": 1.772402703501598e-05, "loss": 0.7314, "step": 1754 }, { "epoch": 0.0836650537506257, "grad_norm": 1.5987262725830078, "learning_rate": 1.772153165363602e-05, "loss": 0.6169, "step": 1755 }, { "epoch": 0.0837127261459252, "grad_norm": 1.0725740194320679, "learning_rate": 1.771903508090324e-05, "loss": 0.3322, "step": 1756 }, { "epoch": 0.0837603985412247, "grad_norm": 2.2698404788970947, "learning_rate": 1.7716537317202848e-05, "loss": 0.6046, "step": 1757 }, { "epoch": 0.0838080709365242, "grad_norm": 1.8276516199111938, "learning_rate": 1.7714038362920205e-05, "loss": 0.6068, "step": 1758 }, { "epoch": 0.0838557433318237, "grad_norm": 1.4723917245864868, "learning_rate": 1.771153821844088e-05, "loss": 0.8854, "step": 1759 }, { "epoch": 0.0839034157271232, "grad_norm": 1.42435622215271, "learning_rate": 1.7709036884150627e-05, "loss": 0.7149, "step": 1760 }, { "epoch": 0.08395108812242272, "grad_norm": 2.8415591716766357, "learning_rate": 1.770653436043537e-05, "loss": 1.4141, "step": 1761 }, { "epoch": 0.08399876051772222, "grad_norm": 1.649634599685669, "learning_rate": 1.770403064768122e-05, "loss": 0.7977, "step": 1762 }, { "epoch": 0.08404643291302172, "grad_norm": 1.6270283460617065, "learning_rate": 1.770152574627448e-05, "loss": 0.715, "step": 1763 }, { "epoch": 0.08409410530832122, "grad_norm": 1.3826130628585815, "learning_rate": 1.7699019656601624e-05, "loss": 0.5197, "step": 1764 }, { "epoch": 0.08414177770362072, "grad_norm": 3.902764320373535, "learning_rate": 1.7696512379049323e-05, "loss": 0.5795, "step": 1765 }, { "epoch": 0.08418945009892022, "grad_norm": 3.4180245399475098, "learning_rate": 1.7694003914004422e-05, "loss": 0.3168, "step": 1766 }, { "epoch": 0.08423712249421972, "grad_norm": 2.092353582382202, "learning_rate": 1.769149426185395e-05, "loss": 0.6151, "step": 1767 }, { "epoch": 0.08428479488951922, "grad_norm": 1.543492317199707, "learning_rate": 1.7688983422985116e-05, "loss": 0.7875, "step": 1768 }, { "epoch": 0.08433246728481873, "grad_norm": 4.8802876472473145, "learning_rate": 1.7686471397785322e-05, "loss": 0.6484, "step": 1769 }, { "epoch": 0.08438013968011823, "grad_norm": 4.532622337341309, "learning_rate": 1.768395818664215e-05, "loss": 0.5281, "step": 1770 }, { "epoch": 0.08442781207541773, "grad_norm": 1.6347931623458862, "learning_rate": 1.7681443789943354e-05, "loss": 0.9002, "step": 1771 }, { "epoch": 0.08447548447071723, "grad_norm": 3.5466668605804443, "learning_rate": 1.767892820807689e-05, "loss": 0.6705, "step": 1772 }, { "epoch": 0.08452315686601673, "grad_norm": 3.5896871089935303, "learning_rate": 1.7676411441430877e-05, "loss": 0.4944, "step": 1773 }, { "epoch": 0.08457082926131623, "grad_norm": 1.654937744140625, "learning_rate": 1.7673893490393636e-05, "loss": 0.7959, "step": 1774 }, { "epoch": 0.08461850165661573, "grad_norm": 1.2499451637268066, "learning_rate": 1.767137435535365e-05, "loss": 0.2765, "step": 1775 }, { "epoch": 0.08466617405191523, "grad_norm": 1.8417538404464722, "learning_rate": 1.76688540366996e-05, "loss": 0.8872, "step": 1776 }, { "epoch": 0.08471384644721475, "grad_norm": 1.2179772853851318, "learning_rate": 1.766633253482035e-05, "loss": 0.6193, "step": 1777 }, { "epoch": 0.08476151884251425, "grad_norm": 1.9007271528244019, "learning_rate": 1.7663809850104936e-05, "loss": 0.7423, "step": 1778 }, { "epoch": 0.08480919123781375, "grad_norm": 2.493910074234009, "learning_rate": 1.7661285982942588e-05, "loss": 0.6961, "step": 1779 }, { "epoch": 0.08485686363311325, "grad_norm": 1.5310133695602417, "learning_rate": 1.7658760933722702e-05, "loss": 0.3952, "step": 1780 }, { "epoch": 0.08490453602841275, "grad_norm": 1.3763717412948608, "learning_rate": 1.7656234702834877e-05, "loss": 0.6785, "step": 1781 }, { "epoch": 0.08495220842371225, "grad_norm": 1.724355697631836, "learning_rate": 1.7653707290668882e-05, "loss": 1.0242, "step": 1782 }, { "epoch": 0.08499988081901175, "grad_norm": 1.8841114044189453, "learning_rate": 1.765117869761467e-05, "loss": 0.626, "step": 1783 }, { "epoch": 0.08504755321431125, "grad_norm": 1.0602959394454956, "learning_rate": 1.7648648924062378e-05, "loss": 0.5121, "step": 1784 }, { "epoch": 0.08509522560961076, "grad_norm": 3.4438045024871826, "learning_rate": 1.764611797040232e-05, "loss": 1.0139, "step": 1785 }, { "epoch": 0.08514289800491026, "grad_norm": 1.3330975770950317, "learning_rate": 1.7643585837025e-05, "loss": 0.6389, "step": 1786 }, { "epoch": 0.08519057040020976, "grad_norm": 1.560445785522461, "learning_rate": 1.76410525243211e-05, "loss": 0.8744, "step": 1787 }, { "epoch": 0.08523824279550926, "grad_norm": 19.96894645690918, "learning_rate": 1.7638518032681482e-05, "loss": 0.8698, "step": 1788 }, { "epoch": 0.08528591519080876, "grad_norm": 1.475574016571045, "learning_rate": 1.7635982362497195e-05, "loss": 0.8132, "step": 1789 }, { "epoch": 0.08533358758610826, "grad_norm": 1.1431615352630615, "learning_rate": 1.763344551415946e-05, "loss": 0.872, "step": 1790 }, { "epoch": 0.08538125998140776, "grad_norm": 2.552554130554199, "learning_rate": 1.76309074880597e-05, "loss": 0.4446, "step": 1791 }, { "epoch": 0.08542893237670726, "grad_norm": 3.6265108585357666, "learning_rate": 1.762836828458949e-05, "loss": 1.2209, "step": 1792 }, { "epoch": 0.08547660477200678, "grad_norm": 1.0884145498275757, "learning_rate": 1.762582790414061e-05, "loss": 0.639, "step": 1793 }, { "epoch": 0.08552427716730628, "grad_norm": 1.180105209350586, "learning_rate": 1.762328634710502e-05, "loss": 0.5889, "step": 1794 }, { "epoch": 0.08557194956260578, "grad_norm": 1.4643186330795288, "learning_rate": 1.762074361387485e-05, "loss": 0.5076, "step": 1795 }, { "epoch": 0.08561962195790528, "grad_norm": 2.3704681396484375, "learning_rate": 1.761819970484242e-05, "loss": 0.9002, "step": 1796 }, { "epoch": 0.08566729435320478, "grad_norm": 1.3049284219741821, "learning_rate": 1.7615654620400225e-05, "loss": 0.5451, "step": 1797 }, { "epoch": 0.08571496674850428, "grad_norm": 2.093545436859131, "learning_rate": 1.761310836094095e-05, "loss": 0.6278, "step": 1798 }, { "epoch": 0.08576263914380378, "grad_norm": 1.7475173473358154, "learning_rate": 1.7610560926857455e-05, "loss": 0.7074, "step": 1799 }, { "epoch": 0.08581031153910328, "grad_norm": 4.119304180145264, "learning_rate": 1.760801231854278e-05, "loss": 0.5421, "step": 1800 }, { "epoch": 0.08585798393440279, "grad_norm": 4.078807353973389, "learning_rate": 1.7605462536390155e-05, "loss": 0.3838, "step": 1801 }, { "epoch": 0.08590565632970229, "grad_norm": 1.399079442024231, "learning_rate": 1.760291158079298e-05, "loss": 0.8949, "step": 1802 }, { "epoch": 0.08595332872500179, "grad_norm": 1.9099987745285034, "learning_rate": 1.7600359452144845e-05, "loss": 0.6147, "step": 1803 }, { "epoch": 0.08600100112030129, "grad_norm": 2.8720459938049316, "learning_rate": 1.759780615083951e-05, "loss": 0.7017, "step": 1804 }, { "epoch": 0.08604867351560079, "grad_norm": 1.4967843294143677, "learning_rate": 1.7595251677270933e-05, "loss": 0.5165, "step": 1805 }, { "epoch": 0.08609634591090029, "grad_norm": 1.4108738899230957, "learning_rate": 1.7592696031833237e-05, "loss": 0.8112, "step": 1806 }, { "epoch": 0.08614401830619979, "grad_norm": 1.3807586431503296, "learning_rate": 1.7590139214920732e-05, "loss": 0.7269, "step": 1807 }, { "epoch": 0.08619169070149929, "grad_norm": 2.1828525066375732, "learning_rate": 1.758758122692791e-05, "loss": 0.7573, "step": 1808 }, { "epoch": 0.0862393630967988, "grad_norm": 1.4696754217147827, "learning_rate": 1.758502206824944e-05, "loss": 0.9103, "step": 1809 }, { "epoch": 0.0862870354920983, "grad_norm": 11.551321983337402, "learning_rate": 1.7582461739280178e-05, "loss": 1.0546, "step": 1810 }, { "epoch": 0.0863347078873978, "grad_norm": 2.826413631439209, "learning_rate": 1.7579900240415155e-05, "loss": 0.8977, "step": 1811 }, { "epoch": 0.0863823802826973, "grad_norm": 2.0058205127716064, "learning_rate": 1.757733757204958e-05, "loss": 0.8373, "step": 1812 }, { "epoch": 0.0864300526779968, "grad_norm": 1.0884069204330444, "learning_rate": 1.757477373457885e-05, "loss": 0.2408, "step": 1813 }, { "epoch": 0.0864777250732963, "grad_norm": 1.4263980388641357, "learning_rate": 1.757220872839854e-05, "loss": 0.6121, "step": 1814 }, { "epoch": 0.0865253974685958, "grad_norm": 1.186133861541748, "learning_rate": 1.75696425539044e-05, "loss": 0.781, "step": 1815 }, { "epoch": 0.0865730698638953, "grad_norm": 1.87509024143219, "learning_rate": 1.7567075211492365e-05, "loss": 0.3953, "step": 1816 }, { "epoch": 0.08662074225919482, "grad_norm": 2.214392900466919, "learning_rate": 1.756450670155855e-05, "loss": 0.7804, "step": 1817 }, { "epoch": 0.08666841465449432, "grad_norm": 1.5667976140975952, "learning_rate": 1.7561937024499252e-05, "loss": 0.8603, "step": 1818 }, { "epoch": 0.08671608704979382, "grad_norm": 1.3027293682098389, "learning_rate": 1.7559366180710942e-05, "loss": 0.8185, "step": 1819 }, { "epoch": 0.08676375944509332, "grad_norm": 3.0646231174468994, "learning_rate": 1.7556794170590282e-05, "loss": 0.8354, "step": 1820 }, { "epoch": 0.08681143184039282, "grad_norm": 1.5180717706680298, "learning_rate": 1.7554220994534096e-05, "loss": 0.8525, "step": 1821 }, { "epoch": 0.08685910423569232, "grad_norm": 2.257699966430664, "learning_rate": 1.7551646652939405e-05, "loss": 0.9406, "step": 1822 }, { "epoch": 0.08690677663099182, "grad_norm": 3.307598829269409, "learning_rate": 1.7549071146203404e-05, "loss": 1.1554, "step": 1823 }, { "epoch": 0.08695444902629132, "grad_norm": 1.2125195264816284, "learning_rate": 1.7546494474723467e-05, "loss": 0.8746, "step": 1824 }, { "epoch": 0.08700212142159083, "grad_norm": 1.5817060470581055, "learning_rate": 1.7543916638897142e-05, "loss": 0.7744, "step": 1825 }, { "epoch": 0.08704979381689033, "grad_norm": 1.317553162574768, "learning_rate": 1.754133763912217e-05, "loss": 0.7595, "step": 1826 }, { "epoch": 0.08709746621218983, "grad_norm": 2.101595163345337, "learning_rate": 1.753875747579646e-05, "loss": 1.0099, "step": 1827 }, { "epoch": 0.08714513860748933, "grad_norm": 1.7978442907333374, "learning_rate": 1.7536176149318106e-05, "loss": 0.634, "step": 1828 }, { "epoch": 0.08719281100278883, "grad_norm": 3.205493450164795, "learning_rate": 1.7533593660085378e-05, "loss": 1.4576, "step": 1829 }, { "epoch": 0.08724048339808833, "grad_norm": 1.7754584550857544, "learning_rate": 1.7531010008496733e-05, "loss": 0.9474, "step": 1830 }, { "epoch": 0.08728815579338783, "grad_norm": 2.764887809753418, "learning_rate": 1.7528425194950794e-05, "loss": 0.7215, "step": 1831 }, { "epoch": 0.08733582818868733, "grad_norm": 1.807370662689209, "learning_rate": 1.752583921984638e-05, "loss": 0.7225, "step": 1832 }, { "epoch": 0.08738350058398685, "grad_norm": 2.3763256072998047, "learning_rate": 1.752325208358247e-05, "loss": 1.0307, "step": 1833 }, { "epoch": 0.08743117297928635, "grad_norm": 8.486246109008789, "learning_rate": 1.7520663786558243e-05, "loss": 0.0538, "step": 1834 }, { "epoch": 0.08747884537458585, "grad_norm": 3.6323912143707275, "learning_rate": 1.751807432917304e-05, "loss": 0.6765, "step": 1835 }, { "epoch": 0.08752651776988535, "grad_norm": 2.1206586360931396, "learning_rate": 1.7515483711826386e-05, "loss": 1.0793, "step": 1836 }, { "epoch": 0.08757419016518485, "grad_norm": 2.404599666595459, "learning_rate": 1.7512891934917994e-05, "loss": 0.7878, "step": 1837 }, { "epoch": 0.08762186256048435, "grad_norm": 3.5932154655456543, "learning_rate": 1.7510298998847742e-05, "loss": 0.4684, "step": 1838 }, { "epoch": 0.08766953495578385, "grad_norm": 1.0536655187606812, "learning_rate": 1.7507704904015696e-05, "loss": 0.5574, "step": 1839 }, { "epoch": 0.08771720735108335, "grad_norm": 2.6022403240203857, "learning_rate": 1.7505109650822096e-05, "loss": 0.7589, "step": 1840 }, { "epoch": 0.08776487974638286, "grad_norm": 1.5821363925933838, "learning_rate": 1.7502513239667365e-05, "loss": 0.9011, "step": 1841 }, { "epoch": 0.08781255214168236, "grad_norm": 1.4837303161621094, "learning_rate": 1.7499915670952107e-05, "loss": 0.7292, "step": 1842 }, { "epoch": 0.08786022453698186, "grad_norm": 1.6185940504074097, "learning_rate": 1.749731694507709e-05, "loss": 0.5632, "step": 1843 }, { "epoch": 0.08790789693228136, "grad_norm": 1.1037304401397705, "learning_rate": 1.749471706244328e-05, "loss": 0.4907, "step": 1844 }, { "epoch": 0.08795556932758086, "grad_norm": 1.746039867401123, "learning_rate": 1.7492116023451803e-05, "loss": 0.7608, "step": 1845 }, { "epoch": 0.08800324172288036, "grad_norm": 2.322873592376709, "learning_rate": 1.748951382850398e-05, "loss": 0.5382, "step": 1846 }, { "epoch": 0.08805091411817986, "grad_norm": 0.8395581841468811, "learning_rate": 1.7486910478001303e-05, "loss": 0.1646, "step": 1847 }, { "epoch": 0.08809858651347938, "grad_norm": 6.367923259735107, "learning_rate": 1.7484305972345436e-05, "loss": 0.7985, "step": 1848 }, { "epoch": 0.08814625890877888, "grad_norm": 2.696329116821289, "learning_rate": 1.748170031193823e-05, "loss": 0.3606, "step": 1849 }, { "epoch": 0.08819393130407838, "grad_norm": 1.4912505149841309, "learning_rate": 1.7479093497181714e-05, "loss": 0.723, "step": 1850 }, { "epoch": 0.08824160369937788, "grad_norm": 1.5808112621307373, "learning_rate": 1.7476485528478093e-05, "loss": 0.6187, "step": 1851 }, { "epoch": 0.08828927609467738, "grad_norm": 1.9619344472885132, "learning_rate": 1.7473876406229744e-05, "loss": 0.7366, "step": 1852 }, { "epoch": 0.08833694848997688, "grad_norm": 1.5421297550201416, "learning_rate": 1.7471266130839235e-05, "loss": 0.9173, "step": 1853 }, { "epoch": 0.08838462088527638, "grad_norm": 1.360328197479248, "learning_rate": 1.74686547027093e-05, "loss": 0.9051, "step": 1854 }, { "epoch": 0.08843229328057588, "grad_norm": 2.355334758758545, "learning_rate": 1.7466042122242853e-05, "loss": 0.7219, "step": 1855 }, { "epoch": 0.08847996567587539, "grad_norm": 1.2616817951202393, "learning_rate": 1.7463428389842997e-05, "loss": 0.6258, "step": 1856 }, { "epoch": 0.08852763807117489, "grad_norm": 1.2362185716629028, "learning_rate": 1.7460813505912996e-05, "loss": 0.5785, "step": 1857 }, { "epoch": 0.08857531046647439, "grad_norm": 5.145622253417969, "learning_rate": 1.7458197470856305e-05, "loss": 0.8503, "step": 1858 }, { "epoch": 0.08862298286177389, "grad_norm": 3.985865354537964, "learning_rate": 1.7455580285076546e-05, "loss": 0.4183, "step": 1859 }, { "epoch": 0.08867065525707339, "grad_norm": 2.136936664581299, "learning_rate": 1.745296194897753e-05, "loss": 1.044, "step": 1860 }, { "epoch": 0.08871832765237289, "grad_norm": 2.610098123550415, "learning_rate": 1.7450342462963235e-05, "loss": 0.8033, "step": 1861 }, { "epoch": 0.08876600004767239, "grad_norm": 1.7275424003601074, "learning_rate": 1.744772182743782e-05, "loss": 0.9421, "step": 1862 }, { "epoch": 0.08881367244297189, "grad_norm": 3.1618235111236572, "learning_rate": 1.7445100042805627e-05, "loss": 1.017, "step": 1863 }, { "epoch": 0.0888613448382714, "grad_norm": 2.4394078254699707, "learning_rate": 1.744247710947116e-05, "loss": 0.5957, "step": 1864 }, { "epoch": 0.0889090172335709, "grad_norm": 1.6809029579162598, "learning_rate": 1.7439853027839124e-05, "loss": 0.7981, "step": 1865 }, { "epoch": 0.0889566896288704, "grad_norm": 3.8705766201019287, "learning_rate": 1.743722779831438e-05, "loss": 0.7981, "step": 1866 }, { "epoch": 0.0890043620241699, "grad_norm": 2.174025535583496, "learning_rate": 1.7434601421301974e-05, "loss": 0.6841, "step": 1867 }, { "epoch": 0.0890520344194694, "grad_norm": 2.056065797805786, "learning_rate": 1.743197389720713e-05, "loss": 1.1116, "step": 1868 }, { "epoch": 0.0890997068147689, "grad_norm": 3.395608425140381, "learning_rate": 1.7429345226435253e-05, "loss": 0.7421, "step": 1869 }, { "epoch": 0.0891473792100684, "grad_norm": 1.981473445892334, "learning_rate": 1.742671540939191e-05, "loss": 0.7867, "step": 1870 }, { "epoch": 0.0891950516053679, "grad_norm": 3.6642842292785645, "learning_rate": 1.742408444648286e-05, "loss": 0.7023, "step": 1871 }, { "epoch": 0.08924272400066742, "grad_norm": 1.628873348236084, "learning_rate": 1.7421452338114036e-05, "loss": 0.8923, "step": 1872 }, { "epoch": 0.08929039639596692, "grad_norm": 1.214039921760559, "learning_rate": 1.741881908469154e-05, "loss": 0.5864, "step": 1873 }, { "epoch": 0.08933806879126642, "grad_norm": 1.7834521532058716, "learning_rate": 1.741618468662166e-05, "loss": 1.0572, "step": 1874 }, { "epoch": 0.08938574118656592, "grad_norm": 15.063992500305176, "learning_rate": 1.7413549144310856e-05, "loss": 0.7541, "step": 1875 }, { "epoch": 0.08943341358186542, "grad_norm": 1.4960886240005493, "learning_rate": 1.741091245816576e-05, "loss": 0.6211, "step": 1876 }, { "epoch": 0.08948108597716492, "grad_norm": 3.792848825454712, "learning_rate": 1.7408274628593192e-05, "loss": 0.6678, "step": 1877 }, { "epoch": 0.08952875837246442, "grad_norm": 1.362256646156311, "learning_rate": 1.740563565600014e-05, "loss": 0.8872, "step": 1878 }, { "epoch": 0.08957643076776392, "grad_norm": 1.2339116334915161, "learning_rate": 1.7402995540793764e-05, "loss": 0.6797, "step": 1879 }, { "epoch": 0.08962410316306343, "grad_norm": 1.098677158355713, "learning_rate": 1.7400354283381416e-05, "loss": 0.6452, "step": 1880 }, { "epoch": 0.08967177555836293, "grad_norm": 2.208592653274536, "learning_rate": 1.7397711884170613e-05, "loss": 1.2245, "step": 1881 }, { "epoch": 0.08971944795366243, "grad_norm": 1.1734225749969482, "learning_rate": 1.7395068343569047e-05, "loss": 0.7495, "step": 1882 }, { "epoch": 0.08976712034896193, "grad_norm": 1.860902190208435, "learning_rate": 1.739242366198459e-05, "loss": 0.6499, "step": 1883 }, { "epoch": 0.08981479274426143, "grad_norm": 4.4588117599487305, "learning_rate": 1.7389777839825284e-05, "loss": 0.5647, "step": 1884 }, { "epoch": 0.08986246513956093, "grad_norm": 2.0444424152374268, "learning_rate": 1.7387130877499364e-05, "loss": 1.0156, "step": 1885 }, { "epoch": 0.08991013753486043, "grad_norm": 2.0839266777038574, "learning_rate": 1.738448277541522e-05, "loss": 1.0359, "step": 1886 }, { "epoch": 0.08995780993015993, "grad_norm": 3.001190423965454, "learning_rate": 1.738183353398143e-05, "loss": 0.6405, "step": 1887 }, { "epoch": 0.09000548232545945, "grad_norm": 1.8037917613983154, "learning_rate": 1.7379183153606743e-05, "loss": 0.6912, "step": 1888 }, { "epoch": 0.09005315472075895, "grad_norm": 2.767890453338623, "learning_rate": 1.7376531634700087e-05, "loss": 0.5733, "step": 1889 }, { "epoch": 0.09010082711605845, "grad_norm": 1.2392313480377197, "learning_rate": 1.737387897767056e-05, "loss": 0.6807, "step": 1890 }, { "epoch": 0.09014849951135795, "grad_norm": 1.0875239372253418, "learning_rate": 1.7371225182927447e-05, "loss": 0.5452, "step": 1891 }, { "epoch": 0.09019617190665745, "grad_norm": 1.3470181226730347, "learning_rate": 1.7368570250880198e-05, "loss": 0.6363, "step": 1892 }, { "epoch": 0.09024384430195695, "grad_norm": 1.1659319400787354, "learning_rate": 1.736591418193844e-05, "loss": 0.6894, "step": 1893 }, { "epoch": 0.09029151669725645, "grad_norm": 3.127140760421753, "learning_rate": 1.7363256976511972e-05, "loss": 0.6474, "step": 1894 }, { "epoch": 0.09033918909255595, "grad_norm": 1.647687315940857, "learning_rate": 1.7360598635010787e-05, "loss": 0.4843, "step": 1895 }, { "epoch": 0.09038686148785546, "grad_norm": 5.694189548492432, "learning_rate": 1.735793915784503e-05, "loss": 0.9134, "step": 1896 }, { "epoch": 0.09043453388315496, "grad_norm": 1.61495840549469, "learning_rate": 1.7355278545425033e-05, "loss": 0.3499, "step": 1897 }, { "epoch": 0.09048220627845446, "grad_norm": 1.518541932106018, "learning_rate": 1.73526167981613e-05, "loss": 0.837, "step": 1898 }, { "epoch": 0.09052987867375396, "grad_norm": 1.9157156944274902, "learning_rate": 1.7349953916464512e-05, "loss": 0.7504, "step": 1899 }, { "epoch": 0.09057755106905346, "grad_norm": 1.2907934188842773, "learning_rate": 1.7347289900745525e-05, "loss": 0.3012, "step": 1900 }, { "epoch": 0.09062522346435296, "grad_norm": 1.853825569152832, "learning_rate": 1.734462475141537e-05, "loss": 0.7005, "step": 1901 }, { "epoch": 0.09067289585965246, "grad_norm": 2.314300537109375, "learning_rate": 1.734195846888525e-05, "loss": 0.4777, "step": 1902 }, { "epoch": 0.09072056825495196, "grad_norm": 1.8735827207565308, "learning_rate": 1.7339291053566544e-05, "loss": 0.4288, "step": 1903 }, { "epoch": 0.09076824065025148, "grad_norm": 2.166480541229248, "learning_rate": 1.7336622505870813e-05, "loss": 0.8705, "step": 1904 }, { "epoch": 0.09081591304555098, "grad_norm": 1.713133692741394, "learning_rate": 1.733395282620978e-05, "loss": 0.516, "step": 1905 }, { "epoch": 0.09086358544085048, "grad_norm": 4.2016472816467285, "learning_rate": 1.7331282014995348e-05, "loss": 0.8166, "step": 1906 }, { "epoch": 0.09091125783614998, "grad_norm": 4.468966484069824, "learning_rate": 1.7328610072639604e-05, "loss": 0.0342, "step": 1907 }, { "epoch": 0.09095893023144948, "grad_norm": 1.7308324575424194, "learning_rate": 1.732593699955479e-05, "loss": 0.6723, "step": 1908 }, { "epoch": 0.09100660262674898, "grad_norm": 1.9870880842208862, "learning_rate": 1.7323262796153342e-05, "loss": 0.7298, "step": 1909 }, { "epoch": 0.09105427502204848, "grad_norm": 3.1176748275756836, "learning_rate": 1.7320587462847858e-05, "loss": 0.8399, "step": 1910 }, { "epoch": 0.09110194741734798, "grad_norm": 1.5841773748397827, "learning_rate": 1.7317911000051123e-05, "loss": 0.7456, "step": 1911 }, { "epoch": 0.09114961981264749, "grad_norm": 2.189419984817505, "learning_rate": 1.7315233408176073e-05, "loss": 1.1764, "step": 1912 }, { "epoch": 0.09119729220794699, "grad_norm": 4.199376583099365, "learning_rate": 1.7312554687635843e-05, "loss": 0.9191, "step": 1913 }, { "epoch": 0.09124496460324649, "grad_norm": 2.043581485748291, "learning_rate": 1.730987483884373e-05, "loss": 0.6627, "step": 1914 }, { "epoch": 0.09129263699854599, "grad_norm": 1.6215832233428955, "learning_rate": 1.7307193862213204e-05, "loss": 0.9885, "step": 1915 }, { "epoch": 0.09134030939384549, "grad_norm": 1.460170030593872, "learning_rate": 1.7304511758157917e-05, "loss": 0.6846, "step": 1916 }, { "epoch": 0.09138798178914499, "grad_norm": 2.418365955352783, "learning_rate": 1.7301828527091687e-05, "loss": 1.0374, "step": 1917 }, { "epoch": 0.09143565418444449, "grad_norm": 1.3839024305343628, "learning_rate": 1.7299144169428513e-05, "loss": 0.6728, "step": 1918 }, { "epoch": 0.09148332657974399, "grad_norm": 3.182678699493408, "learning_rate": 1.7296458685582557e-05, "loss": 0.7446, "step": 1919 }, { "epoch": 0.0915309989750435, "grad_norm": 2.252993106842041, "learning_rate": 1.7293772075968163e-05, "loss": 0.972, "step": 1920 }, { "epoch": 0.091578671370343, "grad_norm": 4.426293849945068, "learning_rate": 1.729108434099985e-05, "loss": 0.7634, "step": 1921 }, { "epoch": 0.0916263437656425, "grad_norm": 1.2585158348083496, "learning_rate": 1.7288395481092307e-05, "loss": 1.0344, "step": 1922 }, { "epoch": 0.091674016160942, "grad_norm": 2.2036473751068115, "learning_rate": 1.7285705496660398e-05, "loss": 0.9065, "step": 1923 }, { "epoch": 0.0917216885562415, "grad_norm": 4.132787227630615, "learning_rate": 1.728301438811916e-05, "loss": 0.2619, "step": 1924 }, { "epoch": 0.091769360951541, "grad_norm": 2.7758944034576416, "learning_rate": 1.7280322155883805e-05, "loss": 1.3749, "step": 1925 }, { "epoch": 0.09181703334684051, "grad_norm": 2.305260419845581, "learning_rate": 1.7277628800369708e-05, "loss": 0.7091, "step": 1926 }, { "epoch": 0.09186470574214001, "grad_norm": 1.176971197128296, "learning_rate": 1.7274934321992435e-05, "loss": 0.4675, "step": 1927 }, { "epoch": 0.09191237813743952, "grad_norm": 0.9848259687423706, "learning_rate": 1.7272238721167715e-05, "loss": 0.4155, "step": 1928 }, { "epoch": 0.09196005053273902, "grad_norm": 1.3047305345535278, "learning_rate": 1.7269541998311446e-05, "loss": 0.7043, "step": 1929 }, { "epoch": 0.09200772292803852, "grad_norm": 2.0758633613586426, "learning_rate": 1.726684415383971e-05, "loss": 0.692, "step": 1930 }, { "epoch": 0.09205539532333802, "grad_norm": 1.6682102680206299, "learning_rate": 1.7264145188168755e-05, "loss": 0.6277, "step": 1931 }, { "epoch": 0.09210306771863752, "grad_norm": 2.587801694869995, "learning_rate": 1.7261445101715006e-05, "loss": 0.9891, "step": 1932 }, { "epoch": 0.09215074011393702, "grad_norm": 1.7019946575164795, "learning_rate": 1.7258743894895054e-05, "loss": 1.0188, "step": 1933 }, { "epoch": 0.09219841250923652, "grad_norm": 1.3179090023040771, "learning_rate": 1.7256041568125673e-05, "loss": 0.6405, "step": 1934 }, { "epoch": 0.09224608490453602, "grad_norm": 3.0413458347320557, "learning_rate": 1.7253338121823796e-05, "loss": 0.9162, "step": 1935 }, { "epoch": 0.09229375729983554, "grad_norm": 1.7248799800872803, "learning_rate": 1.7250633556406545e-05, "loss": 1.0734, "step": 1936 }, { "epoch": 0.09234142969513504, "grad_norm": 2.9053101539611816, "learning_rate": 1.72479278722912e-05, "loss": 0.6902, "step": 1937 }, { "epoch": 0.09238910209043454, "grad_norm": 4.004610061645508, "learning_rate": 1.7245221069895227e-05, "loss": 1.6501, "step": 1938 }, { "epoch": 0.09243677448573404, "grad_norm": 3.44722318649292, "learning_rate": 1.7242513149636253e-05, "loss": 0.6126, "step": 1939 }, { "epoch": 0.09248444688103354, "grad_norm": 2.144404411315918, "learning_rate": 1.7239804111932085e-05, "loss": 0.5857, "step": 1940 }, { "epoch": 0.09253211927633304, "grad_norm": 2.8406872749328613, "learning_rate": 1.7237093957200694e-05, "loss": 0.5798, "step": 1941 }, { "epoch": 0.09257979167163254, "grad_norm": 4.07771110534668, "learning_rate": 1.7234382685860236e-05, "loss": 0.4616, "step": 1942 }, { "epoch": 0.09262746406693205, "grad_norm": 2.1129682064056396, "learning_rate": 1.723167029832903e-05, "loss": 0.7992, "step": 1943 }, { "epoch": 0.09267513646223155, "grad_norm": 1.5116686820983887, "learning_rate": 1.7228956795025565e-05, "loss": 0.5673, "step": 1944 }, { "epoch": 0.09272280885753105, "grad_norm": 0.9510489106178284, "learning_rate": 1.7226242176368515e-05, "loss": 0.3807, "step": 1945 }, { "epoch": 0.09277048125283055, "grad_norm": 1.7059881687164307, "learning_rate": 1.7223526442776712e-05, "loss": 0.7045, "step": 1946 }, { "epoch": 0.09281815364813005, "grad_norm": 2.838369607925415, "learning_rate": 1.7220809594669165e-05, "loss": 0.8146, "step": 1947 }, { "epoch": 0.09286582604342955, "grad_norm": 1.5864217281341553, "learning_rate": 1.7218091632465057e-05, "loss": 0.7502, "step": 1948 }, { "epoch": 0.09291349843872905, "grad_norm": 1.5864360332489014, "learning_rate": 1.7215372556583745e-05, "loss": 0.5373, "step": 1949 }, { "epoch": 0.09296117083402855, "grad_norm": 2.4728856086730957, "learning_rate": 1.721265236744475e-05, "loss": 0.4941, "step": 1950 }, { "epoch": 0.09300884322932806, "grad_norm": 1.8207998275756836, "learning_rate": 1.720993106546777e-05, "loss": 0.7659, "step": 1951 }, { "epoch": 0.09305651562462756, "grad_norm": 3.9849860668182373, "learning_rate": 1.7207208651072677e-05, "loss": 0.5498, "step": 1952 }, { "epoch": 0.09310418801992706, "grad_norm": 1.8867381811141968, "learning_rate": 1.7204485124679506e-05, "loss": 0.8471, "step": 1953 }, { "epoch": 0.09315186041522656, "grad_norm": 1.7520923614501953, "learning_rate": 1.720176048670847e-05, "loss": 0.6194, "step": 1954 }, { "epoch": 0.09319953281052606, "grad_norm": 2.5389485359191895, "learning_rate": 1.7199034737579962e-05, "loss": 0.7681, "step": 1955 }, { "epoch": 0.09324720520582556, "grad_norm": 3.7277798652648926, "learning_rate": 1.7196307877714523e-05, "loss": 0.8089, "step": 1956 }, { "epoch": 0.09329487760112506, "grad_norm": 3.313062906265259, "learning_rate": 1.719357990753289e-05, "loss": 1.8292, "step": 1957 }, { "epoch": 0.09334254999642456, "grad_norm": 1.7429782152175903, "learning_rate": 1.7190850827455957e-05, "loss": 0.7497, "step": 1958 }, { "epoch": 0.09339022239172408, "grad_norm": 1.3355697393417358, "learning_rate": 1.7188120637904792e-05, "loss": 0.5364, "step": 1959 }, { "epoch": 0.09343789478702358, "grad_norm": 2.6606807708740234, "learning_rate": 1.7185389339300633e-05, "loss": 0.4391, "step": 1960 }, { "epoch": 0.09348556718232308, "grad_norm": 2.7550387382507324, "learning_rate": 1.7182656932064894e-05, "loss": 0.6177, "step": 1961 }, { "epoch": 0.09353323957762258, "grad_norm": 1.9658668041229248, "learning_rate": 1.7179923416619163e-05, "loss": 0.3187, "step": 1962 }, { "epoch": 0.09358091197292208, "grad_norm": 1.2750409841537476, "learning_rate": 1.7177188793385183e-05, "loss": 0.7569, "step": 1963 }, { "epoch": 0.09362858436822158, "grad_norm": 1.623232364654541, "learning_rate": 1.7174453062784885e-05, "loss": 1.0568, "step": 1964 }, { "epoch": 0.09367625676352108, "grad_norm": 1.665212869644165, "learning_rate": 1.717171622524036e-05, "loss": 0.6435, "step": 1965 }, { "epoch": 0.09372392915882058, "grad_norm": 1.5377470254898071, "learning_rate": 1.716897828117388e-05, "loss": 0.902, "step": 1966 }, { "epoch": 0.0937716015541201, "grad_norm": 1.6416821479797363, "learning_rate": 1.7166239231007872e-05, "loss": 0.6625, "step": 1967 }, { "epoch": 0.0938192739494196, "grad_norm": 1.7977384328842163, "learning_rate": 1.716349907516495e-05, "loss": 0.8591, "step": 1968 }, { "epoch": 0.0938669463447191, "grad_norm": 1.4452898502349854, "learning_rate": 1.7160757814067895e-05, "loss": 0.6399, "step": 1969 }, { "epoch": 0.0939146187400186, "grad_norm": 2.179748296737671, "learning_rate": 1.7158015448139645e-05, "loss": 0.7528, "step": 1970 }, { "epoch": 0.0939622911353181, "grad_norm": 3.3109803199768066, "learning_rate": 1.715527197780333e-05, "loss": 0.6812, "step": 1971 }, { "epoch": 0.0940099635306176, "grad_norm": 2.4174749851226807, "learning_rate": 1.715252740348223e-05, "loss": 0.9528, "step": 1972 }, { "epoch": 0.0940576359259171, "grad_norm": 1.1760822534561157, "learning_rate": 1.714978172559981e-05, "loss": 0.6298, "step": 1973 }, { "epoch": 0.0941053083212166, "grad_norm": 1.441654920578003, "learning_rate": 1.7147034944579698e-05, "loss": 0.7768, "step": 1974 }, { "epoch": 0.09415298071651611, "grad_norm": 2.0077085494995117, "learning_rate": 1.7144287060845696e-05, "loss": 1.2364, "step": 1975 }, { "epoch": 0.09420065311181561, "grad_norm": 1.926422119140625, "learning_rate": 1.714153807482177e-05, "loss": 1.2169, "step": 1976 }, { "epoch": 0.09424832550711511, "grad_norm": 2.1796329021453857, "learning_rate": 1.713878798693206e-05, "loss": 0.7759, "step": 1977 }, { "epoch": 0.09429599790241461, "grad_norm": 1.9557242393493652, "learning_rate": 1.7136036797600882e-05, "loss": 0.9748, "step": 1978 }, { "epoch": 0.09434367029771411, "grad_norm": 1.706542730331421, "learning_rate": 1.7133284507252715e-05, "loss": 0.9708, "step": 1979 }, { "epoch": 0.09439134269301361, "grad_norm": 4.758184909820557, "learning_rate": 1.7130531116312202e-05, "loss": 1.5215, "step": 1980 }, { "epoch": 0.09443901508831311, "grad_norm": 2.4019150733947754, "learning_rate": 1.7127776625204173e-05, "loss": 0.838, "step": 1981 }, { "epoch": 0.09448668748361261, "grad_norm": 2.318128824234009, "learning_rate": 1.7125021034353614e-05, "loss": 0.7269, "step": 1982 }, { "epoch": 0.09453435987891212, "grad_norm": 1.6884076595306396, "learning_rate": 1.7122264344185677e-05, "loss": 1.1387, "step": 1983 }, { "epoch": 0.09458203227421162, "grad_norm": 1.348828911781311, "learning_rate": 1.71195065551257e-05, "loss": 0.7455, "step": 1984 }, { "epoch": 0.09462970466951112, "grad_norm": 2.05991530418396, "learning_rate": 1.711674766759918e-05, "loss": 0.7384, "step": 1985 }, { "epoch": 0.09467737706481062, "grad_norm": 1.3874835968017578, "learning_rate": 1.711398768203178e-05, "loss": 0.7218, "step": 1986 }, { "epoch": 0.09472504946011012, "grad_norm": 1.3682280778884888, "learning_rate": 1.7111226598849344e-05, "loss": 0.6839, "step": 1987 }, { "epoch": 0.09477272185540962, "grad_norm": 1.6690754890441895, "learning_rate": 1.710846441847787e-05, "loss": 0.6134, "step": 1988 }, { "epoch": 0.09482039425070912, "grad_norm": 1.2285358905792236, "learning_rate": 1.710570114134354e-05, "loss": 0.5035, "step": 1989 }, { "epoch": 0.09486806664600862, "grad_norm": 1.678049087524414, "learning_rate": 1.7102936767872704e-05, "loss": 0.7206, "step": 1990 }, { "epoch": 0.09491573904130814, "grad_norm": 2.6629180908203125, "learning_rate": 1.7100171298491866e-05, "loss": 0.7183, "step": 1991 }, { "epoch": 0.09496341143660764, "grad_norm": 1.8690797090530396, "learning_rate": 1.709740473362772e-05, "loss": 0.5785, "step": 1992 }, { "epoch": 0.09501108383190714, "grad_norm": 1.3942818641662598, "learning_rate": 1.7094637073707105e-05, "loss": 0.883, "step": 1993 }, { "epoch": 0.09505875622720664, "grad_norm": 1.499047875404358, "learning_rate": 1.7091868319157055e-05, "loss": 0.6783, "step": 1994 }, { "epoch": 0.09510642862250614, "grad_norm": 8.317218780517578, "learning_rate": 1.7089098470404755e-05, "loss": 0.5295, "step": 1995 }, { "epoch": 0.09515410101780564, "grad_norm": 1.0080902576446533, "learning_rate": 1.7086327527877563e-05, "loss": 0.4816, "step": 1996 }, { "epoch": 0.09520177341310514, "grad_norm": 1.6783298254013062, "learning_rate": 1.708355549200301e-05, "loss": 0.7104, "step": 1997 }, { "epoch": 0.09524944580840464, "grad_norm": 2.276475667953491, "learning_rate": 1.708078236320879e-05, "loss": 0.884, "step": 1998 }, { "epoch": 0.09529711820370415, "grad_norm": 2.21710467338562, "learning_rate": 1.707800814192277e-05, "loss": 0.6756, "step": 1999 }, { "epoch": 0.09534479059900365, "grad_norm": 1.2017117738723755, "learning_rate": 1.7075232828572982e-05, "loss": 0.5945, "step": 2000 }, { "epoch": 0.09539246299430315, "grad_norm": 1.1749649047851562, "learning_rate": 1.707245642358763e-05, "loss": 0.6184, "step": 2001 }, { "epoch": 0.09544013538960265, "grad_norm": 2.2212271690368652, "learning_rate": 1.7069678927395083e-05, "loss": 0.7151, "step": 2002 }, { "epoch": 0.09548780778490215, "grad_norm": 2.354304075241089, "learning_rate": 1.706690034042388e-05, "loss": 1.1843, "step": 2003 }, { "epoch": 0.09553548018020165, "grad_norm": 3.686828374862671, "learning_rate": 1.7064120663102737e-05, "loss": 0.4615, "step": 2004 }, { "epoch": 0.09558315257550115, "grad_norm": 0.9127634763717651, "learning_rate": 1.7061339895860513e-05, "loss": 0.2305, "step": 2005 }, { "epoch": 0.09563082497080065, "grad_norm": 2.3917996883392334, "learning_rate": 1.7058558039126266e-05, "loss": 0.7754, "step": 2006 }, { "epoch": 0.09567849736610017, "grad_norm": 1.8181190490722656, "learning_rate": 1.7055775093329202e-05, "loss": 0.5913, "step": 2007 }, { "epoch": 0.09572616976139967, "grad_norm": 1.2763667106628418, "learning_rate": 1.70529910588987e-05, "loss": 0.7572, "step": 2008 }, { "epoch": 0.09577384215669917, "grad_norm": 1.1833136081695557, "learning_rate": 1.705020593626431e-05, "loss": 0.5322, "step": 2009 }, { "epoch": 0.09582151455199867, "grad_norm": 1.3872525691986084, "learning_rate": 1.704741972585575e-05, "loss": 0.8074, "step": 2010 }, { "epoch": 0.09586918694729817, "grad_norm": 2.3611814975738525, "learning_rate": 1.7044632428102896e-05, "loss": 1.1546, "step": 2011 }, { "epoch": 0.09591685934259767, "grad_norm": 1.9899375438690186, "learning_rate": 1.7041844043435806e-05, "loss": 1.1795, "step": 2012 }, { "epoch": 0.09596453173789717, "grad_norm": 3.1207892894744873, "learning_rate": 1.7039054572284697e-05, "loss": 1.0391, "step": 2013 }, { "epoch": 0.09601220413319667, "grad_norm": 1.7606477737426758, "learning_rate": 1.7036264015079958e-05, "loss": 0.2867, "step": 2014 }, { "epoch": 0.09605987652849618, "grad_norm": 1.2277532815933228, "learning_rate": 1.7033472372252138e-05, "loss": 0.9605, "step": 2015 }, { "epoch": 0.09610754892379568, "grad_norm": 2.3358285427093506, "learning_rate": 1.703067964423196e-05, "loss": 1.2601, "step": 2016 }, { "epoch": 0.09615522131909518, "grad_norm": 2.904574155807495, "learning_rate": 1.7027885831450318e-05, "loss": 1.2073, "step": 2017 }, { "epoch": 0.09620289371439468, "grad_norm": 1.747771978378296, "learning_rate": 1.7025090934338266e-05, "loss": 0.8577, "step": 2018 }, { "epoch": 0.09625056610969418, "grad_norm": 2.4418985843658447, "learning_rate": 1.7022294953327025e-05, "loss": 0.7614, "step": 2019 }, { "epoch": 0.09629823850499368, "grad_norm": 1.9450327157974243, "learning_rate": 1.701949788884799e-05, "loss": 0.8571, "step": 2020 }, { "epoch": 0.09634591090029318, "grad_norm": 2.4468445777893066, "learning_rate": 1.701669974133272e-05, "loss": 0.813, "step": 2021 }, { "epoch": 0.09639358329559268, "grad_norm": 2.8782572746276855, "learning_rate": 1.7013900511212932e-05, "loss": 1.02, "step": 2022 }, { "epoch": 0.0964412556908922, "grad_norm": 1.7242225408554077, "learning_rate": 1.7011100198920528e-05, "loss": 0.8248, "step": 2023 }, { "epoch": 0.0964889280861917, "grad_norm": 2.1396656036376953, "learning_rate": 1.7008298804887565e-05, "loss": 0.7572, "step": 2024 }, { "epoch": 0.0965366004814912, "grad_norm": 1.185307502746582, "learning_rate": 1.7005496329546263e-05, "loss": 0.6201, "step": 2025 }, { "epoch": 0.0965842728767907, "grad_norm": 1.039168357849121, "learning_rate": 1.7002692773329026e-05, "loss": 0.8732, "step": 2026 }, { "epoch": 0.0966319452720902, "grad_norm": 13.774724006652832, "learning_rate": 1.6999888136668404e-05, "loss": 1.2008, "step": 2027 }, { "epoch": 0.0966796176673897, "grad_norm": 1.1632906198501587, "learning_rate": 1.6997082419997127e-05, "loss": 0.8072, "step": 2028 }, { "epoch": 0.0967272900626892, "grad_norm": 2.1988863945007324, "learning_rate": 1.6994275623748092e-05, "loss": 0.809, "step": 2029 }, { "epoch": 0.09677496245798871, "grad_norm": 2.1799628734588623, "learning_rate": 1.6991467748354352e-05, "loss": 0.5426, "step": 2030 }, { "epoch": 0.09682263485328821, "grad_norm": 1.75131356716156, "learning_rate": 1.6988658794249134e-05, "loss": 0.7247, "step": 2031 }, { "epoch": 0.09687030724858771, "grad_norm": 2.1120526790618896, "learning_rate": 1.6985848761865838e-05, "loss": 0.6104, "step": 2032 }, { "epoch": 0.09691797964388721, "grad_norm": 1.498909592628479, "learning_rate": 1.698303765163802e-05, "loss": 0.7601, "step": 2033 }, { "epoch": 0.09696565203918671, "grad_norm": 2.717449903488159, "learning_rate": 1.69802254639994e-05, "loss": 0.7249, "step": 2034 }, { "epoch": 0.09701332443448621, "grad_norm": 3.706339120864868, "learning_rate": 1.6977412199383872e-05, "loss": 0.9153, "step": 2035 }, { "epoch": 0.09706099682978571, "grad_norm": 2.9623730182647705, "learning_rate": 1.6974597858225502e-05, "loss": 0.9144, "step": 2036 }, { "epoch": 0.09710866922508521, "grad_norm": 1.8177005052566528, "learning_rate": 1.69717824409585e-05, "loss": 0.8277, "step": 2037 }, { "epoch": 0.09715634162038472, "grad_norm": 2.097759485244751, "learning_rate": 1.6968965948017266e-05, "loss": 0.7698, "step": 2038 }, { "epoch": 0.09720401401568422, "grad_norm": 1.4746730327606201, "learning_rate": 1.696614837983635e-05, "loss": 0.6789, "step": 2039 }, { "epoch": 0.09725168641098372, "grad_norm": 1.5426316261291504, "learning_rate": 1.696332973685048e-05, "loss": 0.6031, "step": 2040 }, { "epoch": 0.09729935880628322, "grad_norm": 2.9810078144073486, "learning_rate": 1.696051001949454e-05, "loss": 0.5428, "step": 2041 }, { "epoch": 0.09734703120158272, "grad_norm": 2.5323169231414795, "learning_rate": 1.6957689228203583e-05, "loss": 0.7858, "step": 2042 }, { "epoch": 0.09739470359688222, "grad_norm": 2.825571060180664, "learning_rate": 1.6954867363412827e-05, "loss": 1.1447, "step": 2043 }, { "epoch": 0.09744237599218172, "grad_norm": 1.7198448181152344, "learning_rate": 1.695204442555766e-05, "loss": 0.7294, "step": 2044 }, { "epoch": 0.09749004838748122, "grad_norm": 1.7498078346252441, "learning_rate": 1.6949220415073627e-05, "loss": 0.9041, "step": 2045 }, { "epoch": 0.09753772078278074, "grad_norm": 1.821815848350525, "learning_rate": 1.6946395332396447e-05, "loss": 0.9126, "step": 2046 }, { "epoch": 0.09758539317808024, "grad_norm": 1.6222103834152222, "learning_rate": 1.6943569177962005e-05, "loss": 0.9846, "step": 2047 }, { "epoch": 0.09763306557337974, "grad_norm": 1.535952091217041, "learning_rate": 1.6940741952206342e-05, "loss": 0.6679, "step": 2048 }, { "epoch": 0.09768073796867924, "grad_norm": 1.5526154041290283, "learning_rate": 1.693791365556567e-05, "loss": 0.7836, "step": 2049 }, { "epoch": 0.09772841036397874, "grad_norm": 1.3706446886062622, "learning_rate": 1.6935084288476365e-05, "loss": 0.6875, "step": 2050 }, { "epoch": 0.09777608275927824, "grad_norm": 1.430282473564148, "learning_rate": 1.693225385137498e-05, "loss": 0.7229, "step": 2051 }, { "epoch": 0.09782375515457774, "grad_norm": 1.5612531900405884, "learning_rate": 1.692942234469821e-05, "loss": 0.7721, "step": 2052 }, { "epoch": 0.09787142754987724, "grad_norm": 1.980994701385498, "learning_rate": 1.692658976888293e-05, "loss": 0.6863, "step": 2053 }, { "epoch": 0.09791909994517675, "grad_norm": 1.9815247058868408, "learning_rate": 1.6923756124366184e-05, "loss": 0.9042, "step": 2054 }, { "epoch": 0.09796677234047625, "grad_norm": 1.246296763420105, "learning_rate": 1.6920921411585164e-05, "loss": 0.6666, "step": 2055 }, { "epoch": 0.09801444473577575, "grad_norm": 1.1949427127838135, "learning_rate": 1.691808563097724e-05, "loss": 0.672, "step": 2056 }, { "epoch": 0.09806211713107525, "grad_norm": 1.914755940437317, "learning_rate": 1.691524878297995e-05, "loss": 0.9311, "step": 2057 }, { "epoch": 0.09810978952637475, "grad_norm": 1.8586196899414062, "learning_rate": 1.6912410868030987e-05, "loss": 0.6847, "step": 2058 }, { "epoch": 0.09815746192167425, "grad_norm": 1.937045931816101, "learning_rate": 1.6909571886568206e-05, "loss": 0.5415, "step": 2059 }, { "epoch": 0.09820513431697375, "grad_norm": 1.8809586763381958, "learning_rate": 1.690673183902964e-05, "loss": 0.566, "step": 2060 }, { "epoch": 0.09825280671227325, "grad_norm": 1.4162172079086304, "learning_rate": 1.690389072585348e-05, "loss": 0.7743, "step": 2061 }, { "epoch": 0.09830047910757277, "grad_norm": 2.278230905532837, "learning_rate": 1.6901048547478073e-05, "loss": 1.1739, "step": 2062 }, { "epoch": 0.09834815150287227, "grad_norm": 2.1912760734558105, "learning_rate": 1.6898205304341947e-05, "loss": 0.8375, "step": 2063 }, { "epoch": 0.09839582389817177, "grad_norm": 1.8050661087036133, "learning_rate": 1.6895360996883777e-05, "loss": 0.7892, "step": 2064 }, { "epoch": 0.09844349629347127, "grad_norm": 2.9720239639282227, "learning_rate": 1.6892515625542413e-05, "loss": 1.4879, "step": 2065 }, { "epoch": 0.09849116868877077, "grad_norm": 1.624472975730896, "learning_rate": 1.688966919075687e-05, "loss": 0.7702, "step": 2066 }, { "epoch": 0.09853884108407027, "grad_norm": 2.010019063949585, "learning_rate": 1.6886821692966314e-05, "loss": 0.9441, "step": 2067 }, { "epoch": 0.09858651347936977, "grad_norm": 1.6731029748916626, "learning_rate": 1.68839731326101e-05, "loss": 0.6575, "step": 2068 }, { "epoch": 0.09863418587466927, "grad_norm": 1.3605769872665405, "learning_rate": 1.6881123510127716e-05, "loss": 0.9946, "step": 2069 }, { "epoch": 0.09868185826996878, "grad_norm": 1.8632115125656128, "learning_rate": 1.687827282595884e-05, "loss": 0.846, "step": 2070 }, { "epoch": 0.09872953066526828, "grad_norm": 1.5721104145050049, "learning_rate": 1.68754210805433e-05, "loss": 0.7917, "step": 2071 }, { "epoch": 0.09877720306056778, "grad_norm": 1.5502465963363647, "learning_rate": 1.6872568274321087e-05, "loss": 0.7017, "step": 2072 }, { "epoch": 0.09882487545586728, "grad_norm": 2.1058290004730225, "learning_rate": 1.6869714407732364e-05, "loss": 0.6976, "step": 2073 }, { "epoch": 0.09887254785116678, "grad_norm": 1.461596131324768, "learning_rate": 1.6866859481217453e-05, "loss": 0.7409, "step": 2074 }, { "epoch": 0.09892022024646628, "grad_norm": 2.0125601291656494, "learning_rate": 1.686400349521684e-05, "loss": 0.4736, "step": 2075 }, { "epoch": 0.09896789264176578, "grad_norm": 1.7437502145767212, "learning_rate": 1.6861146450171177e-05, "loss": 0.8363, "step": 2076 }, { "epoch": 0.09901556503706528, "grad_norm": 1.3860204219818115, "learning_rate": 1.6858288346521265e-05, "loss": 0.5161, "step": 2077 }, { "epoch": 0.0990632374323648, "grad_norm": 1.4488458633422852, "learning_rate": 1.685542918470809e-05, "loss": 0.5047, "step": 2078 }, { "epoch": 0.0991109098276643, "grad_norm": 1.8164137601852417, "learning_rate": 1.6852568965172794e-05, "loss": 0.936, "step": 2079 }, { "epoch": 0.0991585822229638, "grad_norm": 1.4916291236877441, "learning_rate": 1.684970768835667e-05, "loss": 0.9029, "step": 2080 }, { "epoch": 0.0992062546182633, "grad_norm": 1.8367319107055664, "learning_rate": 1.684684535470119e-05, "loss": 0.8575, "step": 2081 }, { "epoch": 0.0992539270135628, "grad_norm": 3.2794089317321777, "learning_rate": 1.6843981964647976e-05, "loss": 0.7888, "step": 2082 }, { "epoch": 0.0993015994088623, "grad_norm": 1.320497989654541, "learning_rate": 1.684111751863883e-05, "loss": 0.7774, "step": 2083 }, { "epoch": 0.0993492718041618, "grad_norm": 3.0137689113616943, "learning_rate": 1.68382520171157e-05, "loss": 0.5132, "step": 2084 }, { "epoch": 0.0993969441994613, "grad_norm": 2.3612728118896484, "learning_rate": 1.68353854605207e-05, "loss": 0.9144, "step": 2085 }, { "epoch": 0.09944461659476081, "grad_norm": 2.156235694885254, "learning_rate": 1.683251784929612e-05, "loss": 0.6677, "step": 2086 }, { "epoch": 0.09949228899006031, "grad_norm": 9.895931243896484, "learning_rate": 1.6829649183884395e-05, "loss": 0.766, "step": 2087 }, { "epoch": 0.09953996138535981, "grad_norm": 1.4509943723678589, "learning_rate": 1.6826779464728132e-05, "loss": 0.7959, "step": 2088 }, { "epoch": 0.09958763378065931, "grad_norm": 1.1089597940444946, "learning_rate": 1.68239086922701e-05, "loss": 0.4899, "step": 2089 }, { "epoch": 0.09963530617595881, "grad_norm": 1.2637901306152344, "learning_rate": 1.6821036866953226e-05, "loss": 0.9288, "step": 2090 }, { "epoch": 0.09968297857125831, "grad_norm": 1.168062448501587, "learning_rate": 1.681816398922061e-05, "loss": 0.7841, "step": 2091 }, { "epoch": 0.09973065096655781, "grad_norm": 5.275248050689697, "learning_rate": 1.6815290059515504e-05, "loss": 1.3262, "step": 2092 }, { "epoch": 0.09977832336185731, "grad_norm": 1.2903294563293457, "learning_rate": 1.6812415078281324e-05, "loss": 0.5238, "step": 2093 }, { "epoch": 0.09982599575715682, "grad_norm": 1.475982427597046, "learning_rate": 1.6809539045961653e-05, "loss": 0.6875, "step": 2094 }, { "epoch": 0.09987366815245632, "grad_norm": 2.128023862838745, "learning_rate": 1.6806661963000234e-05, "loss": 1.2745, "step": 2095 }, { "epoch": 0.09992134054775582, "grad_norm": 1.3259506225585938, "learning_rate": 1.6803783829840967e-05, "loss": 0.8099, "step": 2096 }, { "epoch": 0.09996901294305532, "grad_norm": 1.4391194581985474, "learning_rate": 1.6800904646927923e-05, "loss": 0.721, "step": 2097 }, { "epoch": 0.10001668533835482, "grad_norm": 1.266096591949463, "learning_rate": 1.679802441470532e-05, "loss": 0.7507, "step": 2098 }, { "epoch": 0.10006435773365432, "grad_norm": 1.0737955570220947, "learning_rate": 1.6795143133617562e-05, "loss": 0.9599, "step": 2099 }, { "epoch": 0.10011203012895382, "grad_norm": 1.1342597007751465, "learning_rate": 1.6792260804109196e-05, "loss": 0.7678, "step": 2100 }, { "epoch": 0.10015970252425332, "grad_norm": 4.525983810424805, "learning_rate": 1.6789377426624935e-05, "loss": 0.7239, "step": 2101 }, { "epoch": 0.10020737491955284, "grad_norm": 1.9952508211135864, "learning_rate": 1.678649300160965e-05, "loss": 0.5961, "step": 2102 }, { "epoch": 0.10025504731485234, "grad_norm": 1.8149811029434204, "learning_rate": 1.6783607529508382e-05, "loss": 0.8781, "step": 2103 }, { "epoch": 0.10030271971015184, "grad_norm": 2.002239465713501, "learning_rate": 1.6780721010766335e-05, "loss": 0.708, "step": 2104 }, { "epoch": 0.10035039210545134, "grad_norm": 1.3707174062728882, "learning_rate": 1.677783344582886e-05, "loss": 0.3173, "step": 2105 }, { "epoch": 0.10039806450075084, "grad_norm": 1.379638910293579, "learning_rate": 1.6774944835141484e-05, "loss": 0.7963, "step": 2106 }, { "epoch": 0.10044573689605034, "grad_norm": 1.7050049304962158, "learning_rate": 1.6772055179149886e-05, "loss": 1.0089, "step": 2107 }, { "epoch": 0.10049340929134984, "grad_norm": 1.4809306859970093, "learning_rate": 1.676916447829992e-05, "loss": 0.8716, "step": 2108 }, { "epoch": 0.10054108168664934, "grad_norm": 1.2359148263931274, "learning_rate": 1.6766272733037575e-05, "loss": 0.6663, "step": 2109 }, { "epoch": 0.10058875408194885, "grad_norm": 2.501166582107544, "learning_rate": 1.676337994380903e-05, "loss": 0.8092, "step": 2110 }, { "epoch": 0.10063642647724835, "grad_norm": 2.328268051147461, "learning_rate": 1.6760486111060607e-05, "loss": 0.5132, "step": 2111 }, { "epoch": 0.10068409887254785, "grad_norm": 2.1387557983398438, "learning_rate": 1.67575912352388e-05, "loss": 0.8506, "step": 2112 }, { "epoch": 0.10073177126784735, "grad_norm": 5.4658355712890625, "learning_rate": 1.6754695316790255e-05, "loss": 0.7039, "step": 2113 }, { "epoch": 0.10077944366314685, "grad_norm": 2.01517915725708, "learning_rate": 1.675179835616178e-05, "loss": 0.5598, "step": 2114 }, { "epoch": 0.10082711605844635, "grad_norm": 0.8727918267250061, "learning_rate": 1.674890035380035e-05, "loss": 0.236, "step": 2115 }, { "epoch": 0.10087478845374585, "grad_norm": 3.1234583854675293, "learning_rate": 1.6746001310153095e-05, "loss": 0.4793, "step": 2116 }, { "epoch": 0.10092246084904535, "grad_norm": 2.680319309234619, "learning_rate": 1.674310122566731e-05, "loss": 0.5716, "step": 2117 }, { "epoch": 0.10097013324434487, "grad_norm": 2.324587821960449, "learning_rate": 1.6740200100790445e-05, "loss": 0.5267, "step": 2118 }, { "epoch": 0.10101780563964437, "grad_norm": 2.781690835952759, "learning_rate": 1.673729793597011e-05, "loss": 0.5512, "step": 2119 }, { "epoch": 0.10106547803494387, "grad_norm": 1.82707679271698, "learning_rate": 1.6734394731654094e-05, "loss": 0.7171, "step": 2120 }, { "epoch": 0.10111315043024337, "grad_norm": 3.9949862957000732, "learning_rate": 1.6731490488290316e-05, "loss": 0.7294, "step": 2121 }, { "epoch": 0.10116082282554287, "grad_norm": 1.492210865020752, "learning_rate": 1.672858520632688e-05, "loss": 0.6102, "step": 2122 }, { "epoch": 0.10120849522084237, "grad_norm": 1.8430193662643433, "learning_rate": 1.6725678886212034e-05, "loss": 0.706, "step": 2123 }, { "epoch": 0.10125616761614187, "grad_norm": 1.5846210718154907, "learning_rate": 1.67227715283942e-05, "loss": 0.5812, "step": 2124 }, { "epoch": 0.10130384001144138, "grad_norm": 1.7634451389312744, "learning_rate": 1.6719863133321947e-05, "loss": 0.5163, "step": 2125 }, { "epoch": 0.10135151240674088, "grad_norm": 1.6493277549743652, "learning_rate": 1.6716953701444014e-05, "loss": 0.1924, "step": 2126 }, { "epoch": 0.10139918480204038, "grad_norm": 3.5931055545806885, "learning_rate": 1.6714043233209296e-05, "loss": 1.0592, "step": 2127 }, { "epoch": 0.10144685719733988, "grad_norm": 2.2461369037628174, "learning_rate": 1.6711131729066853e-05, "loss": 0.5719, "step": 2128 }, { "epoch": 0.10149452959263938, "grad_norm": 6.407316207885742, "learning_rate": 1.6708219189465894e-05, "loss": 0.9888, "step": 2129 }, { "epoch": 0.10154220198793888, "grad_norm": 1.3646303415298462, "learning_rate": 1.670530561485579e-05, "loss": 0.362, "step": 2130 }, { "epoch": 0.10158987438323838, "grad_norm": 2.5397655963897705, "learning_rate": 1.6702391005686088e-05, "loss": 0.6678, "step": 2131 }, { "epoch": 0.10163754677853788, "grad_norm": 1.4877567291259766, "learning_rate": 1.669947536240647e-05, "loss": 0.6829, "step": 2132 }, { "epoch": 0.1016852191738374, "grad_norm": 1.0454562902450562, "learning_rate": 1.6696558685466793e-05, "loss": 0.65, "step": 2133 }, { "epoch": 0.1017328915691369, "grad_norm": 2.704338550567627, "learning_rate": 1.6693640975317078e-05, "loss": 0.8758, "step": 2134 }, { "epoch": 0.1017805639644364, "grad_norm": 3.273530960083008, "learning_rate": 1.669072223240749e-05, "loss": 0.916, "step": 2135 }, { "epoch": 0.1018282363597359, "grad_norm": 1.9205434322357178, "learning_rate": 1.668780245718836e-05, "loss": 1.0245, "step": 2136 }, { "epoch": 0.1018759087550354, "grad_norm": 1.2210420370101929, "learning_rate": 1.6684881650110186e-05, "loss": 0.7859, "step": 2137 }, { "epoch": 0.1019235811503349, "grad_norm": 1.1936390399932861, "learning_rate": 1.668195981162361e-05, "loss": 0.8466, "step": 2138 }, { "epoch": 0.1019712535456344, "grad_norm": 1.6571040153503418, "learning_rate": 1.667903694217945e-05, "loss": 0.7082, "step": 2139 }, { "epoch": 0.1020189259409339, "grad_norm": 2.0592868328094482, "learning_rate": 1.667611304222867e-05, "loss": 0.9833, "step": 2140 }, { "epoch": 0.10206659833623341, "grad_norm": 3.235091209411621, "learning_rate": 1.6673188112222394e-05, "loss": 0.3433, "step": 2141 }, { "epoch": 0.10211427073153291, "grad_norm": 2.5346899032592773, "learning_rate": 1.6670262152611916e-05, "loss": 0.6287, "step": 2142 }, { "epoch": 0.10216194312683241, "grad_norm": 1.5297119617462158, "learning_rate": 1.6667335163848682e-05, "loss": 0.8937, "step": 2143 }, { "epoch": 0.10220961552213191, "grad_norm": 3.354051351547241, "learning_rate": 1.6664407146384287e-05, "loss": 1.0068, "step": 2144 }, { "epoch": 0.10225728791743141, "grad_norm": 1.9681328535079956, "learning_rate": 1.6661478100670502e-05, "loss": 0.7261, "step": 2145 }, { "epoch": 0.10230496031273091, "grad_norm": 1.2717812061309814, "learning_rate": 1.6658548027159245e-05, "loss": 0.7001, "step": 2146 }, { "epoch": 0.10235263270803041, "grad_norm": 14.975186347961426, "learning_rate": 1.6655616926302594e-05, "loss": 1.1774, "step": 2147 }, { "epoch": 0.10240030510332991, "grad_norm": 3.2782044410705566, "learning_rate": 1.6652684798552793e-05, "loss": 0.7452, "step": 2148 }, { "epoch": 0.10244797749862943, "grad_norm": 2.828608274459839, "learning_rate": 1.664975164436224e-05, "loss": 0.9119, "step": 2149 }, { "epoch": 0.10249564989392893, "grad_norm": 1.7152819633483887, "learning_rate": 1.6646817464183485e-05, "loss": 0.7362, "step": 2150 }, { "epoch": 0.10254332228922843, "grad_norm": 2.272935390472412, "learning_rate": 1.6643882258469247e-05, "loss": 0.6275, "step": 2151 }, { "epoch": 0.10259099468452793, "grad_norm": 3.6414568424224854, "learning_rate": 1.6640946027672395e-05, "loss": 0.288, "step": 2152 }, { "epoch": 0.10263866707982743, "grad_norm": 1.2086769342422485, "learning_rate": 1.6638008772245956e-05, "loss": 0.7764, "step": 2153 }, { "epoch": 0.10268633947512693, "grad_norm": 1.8948204517364502, "learning_rate": 1.663507049264312e-05, "loss": 0.7969, "step": 2154 }, { "epoch": 0.10273401187042643, "grad_norm": 1.3868238925933838, "learning_rate": 1.663213118931724e-05, "loss": 0.4417, "step": 2155 }, { "epoch": 0.10278168426572593, "grad_norm": 2.0186548233032227, "learning_rate": 1.6629190862721813e-05, "loss": 0.9057, "step": 2156 }, { "epoch": 0.10282935666102544, "grad_norm": 1.5707331895828247, "learning_rate": 1.6626249513310505e-05, "loss": 0.6445, "step": 2157 }, { "epoch": 0.10287702905632494, "grad_norm": 2.2689332962036133, "learning_rate": 1.662330714153713e-05, "loss": 0.4731, "step": 2158 }, { "epoch": 0.10292470145162444, "grad_norm": 4.399372100830078, "learning_rate": 1.6620363747855675e-05, "loss": 1.1345, "step": 2159 }, { "epoch": 0.10297237384692394, "grad_norm": 2.119353771209717, "learning_rate": 1.6617419332720267e-05, "loss": 0.4877, "step": 2160 }, { "epoch": 0.10302004624222344, "grad_norm": 1.2800583839416504, "learning_rate": 1.6614473896585206e-05, "loss": 0.7413, "step": 2161 }, { "epoch": 0.10306771863752294, "grad_norm": 1.5062272548675537, "learning_rate": 1.6611527439904934e-05, "loss": 0.6188, "step": 2162 }, { "epoch": 0.10311539103282244, "grad_norm": 1.4648534059524536, "learning_rate": 1.6608579963134067e-05, "loss": 0.6466, "step": 2163 }, { "epoch": 0.10316306342812194, "grad_norm": 4.780557632446289, "learning_rate": 1.6605631466727365e-05, "loss": 0.9409, "step": 2164 }, { "epoch": 0.10321073582342145, "grad_norm": 1.4820151329040527, "learning_rate": 1.6602681951139752e-05, "loss": 0.5981, "step": 2165 }, { "epoch": 0.10325840821872095, "grad_norm": 1.5938807725906372, "learning_rate": 1.659973141682631e-05, "loss": 0.6836, "step": 2166 }, { "epoch": 0.10330608061402045, "grad_norm": 1.9833928346633911, "learning_rate": 1.6596779864242274e-05, "loss": 0.6456, "step": 2167 }, { "epoch": 0.10335375300931995, "grad_norm": 5.635646820068359, "learning_rate": 1.659382729384304e-05, "loss": 1.2212, "step": 2168 }, { "epoch": 0.10340142540461945, "grad_norm": 1.8882306814193726, "learning_rate": 1.6590873706084158e-05, "loss": 1.2086, "step": 2169 }, { "epoch": 0.10344909779991895, "grad_norm": 8.547324180603027, "learning_rate": 1.6587919101421333e-05, "loss": 0.6105, "step": 2170 }, { "epoch": 0.10349677019521845, "grad_norm": 1.314596176147461, "learning_rate": 1.6584963480310433e-05, "loss": 0.6726, "step": 2171 }, { "epoch": 0.10354444259051795, "grad_norm": 1.857081651687622, "learning_rate": 1.658200684320748e-05, "loss": 0.6906, "step": 2172 }, { "epoch": 0.10359211498581747, "grad_norm": 1.2799304723739624, "learning_rate": 1.6579049190568656e-05, "loss": 0.7751, "step": 2173 }, { "epoch": 0.10363978738111697, "grad_norm": 1.6420232057571411, "learning_rate": 1.6576090522850292e-05, "loss": 0.9214, "step": 2174 }, { "epoch": 0.10368745977641647, "grad_norm": 0.7685630917549133, "learning_rate": 1.657313084050888e-05, "loss": 0.2469, "step": 2175 }, { "epoch": 0.10373513217171597, "grad_norm": 1.9455807209014893, "learning_rate": 1.6570170144001067e-05, "loss": 0.7618, "step": 2176 }, { "epoch": 0.10378280456701547, "grad_norm": 4.829035758972168, "learning_rate": 1.6567208433783666e-05, "loss": 1.111, "step": 2177 }, { "epoch": 0.10383047696231497, "grad_norm": 1.4978998899459839, "learning_rate": 1.656424571031363e-05, "loss": 0.4346, "step": 2178 }, { "epoch": 0.10387814935761447, "grad_norm": 4.826923847198486, "learning_rate": 1.656128197404808e-05, "loss": 0.4021, "step": 2179 }, { "epoch": 0.10392582175291397, "grad_norm": 2.096256971359253, "learning_rate": 1.655831722544429e-05, "loss": 0.3349, "step": 2180 }, { "epoch": 0.10397349414821348, "grad_norm": 1.3158085346221924, "learning_rate": 1.655535146495969e-05, "loss": 0.5099, "step": 2181 }, { "epoch": 0.10402116654351298, "grad_norm": 3.5498571395874023, "learning_rate": 1.655238469305186e-05, "loss": 0.8809, "step": 2182 }, { "epoch": 0.10406883893881248, "grad_norm": 1.8730432987213135, "learning_rate": 1.6549416910178554e-05, "loss": 0.8471, "step": 2183 }, { "epoch": 0.10411651133411198, "grad_norm": 1.8743762969970703, "learning_rate": 1.6546448116797664e-05, "loss": 1.0615, "step": 2184 }, { "epoch": 0.10416418372941148, "grad_norm": 2.668163299560547, "learning_rate": 1.6543478313367244e-05, "loss": 1.2889, "step": 2185 }, { "epoch": 0.10421185612471098, "grad_norm": 2.1281440258026123, "learning_rate": 1.6540507500345507e-05, "loss": 0.8726, "step": 2186 }, { "epoch": 0.10425952852001048, "grad_norm": 1.0205135345458984, "learning_rate": 1.6537535678190815e-05, "loss": 0.4469, "step": 2187 }, { "epoch": 0.10430720091530998, "grad_norm": 1.752551794052124, "learning_rate": 1.6534562847361693e-05, "loss": 0.8005, "step": 2188 }, { "epoch": 0.1043548733106095, "grad_norm": 1.29226553440094, "learning_rate": 1.6531589008316816e-05, "loss": 0.9255, "step": 2189 }, { "epoch": 0.104402545705909, "grad_norm": 1.2720071077346802, "learning_rate": 1.6528614161515015e-05, "loss": 0.7026, "step": 2190 }, { "epoch": 0.1044502181012085, "grad_norm": 9.173748016357422, "learning_rate": 1.6525638307415284e-05, "loss": 1.222, "step": 2191 }, { "epoch": 0.104497890496508, "grad_norm": 1.315583348274231, "learning_rate": 1.6522661446476762e-05, "loss": 0.7529, "step": 2192 }, { "epoch": 0.1045455628918075, "grad_norm": 1.7935317754745483, "learning_rate": 1.651968357915875e-05, "loss": 1.0363, "step": 2193 }, { "epoch": 0.104593235287107, "grad_norm": 1.6630464792251587, "learning_rate": 1.6516704705920702e-05, "loss": 0.8377, "step": 2194 }, { "epoch": 0.1046409076824065, "grad_norm": 1.3903051614761353, "learning_rate": 1.6513724827222225e-05, "loss": 0.598, "step": 2195 }, { "epoch": 0.104688580077706, "grad_norm": 1.5116666555404663, "learning_rate": 1.6510743943523084e-05, "loss": 0.864, "step": 2196 }, { "epoch": 0.10473625247300551, "grad_norm": 1.4434409141540527, "learning_rate": 1.6507762055283202e-05, "loss": 0.8152, "step": 2197 }, { "epoch": 0.10478392486830501, "grad_norm": 1.8115547895431519, "learning_rate": 1.6504779162962655e-05, "loss": 0.6913, "step": 2198 }, { "epoch": 0.10483159726360451, "grad_norm": 1.300264835357666, "learning_rate": 1.6501795267021666e-05, "loss": 0.8378, "step": 2199 }, { "epoch": 0.10487926965890401, "grad_norm": 1.5797349214553833, "learning_rate": 1.6498810367920622e-05, "loss": 0.7589, "step": 2200 }, { "epoch": 0.10492694205420351, "grad_norm": 2.0074970722198486, "learning_rate": 1.6495824466120067e-05, "loss": 0.8529, "step": 2201 }, { "epoch": 0.10497461444950301, "grad_norm": 2.0940513610839844, "learning_rate": 1.649283756208069e-05, "loss": 0.5206, "step": 2202 }, { "epoch": 0.10502228684480251, "grad_norm": 2.029885768890381, "learning_rate": 1.6489849656263336e-05, "loss": 1.0338, "step": 2203 }, { "epoch": 0.10506995924010201, "grad_norm": 2.2008490562438965, "learning_rate": 1.6486860749129014e-05, "loss": 0.7554, "step": 2204 }, { "epoch": 0.10511763163540153, "grad_norm": 2.8390586376190186, "learning_rate": 1.6483870841138883e-05, "loss": 1.3196, "step": 2205 }, { "epoch": 0.10516530403070103, "grad_norm": 1.9962180852890015, "learning_rate": 1.648087993275425e-05, "loss": 0.8282, "step": 2206 }, { "epoch": 0.10521297642600053, "grad_norm": 1.681196928024292, "learning_rate": 1.6477888024436586e-05, "loss": 0.5429, "step": 2207 }, { "epoch": 0.10526064882130003, "grad_norm": 2.6438839435577393, "learning_rate": 1.6474895116647506e-05, "loss": 0.635, "step": 2208 }, { "epoch": 0.10530832121659953, "grad_norm": 1.4192218780517578, "learning_rate": 1.647190120984879e-05, "loss": 0.5541, "step": 2209 }, { "epoch": 0.10535599361189903, "grad_norm": 2.140742778778076, "learning_rate": 1.6468906304502365e-05, "loss": 1.0936, "step": 2210 }, { "epoch": 0.10540366600719853, "grad_norm": 2.0596237182617188, "learning_rate": 1.6465910401070312e-05, "loss": 0.8405, "step": 2211 }, { "epoch": 0.10545133840249803, "grad_norm": 2.1606948375701904, "learning_rate": 1.6462913500014872e-05, "loss": 0.5889, "step": 2212 }, { "epoch": 0.10549901079779754, "grad_norm": 1.2513320446014404, "learning_rate": 1.6459915601798436e-05, "loss": 0.5982, "step": 2213 }, { "epoch": 0.10554668319309704, "grad_norm": 4.00132942199707, "learning_rate": 1.6456916706883542e-05, "loss": 1.0461, "step": 2214 }, { "epoch": 0.10559435558839654, "grad_norm": 1.846450686454773, "learning_rate": 1.64539168157329e-05, "loss": 0.6549, "step": 2215 }, { "epoch": 0.10564202798369604, "grad_norm": 1.7543492317199707, "learning_rate": 1.645091592880935e-05, "loss": 0.879, "step": 2216 }, { "epoch": 0.10568970037899554, "grad_norm": 1.6810600757598877, "learning_rate": 1.6447914046575906e-05, "loss": 0.901, "step": 2217 }, { "epoch": 0.10573737277429504, "grad_norm": 1.2927902936935425, "learning_rate": 1.6444911169495727e-05, "loss": 0.713, "step": 2218 }, { "epoch": 0.10578504516959454, "grad_norm": 1.2742785215377808, "learning_rate": 1.644190729803212e-05, "loss": 0.7169, "step": 2219 }, { "epoch": 0.10583271756489405, "grad_norm": 0.8367409706115723, "learning_rate": 1.6438902432648558e-05, "loss": 0.3573, "step": 2220 }, { "epoch": 0.10588038996019355, "grad_norm": 1.2617907524108887, "learning_rate": 1.643589657380866e-05, "loss": 0.6326, "step": 2221 }, { "epoch": 0.10592806235549306, "grad_norm": 1.3132867813110352, "learning_rate": 1.6432889721976196e-05, "loss": 0.7305, "step": 2222 }, { "epoch": 0.10597573475079256, "grad_norm": 1.303444266319275, "learning_rate": 1.6429881877615094e-05, "loss": 0.5563, "step": 2223 }, { "epoch": 0.10602340714609206, "grad_norm": 2.7119174003601074, "learning_rate": 1.642687304118943e-05, "loss": 0.8568, "step": 2224 }, { "epoch": 0.10607107954139156, "grad_norm": 1.5195167064666748, "learning_rate": 1.6423863213163443e-05, "loss": 0.9413, "step": 2225 }, { "epoch": 0.10611875193669106, "grad_norm": 1.1507636308670044, "learning_rate": 1.642085239400152e-05, "loss": 0.5005, "step": 2226 }, { "epoch": 0.10616642433199056, "grad_norm": 1.988328218460083, "learning_rate": 1.6417840584168185e-05, "loss": 0.4961, "step": 2227 }, { "epoch": 0.10621409672729007, "grad_norm": 1.5127321481704712, "learning_rate": 1.6414827784128145e-05, "loss": 0.6924, "step": 2228 }, { "epoch": 0.10626176912258957, "grad_norm": 1.266319751739502, "learning_rate": 1.6411813994346237e-05, "loss": 0.6391, "step": 2229 }, { "epoch": 0.10630944151788907, "grad_norm": 1.2617981433868408, "learning_rate": 1.640879921528746e-05, "loss": 0.8278, "step": 2230 }, { "epoch": 0.10635711391318857, "grad_norm": 1.318671464920044, "learning_rate": 1.640578344741696e-05, "loss": 0.5378, "step": 2231 }, { "epoch": 0.10640478630848807, "grad_norm": 3.89214825630188, "learning_rate": 1.640276669120004e-05, "loss": 0.8062, "step": 2232 }, { "epoch": 0.10645245870378757, "grad_norm": 1.4413197040557861, "learning_rate": 1.6399748947102154e-05, "loss": 0.7213, "step": 2233 }, { "epoch": 0.10650013109908707, "grad_norm": 2.349151849746704, "learning_rate": 1.6396730215588913e-05, "loss": 0.8822, "step": 2234 }, { "epoch": 0.10654780349438657, "grad_norm": 1.390069603919983, "learning_rate": 1.6393710497126075e-05, "loss": 0.9154, "step": 2235 }, { "epoch": 0.10659547588968608, "grad_norm": 1.2687277793884277, "learning_rate": 1.6390689792179546e-05, "loss": 0.7076, "step": 2236 }, { "epoch": 0.10664314828498558, "grad_norm": 2.731628894805908, "learning_rate": 1.6387668101215397e-05, "loss": 0.9346, "step": 2237 }, { "epoch": 0.10669082068028508, "grad_norm": 1.3221529722213745, "learning_rate": 1.6384645424699835e-05, "loss": 0.8724, "step": 2238 }, { "epoch": 0.10673849307558458, "grad_norm": 1.3455795049667358, "learning_rate": 1.638162176309924e-05, "loss": 0.4084, "step": 2239 }, { "epoch": 0.10678616547088408, "grad_norm": 1.3419333696365356, "learning_rate": 1.637859711688012e-05, "loss": 0.8905, "step": 2240 }, { "epoch": 0.10683383786618358, "grad_norm": 2.7646846771240234, "learning_rate": 1.637557148650915e-05, "loss": 0.8457, "step": 2241 }, { "epoch": 0.10688151026148308, "grad_norm": 6.046072006225586, "learning_rate": 1.637254487245316e-05, "loss": 0.5759, "step": 2242 }, { "epoch": 0.10692918265678258, "grad_norm": 1.567334532737732, "learning_rate": 1.636951727517912e-05, "loss": 0.9092, "step": 2243 }, { "epoch": 0.1069768550520821, "grad_norm": 2.4486351013183594, "learning_rate": 1.6366488695154153e-05, "loss": 1.2778, "step": 2244 }, { "epoch": 0.1070245274473816, "grad_norm": 1.1488258838653564, "learning_rate": 1.636345913284555e-05, "loss": 0.6501, "step": 2245 }, { "epoch": 0.1070721998426811, "grad_norm": 1.7302095890045166, "learning_rate": 1.636042858872073e-05, "loss": 0.942, "step": 2246 }, { "epoch": 0.1071198722379806, "grad_norm": 2.4103667736053467, "learning_rate": 1.6357397063247278e-05, "loss": 0.8577, "step": 2247 }, { "epoch": 0.1071675446332801, "grad_norm": 1.3807567358016968, "learning_rate": 1.6354364556892926e-05, "loss": 0.7379, "step": 2248 }, { "epoch": 0.1072152170285796, "grad_norm": 1.0593301057815552, "learning_rate": 1.6351331070125565e-05, "loss": 0.2429, "step": 2249 }, { "epoch": 0.1072628894238791, "grad_norm": 1.4784756898880005, "learning_rate": 1.634829660341322e-05, "loss": 1.0051, "step": 2250 }, { "epoch": 0.1073105618191786, "grad_norm": 1.805490493774414, "learning_rate": 1.6345261157224088e-05, "loss": 0.9268, "step": 2251 }, { "epoch": 0.10735823421447811, "grad_norm": 1.7306004762649536, "learning_rate": 1.6342224732026503e-05, "loss": 0.4825, "step": 2252 }, { "epoch": 0.10740590660977761, "grad_norm": 1.1426695585250854, "learning_rate": 1.6339187328288953e-05, "loss": 0.5525, "step": 2253 }, { "epoch": 0.10745357900507711, "grad_norm": 1.9162170886993408, "learning_rate": 1.633614894648008e-05, "loss": 0.584, "step": 2254 }, { "epoch": 0.10750125140037661, "grad_norm": 3.6709189414978027, "learning_rate": 1.6333109587068675e-05, "loss": 0.808, "step": 2255 }, { "epoch": 0.10754892379567611, "grad_norm": 1.954192876815796, "learning_rate": 1.6330069250523675e-05, "loss": 0.9295, "step": 2256 }, { "epoch": 0.10759659619097561, "grad_norm": 1.5715235471725464, "learning_rate": 1.6327027937314183e-05, "loss": 0.7345, "step": 2257 }, { "epoch": 0.10764426858627511, "grad_norm": 1.6055649518966675, "learning_rate": 1.632398564790943e-05, "loss": 0.9674, "step": 2258 }, { "epoch": 0.10769194098157461, "grad_norm": 4.366003036499023, "learning_rate": 1.632094238277882e-05, "loss": 0.3948, "step": 2259 }, { "epoch": 0.10773961337687413, "grad_norm": 1.1283854246139526, "learning_rate": 1.631789814239189e-05, "loss": 0.5601, "step": 2260 }, { "epoch": 0.10778728577217363, "grad_norm": 1.6446889638900757, "learning_rate": 1.631485292721834e-05, "loss": 0.9231, "step": 2261 }, { "epoch": 0.10783495816747313, "grad_norm": 1.4033557176589966, "learning_rate": 1.6311806737728016e-05, "loss": 0.6566, "step": 2262 }, { "epoch": 0.10788263056277263, "grad_norm": 2.5270907878875732, "learning_rate": 1.630875957439091e-05, "loss": 0.5646, "step": 2263 }, { "epoch": 0.10793030295807213, "grad_norm": 2.8287105560302734, "learning_rate": 1.6305711437677166e-05, "loss": 0.5786, "step": 2264 }, { "epoch": 0.10797797535337163, "grad_norm": 2.170895576477051, "learning_rate": 1.630266232805709e-05, "loss": 0.9254, "step": 2265 }, { "epoch": 0.10802564774867113, "grad_norm": 1.2984474897384644, "learning_rate": 1.6299612246001118e-05, "loss": 0.6007, "step": 2266 }, { "epoch": 0.10807332014397063, "grad_norm": 2.290714740753174, "learning_rate": 1.6296561191979847e-05, "loss": 1.2458, "step": 2267 }, { "epoch": 0.10812099253927014, "grad_norm": 1.086398959159851, "learning_rate": 1.629350916646403e-05, "loss": 0.5814, "step": 2268 }, { "epoch": 0.10816866493456964, "grad_norm": 1.7387107610702515, "learning_rate": 1.629045616992456e-05, "loss": 0.5262, "step": 2269 }, { "epoch": 0.10821633732986914, "grad_norm": 1.5796606540679932, "learning_rate": 1.628740220283248e-05, "loss": 0.8908, "step": 2270 }, { "epoch": 0.10826400972516864, "grad_norm": 1.919259786605835, "learning_rate": 1.6284347265658986e-05, "loss": 0.6915, "step": 2271 }, { "epoch": 0.10831168212046814, "grad_norm": 1.1323901414871216, "learning_rate": 1.6281291358875427e-05, "loss": 0.5345, "step": 2272 }, { "epoch": 0.10835935451576764, "grad_norm": 2.3234915733337402, "learning_rate": 1.6278234482953296e-05, "loss": 1.365, "step": 2273 }, { "epoch": 0.10840702691106714, "grad_norm": 3.0318877696990967, "learning_rate": 1.627517663836424e-05, "loss": 0.5771, "step": 2274 }, { "epoch": 0.10845469930636664, "grad_norm": 2.742203712463379, "learning_rate": 1.627211782558005e-05, "loss": 0.3859, "step": 2275 }, { "epoch": 0.10850237170166616, "grad_norm": 2.0642952919006348, "learning_rate": 1.6269058045072664e-05, "loss": 0.6774, "step": 2276 }, { "epoch": 0.10855004409696566, "grad_norm": 2.5247888565063477, "learning_rate": 1.626599729731419e-05, "loss": 0.6729, "step": 2277 }, { "epoch": 0.10859771649226516, "grad_norm": 1.4916763305664062, "learning_rate": 1.626293558277685e-05, "loss": 0.7336, "step": 2278 }, { "epoch": 0.10864538888756466, "grad_norm": 1.771559715270996, "learning_rate": 1.6259872901933052e-05, "loss": 0.9819, "step": 2279 }, { "epoch": 0.10869306128286416, "grad_norm": 1.8209635019302368, "learning_rate": 1.6256809255255328e-05, "loss": 0.6779, "step": 2280 }, { "epoch": 0.10874073367816366, "grad_norm": 1.6749701499938965, "learning_rate": 1.625374464321637e-05, "loss": 0.8824, "step": 2281 }, { "epoch": 0.10878840607346316, "grad_norm": 1.253259301185608, "learning_rate": 1.6250679066289015e-05, "loss": 0.4382, "step": 2282 }, { "epoch": 0.10883607846876266, "grad_norm": 1.902420163154602, "learning_rate": 1.624761252494625e-05, "loss": 0.6701, "step": 2283 }, { "epoch": 0.10888375086406217, "grad_norm": 1.1953613758087158, "learning_rate": 1.6244545019661203e-05, "loss": 0.6139, "step": 2284 }, { "epoch": 0.10893142325936167, "grad_norm": 1.5875980854034424, "learning_rate": 1.624147655090717e-05, "loss": 0.6946, "step": 2285 }, { "epoch": 0.10897909565466117, "grad_norm": 1.731711983680725, "learning_rate": 1.6238407119157586e-05, "loss": 0.5091, "step": 2286 }, { "epoch": 0.10902676804996067, "grad_norm": 1.5690521001815796, "learning_rate": 1.623533672488602e-05, "loss": 0.8739, "step": 2287 }, { "epoch": 0.10907444044526017, "grad_norm": 1.5090197324752808, "learning_rate": 1.623226536856621e-05, "loss": 0.7878, "step": 2288 }, { "epoch": 0.10912211284055967, "grad_norm": 1.5838605165481567, "learning_rate": 1.6229193050672036e-05, "loss": 0.7314, "step": 2289 }, { "epoch": 0.10916978523585917, "grad_norm": 1.5384209156036377, "learning_rate": 1.6226119771677517e-05, "loss": 0.8772, "step": 2290 }, { "epoch": 0.10921745763115867, "grad_norm": 1.7850509881973267, "learning_rate": 1.6223045532056838e-05, "loss": 0.9649, "step": 2291 }, { "epoch": 0.10926513002645818, "grad_norm": 1.637871503829956, "learning_rate": 1.6219970332284322e-05, "loss": 1.1274, "step": 2292 }, { "epoch": 0.10931280242175768, "grad_norm": 3.082731246948242, "learning_rate": 1.621689417283443e-05, "loss": 1.1972, "step": 2293 }, { "epoch": 0.10936047481705718, "grad_norm": 1.8796583414077759, "learning_rate": 1.621381705418179e-05, "loss": 0.8037, "step": 2294 }, { "epoch": 0.10940814721235669, "grad_norm": 1.5310776233673096, "learning_rate": 1.6210738976801174e-05, "loss": 0.8149, "step": 2295 }, { "epoch": 0.10945581960765619, "grad_norm": 1.363210916519165, "learning_rate": 1.6207659941167485e-05, "loss": 0.5445, "step": 2296 }, { "epoch": 0.10950349200295569, "grad_norm": 1.5697338581085205, "learning_rate": 1.62045799477558e-05, "loss": 0.9179, "step": 2297 }, { "epoch": 0.10955116439825519, "grad_norm": 2.003814697265625, "learning_rate": 1.620149899704132e-05, "loss": 0.6131, "step": 2298 }, { "epoch": 0.10959883679355469, "grad_norm": 1.8872935771942139, "learning_rate": 1.619841708949941e-05, "loss": 0.5562, "step": 2299 }, { "epoch": 0.1096465091888542, "grad_norm": 2.1485393047332764, "learning_rate": 1.619533422560557e-05, "loss": 0.6668, "step": 2300 }, { "epoch": 0.1096941815841537, "grad_norm": 2.2373406887054443, "learning_rate": 1.619225040583546e-05, "loss": 0.7819, "step": 2301 }, { "epoch": 0.1097418539794532, "grad_norm": 2.7262094020843506, "learning_rate": 1.618916563066488e-05, "loss": 0.857, "step": 2302 }, { "epoch": 0.1097895263747527, "grad_norm": 1.5591273307800293, "learning_rate": 1.6186079900569787e-05, "loss": 0.8508, "step": 2303 }, { "epoch": 0.1098371987700522, "grad_norm": 3.53981614112854, "learning_rate": 1.618299321602626e-05, "loss": 0.6526, "step": 2304 }, { "epoch": 0.1098848711653517, "grad_norm": 1.2734298706054688, "learning_rate": 1.617990557751056e-05, "loss": 0.8925, "step": 2305 }, { "epoch": 0.1099325435606512, "grad_norm": 2.79061222076416, "learning_rate": 1.6176816985499068e-05, "loss": 1.2402, "step": 2306 }, { "epoch": 0.10998021595595071, "grad_norm": 3.840050458908081, "learning_rate": 1.6173727440468318e-05, "loss": 1.0742, "step": 2307 }, { "epoch": 0.11002788835125021, "grad_norm": 1.223572015762329, "learning_rate": 1.6170636942895006e-05, "loss": 0.6545, "step": 2308 }, { "epoch": 0.11007556074654971, "grad_norm": 1.376792550086975, "learning_rate": 1.616754549325596e-05, "loss": 0.7667, "step": 2309 }, { "epoch": 0.11012323314184921, "grad_norm": 1.145013689994812, "learning_rate": 1.6164453092028157e-05, "loss": 0.5677, "step": 2310 }, { "epoch": 0.11017090553714871, "grad_norm": 1.3333736658096313, "learning_rate": 1.616135973968872e-05, "loss": 0.8565, "step": 2311 }, { "epoch": 0.11021857793244821, "grad_norm": 2.5295960903167725, "learning_rate": 1.615826543671493e-05, "loss": 0.9761, "step": 2312 }, { "epoch": 0.11026625032774771, "grad_norm": 3.3088274002075195, "learning_rate": 1.6155170183584195e-05, "loss": 1.1821, "step": 2313 }, { "epoch": 0.11031392272304721, "grad_norm": 2.6220815181732178, "learning_rate": 1.6152073980774093e-05, "loss": 0.7295, "step": 2314 }, { "epoch": 0.11036159511834673, "grad_norm": 5.106561183929443, "learning_rate": 1.6148976828762326e-05, "loss": 1.3027, "step": 2315 }, { "epoch": 0.11040926751364623, "grad_norm": 0.8434693217277527, "learning_rate": 1.6145878728026757e-05, "loss": 0.2973, "step": 2316 }, { "epoch": 0.11045693990894573, "grad_norm": 1.7661564350128174, "learning_rate": 1.6142779679045392e-05, "loss": 0.8119, "step": 2317 }, { "epoch": 0.11050461230424523, "grad_norm": 1.7103074789047241, "learning_rate": 1.613967968229638e-05, "loss": 0.5876, "step": 2318 }, { "epoch": 0.11055228469954473, "grad_norm": 1.7080051898956299, "learning_rate": 1.613657873825802e-05, "loss": 0.6482, "step": 2319 }, { "epoch": 0.11059995709484423, "grad_norm": 1.3071163892745972, "learning_rate": 1.6133476847408754e-05, "loss": 0.9239, "step": 2320 }, { "epoch": 0.11064762949014373, "grad_norm": 1.736224889755249, "learning_rate": 1.6130374010227174e-05, "loss": 1.2638, "step": 2321 }, { "epoch": 0.11069530188544323, "grad_norm": 1.757023811340332, "learning_rate": 1.6127270227192012e-05, "loss": 0.6787, "step": 2322 }, { "epoch": 0.11074297428074274, "grad_norm": 3.932849168777466, "learning_rate": 1.6124165498782156e-05, "loss": 0.0472, "step": 2323 }, { "epoch": 0.11079064667604224, "grad_norm": 4.284938812255859, "learning_rate": 1.612105982547663e-05, "loss": 0.1567, "step": 2324 }, { "epoch": 0.11083831907134174, "grad_norm": 1.4168829917907715, "learning_rate": 1.6117953207754605e-05, "loss": 0.7796, "step": 2325 }, { "epoch": 0.11088599146664124, "grad_norm": 2.0152807235717773, "learning_rate": 1.611484564609541e-05, "loss": 0.6154, "step": 2326 }, { "epoch": 0.11093366386194074, "grad_norm": 1.5113641023635864, "learning_rate": 1.6111737140978495e-05, "loss": 0.5765, "step": 2327 }, { "epoch": 0.11098133625724024, "grad_norm": 1.8552695512771606, "learning_rate": 1.610862769288348e-05, "loss": 0.6431, "step": 2328 }, { "epoch": 0.11102900865253974, "grad_norm": 2.2148361206054688, "learning_rate": 1.6105517302290118e-05, "loss": 0.7583, "step": 2329 }, { "epoch": 0.11107668104783924, "grad_norm": 1.4242758750915527, "learning_rate": 1.6102405969678314e-05, "loss": 0.5657, "step": 2330 }, { "epoch": 0.11112435344313876, "grad_norm": 2.315258264541626, "learning_rate": 1.609929369552811e-05, "loss": 0.7418, "step": 2331 }, { "epoch": 0.11117202583843826, "grad_norm": 1.1028189659118652, "learning_rate": 1.6096180480319698e-05, "loss": 0.3196, "step": 2332 }, { "epoch": 0.11121969823373776, "grad_norm": 1.6975681781768799, "learning_rate": 1.6093066324533413e-05, "loss": 0.7525, "step": 2333 }, { "epoch": 0.11126737062903726, "grad_norm": 1.4082938432693481, "learning_rate": 1.608995122864975e-05, "loss": 0.3855, "step": 2334 }, { "epoch": 0.11131504302433676, "grad_norm": 3.16806697845459, "learning_rate": 1.6086835193149318e-05, "loss": 0.9509, "step": 2335 }, { "epoch": 0.11136271541963626, "grad_norm": 2.600700616836548, "learning_rate": 1.6083718218512904e-05, "loss": 0.9702, "step": 2336 }, { "epoch": 0.11141038781493576, "grad_norm": 1.546226978302002, "learning_rate": 1.6080600305221417e-05, "loss": 0.8758, "step": 2337 }, { "epoch": 0.11145806021023526, "grad_norm": 1.522552490234375, "learning_rate": 1.607748145375592e-05, "loss": 0.4528, "step": 2338 }, { "epoch": 0.11150573260553477, "grad_norm": 1.6641627550125122, "learning_rate": 1.607436166459762e-05, "loss": 0.6523, "step": 2339 }, { "epoch": 0.11155340500083427, "grad_norm": 1.7517926692962646, "learning_rate": 1.607124093822787e-05, "loss": 0.8662, "step": 2340 }, { "epoch": 0.11160107739613377, "grad_norm": 2.037975311279297, "learning_rate": 1.6068119275128165e-05, "loss": 0.8957, "step": 2341 }, { "epoch": 0.11164874979143327, "grad_norm": 1.7879384756088257, "learning_rate": 1.6064996675780146e-05, "loss": 0.9598, "step": 2342 }, { "epoch": 0.11169642218673277, "grad_norm": 1.971345067024231, "learning_rate": 1.60618731406656e-05, "loss": 0.8951, "step": 2343 }, { "epoch": 0.11174409458203227, "grad_norm": 1.9787291288375854, "learning_rate": 1.6058748670266445e-05, "loss": 0.8179, "step": 2344 }, { "epoch": 0.11179176697733177, "grad_norm": 1.6472965478897095, "learning_rate": 1.605562326506477e-05, "loss": 0.9642, "step": 2345 }, { "epoch": 0.11183943937263127, "grad_norm": 2.081470012664795, "learning_rate": 1.6052496925542786e-05, "loss": 1.1336, "step": 2346 }, { "epoch": 0.11188711176793079, "grad_norm": 2.2466204166412354, "learning_rate": 1.6049369652182855e-05, "loss": 1.0396, "step": 2347 }, { "epoch": 0.11193478416323029, "grad_norm": 2.0953478813171387, "learning_rate": 1.604624144546748e-05, "loss": 0.8454, "step": 2348 }, { "epoch": 0.11198245655852979, "grad_norm": 1.3336654901504517, "learning_rate": 1.6043112305879317e-05, "loss": 0.6171, "step": 2349 }, { "epoch": 0.11203012895382929, "grad_norm": 2.746633291244507, "learning_rate": 1.6039982233901155e-05, "loss": 0.9006, "step": 2350 }, { "epoch": 0.11207780134912879, "grad_norm": 1.2693136930465698, "learning_rate": 1.6036851230015935e-05, "loss": 0.6205, "step": 2351 }, { "epoch": 0.11212547374442829, "grad_norm": 1.3461228609085083, "learning_rate": 1.603371929470674e-05, "loss": 0.7251, "step": 2352 }, { "epoch": 0.11217314613972779, "grad_norm": 1.9509644508361816, "learning_rate": 1.603058642845679e-05, "loss": 1.2036, "step": 2353 }, { "epoch": 0.11222081853502729, "grad_norm": 1.3992619514465332, "learning_rate": 1.6027452631749458e-05, "loss": 0.4146, "step": 2354 }, { "epoch": 0.1122684909303268, "grad_norm": 1.40008544921875, "learning_rate": 1.6024317905068255e-05, "loss": 0.8259, "step": 2355 }, { "epoch": 0.1123161633256263, "grad_norm": 2.0970044136047363, "learning_rate": 1.602118224889684e-05, "loss": 0.8051, "step": 2356 }, { "epoch": 0.1123638357209258, "grad_norm": 1.2155324220657349, "learning_rate": 1.601804566371901e-05, "loss": 0.6436, "step": 2357 }, { "epoch": 0.1124115081162253, "grad_norm": 2.929593801498413, "learning_rate": 1.6014908150018703e-05, "loss": 1.1493, "step": 2358 }, { "epoch": 0.1124591805115248, "grad_norm": 1.783107876777649, "learning_rate": 1.601176970828002e-05, "loss": 0.6593, "step": 2359 }, { "epoch": 0.1125068529068243, "grad_norm": 1.6193472146987915, "learning_rate": 1.6008630338987173e-05, "loss": 0.5756, "step": 2360 }, { "epoch": 0.1125545253021238, "grad_norm": 2.6750545501708984, "learning_rate": 1.600549004262454e-05, "loss": 0.6709, "step": 2361 }, { "epoch": 0.1126021976974233, "grad_norm": 1.6343865394592285, "learning_rate": 1.600234881967664e-05, "loss": 0.9063, "step": 2362 }, { "epoch": 0.11264987009272281, "grad_norm": 1.5538705587387085, "learning_rate": 1.599920667062813e-05, "loss": 1.038, "step": 2363 }, { "epoch": 0.11269754248802231, "grad_norm": 1.8014729022979736, "learning_rate": 1.5996063595963813e-05, "loss": 0.7755, "step": 2364 }, { "epoch": 0.11274521488332181, "grad_norm": 2.1251120567321777, "learning_rate": 1.599291959616863e-05, "loss": 0.7735, "step": 2365 }, { "epoch": 0.11279288727862131, "grad_norm": 1.4446736574172974, "learning_rate": 1.5989774671727664e-05, "loss": 0.9707, "step": 2366 }, { "epoch": 0.11284055967392081, "grad_norm": 1.6082050800323486, "learning_rate": 1.598662882312615e-05, "loss": 0.5947, "step": 2367 }, { "epoch": 0.11288823206922032, "grad_norm": 1.1529566049575806, "learning_rate": 1.5983482050849462e-05, "loss": 0.5871, "step": 2368 }, { "epoch": 0.11293590446451982, "grad_norm": 1.4360655546188354, "learning_rate": 1.598033435538311e-05, "loss": 0.5626, "step": 2369 }, { "epoch": 0.11298357685981932, "grad_norm": 2.323112726211548, "learning_rate": 1.5977185737212756e-05, "loss": 0.5323, "step": 2370 }, { "epoch": 0.11303124925511883, "grad_norm": 2.0246341228485107, "learning_rate": 1.597403619682419e-05, "loss": 0.8636, "step": 2371 }, { "epoch": 0.11307892165041833, "grad_norm": 1.5008585453033447, "learning_rate": 1.5970885734703363e-05, "loss": 0.7506, "step": 2372 }, { "epoch": 0.11312659404571783, "grad_norm": 1.4700300693511963, "learning_rate": 1.5967734351336354e-05, "loss": 0.8429, "step": 2373 }, { "epoch": 0.11317426644101733, "grad_norm": 1.5748525857925415, "learning_rate": 1.5964582047209392e-05, "loss": 0.2053, "step": 2374 }, { "epoch": 0.11322193883631683, "grad_norm": 1.506453514099121, "learning_rate": 1.596142882280884e-05, "loss": 0.8866, "step": 2375 }, { "epoch": 0.11326961123161633, "grad_norm": 1.489759087562561, "learning_rate": 1.5958274678621217e-05, "loss": 0.6125, "step": 2376 }, { "epoch": 0.11331728362691583, "grad_norm": 1.5987987518310547, "learning_rate": 1.5955119615133163e-05, "loss": 0.6383, "step": 2377 }, { "epoch": 0.11336495602221533, "grad_norm": 1.2040554285049438, "learning_rate": 1.5951963632831482e-05, "loss": 0.6015, "step": 2378 }, { "epoch": 0.11341262841751484, "grad_norm": 1.6333727836608887, "learning_rate": 1.5948806732203105e-05, "loss": 0.855, "step": 2379 }, { "epoch": 0.11346030081281434, "grad_norm": 1.3919745683670044, "learning_rate": 1.594564891373511e-05, "loss": 0.6924, "step": 2380 }, { "epoch": 0.11350797320811384, "grad_norm": 2.072422504425049, "learning_rate": 1.5942490177914715e-05, "loss": 0.5915, "step": 2381 }, { "epoch": 0.11355564560341334, "grad_norm": 2.358393669128418, "learning_rate": 1.5939330525229285e-05, "loss": 1.1379, "step": 2382 }, { "epoch": 0.11360331799871284, "grad_norm": 1.6451317071914673, "learning_rate": 1.5936169956166316e-05, "loss": 0.9774, "step": 2383 }, { "epoch": 0.11365099039401234, "grad_norm": 1.566930890083313, "learning_rate": 1.593300847121345e-05, "loss": 0.7863, "step": 2384 }, { "epoch": 0.11369866278931184, "grad_norm": 2.387988567352295, "learning_rate": 1.592984607085848e-05, "loss": 0.8635, "step": 2385 }, { "epoch": 0.11374633518461134, "grad_norm": 1.7774720191955566, "learning_rate": 1.5926682755589325e-05, "loss": 0.4223, "step": 2386 }, { "epoch": 0.11379400757991086, "grad_norm": 1.0116852521896362, "learning_rate": 1.5923518525894053e-05, "loss": 0.2378, "step": 2387 }, { "epoch": 0.11384167997521036, "grad_norm": 1.6674070358276367, "learning_rate": 1.5920353382260876e-05, "loss": 0.6554, "step": 2388 }, { "epoch": 0.11388935237050986, "grad_norm": 1.2811578512191772, "learning_rate": 1.591718732517814e-05, "loss": 0.4141, "step": 2389 }, { "epoch": 0.11393702476580936, "grad_norm": 1.4084479808807373, "learning_rate": 1.5914020355134333e-05, "loss": 0.7754, "step": 2390 }, { "epoch": 0.11398469716110886, "grad_norm": 1.144248127937317, "learning_rate": 1.5910852472618085e-05, "loss": 0.4276, "step": 2391 }, { "epoch": 0.11403236955640836, "grad_norm": 1.5653167963027954, "learning_rate": 1.5907683678118173e-05, "loss": 0.4606, "step": 2392 }, { "epoch": 0.11408004195170786, "grad_norm": 3.081538677215576, "learning_rate": 1.5904513972123507e-05, "loss": 0.6316, "step": 2393 }, { "epoch": 0.11412771434700736, "grad_norm": 2.0674731731414795, "learning_rate": 1.590134335512314e-05, "loss": 0.7435, "step": 2394 }, { "epoch": 0.11417538674230687, "grad_norm": 1.1712582111358643, "learning_rate": 1.5898171827606264e-05, "loss": 0.3855, "step": 2395 }, { "epoch": 0.11422305913760637, "grad_norm": 1.101927638053894, "learning_rate": 1.5894999390062216e-05, "loss": 0.5076, "step": 2396 }, { "epoch": 0.11427073153290587, "grad_norm": 2.455082416534424, "learning_rate": 1.5891826042980468e-05, "loss": 0.731, "step": 2397 }, { "epoch": 0.11431840392820537, "grad_norm": 2.9067742824554443, "learning_rate": 1.5888651786850638e-05, "loss": 0.8672, "step": 2398 }, { "epoch": 0.11436607632350487, "grad_norm": 1.4049131870269775, "learning_rate": 1.5885476622162478e-05, "loss": 0.7721, "step": 2399 }, { "epoch": 0.11441374871880437, "grad_norm": 1.8493523597717285, "learning_rate": 1.588230054940588e-05, "loss": 0.7796, "step": 2400 }, { "epoch": 0.11446142111410387, "grad_norm": 0.7470552921295166, "learning_rate": 1.5879123569070888e-05, "loss": 0.3024, "step": 2401 }, { "epoch": 0.11450909350940339, "grad_norm": 1.6520780324935913, "learning_rate": 1.5875945681647672e-05, "loss": 0.5836, "step": 2402 }, { "epoch": 0.11455676590470289, "grad_norm": 1.1575690507888794, "learning_rate": 1.5872766887626546e-05, "loss": 0.6283, "step": 2403 }, { "epoch": 0.11460443830000239, "grad_norm": 1.4639043807983398, "learning_rate": 1.5869587187497965e-05, "loss": 0.659, "step": 2404 }, { "epoch": 0.11465211069530189, "grad_norm": 1.45045804977417, "learning_rate": 1.586640658175253e-05, "loss": 0.6051, "step": 2405 }, { "epoch": 0.11469978309060139, "grad_norm": 1.6251894235610962, "learning_rate": 1.586322507088097e-05, "loss": 0.6632, "step": 2406 }, { "epoch": 0.11474745548590089, "grad_norm": 1.2037824392318726, "learning_rate": 1.586004265537416e-05, "loss": 0.7546, "step": 2407 }, { "epoch": 0.11479512788120039, "grad_norm": 2.1143577098846436, "learning_rate": 1.585685933572312e-05, "loss": 0.6622, "step": 2408 }, { "epoch": 0.11484280027649989, "grad_norm": 1.87221097946167, "learning_rate": 1.5853675112418994e-05, "loss": 0.9798, "step": 2409 }, { "epoch": 0.1148904726717994, "grad_norm": 2.089597702026367, "learning_rate": 1.5850489985953076e-05, "loss": 0.9001, "step": 2410 }, { "epoch": 0.1149381450670989, "grad_norm": 2.2795889377593994, "learning_rate": 1.5847303956816808e-05, "loss": 0.8602, "step": 2411 }, { "epoch": 0.1149858174623984, "grad_norm": 2.171499013900757, "learning_rate": 1.5844117025501753e-05, "loss": 0.6345, "step": 2412 }, { "epoch": 0.1150334898576979, "grad_norm": 1.6256190538406372, "learning_rate": 1.584092919249962e-05, "loss": 0.8459, "step": 2413 }, { "epoch": 0.1150811622529974, "grad_norm": 1.9238924980163574, "learning_rate": 1.583774045830227e-05, "loss": 0.8546, "step": 2414 }, { "epoch": 0.1151288346482969, "grad_norm": 1.3114979267120361, "learning_rate": 1.583455082340168e-05, "loss": 0.4359, "step": 2415 }, { "epoch": 0.1151765070435964, "grad_norm": 1.086281180381775, "learning_rate": 1.583136028828998e-05, "loss": 0.4448, "step": 2416 }, { "epoch": 0.1152241794388959, "grad_norm": 1.503161072731018, "learning_rate": 1.5828168853459445e-05, "loss": 0.6896, "step": 2417 }, { "epoch": 0.11527185183419542, "grad_norm": 1.8496677875518799, "learning_rate": 1.582497651940247e-05, "loss": 0.5931, "step": 2418 }, { "epoch": 0.11531952422949492, "grad_norm": 1.3385798931121826, "learning_rate": 1.5821783286611604e-05, "loss": 0.8231, "step": 2419 }, { "epoch": 0.11536719662479442, "grad_norm": 1.5753676891326904, "learning_rate": 1.581858915557953e-05, "loss": 0.7206, "step": 2420 }, { "epoch": 0.11541486902009392, "grad_norm": 3.8933351039886475, "learning_rate": 1.581539412679907e-05, "loss": 0.6449, "step": 2421 }, { "epoch": 0.11546254141539342, "grad_norm": 12.029458045959473, "learning_rate": 1.581219820076318e-05, "loss": 0.9718, "step": 2422 }, { "epoch": 0.11551021381069292, "grad_norm": 2.0000948905944824, "learning_rate": 1.5809001377964966e-05, "loss": 0.9009, "step": 2423 }, { "epoch": 0.11555788620599242, "grad_norm": 1.906844139099121, "learning_rate": 1.580580365889766e-05, "loss": 0.7576, "step": 2424 }, { "epoch": 0.11560555860129192, "grad_norm": 2.046762704849243, "learning_rate": 1.5802605044054638e-05, "loss": 0.9825, "step": 2425 }, { "epoch": 0.11565323099659143, "grad_norm": 1.1829183101654053, "learning_rate": 1.579940553392941e-05, "loss": 0.8468, "step": 2426 }, { "epoch": 0.11570090339189093, "grad_norm": 1.6301301717758179, "learning_rate": 1.579620512901563e-05, "loss": 0.7414, "step": 2427 }, { "epoch": 0.11574857578719043, "grad_norm": 3.3583359718322754, "learning_rate": 1.579300382980709e-05, "loss": 0.8273, "step": 2428 }, { "epoch": 0.11579624818248993, "grad_norm": 2.3643252849578857, "learning_rate": 1.5789801636797718e-05, "loss": 1.4061, "step": 2429 }, { "epoch": 0.11584392057778943, "grad_norm": 1.3170373439788818, "learning_rate": 1.5786598550481573e-05, "loss": 0.7628, "step": 2430 }, { "epoch": 0.11589159297308893, "grad_norm": 1.9027100801467896, "learning_rate": 1.5783394571352863e-05, "loss": 0.8552, "step": 2431 }, { "epoch": 0.11593926536838843, "grad_norm": 1.4820061922073364, "learning_rate": 1.5780189699905928e-05, "loss": 0.6201, "step": 2432 }, { "epoch": 0.11598693776368793, "grad_norm": 1.4014887809753418, "learning_rate": 1.577698393663525e-05, "loss": 0.4637, "step": 2433 }, { "epoch": 0.11603461015898744, "grad_norm": 1.0882154703140259, "learning_rate": 1.5773777282035437e-05, "loss": 0.8005, "step": 2434 }, { "epoch": 0.11608228255428694, "grad_norm": 1.066767930984497, "learning_rate": 1.577056973660125e-05, "loss": 0.5197, "step": 2435 }, { "epoch": 0.11612995494958644, "grad_norm": 1.0935394763946533, "learning_rate": 1.5767361300827577e-05, "loss": 0.69, "step": 2436 }, { "epoch": 0.11617762734488594, "grad_norm": 1.9463183879852295, "learning_rate": 1.576415197520945e-05, "loss": 1.0335, "step": 2437 }, { "epoch": 0.11622529974018544, "grad_norm": 1.5867832899093628, "learning_rate": 1.576094176024203e-05, "loss": 0.1858, "step": 2438 }, { "epoch": 0.11627297213548494, "grad_norm": 2.1434361934661865, "learning_rate": 1.5757730656420626e-05, "loss": 0.5396, "step": 2439 }, { "epoch": 0.11632064453078444, "grad_norm": 2.3051917552948, "learning_rate": 1.575451866424067e-05, "loss": 0.7064, "step": 2440 }, { "epoch": 0.11636831692608395, "grad_norm": 1.6615920066833496, "learning_rate": 1.5751305784197746e-05, "loss": 0.9084, "step": 2441 }, { "epoch": 0.11641598932138346, "grad_norm": 2.000019073486328, "learning_rate": 1.5748092016787567e-05, "loss": 0.719, "step": 2442 }, { "epoch": 0.11646366171668296, "grad_norm": 3.775541305541992, "learning_rate": 1.5744877362505987e-05, "loss": 0.7955, "step": 2443 }, { "epoch": 0.11651133411198246, "grad_norm": 1.6770176887512207, "learning_rate": 1.5741661821848983e-05, "loss": 1.0948, "step": 2444 }, { "epoch": 0.11655900650728196, "grad_norm": 1.441290020942688, "learning_rate": 1.5738445395312694e-05, "loss": 0.6975, "step": 2445 }, { "epoch": 0.11660667890258146, "grad_norm": 1.6863912343978882, "learning_rate": 1.5735228083393373e-05, "loss": 0.682, "step": 2446 }, { "epoch": 0.11665435129788096, "grad_norm": 1.180029034614563, "learning_rate": 1.573200988658742e-05, "loss": 0.7952, "step": 2447 }, { "epoch": 0.11670202369318046, "grad_norm": 1.6943645477294922, "learning_rate": 1.572879080539137e-05, "loss": 0.5094, "step": 2448 }, { "epoch": 0.11674969608847996, "grad_norm": 1.7495173215866089, "learning_rate": 1.5725570840301897e-05, "loss": 0.7735, "step": 2449 }, { "epoch": 0.11679736848377947, "grad_norm": 1.1879079341888428, "learning_rate": 1.5722349991815802e-05, "loss": 0.3279, "step": 2450 }, { "epoch": 0.11684504087907897, "grad_norm": 3.4411327838897705, "learning_rate": 1.571912826043003e-05, "loss": 1.0245, "step": 2451 }, { "epoch": 0.11689271327437847, "grad_norm": 3.4305639266967773, "learning_rate": 1.5715905646641666e-05, "loss": 0.5089, "step": 2452 }, { "epoch": 0.11694038566967797, "grad_norm": 1.467454195022583, "learning_rate": 1.5712682150947926e-05, "loss": 0.4662, "step": 2453 }, { "epoch": 0.11698805806497747, "grad_norm": 2.7015254497528076, "learning_rate": 1.5709457773846155e-05, "loss": 0.9322, "step": 2454 }, { "epoch": 0.11703573046027697, "grad_norm": 1.893823266029358, "learning_rate": 1.5706232515833842e-05, "loss": 0.7901, "step": 2455 }, { "epoch": 0.11708340285557647, "grad_norm": 1.3589909076690674, "learning_rate": 1.5703006377408623e-05, "loss": 0.8506, "step": 2456 }, { "epoch": 0.11713107525087597, "grad_norm": 3.398231267929077, "learning_rate": 1.5699779359068248e-05, "loss": 0.7117, "step": 2457 }, { "epoch": 0.11717874764617549, "grad_norm": 1.307833194732666, "learning_rate": 1.569655146131061e-05, "loss": 0.9439, "step": 2458 }, { "epoch": 0.11722642004147499, "grad_norm": 1.4641598463058472, "learning_rate": 1.5693322684633747e-05, "loss": 0.5693, "step": 2459 }, { "epoch": 0.11727409243677449, "grad_norm": 1.35435950756073, "learning_rate": 1.5690093029535824e-05, "loss": 0.6255, "step": 2460 }, { "epoch": 0.11732176483207399, "grad_norm": 1.9741376638412476, "learning_rate": 1.5686862496515142e-05, "loss": 0.4204, "step": 2461 }, { "epoch": 0.11736943722737349, "grad_norm": 5.268206596374512, "learning_rate": 1.568363108607014e-05, "loss": 0.2695, "step": 2462 }, { "epoch": 0.11741710962267299, "grad_norm": 1.644921898841858, "learning_rate": 1.5680398798699395e-05, "loss": 0.4717, "step": 2463 }, { "epoch": 0.11746478201797249, "grad_norm": 6.985912322998047, "learning_rate": 1.5677165634901607e-05, "loss": 1.2432, "step": 2464 }, { "epoch": 0.11751245441327199, "grad_norm": 1.51718008518219, "learning_rate": 1.567393159517563e-05, "loss": 0.7786, "step": 2465 }, { "epoch": 0.1175601268085715, "grad_norm": 1.3289611339569092, "learning_rate": 1.5670696680020433e-05, "loss": 0.5209, "step": 2466 }, { "epoch": 0.117607799203871, "grad_norm": 1.9150704145431519, "learning_rate": 1.5667460889935138e-05, "loss": 0.9846, "step": 2467 }, { "epoch": 0.1176554715991705, "grad_norm": 2.3162319660186768, "learning_rate": 1.566422422541899e-05, "loss": 0.8315, "step": 2468 }, { "epoch": 0.11770314399447, "grad_norm": 1.7796815633773804, "learning_rate": 1.5660986686971377e-05, "loss": 0.7327, "step": 2469 }, { "epoch": 0.1177508163897695, "grad_norm": 1.4785791635513306, "learning_rate": 1.565774827509181e-05, "loss": 0.4438, "step": 2470 }, { "epoch": 0.117798488785069, "grad_norm": 3.441373825073242, "learning_rate": 1.565450899027995e-05, "loss": 0.9713, "step": 2471 }, { "epoch": 0.1178461611803685, "grad_norm": 1.719574213027954, "learning_rate": 1.5651268833035585e-05, "loss": 0.8168, "step": 2472 }, { "epoch": 0.117893833575668, "grad_norm": 3.02345871925354, "learning_rate": 1.5648027803858635e-05, "loss": 0.7226, "step": 2473 }, { "epoch": 0.11794150597096752, "grad_norm": 2.9506583213806152, "learning_rate": 1.564478590324916e-05, "loss": 0.7869, "step": 2474 }, { "epoch": 0.11798917836626702, "grad_norm": 1.3805612325668335, "learning_rate": 1.5641543131707345e-05, "loss": 0.6029, "step": 2475 }, { "epoch": 0.11803685076156652, "grad_norm": 2.700631618499756, "learning_rate": 1.5638299489733525e-05, "loss": 0.596, "step": 2476 }, { "epoch": 0.11808452315686602, "grad_norm": 1.7731691598892212, "learning_rate": 1.5635054977828156e-05, "loss": 0.7278, "step": 2477 }, { "epoch": 0.11813219555216552, "grad_norm": 2.4622464179992676, "learning_rate": 1.5631809596491833e-05, "loss": 1.2707, "step": 2478 }, { "epoch": 0.11817986794746502, "grad_norm": 1.9097596406936646, "learning_rate": 1.562856334622529e-05, "loss": 0.7455, "step": 2479 }, { "epoch": 0.11822754034276452, "grad_norm": 1.5807631015777588, "learning_rate": 1.5625316227529382e-05, "loss": 0.7761, "step": 2480 }, { "epoch": 0.11827521273806402, "grad_norm": 1.8797825574874878, "learning_rate": 1.562206824090511e-05, "loss": 0.7468, "step": 2481 }, { "epoch": 0.11832288513336353, "grad_norm": 2.1505680084228516, "learning_rate": 1.5618819386853607e-05, "loss": 0.7845, "step": 2482 }, { "epoch": 0.11837055752866303, "grad_norm": 1.4331556558609009, "learning_rate": 1.5615569665876132e-05, "loss": 0.5999, "step": 2483 }, { "epoch": 0.11841822992396253, "grad_norm": 1.5098989009857178, "learning_rate": 1.5612319078474087e-05, "loss": 0.7021, "step": 2484 }, { "epoch": 0.11846590231926203, "grad_norm": 2.388648509979248, "learning_rate": 1.5609067625149007e-05, "loss": 1.2, "step": 2485 }, { "epoch": 0.11851357471456153, "grad_norm": 1.6561309099197388, "learning_rate": 1.560581530640255e-05, "loss": 0.684, "step": 2486 }, { "epoch": 0.11856124710986103, "grad_norm": 1.3266593217849731, "learning_rate": 1.5602562122736526e-05, "loss": 0.9018, "step": 2487 }, { "epoch": 0.11860891950516053, "grad_norm": 1.415453314781189, "learning_rate": 1.5599308074652856e-05, "loss": 0.6789, "step": 2488 }, { "epoch": 0.11865659190046003, "grad_norm": 1.4097321033477783, "learning_rate": 1.5596053162653612e-05, "loss": 0.6088, "step": 2489 }, { "epoch": 0.11870426429575955, "grad_norm": 1.3818477392196655, "learning_rate": 1.5592797387240996e-05, "loss": 0.615, "step": 2490 }, { "epoch": 0.11875193669105905, "grad_norm": 1.5520590543746948, "learning_rate": 1.5589540748917336e-05, "loss": 0.5646, "step": 2491 }, { "epoch": 0.11879960908635855, "grad_norm": 2.6001250743865967, "learning_rate": 1.5586283248185102e-05, "loss": 0.7905, "step": 2492 }, { "epoch": 0.11884728148165805, "grad_norm": 2.243886947631836, "learning_rate": 1.5583024885546887e-05, "loss": 0.5465, "step": 2493 }, { "epoch": 0.11889495387695755, "grad_norm": 1.5221590995788574, "learning_rate": 1.557976566150543e-05, "loss": 0.4085, "step": 2494 }, { "epoch": 0.11894262627225705, "grad_norm": 1.786395788192749, "learning_rate": 1.5576505576563587e-05, "loss": 0.7562, "step": 2495 }, { "epoch": 0.11899029866755655, "grad_norm": 1.981576919555664, "learning_rate": 1.5573244631224364e-05, "loss": 0.6257, "step": 2496 }, { "epoch": 0.11903797106285606, "grad_norm": 2.3802356719970703, "learning_rate": 1.556998282599089e-05, "loss": 0.8141, "step": 2497 }, { "epoch": 0.11908564345815556, "grad_norm": 1.1556748151779175, "learning_rate": 1.5566720161366423e-05, "loss": 0.7201, "step": 2498 }, { "epoch": 0.11913331585345506, "grad_norm": 1.6112884283065796, "learning_rate": 1.556345663785436e-05, "loss": 0.624, "step": 2499 }, { "epoch": 0.11918098824875456, "grad_norm": 1.3968398571014404, "learning_rate": 1.556019225595823e-05, "loss": 0.6489, "step": 2500 }, { "epoch": 0.11922866064405406, "grad_norm": 1.6038421392440796, "learning_rate": 1.5556927016181694e-05, "loss": 0.943, "step": 2501 }, { "epoch": 0.11927633303935356, "grad_norm": 1.491059422492981, "learning_rate": 1.555366091902855e-05, "loss": 1.0027, "step": 2502 }, { "epoch": 0.11932400543465306, "grad_norm": 1.804369330406189, "learning_rate": 1.5550393965002712e-05, "loss": 1.1136, "step": 2503 }, { "epoch": 0.11937167782995256, "grad_norm": 2.3129498958587646, "learning_rate": 1.5547126154608246e-05, "loss": 0.5736, "step": 2504 }, { "epoch": 0.11941935022525207, "grad_norm": 1.2741669416427612, "learning_rate": 1.5543857488349335e-05, "loss": 0.7488, "step": 2505 }, { "epoch": 0.11946702262055157, "grad_norm": 1.6305344104766846, "learning_rate": 1.5540587966730306e-05, "loss": 1.0591, "step": 2506 }, { "epoch": 0.11951469501585107, "grad_norm": 3.3316562175750732, "learning_rate": 1.553731759025561e-05, "loss": 1.1484, "step": 2507 }, { "epoch": 0.11956236741115057, "grad_norm": 1.5285416841506958, "learning_rate": 1.553404635942984e-05, "loss": 0.7253, "step": 2508 }, { "epoch": 0.11961003980645007, "grad_norm": 2.707442283630371, "learning_rate": 1.5530774274757697e-05, "loss": 1.0659, "step": 2509 }, { "epoch": 0.11965771220174957, "grad_norm": 1.932203769683838, "learning_rate": 1.5527501336744046e-05, "loss": 0.7343, "step": 2510 }, { "epoch": 0.11970538459704907, "grad_norm": 1.8537970781326294, "learning_rate": 1.5524227545893856e-05, "loss": 0.8658, "step": 2511 }, { "epoch": 0.11975305699234857, "grad_norm": 1.5665045976638794, "learning_rate": 1.5520952902712246e-05, "loss": 0.8468, "step": 2512 }, { "epoch": 0.11980072938764809, "grad_norm": 1.405781865119934, "learning_rate": 1.551767740770446e-05, "loss": 0.7479, "step": 2513 }, { "epoch": 0.11984840178294759, "grad_norm": 1.7779722213745117, "learning_rate": 1.5514401061375873e-05, "loss": 0.539, "step": 2514 }, { "epoch": 0.11989607417824709, "grad_norm": 1.0430759191513062, "learning_rate": 1.5511123864231983e-05, "loss": 0.4346, "step": 2515 }, { "epoch": 0.11994374657354659, "grad_norm": 3.0844762325286865, "learning_rate": 1.550784581677844e-05, "loss": 0.4899, "step": 2516 }, { "epoch": 0.11999141896884609, "grad_norm": 2.4925875663757324, "learning_rate": 1.5504566919521e-05, "loss": 1.1701, "step": 2517 }, { "epoch": 0.12003909136414559, "grad_norm": 1.3920817375183105, "learning_rate": 1.550128717296558e-05, "loss": 0.9856, "step": 2518 }, { "epoch": 0.12008676375944509, "grad_norm": 1.6713967323303223, "learning_rate": 1.5498006577618194e-05, "loss": 0.4637, "step": 2519 }, { "epoch": 0.12013443615474459, "grad_norm": 2.2760868072509766, "learning_rate": 1.5494725133985014e-05, "loss": 0.9592, "step": 2520 }, { "epoch": 0.1201821085500441, "grad_norm": 2.14618182182312, "learning_rate": 1.549144284257233e-05, "loss": 0.7225, "step": 2521 }, { "epoch": 0.1202297809453436, "grad_norm": 1.916582703590393, "learning_rate": 1.548815970388657e-05, "loss": 0.8079, "step": 2522 }, { "epoch": 0.1202774533406431, "grad_norm": 1.9218018054962158, "learning_rate": 1.5484875718434284e-05, "loss": 0.7001, "step": 2523 }, { "epoch": 0.1203251257359426, "grad_norm": 1.4419703483581543, "learning_rate": 1.5481590886722154e-05, "loss": 0.5294, "step": 2524 }, { "epoch": 0.1203727981312421, "grad_norm": 2.566704511642456, "learning_rate": 1.5478305209257004e-05, "loss": 0.6617, "step": 2525 }, { "epoch": 0.1204204705265416, "grad_norm": 3.018068790435791, "learning_rate": 1.547501868654577e-05, "loss": 0.556, "step": 2526 }, { "epoch": 0.1204681429218411, "grad_norm": 7.7364115715026855, "learning_rate": 1.5471731319095537e-05, "loss": 0.5175, "step": 2527 }, { "epoch": 0.1205158153171406, "grad_norm": 1.0991004705429077, "learning_rate": 1.5468443107413512e-05, "loss": 0.5897, "step": 2528 }, { "epoch": 0.12056348771244012, "grad_norm": 3.0219228267669678, "learning_rate": 1.5465154052007027e-05, "loss": 0.2365, "step": 2529 }, { "epoch": 0.12061116010773962, "grad_norm": 1.6062054634094238, "learning_rate": 1.5461864153383555e-05, "loss": 0.1371, "step": 2530 }, { "epoch": 0.12065883250303912, "grad_norm": 1.0779755115509033, "learning_rate": 1.5458573412050688e-05, "loss": 0.7673, "step": 2531 }, { "epoch": 0.12070650489833862, "grad_norm": 1.4383097887039185, "learning_rate": 1.5455281828516152e-05, "loss": 0.9756, "step": 2532 }, { "epoch": 0.12075417729363812, "grad_norm": 1.650024175643921, "learning_rate": 1.5451989403287816e-05, "loss": 0.6756, "step": 2533 }, { "epoch": 0.12080184968893762, "grad_norm": 1.4167653322219849, "learning_rate": 1.544869613687366e-05, "loss": 0.2193, "step": 2534 }, { "epoch": 0.12084952208423712, "grad_norm": 0.9266788363456726, "learning_rate": 1.5445402029781792e-05, "loss": 0.4611, "step": 2535 }, { "epoch": 0.12089719447953662, "grad_norm": 2.4128856658935547, "learning_rate": 1.5442107082520475e-05, "loss": 1.0507, "step": 2536 }, { "epoch": 0.12094486687483613, "grad_norm": 2.535980463027954, "learning_rate": 1.5438811295598075e-05, "loss": 0.7295, "step": 2537 }, { "epoch": 0.12099253927013563, "grad_norm": 4.242743015289307, "learning_rate": 1.5435514669523102e-05, "loss": 0.8476, "step": 2538 }, { "epoch": 0.12104021166543513, "grad_norm": 1.952236294746399, "learning_rate": 1.543221720480419e-05, "loss": 1.0177, "step": 2539 }, { "epoch": 0.12108788406073463, "grad_norm": 5.952898979187012, "learning_rate": 1.5428918901950105e-05, "loss": 0.8368, "step": 2540 }, { "epoch": 0.12113555645603413, "grad_norm": 4.943411827087402, "learning_rate": 1.542561976146974e-05, "loss": 0.7786, "step": 2541 }, { "epoch": 0.12118322885133363, "grad_norm": 1.798194169998169, "learning_rate": 1.5422319783872118e-05, "loss": 1.1188, "step": 2542 }, { "epoch": 0.12123090124663313, "grad_norm": 1.5484708547592163, "learning_rate": 1.5419018969666396e-05, "loss": 0.9368, "step": 2543 }, { "epoch": 0.12127857364193263, "grad_norm": 1.478027105331421, "learning_rate": 1.541571731936185e-05, "loss": 0.7332, "step": 2544 }, { "epoch": 0.12132624603723215, "grad_norm": 1.9182305335998535, "learning_rate": 1.5412414833467887e-05, "loss": 0.7148, "step": 2545 }, { "epoch": 0.12137391843253165, "grad_norm": 1.8540362119674683, "learning_rate": 1.540911151249406e-05, "loss": 0.815, "step": 2546 }, { "epoch": 0.12142159082783115, "grad_norm": 1.6913847923278809, "learning_rate": 1.5405807356950028e-05, "loss": 1.1134, "step": 2547 }, { "epoch": 0.12146926322313065, "grad_norm": 1.5570502281188965, "learning_rate": 1.5402502367345588e-05, "loss": 0.9728, "step": 2548 }, { "epoch": 0.12151693561843015, "grad_norm": 1.8802764415740967, "learning_rate": 1.5399196544190668e-05, "loss": 0.7477, "step": 2549 }, { "epoch": 0.12156460801372965, "grad_norm": 1.8015141487121582, "learning_rate": 1.5395889887995324e-05, "loss": 0.8685, "step": 2550 }, { "epoch": 0.12161228040902915, "grad_norm": 3.0751209259033203, "learning_rate": 1.5392582399269735e-05, "loss": 0.7288, "step": 2551 }, { "epoch": 0.12165995280432865, "grad_norm": 2.1915316581726074, "learning_rate": 1.5389274078524217e-05, "loss": 0.8262, "step": 2552 }, { "epoch": 0.12170762519962816, "grad_norm": 1.7919362783432007, "learning_rate": 1.5385964926269206e-05, "loss": 0.9529, "step": 2553 }, { "epoch": 0.12175529759492766, "grad_norm": 1.8436754941940308, "learning_rate": 1.5382654943015274e-05, "loss": 0.7763, "step": 2554 }, { "epoch": 0.12180296999022716, "grad_norm": 1.9005080461502075, "learning_rate": 1.5379344129273112e-05, "loss": 0.8681, "step": 2555 }, { "epoch": 0.12185064238552666, "grad_norm": 1.5322390794754028, "learning_rate": 1.5376032485553543e-05, "loss": 0.7952, "step": 2556 }, { "epoch": 0.12189831478082616, "grad_norm": 1.448270559310913, "learning_rate": 1.5372720012367532e-05, "loss": 0.9949, "step": 2557 }, { "epoch": 0.12194598717612566, "grad_norm": 1.3214863538742065, "learning_rate": 1.5369406710226147e-05, "loss": 0.8112, "step": 2558 }, { "epoch": 0.12199365957142516, "grad_norm": 1.4696680307388306, "learning_rate": 1.5366092579640604e-05, "loss": 0.5208, "step": 2559 }, { "epoch": 0.12204133196672466, "grad_norm": 1.8419678211212158, "learning_rate": 1.5362777621122235e-05, "loss": 0.6654, "step": 2560 }, { "epoch": 0.12208900436202418, "grad_norm": 1.5617502927780151, "learning_rate": 1.5359461835182507e-05, "loss": 0.6595, "step": 2561 }, { "epoch": 0.12213667675732368, "grad_norm": 2.430222272872925, "learning_rate": 1.5356145222333006e-05, "loss": 0.4169, "step": 2562 }, { "epoch": 0.12218434915262318, "grad_norm": 3.5594234466552734, "learning_rate": 1.5352827783085453e-05, "loss": 0.4826, "step": 2563 }, { "epoch": 0.12223202154792268, "grad_norm": 1.7489187717437744, "learning_rate": 1.53495095179517e-05, "loss": 0.7814, "step": 2564 }, { "epoch": 0.12227969394322218, "grad_norm": 1.509758710861206, "learning_rate": 1.5346190427443716e-05, "loss": 0.8884, "step": 2565 }, { "epoch": 0.12232736633852168, "grad_norm": 1.1786763668060303, "learning_rate": 1.5342870512073605e-05, "loss": 0.7114, "step": 2566 }, { "epoch": 0.12237503873382118, "grad_norm": 3.198537588119507, "learning_rate": 1.5339549772353595e-05, "loss": 1.4127, "step": 2567 }, { "epoch": 0.12242271112912068, "grad_norm": 1.3353036642074585, "learning_rate": 1.533622820879604e-05, "loss": 0.702, "step": 2568 }, { "epoch": 0.12247038352442019, "grad_norm": 1.7240386009216309, "learning_rate": 1.533290582191343e-05, "loss": 0.5939, "step": 2569 }, { "epoch": 0.12251805591971969, "grad_norm": 4.358938694000244, "learning_rate": 1.5329582612218366e-05, "loss": 0.565, "step": 2570 }, { "epoch": 0.12256572831501919, "grad_norm": 1.288405179977417, "learning_rate": 1.532625858022359e-05, "loss": 0.6065, "step": 2571 }, { "epoch": 0.12261340071031869, "grad_norm": 1.627985954284668, "learning_rate": 1.5322933726441963e-05, "loss": 0.7879, "step": 2572 }, { "epoch": 0.12266107310561819, "grad_norm": 1.7537492513656616, "learning_rate": 1.531960805138648e-05, "loss": 0.5376, "step": 2573 }, { "epoch": 0.12270874550091769, "grad_norm": 1.8633304834365845, "learning_rate": 1.5316281555570258e-05, "loss": 0.3876, "step": 2574 }, { "epoch": 0.12275641789621719, "grad_norm": 2.610959768295288, "learning_rate": 1.5312954239506536e-05, "loss": 0.6771, "step": 2575 }, { "epoch": 0.12280409029151669, "grad_norm": 1.6779143810272217, "learning_rate": 1.530962610370869e-05, "loss": 0.5416, "step": 2576 }, { "epoch": 0.1228517626868162, "grad_norm": 3.0123038291931152, "learning_rate": 1.530629714869021e-05, "loss": 0.2452, "step": 2577 }, { "epoch": 0.1228994350821157, "grad_norm": 1.634108304977417, "learning_rate": 1.5302967374964727e-05, "loss": 0.4345, "step": 2578 }, { "epoch": 0.1229471074774152, "grad_norm": 1.60711669921875, "learning_rate": 1.5299636783045988e-05, "loss": 0.8213, "step": 2579 }, { "epoch": 0.1229947798727147, "grad_norm": 1.858851671218872, "learning_rate": 1.529630537344787e-05, "loss": 0.8619, "step": 2580 }, { "epoch": 0.1230424522680142, "grad_norm": 1.5693343877792358, "learning_rate": 1.5292973146684372e-05, "loss": 0.5578, "step": 2581 }, { "epoch": 0.1230901246633137, "grad_norm": 2.4136385917663574, "learning_rate": 1.5289640103269626e-05, "loss": 0.4917, "step": 2582 }, { "epoch": 0.1231377970586132, "grad_norm": 1.3694087266921997, "learning_rate": 1.5286306243717884e-05, "loss": 0.7852, "step": 2583 }, { "epoch": 0.12318546945391272, "grad_norm": 1.901773452758789, "learning_rate": 1.528297156854353e-05, "loss": 0.5887, "step": 2584 }, { "epoch": 0.12323314184921222, "grad_norm": 1.2055267095565796, "learning_rate": 1.5279636078261064e-05, "loss": 0.5099, "step": 2585 }, { "epoch": 0.12328081424451172, "grad_norm": 1.8217684030532837, "learning_rate": 1.5276299773385122e-05, "loss": 0.9042, "step": 2586 }, { "epoch": 0.12332848663981122, "grad_norm": 1.5840007066726685, "learning_rate": 1.527296265443046e-05, "loss": 0.629, "step": 2587 }, { "epoch": 0.12337615903511072, "grad_norm": 3.431870222091675, "learning_rate": 1.5269624721911964e-05, "loss": 0.5871, "step": 2588 }, { "epoch": 0.12342383143041022, "grad_norm": 1.165732741355896, "learning_rate": 1.5266285976344635e-05, "loss": 0.2416, "step": 2589 }, { "epoch": 0.12347150382570972, "grad_norm": 1.3594448566436768, "learning_rate": 1.5262946418243617e-05, "loss": 1.0888, "step": 2590 }, { "epoch": 0.12351917622100922, "grad_norm": 6.483646869659424, "learning_rate": 1.5259606048124162e-05, "loss": 0.8025, "step": 2591 }, { "epoch": 0.12356684861630873, "grad_norm": 1.7944142818450928, "learning_rate": 1.5256264866501655e-05, "loss": 0.739, "step": 2592 }, { "epoch": 0.12361452101160823, "grad_norm": 1.2228024005889893, "learning_rate": 1.5252922873891611e-05, "loss": 0.5483, "step": 2593 }, { "epoch": 0.12366219340690773, "grad_norm": 2.679835081100464, "learning_rate": 1.5249580070809661e-05, "loss": 0.6809, "step": 2594 }, { "epoch": 0.12370986580220723, "grad_norm": 1.1273689270019531, "learning_rate": 1.5246236457771568e-05, "loss": 0.5378, "step": 2595 }, { "epoch": 0.12375753819750673, "grad_norm": 2.1775577068328857, "learning_rate": 1.5242892035293216e-05, "loss": 1.2623, "step": 2596 }, { "epoch": 0.12380521059280623, "grad_norm": 2.6037042140960693, "learning_rate": 1.523954680389061e-05, "loss": 1.0526, "step": 2597 }, { "epoch": 0.12385288298810573, "grad_norm": 1.6117844581604004, "learning_rate": 1.5236200764079894e-05, "loss": 0.8293, "step": 2598 }, { "epoch": 0.12390055538340523, "grad_norm": 1.9459812641143799, "learning_rate": 1.5232853916377321e-05, "loss": 0.7792, "step": 2599 }, { "epoch": 0.12394822777870475, "grad_norm": 1.566213607788086, "learning_rate": 1.5229506261299276e-05, "loss": 0.7649, "step": 2600 }, { "epoch": 0.12399590017400425, "grad_norm": 3.2547764778137207, "learning_rate": 1.5226157799362267e-05, "loss": 0.3185, "step": 2601 }, { "epoch": 0.12404357256930375, "grad_norm": 1.8356401920318604, "learning_rate": 1.5222808531082929e-05, "loss": 0.9227, "step": 2602 }, { "epoch": 0.12409124496460325, "grad_norm": 2.2335805892944336, "learning_rate": 1.521945845697802e-05, "loss": 0.7411, "step": 2603 }, { "epoch": 0.12413891735990275, "grad_norm": 1.2031301259994507, "learning_rate": 1.521610757756442e-05, "loss": 0.5848, "step": 2604 }, { "epoch": 0.12418658975520225, "grad_norm": 1.1740297079086304, "learning_rate": 1.521275589335914e-05, "loss": 0.6071, "step": 2605 }, { "epoch": 0.12423426215050175, "grad_norm": 1.8852561712265015, "learning_rate": 1.5209403404879305e-05, "loss": 0.6525, "step": 2606 }, { "epoch": 0.12428193454580125, "grad_norm": 1.8858394622802734, "learning_rate": 1.520605011264217e-05, "loss": 0.9351, "step": 2607 }, { "epoch": 0.12432960694110076, "grad_norm": 1.4784988164901733, "learning_rate": 1.5202696017165114e-05, "loss": 0.5307, "step": 2608 }, { "epoch": 0.12437727933640026, "grad_norm": 1.0846854448318481, "learning_rate": 1.5199341118965641e-05, "loss": 0.6695, "step": 2609 }, { "epoch": 0.12442495173169976, "grad_norm": 1.6776355504989624, "learning_rate": 1.5195985418561377e-05, "loss": 1.054, "step": 2610 }, { "epoch": 0.12447262412699926, "grad_norm": 1.3577890396118164, "learning_rate": 1.519262891647007e-05, "loss": 0.6678, "step": 2611 }, { "epoch": 0.12452029652229876, "grad_norm": 1.2777031660079956, "learning_rate": 1.5189271613209595e-05, "loss": 0.72, "step": 2612 }, { "epoch": 0.12456796891759826, "grad_norm": 1.4406533241271973, "learning_rate": 1.518591350929795e-05, "loss": 0.7993, "step": 2613 }, { "epoch": 0.12461564131289776, "grad_norm": 1.1992273330688477, "learning_rate": 1.5182554605253254e-05, "loss": 0.5169, "step": 2614 }, { "epoch": 0.12466331370819726, "grad_norm": 1.0916426181793213, "learning_rate": 1.5179194901593752e-05, "loss": 0.4097, "step": 2615 }, { "epoch": 0.12471098610349678, "grad_norm": 1.5397124290466309, "learning_rate": 1.5175834398837814e-05, "loss": 0.5146, "step": 2616 }, { "epoch": 0.12475865849879628, "grad_norm": 1.4506062269210815, "learning_rate": 1.5172473097503928e-05, "loss": 0.7577, "step": 2617 }, { "epoch": 0.12480633089409578, "grad_norm": 1.6795587539672852, "learning_rate": 1.516911099811071e-05, "loss": 0.7902, "step": 2618 }, { "epoch": 0.12485400328939528, "grad_norm": 2.53257417678833, "learning_rate": 1.5165748101176894e-05, "loss": 0.7611, "step": 2619 }, { "epoch": 0.12490167568469478, "grad_norm": 1.3414573669433594, "learning_rate": 1.5162384407221344e-05, "loss": 0.6261, "step": 2620 }, { "epoch": 0.12494934807999428, "grad_norm": 1.2702453136444092, "learning_rate": 1.5159019916763044e-05, "loss": 0.3997, "step": 2621 }, { "epoch": 0.12499702047529378, "grad_norm": 1.7113113403320312, "learning_rate": 1.51556546303211e-05, "loss": 0.4045, "step": 2622 }, { "epoch": 0.12504469287059328, "grad_norm": 2.8945841789245605, "learning_rate": 1.5152288548414734e-05, "loss": 1.104, "step": 2623 }, { "epoch": 0.1250923652658928, "grad_norm": 1.5277979373931885, "learning_rate": 1.5148921671563309e-05, "loss": 1.1081, "step": 2624 }, { "epoch": 0.12514003766119228, "grad_norm": 1.7321184873580933, "learning_rate": 1.514555400028629e-05, "loss": 0.8964, "step": 2625 }, { "epoch": 0.1251877100564918, "grad_norm": 1.2266030311584473, "learning_rate": 1.5142185535103276e-05, "loss": 0.6426, "step": 2626 }, { "epoch": 0.12523538245179128, "grad_norm": 1.8250890970230103, "learning_rate": 1.5138816276533994e-05, "loss": 0.324, "step": 2627 }, { "epoch": 0.1252830548470908, "grad_norm": 2.3702685832977295, "learning_rate": 1.5135446225098279e-05, "loss": 0.7214, "step": 2628 }, { "epoch": 0.1253307272423903, "grad_norm": 1.1414276361465454, "learning_rate": 1.5132075381316091e-05, "loss": 0.6886, "step": 2629 }, { "epoch": 0.1253783996376898, "grad_norm": 2.723069429397583, "learning_rate": 1.5128703745707527e-05, "loss": 1.02, "step": 2630 }, { "epoch": 0.1254260720329893, "grad_norm": 2.712341785430908, "learning_rate": 1.5125331318792787e-05, "loss": 0.5714, "step": 2631 }, { "epoch": 0.1254737444282888, "grad_norm": 2.0006113052368164, "learning_rate": 1.5121958101092205e-05, "loss": 0.3528, "step": 2632 }, { "epoch": 0.1255214168235883, "grad_norm": 3.0483641624450684, "learning_rate": 1.5118584093126237e-05, "loss": 1.1299, "step": 2633 }, { "epoch": 0.1255690892188878, "grad_norm": 1.6615673303604126, "learning_rate": 1.5115209295415454e-05, "loss": 0.6836, "step": 2634 }, { "epoch": 0.1256167616141873, "grad_norm": 1.579200029373169, "learning_rate": 1.5111833708480555e-05, "loss": 0.7894, "step": 2635 }, { "epoch": 0.12566443400948682, "grad_norm": 2.0592663288116455, "learning_rate": 1.5108457332842352e-05, "loss": 0.4932, "step": 2636 }, { "epoch": 0.1257121064047863, "grad_norm": 1.6344188451766968, "learning_rate": 1.5105080169021792e-05, "loss": 0.7383, "step": 2637 }, { "epoch": 0.12575977880008582, "grad_norm": 1.7413655519485474, "learning_rate": 1.5101702217539933e-05, "loss": 0.537, "step": 2638 }, { "epoch": 0.1258074511953853, "grad_norm": 1.9291362762451172, "learning_rate": 1.509832347891796e-05, "loss": 0.9059, "step": 2639 }, { "epoch": 0.12585512359068482, "grad_norm": 1.3139088153839111, "learning_rate": 1.5094943953677175e-05, "loss": 0.5787, "step": 2640 }, { "epoch": 0.1259027959859843, "grad_norm": 1.6078028678894043, "learning_rate": 1.509156364233901e-05, "loss": 0.7048, "step": 2641 }, { "epoch": 0.12595046838128382, "grad_norm": 1.2953481674194336, "learning_rate": 1.5088182545425003e-05, "loss": 0.6951, "step": 2642 }, { "epoch": 0.1259981407765833, "grad_norm": 2.0099778175354004, "learning_rate": 1.5084800663456828e-05, "loss": 0.7213, "step": 2643 }, { "epoch": 0.12604581317188282, "grad_norm": 1.9901890754699707, "learning_rate": 1.5081417996956277e-05, "loss": 0.7414, "step": 2644 }, { "epoch": 0.12609348556718233, "grad_norm": 1.2115107774734497, "learning_rate": 1.5078034546445257e-05, "loss": 0.6107, "step": 2645 }, { "epoch": 0.12614115796248182, "grad_norm": 4.351001739501953, "learning_rate": 1.5074650312445797e-05, "loss": 0.9591, "step": 2646 }, { "epoch": 0.12618883035778133, "grad_norm": 1.9015579223632812, "learning_rate": 1.5071265295480058e-05, "loss": 0.6576, "step": 2647 }, { "epoch": 0.12623650275308082, "grad_norm": 1.5669375658035278, "learning_rate": 1.5067879496070305e-05, "loss": 0.8454, "step": 2648 }, { "epoch": 0.12628417514838033, "grad_norm": 2.2157368659973145, "learning_rate": 1.5064492914738934e-05, "loss": 0.7875, "step": 2649 }, { "epoch": 0.12633184754367982, "grad_norm": 1.331052303314209, "learning_rate": 1.5061105552008462e-05, "loss": 0.6967, "step": 2650 }, { "epoch": 0.12637951993897933, "grad_norm": 1.203689694404602, "learning_rate": 1.5057717408401523e-05, "loss": 0.4825, "step": 2651 }, { "epoch": 0.12642719233427885, "grad_norm": 3.1987273693084717, "learning_rate": 1.5054328484440868e-05, "loss": 1.0853, "step": 2652 }, { "epoch": 0.12647486472957833, "grad_norm": 1.1295535564422607, "learning_rate": 1.5050938780649382e-05, "loss": 0.4593, "step": 2653 }, { "epoch": 0.12652253712487785, "grad_norm": 2.7517635822296143, "learning_rate": 1.5047548297550054e-05, "loss": 0.3619, "step": 2654 }, { "epoch": 0.12657020952017733, "grad_norm": 2.8597254753112793, "learning_rate": 1.5044157035666003e-05, "loss": 0.978, "step": 2655 }, { "epoch": 0.12661788191547685, "grad_norm": 2.004417896270752, "learning_rate": 1.5040764995520469e-05, "loss": 0.4542, "step": 2656 }, { "epoch": 0.12666555431077633, "grad_norm": 2.3083701133728027, "learning_rate": 1.5037372177636805e-05, "loss": 0.9238, "step": 2657 }, { "epoch": 0.12671322670607585, "grad_norm": 1.2528281211853027, "learning_rate": 1.5033978582538487e-05, "loss": 0.8701, "step": 2658 }, { "epoch": 0.12676089910137534, "grad_norm": 1.7552685737609863, "learning_rate": 1.5030584210749117e-05, "loss": 0.5164, "step": 2659 }, { "epoch": 0.12680857149667485, "grad_norm": 1.3574830293655396, "learning_rate": 1.5027189062792405e-05, "loss": 0.674, "step": 2660 }, { "epoch": 0.12685624389197436, "grad_norm": 1.3860228061676025, "learning_rate": 1.5023793139192192e-05, "loss": 0.7454, "step": 2661 }, { "epoch": 0.12690391628727385, "grad_norm": 2.419560194015503, "learning_rate": 1.5020396440472433e-05, "loss": 1.5646, "step": 2662 }, { "epoch": 0.12695158868257336, "grad_norm": 1.2999653816223145, "learning_rate": 1.5016998967157201e-05, "loss": 0.8124, "step": 2663 }, { "epoch": 0.12699926107787285, "grad_norm": 1.268409252166748, "learning_rate": 1.5013600719770699e-05, "loss": 0.6687, "step": 2664 }, { "epoch": 0.12704693347317236, "grad_norm": 1.0066592693328857, "learning_rate": 1.5010201698837232e-05, "loss": 0.452, "step": 2665 }, { "epoch": 0.12709460586847185, "grad_norm": 1.6505389213562012, "learning_rate": 1.5006801904881236e-05, "loss": 0.833, "step": 2666 }, { "epoch": 0.12714227826377136, "grad_norm": 1.9982913732528687, "learning_rate": 1.5003401338427271e-05, "loss": 0.7697, "step": 2667 }, { "epoch": 0.12718995065907088, "grad_norm": 1.6034916639328003, "learning_rate": 1.5000000000000002e-05, "loss": 0.8527, "step": 2668 }, { "epoch": 0.12723762305437036, "grad_norm": 1.9523409605026245, "learning_rate": 1.4996597890124222e-05, "loss": 0.8139, "step": 2669 }, { "epoch": 0.12728529544966988, "grad_norm": 1.541053056716919, "learning_rate": 1.4993195009324844e-05, "loss": 0.7768, "step": 2670 }, { "epoch": 0.12733296784496936, "grad_norm": 2.031825065612793, "learning_rate": 1.4989791358126898e-05, "loss": 0.761, "step": 2671 }, { "epoch": 0.12738064024026888, "grad_norm": 1.5603649616241455, "learning_rate": 1.4986386937055529e-05, "loss": 0.9026, "step": 2672 }, { "epoch": 0.12742831263556836, "grad_norm": 0.939656138420105, "learning_rate": 1.4982981746636002e-05, "loss": 0.2302, "step": 2673 }, { "epoch": 0.12747598503086788, "grad_norm": 1.3674044609069824, "learning_rate": 1.4979575787393713e-05, "loss": 0.3921, "step": 2674 }, { "epoch": 0.1275236574261674, "grad_norm": 1.1105155944824219, "learning_rate": 1.4976169059854151e-05, "loss": 0.6595, "step": 2675 }, { "epoch": 0.12757132982146688, "grad_norm": 1.2926220893859863, "learning_rate": 1.4972761564542953e-05, "loss": 0.7689, "step": 2676 }, { "epoch": 0.1276190022167664, "grad_norm": 1.38959538936615, "learning_rate": 1.4969353301985856e-05, "loss": 0.8177, "step": 2677 }, { "epoch": 0.12766667461206588, "grad_norm": 1.8384571075439453, "learning_rate": 1.4965944272708717e-05, "loss": 0.8512, "step": 2678 }, { "epoch": 0.1277143470073654, "grad_norm": 6.1158623695373535, "learning_rate": 1.4962534477237516e-05, "loss": 1.4324, "step": 2679 }, { "epoch": 0.12776201940266488, "grad_norm": 1.3320478200912476, "learning_rate": 1.495912391609835e-05, "loss": 0.7636, "step": 2680 }, { "epoch": 0.1278096917979644, "grad_norm": 1.7159260511398315, "learning_rate": 1.4955712589817433e-05, "loss": 0.6336, "step": 2681 }, { "epoch": 0.12785736419326388, "grad_norm": 1.2993638515472412, "learning_rate": 1.4952300498921097e-05, "loss": 0.553, "step": 2682 }, { "epoch": 0.1279050365885634, "grad_norm": 1.6075280904769897, "learning_rate": 1.4948887643935793e-05, "loss": 0.863, "step": 2683 }, { "epoch": 0.1279527089838629, "grad_norm": 1.3749072551727295, "learning_rate": 1.494547402538809e-05, "loss": 0.5569, "step": 2684 }, { "epoch": 0.1280003813791624, "grad_norm": 1.7231132984161377, "learning_rate": 1.4942059643804671e-05, "loss": 0.8663, "step": 2685 }, { "epoch": 0.1280480537744619, "grad_norm": 2.2960851192474365, "learning_rate": 1.4938644499712342e-05, "loss": 0.894, "step": 2686 }, { "epoch": 0.1280957261697614, "grad_norm": 1.9986116886138916, "learning_rate": 1.4935228593638029e-05, "loss": 0.4589, "step": 2687 }, { "epoch": 0.1281433985650609, "grad_norm": 1.8226187229156494, "learning_rate": 1.4931811926108765e-05, "loss": 0.8236, "step": 2688 }, { "epoch": 0.1281910709603604, "grad_norm": 1.6410741806030273, "learning_rate": 1.4928394497651709e-05, "loss": 0.8845, "step": 2689 }, { "epoch": 0.1282387433556599, "grad_norm": 3.4875731468200684, "learning_rate": 1.4924976308794134e-05, "loss": 1.2421, "step": 2690 }, { "epoch": 0.12828641575095942, "grad_norm": 2.545445680618286, "learning_rate": 1.4921557360063432e-05, "loss": 0.1611, "step": 2691 }, { "epoch": 0.1283340881462589, "grad_norm": 4.536085605621338, "learning_rate": 1.4918137651987111e-05, "loss": 0.7937, "step": 2692 }, { "epoch": 0.12838176054155842, "grad_norm": 1.0321882963180542, "learning_rate": 1.4914717185092797e-05, "loss": 0.7083, "step": 2693 }, { "epoch": 0.1284294329368579, "grad_norm": 2.099994659423828, "learning_rate": 1.4911295959908235e-05, "loss": 0.7027, "step": 2694 }, { "epoch": 0.12847710533215742, "grad_norm": 3.259446382522583, "learning_rate": 1.4907873976961282e-05, "loss": 0.4003, "step": 2695 }, { "epoch": 0.1285247777274569, "grad_norm": 1.6879503726959229, "learning_rate": 1.4904451236779917e-05, "loss": 0.95, "step": 2696 }, { "epoch": 0.12857245012275642, "grad_norm": 1.7220258712768555, "learning_rate": 1.4901027739892228e-05, "loss": 0.8303, "step": 2697 }, { "epoch": 0.1286201225180559, "grad_norm": 2.255549907684326, "learning_rate": 1.4897603486826433e-05, "loss": 1.409, "step": 2698 }, { "epoch": 0.12866779491335542, "grad_norm": 1.5026378631591797, "learning_rate": 1.4894178478110856e-05, "loss": 0.7705, "step": 2699 }, { "epoch": 0.12871546730865494, "grad_norm": 2.3631350994110107, "learning_rate": 1.4890752714273936e-05, "loss": 0.5187, "step": 2700 }, { "epoch": 0.12876313970395442, "grad_norm": 1.7667535543441772, "learning_rate": 1.4887326195844243e-05, "loss": 0.6735, "step": 2701 }, { "epoch": 0.12881081209925394, "grad_norm": 1.1645985841751099, "learning_rate": 1.4883898923350446e-05, "loss": 0.5765, "step": 2702 }, { "epoch": 0.12885848449455342, "grad_norm": 1.3896843194961548, "learning_rate": 1.488047089732134e-05, "loss": 0.8479, "step": 2703 }, { "epoch": 0.12890615688985294, "grad_norm": 1.2915118932724, "learning_rate": 1.4877042118285832e-05, "loss": 0.6082, "step": 2704 }, { "epoch": 0.12895382928515242, "grad_norm": 1.1442525386810303, "learning_rate": 1.487361258677295e-05, "loss": 0.7111, "step": 2705 }, { "epoch": 0.12900150168045194, "grad_norm": 1.5689988136291504, "learning_rate": 1.487018230331183e-05, "loss": 0.7491, "step": 2706 }, { "epoch": 0.12904917407575145, "grad_norm": 3.11156964302063, "learning_rate": 1.4866751268431738e-05, "loss": 0.6102, "step": 2707 }, { "epoch": 0.12909684647105094, "grad_norm": 1.5355772972106934, "learning_rate": 1.4863319482662044e-05, "loss": 0.6756, "step": 2708 }, { "epoch": 0.12914451886635045, "grad_norm": 1.6491599082946777, "learning_rate": 1.4859886946532235e-05, "loss": 0.6627, "step": 2709 }, { "epoch": 0.12919219126164994, "grad_norm": 1.981755256652832, "learning_rate": 1.485645366057192e-05, "loss": 0.6677, "step": 2710 }, { "epoch": 0.12923986365694945, "grad_norm": 2.169858932495117, "learning_rate": 1.4853019625310813e-05, "loss": 1.0718, "step": 2711 }, { "epoch": 0.12928753605224894, "grad_norm": 1.7253448963165283, "learning_rate": 1.4849584841278755e-05, "loss": 0.2739, "step": 2712 }, { "epoch": 0.12933520844754845, "grad_norm": 1.3531230688095093, "learning_rate": 1.4846149309005697e-05, "loss": 0.525, "step": 2713 }, { "epoch": 0.12938288084284794, "grad_norm": 1.7004480361938477, "learning_rate": 1.4842713029021707e-05, "loss": 0.9376, "step": 2714 }, { "epoch": 0.12943055323814745, "grad_norm": 1.5262759923934937, "learning_rate": 1.4839276001856965e-05, "loss": 0.6307, "step": 2715 }, { "epoch": 0.12947822563344696, "grad_norm": 2.7734172344207764, "learning_rate": 1.4835838228041773e-05, "loss": 0.6477, "step": 2716 }, { "epoch": 0.12952589802874645, "grad_norm": 1.2846275568008423, "learning_rate": 1.4832399708106541e-05, "loss": 0.5238, "step": 2717 }, { "epoch": 0.12957357042404596, "grad_norm": 1.1377571821212769, "learning_rate": 1.4828960442581802e-05, "loss": 0.6442, "step": 2718 }, { "epoch": 0.12962124281934545, "grad_norm": 1.4920954704284668, "learning_rate": 1.4825520431998191e-05, "loss": 0.8488, "step": 2719 }, { "epoch": 0.12966891521464496, "grad_norm": 1.5849255323410034, "learning_rate": 1.4822079676886469e-05, "loss": 0.5422, "step": 2720 }, { "epoch": 0.12971658760994445, "grad_norm": 1.5268288850784302, "learning_rate": 1.4818638177777514e-05, "loss": 0.6045, "step": 2721 }, { "epoch": 0.12976426000524396, "grad_norm": 1.9282076358795166, "learning_rate": 1.481519593520231e-05, "loss": 0.8956, "step": 2722 }, { "epoch": 0.12981193240054348, "grad_norm": 2.7352709770202637, "learning_rate": 1.4811752949691958e-05, "loss": 0.4204, "step": 2723 }, { "epoch": 0.12985960479584296, "grad_norm": 1.0934041738510132, "learning_rate": 1.4808309221777681e-05, "loss": 0.7061, "step": 2724 }, { "epoch": 0.12990727719114248, "grad_norm": 1.905676245689392, "learning_rate": 1.4804864751990807e-05, "loss": 0.7281, "step": 2725 }, { "epoch": 0.12995494958644196, "grad_norm": 1.4097931385040283, "learning_rate": 1.4801419540862779e-05, "loss": 0.8454, "step": 2726 }, { "epoch": 0.13000262198174148, "grad_norm": 1.8227475881576538, "learning_rate": 1.4797973588925163e-05, "loss": 0.6977, "step": 2727 }, { "epoch": 0.13005029437704096, "grad_norm": 0.8902652263641357, "learning_rate": 1.479452689670963e-05, "loss": 0.353, "step": 2728 }, { "epoch": 0.13009796677234048, "grad_norm": 4.896397590637207, "learning_rate": 1.4791079464747973e-05, "loss": 0.3192, "step": 2729 }, { "epoch": 0.13014563916763996, "grad_norm": 1.6081026792526245, "learning_rate": 1.4787631293572094e-05, "loss": 0.6647, "step": 2730 }, { "epoch": 0.13019331156293948, "grad_norm": 1.456747055053711, "learning_rate": 1.4784182383714005e-05, "loss": 0.8112, "step": 2731 }, { "epoch": 0.130240983958239, "grad_norm": 1.8662186861038208, "learning_rate": 1.4780732735705847e-05, "loss": 0.9066, "step": 2732 }, { "epoch": 0.13028865635353848, "grad_norm": 1.8691082000732422, "learning_rate": 1.4777282350079858e-05, "loss": 0.3761, "step": 2733 }, { "epoch": 0.130336328748838, "grad_norm": 1.9497867822647095, "learning_rate": 1.4773831227368399e-05, "loss": 0.7607, "step": 2734 }, { "epoch": 0.13038400114413748, "grad_norm": 1.5139490365982056, "learning_rate": 1.477037936810394e-05, "loss": 0.8341, "step": 2735 }, { "epoch": 0.130431673539437, "grad_norm": 1.5750176906585693, "learning_rate": 1.4766926772819072e-05, "loss": 0.7762, "step": 2736 }, { "epoch": 0.13047934593473648, "grad_norm": 1.0719943046569824, "learning_rate": 1.476347344204649e-05, "loss": 0.6573, "step": 2737 }, { "epoch": 0.130527018330036, "grad_norm": 2.591128349304199, "learning_rate": 1.4760019376319015e-05, "loss": 1.0311, "step": 2738 }, { "epoch": 0.1305746907253355, "grad_norm": 2.266017198562622, "learning_rate": 1.4756564576169568e-05, "loss": 0.506, "step": 2739 }, { "epoch": 0.130622363120635, "grad_norm": 9.711892127990723, "learning_rate": 1.4753109042131189e-05, "loss": 1.0578, "step": 2740 }, { "epoch": 0.1306700355159345, "grad_norm": 4.1030988693237305, "learning_rate": 1.4749652774737031e-05, "loss": 1.4217, "step": 2741 }, { "epoch": 0.130717707911234, "grad_norm": 1.9199711084365845, "learning_rate": 1.4746195774520365e-05, "loss": 1.0869, "step": 2742 }, { "epoch": 0.1307653803065335, "grad_norm": 1.354860782623291, "learning_rate": 1.4742738042014563e-05, "loss": 0.6842, "step": 2743 }, { "epoch": 0.130813052701833, "grad_norm": 1.3137390613555908, "learning_rate": 1.4739279577753122e-05, "loss": 0.3451, "step": 2744 }, { "epoch": 0.1308607250971325, "grad_norm": 4.379244327545166, "learning_rate": 1.4735820382269652e-05, "loss": 0.9839, "step": 2745 }, { "epoch": 0.130908397492432, "grad_norm": 4.320148468017578, "learning_rate": 1.4732360456097862e-05, "loss": 0.2448, "step": 2746 }, { "epoch": 0.1309560698877315, "grad_norm": 1.6411312818527222, "learning_rate": 1.4728899799771591e-05, "loss": 0.6835, "step": 2747 }, { "epoch": 0.13100374228303102, "grad_norm": 2.049267292022705, "learning_rate": 1.472543841382478e-05, "loss": 0.5907, "step": 2748 }, { "epoch": 0.1310514146783305, "grad_norm": 1.070568561553955, "learning_rate": 1.472197629879148e-05, "loss": 0.6369, "step": 2749 }, { "epoch": 0.13109908707363002, "grad_norm": 0.8059070110321045, "learning_rate": 1.4718513455205867e-05, "loss": 0.3309, "step": 2750 }, { "epoch": 0.1311467594689295, "grad_norm": 1.245627760887146, "learning_rate": 1.4715049883602217e-05, "loss": 0.7835, "step": 2751 }, { "epoch": 0.13119443186422902, "grad_norm": 1.0324382781982422, "learning_rate": 1.4711585584514927e-05, "loss": 0.5744, "step": 2752 }, { "epoch": 0.1312421042595285, "grad_norm": 2.1930136680603027, "learning_rate": 1.4708120558478501e-05, "loss": 0.9225, "step": 2753 }, { "epoch": 0.13128977665482802, "grad_norm": 3.742619276046753, "learning_rate": 1.4704654806027558e-05, "loss": 1.0052, "step": 2754 }, { "epoch": 0.13133744905012754, "grad_norm": 2.5243539810180664, "learning_rate": 1.4701188327696825e-05, "loss": 0.5848, "step": 2755 }, { "epoch": 0.13138512144542702, "grad_norm": 2.040257692337036, "learning_rate": 1.4697721124021149e-05, "loss": 0.7903, "step": 2756 }, { "epoch": 0.13143279384072654, "grad_norm": 3.4746055603027344, "learning_rate": 1.4694253195535478e-05, "loss": 0.9888, "step": 2757 }, { "epoch": 0.13148046623602602, "grad_norm": 1.957668423652649, "learning_rate": 1.469078454277488e-05, "loss": 1.0133, "step": 2758 }, { "epoch": 0.13152813863132554, "grad_norm": 1.347690463066101, "learning_rate": 1.4687315166274535e-05, "loss": 0.692, "step": 2759 }, { "epoch": 0.13157581102662502, "grad_norm": 2.0428285598754883, "learning_rate": 1.4683845066569727e-05, "loss": 0.4666, "step": 2760 }, { "epoch": 0.13162348342192454, "grad_norm": 2.2979228496551514, "learning_rate": 1.4680374244195861e-05, "loss": 0.8622, "step": 2761 }, { "epoch": 0.13167115581722402, "grad_norm": 2.308122158050537, "learning_rate": 1.467690269968845e-05, "loss": 1.1501, "step": 2762 }, { "epoch": 0.13171882821252354, "grad_norm": 2.7721638679504395, "learning_rate": 1.4673430433583114e-05, "loss": 0.4583, "step": 2763 }, { "epoch": 0.13176650060782305, "grad_norm": 0.9930713176727295, "learning_rate": 1.4669957446415588e-05, "loss": 0.3261, "step": 2764 }, { "epoch": 0.13181417300312254, "grad_norm": 1.2571367025375366, "learning_rate": 1.4666483738721719e-05, "loss": 0.6704, "step": 2765 }, { "epoch": 0.13186184539842205, "grad_norm": 1.1763139963150024, "learning_rate": 1.4663009311037464e-05, "loss": 0.7018, "step": 2766 }, { "epoch": 0.13190951779372154, "grad_norm": 3.2295870780944824, "learning_rate": 1.4659534163898894e-05, "loss": 1.3125, "step": 2767 }, { "epoch": 0.13195719018902105, "grad_norm": 2.7077207565307617, "learning_rate": 1.4656058297842185e-05, "loss": 0.8727, "step": 2768 }, { "epoch": 0.13200486258432054, "grad_norm": 1.6741299629211426, "learning_rate": 1.465258171340363e-05, "loss": 0.7755, "step": 2769 }, { "epoch": 0.13205253497962005, "grad_norm": 1.9056814908981323, "learning_rate": 1.464910441111963e-05, "loss": 0.3511, "step": 2770 }, { "epoch": 0.13210020737491956, "grad_norm": 2.130284070968628, "learning_rate": 1.4645626391526694e-05, "loss": 0.7884, "step": 2771 }, { "epoch": 0.13214787977021905, "grad_norm": 1.098597526550293, "learning_rate": 1.4642147655161445e-05, "loss": 0.5392, "step": 2772 }, { "epoch": 0.13219555216551857, "grad_norm": 1.7876940965652466, "learning_rate": 1.463866820256062e-05, "loss": 0.9207, "step": 2773 }, { "epoch": 0.13224322456081805, "grad_norm": 3.6280429363250732, "learning_rate": 1.4635188034261059e-05, "loss": 0.5778, "step": 2774 }, { "epoch": 0.13229089695611757, "grad_norm": 1.6366428136825562, "learning_rate": 1.4631707150799718e-05, "loss": 0.65, "step": 2775 }, { "epoch": 0.13233856935141705, "grad_norm": 1.342584490776062, "learning_rate": 1.4628225552713662e-05, "loss": 0.7516, "step": 2776 }, { "epoch": 0.13238624174671657, "grad_norm": 1.5772377252578735, "learning_rate": 1.4624743240540064e-05, "loss": 0.6755, "step": 2777 }, { "epoch": 0.13243391414201608, "grad_norm": 1.8864952325820923, "learning_rate": 1.4621260214816211e-05, "loss": 0.5291, "step": 2778 }, { "epoch": 0.13248158653731557, "grad_norm": 1.7388826608657837, "learning_rate": 1.4617776476079495e-05, "loss": 0.6162, "step": 2779 }, { "epoch": 0.13252925893261508, "grad_norm": 3.1094038486480713, "learning_rate": 1.461429202486742e-05, "loss": 0.9037, "step": 2780 }, { "epoch": 0.13257693132791457, "grad_norm": 1.1975785493850708, "learning_rate": 1.4610806861717607e-05, "loss": 0.4569, "step": 2781 }, { "epoch": 0.13262460372321408, "grad_norm": 1.2240186929702759, "learning_rate": 1.4607320987167778e-05, "loss": 0.5962, "step": 2782 }, { "epoch": 0.13267227611851357, "grad_norm": 1.3248203992843628, "learning_rate": 1.4603834401755766e-05, "loss": 0.8201, "step": 2783 }, { "epoch": 0.13271994851381308, "grad_norm": 2.2847790718078613, "learning_rate": 1.4600347106019514e-05, "loss": 0.3645, "step": 2784 }, { "epoch": 0.13276762090911257, "grad_norm": 2.044792413711548, "learning_rate": 1.4596859100497083e-05, "loss": 0.8309, "step": 2785 }, { "epoch": 0.13281529330441208, "grad_norm": 1.340786099433899, "learning_rate": 1.4593370385726627e-05, "loss": 0.6562, "step": 2786 }, { "epoch": 0.1328629656997116, "grad_norm": 1.168285846710205, "learning_rate": 1.4589880962246424e-05, "loss": 0.638, "step": 2787 }, { "epoch": 0.13291063809501108, "grad_norm": 1.4792840480804443, "learning_rate": 1.4586390830594856e-05, "loss": 0.581, "step": 2788 }, { "epoch": 0.1329583104903106, "grad_norm": 3.6525156497955322, "learning_rate": 1.4582899991310412e-05, "loss": 0.3736, "step": 2789 }, { "epoch": 0.13300598288561008, "grad_norm": 1.2078745365142822, "learning_rate": 1.4579408444931696e-05, "loss": 0.5303, "step": 2790 }, { "epoch": 0.1330536552809096, "grad_norm": 2.6930601596832275, "learning_rate": 1.4575916191997415e-05, "loss": 1.0866, "step": 2791 }, { "epoch": 0.13310132767620908, "grad_norm": 1.617804765701294, "learning_rate": 1.4572423233046386e-05, "loss": 0.6667, "step": 2792 }, { "epoch": 0.1331490000715086, "grad_norm": 1.266524076461792, "learning_rate": 1.4568929568617542e-05, "loss": 0.7276, "step": 2793 }, { "epoch": 0.1331966724668081, "grad_norm": 1.9528487920761108, "learning_rate": 1.4565435199249915e-05, "loss": 0.7923, "step": 2794 }, { "epoch": 0.1332443448621076, "grad_norm": 1.0529392957687378, "learning_rate": 1.4561940125482652e-05, "loss": 0.4727, "step": 2795 }, { "epoch": 0.1332920172574071, "grad_norm": 1.7384309768676758, "learning_rate": 1.4558444347855008e-05, "loss": 0.7612, "step": 2796 }, { "epoch": 0.1333396896527066, "grad_norm": 1.2260239124298096, "learning_rate": 1.455494786690634e-05, "loss": 0.4105, "step": 2797 }, { "epoch": 0.1333873620480061, "grad_norm": 1.6818293333053589, "learning_rate": 1.4551450683176127e-05, "loss": 0.3869, "step": 2798 }, { "epoch": 0.1334350344433056, "grad_norm": 3.6569578647613525, "learning_rate": 1.4547952797203944e-05, "loss": 0.473, "step": 2799 }, { "epoch": 0.1334827068386051, "grad_norm": 1.3549047708511353, "learning_rate": 1.454445420952948e-05, "loss": 0.8176, "step": 2800 }, { "epoch": 0.1335303792339046, "grad_norm": 2.233328104019165, "learning_rate": 1.4540954920692528e-05, "loss": 0.6295, "step": 2801 }, { "epoch": 0.1335780516292041, "grad_norm": 1.372320294380188, "learning_rate": 1.4537454931232994e-05, "loss": 0.6335, "step": 2802 }, { "epoch": 0.13362572402450362, "grad_norm": 1.1725353002548218, "learning_rate": 1.4533954241690891e-05, "loss": 0.7121, "step": 2803 }, { "epoch": 0.1336733964198031, "grad_norm": 1.5747082233428955, "learning_rate": 1.453045285260634e-05, "loss": 1.0418, "step": 2804 }, { "epoch": 0.13372106881510262, "grad_norm": 1.311205267906189, "learning_rate": 1.452695076451957e-05, "loss": 0.9077, "step": 2805 }, { "epoch": 0.1337687412104021, "grad_norm": 1.7655737400054932, "learning_rate": 1.4523447977970913e-05, "loss": 0.9344, "step": 2806 }, { "epoch": 0.13381641360570162, "grad_norm": 1.429305076599121, "learning_rate": 1.451994449350082e-05, "loss": 1.05, "step": 2807 }, { "epoch": 0.1338640860010011, "grad_norm": 1.9649168252944946, "learning_rate": 1.4516440311649835e-05, "loss": 0.6841, "step": 2808 }, { "epoch": 0.13391175839630062, "grad_norm": 1.6590079069137573, "learning_rate": 1.451293543295862e-05, "loss": 0.7886, "step": 2809 }, { "epoch": 0.13395943079160014, "grad_norm": 1.488486409187317, "learning_rate": 1.450942985796794e-05, "loss": 0.6772, "step": 2810 }, { "epoch": 0.13400710318689962, "grad_norm": 4.184868335723877, "learning_rate": 1.4505923587218673e-05, "loss": 0.2362, "step": 2811 }, { "epoch": 0.13405477558219914, "grad_norm": 1.7442996501922607, "learning_rate": 1.4502416621251798e-05, "loss": 0.7231, "step": 2812 }, { "epoch": 0.13410244797749862, "grad_norm": 4.76901912689209, "learning_rate": 1.4498908960608407e-05, "loss": 0.3126, "step": 2813 }, { "epoch": 0.13415012037279814, "grad_norm": 1.2491589784622192, "learning_rate": 1.449540060582969e-05, "loss": 0.4685, "step": 2814 }, { "epoch": 0.13419779276809762, "grad_norm": 1.6096590757369995, "learning_rate": 1.4491891557456956e-05, "loss": 0.9791, "step": 2815 }, { "epoch": 0.13424546516339714, "grad_norm": 4.615392684936523, "learning_rate": 1.448838181603161e-05, "loss": 0.549, "step": 2816 }, { "epoch": 0.13429313755869662, "grad_norm": 2.280949354171753, "learning_rate": 1.4484871382095172e-05, "loss": 0.9488, "step": 2817 }, { "epoch": 0.13434080995399614, "grad_norm": 1.3054804801940918, "learning_rate": 1.4481360256189266e-05, "loss": 0.5984, "step": 2818 }, { "epoch": 0.13438848234929565, "grad_norm": 1.9830986261367798, "learning_rate": 1.4477848438855619e-05, "loss": 0.5297, "step": 2819 }, { "epoch": 0.13443615474459514, "grad_norm": 3.2875237464904785, "learning_rate": 1.447433593063607e-05, "loss": 1.2456, "step": 2820 }, { "epoch": 0.13448382713989465, "grad_norm": 2.07106614112854, "learning_rate": 1.4470822732072567e-05, "loss": 0.749, "step": 2821 }, { "epoch": 0.13453149953519414, "grad_norm": 3.811577320098877, "learning_rate": 1.4467308843707155e-05, "loss": 0.4224, "step": 2822 }, { "epoch": 0.13457917193049365, "grad_norm": 1.4075751304626465, "learning_rate": 1.4463794266081994e-05, "loss": 0.6388, "step": 2823 }, { "epoch": 0.13462684432579314, "grad_norm": 1.4577556848526, "learning_rate": 1.4460278999739346e-05, "loss": 0.6949, "step": 2824 }, { "epoch": 0.13467451672109265, "grad_norm": 2.3330719470977783, "learning_rate": 1.445676304522158e-05, "loss": 0.9201, "step": 2825 }, { "epoch": 0.13472218911639217, "grad_norm": 4.043055534362793, "learning_rate": 1.445324640307117e-05, "loss": 0.6145, "step": 2826 }, { "epoch": 0.13476986151169165, "grad_norm": 1.8382353782653809, "learning_rate": 1.4449729073830703e-05, "loss": 0.8629, "step": 2827 }, { "epoch": 0.13481753390699117, "grad_norm": 0.980821967124939, "learning_rate": 1.444621105804286e-05, "loss": 0.8098, "step": 2828 }, { "epoch": 0.13486520630229065, "grad_norm": 1.6699111461639404, "learning_rate": 1.4442692356250443e-05, "loss": 0.6263, "step": 2829 }, { "epoch": 0.13491287869759017, "grad_norm": 2.7426393032073975, "learning_rate": 1.4439172968996343e-05, "loss": 0.8625, "step": 2830 }, { "epoch": 0.13496055109288965, "grad_norm": 1.8165969848632812, "learning_rate": 1.4435652896823565e-05, "loss": 0.6557, "step": 2831 }, { "epoch": 0.13500822348818917, "grad_norm": 1.8837940692901611, "learning_rate": 1.4432132140275229e-05, "loss": 0.9995, "step": 2832 }, { "epoch": 0.13505589588348865, "grad_norm": 1.8482041358947754, "learning_rate": 1.4428610699894542e-05, "loss": 0.7043, "step": 2833 }, { "epoch": 0.13510356827878817, "grad_norm": 1.199134111404419, "learning_rate": 1.442508857622483e-05, "loss": 0.7605, "step": 2834 }, { "epoch": 0.13515124067408768, "grad_norm": 3.1443140506744385, "learning_rate": 1.4421565769809523e-05, "loss": 0.7874, "step": 2835 }, { "epoch": 0.13519891306938717, "grad_norm": 1.8745898008346558, "learning_rate": 1.4418042281192151e-05, "loss": 0.8903, "step": 2836 }, { "epoch": 0.13524658546468668, "grad_norm": 1.7884939908981323, "learning_rate": 1.4414518110916352e-05, "loss": 0.4199, "step": 2837 }, { "epoch": 0.13529425785998617, "grad_norm": 0.8712854981422424, "learning_rate": 1.4410993259525868e-05, "loss": 0.1338, "step": 2838 }, { "epoch": 0.13534193025528568, "grad_norm": 0.9771924018859863, "learning_rate": 1.4407467727564548e-05, "loss": 0.5966, "step": 2839 }, { "epoch": 0.13538960265058517, "grad_norm": 1.6014736890792847, "learning_rate": 1.4403941515576344e-05, "loss": 0.7721, "step": 2840 }, { "epoch": 0.13543727504588468, "grad_norm": 3.413545608520508, "learning_rate": 1.4400414624105319e-05, "loss": 1.0529, "step": 2841 }, { "epoch": 0.1354849474411842, "grad_norm": 1.4031412601470947, "learning_rate": 1.4396887053695631e-05, "loss": 0.804, "step": 2842 }, { "epoch": 0.13553261983648368, "grad_norm": 1.7395211458206177, "learning_rate": 1.439335880489155e-05, "loss": 0.8361, "step": 2843 }, { "epoch": 0.1355802922317832, "grad_norm": 1.5222361087799072, "learning_rate": 1.4389829878237451e-05, "loss": 0.5949, "step": 2844 }, { "epoch": 0.13562796462708268, "grad_norm": 1.2143298387527466, "learning_rate": 1.438630027427781e-05, "loss": 0.6202, "step": 2845 }, { "epoch": 0.1356756370223822, "grad_norm": 2.712756395339966, "learning_rate": 1.4382769993557202e-05, "loss": 1.0577, "step": 2846 }, { "epoch": 0.13572330941768168, "grad_norm": 2.0115554332733154, "learning_rate": 1.4379239036620319e-05, "loss": 0.6218, "step": 2847 }, { "epoch": 0.1357709818129812, "grad_norm": 1.4737643003463745, "learning_rate": 1.4375707404011949e-05, "loss": 0.7575, "step": 2848 }, { "epoch": 0.13581865420828068, "grad_norm": 1.5066097974777222, "learning_rate": 1.4372175096276988e-05, "loss": 0.8696, "step": 2849 }, { "epoch": 0.1358663266035802, "grad_norm": 1.6347986459732056, "learning_rate": 1.4368642113960436e-05, "loss": 0.7425, "step": 2850 }, { "epoch": 0.1359139989988797, "grad_norm": 2.0242161750793457, "learning_rate": 1.4365108457607396e-05, "loss": 0.8388, "step": 2851 }, { "epoch": 0.1359616713941792, "grad_norm": 1.824858546257019, "learning_rate": 1.4361574127763069e-05, "loss": 0.5064, "step": 2852 }, { "epoch": 0.1360093437894787, "grad_norm": 2.1713826656341553, "learning_rate": 1.4358039124972771e-05, "loss": 0.6825, "step": 2853 }, { "epoch": 0.1360570161847782, "grad_norm": 1.1008491516113281, "learning_rate": 1.4354503449781914e-05, "loss": 0.1381, "step": 2854 }, { "epoch": 0.1361046885800777, "grad_norm": 3.2490859031677246, "learning_rate": 1.435096710273602e-05, "loss": 0.3514, "step": 2855 }, { "epoch": 0.1361523609753772, "grad_norm": 1.2097240686416626, "learning_rate": 1.4347430084380705e-05, "loss": 0.6101, "step": 2856 }, { "epoch": 0.1362000333706767, "grad_norm": 1.6209847927093506, "learning_rate": 1.4343892395261699e-05, "loss": 0.6709, "step": 2857 }, { "epoch": 0.13624770576597622, "grad_norm": 1.2333697080612183, "learning_rate": 1.434035403592483e-05, "loss": 0.7103, "step": 2858 }, { "epoch": 0.1362953781612757, "grad_norm": 4.612729549407959, "learning_rate": 1.4336815006916032e-05, "loss": 0.4566, "step": 2859 }, { "epoch": 0.13634305055657522, "grad_norm": 0.8605471849441528, "learning_rate": 1.4333275308781338e-05, "loss": 0.5919, "step": 2860 }, { "epoch": 0.1363907229518747, "grad_norm": 1.9653245210647583, "learning_rate": 1.4329734942066889e-05, "loss": 0.6842, "step": 2861 }, { "epoch": 0.13643839534717422, "grad_norm": 3.233825922012329, "learning_rate": 1.4326193907318924e-05, "loss": 0.6059, "step": 2862 }, { "epoch": 0.1364860677424737, "grad_norm": 3.7866153717041016, "learning_rate": 1.432265220508379e-05, "loss": 0.4458, "step": 2863 }, { "epoch": 0.13653374013777322, "grad_norm": 1.7378908395767212, "learning_rate": 1.4319109835907936e-05, "loss": 0.5611, "step": 2864 }, { "epoch": 0.13658141253307274, "grad_norm": 1.5968841314315796, "learning_rate": 1.4315566800337914e-05, "loss": 0.9656, "step": 2865 }, { "epoch": 0.13662908492837222, "grad_norm": 2.901979923248291, "learning_rate": 1.4312023098920374e-05, "loss": 0.6457, "step": 2866 }, { "epoch": 0.13667675732367174, "grad_norm": 2.657661199569702, "learning_rate": 1.430847873220208e-05, "loss": 0.5618, "step": 2867 }, { "epoch": 0.13672442971897122, "grad_norm": 2.0658679008483887, "learning_rate": 1.4304933700729882e-05, "loss": 0.6751, "step": 2868 }, { "epoch": 0.13677210211427074, "grad_norm": 1.0008745193481445, "learning_rate": 1.4301388005050746e-05, "loss": 0.2648, "step": 2869 }, { "epoch": 0.13681977450957022, "grad_norm": 1.9280773401260376, "learning_rate": 1.4297841645711738e-05, "loss": 0.2906, "step": 2870 }, { "epoch": 0.13686744690486974, "grad_norm": 1.3655898571014404, "learning_rate": 1.4294294623260024e-05, "loss": 0.6692, "step": 2871 }, { "epoch": 0.13691511930016922, "grad_norm": 4.982389450073242, "learning_rate": 1.429074693824287e-05, "loss": 0.8454, "step": 2872 }, { "epoch": 0.13696279169546874, "grad_norm": 2.2366626262664795, "learning_rate": 1.428719859120765e-05, "loss": 0.303, "step": 2873 }, { "epoch": 0.13701046409076825, "grad_norm": 3.9326047897338867, "learning_rate": 1.428364958270184e-05, "loss": 0.3676, "step": 2874 }, { "epoch": 0.13705813648606774, "grad_norm": 1.3298144340515137, "learning_rate": 1.428009991327301e-05, "loss": 0.3698, "step": 2875 }, { "epoch": 0.13710580888136725, "grad_norm": 2.202796220779419, "learning_rate": 1.4276549583468842e-05, "loss": 0.5087, "step": 2876 }, { "epoch": 0.13715348127666674, "grad_norm": 6.823620796203613, "learning_rate": 1.4272998593837108e-05, "loss": 0.5663, "step": 2877 }, { "epoch": 0.13720115367196625, "grad_norm": 1.474297046661377, "learning_rate": 1.42694469449257e-05, "loss": 0.8557, "step": 2878 }, { "epoch": 0.13724882606726574, "grad_norm": 2.2157840728759766, "learning_rate": 1.4265894637282594e-05, "loss": 1.0354, "step": 2879 }, { "epoch": 0.13729649846256525, "grad_norm": 1.6916190385818481, "learning_rate": 1.4262341671455873e-05, "loss": 0.8873, "step": 2880 }, { "epoch": 0.13734417085786477, "grad_norm": 1.2153040170669556, "learning_rate": 1.4258788047993726e-05, "loss": 0.822, "step": 2881 }, { "epoch": 0.13739184325316425, "grad_norm": 1.2191141843795776, "learning_rate": 1.4255233767444443e-05, "loss": 0.7823, "step": 2882 }, { "epoch": 0.13743951564846377, "grad_norm": 2.412100315093994, "learning_rate": 1.4251678830356408e-05, "loss": 0.8249, "step": 2883 }, { "epoch": 0.13748718804376325, "grad_norm": 1.3940682411193848, "learning_rate": 1.4248123237278116e-05, "loss": 0.777, "step": 2884 }, { "epoch": 0.13753486043906277, "grad_norm": 0.8108394742012024, "learning_rate": 1.4244566988758152e-05, "loss": 0.3257, "step": 2885 }, { "epoch": 0.13758253283436225, "grad_norm": 1.5383516550064087, "learning_rate": 1.4241010085345216e-05, "loss": 0.806, "step": 2886 }, { "epoch": 0.13763020522966177, "grad_norm": 1.469811201095581, "learning_rate": 1.4237452527588094e-05, "loss": 0.8608, "step": 2887 }, { "epoch": 0.13767787762496125, "grad_norm": 2.5900049209594727, "learning_rate": 1.4233894316035683e-05, "loss": 0.9492, "step": 2888 }, { "epoch": 0.13772555002026077, "grad_norm": 2.3263185024261475, "learning_rate": 1.4230335451236988e-05, "loss": 0.4258, "step": 2889 }, { "epoch": 0.13777322241556028, "grad_norm": 3.216356039047241, "learning_rate": 1.422677593374109e-05, "loss": 0.3797, "step": 2890 }, { "epoch": 0.13782089481085977, "grad_norm": 1.6025211811065674, "learning_rate": 1.4223215764097194e-05, "loss": 0.6416, "step": 2891 }, { "epoch": 0.13786856720615928, "grad_norm": 1.0096967220306396, "learning_rate": 1.4219654942854598e-05, "loss": 0.4615, "step": 2892 }, { "epoch": 0.13791623960145877, "grad_norm": 2.9632620811462402, "learning_rate": 1.4216093470562698e-05, "loss": 0.3381, "step": 2893 }, { "epoch": 0.13796391199675828, "grad_norm": 1.9774034023284912, "learning_rate": 1.4212531347770987e-05, "loss": 0.7165, "step": 2894 }, { "epoch": 0.13801158439205777, "grad_norm": 1.7228517532348633, "learning_rate": 1.4208968575029077e-05, "loss": 0.6596, "step": 2895 }, { "epoch": 0.13805925678735728, "grad_norm": 2.000681161880493, "learning_rate": 1.4205405152886658e-05, "loss": 0.7726, "step": 2896 }, { "epoch": 0.1381069291826568, "grad_norm": 1.7331486940383911, "learning_rate": 1.4201841081893531e-05, "loss": 0.5399, "step": 2897 }, { "epoch": 0.13815460157795628, "grad_norm": 4.172076225280762, "learning_rate": 1.4198276362599597e-05, "loss": 1.0465, "step": 2898 }, { "epoch": 0.1382022739732558, "grad_norm": 1.299048900604248, "learning_rate": 1.4194710995554852e-05, "loss": 0.8047, "step": 2899 }, { "epoch": 0.13824994636855528, "grad_norm": 1.7845855951309204, "learning_rate": 1.4191144981309397e-05, "loss": 0.7289, "step": 2900 }, { "epoch": 0.1382976187638548, "grad_norm": 1.5964035987854004, "learning_rate": 1.4187578320413434e-05, "loss": 0.7735, "step": 2901 }, { "epoch": 0.13834529115915428, "grad_norm": 2.142331123352051, "learning_rate": 1.4184011013417258e-05, "loss": 0.5967, "step": 2902 }, { "epoch": 0.1383929635544538, "grad_norm": 2.6684093475341797, "learning_rate": 1.4180443060871269e-05, "loss": 0.7027, "step": 2903 }, { "epoch": 0.13844063594975328, "grad_norm": 1.0961874723434448, "learning_rate": 1.4176874463325967e-05, "loss": 0.664, "step": 2904 }, { "epoch": 0.1384883083450528, "grad_norm": 2.229717969894409, "learning_rate": 1.4173305221331953e-05, "loss": 0.7163, "step": 2905 }, { "epoch": 0.1385359807403523, "grad_norm": 1.9411121606826782, "learning_rate": 1.4169735335439914e-05, "loss": 0.7307, "step": 2906 }, { "epoch": 0.1385836531356518, "grad_norm": 1.4852358102798462, "learning_rate": 1.4166164806200655e-05, "loss": 0.7537, "step": 2907 }, { "epoch": 0.1386313255309513, "grad_norm": 5.4812092781066895, "learning_rate": 1.416259363416507e-05, "loss": 0.5604, "step": 2908 }, { "epoch": 0.1386789979262508, "grad_norm": 1.5105454921722412, "learning_rate": 1.415902181988415e-05, "loss": 0.876, "step": 2909 }, { "epoch": 0.1387266703215503, "grad_norm": 2.848024845123291, "learning_rate": 1.4155449363908997e-05, "loss": 1.2413, "step": 2910 }, { "epoch": 0.1387743427168498, "grad_norm": 1.7367385625839233, "learning_rate": 1.4151876266790801e-05, "loss": 0.5624, "step": 2911 }, { "epoch": 0.1388220151121493, "grad_norm": 1.502943992614746, "learning_rate": 1.414830252908085e-05, "loss": 0.6341, "step": 2912 }, { "epoch": 0.13886968750744882, "grad_norm": 1.7613787651062012, "learning_rate": 1.414472815133054e-05, "loss": 0.8833, "step": 2913 }, { "epoch": 0.1389173599027483, "grad_norm": 1.4656928777694702, "learning_rate": 1.4141153134091357e-05, "loss": 0.7196, "step": 2914 }, { "epoch": 0.13896503229804782, "grad_norm": 2.8903956413269043, "learning_rate": 1.4137577477914892e-05, "loss": 1.1933, "step": 2915 }, { "epoch": 0.1390127046933473, "grad_norm": 2.127920627593994, "learning_rate": 1.4134001183352833e-05, "loss": 0.6954, "step": 2916 }, { "epoch": 0.13906037708864682, "grad_norm": 3.422004222869873, "learning_rate": 1.4130424250956958e-05, "loss": 0.7146, "step": 2917 }, { "epoch": 0.1391080494839463, "grad_norm": 1.8332690000534058, "learning_rate": 1.4126846681279161e-05, "loss": 0.9339, "step": 2918 }, { "epoch": 0.13915572187924583, "grad_norm": 2.653010368347168, "learning_rate": 1.4123268474871417e-05, "loss": 1.1642, "step": 2919 }, { "epoch": 0.1392033942745453, "grad_norm": 2.307483196258545, "learning_rate": 1.4119689632285812e-05, "loss": 0.8394, "step": 2920 }, { "epoch": 0.13925106666984483, "grad_norm": 1.6198796033859253, "learning_rate": 1.4116110154074518e-05, "loss": 0.862, "step": 2921 }, { "epoch": 0.13929873906514434, "grad_norm": 4.71099853515625, "learning_rate": 1.4112530040789816e-05, "loss": 0.688, "step": 2922 }, { "epoch": 0.13934641146044383, "grad_norm": 3.7185537815093994, "learning_rate": 1.4108949292984077e-05, "loss": 0.9517, "step": 2923 }, { "epoch": 0.13939408385574334, "grad_norm": 2.543074369430542, "learning_rate": 1.410536791120978e-05, "loss": 1.0582, "step": 2924 }, { "epoch": 0.13944175625104283, "grad_norm": 1.5880959033966064, "learning_rate": 1.410178589601949e-05, "loss": 0.9458, "step": 2925 }, { "epoch": 0.13948942864634234, "grad_norm": 1.6787793636322021, "learning_rate": 1.4098203247965876e-05, "loss": 0.7943, "step": 2926 }, { "epoch": 0.13953710104164183, "grad_norm": 1.3269317150115967, "learning_rate": 1.4094619967601707e-05, "loss": 0.711, "step": 2927 }, { "epoch": 0.13958477343694134, "grad_norm": 1.2400355339050293, "learning_rate": 1.409103605547984e-05, "loss": 0.6322, "step": 2928 }, { "epoch": 0.13963244583224085, "grad_norm": 1.814907431602478, "learning_rate": 1.4087451512153241e-05, "loss": 0.6028, "step": 2929 }, { "epoch": 0.13968011822754034, "grad_norm": 2.8301570415496826, "learning_rate": 1.4083866338174964e-05, "loss": 1.2206, "step": 2930 }, { "epoch": 0.13972779062283985, "grad_norm": 5.362226963043213, "learning_rate": 1.4080280534098168e-05, "loss": 0.9202, "step": 2931 }, { "epoch": 0.13977546301813934, "grad_norm": 1.8723117113113403, "learning_rate": 1.4076694100476104e-05, "loss": 0.7996, "step": 2932 }, { "epoch": 0.13982313541343885, "grad_norm": 0.9941101670265198, "learning_rate": 1.4073107037862124e-05, "loss": 0.4938, "step": 2933 }, { "epoch": 0.13987080780873834, "grad_norm": 1.369470477104187, "learning_rate": 1.4069519346809673e-05, "loss": 0.7486, "step": 2934 }, { "epoch": 0.13991848020403785, "grad_norm": 1.4776984453201294, "learning_rate": 1.4065931027872293e-05, "loss": 0.7235, "step": 2935 }, { "epoch": 0.13996615259933734, "grad_norm": 3.2828266620635986, "learning_rate": 1.4062342081603626e-05, "loss": 0.5507, "step": 2936 }, { "epoch": 0.14001382499463685, "grad_norm": 3.0249550342559814, "learning_rate": 1.405875250855741e-05, "loss": 1.2229, "step": 2937 }, { "epoch": 0.14006149738993637, "grad_norm": 3.70796275138855, "learning_rate": 1.4055162309287477e-05, "loss": 0.463, "step": 2938 }, { "epoch": 0.14010916978523585, "grad_norm": 1.8504685163497925, "learning_rate": 1.4051571484347766e-05, "loss": 0.7662, "step": 2939 }, { "epoch": 0.14015684218053537, "grad_norm": 1.9240682125091553, "learning_rate": 1.4047980034292292e-05, "loss": 0.963, "step": 2940 }, { "epoch": 0.14020451457583485, "grad_norm": 1.343772292137146, "learning_rate": 1.4044387959675187e-05, "loss": 0.8986, "step": 2941 }, { "epoch": 0.14025218697113437, "grad_norm": 1.0899327993392944, "learning_rate": 1.4040795261050671e-05, "loss": 0.5131, "step": 2942 }, { "epoch": 0.14029985936643385, "grad_norm": 3.6432085037231445, "learning_rate": 1.4037201938973057e-05, "loss": 0.4439, "step": 2943 }, { "epoch": 0.14034753176173337, "grad_norm": 2.634822368621826, "learning_rate": 1.4033607993996758e-05, "loss": 0.8628, "step": 2944 }, { "epoch": 0.14039520415703288, "grad_norm": 2.0911800861358643, "learning_rate": 1.4030013426676283e-05, "loss": 0.5768, "step": 2945 }, { "epoch": 0.14044287655233237, "grad_norm": 1.9920333623886108, "learning_rate": 1.4026418237566239e-05, "loss": 0.9166, "step": 2946 }, { "epoch": 0.14049054894763188, "grad_norm": 1.877659559249878, "learning_rate": 1.4022822427221325e-05, "loss": 0.8295, "step": 2947 }, { "epoch": 0.14053822134293137, "grad_norm": 2.5049636363983154, "learning_rate": 1.4019225996196335e-05, "loss": 0.4287, "step": 2948 }, { "epoch": 0.14058589373823088, "grad_norm": 2.9660418033599854, "learning_rate": 1.4015628945046169e-05, "loss": 0.5802, "step": 2949 }, { "epoch": 0.14063356613353037, "grad_norm": 2.9606802463531494, "learning_rate": 1.4012031274325808e-05, "loss": 0.5012, "step": 2950 }, { "epoch": 0.14068123852882988, "grad_norm": 3.091475248336792, "learning_rate": 1.4008432984590333e-05, "loss": 1.3912, "step": 2951 }, { "epoch": 0.1407289109241294, "grad_norm": 0.995457649230957, "learning_rate": 1.4004834076394931e-05, "loss": 0.3238, "step": 2952 }, { "epoch": 0.14077658331942888, "grad_norm": 2.928942918777466, "learning_rate": 1.4001234550294873e-05, "loss": 0.1468, "step": 2953 }, { "epoch": 0.1408242557147284, "grad_norm": 2.151156425476074, "learning_rate": 1.3997634406845526e-05, "loss": 0.6447, "step": 2954 }, { "epoch": 0.14087192811002788, "grad_norm": 1.677695631980896, "learning_rate": 1.3994033646602359e-05, "loss": 0.8535, "step": 2955 }, { "epoch": 0.1409196005053274, "grad_norm": 1.9230855703353882, "learning_rate": 1.3990432270120933e-05, "loss": 0.8298, "step": 2956 }, { "epoch": 0.14096727290062688, "grad_norm": 1.4534255266189575, "learning_rate": 1.3986830277956899e-05, "loss": 0.8519, "step": 2957 }, { "epoch": 0.1410149452959264, "grad_norm": 1.0360122919082642, "learning_rate": 1.3983227670666011e-05, "loss": 0.722, "step": 2958 }, { "epoch": 0.14106261769122588, "grad_norm": 1.6863003969192505, "learning_rate": 1.3979624448804112e-05, "loss": 0.7972, "step": 2959 }, { "epoch": 0.1411102900865254, "grad_norm": 1.8406460285186768, "learning_rate": 1.3976020612927141e-05, "loss": 0.6768, "step": 2960 }, { "epoch": 0.1411579624818249, "grad_norm": 2.9295125007629395, "learning_rate": 1.3972416163591138e-05, "loss": 0.3901, "step": 2961 }, { "epoch": 0.1412056348771244, "grad_norm": 1.7305203676223755, "learning_rate": 1.3968811101352226e-05, "loss": 0.5098, "step": 2962 }, { "epoch": 0.1412533072724239, "grad_norm": 1.1533738374710083, "learning_rate": 1.3965205426766632e-05, "loss": 0.5401, "step": 2963 }, { "epoch": 0.1413009796677234, "grad_norm": 1.4826109409332275, "learning_rate": 1.3961599140390675e-05, "loss": 0.6049, "step": 2964 }, { "epoch": 0.1413486520630229, "grad_norm": 1.4037529230117798, "learning_rate": 1.3957992242780768e-05, "loss": 0.6412, "step": 2965 }, { "epoch": 0.1413963244583224, "grad_norm": 1.8146154880523682, "learning_rate": 1.3954384734493418e-05, "loss": 0.864, "step": 2966 }, { "epoch": 0.1414439968536219, "grad_norm": 2.8060736656188965, "learning_rate": 1.3950776616085224e-05, "loss": 0.927, "step": 2967 }, { "epoch": 0.14149166924892143, "grad_norm": 1.497754454612732, "learning_rate": 1.3947167888112882e-05, "loss": 0.9745, "step": 2968 }, { "epoch": 0.1415393416442209, "grad_norm": 1.9746789932250977, "learning_rate": 1.3943558551133186e-05, "loss": 0.927, "step": 2969 }, { "epoch": 0.14158701403952043, "grad_norm": 1.7346903085708618, "learning_rate": 1.3939948605703015e-05, "loss": 0.7102, "step": 2970 }, { "epoch": 0.1416346864348199, "grad_norm": 1.7983261346817017, "learning_rate": 1.393633805237935e-05, "loss": 0.1876, "step": 2971 }, { "epoch": 0.14168235883011943, "grad_norm": 2.2199759483337402, "learning_rate": 1.3932726891719259e-05, "loss": 0.7934, "step": 2972 }, { "epoch": 0.1417300312254189, "grad_norm": 2.119215726852417, "learning_rate": 1.3929115124279906e-05, "loss": 0.9677, "step": 2973 }, { "epoch": 0.14177770362071843, "grad_norm": 1.9493294954299927, "learning_rate": 1.392550275061855e-05, "loss": 0.9217, "step": 2974 }, { "epoch": 0.1418253760160179, "grad_norm": 2.7072625160217285, "learning_rate": 1.3921889771292546e-05, "loss": 0.8318, "step": 2975 }, { "epoch": 0.14187304841131743, "grad_norm": 1.0546387434005737, "learning_rate": 1.391827618685934e-05, "loss": 0.5448, "step": 2976 }, { "epoch": 0.14192072080661694, "grad_norm": 1.4570761919021606, "learning_rate": 1.3914661997876467e-05, "loss": 0.8481, "step": 2977 }, { "epoch": 0.14196839320191643, "grad_norm": 1.6180341243743896, "learning_rate": 1.391104720490156e-05, "loss": 0.8538, "step": 2978 }, { "epoch": 0.14201606559721594, "grad_norm": 2.4710769653320312, "learning_rate": 1.3907431808492348e-05, "loss": 1.0139, "step": 2979 }, { "epoch": 0.14206373799251543, "grad_norm": 1.6567139625549316, "learning_rate": 1.3903815809206646e-05, "loss": 0.7472, "step": 2980 }, { "epoch": 0.14211141038781494, "grad_norm": 5.065958023071289, "learning_rate": 1.3900199207602365e-05, "loss": 0.4898, "step": 2981 }, { "epoch": 0.14215908278311443, "grad_norm": 1.936753511428833, "learning_rate": 1.3896582004237514e-05, "loss": 0.8907, "step": 2982 }, { "epoch": 0.14220675517841394, "grad_norm": 1.5708733797073364, "learning_rate": 1.3892964199670181e-05, "loss": 0.8792, "step": 2983 }, { "epoch": 0.14225442757371345, "grad_norm": 1.586714267730713, "learning_rate": 1.3889345794458563e-05, "loss": 0.5538, "step": 2984 }, { "epoch": 0.14230209996901294, "grad_norm": 0.865074872970581, "learning_rate": 1.3885726789160943e-05, "loss": 0.5777, "step": 2985 }, { "epoch": 0.14234977236431245, "grad_norm": 1.160605549812317, "learning_rate": 1.3882107184335696e-05, "loss": 0.7043, "step": 2986 }, { "epoch": 0.14239744475961194, "grad_norm": 1.7783540487289429, "learning_rate": 1.3878486980541289e-05, "loss": 0.6285, "step": 2987 }, { "epoch": 0.14244511715491145, "grad_norm": 1.6727008819580078, "learning_rate": 1.3874866178336277e-05, "loss": 0.6554, "step": 2988 }, { "epoch": 0.14249278955021094, "grad_norm": 1.6376591920852661, "learning_rate": 1.387124477827932e-05, "loss": 0.4045, "step": 2989 }, { "epoch": 0.14254046194551045, "grad_norm": 4.546367168426514, "learning_rate": 1.386762278092916e-05, "loss": 1.4425, "step": 2990 }, { "epoch": 0.14258813434080994, "grad_norm": 1.674723744392395, "learning_rate": 1.3864000186844631e-05, "loss": 0.9774, "step": 2991 }, { "epoch": 0.14263580673610946, "grad_norm": 1.2198519706726074, "learning_rate": 1.3860376996584667e-05, "loss": 0.8267, "step": 2992 }, { "epoch": 0.14268347913140897, "grad_norm": 1.1878458261489868, "learning_rate": 1.3856753210708288e-05, "loss": 0.7851, "step": 2993 }, { "epoch": 0.14273115152670846, "grad_norm": 1.803183674812317, "learning_rate": 1.3853128829774605e-05, "loss": 0.7295, "step": 2994 }, { "epoch": 0.14277882392200797, "grad_norm": 1.6760066747665405, "learning_rate": 1.3849503854342823e-05, "loss": 0.6454, "step": 2995 }, { "epoch": 0.14282649631730746, "grad_norm": 6.469170093536377, "learning_rate": 1.3845878284972237e-05, "loss": 0.2724, "step": 2996 }, { "epoch": 0.14287416871260697, "grad_norm": 1.1705045700073242, "learning_rate": 1.3842252122222235e-05, "loss": 0.6299, "step": 2997 }, { "epoch": 0.14292184110790646, "grad_norm": 1.2141879796981812, "learning_rate": 1.38386253666523e-05, "loss": 0.7289, "step": 2998 }, { "epoch": 0.14296951350320597, "grad_norm": 2.3483808040618896, "learning_rate": 1.3834998018822004e-05, "loss": 0.4504, "step": 2999 }, { "epoch": 0.14301718589850548, "grad_norm": 1.2169690132141113, "learning_rate": 1.3831370079291002e-05, "loss": 0.7487, "step": 3000 }, { "epoch": 0.14306485829380497, "grad_norm": 2.0098938941955566, "learning_rate": 1.3827741548619054e-05, "loss": 0.6189, "step": 3001 }, { "epoch": 0.14311253068910448, "grad_norm": 1.406715989112854, "learning_rate": 1.3824112427366003e-05, "loss": 0.5196, "step": 3002 }, { "epoch": 0.14316020308440397, "grad_norm": 2.799132823944092, "learning_rate": 1.3820482716091786e-05, "loss": 0.687, "step": 3003 }, { "epoch": 0.14320787547970348, "grad_norm": 1.0308328866958618, "learning_rate": 1.381685241535643e-05, "loss": 0.6581, "step": 3004 }, { "epoch": 0.14325554787500297, "grad_norm": 1.8086446523666382, "learning_rate": 1.381322152572005e-05, "loss": 0.7909, "step": 3005 }, { "epoch": 0.14330322027030248, "grad_norm": 1.650424838066101, "learning_rate": 1.3809590047742858e-05, "loss": 0.8621, "step": 3006 }, { "epoch": 0.14335089266560197, "grad_norm": 2.0094616413116455, "learning_rate": 1.3805957981985154e-05, "loss": 0.5668, "step": 3007 }, { "epoch": 0.14339856506090148, "grad_norm": 1.4018728733062744, "learning_rate": 1.3802325329007324e-05, "loss": 0.9517, "step": 3008 }, { "epoch": 0.143446237456201, "grad_norm": 1.5284672975540161, "learning_rate": 1.3798692089369855e-05, "loss": 0.6185, "step": 3009 }, { "epoch": 0.14349390985150048, "grad_norm": 2.180819034576416, "learning_rate": 1.3795058263633316e-05, "loss": 0.9084, "step": 3010 }, { "epoch": 0.1435415822468, "grad_norm": 1.4202477931976318, "learning_rate": 1.3791423852358365e-05, "loss": 0.299, "step": 3011 }, { "epoch": 0.14358925464209948, "grad_norm": 1.2657815217971802, "learning_rate": 1.3787788856105762e-05, "loss": 0.5537, "step": 3012 }, { "epoch": 0.143636927037399, "grad_norm": 1.7528935670852661, "learning_rate": 1.3784153275436345e-05, "loss": 0.5828, "step": 3013 }, { "epoch": 0.14368459943269848, "grad_norm": 2.6704468727111816, "learning_rate": 1.3780517110911042e-05, "loss": 1.1112, "step": 3014 }, { "epoch": 0.143732271827998, "grad_norm": 1.6846861839294434, "learning_rate": 1.3776880363090883e-05, "loss": 0.8549, "step": 3015 }, { "epoch": 0.1437799442232975, "grad_norm": 1.3927019834518433, "learning_rate": 1.377324303253698e-05, "loss": 0.5709, "step": 3016 }, { "epoch": 0.143827616618597, "grad_norm": 2.5773043632507324, "learning_rate": 1.3769605119810533e-05, "loss": 0.9208, "step": 3017 }, { "epoch": 0.1438752890138965, "grad_norm": 1.5710643529891968, "learning_rate": 1.3765966625472837e-05, "loss": 0.5101, "step": 3018 }, { "epoch": 0.143922961409196, "grad_norm": 1.1312370300292969, "learning_rate": 1.376232755008527e-05, "loss": 0.4123, "step": 3019 }, { "epoch": 0.1439706338044955, "grad_norm": 2.383615493774414, "learning_rate": 1.3758687894209307e-05, "loss": 0.9446, "step": 3020 }, { "epoch": 0.144018306199795, "grad_norm": 1.9989885091781616, "learning_rate": 1.375504765840651e-05, "loss": 0.9089, "step": 3021 }, { "epoch": 0.1440659785950945, "grad_norm": 1.4815373420715332, "learning_rate": 1.3751406843238526e-05, "loss": 0.1729, "step": 3022 }, { "epoch": 0.144113650990394, "grad_norm": 1.3775211572647095, "learning_rate": 1.37477654492671e-05, "loss": 0.9971, "step": 3023 }, { "epoch": 0.1441613233856935, "grad_norm": 1.7749160528182983, "learning_rate": 1.374412347705406e-05, "loss": 0.7421, "step": 3024 }, { "epoch": 0.14420899578099303, "grad_norm": 2.8174102306365967, "learning_rate": 1.3740480927161326e-05, "loss": 1.0027, "step": 3025 }, { "epoch": 0.1442566681762925, "grad_norm": 1.6649245023727417, "learning_rate": 1.3736837800150903e-05, "loss": 0.9248, "step": 3026 }, { "epoch": 0.14430434057159203, "grad_norm": 1.4131121635437012, "learning_rate": 1.373319409658489e-05, "loss": 0.8903, "step": 3027 }, { "epoch": 0.1443520129668915, "grad_norm": 1.7323601245880127, "learning_rate": 1.3729549817025472e-05, "loss": 0.2271, "step": 3028 }, { "epoch": 0.14439968536219103, "grad_norm": 3.0306766033172607, "learning_rate": 1.3725904962034923e-05, "loss": 0.4708, "step": 3029 }, { "epoch": 0.1444473577574905, "grad_norm": 1.7027243375778198, "learning_rate": 1.372225953217561e-05, "loss": 1.089, "step": 3030 }, { "epoch": 0.14449503015279003, "grad_norm": 0.9926291108131409, "learning_rate": 1.3718613528009982e-05, "loss": 0.6185, "step": 3031 }, { "epoch": 0.14454270254808954, "grad_norm": 1.298276662826538, "learning_rate": 1.371496695010058e-05, "loss": 0.6894, "step": 3032 }, { "epoch": 0.14459037494338903, "grad_norm": 4.534313678741455, "learning_rate": 1.3711319799010037e-05, "loss": 0.4234, "step": 3033 }, { "epoch": 0.14463804733868854, "grad_norm": 0.8996132612228394, "learning_rate": 1.3707672075301064e-05, "loss": 0.3286, "step": 3034 }, { "epoch": 0.14468571973398803, "grad_norm": 2.3603134155273438, "learning_rate": 1.3704023779536475e-05, "loss": 0.3578, "step": 3035 }, { "epoch": 0.14473339212928754, "grad_norm": 1.9113951921463013, "learning_rate": 1.3700374912279159e-05, "loss": 0.9303, "step": 3036 }, { "epoch": 0.14478106452458703, "grad_norm": 2.1963863372802734, "learning_rate": 1.3696725474092098e-05, "loss": 1.4539, "step": 3037 }, { "epoch": 0.14482873691988654, "grad_norm": 1.2416272163391113, "learning_rate": 1.369307546553837e-05, "loss": 0.7416, "step": 3038 }, { "epoch": 0.14487640931518603, "grad_norm": 3.0142147541046143, "learning_rate": 1.3689424887181129e-05, "loss": 1.5193, "step": 3039 }, { "epoch": 0.14492408171048554, "grad_norm": 1.4682621955871582, "learning_rate": 1.368577373958362e-05, "loss": 0.8641, "step": 3040 }, { "epoch": 0.14497175410578506, "grad_norm": 0.973953127861023, "learning_rate": 1.3682122023309179e-05, "loss": 0.4439, "step": 3041 }, { "epoch": 0.14501942650108454, "grad_norm": 1.5924018621444702, "learning_rate": 1.3678469738921228e-05, "loss": 0.8522, "step": 3042 }, { "epoch": 0.14506709889638406, "grad_norm": 1.5191891193389893, "learning_rate": 1.3674816886983275e-05, "loss": 0.703, "step": 3043 }, { "epoch": 0.14511477129168354, "grad_norm": 2.1543540954589844, "learning_rate": 1.3671163468058924e-05, "loss": 0.8137, "step": 3044 }, { "epoch": 0.14516244368698306, "grad_norm": 1.0507248640060425, "learning_rate": 1.3667509482711851e-05, "loss": 0.6013, "step": 3045 }, { "epoch": 0.14521011608228254, "grad_norm": 2.3577895164489746, "learning_rate": 1.3663854931505838e-05, "loss": 1.0949, "step": 3046 }, { "epoch": 0.14525778847758206, "grad_norm": 1.5307725667953491, "learning_rate": 1.366019981500474e-05, "loss": 0.539, "step": 3047 }, { "epoch": 0.14530546087288157, "grad_norm": 2.1397852897644043, "learning_rate": 1.3656544133772499e-05, "loss": 0.7505, "step": 3048 }, { "epoch": 0.14535313326818106, "grad_norm": 1.8362572193145752, "learning_rate": 1.3652887888373155e-05, "loss": 0.9348, "step": 3049 }, { "epoch": 0.14540080566348057, "grad_norm": 1.742902159690857, "learning_rate": 1.3649231079370825e-05, "loss": 0.6217, "step": 3050 }, { "epoch": 0.14544847805878006, "grad_norm": 1.7095146179199219, "learning_rate": 1.364557370732972e-05, "loss": 1.163, "step": 3051 }, { "epoch": 0.14549615045407957, "grad_norm": 1.185315489768982, "learning_rate": 1.3641915772814137e-05, "loss": 0.6675, "step": 3052 }, { "epoch": 0.14554382284937906, "grad_norm": 2.307662010192871, "learning_rate": 1.3638257276388454e-05, "loss": 0.584, "step": 3053 }, { "epoch": 0.14559149524467857, "grad_norm": 3.863408327102661, "learning_rate": 1.3634598218617138e-05, "loss": 0.6662, "step": 3054 }, { "epoch": 0.14563916763997808, "grad_norm": 1.374801516532898, "learning_rate": 1.3630938600064748e-05, "loss": 0.7323, "step": 3055 }, { "epoch": 0.14568684003527757, "grad_norm": 1.4017853736877441, "learning_rate": 1.3627278421295925e-05, "loss": 0.4267, "step": 3056 }, { "epoch": 0.14573451243057708, "grad_norm": 1.437345027923584, "learning_rate": 1.362361768287539e-05, "loss": 0.7345, "step": 3057 }, { "epoch": 0.14578218482587657, "grad_norm": 1.4214564561843872, "learning_rate": 1.3619956385367964e-05, "loss": 0.5995, "step": 3058 }, { "epoch": 0.14582985722117608, "grad_norm": 1.473494529724121, "learning_rate": 1.3616294529338547e-05, "loss": 0.6405, "step": 3059 }, { "epoch": 0.14587752961647557, "grad_norm": 2.276869535446167, "learning_rate": 1.3612632115352126e-05, "loss": 0.7383, "step": 3060 }, { "epoch": 0.14592520201177508, "grad_norm": 6.51352071762085, "learning_rate": 1.3608969143973771e-05, "loss": 0.21, "step": 3061 }, { "epoch": 0.14597287440707457, "grad_norm": 1.3052343130111694, "learning_rate": 1.3605305615768645e-05, "loss": 0.8507, "step": 3062 }, { "epoch": 0.14602054680237408, "grad_norm": 2.879122257232666, "learning_rate": 1.3601641531301988e-05, "loss": 1.0844, "step": 3063 }, { "epoch": 0.1460682191976736, "grad_norm": 1.743299961090088, "learning_rate": 1.3597976891139132e-05, "loss": 0.8749, "step": 3064 }, { "epoch": 0.14611589159297309, "grad_norm": 1.2879037857055664, "learning_rate": 1.3594311695845494e-05, "loss": 0.3992, "step": 3065 }, { "epoch": 0.1461635639882726, "grad_norm": 1.800226092338562, "learning_rate": 1.3590645945986577e-05, "loss": 0.6878, "step": 3066 }, { "epoch": 0.14621123638357209, "grad_norm": 4.948254585266113, "learning_rate": 1.3586979642127964e-05, "loss": 1.4434, "step": 3067 }, { "epoch": 0.1462589087788716, "grad_norm": 1.3549526929855347, "learning_rate": 1.3583312784835332e-05, "loss": 0.4869, "step": 3068 }, { "epoch": 0.14630658117417109, "grad_norm": 1.0785163640975952, "learning_rate": 1.3579645374674442e-05, "loss": 0.5656, "step": 3069 }, { "epoch": 0.1463542535694706, "grad_norm": 1.4176609516143799, "learning_rate": 1.3575977412211132e-05, "loss": 0.7809, "step": 3070 }, { "epoch": 0.1464019259647701, "grad_norm": 1.3744181394577026, "learning_rate": 1.3572308898011328e-05, "loss": 0.8812, "step": 3071 }, { "epoch": 0.1464495983600696, "grad_norm": 2.3029258251190186, "learning_rate": 1.3568639832641055e-05, "loss": 0.8421, "step": 3072 }, { "epoch": 0.1464972707553691, "grad_norm": 1.3138843774795532, "learning_rate": 1.3564970216666402e-05, "loss": 0.5224, "step": 3073 }, { "epoch": 0.1465449431506686, "grad_norm": 3.187471389770508, "learning_rate": 1.3561300050653556e-05, "loss": 1.4745, "step": 3074 }, { "epoch": 0.1465926155459681, "grad_norm": 1.805351734161377, "learning_rate": 1.3557629335168789e-05, "loss": 0.5679, "step": 3075 }, { "epoch": 0.1466402879412676, "grad_norm": 1.5123603343963623, "learning_rate": 1.3553958070778452e-05, "loss": 0.83, "step": 3076 }, { "epoch": 0.1466879603365671, "grad_norm": 2.233900785446167, "learning_rate": 1.3550286258048984e-05, "loss": 0.4014, "step": 3077 }, { "epoch": 0.1467356327318666, "grad_norm": 1.0178353786468506, "learning_rate": 1.3546613897546905e-05, "loss": 0.6774, "step": 3078 }, { "epoch": 0.1467833051271661, "grad_norm": 0.8925381898880005, "learning_rate": 1.3542940989838824e-05, "loss": 0.4033, "step": 3079 }, { "epoch": 0.14683097752246563, "grad_norm": 1.7636933326721191, "learning_rate": 1.3539267535491436e-05, "loss": 1.0362, "step": 3080 }, { "epoch": 0.14687864991776511, "grad_norm": 2.7844808101654053, "learning_rate": 1.3535593535071515e-05, "loss": 0.8824, "step": 3081 }, { "epoch": 0.14692632231306463, "grad_norm": 1.1747113466262817, "learning_rate": 1.3531918989145919e-05, "loss": 0.7971, "step": 3082 }, { "epoch": 0.14697399470836411, "grad_norm": 1.5873674154281616, "learning_rate": 1.3528243898281595e-05, "loss": 0.591, "step": 3083 }, { "epoch": 0.14702166710366363, "grad_norm": 2.503666877746582, "learning_rate": 1.3524568263045572e-05, "loss": 1.0975, "step": 3084 }, { "epoch": 0.14706933949896311, "grad_norm": 1.4579899311065674, "learning_rate": 1.3520892084004961e-05, "loss": 0.746, "step": 3085 }, { "epoch": 0.14711701189426263, "grad_norm": 2.0907604694366455, "learning_rate": 1.3517215361726963e-05, "loss": 0.5951, "step": 3086 }, { "epoch": 0.14716468428956214, "grad_norm": 2.5679707527160645, "learning_rate": 1.3513538096778853e-05, "loss": 0.2363, "step": 3087 }, { "epoch": 0.14721235668486163, "grad_norm": 1.765938401222229, "learning_rate": 1.3509860289727994e-05, "loss": 0.6744, "step": 3088 }, { "epoch": 0.14726002908016114, "grad_norm": 1.4003573656082153, "learning_rate": 1.350618194114184e-05, "loss": 0.8043, "step": 3089 }, { "epoch": 0.14730770147546063, "grad_norm": 1.256507396697998, "learning_rate": 1.3502503051587921e-05, "loss": 0.4145, "step": 3090 }, { "epoch": 0.14735537387076014, "grad_norm": 1.7833360433578491, "learning_rate": 1.3498823621633848e-05, "loss": 0.6234, "step": 3091 }, { "epoch": 0.14740304626605963, "grad_norm": 6.889230251312256, "learning_rate": 1.349514365184732e-05, "loss": 0.4359, "step": 3092 }, { "epoch": 0.14745071866135914, "grad_norm": 1.4543510675430298, "learning_rate": 1.3491463142796121e-05, "loss": 0.866, "step": 3093 }, { "epoch": 0.14749839105665863, "grad_norm": 1.1712321043014526, "learning_rate": 1.3487782095048112e-05, "loss": 0.7195, "step": 3094 }, { "epoch": 0.14754606345195814, "grad_norm": 1.9908615350723267, "learning_rate": 1.3484100509171246e-05, "loss": 0.5969, "step": 3095 }, { "epoch": 0.14759373584725766, "grad_norm": 1.7359424829483032, "learning_rate": 1.3480418385733549e-05, "loss": 0.6248, "step": 3096 }, { "epoch": 0.14764140824255714, "grad_norm": 1.045398235321045, "learning_rate": 1.3476735725303134e-05, "loss": 0.437, "step": 3097 }, { "epoch": 0.14768908063785666, "grad_norm": 2.1756820678710938, "learning_rate": 1.3473052528448203e-05, "loss": 0.6527, "step": 3098 }, { "epoch": 0.14773675303315614, "grad_norm": 2.6632003784179688, "learning_rate": 1.3469368795737033e-05, "loss": 1.2887, "step": 3099 }, { "epoch": 0.14778442542845566, "grad_norm": 1.1401606798171997, "learning_rate": 1.3465684527737986e-05, "loss": 0.6396, "step": 3100 }, { "epoch": 0.14783209782375514, "grad_norm": 1.0559873580932617, "learning_rate": 1.3461999725019506e-05, "loss": 0.8043, "step": 3101 }, { "epoch": 0.14787977021905466, "grad_norm": 1.2660080194473267, "learning_rate": 1.3458314388150115e-05, "loss": 0.645, "step": 3102 }, { "epoch": 0.14792744261435417, "grad_norm": 8.582117080688477, "learning_rate": 1.3454628517698431e-05, "loss": 0.4915, "step": 3103 }, { "epoch": 0.14797511500965366, "grad_norm": 1.8201041221618652, "learning_rate": 1.3450942114233145e-05, "loss": 0.8148, "step": 3104 }, { "epoch": 0.14802278740495317, "grad_norm": 1.4995503425598145, "learning_rate": 1.3447255178323025e-05, "loss": 0.3414, "step": 3105 }, { "epoch": 0.14807045980025266, "grad_norm": 1.8104441165924072, "learning_rate": 1.3443567710536931e-05, "loss": 0.4537, "step": 3106 }, { "epoch": 0.14811813219555217, "grad_norm": 1.6128402948379517, "learning_rate": 1.3439879711443807e-05, "loss": 0.5302, "step": 3107 }, { "epoch": 0.14816580459085166, "grad_norm": 1.4941962957382202, "learning_rate": 1.3436191181612662e-05, "loss": 0.7592, "step": 3108 }, { "epoch": 0.14821347698615117, "grad_norm": 1.7309041023254395, "learning_rate": 1.3432502121612602e-05, "loss": 0.8093, "step": 3109 }, { "epoch": 0.14826114938145066, "grad_norm": 2.0706653594970703, "learning_rate": 1.3428812532012816e-05, "loss": 0.8846, "step": 3110 }, { "epoch": 0.14830882177675017, "grad_norm": 2.691295862197876, "learning_rate": 1.3425122413382563e-05, "loss": 1.3084, "step": 3111 }, { "epoch": 0.14835649417204969, "grad_norm": 1.1625548601150513, "learning_rate": 1.3421431766291198e-05, "loss": 0.6071, "step": 3112 }, { "epoch": 0.14840416656734917, "grad_norm": 1.321666955947876, "learning_rate": 1.3417740591308142e-05, "loss": 0.5195, "step": 3113 }, { "epoch": 0.14845183896264869, "grad_norm": 2.304530620574951, "learning_rate": 1.341404888900291e-05, "loss": 0.6536, "step": 3114 }, { "epoch": 0.14849951135794817, "grad_norm": 1.2831493616104126, "learning_rate": 1.3410356659945095e-05, "loss": 0.6212, "step": 3115 }, { "epoch": 0.14854718375324769, "grad_norm": 1.8855429887771606, "learning_rate": 1.3406663904704362e-05, "loss": 0.6279, "step": 3116 }, { "epoch": 0.14859485614854717, "grad_norm": 2.2084779739379883, "learning_rate": 1.3402970623850474e-05, "loss": 0.335, "step": 3117 }, { "epoch": 0.14864252854384669, "grad_norm": 2.807931423187256, "learning_rate": 1.339927681795326e-05, "loss": 1.179, "step": 3118 }, { "epoch": 0.1486902009391462, "grad_norm": 1.6841225624084473, "learning_rate": 1.3395582487582639e-05, "loss": 0.4703, "step": 3119 }, { "epoch": 0.14873787333444569, "grad_norm": 1.979477047920227, "learning_rate": 1.3391887633308609e-05, "loss": 0.8423, "step": 3120 }, { "epoch": 0.1487855457297452, "grad_norm": 3.011033535003662, "learning_rate": 1.3388192255701249e-05, "loss": 1.3704, "step": 3121 }, { "epoch": 0.1488332181250447, "grad_norm": 5.905727386474609, "learning_rate": 1.3384496355330714e-05, "loss": 0.381, "step": 3122 }, { "epoch": 0.1488808905203442, "grad_norm": 4.945913314819336, "learning_rate": 1.3380799932767243e-05, "loss": 1.3545, "step": 3123 }, { "epoch": 0.1489285629156437, "grad_norm": 1.2848289012908936, "learning_rate": 1.3377102988581162e-05, "loss": 0.6009, "step": 3124 }, { "epoch": 0.1489762353109432, "grad_norm": 1.3738328218460083, "learning_rate": 1.3373405523342862e-05, "loss": 0.6994, "step": 3125 }, { "epoch": 0.1490239077062427, "grad_norm": 1.4116965532302856, "learning_rate": 1.336970753762283e-05, "loss": 0.7592, "step": 3126 }, { "epoch": 0.1490715801015422, "grad_norm": 1.0781294107437134, "learning_rate": 1.336600903199163e-05, "loss": 0.2557, "step": 3127 }, { "epoch": 0.14911925249684171, "grad_norm": 1.3575353622436523, "learning_rate": 1.3362310007019897e-05, "loss": 0.4058, "step": 3128 }, { "epoch": 0.1491669248921412, "grad_norm": 1.770020842552185, "learning_rate": 1.3358610463278357e-05, "loss": 0.1847, "step": 3129 }, { "epoch": 0.14921459728744071, "grad_norm": 1.2070060968399048, "learning_rate": 1.335491040133781e-05, "loss": 0.7112, "step": 3130 }, { "epoch": 0.1492622696827402, "grad_norm": 1.2136505842208862, "learning_rate": 1.335120982176913e-05, "loss": 0.8173, "step": 3131 }, { "epoch": 0.14930994207803971, "grad_norm": 1.3908705711364746, "learning_rate": 1.3347508725143292e-05, "loss": 0.781, "step": 3132 }, { "epoch": 0.1493576144733392, "grad_norm": 1.0264910459518433, "learning_rate": 1.3343807112031329e-05, "loss": 0.7093, "step": 3133 }, { "epoch": 0.14940528686863871, "grad_norm": 1.4023420810699463, "learning_rate": 1.3340104983004363e-05, "loss": 0.6146, "step": 3134 }, { "epoch": 0.14945295926393823, "grad_norm": 1.5430750846862793, "learning_rate": 1.3336402338633593e-05, "loss": 1.0919, "step": 3135 }, { "epoch": 0.14950063165923771, "grad_norm": 2.278820753097534, "learning_rate": 1.3332699179490302e-05, "loss": 0.8825, "step": 3136 }, { "epoch": 0.14954830405453723, "grad_norm": 2.868929624557495, "learning_rate": 1.3328995506145849e-05, "loss": 0.9877, "step": 3137 }, { "epoch": 0.14959597644983672, "grad_norm": 1.2752020359039307, "learning_rate": 1.3325291319171669e-05, "loss": 0.6129, "step": 3138 }, { "epoch": 0.14964364884513623, "grad_norm": 4.130411148071289, "learning_rate": 1.3321586619139285e-05, "loss": 0.4079, "step": 3139 }, { "epoch": 0.14969132124043572, "grad_norm": 2.5877163410186768, "learning_rate": 1.3317881406620287e-05, "loss": 1.01, "step": 3140 }, { "epoch": 0.14973899363573523, "grad_norm": 1.3374658823013306, "learning_rate": 1.3314175682186358e-05, "loss": 0.6646, "step": 3141 }, { "epoch": 0.14978666603103474, "grad_norm": 1.1594607830047607, "learning_rate": 1.3310469446409251e-05, "loss": 0.4802, "step": 3142 }, { "epoch": 0.14983433842633423, "grad_norm": 0.9576970934867859, "learning_rate": 1.33067626998608e-05, "loss": 0.4485, "step": 3143 }, { "epoch": 0.14988201082163374, "grad_norm": 1.985435962677002, "learning_rate": 1.3303055443112918e-05, "loss": 0.2718, "step": 3144 }, { "epoch": 0.14992968321693323, "grad_norm": 1.419874906539917, "learning_rate": 1.3299347676737595e-05, "loss": 0.7879, "step": 3145 }, { "epoch": 0.14997735561223274, "grad_norm": 2.0171825885772705, "learning_rate": 1.32956394013069e-05, "loss": 0.9661, "step": 3146 }, { "epoch": 0.15002502800753223, "grad_norm": 1.5258233547210693, "learning_rate": 1.329193061739299e-05, "loss": 0.786, "step": 3147 }, { "epoch": 0.15007270040283174, "grad_norm": 1.2240711450576782, "learning_rate": 1.328822132556808e-05, "loss": 0.7653, "step": 3148 }, { "epoch": 0.15012037279813123, "grad_norm": 1.9582189321517944, "learning_rate": 1.3284511526404485e-05, "loss": 0.2899, "step": 3149 }, { "epoch": 0.15016804519343074, "grad_norm": 2.007537364959717, "learning_rate": 1.3280801220474585e-05, "loss": 0.9079, "step": 3150 }, { "epoch": 0.15021571758873026, "grad_norm": 1.0368845462799072, "learning_rate": 1.3277090408350841e-05, "loss": 0.6406, "step": 3151 }, { "epoch": 0.15026338998402974, "grad_norm": 1.4237940311431885, "learning_rate": 1.3273379090605796e-05, "loss": 0.5644, "step": 3152 }, { "epoch": 0.15031106237932926, "grad_norm": 1.3777469396591187, "learning_rate": 1.3269667267812066e-05, "loss": 0.6178, "step": 3153 }, { "epoch": 0.15035873477462874, "grad_norm": 3.5124101638793945, "learning_rate": 1.3265954940542344e-05, "loss": 0.5717, "step": 3154 }, { "epoch": 0.15040640716992826, "grad_norm": 1.1081418991088867, "learning_rate": 1.3262242109369412e-05, "loss": 0.4262, "step": 3155 }, { "epoch": 0.15045407956522774, "grad_norm": 1.704636812210083, "learning_rate": 1.3258528774866115e-05, "loss": 1.1554, "step": 3156 }, { "epoch": 0.15050175196052726, "grad_norm": 1.464885950088501, "learning_rate": 1.3254814937605385e-05, "loss": 0.621, "step": 3157 }, { "epoch": 0.15054942435582677, "grad_norm": 2.538759708404541, "learning_rate": 1.325110059816023e-05, "loss": 1.2667, "step": 3158 }, { "epoch": 0.15059709675112626, "grad_norm": 4.7555131912231445, "learning_rate": 1.324738575710373e-05, "loss": 0.4244, "step": 3159 }, { "epoch": 0.15064476914642577, "grad_norm": 1.7561062574386597, "learning_rate": 1.324367041500905e-05, "loss": 0.982, "step": 3160 }, { "epoch": 0.15069244154172526, "grad_norm": 1.8753656148910522, "learning_rate": 1.323995457244943e-05, "loss": 0.5967, "step": 3161 }, { "epoch": 0.15074011393702477, "grad_norm": 1.176792025566101, "learning_rate": 1.3236238229998181e-05, "loss": 0.333, "step": 3162 }, { "epoch": 0.15078778633232426, "grad_norm": 23.347854614257812, "learning_rate": 1.3232521388228703e-05, "loss": 0.5478, "step": 3163 }, { "epoch": 0.15083545872762377, "grad_norm": 2.1824772357940674, "learning_rate": 1.3228804047714462e-05, "loss": 0.6945, "step": 3164 }, { "epoch": 0.15088313112292326, "grad_norm": 1.1818346977233887, "learning_rate": 1.3225086209029008e-05, "loss": 0.5874, "step": 3165 }, { "epoch": 0.15093080351822277, "grad_norm": 2.823831796646118, "learning_rate": 1.3221367872745962e-05, "loss": 1.05, "step": 3166 }, { "epoch": 0.1509784759135223, "grad_norm": 1.5577776432037354, "learning_rate": 1.321764903943903e-05, "loss": 0.9005, "step": 3167 }, { "epoch": 0.15102614830882177, "grad_norm": 2.868621587753296, "learning_rate": 1.3213929709681986e-05, "loss": 0.7395, "step": 3168 }, { "epoch": 0.1510738207041213, "grad_norm": 3.710958242416382, "learning_rate": 1.321020988404868e-05, "loss": 1.5319, "step": 3169 }, { "epoch": 0.15112149309942077, "grad_norm": 1.1518117189407349, "learning_rate": 1.3206489563113054e-05, "loss": 0.1914, "step": 3170 }, { "epoch": 0.1511691654947203, "grad_norm": 2.3093178272247314, "learning_rate": 1.3202768747449104e-05, "loss": 0.776, "step": 3171 }, { "epoch": 0.15121683789001977, "grad_norm": 2.6352174282073975, "learning_rate": 1.3199047437630921e-05, "loss": 0.9145, "step": 3172 }, { "epoch": 0.1512645102853193, "grad_norm": 1.613187551498413, "learning_rate": 1.3195325634232662e-05, "loss": 0.5873, "step": 3173 }, { "epoch": 0.1513121826806188, "grad_norm": 1.3262912034988403, "learning_rate": 1.3191603337828563e-05, "loss": 0.7749, "step": 3174 }, { "epoch": 0.1513598550759183, "grad_norm": 1.0240951776504517, "learning_rate": 1.3187880548992937e-05, "loss": 0.586, "step": 3175 }, { "epoch": 0.1514075274712178, "grad_norm": 27.738183975219727, "learning_rate": 1.3184157268300168e-05, "loss": 0.6833, "step": 3176 }, { "epoch": 0.1514551998665173, "grad_norm": 2.284761905670166, "learning_rate": 1.3180433496324724e-05, "loss": 0.497, "step": 3177 }, { "epoch": 0.1515028722618168, "grad_norm": 2.069669008255005, "learning_rate": 1.3176709233641147e-05, "loss": 0.596, "step": 3178 }, { "epoch": 0.1515505446571163, "grad_norm": 1.770541787147522, "learning_rate": 1.3172984480824045e-05, "loss": 1.0247, "step": 3179 }, { "epoch": 0.1515982170524158, "grad_norm": 1.1753355264663696, "learning_rate": 1.3169259238448115e-05, "loss": 0.7585, "step": 3180 }, { "epoch": 0.1516458894477153, "grad_norm": 1.3008493185043335, "learning_rate": 1.3165533507088122e-05, "loss": 0.7156, "step": 3181 }, { "epoch": 0.1516935618430148, "grad_norm": 3.1839733123779297, "learning_rate": 1.3161807287318906e-05, "loss": 0.5957, "step": 3182 }, { "epoch": 0.15174123423831432, "grad_norm": 1.2422832250595093, "learning_rate": 1.3158080579715389e-05, "loss": 0.8522, "step": 3183 }, { "epoch": 0.1517889066336138, "grad_norm": 1.483921766281128, "learning_rate": 1.3154353384852559e-05, "loss": 0.64, "step": 3184 }, { "epoch": 0.15183657902891332, "grad_norm": 1.994581937789917, "learning_rate": 1.315062570330548e-05, "loss": 0.6769, "step": 3185 }, { "epoch": 0.1518842514242128, "grad_norm": 2.5788888931274414, "learning_rate": 1.3146897535649305e-05, "loss": 1.1391, "step": 3186 }, { "epoch": 0.15193192381951232, "grad_norm": 1.5630767345428467, "learning_rate": 1.3143168882459247e-05, "loss": 0.708, "step": 3187 }, { "epoch": 0.1519795962148118, "grad_norm": 2.3303139209747314, "learning_rate": 1.3139439744310599e-05, "loss": 1.1927, "step": 3188 }, { "epoch": 0.15202726861011132, "grad_norm": 2.9897749423980713, "learning_rate": 1.3135710121778729e-05, "loss": 1.737, "step": 3189 }, { "epoch": 0.15207494100541083, "grad_norm": 1.2306307554244995, "learning_rate": 1.3131980015439079e-05, "loss": 1.0152, "step": 3190 }, { "epoch": 0.15212261340071032, "grad_norm": 2.98582124710083, "learning_rate": 1.3128249425867161e-05, "loss": 1.3723, "step": 3191 }, { "epoch": 0.15217028579600983, "grad_norm": 1.4394237995147705, "learning_rate": 1.3124518353638575e-05, "loss": 0.8075, "step": 3192 }, { "epoch": 0.15221795819130932, "grad_norm": 1.3103983402252197, "learning_rate": 1.3120786799328982e-05, "loss": 0.7079, "step": 3193 }, { "epoch": 0.15226563058660883, "grad_norm": 1.8622562885284424, "learning_rate": 1.3117054763514126e-05, "loss": 1.0342, "step": 3194 }, { "epoch": 0.15231330298190832, "grad_norm": 3.9945762157440186, "learning_rate": 1.3113322246769817e-05, "loss": 0.9274, "step": 3195 }, { "epoch": 0.15236097537720783, "grad_norm": 2.3655645847320557, "learning_rate": 1.3109589249671947e-05, "loss": 0.7874, "step": 3196 }, { "epoch": 0.15240864777250732, "grad_norm": 4.2475128173828125, "learning_rate": 1.3105855772796482e-05, "loss": 1.1561, "step": 3197 }, { "epoch": 0.15245632016780683, "grad_norm": 1.5746185779571533, "learning_rate": 1.3102121816719453e-05, "loss": 0.9835, "step": 3198 }, { "epoch": 0.15250399256310634, "grad_norm": 1.5996003150939941, "learning_rate": 1.3098387382016971e-05, "loss": 0.2285, "step": 3199 }, { "epoch": 0.15255166495840583, "grad_norm": 1.7315154075622559, "learning_rate": 1.3094652469265225e-05, "loss": 0.6511, "step": 3200 }, { "epoch": 0.15259933735370534, "grad_norm": 3.788540840148926, "learning_rate": 1.309091707904047e-05, "loss": 0.9884, "step": 3201 }, { "epoch": 0.15264700974900483, "grad_norm": 1.3468878269195557, "learning_rate": 1.3087181211919043e-05, "loss": 0.7516, "step": 3202 }, { "epoch": 0.15269468214430434, "grad_norm": 1.5442324876785278, "learning_rate": 1.3083444868477344e-05, "loss": 0.6242, "step": 3203 }, { "epoch": 0.15274235453960383, "grad_norm": 2.502856731414795, "learning_rate": 1.3079708049291857e-05, "loss": 0.4055, "step": 3204 }, { "epoch": 0.15279002693490334, "grad_norm": 5.897099018096924, "learning_rate": 1.3075970754939134e-05, "loss": 0.6836, "step": 3205 }, { "epoch": 0.15283769933020286, "grad_norm": 1.1400001049041748, "learning_rate": 1.3072232985995798e-05, "loss": 0.6967, "step": 3206 }, { "epoch": 0.15288537172550234, "grad_norm": 1.0584015846252441, "learning_rate": 1.306849474303855e-05, "loss": 0.6489, "step": 3207 }, { "epoch": 0.15293304412080186, "grad_norm": 4.950252056121826, "learning_rate": 1.306475602664416e-05, "loss": 1.0647, "step": 3208 }, { "epoch": 0.15298071651610134, "grad_norm": 1.6873407363891602, "learning_rate": 1.3061016837389482e-05, "loss": 0.5319, "step": 3209 }, { "epoch": 0.15302838891140086, "grad_norm": 1.234108805656433, "learning_rate": 1.3057277175851426e-05, "loss": 0.5588, "step": 3210 }, { "epoch": 0.15307606130670035, "grad_norm": 3.5063700675964355, "learning_rate": 1.3053537042606985e-05, "loss": 1.1597, "step": 3211 }, { "epoch": 0.15312373370199986, "grad_norm": 1.8462144136428833, "learning_rate": 1.3049796438233225e-05, "loss": 0.6949, "step": 3212 }, { "epoch": 0.15317140609729935, "grad_norm": 1.3435649871826172, "learning_rate": 1.3046055363307277e-05, "loss": 0.737, "step": 3213 }, { "epoch": 0.15321907849259886, "grad_norm": 0.9850499033927917, "learning_rate": 1.3042313818406359e-05, "loss": 0.6589, "step": 3214 }, { "epoch": 0.15326675088789837, "grad_norm": 2.4665281772613525, "learning_rate": 1.3038571804107747e-05, "loss": 0.5102, "step": 3215 }, { "epoch": 0.15331442328319786, "grad_norm": 2.578758716583252, "learning_rate": 1.3034829320988796e-05, "loss": 0.5918, "step": 3216 }, { "epoch": 0.15336209567849737, "grad_norm": 2.080080986022949, "learning_rate": 1.3031086369626934e-05, "loss": 0.7601, "step": 3217 }, { "epoch": 0.15340976807379686, "grad_norm": 1.7336410284042358, "learning_rate": 1.302734295059966e-05, "loss": 0.4842, "step": 3218 }, { "epoch": 0.15345744046909637, "grad_norm": 2.311845302581787, "learning_rate": 1.3023599064484546e-05, "loss": 0.8222, "step": 3219 }, { "epoch": 0.15350511286439586, "grad_norm": 6.7187299728393555, "learning_rate": 1.3019854711859233e-05, "loss": 0.4773, "step": 3220 }, { "epoch": 0.15355278525969537, "grad_norm": 3.3655238151550293, "learning_rate": 1.3016109893301434e-05, "loss": 0.5521, "step": 3221 }, { "epoch": 0.1536004576549949, "grad_norm": 3.1471986770629883, "learning_rate": 1.3012364609388939e-05, "loss": 0.9256, "step": 3222 }, { "epoch": 0.15364813005029437, "grad_norm": 2.384690999984741, "learning_rate": 1.3008618860699607e-05, "loss": 0.5345, "step": 3223 }, { "epoch": 0.1536958024455939, "grad_norm": 1.6492693424224854, "learning_rate": 1.3004872647811365e-05, "loss": 0.7447, "step": 3224 }, { "epoch": 0.15374347484089337, "grad_norm": 1.4576164484024048, "learning_rate": 1.300112597130222e-05, "loss": 0.7731, "step": 3225 }, { "epoch": 0.1537911472361929, "grad_norm": 2.039154052734375, "learning_rate": 1.2997378831750242e-05, "loss": 0.7086, "step": 3226 }, { "epoch": 0.15383881963149237, "grad_norm": 2.779320001602173, "learning_rate": 1.2993631229733584e-05, "loss": 1.0048, "step": 3227 }, { "epoch": 0.1538864920267919, "grad_norm": 3.1010067462921143, "learning_rate": 1.2989883165830448e-05, "loss": 0.9285, "step": 3228 }, { "epoch": 0.1539341644220914, "grad_norm": 3.0224592685699463, "learning_rate": 1.298613464061913e-05, "loss": 0.8421, "step": 3229 }, { "epoch": 0.1539818368173909, "grad_norm": 1.478475570678711, "learning_rate": 1.2982385654677989e-05, "loss": 0.5191, "step": 3230 }, { "epoch": 0.1540295092126904, "grad_norm": 2.0774331092834473, "learning_rate": 1.2978636208585456e-05, "loss": 0.8923, "step": 3231 }, { "epoch": 0.1540771816079899, "grad_norm": 1.5826398134231567, "learning_rate": 1.2974886302920029e-05, "loss": 0.7858, "step": 3232 }, { "epoch": 0.1541248540032894, "grad_norm": 2.285444498062134, "learning_rate": 1.297113593826028e-05, "loss": 0.7862, "step": 3233 }, { "epoch": 0.1541725263985889, "grad_norm": 1.3336167335510254, "learning_rate": 1.2967385115184854e-05, "loss": 0.5737, "step": 3234 }, { "epoch": 0.1542201987938884, "grad_norm": 1.879441738128662, "learning_rate": 1.2963633834272463e-05, "loss": 0.6908, "step": 3235 }, { "epoch": 0.1542678711891879, "grad_norm": 1.8980261087417603, "learning_rate": 1.2959882096101888e-05, "loss": 0.7223, "step": 3236 }, { "epoch": 0.1543155435844874, "grad_norm": 1.6889532804489136, "learning_rate": 1.2956129901251988e-05, "loss": 0.7277, "step": 3237 }, { "epoch": 0.15436321597978692, "grad_norm": 2.410881996154785, "learning_rate": 1.2952377250301689e-05, "loss": 1.2017, "step": 3238 }, { "epoch": 0.1544108883750864, "grad_norm": 4.254683971405029, "learning_rate": 1.294862414382998e-05, "loss": 0.4348, "step": 3239 }, { "epoch": 0.15445856077038592, "grad_norm": 1.7135064601898193, "learning_rate": 1.2944870582415931e-05, "loss": 0.6869, "step": 3240 }, { "epoch": 0.1545062331656854, "grad_norm": 2.102322578430176, "learning_rate": 1.2941116566638681e-05, "loss": 0.8207, "step": 3241 }, { "epoch": 0.15455390556098492, "grad_norm": 1.275256872177124, "learning_rate": 1.293736209707743e-05, "loss": 0.6431, "step": 3242 }, { "epoch": 0.1546015779562844, "grad_norm": 4.420902729034424, "learning_rate": 1.2933607174311458e-05, "loss": 1.0482, "step": 3243 }, { "epoch": 0.15464925035158392, "grad_norm": 1.0623613595962524, "learning_rate": 1.2929851798920108e-05, "loss": 0.7346, "step": 3244 }, { "epoch": 0.15469692274688343, "grad_norm": 6.680132865905762, "learning_rate": 1.2926095971482795e-05, "loss": 1.9411, "step": 3245 }, { "epoch": 0.15474459514218292, "grad_norm": 1.8034237623214722, "learning_rate": 1.2922339692579008e-05, "loss": 0.8158, "step": 3246 }, { "epoch": 0.15479226753748243, "grad_norm": 1.094099521636963, "learning_rate": 1.2918582962788301e-05, "loss": 0.722, "step": 3247 }, { "epoch": 0.15483993993278192, "grad_norm": 2.1104376316070557, "learning_rate": 1.2914825782690299e-05, "loss": 1.0414, "step": 3248 }, { "epoch": 0.15488761232808143, "grad_norm": 1.5353261232376099, "learning_rate": 1.2911068152864697e-05, "loss": 0.3202, "step": 3249 }, { "epoch": 0.15493528472338092, "grad_norm": 1.695068359375, "learning_rate": 1.2907310073891255e-05, "loss": 0.8314, "step": 3250 }, { "epoch": 0.15498295711868043, "grad_norm": 2.4813592433929443, "learning_rate": 1.2903551546349809e-05, "loss": 0.8277, "step": 3251 }, { "epoch": 0.15503062951397992, "grad_norm": 2.1663858890533447, "learning_rate": 1.289979257082026e-05, "loss": 0.585, "step": 3252 }, { "epoch": 0.15507830190927943, "grad_norm": 2.4127964973449707, "learning_rate": 1.2896033147882576e-05, "loss": 0.863, "step": 3253 }, { "epoch": 0.15512597430457895, "grad_norm": 2.1186890602111816, "learning_rate": 1.2892273278116805e-05, "loss": 0.2626, "step": 3254 }, { "epoch": 0.15517364669987843, "grad_norm": 1.9335317611694336, "learning_rate": 1.288851296210305e-05, "loss": 0.8851, "step": 3255 }, { "epoch": 0.15522131909517795, "grad_norm": 0.9816529750823975, "learning_rate": 1.2884752200421493e-05, "loss": 0.5099, "step": 3256 }, { "epoch": 0.15526899149047743, "grad_norm": 3.36718487739563, "learning_rate": 1.2880990993652379e-05, "loss": 1.0508, "step": 3257 }, { "epoch": 0.15531666388577695, "grad_norm": 1.9627264738082886, "learning_rate": 1.287722934237602e-05, "loss": 1.0297, "step": 3258 }, { "epoch": 0.15536433628107643, "grad_norm": 2.541930675506592, "learning_rate": 1.2873467247172804e-05, "loss": 0.9226, "step": 3259 }, { "epoch": 0.15541200867637595, "grad_norm": 4.7642645835876465, "learning_rate": 1.2869704708623184e-05, "loss": 0.366, "step": 3260 }, { "epoch": 0.15545968107167546, "grad_norm": 2.4338111877441406, "learning_rate": 1.286594172730768e-05, "loss": 1.0849, "step": 3261 }, { "epoch": 0.15550735346697495, "grad_norm": 1.6886451244354248, "learning_rate": 1.2862178303806878e-05, "loss": 0.5081, "step": 3262 }, { "epoch": 0.15555502586227446, "grad_norm": 1.4763084650039673, "learning_rate": 1.285841443870144e-05, "loss": 0.9544, "step": 3263 }, { "epoch": 0.15560269825757395, "grad_norm": 3.021918773651123, "learning_rate": 1.285465013257209e-05, "loss": 0.8274, "step": 3264 }, { "epoch": 0.15565037065287346, "grad_norm": 1.5202257633209229, "learning_rate": 1.2850885385999626e-05, "loss": 0.6118, "step": 3265 }, { "epoch": 0.15569804304817295, "grad_norm": 1.6691006422042847, "learning_rate": 1.28471201995649e-05, "loss": 1.005, "step": 3266 }, { "epoch": 0.15574571544347246, "grad_norm": 2.1328299045562744, "learning_rate": 1.2843354573848849e-05, "loss": 0.799, "step": 3267 }, { "epoch": 0.15579338783877195, "grad_norm": 1.5241819620132446, "learning_rate": 1.2839588509432466e-05, "loss": 0.6374, "step": 3268 }, { "epoch": 0.15584106023407146, "grad_norm": 1.8858363628387451, "learning_rate": 1.283582200689682e-05, "loss": 0.6463, "step": 3269 }, { "epoch": 0.15588873262937097, "grad_norm": 2.1750049591064453, "learning_rate": 1.283205506682304e-05, "loss": 0.719, "step": 3270 }, { "epoch": 0.15593640502467046, "grad_norm": 1.5909249782562256, "learning_rate": 1.2828287689792331e-05, "loss": 0.8247, "step": 3271 }, { "epoch": 0.15598407741996997, "grad_norm": 1.7785180807113647, "learning_rate": 1.2824519876385957e-05, "loss": 0.8106, "step": 3272 }, { "epoch": 0.15603174981526946, "grad_norm": 5.155399799346924, "learning_rate": 1.2820751627185248e-05, "loss": 0.6847, "step": 3273 }, { "epoch": 0.15607942221056897, "grad_norm": 2.3755834102630615, "learning_rate": 1.2816982942771616e-05, "loss": 1.1485, "step": 3274 }, { "epoch": 0.15612709460586846, "grad_norm": 1.1761051416397095, "learning_rate": 1.2813213823726524e-05, "loss": 0.5474, "step": 3275 }, { "epoch": 0.15617476700116797, "grad_norm": 1.2053571939468384, "learning_rate": 1.2809444270631508e-05, "loss": 0.7944, "step": 3276 }, { "epoch": 0.1562224393964675, "grad_norm": 1.4138482809066772, "learning_rate": 1.2805674284068175e-05, "loss": 0.9313, "step": 3277 }, { "epoch": 0.15627011179176697, "grad_norm": 1.2893582582473755, "learning_rate": 1.2801903864618193e-05, "loss": 0.7347, "step": 3278 }, { "epoch": 0.1563177841870665, "grad_norm": 1.8208036422729492, "learning_rate": 1.2798133012863297e-05, "loss": 0.8083, "step": 3279 }, { "epoch": 0.15636545658236597, "grad_norm": 1.302425742149353, "learning_rate": 1.2794361729385291e-05, "loss": 0.6992, "step": 3280 }, { "epoch": 0.1564131289776655, "grad_norm": 2.7278079986572266, "learning_rate": 1.279059001476605e-05, "loss": 0.6065, "step": 3281 }, { "epoch": 0.15646080137296497, "grad_norm": 1.6937270164489746, "learning_rate": 1.2786817869587504e-05, "loss": 0.6062, "step": 3282 }, { "epoch": 0.1565084737682645, "grad_norm": 1.5980437994003296, "learning_rate": 1.2783045294431662e-05, "loss": 0.6007, "step": 3283 }, { "epoch": 0.15655614616356398, "grad_norm": 1.3968660831451416, "learning_rate": 1.2779272289880589e-05, "loss": 0.8586, "step": 3284 }, { "epoch": 0.1566038185588635, "grad_norm": 1.3071876764297485, "learning_rate": 1.2775498856516422e-05, "loss": 0.7345, "step": 3285 }, { "epoch": 0.156651490954163, "grad_norm": 1.080914855003357, "learning_rate": 1.2771724994921367e-05, "loss": 0.6449, "step": 3286 }, { "epoch": 0.1566991633494625, "grad_norm": 1.6707903146743774, "learning_rate": 1.2767950705677685e-05, "loss": 0.4934, "step": 3287 }, { "epoch": 0.156746835744762, "grad_norm": 2.411984920501709, "learning_rate": 1.2764175989367717e-05, "loss": 0.4383, "step": 3288 }, { "epoch": 0.1567945081400615, "grad_norm": 1.9843446016311646, "learning_rate": 1.2760400846573858e-05, "loss": 0.6825, "step": 3289 }, { "epoch": 0.156842180535361, "grad_norm": 1.7812992334365845, "learning_rate": 1.2756625277878571e-05, "loss": 0.8901, "step": 3290 }, { "epoch": 0.1568898529306605, "grad_norm": 1.4307302236557007, "learning_rate": 1.2752849283864395e-05, "loss": 0.9962, "step": 3291 }, { "epoch": 0.15693752532596, "grad_norm": 2.1438846588134766, "learning_rate": 1.2749072865113926e-05, "loss": 0.7809, "step": 3292 }, { "epoch": 0.15698519772125952, "grad_norm": 1.1189720630645752, "learning_rate": 1.274529602220982e-05, "loss": 0.7703, "step": 3293 }, { "epoch": 0.157032870116559, "grad_norm": 1.2004164457321167, "learning_rate": 1.2741518755734809e-05, "loss": 0.8478, "step": 3294 }, { "epoch": 0.15708054251185852, "grad_norm": 1.7471860647201538, "learning_rate": 1.2737741066271689e-05, "loss": 0.4228, "step": 3295 }, { "epoch": 0.157128214907158, "grad_norm": 3.3640732765197754, "learning_rate": 1.2733962954403311e-05, "loss": 0.7958, "step": 3296 }, { "epoch": 0.15717588730245752, "grad_norm": 1.421522617340088, "learning_rate": 1.2730184420712605e-05, "loss": 0.5254, "step": 3297 }, { "epoch": 0.157223559697757, "grad_norm": 2.8206255435943604, "learning_rate": 1.2726405465782562e-05, "loss": 1.316, "step": 3298 }, { "epoch": 0.15727123209305652, "grad_norm": 7.158819198608398, "learning_rate": 1.2722626090196229e-05, "loss": 0.7735, "step": 3299 }, { "epoch": 0.157318904488356, "grad_norm": 5.5446391105651855, "learning_rate": 1.2718846294536729e-05, "loss": 0.7064, "step": 3300 }, { "epoch": 0.15736657688365552, "grad_norm": 1.9584927558898926, "learning_rate": 1.2715066079387243e-05, "loss": 0.4701, "step": 3301 }, { "epoch": 0.15741424927895503, "grad_norm": 2.7662882804870605, "learning_rate": 1.2711285445331023e-05, "loss": 0.882, "step": 3302 }, { "epoch": 0.15746192167425452, "grad_norm": 1.7443525791168213, "learning_rate": 1.270750439295138e-05, "loss": 0.855, "step": 3303 }, { "epoch": 0.15750959406955403, "grad_norm": 1.2526910305023193, "learning_rate": 1.270372292283169e-05, "loss": 0.6725, "step": 3304 }, { "epoch": 0.15755726646485352, "grad_norm": 1.508876085281372, "learning_rate": 1.2699941035555394e-05, "loss": 1.0443, "step": 3305 }, { "epoch": 0.15760493886015303, "grad_norm": 1.8699249029159546, "learning_rate": 1.2696158731706e-05, "loss": 0.6183, "step": 3306 }, { "epoch": 0.15765261125545252, "grad_norm": 2.2554659843444824, "learning_rate": 1.269237601186708e-05, "loss": 0.513, "step": 3307 }, { "epoch": 0.15770028365075203, "grad_norm": 1.14159095287323, "learning_rate": 1.2688592876622268e-05, "loss": 0.7366, "step": 3308 }, { "epoch": 0.15774795604605155, "grad_norm": 1.5288643836975098, "learning_rate": 1.2684809326555266e-05, "loss": 0.8124, "step": 3309 }, { "epoch": 0.15779562844135103, "grad_norm": 1.5562394857406616, "learning_rate": 1.2681025362249826e-05, "loss": 0.7951, "step": 3310 }, { "epoch": 0.15784330083665055, "grad_norm": 1.2490415573120117, "learning_rate": 1.2677240984289787e-05, "loss": 0.5636, "step": 3311 }, { "epoch": 0.15789097323195003, "grad_norm": 1.3667465448379517, "learning_rate": 1.2673456193259033e-05, "loss": 0.7148, "step": 3312 }, { "epoch": 0.15793864562724955, "grad_norm": 3.3306851387023926, "learning_rate": 1.2669670989741519e-05, "loss": 0.7152, "step": 3313 }, { "epoch": 0.15798631802254903, "grad_norm": 2.215132713317871, "learning_rate": 1.2665885374321263e-05, "loss": 0.6839, "step": 3314 }, { "epoch": 0.15803399041784855, "grad_norm": 2.5594356060028076, "learning_rate": 1.2662099347582348e-05, "loss": 0.7114, "step": 3315 }, { "epoch": 0.15808166281314803, "grad_norm": 1.0005269050598145, "learning_rate": 1.2658312910108919e-05, "loss": 0.4162, "step": 3316 }, { "epoch": 0.15812933520844755, "grad_norm": 3.5879533290863037, "learning_rate": 1.2654526062485182e-05, "loss": 1.1503, "step": 3317 }, { "epoch": 0.15817700760374706, "grad_norm": 3.5539066791534424, "learning_rate": 1.265073880529541e-05, "loss": 0.4695, "step": 3318 }, { "epoch": 0.15822467999904655, "grad_norm": 2.7896761894226074, "learning_rate": 1.2646951139123935e-05, "loss": 0.7786, "step": 3319 }, { "epoch": 0.15827235239434606, "grad_norm": 1.2362169027328491, "learning_rate": 1.2643163064555163e-05, "loss": 0.6081, "step": 3320 }, { "epoch": 0.15832002478964555, "grad_norm": 1.14006769657135, "learning_rate": 1.2639374582173548e-05, "loss": 0.4521, "step": 3321 }, { "epoch": 0.15836769718494506, "grad_norm": 1.6921024322509766, "learning_rate": 1.263558569256361e-05, "loss": 0.9964, "step": 3322 }, { "epoch": 0.15841536958024455, "grad_norm": 4.475882053375244, "learning_rate": 1.2631796396309945e-05, "loss": 0.9196, "step": 3323 }, { "epoch": 0.15846304197554406, "grad_norm": 1.8194913864135742, "learning_rate": 1.2628006693997199e-05, "loss": 0.4348, "step": 3324 }, { "epoch": 0.15851071437084358, "grad_norm": 1.9415637254714966, "learning_rate": 1.2624216586210084e-05, "loss": 0.6005, "step": 3325 }, { "epoch": 0.15855838676614306, "grad_norm": 1.2305355072021484, "learning_rate": 1.2620426073533371e-05, "loss": 0.7007, "step": 3326 }, { "epoch": 0.15860605916144258, "grad_norm": 1.2120805978775024, "learning_rate": 1.2616635156551902e-05, "loss": 0.3924, "step": 3327 }, { "epoch": 0.15865373155674206, "grad_norm": 2.912013530731201, "learning_rate": 1.2612843835850574e-05, "loss": 0.3591, "step": 3328 }, { "epoch": 0.15870140395204158, "grad_norm": 1.610073447227478, "learning_rate": 1.2609052112014349e-05, "loss": 1.2035, "step": 3329 }, { "epoch": 0.15874907634734106, "grad_norm": 1.8236873149871826, "learning_rate": 1.2605259985628248e-05, "loss": 0.7862, "step": 3330 }, { "epoch": 0.15879674874264058, "grad_norm": 3.075911045074463, "learning_rate": 1.2601467457277368e-05, "loss": 0.362, "step": 3331 }, { "epoch": 0.1588444211379401, "grad_norm": 2.6457345485687256, "learning_rate": 1.2597674527546846e-05, "loss": 1.1366, "step": 3332 }, { "epoch": 0.15889209353323958, "grad_norm": 1.542930006980896, "learning_rate": 1.259388119702189e-05, "loss": 0.6585, "step": 3333 }, { "epoch": 0.1589397659285391, "grad_norm": 1.234430193901062, "learning_rate": 1.2590087466287783e-05, "loss": 0.3408, "step": 3334 }, { "epoch": 0.15898743832383858, "grad_norm": 2.957338333129883, "learning_rate": 1.2586293335929851e-05, "loss": 1.2782, "step": 3335 }, { "epoch": 0.1590351107191381, "grad_norm": 1.1967005729675293, "learning_rate": 1.258249880653349e-05, "loss": 0.5389, "step": 3336 }, { "epoch": 0.15908278311443758, "grad_norm": 3.108670711517334, "learning_rate": 1.2578703878684158e-05, "loss": 1.1146, "step": 3337 }, { "epoch": 0.1591304555097371, "grad_norm": 0.7346532344818115, "learning_rate": 1.2574908552967374e-05, "loss": 0.3113, "step": 3338 }, { "epoch": 0.15917812790503658, "grad_norm": 1.4196573495864868, "learning_rate": 1.2571112829968716e-05, "loss": 0.4386, "step": 3339 }, { "epoch": 0.1592258003003361, "grad_norm": 1.0020333528518677, "learning_rate": 1.256731671027383e-05, "loss": 0.7071, "step": 3340 }, { "epoch": 0.1592734726956356, "grad_norm": 1.1035711765289307, "learning_rate": 1.2563520194468408e-05, "loss": 0.7352, "step": 3341 }, { "epoch": 0.1593211450909351, "grad_norm": 1.7467808723449707, "learning_rate": 1.2559723283138219e-05, "loss": 1.1882, "step": 3342 }, { "epoch": 0.1593688174862346, "grad_norm": 1.1355706453323364, "learning_rate": 1.255592597686909e-05, "loss": 0.3259, "step": 3343 }, { "epoch": 0.1594164898815341, "grad_norm": 2.670177936553955, "learning_rate": 1.2552128276246905e-05, "loss": 0.8934, "step": 3344 }, { "epoch": 0.1594641622768336, "grad_norm": 1.452782154083252, "learning_rate": 1.2548330181857605e-05, "loss": 0.4914, "step": 3345 }, { "epoch": 0.1595118346721331, "grad_norm": 1.8647856712341309, "learning_rate": 1.2544531694287203e-05, "loss": 0.6514, "step": 3346 }, { "epoch": 0.1595595070674326, "grad_norm": 1.6509310007095337, "learning_rate": 1.2540732814121763e-05, "loss": 0.1358, "step": 3347 }, { "epoch": 0.15960717946273212, "grad_norm": 1.2296963930130005, "learning_rate": 1.2536933541947416e-05, "loss": 0.5515, "step": 3348 }, { "epoch": 0.1596548518580316, "grad_norm": 2.1495020389556885, "learning_rate": 1.2533133878350348e-05, "loss": 1.0511, "step": 3349 }, { "epoch": 0.15970252425333112, "grad_norm": 1.151329517364502, "learning_rate": 1.2529333823916807e-05, "loss": 0.3669, "step": 3350 }, { "epoch": 0.1597501966486306, "grad_norm": 1.3974355459213257, "learning_rate": 1.2525533379233108e-05, "loss": 0.9286, "step": 3351 }, { "epoch": 0.15979786904393012, "grad_norm": 1.4394891262054443, "learning_rate": 1.2521732544885614e-05, "loss": 0.5116, "step": 3352 }, { "epoch": 0.1598455414392296, "grad_norm": 1.2312219142913818, "learning_rate": 1.2517931321460756e-05, "loss": 0.9085, "step": 3353 }, { "epoch": 0.15989321383452912, "grad_norm": 2.1159145832061768, "learning_rate": 1.251412970954503e-05, "loss": 0.7617, "step": 3354 }, { "epoch": 0.1599408862298286, "grad_norm": 2.5563313961029053, "learning_rate": 1.2510327709724976e-05, "loss": 1.084, "step": 3355 }, { "epoch": 0.15998855862512812, "grad_norm": 1.485469102859497, "learning_rate": 1.2506525322587207e-05, "loss": 0.8285, "step": 3356 }, { "epoch": 0.16003623102042763, "grad_norm": 2.3007662296295166, "learning_rate": 1.2502722548718396e-05, "loss": 0.6163, "step": 3357 }, { "epoch": 0.16008390341572712, "grad_norm": 2.549318790435791, "learning_rate": 1.2498919388705266e-05, "loss": 0.8862, "step": 3358 }, { "epoch": 0.16013157581102663, "grad_norm": 1.3746862411499023, "learning_rate": 1.2495115843134608e-05, "loss": 0.7938, "step": 3359 }, { "epoch": 0.16017924820632612, "grad_norm": 1.6238036155700684, "learning_rate": 1.249131191259327e-05, "loss": 0.4767, "step": 3360 }, { "epoch": 0.16022692060162563, "grad_norm": 1.5184221267700195, "learning_rate": 1.2487507597668163e-05, "loss": 0.9831, "step": 3361 }, { "epoch": 0.16027459299692512, "grad_norm": 1.7125096321105957, "learning_rate": 1.2483702898946249e-05, "loss": 0.5875, "step": 3362 }, { "epoch": 0.16032226539222463, "grad_norm": 1.3688068389892578, "learning_rate": 1.2479897817014553e-05, "loss": 0.6461, "step": 3363 }, { "epoch": 0.16036993778752415, "grad_norm": 1.172893762588501, "learning_rate": 1.2476092352460161e-05, "loss": 0.6278, "step": 3364 }, { "epoch": 0.16041761018282363, "grad_norm": 2.0960302352905273, "learning_rate": 1.2472286505870222e-05, "loss": 0.4483, "step": 3365 }, { "epoch": 0.16046528257812315, "grad_norm": 1.4265042543411255, "learning_rate": 1.246848027783193e-05, "loss": 0.9282, "step": 3366 }, { "epoch": 0.16051295497342263, "grad_norm": 1.6536892652511597, "learning_rate": 1.2464673668932555e-05, "loss": 0.8627, "step": 3367 }, { "epoch": 0.16056062736872215, "grad_norm": 2.113489866256714, "learning_rate": 1.2460866679759412e-05, "loss": 0.694, "step": 3368 }, { "epoch": 0.16060829976402163, "grad_norm": 1.298811435699463, "learning_rate": 1.2457059310899887e-05, "loss": 0.8632, "step": 3369 }, { "epoch": 0.16065597215932115, "grad_norm": 2.6942338943481445, "learning_rate": 1.2453251562941406e-05, "loss": 0.2747, "step": 3370 }, { "epoch": 0.16070364455462063, "grad_norm": 1.8676705360412598, "learning_rate": 1.2449443436471476e-05, "loss": 0.9307, "step": 3371 }, { "epoch": 0.16075131694992015, "grad_norm": 1.7404451370239258, "learning_rate": 1.2445634932077648e-05, "loss": 0.6081, "step": 3372 }, { "epoch": 0.16079898934521966, "grad_norm": 1.310207724571228, "learning_rate": 1.2441826050347535e-05, "loss": 0.7807, "step": 3373 }, { "epoch": 0.16084666174051915, "grad_norm": 1.459877610206604, "learning_rate": 1.243801679186881e-05, "loss": 0.6398, "step": 3374 }, { "epoch": 0.16089433413581866, "grad_norm": 1.8162803649902344, "learning_rate": 1.24342071572292e-05, "loss": 0.5995, "step": 3375 }, { "epoch": 0.16094200653111815, "grad_norm": 0.9086304306983948, "learning_rate": 1.243039714701649e-05, "loss": 0.3061, "step": 3376 }, { "epoch": 0.16098967892641766, "grad_norm": 5.885626316070557, "learning_rate": 1.2426586761818533e-05, "loss": 0.7183, "step": 3377 }, { "epoch": 0.16103735132171715, "grad_norm": 1.9913275241851807, "learning_rate": 1.2422776002223226e-05, "loss": 0.8075, "step": 3378 }, { "epoch": 0.16108502371701666, "grad_norm": 2.7252678871154785, "learning_rate": 1.2418964868818529e-05, "loss": 1.0008, "step": 3379 }, { "epoch": 0.16113269611231618, "grad_norm": 1.4492295980453491, "learning_rate": 1.2415153362192466e-05, "loss": 0.72, "step": 3380 }, { "epoch": 0.16118036850761566, "grad_norm": 3.650787353515625, "learning_rate": 1.241134148293311e-05, "loss": 1.0848, "step": 3381 }, { "epoch": 0.16122804090291518, "grad_norm": 1.4103679656982422, "learning_rate": 1.2407529231628595e-05, "loss": 0.5846, "step": 3382 }, { "epoch": 0.16127571329821466, "grad_norm": 2.3698039054870605, "learning_rate": 1.2403716608867111e-05, "loss": 1.0237, "step": 3383 }, { "epoch": 0.16132338569351418, "grad_norm": 1.0095607042312622, "learning_rate": 1.239990361523691e-05, "loss": 0.3932, "step": 3384 }, { "epoch": 0.16137105808881366, "grad_norm": 1.4036271572113037, "learning_rate": 1.2396090251326296e-05, "loss": 1.0634, "step": 3385 }, { "epoch": 0.16141873048411318, "grad_norm": 1.1031994819641113, "learning_rate": 1.239227651772363e-05, "loss": 0.5795, "step": 3386 }, { "epoch": 0.16146640287941266, "grad_norm": 2.0513222217559814, "learning_rate": 1.2388462415017331e-05, "loss": 0.7466, "step": 3387 }, { "epoch": 0.16151407527471218, "grad_norm": 1.854997992515564, "learning_rate": 1.238464794379588e-05, "loss": 0.7148, "step": 3388 }, { "epoch": 0.1615617476700117, "grad_norm": 2.1990208625793457, "learning_rate": 1.2380833104647807e-05, "loss": 0.8308, "step": 3389 }, { "epoch": 0.16160942006531118, "grad_norm": 1.3848634958267212, "learning_rate": 1.2377017898161703e-05, "loss": 0.7257, "step": 3390 }, { "epoch": 0.1616570924606107, "grad_norm": 1.0949569940567017, "learning_rate": 1.2373202324926222e-05, "loss": 0.4295, "step": 3391 }, { "epoch": 0.16170476485591018, "grad_norm": 1.570083737373352, "learning_rate": 1.2369386385530055e-05, "loss": 0.8027, "step": 3392 }, { "epoch": 0.1617524372512097, "grad_norm": 3.2252635955810547, "learning_rate": 1.2365570080561971e-05, "loss": 1.1789, "step": 3393 }, { "epoch": 0.16180010964650918, "grad_norm": 1.8009815216064453, "learning_rate": 1.2361753410610784e-05, "loss": 0.6498, "step": 3394 }, { "epoch": 0.1618477820418087, "grad_norm": 2.185048818588257, "learning_rate": 1.2357936376265367e-05, "loss": 0.8915, "step": 3395 }, { "epoch": 0.1618954544371082, "grad_norm": 1.036126732826233, "learning_rate": 1.2354118978114648e-05, "loss": 0.3432, "step": 3396 }, { "epoch": 0.1619431268324077, "grad_norm": 1.647308349609375, "learning_rate": 1.2350301216747615e-05, "loss": 0.6703, "step": 3397 }, { "epoch": 0.1619907992277072, "grad_norm": 1.5020028352737427, "learning_rate": 1.2346483092753307e-05, "loss": 0.7109, "step": 3398 }, { "epoch": 0.1620384716230067, "grad_norm": 1.7878875732421875, "learning_rate": 1.2342664606720823e-05, "loss": 0.8738, "step": 3399 }, { "epoch": 0.1620861440183062, "grad_norm": 1.8247052431106567, "learning_rate": 1.2338845759239315e-05, "loss": 0.6921, "step": 3400 }, { "epoch": 0.1621338164136057, "grad_norm": 3.973893642425537, "learning_rate": 1.233502655089799e-05, "loss": 0.8575, "step": 3401 }, { "epoch": 0.1621814888089052, "grad_norm": 1.4395326375961304, "learning_rate": 1.2331206982286114e-05, "loss": 0.4054, "step": 3402 }, { "epoch": 0.1622291612042047, "grad_norm": 1.9963631629943848, "learning_rate": 1.232738705399301e-05, "loss": 0.6682, "step": 3403 }, { "epoch": 0.1622768335995042, "grad_norm": 3.30143404006958, "learning_rate": 1.2323566766608049e-05, "loss": 0.7009, "step": 3404 }, { "epoch": 0.16232450599480372, "grad_norm": 3.1036858558654785, "learning_rate": 1.2319746120720665e-05, "loss": 0.52, "step": 3405 }, { "epoch": 0.1623721783901032, "grad_norm": 1.8379104137420654, "learning_rate": 1.2315925116920342e-05, "loss": 0.6955, "step": 3406 }, { "epoch": 0.16241985078540272, "grad_norm": 1.5141234397888184, "learning_rate": 1.2312103755796625e-05, "loss": 0.8178, "step": 3407 }, { "epoch": 0.1624675231807022, "grad_norm": 1.6963640451431274, "learning_rate": 1.2308282037939108e-05, "loss": 0.7978, "step": 3408 }, { "epoch": 0.16251519557600172, "grad_norm": 1.6503376960754395, "learning_rate": 1.2304459963937443e-05, "loss": 0.6586, "step": 3409 }, { "epoch": 0.1625628679713012, "grad_norm": 1.685644268989563, "learning_rate": 1.2300637534381336e-05, "loss": 1.053, "step": 3410 }, { "epoch": 0.16261054036660072, "grad_norm": 3.060255289077759, "learning_rate": 1.229681474986055e-05, "loss": 1.0819, "step": 3411 }, { "epoch": 0.16265821276190023, "grad_norm": 1.2974894046783447, "learning_rate": 1.2292991610964902e-05, "loss": 0.9139, "step": 3412 }, { "epoch": 0.16270588515719972, "grad_norm": 2.8959174156188965, "learning_rate": 1.228916811828426e-05, "loss": 0.5987, "step": 3413 }, { "epoch": 0.16275355755249923, "grad_norm": 1.131511926651001, "learning_rate": 1.2285344272408553e-05, "loss": 0.7501, "step": 3414 }, { "epoch": 0.16280122994779872, "grad_norm": 1.3929219245910645, "learning_rate": 1.2281520073927757e-05, "loss": 0.4989, "step": 3415 }, { "epoch": 0.16284890234309823, "grad_norm": 1.792142391204834, "learning_rate": 1.227769552343191e-05, "loss": 0.4692, "step": 3416 }, { "epoch": 0.16289657473839772, "grad_norm": 1.2551616430282593, "learning_rate": 1.2273870621511098e-05, "loss": 0.8854, "step": 3417 }, { "epoch": 0.16294424713369723, "grad_norm": 1.6115854978561401, "learning_rate": 1.2270045368755467e-05, "loss": 0.8475, "step": 3418 }, { "epoch": 0.16299191952899675, "grad_norm": 1.9165464639663696, "learning_rate": 1.2266219765755211e-05, "loss": 0.7767, "step": 3419 }, { "epoch": 0.16303959192429623, "grad_norm": 1.447646975517273, "learning_rate": 1.2262393813100584e-05, "loss": 0.7413, "step": 3420 }, { "epoch": 0.16308726431959575, "grad_norm": 1.78177011013031, "learning_rate": 1.2258567511381891e-05, "loss": 0.978, "step": 3421 }, { "epoch": 0.16313493671489523, "grad_norm": 3.3577868938446045, "learning_rate": 1.225474086118949e-05, "loss": 0.6909, "step": 3422 }, { "epoch": 0.16318260911019475, "grad_norm": 1.737442135810852, "learning_rate": 1.2250913863113792e-05, "loss": 1.1041, "step": 3423 }, { "epoch": 0.16323028150549423, "grad_norm": 1.2454472780227661, "learning_rate": 1.2247086517745262e-05, "loss": 0.9252, "step": 3424 }, { "epoch": 0.16327795390079375, "grad_norm": 1.4345704317092896, "learning_rate": 1.2243258825674424e-05, "loss": 1.1007, "step": 3425 }, { "epoch": 0.16332562629609323, "grad_norm": 1.4201923608779907, "learning_rate": 1.2239430787491853e-05, "loss": 0.7471, "step": 3426 }, { "epoch": 0.16337329869139275, "grad_norm": 10.622687339782715, "learning_rate": 1.2235602403788172e-05, "loss": 1.0874, "step": 3427 }, { "epoch": 0.16342097108669226, "grad_norm": 1.4215333461761475, "learning_rate": 1.2231773675154062e-05, "loss": 0.7744, "step": 3428 }, { "epoch": 0.16346864348199175, "grad_norm": 1.4177908897399902, "learning_rate": 1.222794460218026e-05, "loss": 0.9184, "step": 3429 }, { "epoch": 0.16351631587729126, "grad_norm": 1.4957600831985474, "learning_rate": 1.2224115185457543e-05, "loss": 0.8884, "step": 3430 }, { "epoch": 0.16356398827259075, "grad_norm": 1.203901767730713, "learning_rate": 1.222028542557676e-05, "loss": 0.5323, "step": 3431 }, { "epoch": 0.16361166066789026, "grad_norm": 3.368590831756592, "learning_rate": 1.2216455323128801e-05, "loss": 1.0471, "step": 3432 }, { "epoch": 0.16365933306318975, "grad_norm": 1.8145172595977783, "learning_rate": 1.2212624878704612e-05, "loss": 0.8097, "step": 3433 }, { "epoch": 0.16370700545848926, "grad_norm": 1.7572021484375, "learning_rate": 1.2208794092895187e-05, "loss": 1.0413, "step": 3434 }, { "epoch": 0.16375467785378878, "grad_norm": 1.6635613441467285, "learning_rate": 1.220496296629158e-05, "loss": 0.3629, "step": 3435 }, { "epoch": 0.16380235024908826, "grad_norm": 1.844543695449829, "learning_rate": 1.2201131499484896e-05, "loss": 0.5276, "step": 3436 }, { "epoch": 0.16385002264438778, "grad_norm": 1.5143234729766846, "learning_rate": 1.219729969306629e-05, "loss": 0.8425, "step": 3437 }, { "epoch": 0.16389769503968726, "grad_norm": 2.3293912410736084, "learning_rate": 1.2193467547626966e-05, "loss": 0.8095, "step": 3438 }, { "epoch": 0.16394536743498678, "grad_norm": 1.9489166736602783, "learning_rate": 1.2189635063758188e-05, "loss": 0.8012, "step": 3439 }, { "epoch": 0.16399303983028626, "grad_norm": 1.4042048454284668, "learning_rate": 1.2185802242051267e-05, "loss": 0.6002, "step": 3440 }, { "epoch": 0.16404071222558578, "grad_norm": 1.6882002353668213, "learning_rate": 1.218196908309757e-05, "loss": 0.7757, "step": 3441 }, { "epoch": 0.16408838462088526, "grad_norm": 1.1849122047424316, "learning_rate": 1.2178135587488515e-05, "loss": 0.819, "step": 3442 }, { "epoch": 0.16413605701618478, "grad_norm": 3.708073616027832, "learning_rate": 1.2174301755815572e-05, "loss": 0.9429, "step": 3443 }, { "epoch": 0.1641837294114843, "grad_norm": 1.468268871307373, "learning_rate": 1.2170467588670256e-05, "loss": 0.6409, "step": 3444 }, { "epoch": 0.16423140180678378, "grad_norm": 2.0025761127471924, "learning_rate": 1.2166633086644142e-05, "loss": 1.0012, "step": 3445 }, { "epoch": 0.1642790742020833, "grad_norm": 1.5201904773712158, "learning_rate": 1.2162798250328857e-05, "loss": 0.7626, "step": 3446 }, { "epoch": 0.16432674659738278, "grad_norm": 1.4745078086853027, "learning_rate": 1.2158963080316071e-05, "loss": 0.6486, "step": 3447 }, { "epoch": 0.1643744189926823, "grad_norm": 2.671544313430786, "learning_rate": 1.2155127577197519e-05, "loss": 1.401, "step": 3448 }, { "epoch": 0.16442209138798178, "grad_norm": 12.405790328979492, "learning_rate": 1.2151291741564972e-05, "loss": 0.0574, "step": 3449 }, { "epoch": 0.1644697637832813, "grad_norm": 1.7385684251785278, "learning_rate": 1.2147455574010263e-05, "loss": 0.7392, "step": 3450 }, { "epoch": 0.1645174361785808, "grad_norm": 2.2188470363616943, "learning_rate": 1.2143619075125277e-05, "loss": 0.8268, "step": 3451 }, { "epoch": 0.1645651085738803, "grad_norm": 2.021573066711426, "learning_rate": 1.2139782245501942e-05, "loss": 0.5961, "step": 3452 }, { "epoch": 0.1646127809691798, "grad_norm": 1.4897230863571167, "learning_rate": 1.213594508573224e-05, "loss": 0.616, "step": 3453 }, { "epoch": 0.1646604533644793, "grad_norm": 1.7904022932052612, "learning_rate": 1.2132107596408207e-05, "loss": 0.7339, "step": 3454 }, { "epoch": 0.1647081257597788, "grad_norm": 2.0066168308258057, "learning_rate": 1.212826977812193e-05, "loss": 0.6886, "step": 3455 }, { "epoch": 0.1647557981550783, "grad_norm": 2.741666793823242, "learning_rate": 1.212443163146554e-05, "loss": 0.6876, "step": 3456 }, { "epoch": 0.1648034705503778, "grad_norm": 3.0157713890075684, "learning_rate": 1.2120593157031231e-05, "loss": 0.6456, "step": 3457 }, { "epoch": 0.1648511429456773, "grad_norm": 2.3613779544830322, "learning_rate": 1.2116754355411233e-05, "loss": 1.0501, "step": 3458 }, { "epoch": 0.1648988153409768, "grad_norm": 1.2164865732192993, "learning_rate": 1.2112915227197836e-05, "loss": 0.6544, "step": 3459 }, { "epoch": 0.16494648773627632, "grad_norm": 1.42069673538208, "learning_rate": 1.2109075772983383e-05, "loss": 0.7576, "step": 3460 }, { "epoch": 0.1649941601315758, "grad_norm": 1.328682780265808, "learning_rate": 1.2105235993360252e-05, "loss": 0.8058, "step": 3461 }, { "epoch": 0.16504183252687532, "grad_norm": 1.2135480642318726, "learning_rate": 1.2101395888920888e-05, "loss": 1.1734, "step": 3462 }, { "epoch": 0.1650895049221748, "grad_norm": 1.574333906173706, "learning_rate": 1.2097555460257779e-05, "loss": 0.8645, "step": 3463 }, { "epoch": 0.16513717731747432, "grad_norm": 1.5083953142166138, "learning_rate": 1.2093714707963464e-05, "loss": 0.5608, "step": 3464 }, { "epoch": 0.1651848497127738, "grad_norm": 1.5522809028625488, "learning_rate": 1.2089873632630531e-05, "loss": 0.5864, "step": 3465 }, { "epoch": 0.16523252210807332, "grad_norm": 2.6948087215423584, "learning_rate": 1.2086032234851616e-05, "loss": 0.8208, "step": 3466 }, { "epoch": 0.16528019450337283, "grad_norm": 3.029338836669922, "learning_rate": 1.2082190515219412e-05, "loss": 0.6971, "step": 3467 }, { "epoch": 0.16532786689867232, "grad_norm": 1.52670419216156, "learning_rate": 1.2078348474326652e-05, "loss": 0.671, "step": 3468 }, { "epoch": 0.16537553929397183, "grad_norm": 2.5811350345611572, "learning_rate": 1.2074506112766127e-05, "loss": 1.4676, "step": 3469 }, { "epoch": 0.16542321168927132, "grad_norm": 2.4120121002197266, "learning_rate": 1.2070663431130666e-05, "loss": 0.7292, "step": 3470 }, { "epoch": 0.16547088408457084, "grad_norm": 1.183908462524414, "learning_rate": 1.2066820430013168e-05, "loss": 0.325, "step": 3471 }, { "epoch": 0.16551855647987032, "grad_norm": 1.1767194271087646, "learning_rate": 1.2062977110006559e-05, "loss": 0.4957, "step": 3472 }, { "epoch": 0.16556622887516984, "grad_norm": 2.2183735370635986, "learning_rate": 1.205913347170383e-05, "loss": 0.6561, "step": 3473 }, { "epoch": 0.16561390127046932, "grad_norm": 1.6118630170822144, "learning_rate": 1.2055289515698008e-05, "loss": 0.8571, "step": 3474 }, { "epoch": 0.16566157366576884, "grad_norm": 1.7566035985946655, "learning_rate": 1.205144524258218e-05, "loss": 0.6015, "step": 3475 }, { "epoch": 0.16570924606106835, "grad_norm": 1.6769514083862305, "learning_rate": 1.2047600652949476e-05, "loss": 0.7376, "step": 3476 }, { "epoch": 0.16575691845636784, "grad_norm": 1.9794421195983887, "learning_rate": 1.2043755747393077e-05, "loss": 0.432, "step": 3477 }, { "epoch": 0.16580459085166735, "grad_norm": 2.211209774017334, "learning_rate": 1.203991052650621e-05, "loss": 0.7207, "step": 3478 }, { "epoch": 0.16585226324696684, "grad_norm": 1.421827793121338, "learning_rate": 1.2036064990882162e-05, "loss": 0.5774, "step": 3479 }, { "epoch": 0.16589993564226635, "grad_norm": 1.7831792831420898, "learning_rate": 1.2032219141114253e-05, "loss": 0.7059, "step": 3480 }, { "epoch": 0.16594760803756584, "grad_norm": 1.3848438262939453, "learning_rate": 1.2028372977795854e-05, "loss": 0.7844, "step": 3481 }, { "epoch": 0.16599528043286535, "grad_norm": 1.4584486484527588, "learning_rate": 1.2024526501520398e-05, "loss": 0.4372, "step": 3482 }, { "epoch": 0.16604295282816486, "grad_norm": 1.3964842557907104, "learning_rate": 1.2020679712881347e-05, "loss": 0.727, "step": 3483 }, { "epoch": 0.16609062522346435, "grad_norm": 1.682080626487732, "learning_rate": 1.2016832612472225e-05, "loss": 0.3451, "step": 3484 }, { "epoch": 0.16613829761876386, "grad_norm": 31.16162109375, "learning_rate": 1.2012985200886602e-05, "loss": 0.3133, "step": 3485 }, { "epoch": 0.16618597001406335, "grad_norm": 1.2970011234283447, "learning_rate": 1.2009137478718093e-05, "loss": 0.6506, "step": 3486 }, { "epoch": 0.16623364240936286, "grad_norm": 4.243335247039795, "learning_rate": 1.2005289446560357e-05, "loss": 0.3267, "step": 3487 }, { "epoch": 0.16628131480466235, "grad_norm": 1.4397087097167969, "learning_rate": 1.2001441105007114e-05, "loss": 0.8387, "step": 3488 }, { "epoch": 0.16632898719996186, "grad_norm": 3.6693356037139893, "learning_rate": 1.199759245465212e-05, "loss": 0.9004, "step": 3489 }, { "epoch": 0.16637665959526135, "grad_norm": 1.1918872594833374, "learning_rate": 1.199374349608918e-05, "loss": 0.7844, "step": 3490 }, { "epoch": 0.16642433199056086, "grad_norm": 1.837996244430542, "learning_rate": 1.198989422991215e-05, "loss": 0.9801, "step": 3491 }, { "epoch": 0.16647200438586038, "grad_norm": 1.2639696598052979, "learning_rate": 1.1986044656714933e-05, "loss": 0.7873, "step": 3492 }, { "epoch": 0.16651967678115986, "grad_norm": 2.8440091609954834, "learning_rate": 1.1982194777091476e-05, "loss": 0.8588, "step": 3493 }, { "epoch": 0.16656734917645938, "grad_norm": 3.2376017570495605, "learning_rate": 1.1978344591635779e-05, "loss": 0.4817, "step": 3494 }, { "epoch": 0.16661502157175886, "grad_norm": 1.3740285634994507, "learning_rate": 1.1974494100941884e-05, "loss": 0.8609, "step": 3495 }, { "epoch": 0.16666269396705838, "grad_norm": 2.3309359550476074, "learning_rate": 1.1970643305603885e-05, "loss": 0.0632, "step": 3496 }, { "epoch": 0.16671036636235786, "grad_norm": 2.289484739303589, "learning_rate": 1.1966792206215914e-05, "loss": 0.3885, "step": 3497 }, { "epoch": 0.16675803875765738, "grad_norm": 1.8412286043167114, "learning_rate": 1.1962940803372158e-05, "loss": 0.636, "step": 3498 }, { "epoch": 0.1668057111529569, "grad_norm": 2.3273236751556396, "learning_rate": 1.1959089097666853e-05, "loss": 0.7848, "step": 3499 }, { "epoch": 0.16685338354825638, "grad_norm": 1.3379433155059814, "learning_rate": 1.1955237089694279e-05, "loss": 0.578, "step": 3500 }, { "epoch": 0.1669010559435559, "grad_norm": 2.2512872219085693, "learning_rate": 1.1951384780048752e-05, "loss": 0.9641, "step": 3501 }, { "epoch": 0.16694872833885538, "grad_norm": 1.1500160694122314, "learning_rate": 1.1947532169324649e-05, "loss": 0.5756, "step": 3502 }, { "epoch": 0.1669964007341549, "grad_norm": 1.8214316368103027, "learning_rate": 1.194367925811639e-05, "loss": 0.657, "step": 3503 }, { "epoch": 0.16704407312945438, "grad_norm": 1.3470733165740967, "learning_rate": 1.1939826047018436e-05, "loss": 0.4145, "step": 3504 }, { "epoch": 0.1670917455247539, "grad_norm": 1.4464870691299438, "learning_rate": 1.1935972536625302e-05, "loss": 0.5483, "step": 3505 }, { "epoch": 0.1671394179200534, "grad_norm": 1.6713048219680786, "learning_rate": 1.1932118727531541e-05, "loss": 0.9356, "step": 3506 }, { "epoch": 0.1671870903153529, "grad_norm": 1.612980604171753, "learning_rate": 1.1928264620331755e-05, "loss": 0.7221, "step": 3507 }, { "epoch": 0.1672347627106524, "grad_norm": 1.2654221057891846, "learning_rate": 1.1924410215620596e-05, "loss": 0.5428, "step": 3508 }, { "epoch": 0.1672824351059519, "grad_norm": 2.2483534812927246, "learning_rate": 1.192055551399276e-05, "loss": 0.8149, "step": 3509 }, { "epoch": 0.1673301075012514, "grad_norm": 3.906214714050293, "learning_rate": 1.1916700516042986e-05, "loss": 0.7825, "step": 3510 }, { "epoch": 0.1673777798965509, "grad_norm": 2.5887796878814697, "learning_rate": 1.1912845222366061e-05, "loss": 0.9846, "step": 3511 }, { "epoch": 0.1674254522918504, "grad_norm": 2.0140204429626465, "learning_rate": 1.1908989633556816e-05, "loss": 0.6754, "step": 3512 }, { "epoch": 0.1674731246871499, "grad_norm": 1.9534940719604492, "learning_rate": 1.1905133750210126e-05, "loss": 0.6392, "step": 3513 }, { "epoch": 0.1675207970824494, "grad_norm": 1.862790822982788, "learning_rate": 1.1901277572920922e-05, "loss": 0.6461, "step": 3514 }, { "epoch": 0.16756846947774892, "grad_norm": 1.8946439027786255, "learning_rate": 1.1897421102284166e-05, "loss": 0.8956, "step": 3515 }, { "epoch": 0.1676161418730484, "grad_norm": 1.6796231269836426, "learning_rate": 1.1893564338894872e-05, "loss": 0.7929, "step": 3516 }, { "epoch": 0.16766381426834792, "grad_norm": 1.0231565237045288, "learning_rate": 1.1889707283348104e-05, "loss": 0.5957, "step": 3517 }, { "epoch": 0.1677114866636474, "grad_norm": 2.6896629333496094, "learning_rate": 1.188584993623896e-05, "loss": 0.3625, "step": 3518 }, { "epoch": 0.16775915905894692, "grad_norm": 1.7258400917053223, "learning_rate": 1.1881992298162593e-05, "loss": 0.7968, "step": 3519 }, { "epoch": 0.1678068314542464, "grad_norm": 1.5912007093429565, "learning_rate": 1.1878134369714193e-05, "loss": 0.8339, "step": 3520 }, { "epoch": 0.16785450384954592, "grad_norm": 3.77643084526062, "learning_rate": 1.1874276151489002e-05, "loss": 1.0579, "step": 3521 }, { "epoch": 0.16790217624484544, "grad_norm": 2.2483389377593994, "learning_rate": 1.1870417644082304e-05, "loss": 1.4027, "step": 3522 }, { "epoch": 0.16794984864014492, "grad_norm": 3.3636205196380615, "learning_rate": 1.1866558848089422e-05, "loss": 1.4633, "step": 3523 }, { "epoch": 0.16799752103544444, "grad_norm": 2.785327196121216, "learning_rate": 1.1862699764105731e-05, "loss": 1.1522, "step": 3524 }, { "epoch": 0.16804519343074392, "grad_norm": 1.596582293510437, "learning_rate": 1.1858840392726652e-05, "loss": 0.6806, "step": 3525 }, { "epoch": 0.16809286582604344, "grad_norm": 1.497848391532898, "learning_rate": 1.185498073454764e-05, "loss": 0.8466, "step": 3526 }, { "epoch": 0.16814053822134292, "grad_norm": 1.394997000694275, "learning_rate": 1.1851120790164206e-05, "loss": 0.5984, "step": 3527 }, { "epoch": 0.16818821061664244, "grad_norm": 2.0831704139709473, "learning_rate": 1.1847260560171895e-05, "loss": 0.7524, "step": 3528 }, { "epoch": 0.16823588301194192, "grad_norm": 1.3866229057312012, "learning_rate": 1.1843400045166305e-05, "loss": 0.6879, "step": 3529 }, { "epoch": 0.16828355540724144, "grad_norm": 1.5415539741516113, "learning_rate": 1.1839539245743066e-05, "loss": 0.738, "step": 3530 }, { "epoch": 0.16833122780254095, "grad_norm": 1.7128515243530273, "learning_rate": 1.183567816249787e-05, "loss": 0.8143, "step": 3531 }, { "epoch": 0.16837890019784044, "grad_norm": 1.454851508140564, "learning_rate": 1.1831816796026434e-05, "loss": 0.7832, "step": 3532 }, { "epoch": 0.16842657259313995, "grad_norm": 2.3555922508239746, "learning_rate": 1.1827955146924532e-05, "loss": 0.742, "step": 3533 }, { "epoch": 0.16847424498843944, "grad_norm": 1.4841701984405518, "learning_rate": 1.1824093215787977e-05, "loss": 0.6504, "step": 3534 }, { "epoch": 0.16852191738373895, "grad_norm": 1.503922462463379, "learning_rate": 1.182023100321262e-05, "loss": 0.7964, "step": 3535 }, { "epoch": 0.16856958977903844, "grad_norm": 1.419915795326233, "learning_rate": 1.1816368509794365e-05, "loss": 0.4357, "step": 3536 }, { "epoch": 0.16861726217433795, "grad_norm": 1.297147512435913, "learning_rate": 1.1812505736129156e-05, "loss": 0.8691, "step": 3537 }, { "epoch": 0.16866493456963746, "grad_norm": 2.0768415927886963, "learning_rate": 1.1808642682812973e-05, "loss": 0.6545, "step": 3538 }, { "epoch": 0.16871260696493695, "grad_norm": 1.605297327041626, "learning_rate": 1.1804779350441852e-05, "loss": 0.6993, "step": 3539 }, { "epoch": 0.16876027936023646, "grad_norm": 1.3861682415008545, "learning_rate": 1.1800915739611865e-05, "loss": 0.9421, "step": 3540 }, { "epoch": 0.16880795175553595, "grad_norm": 1.7097270488739014, "learning_rate": 1.1797051850919123e-05, "loss": 0.5509, "step": 3541 }, { "epoch": 0.16885562415083546, "grad_norm": 1.540054440498352, "learning_rate": 1.1793187684959786e-05, "loss": 0.5836, "step": 3542 }, { "epoch": 0.16890329654613495, "grad_norm": 1.5158042907714844, "learning_rate": 1.1789323242330057e-05, "loss": 0.8053, "step": 3543 }, { "epoch": 0.16895096894143447, "grad_norm": 1.1195318698883057, "learning_rate": 1.1785458523626177e-05, "loss": 0.7415, "step": 3544 }, { "epoch": 0.16899864133673395, "grad_norm": 1.5286706686019897, "learning_rate": 1.1781593529444432e-05, "loss": 0.7496, "step": 3545 }, { "epoch": 0.16904631373203347, "grad_norm": 1.5500754117965698, "learning_rate": 1.1777728260381154e-05, "loss": 0.5173, "step": 3546 }, { "epoch": 0.16909398612733298, "grad_norm": 2.987522840499878, "learning_rate": 1.1773862717032711e-05, "loss": 0.8847, "step": 3547 }, { "epoch": 0.16914165852263247, "grad_norm": 1.3472909927368164, "learning_rate": 1.176999689999552e-05, "loss": 0.4554, "step": 3548 }, { "epoch": 0.16918933091793198, "grad_norm": 3.5971341133117676, "learning_rate": 1.1766130809866037e-05, "loss": 0.9229, "step": 3549 }, { "epoch": 0.16923700331323147, "grad_norm": 3.8235254287719727, "learning_rate": 1.1762264447240753e-05, "loss": 0.6302, "step": 3550 }, { "epoch": 0.16928467570853098, "grad_norm": 1.795430064201355, "learning_rate": 1.1758397812716216e-05, "loss": 0.6269, "step": 3551 }, { "epoch": 0.16933234810383047, "grad_norm": 1.0740364789962769, "learning_rate": 1.1754530906889e-05, "loss": 0.4432, "step": 3552 }, { "epoch": 0.16938002049912998, "grad_norm": 1.1840626001358032, "learning_rate": 1.1750663730355737e-05, "loss": 0.6556, "step": 3553 }, { "epoch": 0.1694276928944295, "grad_norm": 1.3491199016571045, "learning_rate": 1.174679628371309e-05, "loss": 0.6026, "step": 3554 }, { "epoch": 0.16947536528972898, "grad_norm": 2.1667327880859375, "learning_rate": 1.174292856755776e-05, "loss": 1.043, "step": 3555 }, { "epoch": 0.1695230376850285, "grad_norm": 1.4364588260650635, "learning_rate": 1.1739060582486506e-05, "loss": 0.63, "step": 3556 }, { "epoch": 0.16957071008032798, "grad_norm": 1.307167887687683, "learning_rate": 1.173519232909611e-05, "loss": 0.7668, "step": 3557 }, { "epoch": 0.1696183824756275, "grad_norm": 3.336822271347046, "learning_rate": 1.1731323807983406e-05, "loss": 1.0308, "step": 3558 }, { "epoch": 0.16966605487092698, "grad_norm": 2.1688733100891113, "learning_rate": 1.1727455019745269e-05, "loss": 0.9717, "step": 3559 }, { "epoch": 0.1697137272662265, "grad_norm": 2.113585948944092, "learning_rate": 1.1723585964978612e-05, "loss": 0.6677, "step": 3560 }, { "epoch": 0.16976139966152598, "grad_norm": 2.2191150188446045, "learning_rate": 1.1719716644280388e-05, "loss": 0.6711, "step": 3561 }, { "epoch": 0.1698090720568255, "grad_norm": 3.1125471591949463, "learning_rate": 1.1715847058247599e-05, "loss": 0.4932, "step": 3562 }, { "epoch": 0.169856744452125, "grad_norm": 1.4719353914260864, "learning_rate": 1.1711977207477276e-05, "loss": 0.5139, "step": 3563 }, { "epoch": 0.1699044168474245, "grad_norm": 1.181674838066101, "learning_rate": 1.1708107092566501e-05, "loss": 0.4889, "step": 3564 }, { "epoch": 0.169952089242724, "grad_norm": 3.9713852405548096, "learning_rate": 1.170423671411239e-05, "loss": 0.959, "step": 3565 }, { "epoch": 0.1699997616380235, "grad_norm": 1.5800055265426636, "learning_rate": 1.1700366072712108e-05, "loss": 0.3025, "step": 3566 }, { "epoch": 0.170047434033323, "grad_norm": 3.0604093074798584, "learning_rate": 1.1696495168962848e-05, "loss": 0.8043, "step": 3567 }, { "epoch": 0.1700951064286225, "grad_norm": 1.731905460357666, "learning_rate": 1.1692624003461854e-05, "loss": 0.6767, "step": 3568 }, { "epoch": 0.170142778823922, "grad_norm": 1.6313363313674927, "learning_rate": 1.168875257680641e-05, "loss": 0.844, "step": 3569 }, { "epoch": 0.17019045121922152, "grad_norm": 1.1193355321884155, "learning_rate": 1.168488088959383e-05, "loss": 0.6755, "step": 3570 }, { "epoch": 0.170238123614521, "grad_norm": 1.2992329597473145, "learning_rate": 1.1681008942421484e-05, "loss": 0.7413, "step": 3571 }, { "epoch": 0.17028579600982052, "grad_norm": 2.0489566326141357, "learning_rate": 1.1677136735886767e-05, "loss": 0.6082, "step": 3572 }, { "epoch": 0.17033346840512, "grad_norm": 1.8728572130203247, "learning_rate": 1.1673264270587122e-05, "loss": 0.6301, "step": 3573 }, { "epoch": 0.17038114080041952, "grad_norm": 2.26562762260437, "learning_rate": 1.1669391547120032e-05, "loss": 1.0341, "step": 3574 }, { "epoch": 0.170428813195719, "grad_norm": 1.8856061697006226, "learning_rate": 1.1665518566083016e-05, "loss": 0.9334, "step": 3575 }, { "epoch": 0.17047648559101852, "grad_norm": 2.3725688457489014, "learning_rate": 1.1661645328073641e-05, "loss": 1.042, "step": 3576 }, { "epoch": 0.170524157986318, "grad_norm": 3.2011992931365967, "learning_rate": 1.16577718336895e-05, "loss": 0.39, "step": 3577 }, { "epoch": 0.17057183038161752, "grad_norm": 1.6124567985534668, "learning_rate": 1.165389808352824e-05, "loss": 0.6931, "step": 3578 }, { "epoch": 0.17061950277691704, "grad_norm": 1.1953445672988892, "learning_rate": 1.1650024078187534e-05, "loss": 0.5153, "step": 3579 }, { "epoch": 0.17066717517221652, "grad_norm": 1.2455637454986572, "learning_rate": 1.1646149818265107e-05, "loss": 0.7659, "step": 3580 }, { "epoch": 0.17071484756751604, "grad_norm": 3.581510066986084, "learning_rate": 1.1642275304358713e-05, "loss": 0.3635, "step": 3581 }, { "epoch": 0.17076251996281552, "grad_norm": 3.138016939163208, "learning_rate": 1.1638400537066152e-05, "loss": 0.2424, "step": 3582 }, { "epoch": 0.17081019235811504, "grad_norm": 2.5070574283599854, "learning_rate": 1.1634525516985264e-05, "loss": 0.8633, "step": 3583 }, { "epoch": 0.17085786475341452, "grad_norm": 3.120242118835449, "learning_rate": 1.1630650244713917e-05, "loss": 0.3051, "step": 3584 }, { "epoch": 0.17090553714871404, "grad_norm": 1.7223788499832153, "learning_rate": 1.1626774720850031e-05, "loss": 0.6868, "step": 3585 }, { "epoch": 0.17095320954401355, "grad_norm": 7.772683143615723, "learning_rate": 1.1622898945991559e-05, "loss": 0.5232, "step": 3586 }, { "epoch": 0.17100088193931304, "grad_norm": 2.0280861854553223, "learning_rate": 1.1619022920736491e-05, "loss": 0.3724, "step": 3587 }, { "epoch": 0.17104855433461255, "grad_norm": 0.9415162801742554, "learning_rate": 1.161514664568286e-05, "loss": 0.4759, "step": 3588 }, { "epoch": 0.17109622672991204, "grad_norm": 1.3747600317001343, "learning_rate": 1.1611270121428736e-05, "loss": 0.6423, "step": 3589 }, { "epoch": 0.17114389912521155, "grad_norm": 1.7101765871047974, "learning_rate": 1.160739334857222e-05, "loss": 0.7427, "step": 3590 }, { "epoch": 0.17119157152051104, "grad_norm": 2.4855165481567383, "learning_rate": 1.1603516327711466e-05, "loss": 0.9986, "step": 3591 }, { "epoch": 0.17123924391581055, "grad_norm": 1.3298126459121704, "learning_rate": 1.1599639059444657e-05, "loss": 0.7456, "step": 3592 }, { "epoch": 0.17128691631111004, "grad_norm": 4.453553676605225, "learning_rate": 1.1595761544370015e-05, "loss": 0.6444, "step": 3593 }, { "epoch": 0.17133458870640955, "grad_norm": 2.1800248622894287, "learning_rate": 1.1591883783085799e-05, "loss": 1.1236, "step": 3594 }, { "epoch": 0.17138226110170907, "grad_norm": 6.065691947937012, "learning_rate": 1.1588005776190305e-05, "loss": 0.571, "step": 3595 }, { "epoch": 0.17142993349700855, "grad_norm": 1.7908965349197388, "learning_rate": 1.1584127524281877e-05, "loss": 1.0544, "step": 3596 }, { "epoch": 0.17147760589230807, "grad_norm": 1.4564570188522339, "learning_rate": 1.1580249027958883e-05, "loss": 0.883, "step": 3597 }, { "epoch": 0.17152527828760755, "grad_norm": 2.0037107467651367, "learning_rate": 1.1576370287819737e-05, "loss": 0.8547, "step": 3598 }, { "epoch": 0.17157295068290707, "grad_norm": 1.92339026927948, "learning_rate": 1.1572491304462891e-05, "loss": 0.341, "step": 3599 }, { "epoch": 0.17162062307820655, "grad_norm": 1.6973270177841187, "learning_rate": 1.156861207848683e-05, "loss": 0.5529, "step": 3600 }, { "epoch": 0.17166829547350607, "grad_norm": 1.7145075798034668, "learning_rate": 1.156473261049008e-05, "loss": 0.6902, "step": 3601 }, { "epoch": 0.17171596786880558, "grad_norm": 2.240851879119873, "learning_rate": 1.15608529010712e-05, "loss": 0.8563, "step": 3602 }, { "epoch": 0.17176364026410507, "grad_norm": 2.0044214725494385, "learning_rate": 1.1556972950828791e-05, "loss": 0.7618, "step": 3603 }, { "epoch": 0.17181131265940458, "grad_norm": 2.3290843963623047, "learning_rate": 1.1553092760361488e-05, "loss": 1.208, "step": 3604 }, { "epoch": 0.17185898505470407, "grad_norm": 1.4555487632751465, "learning_rate": 1.1549212330267969e-05, "loss": 0.2519, "step": 3605 }, { "epoch": 0.17190665745000358, "grad_norm": 1.7847747802734375, "learning_rate": 1.1545331661146941e-05, "loss": 0.6814, "step": 3606 }, { "epoch": 0.17195432984530307, "grad_norm": 2.0160207748413086, "learning_rate": 1.1541450753597147e-05, "loss": 0.7952, "step": 3607 }, { "epoch": 0.17200200224060258, "grad_norm": 1.2327224016189575, "learning_rate": 1.1537569608217381e-05, "loss": 0.3049, "step": 3608 }, { "epoch": 0.1720496746359021, "grad_norm": 1.5503227710723877, "learning_rate": 1.1533688225606458e-05, "loss": 0.6507, "step": 3609 }, { "epoch": 0.17209734703120158, "grad_norm": 1.5489264726638794, "learning_rate": 1.1529806606363234e-05, "loss": 1.0682, "step": 3610 }, { "epoch": 0.1721450194265011, "grad_norm": 1.8247203826904297, "learning_rate": 1.1525924751086603e-05, "loss": 0.5006, "step": 3611 }, { "epoch": 0.17219269182180058, "grad_norm": 1.4162834882736206, "learning_rate": 1.15220426603755e-05, "loss": 0.78, "step": 3612 }, { "epoch": 0.1722403642171001, "grad_norm": 1.4422036409378052, "learning_rate": 1.1518160334828885e-05, "loss": 0.7021, "step": 3613 }, { "epoch": 0.17228803661239958, "grad_norm": 1.94001305103302, "learning_rate": 1.1514277775045768e-05, "loss": 0.893, "step": 3614 }, { "epoch": 0.1723357090076991, "grad_norm": 1.0234631299972534, "learning_rate": 1.1510394981625184e-05, "loss": 0.5148, "step": 3615 }, { "epoch": 0.17238338140299858, "grad_norm": 3.0914082527160645, "learning_rate": 1.1506511955166206e-05, "loss": 1.4842, "step": 3616 }, { "epoch": 0.1724310537982981, "grad_norm": 2.994213104248047, "learning_rate": 1.150262869626795e-05, "loss": 1.0629, "step": 3617 }, { "epoch": 0.1724787261935976, "grad_norm": 2.799489974975586, "learning_rate": 1.1498745205529558e-05, "loss": 0.4918, "step": 3618 }, { "epoch": 0.1725263985888971, "grad_norm": 1.6536370515823364, "learning_rate": 1.1494861483550216e-05, "loss": 1.0997, "step": 3619 }, { "epoch": 0.1725740709841966, "grad_norm": 2.3614556789398193, "learning_rate": 1.1490977530929141e-05, "loss": 0.9053, "step": 3620 }, { "epoch": 0.1726217433794961, "grad_norm": 1.4384511709213257, "learning_rate": 1.1487093348265585e-05, "loss": 0.8032, "step": 3621 }, { "epoch": 0.1726694157747956, "grad_norm": 1.4297435283660889, "learning_rate": 1.1483208936158842e-05, "loss": 0.6918, "step": 3622 }, { "epoch": 0.1727170881700951, "grad_norm": 2.1808807849884033, "learning_rate": 1.1479324295208234e-05, "loss": 1.302, "step": 3623 }, { "epoch": 0.1727647605653946, "grad_norm": 2.944615602493286, "learning_rate": 1.1475439426013122e-05, "loss": 1.0663, "step": 3624 }, { "epoch": 0.17281243296069412, "grad_norm": 0.9485723972320557, "learning_rate": 1.14715543291729e-05, "loss": 0.2848, "step": 3625 }, { "epoch": 0.1728601053559936, "grad_norm": 1.3904765844345093, "learning_rate": 1.1467669005286999e-05, "loss": 0.6437, "step": 3626 }, { "epoch": 0.17290777775129312, "grad_norm": 0.9204992055892944, "learning_rate": 1.1463783454954883e-05, "loss": 0.1347, "step": 3627 }, { "epoch": 0.1729554501465926, "grad_norm": 2.3575785160064697, "learning_rate": 1.1459897678776055e-05, "loss": 1.3396, "step": 3628 }, { "epoch": 0.17300312254189212, "grad_norm": 1.2435863018035889, "learning_rate": 1.1456011677350052e-05, "loss": 0.6202, "step": 3629 }, { "epoch": 0.1730507949371916, "grad_norm": 1.9176143407821655, "learning_rate": 1.1452125451276435e-05, "loss": 0.8125, "step": 3630 }, { "epoch": 0.17309846733249112, "grad_norm": 1.6232742071151733, "learning_rate": 1.1448239001154821e-05, "loss": 0.7718, "step": 3631 }, { "epoch": 0.1731461397277906, "grad_norm": 2.277369260787964, "learning_rate": 1.144435232758484e-05, "loss": 1.2965, "step": 3632 }, { "epoch": 0.17319381212309012, "grad_norm": 1.7735130786895752, "learning_rate": 1.144046543116617e-05, "loss": 0.6857, "step": 3633 }, { "epoch": 0.17324148451838964, "grad_norm": 1.8041505813598633, "learning_rate": 1.1436578312498518e-05, "loss": 0.7382, "step": 3634 }, { "epoch": 0.17328915691368912, "grad_norm": 1.228304147720337, "learning_rate": 1.1432690972181624e-05, "loss": 0.7391, "step": 3635 }, { "epoch": 0.17333682930898864, "grad_norm": 1.7657897472381592, "learning_rate": 1.1428803410815268e-05, "loss": 0.9357, "step": 3636 }, { "epoch": 0.17338450170428812, "grad_norm": 1.4979184865951538, "learning_rate": 1.1424915628999261e-05, "loss": 0.6252, "step": 3637 }, { "epoch": 0.17343217409958764, "grad_norm": 1.5883418321609497, "learning_rate": 1.1421027627333445e-05, "loss": 0.4806, "step": 3638 }, { "epoch": 0.17347984649488712, "grad_norm": 1.2073731422424316, "learning_rate": 1.14171394064177e-05, "loss": 0.7807, "step": 3639 }, { "epoch": 0.17352751889018664, "grad_norm": 5.799956321716309, "learning_rate": 1.141325096685194e-05, "loss": 0.8361, "step": 3640 }, { "epoch": 0.17357519128548615, "grad_norm": 1.3248827457427979, "learning_rate": 1.1409362309236107e-05, "loss": 0.5907, "step": 3641 }, { "epoch": 0.17362286368078564, "grad_norm": 1.3242886066436768, "learning_rate": 1.1405473434170185e-05, "loss": 0.605, "step": 3642 }, { "epoch": 0.17367053607608515, "grad_norm": 1.3338148593902588, "learning_rate": 1.1401584342254183e-05, "loss": 0.689, "step": 3643 }, { "epoch": 0.17371820847138464, "grad_norm": 1.0636639595031738, "learning_rate": 1.1397695034088152e-05, "loss": 0.5682, "step": 3644 }, { "epoch": 0.17376588086668415, "grad_norm": 1.3289719820022583, "learning_rate": 1.1393805510272171e-05, "loss": 0.8583, "step": 3645 }, { "epoch": 0.17381355326198364, "grad_norm": 1.484549880027771, "learning_rate": 1.1389915771406354e-05, "loss": 0.8687, "step": 3646 }, { "epoch": 0.17386122565728315, "grad_norm": 1.180335283279419, "learning_rate": 1.1386025818090847e-05, "loss": 0.6134, "step": 3647 }, { "epoch": 0.17390889805258264, "grad_norm": 2.064446449279785, "learning_rate": 1.138213565092583e-05, "loss": 0.8299, "step": 3648 }, { "epoch": 0.17395657044788215, "grad_norm": 1.5924581289291382, "learning_rate": 1.1378245270511512e-05, "loss": 0.4035, "step": 3649 }, { "epoch": 0.17400424284318167, "grad_norm": 4.692134380340576, "learning_rate": 1.1374354677448145e-05, "loss": 0.8619, "step": 3650 }, { "epoch": 0.17405191523848115, "grad_norm": 1.5653624534606934, "learning_rate": 1.1370463872336004e-05, "loss": 0.9286, "step": 3651 }, { "epoch": 0.17409958763378067, "grad_norm": 0.9868603944778442, "learning_rate": 1.1366572855775397e-05, "loss": 0.4138, "step": 3652 }, { "epoch": 0.17414726002908015, "grad_norm": 1.799821138381958, "learning_rate": 1.1362681628366676e-05, "loss": 0.6377, "step": 3653 }, { "epoch": 0.17419493242437967, "grad_norm": 2.0849995613098145, "learning_rate": 1.1358790190710213e-05, "loss": 0.856, "step": 3654 }, { "epoch": 0.17424260481967915, "grad_norm": 2.911855459213257, "learning_rate": 1.1354898543406411e-05, "loss": 0.4562, "step": 3655 }, { "epoch": 0.17429027721497867, "grad_norm": 1.458886742591858, "learning_rate": 1.1351006687055722e-05, "loss": 0.8462, "step": 3656 }, { "epoch": 0.17433794961027818, "grad_norm": 2.784414052963257, "learning_rate": 1.1347114622258613e-05, "loss": 0.9097, "step": 3657 }, { "epoch": 0.17438562200557767, "grad_norm": 1.0362164974212646, "learning_rate": 1.1343222349615585e-05, "loss": 0.6398, "step": 3658 }, { "epoch": 0.17443329440087718, "grad_norm": 2.4401583671569824, "learning_rate": 1.1339329869727187e-05, "loss": 1.0989, "step": 3659 }, { "epoch": 0.17448096679617667, "grad_norm": 1.9631332159042358, "learning_rate": 1.133543718319398e-05, "loss": 0.3957, "step": 3660 }, { "epoch": 0.17452863919147618, "grad_norm": 1.39589524269104, "learning_rate": 1.1331544290616569e-05, "loss": 0.7613, "step": 3661 }, { "epoch": 0.17457631158677567, "grad_norm": 2.2678937911987305, "learning_rate": 1.1327651192595588e-05, "loss": 0.8191, "step": 3662 }, { "epoch": 0.17462398398207518, "grad_norm": 1.3737080097198486, "learning_rate": 1.1323757889731697e-05, "loss": 0.8491, "step": 3663 }, { "epoch": 0.17467165637737467, "grad_norm": 2.4052364826202393, "learning_rate": 1.1319864382625595e-05, "loss": 1.0834, "step": 3664 }, { "epoch": 0.17471932877267418, "grad_norm": 3.4208598136901855, "learning_rate": 1.1315970671878014e-05, "loss": 0.6112, "step": 3665 }, { "epoch": 0.1747670011679737, "grad_norm": 1.5483438968658447, "learning_rate": 1.1312076758089708e-05, "loss": 0.9585, "step": 3666 }, { "epoch": 0.17481467356327318, "grad_norm": 1.1554005146026611, "learning_rate": 1.130818264186147e-05, "loss": 0.7252, "step": 3667 }, { "epoch": 0.1748623459585727, "grad_norm": 1.7683225870132446, "learning_rate": 1.1304288323794121e-05, "loss": 0.91, "step": 3668 }, { "epoch": 0.17491001835387218, "grad_norm": 1.5654011964797974, "learning_rate": 1.1300393804488519e-05, "loss": 0.64, "step": 3669 }, { "epoch": 0.1749576907491717, "grad_norm": 1.5680348873138428, "learning_rate": 1.1296499084545543e-05, "loss": 1.2859, "step": 3670 }, { "epoch": 0.17500536314447118, "grad_norm": 2.0240871906280518, "learning_rate": 1.1292604164566108e-05, "loss": 1.0842, "step": 3671 }, { "epoch": 0.1750530355397707, "grad_norm": 2.9063687324523926, "learning_rate": 1.1288709045151161e-05, "loss": 1.3241, "step": 3672 }, { "epoch": 0.1751007079350702, "grad_norm": 2.7663965225219727, "learning_rate": 1.128481372690168e-05, "loss": 1.2435, "step": 3673 }, { "epoch": 0.1751483803303697, "grad_norm": 1.1768468618392944, "learning_rate": 1.1280918210418674e-05, "loss": 0.7549, "step": 3674 }, { "epoch": 0.1751960527256692, "grad_norm": 1.5275644063949585, "learning_rate": 1.1277022496303178e-05, "loss": 0.5681, "step": 3675 }, { "epoch": 0.1752437251209687, "grad_norm": 1.4623888731002808, "learning_rate": 1.1273126585156262e-05, "loss": 0.7765, "step": 3676 }, { "epoch": 0.1752913975162682, "grad_norm": 25.827421188354492, "learning_rate": 1.1269230477579025e-05, "loss": 1.0914, "step": 3677 }, { "epoch": 0.1753390699115677, "grad_norm": 1.271023154258728, "learning_rate": 1.1265334174172593e-05, "loss": 0.6713, "step": 3678 }, { "epoch": 0.1753867423068672, "grad_norm": 1.9592735767364502, "learning_rate": 1.1261437675538132e-05, "loss": 0.3998, "step": 3679 }, { "epoch": 0.1754344147021667, "grad_norm": 1.777327299118042, "learning_rate": 1.1257540982276827e-05, "loss": 0.764, "step": 3680 }, { "epoch": 0.1754820870974662, "grad_norm": 1.659556269645691, "learning_rate": 1.1253644094989895e-05, "loss": 0.4603, "step": 3681 }, { "epoch": 0.17552975949276572, "grad_norm": 3.6654441356658936, "learning_rate": 1.1249747014278594e-05, "loss": 0.6015, "step": 3682 }, { "epoch": 0.1755774318880652, "grad_norm": 1.181654453277588, "learning_rate": 1.1245849740744198e-05, "loss": 0.4984, "step": 3683 }, { "epoch": 0.17562510428336472, "grad_norm": 1.0265041589736938, "learning_rate": 1.1241952274988015e-05, "loss": 0.709, "step": 3684 }, { "epoch": 0.1756727766786642, "grad_norm": 1.5340919494628906, "learning_rate": 1.1238054617611384e-05, "loss": 0.8452, "step": 3685 }, { "epoch": 0.17572044907396372, "grad_norm": 1.113054871559143, "learning_rate": 1.1234156769215678e-05, "loss": 0.5516, "step": 3686 }, { "epoch": 0.1757681214692632, "grad_norm": 2.633444309234619, "learning_rate": 1.123025873040229e-05, "loss": 0.9207, "step": 3687 }, { "epoch": 0.17581579386456272, "grad_norm": 1.3964942693710327, "learning_rate": 1.122636050177265e-05, "loss": 0.7796, "step": 3688 }, { "epoch": 0.17586346625986224, "grad_norm": 1.6941229104995728, "learning_rate": 1.1222462083928215e-05, "loss": 1.0525, "step": 3689 }, { "epoch": 0.17591113865516173, "grad_norm": 1.7043280601501465, "learning_rate": 1.1218563477470465e-05, "loss": 0.7201, "step": 3690 }, { "epoch": 0.17595881105046124, "grad_norm": 1.5716497898101807, "learning_rate": 1.1214664683000927e-05, "loss": 0.5047, "step": 3691 }, { "epoch": 0.17600648344576073, "grad_norm": 1.1442688703536987, "learning_rate": 1.121076570112113e-05, "loss": 0.5966, "step": 3692 }, { "epoch": 0.17605415584106024, "grad_norm": 2.763701915740967, "learning_rate": 1.1206866532432657e-05, "loss": 0.6655, "step": 3693 }, { "epoch": 0.17610182823635973, "grad_norm": 2.9142439365386963, "learning_rate": 1.1202967177537105e-05, "loss": 0.5674, "step": 3694 }, { "epoch": 0.17614950063165924, "grad_norm": 7.891298294067383, "learning_rate": 1.1199067637036106e-05, "loss": 0.8924, "step": 3695 }, { "epoch": 0.17619717302695875, "grad_norm": 1.8380059003829956, "learning_rate": 1.1195167911531317e-05, "loss": 0.8322, "step": 3696 }, { "epoch": 0.17624484542225824, "grad_norm": 1.6006748676300049, "learning_rate": 1.1191268001624431e-05, "loss": 0.7625, "step": 3697 }, { "epoch": 0.17629251781755775, "grad_norm": 1.419047474861145, "learning_rate": 1.1187367907917158e-05, "loss": 0.4179, "step": 3698 }, { "epoch": 0.17634019021285724, "grad_norm": 1.0092003345489502, "learning_rate": 1.1183467631011245e-05, "loss": 0.356, "step": 3699 }, { "epoch": 0.17638786260815675, "grad_norm": 1.2002215385437012, "learning_rate": 1.1179567171508463e-05, "loss": 0.6919, "step": 3700 }, { "epoch": 0.17643553500345624, "grad_norm": 2.0352838039398193, "learning_rate": 1.1175666530010612e-05, "loss": 0.8195, "step": 3701 }, { "epoch": 0.17648320739875575, "grad_norm": 5.819756507873535, "learning_rate": 1.1171765707119525e-05, "loss": 0.5558, "step": 3702 }, { "epoch": 0.17653087979405524, "grad_norm": 1.529212474822998, "learning_rate": 1.1167864703437054e-05, "loss": 0.8523, "step": 3703 }, { "epoch": 0.17657855218935475, "grad_norm": 4.227266311645508, "learning_rate": 1.1163963519565086e-05, "loss": 0.6324, "step": 3704 }, { "epoch": 0.17662622458465427, "grad_norm": 1.5722299814224243, "learning_rate": 1.1160062156105536e-05, "loss": 0.7751, "step": 3705 }, { "epoch": 0.17667389697995375, "grad_norm": 1.7496352195739746, "learning_rate": 1.1156160613660341e-05, "loss": 0.7878, "step": 3706 }, { "epoch": 0.17672156937525327, "grad_norm": 3.3151917457580566, "learning_rate": 1.1152258892831468e-05, "loss": 1.2712, "step": 3707 }, { "epoch": 0.17676924177055275, "grad_norm": 2.4798314571380615, "learning_rate": 1.1148356994220917e-05, "loss": 0.7813, "step": 3708 }, { "epoch": 0.17681691416585227, "grad_norm": 3.006855010986328, "learning_rate": 1.1144454918430703e-05, "loss": 0.6662, "step": 3709 }, { "epoch": 0.17686458656115175, "grad_norm": 1.4573267698287964, "learning_rate": 1.1140552666062883e-05, "loss": 0.8147, "step": 3710 }, { "epoch": 0.17691225895645127, "grad_norm": 2.5887489318847656, "learning_rate": 1.1136650237719534e-05, "loss": 0.8309, "step": 3711 }, { "epoch": 0.17695993135175078, "grad_norm": 2.1036903858184814, "learning_rate": 1.1132747634002754e-05, "loss": 0.81, "step": 3712 }, { "epoch": 0.17700760374705027, "grad_norm": 1.2894703149795532, "learning_rate": 1.1128844855514684e-05, "loss": 0.5711, "step": 3713 }, { "epoch": 0.17705527614234978, "grad_norm": 2.5630784034729004, "learning_rate": 1.1124941902857475e-05, "loss": 0.2478, "step": 3714 }, { "epoch": 0.17710294853764927, "grad_norm": 4.644442558288574, "learning_rate": 1.1121038776633315e-05, "loss": 0.3775, "step": 3715 }, { "epoch": 0.17715062093294878, "grad_norm": 1.6223654747009277, "learning_rate": 1.1117135477444417e-05, "loss": 0.7915, "step": 3716 }, { "epoch": 0.17719829332824827, "grad_norm": 11.437932014465332, "learning_rate": 1.111323200589302e-05, "loss": 0.3528, "step": 3717 }, { "epoch": 0.17724596572354778, "grad_norm": 1.5484358072280884, "learning_rate": 1.1109328362581385e-05, "loss": 1.1167, "step": 3718 }, { "epoch": 0.17729363811884727, "grad_norm": 2.085814952850342, "learning_rate": 1.110542454811181e-05, "loss": 0.752, "step": 3719 }, { "epoch": 0.17734131051414678, "grad_norm": 2.661550283432007, "learning_rate": 1.1101520563086612e-05, "loss": 0.748, "step": 3720 }, { "epoch": 0.1773889829094463, "grad_norm": 1.7953928709030151, "learning_rate": 1.1097616408108134e-05, "loss": 0.8813, "step": 3721 }, { "epoch": 0.17743665530474578, "grad_norm": 1.3717807531356812, "learning_rate": 1.1093712083778748e-05, "loss": 0.6016, "step": 3722 }, { "epoch": 0.1774843277000453, "grad_norm": 1.6871592998504639, "learning_rate": 1.1089807590700848e-05, "loss": 0.8489, "step": 3723 }, { "epoch": 0.17753200009534478, "grad_norm": 1.757038950920105, "learning_rate": 1.108590292947686e-05, "loss": 0.7763, "step": 3724 }, { "epoch": 0.1775796724906443, "grad_norm": 1.2920114994049072, "learning_rate": 1.1081998100709232e-05, "loss": 0.6205, "step": 3725 }, { "epoch": 0.17762734488594378, "grad_norm": 1.526204228401184, "learning_rate": 1.1078093105000441e-05, "loss": 0.3761, "step": 3726 }, { "epoch": 0.1776750172812433, "grad_norm": 1.0621832609176636, "learning_rate": 1.1074187942952985e-05, "loss": 0.6272, "step": 3727 }, { "epoch": 0.1777226896765428, "grad_norm": 1.669875144958496, "learning_rate": 1.1070282615169395e-05, "loss": 0.6544, "step": 3728 }, { "epoch": 0.1777703620718423, "grad_norm": 3.1241908073425293, "learning_rate": 1.1066377122252216e-05, "loss": 0.9816, "step": 3729 }, { "epoch": 0.1778180344671418, "grad_norm": 1.6103582382202148, "learning_rate": 1.106247146480403e-05, "loss": 0.6744, "step": 3730 }, { "epoch": 0.1778657068624413, "grad_norm": 1.1220877170562744, "learning_rate": 1.1058565643427439e-05, "loss": 0.716, "step": 3731 }, { "epoch": 0.1779133792577408, "grad_norm": 2.1707518100738525, "learning_rate": 1.1054659658725067e-05, "loss": 0.6998, "step": 3732 }, { "epoch": 0.1779610516530403, "grad_norm": 1.7029175758361816, "learning_rate": 1.1050753511299572e-05, "loss": 0.7615, "step": 3733 }, { "epoch": 0.1780087240483398, "grad_norm": 2.493659496307373, "learning_rate": 1.1046847201753632e-05, "loss": 0.6215, "step": 3734 }, { "epoch": 0.1780563964436393, "grad_norm": 2.4561851024627686, "learning_rate": 1.104294073068995e-05, "loss": 0.9062, "step": 3735 }, { "epoch": 0.1781040688389388, "grad_norm": 1.631298303604126, "learning_rate": 1.1039034098711251e-05, "loss": 0.367, "step": 3736 }, { "epoch": 0.17815174123423833, "grad_norm": 1.754578709602356, "learning_rate": 1.1035127306420295e-05, "loss": 0.842, "step": 3737 }, { "epoch": 0.1781994136295378, "grad_norm": 1.5287071466445923, "learning_rate": 1.1031220354419849e-05, "loss": 0.5886, "step": 3738 }, { "epoch": 0.17824708602483733, "grad_norm": 2.307190179824829, "learning_rate": 1.1027313243312726e-05, "loss": 0.5273, "step": 3739 }, { "epoch": 0.1782947584201368, "grad_norm": 1.314386248588562, "learning_rate": 1.1023405973701746e-05, "loss": 0.4677, "step": 3740 }, { "epoch": 0.17834243081543633, "grad_norm": 1.8360697031021118, "learning_rate": 1.1019498546189765e-05, "loss": 0.533, "step": 3741 }, { "epoch": 0.1783901032107358, "grad_norm": 2.1292521953582764, "learning_rate": 1.1015590961379657e-05, "loss": 0.8392, "step": 3742 }, { "epoch": 0.17843777560603533, "grad_norm": 3.5823183059692383, "learning_rate": 1.1011683219874324e-05, "loss": 1.535, "step": 3743 }, { "epoch": 0.17848544800133484, "grad_norm": 2.161022424697876, "learning_rate": 1.1007775322276687e-05, "loss": 0.8815, "step": 3744 }, { "epoch": 0.17853312039663433, "grad_norm": 1.5481677055358887, "learning_rate": 1.1003867269189696e-05, "loss": 0.5935, "step": 3745 }, { "epoch": 0.17858079279193384, "grad_norm": 1.7577028274536133, "learning_rate": 1.099995906121632e-05, "loss": 0.4881, "step": 3746 }, { "epoch": 0.17862846518723333, "grad_norm": 1.5946173667907715, "learning_rate": 1.0996050698959561e-05, "loss": 0.5955, "step": 3747 }, { "epoch": 0.17867613758253284, "grad_norm": 2.6982274055480957, "learning_rate": 1.0992142183022438e-05, "loss": 0.867, "step": 3748 }, { "epoch": 0.17872380997783233, "grad_norm": 2.225858449935913, "learning_rate": 1.0988233514007991e-05, "loss": 0.9166, "step": 3749 }, { "epoch": 0.17877148237313184, "grad_norm": 1.9222170114517212, "learning_rate": 1.0984324692519292e-05, "loss": 0.811, "step": 3750 }, { "epoch": 0.17881915476843133, "grad_norm": 1.1785584688186646, "learning_rate": 1.098041571915943e-05, "loss": 0.709, "step": 3751 }, { "epoch": 0.17886682716373084, "grad_norm": 1.0912861824035645, "learning_rate": 1.0976506594531515e-05, "loss": 0.4086, "step": 3752 }, { "epoch": 0.17891449955903035, "grad_norm": 1.475943922996521, "learning_rate": 1.0972597319238692e-05, "loss": 0.5044, "step": 3753 }, { "epoch": 0.17896217195432984, "grad_norm": 1.7712129354476929, "learning_rate": 1.0968687893884118e-05, "loss": 0.7004, "step": 3754 }, { "epoch": 0.17900984434962935, "grad_norm": 1.078445553779602, "learning_rate": 1.0964778319070974e-05, "loss": 0.1896, "step": 3755 }, { "epoch": 0.17905751674492884, "grad_norm": 1.7492010593414307, "learning_rate": 1.0960868595402474e-05, "loss": 0.8861, "step": 3756 }, { "epoch": 0.17910518914022835, "grad_norm": 6.3724517822265625, "learning_rate": 1.0956958723481845e-05, "loss": 0.74, "step": 3757 }, { "epoch": 0.17915286153552784, "grad_norm": 3.0286405086517334, "learning_rate": 1.095304870391234e-05, "loss": 0.7326, "step": 3758 }, { "epoch": 0.17920053393082735, "grad_norm": 1.0813490152359009, "learning_rate": 1.0949138537297233e-05, "loss": 0.5341, "step": 3759 }, { "epoch": 0.17924820632612687, "grad_norm": 2.031029462814331, "learning_rate": 1.0945228224239823e-05, "loss": 0.8732, "step": 3760 }, { "epoch": 0.17929587872142635, "grad_norm": 1.719015121459961, "learning_rate": 1.0941317765343433e-05, "loss": 0.9455, "step": 3761 }, { "epoch": 0.17934355111672587, "grad_norm": 1.0207024812698364, "learning_rate": 1.0937407161211406e-05, "loss": 0.5429, "step": 3762 }, { "epoch": 0.17939122351202536, "grad_norm": 1.4251781702041626, "learning_rate": 1.0933496412447105e-05, "loss": 0.6332, "step": 3763 }, { "epoch": 0.17943889590732487, "grad_norm": 1.6352126598358154, "learning_rate": 1.0929585519653924e-05, "loss": 0.8364, "step": 3764 }, { "epoch": 0.17948656830262436, "grad_norm": 1.2944365739822388, "learning_rate": 1.092567448343527e-05, "loss": 0.5629, "step": 3765 }, { "epoch": 0.17953424069792387, "grad_norm": 1.97201669216156, "learning_rate": 1.0921763304394574e-05, "loss": 0.8375, "step": 3766 }, { "epoch": 0.17958191309322336, "grad_norm": 2.4572038650512695, "learning_rate": 1.0917851983135294e-05, "loss": 0.8321, "step": 3767 }, { "epoch": 0.17962958548852287, "grad_norm": 1.7717958688735962, "learning_rate": 1.0913940520260906e-05, "loss": 0.4658, "step": 3768 }, { "epoch": 0.17967725788382238, "grad_norm": 31.82832145690918, "learning_rate": 1.0910028916374904e-05, "loss": 0.8127, "step": 3769 }, { "epoch": 0.17972493027912187, "grad_norm": 2.1751232147216797, "learning_rate": 1.0906117172080812e-05, "loss": 0.7162, "step": 3770 }, { "epoch": 0.17977260267442138, "grad_norm": 1.5871031284332275, "learning_rate": 1.0902205287982175e-05, "loss": 0.6128, "step": 3771 }, { "epoch": 0.17982027506972087, "grad_norm": 1.3700811862945557, "learning_rate": 1.0898293264682549e-05, "loss": 0.823, "step": 3772 }, { "epoch": 0.17986794746502038, "grad_norm": 1.4328159093856812, "learning_rate": 1.0894381102785527e-05, "loss": 0.5723, "step": 3773 }, { "epoch": 0.17991561986031987, "grad_norm": 1.3425663709640503, "learning_rate": 1.0890468802894712e-05, "loss": 0.7518, "step": 3774 }, { "epoch": 0.17996329225561938, "grad_norm": 1.1192046403884888, "learning_rate": 1.0886556365613725e-05, "loss": 0.7811, "step": 3775 }, { "epoch": 0.1800109646509189, "grad_norm": 1.260384202003479, "learning_rate": 1.0882643791546224e-05, "loss": 0.6662, "step": 3776 }, { "epoch": 0.18005863704621838, "grad_norm": 1.7328011989593506, "learning_rate": 1.0878731081295874e-05, "loss": 0.9567, "step": 3777 }, { "epoch": 0.1801063094415179, "grad_norm": 1.9984331130981445, "learning_rate": 1.0874818235466366e-05, "loss": 0.7515, "step": 3778 }, { "epoch": 0.18015398183681738, "grad_norm": 1.9605036973953247, "learning_rate": 1.0870905254661418e-05, "loss": 0.4301, "step": 3779 }, { "epoch": 0.1802016542321169, "grad_norm": 1.9046077728271484, "learning_rate": 1.0866992139484755e-05, "loss": 0.9394, "step": 3780 }, { "epoch": 0.18024932662741638, "grad_norm": 2.5434470176696777, "learning_rate": 1.0863078890540133e-05, "loss": 0.8089, "step": 3781 }, { "epoch": 0.1802969990227159, "grad_norm": 1.6785893440246582, "learning_rate": 1.0859165508431329e-05, "loss": 1.0743, "step": 3782 }, { "epoch": 0.1803446714180154, "grad_norm": 1.5858412981033325, "learning_rate": 1.085525199376213e-05, "loss": 0.6425, "step": 3783 }, { "epoch": 0.1803923438133149, "grad_norm": 1.2266967296600342, "learning_rate": 1.0851338347136358e-05, "loss": 0.7408, "step": 3784 }, { "epoch": 0.1804400162086144, "grad_norm": 3.621140241622925, "learning_rate": 1.0847424569157847e-05, "loss": 0.4324, "step": 3785 }, { "epoch": 0.1804876886039139, "grad_norm": 3.730668544769287, "learning_rate": 1.0843510660430447e-05, "loss": 1.2772, "step": 3786 }, { "epoch": 0.1805353609992134, "grad_norm": 3.8443429470062256, "learning_rate": 1.0839596621558045e-05, "loss": 0.6629, "step": 3787 }, { "epoch": 0.1805830333945129, "grad_norm": 3.6548566818237305, "learning_rate": 1.0835682453144527e-05, "loss": 0.8204, "step": 3788 }, { "epoch": 0.1806307057898124, "grad_norm": 2.259329080581665, "learning_rate": 1.0831768155793814e-05, "loss": 1.1254, "step": 3789 }, { "epoch": 0.1806783781851119, "grad_norm": 1.7217439413070679, "learning_rate": 1.082785373010984e-05, "loss": 0.7948, "step": 3790 }, { "epoch": 0.1807260505804114, "grad_norm": 1.624069094657898, "learning_rate": 1.0823939176696561e-05, "loss": 0.6251, "step": 3791 }, { "epoch": 0.18077372297571093, "grad_norm": 3.0085394382476807, "learning_rate": 1.082002449615795e-05, "loss": 0.7022, "step": 3792 }, { "epoch": 0.1808213953710104, "grad_norm": 1.255738377571106, "learning_rate": 1.0816109689098004e-05, "loss": 0.8176, "step": 3793 }, { "epoch": 0.18086906776630993, "grad_norm": 2.689150810241699, "learning_rate": 1.081219475612074e-05, "loss": 0.546, "step": 3794 }, { "epoch": 0.1809167401616094, "grad_norm": 1.3958780765533447, "learning_rate": 1.0808279697830188e-05, "loss": 0.8049, "step": 3795 }, { "epoch": 0.18096441255690893, "grad_norm": 1.2954460382461548, "learning_rate": 1.08043645148304e-05, "loss": 0.7955, "step": 3796 }, { "epoch": 0.1810120849522084, "grad_norm": 2.3237876892089844, "learning_rate": 1.0800449207725453e-05, "loss": 1.0901, "step": 3797 }, { "epoch": 0.18105975734750793, "grad_norm": 2.630173683166504, "learning_rate": 1.0796533777119435e-05, "loss": 0.8656, "step": 3798 }, { "epoch": 0.18110742974280744, "grad_norm": 4.283743381500244, "learning_rate": 1.079261822361646e-05, "loss": 1.0043, "step": 3799 }, { "epoch": 0.18115510213810693, "grad_norm": 1.5384756326675415, "learning_rate": 1.0788702547820654e-05, "loss": 0.8652, "step": 3800 }, { "epoch": 0.18120277453340644, "grad_norm": 2.6268727779388428, "learning_rate": 1.0784786750336165e-05, "loss": 0.6213, "step": 3801 }, { "epoch": 0.18125044692870593, "grad_norm": 2.5609583854675293, "learning_rate": 1.0780870831767166e-05, "loss": 0.8244, "step": 3802 }, { "epoch": 0.18129811932400544, "grad_norm": 1.5510481595993042, "learning_rate": 1.0776954792717835e-05, "loss": 1.0069, "step": 3803 }, { "epoch": 0.18134579171930493, "grad_norm": 1.1264429092407227, "learning_rate": 1.0773038633792385e-05, "loss": 0.4657, "step": 3804 }, { "epoch": 0.18139346411460444, "grad_norm": 2.5910754203796387, "learning_rate": 1.0769122355595031e-05, "loss": 0.8916, "step": 3805 }, { "epoch": 0.18144113650990393, "grad_norm": 0.9827473163604736, "learning_rate": 1.0765205958730018e-05, "loss": 0.5123, "step": 3806 }, { "epoch": 0.18148880890520344, "grad_norm": 2.5792088508605957, "learning_rate": 1.0761289443801608e-05, "loss": 1.0205, "step": 3807 }, { "epoch": 0.18153648130050296, "grad_norm": 1.2586830854415894, "learning_rate": 1.0757372811414075e-05, "loss": 0.7238, "step": 3808 }, { "epoch": 0.18158415369580244, "grad_norm": 2.0719401836395264, "learning_rate": 1.0753456062171716e-05, "loss": 0.6748, "step": 3809 }, { "epoch": 0.18163182609110196, "grad_norm": 2.038454532623291, "learning_rate": 1.0749539196678849e-05, "loss": 0.7693, "step": 3810 }, { "epoch": 0.18167949848640144, "grad_norm": 1.2887102365493774, "learning_rate": 1.0745622215539801e-05, "loss": 0.4609, "step": 3811 }, { "epoch": 0.18172717088170096, "grad_norm": 1.0278254747390747, "learning_rate": 1.0741705119358922e-05, "loss": 0.4919, "step": 3812 }, { "epoch": 0.18177484327700044, "grad_norm": 1.805465579032898, "learning_rate": 1.0737787908740582e-05, "loss": 0.9819, "step": 3813 }, { "epoch": 0.18182251567229996, "grad_norm": 1.6648166179656982, "learning_rate": 1.0733870584289168e-05, "loss": 0.5149, "step": 3814 }, { "epoch": 0.18187018806759947, "grad_norm": 1.674871802330017, "learning_rate": 1.0729953146609076e-05, "loss": 0.4941, "step": 3815 }, { "epoch": 0.18191786046289896, "grad_norm": 1.9727791547775269, "learning_rate": 1.0726035596304733e-05, "loss": 0.8186, "step": 3816 }, { "epoch": 0.18196553285819847, "grad_norm": 2.5273120403289795, "learning_rate": 1.0722117933980574e-05, "loss": 0.5947, "step": 3817 }, { "epoch": 0.18201320525349796, "grad_norm": 1.1986948251724243, "learning_rate": 1.0718200160241054e-05, "loss": 0.6234, "step": 3818 }, { "epoch": 0.18206087764879747, "grad_norm": 1.3057129383087158, "learning_rate": 1.0714282275690646e-05, "loss": 0.9209, "step": 3819 }, { "epoch": 0.18210855004409696, "grad_norm": 1.3036905527114868, "learning_rate": 1.0710364280933839e-05, "loss": 0.8521, "step": 3820 }, { "epoch": 0.18215622243939647, "grad_norm": 1.7449041604995728, "learning_rate": 1.0706446176575137e-05, "loss": 0.8996, "step": 3821 }, { "epoch": 0.18220389483469596, "grad_norm": 1.8671306371688843, "learning_rate": 1.0702527963219064e-05, "loss": 0.6253, "step": 3822 }, { "epoch": 0.18225156722999547, "grad_norm": 1.781281590461731, "learning_rate": 1.0698609641470161e-05, "loss": 1.0288, "step": 3823 }, { "epoch": 0.18229923962529498, "grad_norm": 1.8120882511138916, "learning_rate": 1.0694691211932986e-05, "loss": 0.9012, "step": 3824 }, { "epoch": 0.18234691202059447, "grad_norm": 2.3989880084991455, "learning_rate": 1.0690772675212112e-05, "loss": 0.888, "step": 3825 }, { "epoch": 0.18239458441589398, "grad_norm": 2.982454776763916, "learning_rate": 1.0686854031912126e-05, "loss": 0.8754, "step": 3826 }, { "epoch": 0.18244225681119347, "grad_norm": 2.698946475982666, "learning_rate": 1.0682935282637638e-05, "loss": 0.9195, "step": 3827 }, { "epoch": 0.18248992920649298, "grad_norm": 1.4840878248214722, "learning_rate": 1.0679016427993267e-05, "loss": 0.53, "step": 3828 }, { "epoch": 0.18253760160179247, "grad_norm": 2.0324044227600098, "learning_rate": 1.0675097468583653e-05, "loss": 0.8444, "step": 3829 }, { "epoch": 0.18258527399709198, "grad_norm": 1.8995931148529053, "learning_rate": 1.0671178405013454e-05, "loss": 0.5282, "step": 3830 }, { "epoch": 0.1826329463923915, "grad_norm": 1.3772923946380615, "learning_rate": 1.066725923788734e-05, "loss": 0.6219, "step": 3831 }, { "epoch": 0.18268061878769098, "grad_norm": 1.3472472429275513, "learning_rate": 1.0663339967809991e-05, "loss": 0.6163, "step": 3832 }, { "epoch": 0.1827282911829905, "grad_norm": 2.7855474948883057, "learning_rate": 1.0659420595386123e-05, "loss": 1.1621, "step": 3833 }, { "epoch": 0.18277596357828998, "grad_norm": 1.4489245414733887, "learning_rate": 1.0655501121220446e-05, "loss": 0.6857, "step": 3834 }, { "epoch": 0.1828236359735895, "grad_norm": 2.5283961296081543, "learning_rate": 1.0651581545917693e-05, "loss": 0.2947, "step": 3835 }, { "epoch": 0.18287130836888899, "grad_norm": 1.3052552938461304, "learning_rate": 1.064766187008262e-05, "loss": 0.7609, "step": 3836 }, { "epoch": 0.1829189807641885, "grad_norm": 4.39094877243042, "learning_rate": 1.0643742094319991e-05, "loss": 0.6984, "step": 3837 }, { "epoch": 0.18296665315948799, "grad_norm": 4.12162446975708, "learning_rate": 1.0639822219234583e-05, "loss": 0.1054, "step": 3838 }, { "epoch": 0.1830143255547875, "grad_norm": 1.4367767572402954, "learning_rate": 1.0635902245431198e-05, "loss": 0.4412, "step": 3839 }, { "epoch": 0.183061997950087, "grad_norm": 1.297636866569519, "learning_rate": 1.0631982173514645e-05, "loss": 0.5826, "step": 3840 }, { "epoch": 0.1831096703453865, "grad_norm": 1.4141820669174194, "learning_rate": 1.062806200408975e-05, "loss": 0.3926, "step": 3841 }, { "epoch": 0.183157342740686, "grad_norm": 1.2304821014404297, "learning_rate": 1.0624141737761356e-05, "loss": 0.7535, "step": 3842 }, { "epoch": 0.1832050151359855, "grad_norm": 1.5290307998657227, "learning_rate": 1.0620221375134319e-05, "loss": 0.9564, "step": 3843 }, { "epoch": 0.183252687531285, "grad_norm": 5.774345397949219, "learning_rate": 1.0616300916813509e-05, "loss": 0.332, "step": 3844 }, { "epoch": 0.1833003599265845, "grad_norm": 1.33428156375885, "learning_rate": 1.0612380363403818e-05, "loss": 0.6447, "step": 3845 }, { "epoch": 0.183348032321884, "grad_norm": 1.1961020231246948, "learning_rate": 1.060845971551014e-05, "loss": 0.9196, "step": 3846 }, { "epoch": 0.18339570471718353, "grad_norm": 1.279846429824829, "learning_rate": 1.0604538973737394e-05, "loss": 0.5155, "step": 3847 }, { "epoch": 0.183443377112483, "grad_norm": 1.2324726581573486, "learning_rate": 1.0600618138690514e-05, "loss": 0.726, "step": 3848 }, { "epoch": 0.18349104950778253, "grad_norm": 2.030451774597168, "learning_rate": 1.0596697210974436e-05, "loss": 0.7574, "step": 3849 }, { "epoch": 0.183538721903082, "grad_norm": 1.6552375555038452, "learning_rate": 1.0592776191194126e-05, "loss": 0.6017, "step": 3850 }, { "epoch": 0.18358639429838153, "grad_norm": 1.3873900175094604, "learning_rate": 1.0588855079954552e-05, "loss": 0.5501, "step": 3851 }, { "epoch": 0.18363406669368101, "grad_norm": 1.1930038928985596, "learning_rate": 1.05849338778607e-05, "loss": 0.7093, "step": 3852 }, { "epoch": 0.18368173908898053, "grad_norm": 1.1213637590408325, "learning_rate": 1.058101258551758e-05, "loss": 0.7103, "step": 3853 }, { "epoch": 0.18372941148428001, "grad_norm": 3.5747599601745605, "learning_rate": 1.05770912035302e-05, "loss": 0.0847, "step": 3854 }, { "epoch": 0.18377708387957953, "grad_norm": 1.7481714487075806, "learning_rate": 1.0573169732503592e-05, "loss": 0.8045, "step": 3855 }, { "epoch": 0.18382475627487904, "grad_norm": 1.6177853345870972, "learning_rate": 1.0569248173042793e-05, "loss": 0.7111, "step": 3856 }, { "epoch": 0.18387242867017853, "grad_norm": 1.1562814712524414, "learning_rate": 1.0565326525752866e-05, "loss": 0.7206, "step": 3857 }, { "epoch": 0.18392010106547804, "grad_norm": 1.4583311080932617, "learning_rate": 1.0561404791238875e-05, "loss": 0.8123, "step": 3858 }, { "epoch": 0.18396777346077753, "grad_norm": 1.5818381309509277, "learning_rate": 1.0557482970105907e-05, "loss": 1.2063, "step": 3859 }, { "epoch": 0.18401544585607704, "grad_norm": 2.0157089233398438, "learning_rate": 1.0553561062959056e-05, "loss": 0.6947, "step": 3860 }, { "epoch": 0.18406311825137653, "grad_norm": 1.8058491945266724, "learning_rate": 1.0549639070403437e-05, "loss": 1.1887, "step": 3861 }, { "epoch": 0.18411079064667604, "grad_norm": 0.9012666940689087, "learning_rate": 1.0545716993044168e-05, "loss": 0.4464, "step": 3862 }, { "epoch": 0.18415846304197556, "grad_norm": 1.0894057750701904, "learning_rate": 1.0541794831486388e-05, "loss": 0.4666, "step": 3863 }, { "epoch": 0.18420613543727504, "grad_norm": 2.1592535972595215, "learning_rate": 1.0537872586335245e-05, "loss": 0.8279, "step": 3864 }, { "epoch": 0.18425380783257456, "grad_norm": 1.3710857629776, "learning_rate": 1.05339502581959e-05, "loss": 0.7208, "step": 3865 }, { "epoch": 0.18430148022787404, "grad_norm": 2.9551916122436523, "learning_rate": 1.0530027847673526e-05, "loss": 0.6664, "step": 3866 }, { "epoch": 0.18434915262317356, "grad_norm": 1.4025201797485352, "learning_rate": 1.0526105355373318e-05, "loss": 0.7266, "step": 3867 }, { "epoch": 0.18439682501847304, "grad_norm": 1.3041059970855713, "learning_rate": 1.0522182781900467e-05, "loss": 1.0898, "step": 3868 }, { "epoch": 0.18444449741377256, "grad_norm": 2.556823492050171, "learning_rate": 1.0518260127860192e-05, "loss": 1.1303, "step": 3869 }, { "epoch": 0.18449216980907204, "grad_norm": 1.3298100233078003, "learning_rate": 1.0514337393857718e-05, "loss": 0.7412, "step": 3870 }, { "epoch": 0.18453984220437156, "grad_norm": 1.5906391143798828, "learning_rate": 1.0510414580498283e-05, "loss": 0.8786, "step": 3871 }, { "epoch": 0.18458751459967107, "grad_norm": 3.597646474838257, "learning_rate": 1.0506491688387128e-05, "loss": 0.8626, "step": 3872 }, { "epoch": 0.18463518699497056, "grad_norm": 2.4983737468719482, "learning_rate": 1.0502568718129526e-05, "loss": 0.4904, "step": 3873 }, { "epoch": 0.18468285939027007, "grad_norm": 1.5360634326934814, "learning_rate": 1.0498645670330746e-05, "loss": 0.4983, "step": 3874 }, { "epoch": 0.18473053178556956, "grad_norm": 1.3316617012023926, "learning_rate": 1.049472254559607e-05, "loss": 0.6926, "step": 3875 }, { "epoch": 0.18477820418086907, "grad_norm": 2.1072819232940674, "learning_rate": 1.0490799344530804e-05, "loss": 1.1203, "step": 3876 }, { "epoch": 0.18482587657616856, "grad_norm": 1.4880831241607666, "learning_rate": 1.0486876067740253e-05, "loss": 0.6833, "step": 3877 }, { "epoch": 0.18487354897146807, "grad_norm": 3.8898813724517822, "learning_rate": 1.0482952715829737e-05, "loss": 0.4716, "step": 3878 }, { "epoch": 0.18492122136676759, "grad_norm": 1.723595142364502, "learning_rate": 1.0479029289404592e-05, "loss": 0.9794, "step": 3879 }, { "epoch": 0.18496889376206707, "grad_norm": 3.458982229232788, "learning_rate": 1.0475105789070157e-05, "loss": 0.5757, "step": 3880 }, { "epoch": 0.18501656615736659, "grad_norm": 1.9958633184432983, "learning_rate": 1.0471182215431796e-05, "loss": 0.3679, "step": 3881 }, { "epoch": 0.18506423855266607, "grad_norm": 2.694352149963379, "learning_rate": 1.046725856909487e-05, "loss": 0.8458, "step": 3882 }, { "epoch": 0.18511191094796559, "grad_norm": 1.2007461786270142, "learning_rate": 1.0463334850664757e-05, "loss": 0.9068, "step": 3883 }, { "epoch": 0.18515958334326507, "grad_norm": 1.968867301940918, "learning_rate": 1.0459411060746848e-05, "loss": 0.9438, "step": 3884 }, { "epoch": 0.18520725573856459, "grad_norm": 4.607203006744385, "learning_rate": 1.0455487199946547e-05, "loss": 0.6974, "step": 3885 }, { "epoch": 0.1852549281338641, "grad_norm": 1.1567150354385376, "learning_rate": 1.0451563268869258e-05, "loss": 0.7309, "step": 3886 }, { "epoch": 0.18530260052916359, "grad_norm": 1.4846583604812622, "learning_rate": 1.0447639268120409e-05, "loss": 0.5524, "step": 3887 }, { "epoch": 0.1853502729244631, "grad_norm": 1.5285459756851196, "learning_rate": 1.0443715198305432e-05, "loss": 0.2858, "step": 3888 }, { "epoch": 0.18539794531976259, "grad_norm": 4.7506208419799805, "learning_rate": 1.0439791060029765e-05, "loss": 0.3033, "step": 3889 }, { "epoch": 0.1854456177150621, "grad_norm": 2.634995937347412, "learning_rate": 1.0435866853898869e-05, "loss": 0.5784, "step": 3890 }, { "epoch": 0.18549329011036159, "grad_norm": 1.5000189542770386, "learning_rate": 1.0431942580518207e-05, "loss": 0.788, "step": 3891 }, { "epoch": 0.1855409625056611, "grad_norm": 1.276197910308838, "learning_rate": 1.0428018240493247e-05, "loss": 0.7052, "step": 3892 }, { "epoch": 0.1855886349009606, "grad_norm": 1.6995823383331299, "learning_rate": 1.0424093834429487e-05, "loss": 0.7176, "step": 3893 }, { "epoch": 0.1856363072962601, "grad_norm": 1.4082237482070923, "learning_rate": 1.0420169362932416e-05, "loss": 0.7163, "step": 3894 }, { "epoch": 0.18568397969155961, "grad_norm": 6.1425251960754395, "learning_rate": 1.0416244826607533e-05, "loss": 0.4488, "step": 3895 }, { "epoch": 0.1857316520868591, "grad_norm": 2.4104936122894287, "learning_rate": 1.0412320226060364e-05, "loss": 1.0537, "step": 3896 }, { "epoch": 0.18577932448215861, "grad_norm": 1.3676190376281738, "learning_rate": 1.0408395561896429e-05, "loss": 0.7454, "step": 3897 }, { "epoch": 0.1858269968774581, "grad_norm": 1.383682131767273, "learning_rate": 1.0404470834721265e-05, "loss": 0.7932, "step": 3898 }, { "epoch": 0.18587466927275761, "grad_norm": 1.5696673393249512, "learning_rate": 1.0400546045140416e-05, "loss": 0.6699, "step": 3899 }, { "epoch": 0.1859223416680571, "grad_norm": 3.1046829223632812, "learning_rate": 1.039662119375944e-05, "loss": 1.0755, "step": 3900 }, { "epoch": 0.18597001406335661, "grad_norm": 1.342029094696045, "learning_rate": 1.0392696281183893e-05, "loss": 0.6093, "step": 3901 }, { "epoch": 0.18601768645865613, "grad_norm": 1.4734547138214111, "learning_rate": 1.0388771308019359e-05, "loss": 0.7575, "step": 3902 }, { "epoch": 0.18606535885395561, "grad_norm": 3.215017318725586, "learning_rate": 1.0384846274871412e-05, "loss": 0.6839, "step": 3903 }, { "epoch": 0.18611303124925513, "grad_norm": 2.5554721355438232, "learning_rate": 1.038092118234565e-05, "loss": 1.1114, "step": 3904 }, { "epoch": 0.18616070364455461, "grad_norm": 1.4321552515029907, "learning_rate": 1.037699603104767e-05, "loss": 0.7825, "step": 3905 }, { "epoch": 0.18620837603985413, "grad_norm": 1.71304452419281, "learning_rate": 1.0373070821583084e-05, "loss": 0.7759, "step": 3906 }, { "epoch": 0.18625604843515361, "grad_norm": 4.079225063323975, "learning_rate": 1.0369145554557516e-05, "loss": 0.2851, "step": 3907 }, { "epoch": 0.18630372083045313, "grad_norm": 1.5605511665344238, "learning_rate": 1.0365220230576592e-05, "loss": 0.7748, "step": 3908 }, { "epoch": 0.18635139322575262, "grad_norm": 30.554336547851562, "learning_rate": 1.0361294850245942e-05, "loss": 0.868, "step": 3909 }, { "epoch": 0.18639906562105213, "grad_norm": 1.908445119857788, "learning_rate": 1.0357369414171219e-05, "loss": 0.8372, "step": 3910 }, { "epoch": 0.18644673801635164, "grad_norm": 1.4731189012527466, "learning_rate": 1.0353443922958078e-05, "loss": 0.929, "step": 3911 }, { "epoch": 0.18649441041165113, "grad_norm": 1.6199272871017456, "learning_rate": 1.0349518377212175e-05, "loss": 1.0831, "step": 3912 }, { "epoch": 0.18654208280695064, "grad_norm": 1.8650221824645996, "learning_rate": 1.0345592777539189e-05, "loss": 0.5358, "step": 3913 }, { "epoch": 0.18658975520225013, "grad_norm": 1.8524514436721802, "learning_rate": 1.0341667124544797e-05, "loss": 0.5273, "step": 3914 }, { "epoch": 0.18663742759754964, "grad_norm": 8.39456558227539, "learning_rate": 1.0337741418834683e-05, "loss": 0.8568, "step": 3915 }, { "epoch": 0.18668509999284913, "grad_norm": 1.458449363708496, "learning_rate": 1.033381566101455e-05, "loss": 0.5621, "step": 3916 }, { "epoch": 0.18673277238814864, "grad_norm": 1.5252214670181274, "learning_rate": 1.0329889851690094e-05, "loss": 0.6014, "step": 3917 }, { "epoch": 0.18678044478344816, "grad_norm": 3.2252800464630127, "learning_rate": 1.0325963991467031e-05, "loss": 0.6638, "step": 3918 }, { "epoch": 0.18682811717874764, "grad_norm": 1.4949884414672852, "learning_rate": 1.0322038080951084e-05, "loss": 0.8773, "step": 3919 }, { "epoch": 0.18687578957404716, "grad_norm": 1.5134341716766357, "learning_rate": 1.0318112120747977e-05, "loss": 0.4869, "step": 3920 }, { "epoch": 0.18692346196934664, "grad_norm": 2.3841423988342285, "learning_rate": 1.0314186111463444e-05, "loss": 0.2674, "step": 3921 }, { "epoch": 0.18697113436464616, "grad_norm": 1.7857775688171387, "learning_rate": 1.0310260053703231e-05, "loss": 0.7092, "step": 3922 }, { "epoch": 0.18701880675994564, "grad_norm": 1.8670042753219604, "learning_rate": 1.0306333948073089e-05, "loss": 0.6532, "step": 3923 }, { "epoch": 0.18706647915524516, "grad_norm": 2.2968075275421143, "learning_rate": 1.030240779517877e-05, "loss": 1.337, "step": 3924 }, { "epoch": 0.18711415155054464, "grad_norm": 3.2001328468322754, "learning_rate": 1.0298481595626045e-05, "loss": 0.5752, "step": 3925 }, { "epoch": 0.18716182394584416, "grad_norm": 1.341929316520691, "learning_rate": 1.0294555350020678e-05, "loss": 0.9104, "step": 3926 }, { "epoch": 0.18720949634114367, "grad_norm": 1.3757683038711548, "learning_rate": 1.0290629058968457e-05, "loss": 0.6054, "step": 3927 }, { "epoch": 0.18725716873644316, "grad_norm": 1.4756373167037964, "learning_rate": 1.0286702723075167e-05, "loss": 0.8232, "step": 3928 }, { "epoch": 0.18730484113174267, "grad_norm": 4.100464344024658, "learning_rate": 1.0282776342946597e-05, "loss": 0.5475, "step": 3929 }, { "epoch": 0.18735251352704216, "grad_norm": 2.667470693588257, "learning_rate": 1.0278849919188551e-05, "loss": 0.0291, "step": 3930 }, { "epoch": 0.18740018592234167, "grad_norm": 1.8539137840270996, "learning_rate": 1.0274923452406835e-05, "loss": 0.6703, "step": 3931 }, { "epoch": 0.18744785831764116, "grad_norm": 2.61194109916687, "learning_rate": 1.0270996943207258e-05, "loss": 0.2901, "step": 3932 }, { "epoch": 0.18749553071294067, "grad_norm": 1.727474570274353, "learning_rate": 1.0267070392195646e-05, "loss": 0.5151, "step": 3933 }, { "epoch": 0.1875432031082402, "grad_norm": 2.6027114391326904, "learning_rate": 1.0263143799977824e-05, "loss": 0.6154, "step": 3934 }, { "epoch": 0.18759087550353967, "grad_norm": 1.6948940753936768, "learning_rate": 1.025921716715962e-05, "loss": 0.897, "step": 3935 }, { "epoch": 0.1876385478988392, "grad_norm": 1.958555817604065, "learning_rate": 1.0255290494346877e-05, "loss": 0.5878, "step": 3936 }, { "epoch": 0.18768622029413867, "grad_norm": 1.528274655342102, "learning_rate": 1.0251363782145443e-05, "loss": 0.4614, "step": 3937 }, { "epoch": 0.1877338926894382, "grad_norm": 1.605790376663208, "learning_rate": 1.0247437031161162e-05, "loss": 1.0658, "step": 3938 }, { "epoch": 0.18778156508473767, "grad_norm": 2.38838529586792, "learning_rate": 1.0243510241999898e-05, "loss": 0.4949, "step": 3939 }, { "epoch": 0.1878292374800372, "grad_norm": 15.202858924865723, "learning_rate": 1.0239583415267509e-05, "loss": 1.0663, "step": 3940 }, { "epoch": 0.18787690987533667, "grad_norm": 1.548436164855957, "learning_rate": 1.0235656551569868e-05, "loss": 0.5715, "step": 3941 }, { "epoch": 0.1879245822706362, "grad_norm": 1.743993878364563, "learning_rate": 1.0231729651512847e-05, "loss": 0.814, "step": 3942 }, { "epoch": 0.1879722546659357, "grad_norm": 1.2997312545776367, "learning_rate": 1.0227802715702326e-05, "loss": 0.7499, "step": 3943 }, { "epoch": 0.1880199270612352, "grad_norm": 1.7470053434371948, "learning_rate": 1.0223875744744194e-05, "loss": 0.8215, "step": 3944 }, { "epoch": 0.1880675994565347, "grad_norm": 1.3334025144577026, "learning_rate": 1.021994873924434e-05, "loss": 0.8206, "step": 3945 }, { "epoch": 0.1881152718518342, "grad_norm": 1.8813772201538086, "learning_rate": 1.021602169980866e-05, "loss": 0.5871, "step": 3946 }, { "epoch": 0.1881629442471337, "grad_norm": 1.4710915088653564, "learning_rate": 1.0212094627043056e-05, "loss": 0.8113, "step": 3947 }, { "epoch": 0.1882106166424332, "grad_norm": 1.6951239109039307, "learning_rate": 1.0208167521553439e-05, "loss": 1.0451, "step": 3948 }, { "epoch": 0.1882582890377327, "grad_norm": 3.9029242992401123, "learning_rate": 1.0204240383945709e-05, "loss": 0.9608, "step": 3949 }, { "epoch": 0.18830596143303222, "grad_norm": 2.543529748916626, "learning_rate": 1.0200313214825797e-05, "loss": 0.7797, "step": 3950 }, { "epoch": 0.1883536338283317, "grad_norm": 1.1120450496673584, "learning_rate": 1.0196386014799617e-05, "loss": 0.4686, "step": 3951 }, { "epoch": 0.18840130622363122, "grad_norm": 2.798509359359741, "learning_rate": 1.0192458784473099e-05, "loss": 1.1725, "step": 3952 }, { "epoch": 0.1884489786189307, "grad_norm": 1.2831599712371826, "learning_rate": 1.0188531524452173e-05, "loss": 0.7103, "step": 3953 }, { "epoch": 0.18849665101423022, "grad_norm": 1.679078221321106, "learning_rate": 1.018460423534277e-05, "loss": 0.6849, "step": 3954 }, { "epoch": 0.1885443234095297, "grad_norm": 1.7133469581604004, "learning_rate": 1.0180676917750839e-05, "loss": 0.5772, "step": 3955 }, { "epoch": 0.18859199580482922, "grad_norm": 1.840065360069275, "learning_rate": 1.0176749572282318e-05, "loss": 0.8097, "step": 3956 }, { "epoch": 0.1886396682001287, "grad_norm": 2.323977470397949, "learning_rate": 1.0172822199543155e-05, "loss": 1.2464, "step": 3957 }, { "epoch": 0.18868734059542822, "grad_norm": 4.106752872467041, "learning_rate": 1.0168894800139311e-05, "loss": 0.4467, "step": 3958 }, { "epoch": 0.18873501299072773, "grad_norm": 1.7676013708114624, "learning_rate": 1.0164967374676737e-05, "loss": 0.9212, "step": 3959 }, { "epoch": 0.18878268538602722, "grad_norm": 1.6974899768829346, "learning_rate": 1.0161039923761398e-05, "loss": 0.7235, "step": 3960 }, { "epoch": 0.18883035778132673, "grad_norm": 1.3777499198913574, "learning_rate": 1.0157112447999255e-05, "loss": 0.5702, "step": 3961 }, { "epoch": 0.18887803017662622, "grad_norm": 2.2918481826782227, "learning_rate": 1.0153184947996282e-05, "loss": 1.1125, "step": 3962 }, { "epoch": 0.18892570257192573, "grad_norm": 1.5073621273040771, "learning_rate": 1.0149257424358445e-05, "loss": 0.7786, "step": 3963 }, { "epoch": 0.18897337496722522, "grad_norm": 2.86057186126709, "learning_rate": 1.0145329877691725e-05, "loss": 1.3107, "step": 3964 }, { "epoch": 0.18902104736252473, "grad_norm": 1.8101078271865845, "learning_rate": 1.0141402308602104e-05, "loss": 0.6794, "step": 3965 }, { "epoch": 0.18906871975782424, "grad_norm": 1.1594735383987427, "learning_rate": 1.0137474717695561e-05, "loss": 0.3229, "step": 3966 }, { "epoch": 0.18911639215312373, "grad_norm": 1.6829912662506104, "learning_rate": 1.0133547105578085e-05, "loss": 0.8185, "step": 3967 }, { "epoch": 0.18916406454842324, "grad_norm": 5.886511325836182, "learning_rate": 1.012961947285567e-05, "loss": 1.3177, "step": 3968 }, { "epoch": 0.18921173694372273, "grad_norm": 2.4712436199188232, "learning_rate": 1.0125691820134299e-05, "loss": 0.9348, "step": 3969 }, { "epoch": 0.18925940933902224, "grad_norm": 1.824073314666748, "learning_rate": 1.0121764148019977e-05, "loss": 0.8565, "step": 3970 }, { "epoch": 0.18930708173432173, "grad_norm": 1.8286038637161255, "learning_rate": 1.0117836457118701e-05, "loss": 0.7106, "step": 3971 }, { "epoch": 0.18935475412962124, "grad_norm": 1.9167611598968506, "learning_rate": 1.0113908748036471e-05, "loss": 0.9992, "step": 3972 }, { "epoch": 0.18940242652492076, "grad_norm": 1.2062486410140991, "learning_rate": 1.0109981021379297e-05, "loss": 0.3719, "step": 3973 }, { "epoch": 0.18945009892022024, "grad_norm": 2.559468984603882, "learning_rate": 1.0106053277753182e-05, "loss": 0.7084, "step": 3974 }, { "epoch": 0.18949777131551976, "grad_norm": 1.1755224466323853, "learning_rate": 1.0102125517764144e-05, "loss": 0.5891, "step": 3975 }, { "epoch": 0.18954544371081924, "grad_norm": 2.8774361610412598, "learning_rate": 1.0098197742018185e-05, "loss": 0.5111, "step": 3976 }, { "epoch": 0.18959311610611876, "grad_norm": 1.055750846862793, "learning_rate": 1.0094269951121326e-05, "loss": 0.517, "step": 3977 }, { "epoch": 0.18964078850141824, "grad_norm": 11.34118366241455, "learning_rate": 1.0090342145679584e-05, "loss": 0.486, "step": 3978 }, { "epoch": 0.18968846089671776, "grad_norm": 1.7106672525405884, "learning_rate": 1.008641432629898e-05, "loss": 0.7893, "step": 3979 }, { "epoch": 0.18973613329201724, "grad_norm": 1.1219589710235596, "learning_rate": 1.0082486493585535e-05, "loss": 0.1533, "step": 3980 }, { "epoch": 0.18978380568731676, "grad_norm": 0.9951594471931458, "learning_rate": 1.0078558648145273e-05, "loss": 0.2722, "step": 3981 }, { "epoch": 0.18983147808261627, "grad_norm": 1.7601436376571655, "learning_rate": 1.0074630790584223e-05, "loss": 0.6116, "step": 3982 }, { "epoch": 0.18987915047791576, "grad_norm": 1.6753166913986206, "learning_rate": 1.0070702921508408e-05, "loss": 0.642, "step": 3983 }, { "epoch": 0.18992682287321527, "grad_norm": 1.6184039115905762, "learning_rate": 1.0066775041523864e-05, "loss": 0.706, "step": 3984 }, { "epoch": 0.18997449526851476, "grad_norm": 1.3840320110321045, "learning_rate": 1.0062847151236616e-05, "loss": 0.744, "step": 3985 }, { "epoch": 0.19002216766381427, "grad_norm": 4.343174457550049, "learning_rate": 1.00589192512527e-05, "loss": 0.6515, "step": 3986 }, { "epoch": 0.19006984005911376, "grad_norm": 2.5381290912628174, "learning_rate": 1.005499134217815e-05, "loss": 0.8103, "step": 3987 }, { "epoch": 0.19011751245441327, "grad_norm": 1.3251681327819824, "learning_rate": 1.0051063424619e-05, "loss": 0.5401, "step": 3988 }, { "epoch": 0.1901651848497128, "grad_norm": 1.403380036354065, "learning_rate": 1.0047135499181293e-05, "loss": 0.8215, "step": 3989 }, { "epoch": 0.19021285724501227, "grad_norm": 1.2663720846176147, "learning_rate": 1.0043207566471064e-05, "loss": 0.7769, "step": 3990 }, { "epoch": 0.1902605296403118, "grad_norm": 1.170798420906067, "learning_rate": 1.0039279627094352e-05, "loss": 0.4485, "step": 3991 }, { "epoch": 0.19030820203561127, "grad_norm": 2.168261766433716, "learning_rate": 1.0035351681657194e-05, "loss": 0.8187, "step": 3992 }, { "epoch": 0.1903558744309108, "grad_norm": 2.069822072982788, "learning_rate": 1.0031423730765642e-05, "loss": 0.8345, "step": 3993 }, { "epoch": 0.19040354682621027, "grad_norm": 1.8551433086395264, "learning_rate": 1.0027495775025726e-05, "loss": 0.373, "step": 3994 }, { "epoch": 0.1904512192215098, "grad_norm": 1.0069206953048706, "learning_rate": 1.0023567815043498e-05, "loss": 0.4186, "step": 3995 }, { "epoch": 0.19049889161680927, "grad_norm": 4.528952598571777, "learning_rate": 1.0019639851424998e-05, "loss": 0.6763, "step": 3996 }, { "epoch": 0.1905465640121088, "grad_norm": 1.0347591638565063, "learning_rate": 1.0015711884776274e-05, "loss": 0.4398, "step": 3997 }, { "epoch": 0.1905942364074083, "grad_norm": 3.2478513717651367, "learning_rate": 1.0011783915703367e-05, "loss": 0.7674, "step": 3998 }, { "epoch": 0.1906419088027078, "grad_norm": 1.9077250957489014, "learning_rate": 1.0007855944812321e-05, "loss": 1.1326, "step": 3999 }, { "epoch": 0.1906895811980073, "grad_norm": 2.356350898742676, "learning_rate": 1.0003927972709182e-05, "loss": 0.8314, "step": 4000 }, { "epoch": 0.1907372535933068, "grad_norm": 1.7501803636550903, "learning_rate": 1e-05, "loss": 0.7022, "step": 4001 }, { "epoch": 0.1907849259886063, "grad_norm": 1.702653408050537, "learning_rate": 9.996072027290818e-06, "loss": 1.1079, "step": 4002 }, { "epoch": 0.1908325983839058, "grad_norm": 1.1961506605148315, "learning_rate": 9.992144055187684e-06, "loss": 0.5683, "step": 4003 }, { "epoch": 0.1908802707792053, "grad_norm": 1.8550877571105957, "learning_rate": 9.988216084296637e-06, "loss": 0.7196, "step": 4004 }, { "epoch": 0.19092794317450482, "grad_norm": 1.1281499862670898, "learning_rate": 9.984288115223729e-06, "loss": 0.4459, "step": 4005 }, { "epoch": 0.1909756155698043, "grad_norm": 1.5966535806655884, "learning_rate": 9.980360148575006e-06, "loss": 0.9497, "step": 4006 }, { "epoch": 0.19102328796510382, "grad_norm": 1.5581865310668945, "learning_rate": 9.976432184956504e-06, "loss": 0.9498, "step": 4007 }, { "epoch": 0.1910709603604033, "grad_norm": 1.785199761390686, "learning_rate": 9.972504224974274e-06, "loss": 0.9696, "step": 4008 }, { "epoch": 0.19111863275570282, "grad_norm": 2.1413733959198, "learning_rate": 9.968576269234365e-06, "loss": 0.6122, "step": 4009 }, { "epoch": 0.1911663051510023, "grad_norm": 2.5225419998168945, "learning_rate": 9.964648318342807e-06, "loss": 0.9041, "step": 4010 }, { "epoch": 0.19121397754630182, "grad_norm": 1.5725998878479004, "learning_rate": 9.960720372905651e-06, "loss": 0.7385, "step": 4011 }, { "epoch": 0.1912616499416013, "grad_norm": 5.730638027191162, "learning_rate": 9.95679243352894e-06, "loss": 0.3228, "step": 4012 }, { "epoch": 0.19130932233690082, "grad_norm": 1.994070053100586, "learning_rate": 9.95286450081871e-06, "loss": 1.1382, "step": 4013 }, { "epoch": 0.19135699473220033, "grad_norm": 1.4327560663223267, "learning_rate": 9.948936575381001e-06, "loss": 0.5335, "step": 4014 }, { "epoch": 0.19140466712749982, "grad_norm": 1.881850004196167, "learning_rate": 9.945008657821856e-06, "loss": 0.4408, "step": 4015 }, { "epoch": 0.19145233952279933, "grad_norm": 2.0671184062957764, "learning_rate": 9.941080748747305e-06, "loss": 0.7581, "step": 4016 }, { "epoch": 0.19150001191809882, "grad_norm": 1.9491459131240845, "learning_rate": 9.937152848763387e-06, "loss": 0.6457, "step": 4017 }, { "epoch": 0.19154768431339833, "grad_norm": 2.5914411544799805, "learning_rate": 9.933224958476143e-06, "loss": 0.5489, "step": 4018 }, { "epoch": 0.19159535670869782, "grad_norm": 2.014749526977539, "learning_rate": 9.929297078491594e-06, "loss": 0.8239, "step": 4019 }, { "epoch": 0.19164302910399733, "grad_norm": 1.5102659463882446, "learning_rate": 9.92536920941578e-06, "loss": 0.4571, "step": 4020 }, { "epoch": 0.19169070149929684, "grad_norm": 5.525020122528076, "learning_rate": 9.921441351854727e-06, "loss": 1.7096, "step": 4021 }, { "epoch": 0.19173837389459633, "grad_norm": 2.299715995788574, "learning_rate": 9.917513506414468e-06, "loss": 0.6074, "step": 4022 }, { "epoch": 0.19178604628989585, "grad_norm": 1.9847575426101685, "learning_rate": 9.913585673701023e-06, "loss": 0.7237, "step": 4023 }, { "epoch": 0.19183371868519533, "grad_norm": 1.1055468320846558, "learning_rate": 9.909657854320417e-06, "loss": 0.3065, "step": 4024 }, { "epoch": 0.19188139108049485, "grad_norm": 1.3308771848678589, "learning_rate": 9.905730048878678e-06, "loss": 0.7146, "step": 4025 }, { "epoch": 0.19192906347579433, "grad_norm": 2.196897506713867, "learning_rate": 9.901802257981819e-06, "loss": 1.2178, "step": 4026 }, { "epoch": 0.19197673587109385, "grad_norm": 2.5359132289886475, "learning_rate": 9.897874482235862e-06, "loss": 0.366, "step": 4027 }, { "epoch": 0.19202440826639333, "grad_norm": 2.311821937561035, "learning_rate": 9.893946722246821e-06, "loss": 0.7793, "step": 4028 }, { "epoch": 0.19207208066169285, "grad_norm": 1.1079131364822388, "learning_rate": 9.890018978620706e-06, "loss": 0.641, "step": 4029 }, { "epoch": 0.19211975305699236, "grad_norm": 2.7376606464385986, "learning_rate": 9.886091251963529e-06, "loss": 0.1924, "step": 4030 }, { "epoch": 0.19216742545229185, "grad_norm": 1.7969475984573364, "learning_rate": 9.882163542881304e-06, "loss": 0.7858, "step": 4031 }, { "epoch": 0.19221509784759136, "grad_norm": 3.4324734210968018, "learning_rate": 9.878235851980027e-06, "loss": 0.5442, "step": 4032 }, { "epoch": 0.19226277024289085, "grad_norm": 1.6245849132537842, "learning_rate": 9.874308179865701e-06, "loss": 0.848, "step": 4033 }, { "epoch": 0.19231044263819036, "grad_norm": 1.5750185251235962, "learning_rate": 9.870380527144336e-06, "loss": 0.9363, "step": 4034 }, { "epoch": 0.19235811503348985, "grad_norm": 1.8717321157455444, "learning_rate": 9.866452894421918e-06, "loss": 0.566, "step": 4035 }, { "epoch": 0.19240578742878936, "grad_norm": 1.7825829982757568, "learning_rate": 9.86252528230444e-06, "loss": 0.7021, "step": 4036 }, { "epoch": 0.19245345982408887, "grad_norm": 3.949215888977051, "learning_rate": 9.858597691397901e-06, "loss": 0.986, "step": 4037 }, { "epoch": 0.19250113221938836, "grad_norm": 6.687702655792236, "learning_rate": 9.854670122308276e-06, "loss": 0.3524, "step": 4038 }, { "epoch": 0.19254880461468787, "grad_norm": 1.5576212406158447, "learning_rate": 9.850742575641557e-06, "loss": 0.9, "step": 4039 }, { "epoch": 0.19259647700998736, "grad_norm": 1.438395619392395, "learning_rate": 9.846815052003723e-06, "loss": 0.6943, "step": 4040 }, { "epoch": 0.19264414940528687, "grad_norm": 2.263612985610962, "learning_rate": 9.842887552000746e-06, "loss": 0.6922, "step": 4041 }, { "epoch": 0.19269182180058636, "grad_norm": 1.5974233150482178, "learning_rate": 9.838960076238604e-06, "loss": 0.7394, "step": 4042 }, { "epoch": 0.19273949419588587, "grad_norm": 1.368720531463623, "learning_rate": 9.835032625323265e-06, "loss": 0.788, "step": 4043 }, { "epoch": 0.19278716659118536, "grad_norm": 1.6274160146713257, "learning_rate": 9.83110519986069e-06, "loss": 0.6309, "step": 4044 }, { "epoch": 0.19283483898648487, "grad_norm": 1.827691912651062, "learning_rate": 9.827177800456843e-06, "loss": 1.0083, "step": 4045 }, { "epoch": 0.1928825113817844, "grad_norm": 1.566077470779419, "learning_rate": 9.823250427717687e-06, "loss": 0.7521, "step": 4046 }, { "epoch": 0.19293018377708387, "grad_norm": 1.1719876527786255, "learning_rate": 9.819323082249165e-06, "loss": 0.6426, "step": 4047 }, { "epoch": 0.1929778561723834, "grad_norm": 0.6636733412742615, "learning_rate": 9.81539576465723e-06, "loss": 0.4049, "step": 4048 }, { "epoch": 0.19302552856768287, "grad_norm": 2.6347463130950928, "learning_rate": 9.811468475547832e-06, "loss": 0.7113, "step": 4049 }, { "epoch": 0.1930732009629824, "grad_norm": 2.6905677318573, "learning_rate": 9.807541215526906e-06, "loss": 0.9128, "step": 4050 }, { "epoch": 0.19312087335828187, "grad_norm": 1.991546869277954, "learning_rate": 9.803613985200385e-06, "loss": 0.6767, "step": 4051 }, { "epoch": 0.1931685457535814, "grad_norm": 1.0742383003234863, "learning_rate": 9.799686785174208e-06, "loss": 0.6039, "step": 4052 }, { "epoch": 0.1932162181488809, "grad_norm": 1.293224573135376, "learning_rate": 9.795759616054293e-06, "loss": 0.4643, "step": 4053 }, { "epoch": 0.1932638905441804, "grad_norm": 1.7305622100830078, "learning_rate": 9.791832478446566e-06, "loss": 0.5691, "step": 4054 }, { "epoch": 0.1933115629394799, "grad_norm": 2.7719335556030273, "learning_rate": 9.787905372956947e-06, "loss": 0.9588, "step": 4055 }, { "epoch": 0.1933592353347794, "grad_norm": 1.6359543800354004, "learning_rate": 9.783978300191343e-06, "loss": 0.8012, "step": 4056 }, { "epoch": 0.1934069077300789, "grad_norm": 1.640873670578003, "learning_rate": 9.780051260755663e-06, "loss": 1.0023, "step": 4057 }, { "epoch": 0.1934545801253784, "grad_norm": 1.903261661529541, "learning_rate": 9.776124255255808e-06, "loss": 0.7936, "step": 4058 }, { "epoch": 0.1935022525206779, "grad_norm": 1.6489269733428955, "learning_rate": 9.772197284297677e-06, "loss": 0.8606, "step": 4059 }, { "epoch": 0.19354992491597742, "grad_norm": 2.577099323272705, "learning_rate": 9.768270348487156e-06, "loss": 1.0964, "step": 4060 }, { "epoch": 0.1935975973112769, "grad_norm": 2.1362030506134033, "learning_rate": 9.764343448430132e-06, "loss": 0.2655, "step": 4061 }, { "epoch": 0.19364526970657642, "grad_norm": 1.6332634687423706, "learning_rate": 9.760416584732494e-06, "loss": 0.5944, "step": 4062 }, { "epoch": 0.1936929421018759, "grad_norm": 2.6982839107513428, "learning_rate": 9.756489758000105e-06, "loss": 0.9662, "step": 4063 }, { "epoch": 0.19374061449717542, "grad_norm": 2.517205238342285, "learning_rate": 9.75256296883884e-06, "loss": 0.7213, "step": 4064 }, { "epoch": 0.1937882868924749, "grad_norm": 2.0091564655303955, "learning_rate": 9.748636217854562e-06, "loss": 0.7532, "step": 4065 }, { "epoch": 0.19383595928777442, "grad_norm": 2.6380233764648438, "learning_rate": 9.744709505653126e-06, "loss": 0.8532, "step": 4066 }, { "epoch": 0.1938836316830739, "grad_norm": 2.9615659713745117, "learning_rate": 9.740782832840382e-06, "loss": 1.0955, "step": 4067 }, { "epoch": 0.19393130407837342, "grad_norm": 1.9009617567062378, "learning_rate": 9.736856200022182e-06, "loss": 0.9071, "step": 4068 }, { "epoch": 0.19397897647367293, "grad_norm": 2.4427125453948975, "learning_rate": 9.732929607804357e-06, "loss": 1.1511, "step": 4069 }, { "epoch": 0.19402664886897242, "grad_norm": 1.3123414516448975, "learning_rate": 9.729003056792742e-06, "loss": 0.7402, "step": 4070 }, { "epoch": 0.19407432126427193, "grad_norm": 1.9339609146118164, "learning_rate": 9.72507654759317e-06, "loss": 0.6079, "step": 4071 }, { "epoch": 0.19412199365957142, "grad_norm": 1.584667682647705, "learning_rate": 9.721150080811452e-06, "loss": 0.9517, "step": 4072 }, { "epoch": 0.19416966605487093, "grad_norm": 1.3803532123565674, "learning_rate": 9.717223657053403e-06, "loss": 0.8105, "step": 4073 }, { "epoch": 0.19421733845017042, "grad_norm": 1.4207947254180908, "learning_rate": 9.713297276924838e-06, "loss": 0.8498, "step": 4074 }, { "epoch": 0.19426501084546993, "grad_norm": 1.5667606592178345, "learning_rate": 9.709370941031544e-06, "loss": 0.5529, "step": 4075 }, { "epoch": 0.19431268324076945, "grad_norm": 2.121001720428467, "learning_rate": 9.705444649979322e-06, "loss": 0.6464, "step": 4076 }, { "epoch": 0.19436035563606893, "grad_norm": 3.191556215286255, "learning_rate": 9.701518404373962e-06, "loss": 0.8502, "step": 4077 }, { "epoch": 0.19440802803136845, "grad_norm": 2.538217067718506, "learning_rate": 9.697592204821233e-06, "loss": 1.1788, "step": 4078 }, { "epoch": 0.19445570042666793, "grad_norm": 3.672064781188965, "learning_rate": 9.693666051926915e-06, "loss": 1.0213, "step": 4079 }, { "epoch": 0.19450337282196745, "grad_norm": 5.480026721954346, "learning_rate": 9.689739946296772e-06, "loss": 0.5324, "step": 4080 }, { "epoch": 0.19455104521726693, "grad_norm": 2.3922767639160156, "learning_rate": 9.685813888536559e-06, "loss": 0.607, "step": 4081 }, { "epoch": 0.19459871761256645, "grad_norm": 1.8322621583938599, "learning_rate": 9.681887879252025e-06, "loss": 0.9569, "step": 4082 }, { "epoch": 0.19464639000786593, "grad_norm": 3.0608551502227783, "learning_rate": 9.67796191904892e-06, "loss": 0.7135, "step": 4083 }, { "epoch": 0.19469406240316545, "grad_norm": 1.4460397958755493, "learning_rate": 9.67403600853297e-06, "loss": 0.7031, "step": 4084 }, { "epoch": 0.19474173479846496, "grad_norm": 3.0253090858459473, "learning_rate": 9.670110148309907e-06, "loss": 1.0597, "step": 4085 }, { "epoch": 0.19478940719376445, "grad_norm": 1.6821322441101074, "learning_rate": 9.666184338985456e-06, "loss": 0.4781, "step": 4086 }, { "epoch": 0.19483707958906396, "grad_norm": 2.3137426376342773, "learning_rate": 9.66225858116532e-06, "loss": 0.7536, "step": 4087 }, { "epoch": 0.19488475198436345, "grad_norm": 1.8267980813980103, "learning_rate": 9.658332875455207e-06, "loss": 0.5195, "step": 4088 }, { "epoch": 0.19493242437966296, "grad_norm": 1.5063681602478027, "learning_rate": 9.654407222460816e-06, "loss": 0.8574, "step": 4089 }, { "epoch": 0.19498009677496245, "grad_norm": 3.0024847984313965, "learning_rate": 9.650481622787829e-06, "loss": 1.3545, "step": 4090 }, { "epoch": 0.19502776917026196, "grad_norm": 1.8751912117004395, "learning_rate": 9.646556077041925e-06, "loss": 0.689, "step": 4091 }, { "epoch": 0.19507544156556147, "grad_norm": 1.4176849126815796, "learning_rate": 9.642630585828785e-06, "loss": 0.659, "step": 4092 }, { "epoch": 0.19512311396086096, "grad_norm": 1.907494068145752, "learning_rate": 9.638705149754061e-06, "loss": 0.6609, "step": 4093 }, { "epoch": 0.19517078635616047, "grad_norm": 1.0732585191726685, "learning_rate": 9.634779769423412e-06, "loss": 0.5324, "step": 4094 }, { "epoch": 0.19521845875145996, "grad_norm": 1.7960597276687622, "learning_rate": 9.630854445442486e-06, "loss": 0.5091, "step": 4095 }, { "epoch": 0.19526613114675948, "grad_norm": 2.283451795578003, "learning_rate": 9.626929178416918e-06, "loss": 0.4262, "step": 4096 }, { "epoch": 0.19531380354205896, "grad_norm": 1.4824867248535156, "learning_rate": 9.623003968952331e-06, "loss": 0.7516, "step": 4097 }, { "epoch": 0.19536147593735848, "grad_norm": 1.1608043909072876, "learning_rate": 9.619078817654352e-06, "loss": 0.4951, "step": 4098 }, { "epoch": 0.19540914833265796, "grad_norm": 1.246159315109253, "learning_rate": 9.615153725128593e-06, "loss": 0.6873, "step": 4099 }, { "epoch": 0.19545682072795748, "grad_norm": 2.472771406173706, "learning_rate": 9.611228691980644e-06, "loss": 0.9136, "step": 4100 }, { "epoch": 0.195504493123257, "grad_norm": 2.861677646636963, "learning_rate": 9.607303718816108e-06, "loss": 1.241, "step": 4101 }, { "epoch": 0.19555216551855648, "grad_norm": 3.6714723110198975, "learning_rate": 9.603378806240564e-06, "loss": 0.8348, "step": 4102 }, { "epoch": 0.195599837913856, "grad_norm": 1.849320888519287, "learning_rate": 9.599453954859586e-06, "loss": 0.6496, "step": 4103 }, { "epoch": 0.19564751030915548, "grad_norm": 0.9038025736808777, "learning_rate": 9.595529165278736e-06, "loss": 0.3049, "step": 4104 }, { "epoch": 0.195695182704455, "grad_norm": 1.3686846494674683, "learning_rate": 9.591604438103574e-06, "loss": 0.7059, "step": 4105 }, { "epoch": 0.19574285509975448, "grad_norm": 1.4731411933898926, "learning_rate": 9.587679773939637e-06, "loss": 0.629, "step": 4106 }, { "epoch": 0.195790527495054, "grad_norm": 2.0155394077301025, "learning_rate": 9.583755173392467e-06, "loss": 0.9051, "step": 4107 }, { "epoch": 0.1958381998903535, "grad_norm": 1.195552945137024, "learning_rate": 9.57983063706759e-06, "loss": 0.7321, "step": 4108 }, { "epoch": 0.195885872285653, "grad_norm": 2.286600112915039, "learning_rate": 9.575906165570515e-06, "loss": 0.5508, "step": 4109 }, { "epoch": 0.1959335446809525, "grad_norm": 2.5784952640533447, "learning_rate": 9.571981759506753e-06, "loss": 0.4551, "step": 4110 }, { "epoch": 0.195981217076252, "grad_norm": 1.0925605297088623, "learning_rate": 9.5680574194818e-06, "loss": 0.6314, "step": 4111 }, { "epoch": 0.1960288894715515, "grad_norm": 1.4741684198379517, "learning_rate": 9.564133146101134e-06, "loss": 0.7571, "step": 4112 }, { "epoch": 0.196076561866851, "grad_norm": 2.2106049060821533, "learning_rate": 9.560208939970236e-06, "loss": 0.5526, "step": 4113 }, { "epoch": 0.1961242342621505, "grad_norm": 1.1798970699310303, "learning_rate": 9.556284801694573e-06, "loss": 0.5691, "step": 4114 }, { "epoch": 0.19617190665745, "grad_norm": 1.6515107154846191, "learning_rate": 9.552360731879593e-06, "loss": 0.5732, "step": 4115 }, { "epoch": 0.1962195790527495, "grad_norm": 2.0230700969696045, "learning_rate": 9.54843673113074e-06, "loss": 0.9739, "step": 4116 }, { "epoch": 0.19626725144804902, "grad_norm": 1.4770721197128296, "learning_rate": 9.544512800053457e-06, "loss": 0.7404, "step": 4117 }, { "epoch": 0.1963149238433485, "grad_norm": 1.1223185062408447, "learning_rate": 9.540588939253153e-06, "loss": 0.4837, "step": 4118 }, { "epoch": 0.19636259623864802, "grad_norm": 3.519036293029785, "learning_rate": 9.536665149335245e-06, "loss": 0.4448, "step": 4119 }, { "epoch": 0.1964102686339475, "grad_norm": 1.6042251586914062, "learning_rate": 9.532741430905135e-06, "loss": 0.5796, "step": 4120 }, { "epoch": 0.19645794102924702, "grad_norm": 1.1171796321868896, "learning_rate": 9.528817784568207e-06, "loss": 0.5406, "step": 4121 }, { "epoch": 0.1965056134245465, "grad_norm": 1.8708196878433228, "learning_rate": 9.524894210929843e-06, "loss": 0.4985, "step": 4122 }, { "epoch": 0.19655328581984602, "grad_norm": 1.24036705493927, "learning_rate": 9.520970710595413e-06, "loss": 0.8731, "step": 4123 }, { "epoch": 0.19660095821514553, "grad_norm": 1.369493842124939, "learning_rate": 9.517047284170266e-06, "loss": 0.8139, "step": 4124 }, { "epoch": 0.19664863061044502, "grad_norm": 1.6081372499465942, "learning_rate": 9.51312393225975e-06, "loss": 0.9162, "step": 4125 }, { "epoch": 0.19669630300574453, "grad_norm": 1.2288099527359009, "learning_rate": 9.509200655469201e-06, "loss": 0.6058, "step": 4126 }, { "epoch": 0.19674397540104402, "grad_norm": 2.0774357318878174, "learning_rate": 9.505277454403932e-06, "loss": 1.003, "step": 4127 }, { "epoch": 0.19679164779634353, "grad_norm": 1.7042399644851685, "learning_rate": 9.501354329669258e-06, "loss": 0.8103, "step": 4128 }, { "epoch": 0.19683932019164302, "grad_norm": 1.418620228767395, "learning_rate": 9.497431281870479e-06, "loss": 0.5618, "step": 4129 }, { "epoch": 0.19688699258694253, "grad_norm": 2.2100532054901123, "learning_rate": 9.493508311612874e-06, "loss": 1.011, "step": 4130 }, { "epoch": 0.19693466498224202, "grad_norm": 1.827114224433899, "learning_rate": 9.48958541950172e-06, "loss": 0.732, "step": 4131 }, { "epoch": 0.19698233737754153, "grad_norm": 2.7010011672973633, "learning_rate": 9.485662606142285e-06, "loss": 0.4864, "step": 4132 }, { "epoch": 0.19703000977284105, "grad_norm": 1.5265750885009766, "learning_rate": 9.48173987213981e-06, "loss": 0.6267, "step": 4133 }, { "epoch": 0.19707768216814053, "grad_norm": 1.00454580783844, "learning_rate": 9.477817218099535e-06, "loss": 0.6996, "step": 4134 }, { "epoch": 0.19712535456344005, "grad_norm": 1.9903172254562378, "learning_rate": 9.473894644626684e-06, "loss": 0.8525, "step": 4135 }, { "epoch": 0.19717302695873953, "grad_norm": 1.7862969636917114, "learning_rate": 9.469972152326476e-06, "loss": 0.4841, "step": 4136 }, { "epoch": 0.19722069935403905, "grad_norm": 1.4141737222671509, "learning_rate": 9.466049741804104e-06, "loss": 0.4438, "step": 4137 }, { "epoch": 0.19726837174933853, "grad_norm": 2.1943376064300537, "learning_rate": 9.462127413664756e-06, "loss": 1.4865, "step": 4138 }, { "epoch": 0.19731604414463805, "grad_norm": 1.1153146028518677, "learning_rate": 9.458205168513616e-06, "loss": 0.2822, "step": 4139 }, { "epoch": 0.19736371653993756, "grad_norm": 1.5740097761154175, "learning_rate": 9.454283006955835e-06, "loss": 0.5132, "step": 4140 }, { "epoch": 0.19741138893523705, "grad_norm": 1.9186357259750366, "learning_rate": 9.450360929596565e-06, "loss": 1.0364, "step": 4141 }, { "epoch": 0.19745906133053656, "grad_norm": 0.9910565614700317, "learning_rate": 9.446438937040947e-06, "loss": 0.6058, "step": 4142 }, { "epoch": 0.19750673372583605, "grad_norm": 1.4343262910842896, "learning_rate": 9.442517029894096e-06, "loss": 0.3146, "step": 4143 }, { "epoch": 0.19755440612113556, "grad_norm": 1.9917984008789062, "learning_rate": 9.438595208761127e-06, "loss": 0.2106, "step": 4144 }, { "epoch": 0.19760207851643505, "grad_norm": 1.7350119352340698, "learning_rate": 9.43467347424714e-06, "loss": 0.6625, "step": 4145 }, { "epoch": 0.19764975091173456, "grad_norm": 1.7454272508621216, "learning_rate": 9.43075182695721e-06, "loss": 0.7095, "step": 4146 }, { "epoch": 0.19769742330703405, "grad_norm": 1.7070093154907227, "learning_rate": 9.426830267496411e-06, "loss": 0.6668, "step": 4147 }, { "epoch": 0.19774509570233356, "grad_norm": 1.5751521587371826, "learning_rate": 9.422908796469804e-06, "loss": 0.493, "step": 4148 }, { "epoch": 0.19779276809763308, "grad_norm": 57.8436393737793, "learning_rate": 9.418987414482422e-06, "loss": 0.624, "step": 4149 }, { "epoch": 0.19784044049293256, "grad_norm": 1.2226852178573608, "learning_rate": 9.415066122139298e-06, "loss": 0.851, "step": 4150 }, { "epoch": 0.19788811288823208, "grad_norm": 1.3573896884918213, "learning_rate": 9.411144920045453e-06, "loss": 0.4827, "step": 4151 }, { "epoch": 0.19793578528353156, "grad_norm": 1.973682165145874, "learning_rate": 9.407223808805878e-06, "loss": 0.6924, "step": 4152 }, { "epoch": 0.19798345767883108, "grad_norm": 3.305941104888916, "learning_rate": 9.403302789025565e-06, "loss": 0.7083, "step": 4153 }, { "epoch": 0.19803113007413056, "grad_norm": 2.532428026199341, "learning_rate": 9.399381861309491e-06, "loss": 0.8382, "step": 4154 }, { "epoch": 0.19807880246943008, "grad_norm": 1.5662442445755005, "learning_rate": 9.395461026262607e-06, "loss": 0.8271, "step": 4155 }, { "epoch": 0.1981264748647296, "grad_norm": 3.439723491668701, "learning_rate": 9.391540284489862e-06, "loss": 0.3782, "step": 4156 }, { "epoch": 0.19817414726002908, "grad_norm": 3.2685232162475586, "learning_rate": 9.387619636596189e-06, "loss": 0.9667, "step": 4157 }, { "epoch": 0.1982218196553286, "grad_norm": 1.0390326976776123, "learning_rate": 9.383699083186493e-06, "loss": 0.571, "step": 4158 }, { "epoch": 0.19826949205062808, "grad_norm": 1.4027167558670044, "learning_rate": 9.379778624865683e-06, "loss": 0.895, "step": 4159 }, { "epoch": 0.1983171644459276, "grad_norm": 2.9695451259613037, "learning_rate": 9.375858262238649e-06, "loss": 0.491, "step": 4160 }, { "epoch": 0.19836483684122708, "grad_norm": 1.326710820198059, "learning_rate": 9.371937995910254e-06, "loss": 0.7099, "step": 4161 }, { "epoch": 0.1984125092365266, "grad_norm": 2.1169679164886475, "learning_rate": 9.368017826485358e-06, "loss": 1.0591, "step": 4162 }, { "epoch": 0.1984601816318261, "grad_norm": 1.3265042304992676, "learning_rate": 9.364097754568805e-06, "loss": 0.6713, "step": 4163 }, { "epoch": 0.1985078540271256, "grad_norm": 1.2513728141784668, "learning_rate": 9.36017778076542e-06, "loss": 0.4646, "step": 4164 }, { "epoch": 0.1985555264224251, "grad_norm": 1.305857539176941, "learning_rate": 9.356257905680012e-06, "loss": 0.8509, "step": 4165 }, { "epoch": 0.1986031988177246, "grad_norm": 1.4715200662612915, "learning_rate": 9.352338129917384e-06, "loss": 0.5577, "step": 4166 }, { "epoch": 0.1986508712130241, "grad_norm": 3.029110908508301, "learning_rate": 9.348418454082309e-06, "loss": 0.5355, "step": 4167 }, { "epoch": 0.1986985436083236, "grad_norm": 1.5193862915039062, "learning_rate": 9.344498878779557e-06, "loss": 0.3891, "step": 4168 }, { "epoch": 0.1987462160036231, "grad_norm": 8.111202239990234, "learning_rate": 9.34057940461388e-06, "loss": 1.685, "step": 4169 }, { "epoch": 0.1987938883989226, "grad_norm": 2.2644665241241455, "learning_rate": 9.336660032190012e-06, "loss": 0.4843, "step": 4170 }, { "epoch": 0.1988415607942221, "grad_norm": 5.7227983474731445, "learning_rate": 9.332740762112664e-06, "loss": 1.6123, "step": 4171 }, { "epoch": 0.19888923318952162, "grad_norm": 1.6429567337036133, "learning_rate": 9.32882159498655e-06, "loss": 0.2885, "step": 4172 }, { "epoch": 0.1989369055848211, "grad_norm": 2.43394136428833, "learning_rate": 9.324902531416348e-06, "loss": 1.0933, "step": 4173 }, { "epoch": 0.19898457798012062, "grad_norm": 1.2635501623153687, "learning_rate": 9.320983572006734e-06, "loss": 0.6361, "step": 4174 }, { "epoch": 0.1990322503754201, "grad_norm": 5.41809606552124, "learning_rate": 9.317064717362363e-06, "loss": 0.5659, "step": 4175 }, { "epoch": 0.19907992277071962, "grad_norm": 2.5245330333709717, "learning_rate": 9.313145968087876e-06, "loss": 0.7528, "step": 4176 }, { "epoch": 0.1991275951660191, "grad_norm": 1.6297564506530762, "learning_rate": 9.309227324787892e-06, "loss": 0.7049, "step": 4177 }, { "epoch": 0.19917526756131862, "grad_norm": 1.8300691843032837, "learning_rate": 9.305308788067015e-06, "loss": 0.6746, "step": 4178 }, { "epoch": 0.19922293995661813, "grad_norm": 2.0935540199279785, "learning_rate": 9.301390358529842e-06, "loss": 0.6619, "step": 4179 }, { "epoch": 0.19927061235191762, "grad_norm": 1.3441247940063477, "learning_rate": 9.297472036780939e-06, "loss": 0.6582, "step": 4180 }, { "epoch": 0.19931828474721713, "grad_norm": 5.757411479949951, "learning_rate": 9.293553823424865e-06, "loss": 0.8623, "step": 4181 }, { "epoch": 0.19936595714251662, "grad_norm": 1.7407352924346924, "learning_rate": 9.289635719066166e-06, "loss": 0.756, "step": 4182 }, { "epoch": 0.19941362953781613, "grad_norm": 2.226282835006714, "learning_rate": 9.285717724309357e-06, "loss": 0.751, "step": 4183 }, { "epoch": 0.19946130193311562, "grad_norm": 1.9448678493499756, "learning_rate": 9.281799839758949e-06, "loss": 1.0263, "step": 4184 }, { "epoch": 0.19950897432841513, "grad_norm": 2.1662967205047607, "learning_rate": 9.277882066019429e-06, "loss": 0.78, "step": 4185 }, { "epoch": 0.19955664672371462, "grad_norm": 1.1644023656845093, "learning_rate": 9.27396440369527e-06, "loss": 0.7828, "step": 4186 }, { "epoch": 0.19960431911901413, "grad_norm": 0.9203680753707886, "learning_rate": 9.270046853390924e-06, "loss": 0.1869, "step": 4187 }, { "epoch": 0.19965199151431365, "grad_norm": 1.6379756927490234, "learning_rate": 9.266129415710837e-06, "loss": 0.8028, "step": 4188 }, { "epoch": 0.19969966390961313, "grad_norm": 1.378438949584961, "learning_rate": 9.26221209125942e-06, "loss": 1.0276, "step": 4189 }, { "epoch": 0.19974733630491265, "grad_norm": 2.292485475540161, "learning_rate": 9.258294880641078e-06, "loss": 0.9434, "step": 4190 }, { "epoch": 0.19979500870021213, "grad_norm": 1.6815050840377808, "learning_rate": 9.254377784460202e-06, "loss": 1.0238, "step": 4191 }, { "epoch": 0.19984268109551165, "grad_norm": 1.4740095138549805, "learning_rate": 9.250460803321156e-06, "loss": 0.5136, "step": 4192 }, { "epoch": 0.19989035349081113, "grad_norm": 2.781682014465332, "learning_rate": 9.246543937828284e-06, "loss": 1.2299, "step": 4193 }, { "epoch": 0.19993802588611065, "grad_norm": 1.1177964210510254, "learning_rate": 9.242627188585928e-06, "loss": 0.6477, "step": 4194 }, { "epoch": 0.19998569828141016, "grad_norm": 3.1113359928131104, "learning_rate": 9.238710556198395e-06, "loss": 1.2746, "step": 4195 }, { "epoch": 0.20003337067670965, "grad_norm": 2.0119636058807373, "learning_rate": 9.234794041269982e-06, "loss": 0.841, "step": 4196 }, { "epoch": 0.20008104307200916, "grad_norm": 3.1193697452545166, "learning_rate": 9.230877644404974e-06, "loss": 1.4355, "step": 4197 }, { "epoch": 0.20012871546730865, "grad_norm": 1.9395185708999634, "learning_rate": 9.226961366207619e-06, "loss": 0.7507, "step": 4198 }, { "epoch": 0.20017638786260816, "grad_norm": 5.962135314941406, "learning_rate": 9.223045207282167e-06, "loss": 0.5572, "step": 4199 }, { "epoch": 0.20022406025790765, "grad_norm": 1.622602105140686, "learning_rate": 9.21912916823284e-06, "loss": 1.0705, "step": 4200 }, { "epoch": 0.20027173265320716, "grad_norm": 2.316446542739868, "learning_rate": 9.215213249663839e-06, "loss": 0.8494, "step": 4201 }, { "epoch": 0.20031940504850665, "grad_norm": 1.837378978729248, "learning_rate": 9.211297452179348e-06, "loss": 0.849, "step": 4202 }, { "epoch": 0.20036707744380616, "grad_norm": 1.3284859657287598, "learning_rate": 9.207381776383546e-06, "loss": 0.7386, "step": 4203 }, { "epoch": 0.20041474983910568, "grad_norm": 2.2446868419647217, "learning_rate": 9.203466222880567e-06, "loss": 0.6882, "step": 4204 }, { "epoch": 0.20046242223440516, "grad_norm": 1.2276482582092285, "learning_rate": 9.199550792274548e-06, "loss": 0.3691, "step": 4205 }, { "epoch": 0.20051009462970468, "grad_norm": 0.7257246375083923, "learning_rate": 9.195635485169604e-06, "loss": 0.222, "step": 4206 }, { "epoch": 0.20055776702500416, "grad_norm": 3.1366653442382812, "learning_rate": 9.191720302169815e-06, "loss": 0.5044, "step": 4207 }, { "epoch": 0.20060543942030368, "grad_norm": 2.124419927597046, "learning_rate": 9.187805243879263e-06, "loss": 0.8913, "step": 4208 }, { "epoch": 0.20065311181560316, "grad_norm": 1.7044965028762817, "learning_rate": 9.183890310902001e-06, "loss": 0.8052, "step": 4209 }, { "epoch": 0.20070078421090268, "grad_norm": 1.2684754133224487, "learning_rate": 9.179975503842053e-06, "loss": 0.8218, "step": 4210 }, { "epoch": 0.2007484566062022, "grad_norm": 1.7904325723648071, "learning_rate": 9.176060823303442e-06, "loss": 0.7492, "step": 4211 }, { "epoch": 0.20079612900150168, "grad_norm": 1.4300603866577148, "learning_rate": 9.17214626989016e-06, "loss": 0.7673, "step": 4212 }, { "epoch": 0.2008438013968012, "grad_norm": 1.4715248346328735, "learning_rate": 9.168231844206188e-06, "loss": 0.6354, "step": 4213 }, { "epoch": 0.20089147379210068, "grad_norm": 1.2048358917236328, "learning_rate": 9.164317546855475e-06, "loss": 0.4515, "step": 4214 }, { "epoch": 0.2009391461874002, "grad_norm": 1.5780590772628784, "learning_rate": 9.160403378441957e-06, "loss": 0.7401, "step": 4215 }, { "epoch": 0.20098681858269968, "grad_norm": 1.7281519174575806, "learning_rate": 9.156489339569555e-06, "loss": 0.5638, "step": 4216 }, { "epoch": 0.2010344909779992, "grad_norm": 1.282120943069458, "learning_rate": 9.152575430842156e-06, "loss": 0.8457, "step": 4217 }, { "epoch": 0.20108216337329868, "grad_norm": 1.8585940599441528, "learning_rate": 9.148661652863644e-06, "loss": 0.6463, "step": 4218 }, { "epoch": 0.2011298357685982, "grad_norm": 1.7240244150161743, "learning_rate": 9.144748006237873e-06, "loss": 0.6968, "step": 4219 }, { "epoch": 0.2011775081638977, "grad_norm": 2.5056512355804443, "learning_rate": 9.140834491568675e-06, "loss": 1.1388, "step": 4220 }, { "epoch": 0.2012251805591972, "grad_norm": 1.2405264377593994, "learning_rate": 9.136921109459869e-06, "loss": 0.7214, "step": 4221 }, { "epoch": 0.2012728529544967, "grad_norm": 2.4351043701171875, "learning_rate": 9.133007860515248e-06, "loss": 0.9342, "step": 4222 }, { "epoch": 0.2013205253497962, "grad_norm": 7.069313049316406, "learning_rate": 9.129094745338586e-06, "loss": 0.4693, "step": 4223 }, { "epoch": 0.2013681977450957, "grad_norm": 1.1319421529769897, "learning_rate": 9.125181764533632e-06, "loss": 0.8064, "step": 4224 }, { "epoch": 0.2014158701403952, "grad_norm": 1.2453668117523193, "learning_rate": 9.12126891870413e-06, "loss": 0.5035, "step": 4225 }, { "epoch": 0.2014635425356947, "grad_norm": 1.1166303157806396, "learning_rate": 9.11735620845378e-06, "loss": 0.4722, "step": 4226 }, { "epoch": 0.20151121493099422, "grad_norm": 1.8650314807891846, "learning_rate": 9.113443634386277e-06, "loss": 0.9398, "step": 4227 }, { "epoch": 0.2015588873262937, "grad_norm": 1.099266529083252, "learning_rate": 9.109531197105295e-06, "loss": 0.7001, "step": 4228 }, { "epoch": 0.20160655972159322, "grad_norm": 1.1981068849563599, "learning_rate": 9.105618897214475e-06, "loss": 0.7549, "step": 4229 }, { "epoch": 0.2016542321168927, "grad_norm": 2.5754318237304688, "learning_rate": 9.101706735317451e-06, "loss": 0.6851, "step": 4230 }, { "epoch": 0.20170190451219222, "grad_norm": 1.9271581172943115, "learning_rate": 9.09779471201783e-06, "loss": 0.6861, "step": 4231 }, { "epoch": 0.2017495769074917, "grad_norm": 1.414303183555603, "learning_rate": 9.09388282791919e-06, "loss": 0.6399, "step": 4232 }, { "epoch": 0.20179724930279122, "grad_norm": 1.3563759326934814, "learning_rate": 9.089971083625098e-06, "loss": 0.7508, "step": 4233 }, { "epoch": 0.2018449216980907, "grad_norm": 1.411031723022461, "learning_rate": 9.086059479739099e-06, "loss": 0.9758, "step": 4234 }, { "epoch": 0.20189259409339022, "grad_norm": 1.3385288715362549, "learning_rate": 9.08214801686471e-06, "loss": 0.6474, "step": 4235 }, { "epoch": 0.20194026648868973, "grad_norm": 1.3260201215744019, "learning_rate": 9.078236695605426e-06, "loss": 0.7197, "step": 4236 }, { "epoch": 0.20198793888398922, "grad_norm": 1.3642252683639526, "learning_rate": 9.074325516564734e-06, "loss": 0.5181, "step": 4237 }, { "epoch": 0.20203561127928873, "grad_norm": 7.07196044921875, "learning_rate": 9.07041448034608e-06, "loss": 0.4042, "step": 4238 }, { "epoch": 0.20208328367458822, "grad_norm": 0.9440353512763977, "learning_rate": 9.066503587552895e-06, "loss": 0.264, "step": 4239 }, { "epoch": 0.20213095606988774, "grad_norm": 2.08874773979187, "learning_rate": 9.0625928387886e-06, "loss": 0.9922, "step": 4240 }, { "epoch": 0.20217862846518722, "grad_norm": 1.2602145671844482, "learning_rate": 9.05868223465657e-06, "loss": 0.5823, "step": 4241 }, { "epoch": 0.20222630086048674, "grad_norm": 1.8287818431854248, "learning_rate": 9.054771775760179e-06, "loss": 0.6635, "step": 4242 }, { "epoch": 0.20227397325578625, "grad_norm": 4.951713562011719, "learning_rate": 9.050861462702772e-06, "loss": 1.2398, "step": 4243 }, { "epoch": 0.20232164565108574, "grad_norm": 1.2496687173843384, "learning_rate": 9.046951296087664e-06, "loss": 0.6909, "step": 4244 }, { "epoch": 0.20236931804638525, "grad_norm": 1.6422070264816284, "learning_rate": 9.043041276518158e-06, "loss": 0.9024, "step": 4245 }, { "epoch": 0.20241699044168474, "grad_norm": 2.6905791759490967, "learning_rate": 9.039131404597531e-06, "loss": 0.5229, "step": 4246 }, { "epoch": 0.20246466283698425, "grad_norm": 1.3785916566848755, "learning_rate": 9.035221680929028e-06, "loss": 0.7383, "step": 4247 }, { "epoch": 0.20251233523228374, "grad_norm": 4.605517864227295, "learning_rate": 9.031312106115887e-06, "loss": 0.7188, "step": 4248 }, { "epoch": 0.20256000762758325, "grad_norm": 1.3591018915176392, "learning_rate": 9.02740268076131e-06, "loss": 0.6144, "step": 4249 }, { "epoch": 0.20260768002288276, "grad_norm": 2.0683789253234863, "learning_rate": 9.023493405468487e-06, "loss": 0.7007, "step": 4250 }, { "epoch": 0.20265535241818225, "grad_norm": 1.45108163356781, "learning_rate": 9.019584280840572e-06, "loss": 0.7456, "step": 4251 }, { "epoch": 0.20270302481348176, "grad_norm": 1.5825248956680298, "learning_rate": 9.01567530748071e-06, "loss": 0.8712, "step": 4252 }, { "epoch": 0.20275069720878125, "grad_norm": 2.3202617168426514, "learning_rate": 9.011766485992012e-06, "loss": 0.3998, "step": 4253 }, { "epoch": 0.20279836960408076, "grad_norm": 3.4144463539123535, "learning_rate": 9.007857816977565e-06, "loss": 1.3134, "step": 4254 }, { "epoch": 0.20284604199938025, "grad_norm": 2.3063769340515137, "learning_rate": 9.003949301040439e-06, "loss": 0.5287, "step": 4255 }, { "epoch": 0.20289371439467976, "grad_norm": 1.8982830047607422, "learning_rate": 9.000040938783681e-06, "loss": 0.442, "step": 4256 }, { "epoch": 0.20294138678997925, "grad_norm": 1.5901358127593994, "learning_rate": 8.996132730810307e-06, "loss": 0.8607, "step": 4257 }, { "epoch": 0.20298905918527876, "grad_norm": 1.1874439716339111, "learning_rate": 8.992224677723315e-06, "loss": 0.5543, "step": 4258 }, { "epoch": 0.20303673158057828, "grad_norm": 1.2114481925964355, "learning_rate": 8.98831678012568e-06, "loss": 0.4451, "step": 4259 }, { "epoch": 0.20308440397587776, "grad_norm": 1.993187665939331, "learning_rate": 8.984409038620345e-06, "loss": 0.9458, "step": 4260 }, { "epoch": 0.20313207637117728, "grad_norm": 5.413943290710449, "learning_rate": 8.980501453810237e-06, "loss": 0.2271, "step": 4261 }, { "epoch": 0.20317974876647676, "grad_norm": 1.1722928285598755, "learning_rate": 8.976594026298257e-06, "loss": 0.4938, "step": 4262 }, { "epoch": 0.20322742116177628, "grad_norm": 1.9649120569229126, "learning_rate": 8.972686756687278e-06, "loss": 0.6771, "step": 4263 }, { "epoch": 0.20327509355707576, "grad_norm": 1.772072196006775, "learning_rate": 8.968779645580153e-06, "loss": 0.7081, "step": 4264 }, { "epoch": 0.20332276595237528, "grad_norm": 2.7843313217163086, "learning_rate": 8.964872693579711e-06, "loss": 0.7345, "step": 4265 }, { "epoch": 0.2033704383476748, "grad_norm": 1.3795090913772583, "learning_rate": 8.96096590128875e-06, "loss": 0.9303, "step": 4266 }, { "epoch": 0.20341811074297428, "grad_norm": 1.534347653388977, "learning_rate": 8.957059269310054e-06, "loss": 0.8567, "step": 4267 }, { "epoch": 0.2034657831382738, "grad_norm": 1.4403856992721558, "learning_rate": 8.953152798246373e-06, "loss": 0.5977, "step": 4268 }, { "epoch": 0.20351345553357328, "grad_norm": 1.167072057723999, "learning_rate": 8.949246488700431e-06, "loss": 0.7333, "step": 4269 }, { "epoch": 0.2035611279288728, "grad_norm": 1.1575961112976074, "learning_rate": 8.945340341274934e-06, "loss": 0.5692, "step": 4270 }, { "epoch": 0.20360880032417228, "grad_norm": 3.0814316272735596, "learning_rate": 8.941434356572566e-06, "loss": 0.4905, "step": 4271 }, { "epoch": 0.2036564727194718, "grad_norm": 1.531286597251892, "learning_rate": 8.937528535195972e-06, "loss": 0.664, "step": 4272 }, { "epoch": 0.20370414511477128, "grad_norm": 2.40924072265625, "learning_rate": 8.933622877747784e-06, "loss": 0.3397, "step": 4273 }, { "epoch": 0.2037518175100708, "grad_norm": 0.9527204036712646, "learning_rate": 8.929717384830609e-06, "loss": 0.52, "step": 4274 }, { "epoch": 0.2037994899053703, "grad_norm": 1.711545467376709, "learning_rate": 8.925812057047016e-06, "loss": 0.8525, "step": 4275 }, { "epoch": 0.2038471623006698, "grad_norm": 1.5503089427947998, "learning_rate": 8.92190689499956e-06, "loss": 0.293, "step": 4276 }, { "epoch": 0.2038948346959693, "grad_norm": 1.0609380006790161, "learning_rate": 8.918001899290771e-06, "loss": 0.6366, "step": 4277 }, { "epoch": 0.2039425070912688, "grad_norm": 5.815759658813477, "learning_rate": 8.914097070523143e-06, "loss": 0.3597, "step": 4278 }, { "epoch": 0.2039901794865683, "grad_norm": 1.903731346130371, "learning_rate": 8.910192409299154e-06, "loss": 0.5367, "step": 4279 }, { "epoch": 0.2040378518818678, "grad_norm": 1.4493054151535034, "learning_rate": 8.906287916221259e-06, "loss": 0.6947, "step": 4280 }, { "epoch": 0.2040855242771673, "grad_norm": 3.5800254344940186, "learning_rate": 8.90238359189187e-06, "loss": 1.1227, "step": 4281 }, { "epoch": 0.20413319667246682, "grad_norm": 1.032273292541504, "learning_rate": 8.898479436913391e-06, "loss": 0.6177, "step": 4282 }, { "epoch": 0.2041808690677663, "grad_norm": 1.360286831855774, "learning_rate": 8.894575451888194e-06, "loss": 0.7827, "step": 4283 }, { "epoch": 0.20422854146306582, "grad_norm": 2.164604425430298, "learning_rate": 8.890671637418619e-06, "loss": 0.8343, "step": 4284 }, { "epoch": 0.2042762138583653, "grad_norm": 1.9245446920394897, "learning_rate": 8.886767994106984e-06, "loss": 1.1572, "step": 4285 }, { "epoch": 0.20432388625366482, "grad_norm": 1.5803426504135132, "learning_rate": 8.882864522555588e-06, "loss": 0.9503, "step": 4286 }, { "epoch": 0.2043715586489643, "grad_norm": 1.400686264038086, "learning_rate": 8.878961223366687e-06, "loss": 0.8529, "step": 4287 }, { "epoch": 0.20441923104426382, "grad_norm": 4.508146286010742, "learning_rate": 8.875058097142527e-06, "loss": 0.5259, "step": 4288 }, { "epoch": 0.2044669034395633, "grad_norm": 1.9589221477508545, "learning_rate": 8.87115514448532e-06, "loss": 0.8199, "step": 4289 }, { "epoch": 0.20451457583486282, "grad_norm": 2.8784475326538086, "learning_rate": 8.867252365997249e-06, "loss": 0.5785, "step": 4290 }, { "epoch": 0.20456224823016234, "grad_norm": 2.0287749767303467, "learning_rate": 8.86334976228047e-06, "loss": 0.2931, "step": 4291 }, { "epoch": 0.20460992062546182, "grad_norm": 3.9576022624969482, "learning_rate": 8.859447333937117e-06, "loss": 2.1746, "step": 4292 }, { "epoch": 0.20465759302076134, "grad_norm": 3.2679898738861084, "learning_rate": 8.8555450815693e-06, "loss": 0.5383, "step": 4293 }, { "epoch": 0.20470526541606082, "grad_norm": 1.2970616817474365, "learning_rate": 8.851643005779087e-06, "loss": 0.8373, "step": 4294 }, { "epoch": 0.20475293781136034, "grad_norm": 1.9055311679840088, "learning_rate": 8.847741107168532e-06, "loss": 0.9646, "step": 4295 }, { "epoch": 0.20480061020665982, "grad_norm": 1.735101342201233, "learning_rate": 8.843839386339662e-06, "loss": 1.0163, "step": 4296 }, { "epoch": 0.20484828260195934, "grad_norm": 1.5794428586959839, "learning_rate": 8.839937843894466e-06, "loss": 0.7539, "step": 4297 }, { "epoch": 0.20489595499725885, "grad_norm": 1.8066282272338867, "learning_rate": 8.836036480434914e-06, "loss": 0.7838, "step": 4298 }, { "epoch": 0.20494362739255834, "grad_norm": 1.0301337242126465, "learning_rate": 8.832135296562949e-06, "loss": 0.5171, "step": 4299 }, { "epoch": 0.20499129978785785, "grad_norm": 1.7273674011230469, "learning_rate": 8.828234292880479e-06, "loss": 0.7805, "step": 4300 }, { "epoch": 0.20503897218315734, "grad_norm": 1.8719673156738281, "learning_rate": 8.824333469989388e-06, "loss": 0.4028, "step": 4301 }, { "epoch": 0.20508664457845685, "grad_norm": 1.3849194049835205, "learning_rate": 8.820432828491542e-06, "loss": 0.7234, "step": 4302 }, { "epoch": 0.20513431697375634, "grad_norm": 1.609876036643982, "learning_rate": 8.816532368988758e-06, "loss": 0.52, "step": 4303 }, { "epoch": 0.20518198936905585, "grad_norm": 1.4437907934188843, "learning_rate": 8.812632092082846e-06, "loss": 0.8405, "step": 4304 }, { "epoch": 0.20522966176435534, "grad_norm": 1.6973559856414795, "learning_rate": 8.808731998375572e-06, "loss": 0.6245, "step": 4305 }, { "epoch": 0.20527733415965485, "grad_norm": 1.6523698568344116, "learning_rate": 8.804832088468685e-06, "loss": 0.7299, "step": 4306 }, { "epoch": 0.20532500655495436, "grad_norm": 1.1694504022598267, "learning_rate": 8.800932362963896e-06, "loss": 0.7143, "step": 4307 }, { "epoch": 0.20537267895025385, "grad_norm": 1.2345824241638184, "learning_rate": 8.7970328224629e-06, "loss": 0.6415, "step": 4308 }, { "epoch": 0.20542035134555336, "grad_norm": 1.5928858518600464, "learning_rate": 8.793133467567346e-06, "loss": 0.978, "step": 4309 }, { "epoch": 0.20546802374085285, "grad_norm": 2.2975971698760986, "learning_rate": 8.78923429887887e-06, "loss": 0.9865, "step": 4310 }, { "epoch": 0.20551569613615236, "grad_norm": 0.9889629483222961, "learning_rate": 8.785335316999078e-06, "loss": 0.6768, "step": 4311 }, { "epoch": 0.20556336853145185, "grad_norm": 1.3772175312042236, "learning_rate": 8.781436522529537e-06, "loss": 0.6762, "step": 4312 }, { "epoch": 0.20561104092675137, "grad_norm": 2.596837282180786, "learning_rate": 8.777537916071787e-06, "loss": 0.4926, "step": 4313 }, { "epoch": 0.20565871332205088, "grad_norm": 1.4268628358840942, "learning_rate": 8.773639498227355e-06, "loss": 0.9057, "step": 4314 }, { "epoch": 0.20570638571735037, "grad_norm": 1.710360050201416, "learning_rate": 8.769741269597713e-06, "loss": 0.6695, "step": 4315 }, { "epoch": 0.20575405811264988, "grad_norm": 1.301283359527588, "learning_rate": 8.765843230784324e-06, "loss": 0.6356, "step": 4316 }, { "epoch": 0.20580173050794937, "grad_norm": 1.2675840854644775, "learning_rate": 8.761945382388619e-06, "loss": 0.5589, "step": 4317 }, { "epoch": 0.20584940290324888, "grad_norm": 2.300196647644043, "learning_rate": 8.758047725011988e-06, "loss": 0.7804, "step": 4318 }, { "epoch": 0.20589707529854837, "grad_norm": 1.272340178489685, "learning_rate": 8.754150259255807e-06, "loss": 0.6072, "step": 4319 }, { "epoch": 0.20594474769384788, "grad_norm": 1.5963306427001953, "learning_rate": 8.75025298572141e-06, "loss": 0.7109, "step": 4320 }, { "epoch": 0.20599242008914737, "grad_norm": 0.9043671488761902, "learning_rate": 8.746355905010108e-06, "loss": 0.4623, "step": 4321 }, { "epoch": 0.20604009248444688, "grad_norm": 1.5757149457931519, "learning_rate": 8.742459017723176e-06, "loss": 0.6174, "step": 4322 }, { "epoch": 0.2060877648797464, "grad_norm": 1.5294100046157837, "learning_rate": 8.738562324461873e-06, "loss": 0.921, "step": 4323 }, { "epoch": 0.20613543727504588, "grad_norm": 1.3959301710128784, "learning_rate": 8.734665825827408e-06, "loss": 0.5261, "step": 4324 }, { "epoch": 0.2061831096703454, "grad_norm": 1.380132794380188, "learning_rate": 8.730769522420978e-06, "loss": 0.7874, "step": 4325 }, { "epoch": 0.20623078206564488, "grad_norm": 2.447565793991089, "learning_rate": 8.72687341484374e-06, "loss": 1.0617, "step": 4326 }, { "epoch": 0.2062784544609444, "grad_norm": 1.2747080326080322, "learning_rate": 8.722977503696824e-06, "loss": 0.7465, "step": 4327 }, { "epoch": 0.20632612685624388, "grad_norm": 1.7254345417022705, "learning_rate": 8.719081789581329e-06, "loss": 0.8336, "step": 4328 }, { "epoch": 0.2063737992515434, "grad_norm": 2.696746587753296, "learning_rate": 8.715186273098319e-06, "loss": 0.8615, "step": 4329 }, { "epoch": 0.2064214716468429, "grad_norm": 2.128573417663574, "learning_rate": 8.711290954848842e-06, "loss": 1.1476, "step": 4330 }, { "epoch": 0.2064691440421424, "grad_norm": 2.850130319595337, "learning_rate": 8.707395835433895e-06, "loss": 0.6364, "step": 4331 }, { "epoch": 0.2065168164374419, "grad_norm": 1.7038079500198364, "learning_rate": 8.703500915454458e-06, "loss": 0.485, "step": 4332 }, { "epoch": 0.2065644888327414, "grad_norm": 2.242863655090332, "learning_rate": 8.699606195511484e-06, "loss": 0.7257, "step": 4333 }, { "epoch": 0.2066121612280409, "grad_norm": 0.9096806049346924, "learning_rate": 8.69571167620588e-06, "loss": 0.3038, "step": 4334 }, { "epoch": 0.2066598336233404, "grad_norm": 0.9689982533454895, "learning_rate": 8.691817358138532e-06, "loss": 0.507, "step": 4335 }, { "epoch": 0.2067075060186399, "grad_norm": 1.2572424411773682, "learning_rate": 8.687923241910297e-06, "loss": 0.5053, "step": 4336 }, { "epoch": 0.20675517841393942, "grad_norm": 1.796380877494812, "learning_rate": 8.68402932812199e-06, "loss": 0.7343, "step": 4337 }, { "epoch": 0.2068028508092389, "grad_norm": 2.0053296089172363, "learning_rate": 8.680135617374406e-06, "loss": 1.2507, "step": 4338 }, { "epoch": 0.20685052320453842, "grad_norm": 2.7692112922668457, "learning_rate": 8.676242110268308e-06, "loss": 0.8011, "step": 4339 }, { "epoch": 0.2068981955998379, "grad_norm": 1.0360571146011353, "learning_rate": 8.672348807404416e-06, "loss": 0.3467, "step": 4340 }, { "epoch": 0.20694586799513742, "grad_norm": 2.7257163524627686, "learning_rate": 8.668455709383433e-06, "loss": 0.4019, "step": 4341 }, { "epoch": 0.2069935403904369, "grad_norm": 2.3275458812713623, "learning_rate": 8.664562816806022e-06, "loss": 0.7583, "step": 4342 }, { "epoch": 0.20704121278573642, "grad_norm": 3.416110038757324, "learning_rate": 8.660670130272816e-06, "loss": 0.8467, "step": 4343 }, { "epoch": 0.2070888851810359, "grad_norm": 8.440009117126465, "learning_rate": 8.656777650384415e-06, "loss": 1.1757, "step": 4344 }, { "epoch": 0.20713655757633542, "grad_norm": 1.699858546257019, "learning_rate": 8.652885377741394e-06, "loss": 0.8668, "step": 4345 }, { "epoch": 0.20718422997163494, "grad_norm": 2.423576593399048, "learning_rate": 8.648993312944282e-06, "loss": 0.592, "step": 4346 }, { "epoch": 0.20723190236693442, "grad_norm": 1.418212652206421, "learning_rate": 8.645101456593589e-06, "loss": 0.8543, "step": 4347 }, { "epoch": 0.20727957476223394, "grad_norm": 3.9124245643615723, "learning_rate": 8.641209809289792e-06, "loss": 0.7545, "step": 4348 }, { "epoch": 0.20732724715753342, "grad_norm": 1.1630877256393433, "learning_rate": 8.637318371633326e-06, "loss": 0.7485, "step": 4349 }, { "epoch": 0.20737491955283294, "grad_norm": 1.4917844533920288, "learning_rate": 8.633427144224603e-06, "loss": 0.7835, "step": 4350 }, { "epoch": 0.20742259194813242, "grad_norm": 2.170482873916626, "learning_rate": 8.629536127664002e-06, "loss": 0.9353, "step": 4351 }, { "epoch": 0.20747026434343194, "grad_norm": 1.3693220615386963, "learning_rate": 8.625645322551858e-06, "loss": 0.8203, "step": 4352 }, { "epoch": 0.20751793673873145, "grad_norm": 1.2693750858306885, "learning_rate": 8.621754729488488e-06, "loss": 0.4118, "step": 4353 }, { "epoch": 0.20756560913403094, "grad_norm": 2.084134817123413, "learning_rate": 8.617864349074176e-06, "loss": 0.9773, "step": 4354 }, { "epoch": 0.20761328152933045, "grad_norm": 1.4149044752120972, "learning_rate": 8.613974181909155e-06, "loss": 0.6085, "step": 4355 }, { "epoch": 0.20766095392462994, "grad_norm": 2.060384750366211, "learning_rate": 8.610084228593649e-06, "loss": 0.6682, "step": 4356 }, { "epoch": 0.20770862631992945, "grad_norm": 1.4337724447250366, "learning_rate": 8.60619448972783e-06, "loss": 0.9382, "step": 4357 }, { "epoch": 0.20775629871522894, "grad_norm": 1.6676647663116455, "learning_rate": 8.602304965911851e-06, "loss": 0.7652, "step": 4358 }, { "epoch": 0.20780397111052845, "grad_norm": 1.8317842483520508, "learning_rate": 8.598415657745819e-06, "loss": 0.6913, "step": 4359 }, { "epoch": 0.20785164350582794, "grad_norm": 1.6914176940917969, "learning_rate": 8.59452656582982e-06, "loss": 0.5978, "step": 4360 }, { "epoch": 0.20789931590112745, "grad_norm": 2.3064661026000977, "learning_rate": 8.590637690763896e-06, "loss": 0.6462, "step": 4361 }, { "epoch": 0.20794698829642697, "grad_norm": 1.849098801612854, "learning_rate": 8.586749033148063e-06, "loss": 0.9279, "step": 4362 }, { "epoch": 0.20799466069172645, "grad_norm": 1.5790804624557495, "learning_rate": 8.582860593582301e-06, "loss": 0.5849, "step": 4363 }, { "epoch": 0.20804233308702597, "grad_norm": 1.6418288946151733, "learning_rate": 8.578972372666557e-06, "loss": 1.0811, "step": 4364 }, { "epoch": 0.20809000548232545, "grad_norm": 1.935500144958496, "learning_rate": 8.57508437100074e-06, "loss": 0.822, "step": 4365 }, { "epoch": 0.20813767787762497, "grad_norm": 1.7451543807983398, "learning_rate": 8.571196589184732e-06, "loss": 0.8533, "step": 4366 }, { "epoch": 0.20818535027292445, "grad_norm": 1.4592797756195068, "learning_rate": 8.56730902781838e-06, "loss": 0.8098, "step": 4367 }, { "epoch": 0.20823302266822397, "grad_norm": 2.1974895000457764, "learning_rate": 8.563421687501485e-06, "loss": 1.0553, "step": 4368 }, { "epoch": 0.20828069506352348, "grad_norm": 1.570441722869873, "learning_rate": 8.559534568833832e-06, "loss": 0.2305, "step": 4369 }, { "epoch": 0.20832836745882297, "grad_norm": 1.4212414026260376, "learning_rate": 8.555647672415162e-06, "loss": 0.6988, "step": 4370 }, { "epoch": 0.20837603985412248, "grad_norm": 1.3336912393569946, "learning_rate": 8.55176099884518e-06, "loss": 0.6157, "step": 4371 }, { "epoch": 0.20842371224942197, "grad_norm": 1.3625378608703613, "learning_rate": 8.547874548723565e-06, "loss": 0.7407, "step": 4372 }, { "epoch": 0.20847138464472148, "grad_norm": 1.923500657081604, "learning_rate": 8.543988322649954e-06, "loss": 0.4212, "step": 4373 }, { "epoch": 0.20851905704002097, "grad_norm": 1.3513028621673584, "learning_rate": 8.540102321223947e-06, "loss": 0.6865, "step": 4374 }, { "epoch": 0.20856672943532048, "grad_norm": 3.3731484413146973, "learning_rate": 8.536216545045117e-06, "loss": 1.0769, "step": 4375 }, { "epoch": 0.20861440183061997, "grad_norm": 2.838672161102295, "learning_rate": 8.532330994713006e-06, "loss": 0.7988, "step": 4376 }, { "epoch": 0.20866207422591948, "grad_norm": 1.5584261417388916, "learning_rate": 8.528445670827103e-06, "loss": 0.5976, "step": 4377 }, { "epoch": 0.208709746621219, "grad_norm": 1.4673742055892944, "learning_rate": 8.52456057398688e-06, "loss": 0.886, "step": 4378 }, { "epoch": 0.20875741901651848, "grad_norm": 1.901602864265442, "learning_rate": 8.52067570479177e-06, "loss": 1.0483, "step": 4379 }, { "epoch": 0.208805091411818, "grad_norm": 1.9782612323760986, "learning_rate": 8.516791063841161e-06, "loss": 0.7423, "step": 4380 }, { "epoch": 0.20885276380711748, "grad_norm": 1.1063727140426636, "learning_rate": 8.512906651734416e-06, "loss": 0.6636, "step": 4381 }, { "epoch": 0.208900436202417, "grad_norm": 3.5044310092926025, "learning_rate": 8.509022469070864e-06, "loss": 1.1454, "step": 4382 }, { "epoch": 0.20894810859771648, "grad_norm": 1.1676899194717407, "learning_rate": 8.505138516449786e-06, "loss": 0.7145, "step": 4383 }, { "epoch": 0.208995780993016, "grad_norm": 3.7188303470611572, "learning_rate": 8.501254794470443e-06, "loss": 1.1832, "step": 4384 }, { "epoch": 0.2090434533883155, "grad_norm": 4.093562602996826, "learning_rate": 8.497371303732054e-06, "loss": 0.9244, "step": 4385 }, { "epoch": 0.209091125783615, "grad_norm": 1.4679384231567383, "learning_rate": 8.493488044833796e-06, "loss": 0.7792, "step": 4386 }, { "epoch": 0.2091387981789145, "grad_norm": 1.2527292966842651, "learning_rate": 8.48960501837482e-06, "loss": 0.7554, "step": 4387 }, { "epoch": 0.209186470574214, "grad_norm": 1.5578149557113647, "learning_rate": 8.485722224954237e-06, "loss": 0.7614, "step": 4388 }, { "epoch": 0.2092341429695135, "grad_norm": 4.238508701324463, "learning_rate": 8.481839665171117e-06, "loss": 1.0255, "step": 4389 }, { "epoch": 0.209281815364813, "grad_norm": 3.228020429611206, "learning_rate": 8.477957339624502e-06, "loss": 0.4588, "step": 4390 }, { "epoch": 0.2093294877601125, "grad_norm": 4.965100288391113, "learning_rate": 8.4740752489134e-06, "loss": 0.8342, "step": 4391 }, { "epoch": 0.209377160155412, "grad_norm": 1.7317931652069092, "learning_rate": 8.47019339363677e-06, "loss": 0.7513, "step": 4392 }, { "epoch": 0.2094248325507115, "grad_norm": 3.7609570026397705, "learning_rate": 8.466311774393544e-06, "loss": 1.0602, "step": 4393 }, { "epoch": 0.20947250494601102, "grad_norm": 1.1125895977020264, "learning_rate": 8.462430391782622e-06, "loss": 0.6997, "step": 4394 }, { "epoch": 0.2095201773413105, "grad_norm": 1.281566858291626, "learning_rate": 8.458549246402854e-06, "loss": 0.6801, "step": 4395 }, { "epoch": 0.20956784973661002, "grad_norm": 4.123861312866211, "learning_rate": 8.454668338853062e-06, "loss": 1.2021, "step": 4396 }, { "epoch": 0.2096155221319095, "grad_norm": 1.59635591506958, "learning_rate": 8.450787669732036e-06, "loss": 1.0536, "step": 4397 }, { "epoch": 0.20966319452720902, "grad_norm": 2.3589484691619873, "learning_rate": 8.446907239638514e-06, "loss": 0.4969, "step": 4398 }, { "epoch": 0.2097108669225085, "grad_norm": 2.146109104156494, "learning_rate": 8.44302704917121e-06, "loss": 1.0645, "step": 4399 }, { "epoch": 0.20975853931780802, "grad_norm": 1.9817562103271484, "learning_rate": 8.439147098928805e-06, "loss": 0.7119, "step": 4400 }, { "epoch": 0.20980621171310754, "grad_norm": 1.4278912544250488, "learning_rate": 8.435267389509924e-06, "loss": 0.3774, "step": 4401 }, { "epoch": 0.20985388410840702, "grad_norm": 1.1749287843704224, "learning_rate": 8.431387921513172e-06, "loss": 0.7329, "step": 4402 }, { "epoch": 0.20990155650370654, "grad_norm": 3.4919614791870117, "learning_rate": 8.42750869553711e-06, "loss": 0.3703, "step": 4403 }, { "epoch": 0.20994922889900602, "grad_norm": 3.8118767738342285, "learning_rate": 8.423629712180265e-06, "loss": 0.6749, "step": 4404 }, { "epoch": 0.20999690129430554, "grad_norm": 1.9759467840194702, "learning_rate": 8.419750972041119e-06, "loss": 0.6974, "step": 4405 }, { "epoch": 0.21004457368960502, "grad_norm": 2.3534319400787354, "learning_rate": 8.415872475718125e-06, "loss": 0.6538, "step": 4406 }, { "epoch": 0.21009224608490454, "grad_norm": 1.4680769443511963, "learning_rate": 8.411994223809698e-06, "loss": 0.9282, "step": 4407 }, { "epoch": 0.21013991848020402, "grad_norm": 1.6939637660980225, "learning_rate": 8.408116216914205e-06, "loss": 0.8115, "step": 4408 }, { "epoch": 0.21018759087550354, "grad_norm": 2.782266139984131, "learning_rate": 8.404238455629989e-06, "loss": 0.4806, "step": 4409 }, { "epoch": 0.21023526327080305, "grad_norm": 1.3923627138137817, "learning_rate": 8.400360940555348e-06, "loss": 0.5887, "step": 4410 }, { "epoch": 0.21028293566610254, "grad_norm": 1.1818264722824097, "learning_rate": 8.396483672288536e-06, "loss": 0.8378, "step": 4411 }, { "epoch": 0.21033060806140205, "grad_norm": 1.5832149982452393, "learning_rate": 8.392606651427781e-06, "loss": 0.6567, "step": 4412 }, { "epoch": 0.21037828045670154, "grad_norm": 2.349377393722534, "learning_rate": 8.38872987857127e-06, "loss": 0.7362, "step": 4413 }, { "epoch": 0.21042595285200105, "grad_norm": 1.9588326215744019, "learning_rate": 8.384853354317141e-06, "loss": 0.9806, "step": 4414 }, { "epoch": 0.21047362524730054, "grad_norm": 1.6584535837173462, "learning_rate": 8.380977079263509e-06, "loss": 0.341, "step": 4415 }, { "epoch": 0.21052129764260005, "grad_norm": 1.1295039653778076, "learning_rate": 8.377101054008445e-06, "loss": 0.5538, "step": 4416 }, { "epoch": 0.21056897003789957, "grad_norm": 1.7521729469299316, "learning_rate": 8.373225279149972e-06, "loss": 0.9521, "step": 4417 }, { "epoch": 0.21061664243319905, "grad_norm": 1.7571831941604614, "learning_rate": 8.369349755286084e-06, "loss": 0.9567, "step": 4418 }, { "epoch": 0.21066431482849857, "grad_norm": 1.487601399421692, "learning_rate": 8.365474483014741e-06, "loss": 0.737, "step": 4419 }, { "epoch": 0.21071198722379805, "grad_norm": 1.9085054397583008, "learning_rate": 8.36159946293385e-06, "loss": 0.8935, "step": 4420 }, { "epoch": 0.21075965961909757, "grad_norm": 1.3143061399459839, "learning_rate": 8.357724695641287e-06, "loss": 0.5924, "step": 4421 }, { "epoch": 0.21080733201439705, "grad_norm": 1.5849010944366455, "learning_rate": 8.353850181734898e-06, "loss": 0.6918, "step": 4422 }, { "epoch": 0.21085500440969657, "grad_norm": 1.8508244752883911, "learning_rate": 8.349975921812468e-06, "loss": 0.931, "step": 4423 }, { "epoch": 0.21090267680499605, "grad_norm": 2.0183169841766357, "learning_rate": 8.346101916471764e-06, "loss": 0.6971, "step": 4424 }, { "epoch": 0.21095034920029557, "grad_norm": 1.5702900886535645, "learning_rate": 8.342228166310502e-06, "loss": 0.8997, "step": 4425 }, { "epoch": 0.21099802159559508, "grad_norm": 1.6522756814956665, "learning_rate": 8.338354671926364e-06, "loss": 0.9203, "step": 4426 }, { "epoch": 0.21104569399089457, "grad_norm": 1.641060709953308, "learning_rate": 8.334481433916984e-06, "loss": 0.6685, "step": 4427 }, { "epoch": 0.21109336638619408, "grad_norm": 1.5447933673858643, "learning_rate": 8.330608452879972e-06, "loss": 0.6448, "step": 4428 }, { "epoch": 0.21114103878149357, "grad_norm": 1.6789227724075317, "learning_rate": 8.32673572941288e-06, "loss": 0.9129, "step": 4429 }, { "epoch": 0.21118871117679308, "grad_norm": 1.832602620124817, "learning_rate": 8.322863264113235e-06, "loss": 0.7104, "step": 4430 }, { "epoch": 0.21123638357209257, "grad_norm": 1.8039897680282593, "learning_rate": 8.31899105757852e-06, "loss": 0.6585, "step": 4431 }, { "epoch": 0.21128405596739208, "grad_norm": 1.4945287704467773, "learning_rate": 8.315119110406172e-06, "loss": 0.3517, "step": 4432 }, { "epoch": 0.2113317283626916, "grad_norm": 2.980860471725464, "learning_rate": 8.311247423193594e-06, "loss": 1.153, "step": 4433 }, { "epoch": 0.21137940075799108, "grad_norm": 1.6675480604171753, "learning_rate": 8.30737599653815e-06, "loss": 0.6492, "step": 4434 }, { "epoch": 0.2114270731532906, "grad_norm": 2.0930182933807373, "learning_rate": 8.303504831037154e-06, "loss": 0.7994, "step": 4435 }, { "epoch": 0.21147474554859008, "grad_norm": 1.751406192779541, "learning_rate": 8.299633927287894e-06, "loss": 0.593, "step": 4436 }, { "epoch": 0.2115224179438896, "grad_norm": 1.8691309690475464, "learning_rate": 8.295763285887613e-06, "loss": 1.0948, "step": 4437 }, { "epoch": 0.21157009033918908, "grad_norm": 0.8946505188941956, "learning_rate": 8.2918929074335e-06, "loss": 0.4341, "step": 4438 }, { "epoch": 0.2116177627344886, "grad_norm": 2.1912481784820557, "learning_rate": 8.288022792522726e-06, "loss": 0.49, "step": 4439 }, { "epoch": 0.2116654351297881, "grad_norm": 1.384350061416626, "learning_rate": 8.284152941752403e-06, "loss": 0.7269, "step": 4440 }, { "epoch": 0.2117131075250876, "grad_norm": 1.8056871891021729, "learning_rate": 8.280283355719614e-06, "loss": 0.859, "step": 4441 }, { "epoch": 0.2117607799203871, "grad_norm": 2.656151533126831, "learning_rate": 8.276414035021391e-06, "loss": 1.1618, "step": 4442 }, { "epoch": 0.2118084523156866, "grad_norm": 4.167341232299805, "learning_rate": 8.272544980254731e-06, "loss": 0.3233, "step": 4443 }, { "epoch": 0.2118561247109861, "grad_norm": 2.2939465045928955, "learning_rate": 8.268676192016598e-06, "loss": 0.902, "step": 4444 }, { "epoch": 0.2119037971062856, "grad_norm": 1.4443720579147339, "learning_rate": 8.264807670903891e-06, "loss": 0.9178, "step": 4445 }, { "epoch": 0.2119514695015851, "grad_norm": 1.7518606185913086, "learning_rate": 8.260939417513498e-06, "loss": 0.8333, "step": 4446 }, { "epoch": 0.2119991418968846, "grad_norm": 1.8861931562423706, "learning_rate": 8.25707143244224e-06, "loss": 0.8673, "step": 4447 }, { "epoch": 0.2120468142921841, "grad_norm": 1.9442663192749023, "learning_rate": 8.253203716286914e-06, "loss": 0.7208, "step": 4448 }, { "epoch": 0.21209448668748362, "grad_norm": 1.7955470085144043, "learning_rate": 8.249336269644264e-06, "loss": 0.61, "step": 4449 }, { "epoch": 0.2121421590827831, "grad_norm": 1.4134103059768677, "learning_rate": 8.245469093111002e-06, "loss": 0.9365, "step": 4450 }, { "epoch": 0.21218983147808262, "grad_norm": 3.5805282592773438, "learning_rate": 8.241602187283789e-06, "loss": 0.5981, "step": 4451 }, { "epoch": 0.2122375038733821, "grad_norm": 1.5789103507995605, "learning_rate": 8.237735552759247e-06, "loss": 0.594, "step": 4452 }, { "epoch": 0.21228517626868162, "grad_norm": 2.8894712924957275, "learning_rate": 8.233869190133968e-06, "loss": 0.5236, "step": 4453 }, { "epoch": 0.2123328486639811, "grad_norm": 58.963985443115234, "learning_rate": 8.230003100004481e-06, "loss": 0.9719, "step": 4454 }, { "epoch": 0.21238052105928062, "grad_norm": 1.3286560773849487, "learning_rate": 8.226137282967289e-06, "loss": 0.9582, "step": 4455 }, { "epoch": 0.21242819345458014, "grad_norm": 4.558205604553223, "learning_rate": 8.222271739618851e-06, "loss": 0.4667, "step": 4456 }, { "epoch": 0.21247586584987962, "grad_norm": 2.747762441635132, "learning_rate": 8.218406470555571e-06, "loss": 0.8036, "step": 4457 }, { "epoch": 0.21252353824517914, "grad_norm": 1.3894038200378418, "learning_rate": 8.214541476373824e-06, "loss": 0.5518, "step": 4458 }, { "epoch": 0.21257121064047863, "grad_norm": 0.9409295916557312, "learning_rate": 8.210676757669948e-06, "loss": 0.499, "step": 4459 }, { "epoch": 0.21261888303577814, "grad_norm": 1.3855196237564087, "learning_rate": 8.206812315040215e-06, "loss": 0.5873, "step": 4460 }, { "epoch": 0.21266655543107763, "grad_norm": 2.102444648742676, "learning_rate": 8.20294814908088e-06, "loss": 0.8072, "step": 4461 }, { "epoch": 0.21271422782637714, "grad_norm": 1.9173320531845093, "learning_rate": 8.199084260388139e-06, "loss": 0.7175, "step": 4462 }, { "epoch": 0.21276190022167663, "grad_norm": 1.828513264656067, "learning_rate": 8.19522064955815e-06, "loss": 0.8789, "step": 4463 }, { "epoch": 0.21280957261697614, "grad_norm": 2.1269195079803467, "learning_rate": 8.191357317187028e-06, "loss": 0.2446, "step": 4464 }, { "epoch": 0.21285724501227565, "grad_norm": 2.814713954925537, "learning_rate": 8.18749426387085e-06, "loss": 1.0531, "step": 4465 }, { "epoch": 0.21290491740757514, "grad_norm": 2.0500357151031494, "learning_rate": 8.183631490205636e-06, "loss": 1.0665, "step": 4466 }, { "epoch": 0.21295258980287465, "grad_norm": 1.9695978164672852, "learning_rate": 8.179768996787381e-06, "loss": 1.0367, "step": 4467 }, { "epoch": 0.21300026219817414, "grad_norm": 1.298176884651184, "learning_rate": 8.175906784212028e-06, "loss": 0.759, "step": 4468 }, { "epoch": 0.21304793459347365, "grad_norm": 1.1539537906646729, "learning_rate": 8.17204485307547e-06, "loss": 0.313, "step": 4469 }, { "epoch": 0.21309560698877314, "grad_norm": 1.644991397857666, "learning_rate": 8.168183203973568e-06, "loss": 1.0725, "step": 4470 }, { "epoch": 0.21314327938407265, "grad_norm": 1.9063230752944946, "learning_rate": 8.164321837502136e-06, "loss": 0.8613, "step": 4471 }, { "epoch": 0.21319095177937217, "grad_norm": 2.056304693222046, "learning_rate": 8.160460754256937e-06, "loss": 0.7527, "step": 4472 }, { "epoch": 0.21323862417467165, "grad_norm": 2.9836580753326416, "learning_rate": 8.156599954833699e-06, "loss": 0.5532, "step": 4473 }, { "epoch": 0.21328629656997117, "grad_norm": 1.5842543840408325, "learning_rate": 8.15273943982811e-06, "loss": 1.1751, "step": 4474 }, { "epoch": 0.21333396896527065, "grad_norm": 1.684693694114685, "learning_rate": 8.148879209835797e-06, "loss": 0.9538, "step": 4475 }, { "epoch": 0.21338164136057017, "grad_norm": 1.8569011688232422, "learning_rate": 8.145019265452361e-06, "loss": 0.7172, "step": 4476 }, { "epoch": 0.21342931375586965, "grad_norm": 3.4422695636749268, "learning_rate": 8.141159607273352e-06, "loss": 1.0018, "step": 4477 }, { "epoch": 0.21347698615116917, "grad_norm": 1.4015235900878906, "learning_rate": 8.13730023589427e-06, "loss": 0.6793, "step": 4478 }, { "epoch": 0.21352465854646865, "grad_norm": 1.6025656461715698, "learning_rate": 8.13344115191058e-06, "loss": 0.5903, "step": 4479 }, { "epoch": 0.21357233094176817, "grad_norm": 2.8859734535217285, "learning_rate": 8.129582355917698e-06, "loss": 0.9701, "step": 4480 }, { "epoch": 0.21362000333706768, "grad_norm": 1.8074482679367065, "learning_rate": 8.125723848511e-06, "loss": 0.8863, "step": 4481 }, { "epoch": 0.21366767573236717, "grad_norm": 2.0635104179382324, "learning_rate": 8.121865630285809e-06, "loss": 1.0725, "step": 4482 }, { "epoch": 0.21371534812766668, "grad_norm": 1.8617684841156006, "learning_rate": 8.118007701837409e-06, "loss": 0.9979, "step": 4483 }, { "epoch": 0.21376302052296617, "grad_norm": 1.0794591903686523, "learning_rate": 8.114150063761041e-06, "loss": 0.5675, "step": 4484 }, { "epoch": 0.21381069291826568, "grad_norm": 6.139678001403809, "learning_rate": 8.110292716651899e-06, "loss": 1.7222, "step": 4485 }, { "epoch": 0.21385836531356517, "grad_norm": 1.015209436416626, "learning_rate": 8.106435661105127e-06, "loss": 0.5727, "step": 4486 }, { "epoch": 0.21390603770886468, "grad_norm": 1.6977912187576294, "learning_rate": 8.102578897715839e-06, "loss": 0.5604, "step": 4487 }, { "epoch": 0.2139537101041642, "grad_norm": 1.292377233505249, "learning_rate": 8.098722427079082e-06, "loss": 0.4758, "step": 4488 }, { "epoch": 0.21400138249946368, "grad_norm": 1.2050788402557373, "learning_rate": 8.094866249789874e-06, "loss": 0.8011, "step": 4489 }, { "epoch": 0.2140490548947632, "grad_norm": 1.5704624652862549, "learning_rate": 8.091010366443189e-06, "loss": 0.3975, "step": 4490 }, { "epoch": 0.21409672729006268, "grad_norm": 1.0296008586883545, "learning_rate": 8.087154777633942e-06, "loss": 0.577, "step": 4491 }, { "epoch": 0.2141443996853622, "grad_norm": 2.6979355812072754, "learning_rate": 8.083299483957016e-06, "loss": 0.6808, "step": 4492 }, { "epoch": 0.21419207208066168, "grad_norm": 1.462937831878662, "learning_rate": 8.079444486007244e-06, "loss": 0.7345, "step": 4493 }, { "epoch": 0.2142397444759612, "grad_norm": 1.5809084177017212, "learning_rate": 8.075589784379407e-06, "loss": 0.8325, "step": 4494 }, { "epoch": 0.21428741687126068, "grad_norm": 1.2995939254760742, "learning_rate": 8.071735379668246e-06, "loss": 0.7495, "step": 4495 }, { "epoch": 0.2143350892665602, "grad_norm": 1.2686469554901123, "learning_rate": 8.067881272468465e-06, "loss": 0.9076, "step": 4496 }, { "epoch": 0.2143827616618597, "grad_norm": 1.4767464399337769, "learning_rate": 8.064027463374702e-06, "loss": 0.6394, "step": 4497 }, { "epoch": 0.2144304340571592, "grad_norm": 2.442845582962036, "learning_rate": 8.060173952981565e-06, "loss": 0.8026, "step": 4498 }, { "epoch": 0.2144781064524587, "grad_norm": 1.4615638256072998, "learning_rate": 8.056320741883613e-06, "loss": 0.3246, "step": 4499 }, { "epoch": 0.2145257788477582, "grad_norm": 2.036346197128296, "learning_rate": 8.052467830675353e-06, "loss": 0.4039, "step": 4500 }, { "epoch": 0.2145734512430577, "grad_norm": 2.6884419918060303, "learning_rate": 8.04861521995125e-06, "loss": 0.7426, "step": 4501 }, { "epoch": 0.2146211236383572, "grad_norm": 2.032949924468994, "learning_rate": 8.044762910305726e-06, "loss": 0.4398, "step": 4502 }, { "epoch": 0.2146687960336567, "grad_norm": 1.3604116439819336, "learning_rate": 8.040910902333149e-06, "loss": 0.3812, "step": 4503 }, { "epoch": 0.21471646842895623, "grad_norm": 2.1751201152801514, "learning_rate": 8.03705919662784e-06, "loss": 0.5535, "step": 4504 }, { "epoch": 0.2147641408242557, "grad_norm": 3.663022518157959, "learning_rate": 8.033207793784091e-06, "loss": 1.1629, "step": 4505 }, { "epoch": 0.21481181321955523, "grad_norm": 1.3970552682876587, "learning_rate": 8.02935669439612e-06, "loss": 0.2853, "step": 4506 }, { "epoch": 0.2148594856148547, "grad_norm": 5.811208724975586, "learning_rate": 8.025505899058119e-06, "loss": 0.8217, "step": 4507 }, { "epoch": 0.21490715801015423, "grad_norm": 1.6250298023223877, "learning_rate": 8.021655408364227e-06, "loss": 0.8706, "step": 4508 }, { "epoch": 0.2149548304054537, "grad_norm": 1.4636025428771973, "learning_rate": 8.017805222908528e-06, "loss": 0.7862, "step": 4509 }, { "epoch": 0.21500250280075323, "grad_norm": 1.4693152904510498, "learning_rate": 8.01395534328507e-06, "loss": 0.8315, "step": 4510 }, { "epoch": 0.2150501751960527, "grad_norm": 1.6903177499771118, "learning_rate": 8.010105770087854e-06, "loss": 1.0756, "step": 4511 }, { "epoch": 0.21509784759135223, "grad_norm": 2.1016578674316406, "learning_rate": 8.006256503910823e-06, "loss": 0.9515, "step": 4512 }, { "epoch": 0.21514551998665174, "grad_norm": 1.1254427433013916, "learning_rate": 8.002407545347881e-06, "loss": 0.553, "step": 4513 }, { "epoch": 0.21519319238195123, "grad_norm": 2.1703710556030273, "learning_rate": 7.998558894992888e-06, "loss": 0.827, "step": 4514 }, { "epoch": 0.21524086477725074, "grad_norm": 1.854142427444458, "learning_rate": 7.994710553439646e-06, "loss": 0.6809, "step": 4515 }, { "epoch": 0.21528853717255023, "grad_norm": 1.552635908126831, "learning_rate": 7.99086252128191e-06, "loss": 1.0352, "step": 4516 }, { "epoch": 0.21533620956784974, "grad_norm": 1.546877145767212, "learning_rate": 7.987014799113398e-06, "loss": 0.7329, "step": 4517 }, { "epoch": 0.21538388196314923, "grad_norm": 1.8693690299987793, "learning_rate": 7.983167387527778e-06, "loss": 0.8758, "step": 4518 }, { "epoch": 0.21543155435844874, "grad_norm": 1.925012230873108, "learning_rate": 7.979320287118656e-06, "loss": 0.7871, "step": 4519 }, { "epoch": 0.21547922675374825, "grad_norm": 2.1033551692962646, "learning_rate": 7.975473498479607e-06, "loss": 0.5098, "step": 4520 }, { "epoch": 0.21552689914904774, "grad_norm": 1.3760926723480225, "learning_rate": 7.971627022204148e-06, "loss": 0.8647, "step": 4521 }, { "epoch": 0.21557457154434725, "grad_norm": 2.3648693561553955, "learning_rate": 7.967780858885753e-06, "loss": 1.1086, "step": 4522 }, { "epoch": 0.21562224393964674, "grad_norm": 1.8046520948410034, "learning_rate": 7.963935009117838e-06, "loss": 0.5923, "step": 4523 }, { "epoch": 0.21566991633494625, "grad_norm": 1.2262309789657593, "learning_rate": 7.960089473493791e-06, "loss": 0.7717, "step": 4524 }, { "epoch": 0.21571758873024574, "grad_norm": 1.6779276132583618, "learning_rate": 7.956244252606926e-06, "loss": 0.7866, "step": 4525 }, { "epoch": 0.21576526112554525, "grad_norm": 1.3872177600860596, "learning_rate": 7.952399347050526e-06, "loss": 0.6285, "step": 4526 }, { "epoch": 0.21581293352084477, "grad_norm": 1.0930930376052856, "learning_rate": 7.948554757417825e-06, "loss": 0.6883, "step": 4527 }, { "epoch": 0.21586060591614425, "grad_norm": 1.4655370712280273, "learning_rate": 7.944710484301995e-06, "loss": 0.6577, "step": 4528 }, { "epoch": 0.21590827831144377, "grad_norm": 1.4883787631988525, "learning_rate": 7.940866528296175e-06, "loss": 0.2995, "step": 4529 }, { "epoch": 0.21595595070674325, "grad_norm": 3.252790927886963, "learning_rate": 7.937022889993444e-06, "loss": 0.5275, "step": 4530 }, { "epoch": 0.21600362310204277, "grad_norm": 1.7478657960891724, "learning_rate": 7.933179569986834e-06, "loss": 0.6609, "step": 4531 }, { "epoch": 0.21605129549734226, "grad_norm": 2.2230353355407715, "learning_rate": 7.929336568869332e-06, "loss": 0.7786, "step": 4532 }, { "epoch": 0.21609896789264177, "grad_norm": 1.3655163049697876, "learning_rate": 7.92549388723388e-06, "loss": 0.7889, "step": 4533 }, { "epoch": 0.21614664028794126, "grad_norm": 1.4297012090682983, "learning_rate": 7.92165152567335e-06, "loss": 0.9813, "step": 4534 }, { "epoch": 0.21619431268324077, "grad_norm": 2.0077500343322754, "learning_rate": 7.91780948478059e-06, "loss": 1.1264, "step": 4535 }, { "epoch": 0.21624198507854028, "grad_norm": 1.3427884578704834, "learning_rate": 7.913967765148386e-06, "loss": 0.5978, "step": 4536 }, { "epoch": 0.21628965747383977, "grad_norm": 2.992999315261841, "learning_rate": 7.910126367369474e-06, "loss": 0.6912, "step": 4537 }, { "epoch": 0.21633732986913928, "grad_norm": 2.4530725479125977, "learning_rate": 7.906285292036538e-06, "loss": 0.8698, "step": 4538 }, { "epoch": 0.21638500226443877, "grad_norm": 1.6870721578598022, "learning_rate": 7.902444539742224e-06, "loss": 0.9277, "step": 4539 }, { "epoch": 0.21643267465973828, "grad_norm": 1.311440348625183, "learning_rate": 7.898604111079115e-06, "loss": 0.5312, "step": 4540 }, { "epoch": 0.21648034705503777, "grad_norm": 3.3823249340057373, "learning_rate": 7.89476400663975e-06, "loss": 0.9218, "step": 4541 }, { "epoch": 0.21652801945033728, "grad_norm": 1.7939707040786743, "learning_rate": 7.890924227016624e-06, "loss": 0.8184, "step": 4542 }, { "epoch": 0.2165756918456368, "grad_norm": 2.0498099327087402, "learning_rate": 7.887084772802165e-06, "loss": 1.307, "step": 4543 }, { "epoch": 0.21662336424093628, "grad_norm": 1.3181850910186768, "learning_rate": 7.88324564458877e-06, "loss": 0.7324, "step": 4544 }, { "epoch": 0.2166710366362358, "grad_norm": 1.151717185974121, "learning_rate": 7.879406842968772e-06, "loss": 0.8333, "step": 4545 }, { "epoch": 0.21671870903153528, "grad_norm": 1.9521054029464722, "learning_rate": 7.875568368534463e-06, "loss": 0.789, "step": 4546 }, { "epoch": 0.2167663814268348, "grad_norm": 1.7480714321136475, "learning_rate": 7.871730221878073e-06, "loss": 0.8029, "step": 4547 }, { "epoch": 0.21681405382213428, "grad_norm": 2.3383572101593018, "learning_rate": 7.867892403591798e-06, "loss": 1.1834, "step": 4548 }, { "epoch": 0.2168617262174338, "grad_norm": 2.626546621322632, "learning_rate": 7.864054914267765e-06, "loss": 1.1061, "step": 4549 }, { "epoch": 0.21690939861273328, "grad_norm": 1.5257930755615234, "learning_rate": 7.86021775449806e-06, "loss": 0.7571, "step": 4550 }, { "epoch": 0.2169570710080328, "grad_norm": 2.134678363800049, "learning_rate": 7.856380924874726e-06, "loss": 0.764, "step": 4551 }, { "epoch": 0.2170047434033323, "grad_norm": 1.9182783365249634, "learning_rate": 7.85254442598974e-06, "loss": 0.6203, "step": 4552 }, { "epoch": 0.2170524157986318, "grad_norm": 2.3225247859954834, "learning_rate": 7.848708258435031e-06, "loss": 1.0288, "step": 4553 }, { "epoch": 0.2171000881939313, "grad_norm": 1.486769199371338, "learning_rate": 7.844872422802483e-06, "loss": 0.8181, "step": 4554 }, { "epoch": 0.2171477605892308, "grad_norm": 1.531981348991394, "learning_rate": 7.841036919683932e-06, "loss": 0.6438, "step": 4555 }, { "epoch": 0.2171954329845303, "grad_norm": 3.082106351852417, "learning_rate": 7.837201749671146e-06, "loss": 0.8642, "step": 4556 }, { "epoch": 0.2172431053798298, "grad_norm": 1.664265751838684, "learning_rate": 7.833366913355858e-06, "loss": 0.6025, "step": 4557 }, { "epoch": 0.2172907777751293, "grad_norm": 0.9437436461448669, "learning_rate": 7.829532411329747e-06, "loss": 0.3189, "step": 4558 }, { "epoch": 0.21733845017042883, "grad_norm": 4.005910396575928, "learning_rate": 7.825698244184432e-06, "loss": 0.6571, "step": 4559 }, { "epoch": 0.2173861225657283, "grad_norm": 2.060579776763916, "learning_rate": 7.821864412511485e-06, "loss": 0.7423, "step": 4560 }, { "epoch": 0.21743379496102783, "grad_norm": 1.7637516260147095, "learning_rate": 7.818030916902433e-06, "loss": 0.807, "step": 4561 }, { "epoch": 0.2174814673563273, "grad_norm": 1.4165635108947754, "learning_rate": 7.814197757948734e-06, "loss": 0.7102, "step": 4562 }, { "epoch": 0.21752913975162683, "grad_norm": 1.5000356435775757, "learning_rate": 7.810364936241814e-06, "loss": 0.902, "step": 4563 }, { "epoch": 0.2175768121469263, "grad_norm": 2.554123878479004, "learning_rate": 7.80653245237304e-06, "loss": 0.7217, "step": 4564 }, { "epoch": 0.21762448454222583, "grad_norm": 1.1652750968933105, "learning_rate": 7.802700306933716e-06, "loss": 0.5649, "step": 4565 }, { "epoch": 0.2176721569375253, "grad_norm": 2.113891839981079, "learning_rate": 7.798868500515106e-06, "loss": 0.41, "step": 4566 }, { "epoch": 0.21771982933282483, "grad_norm": 2.3827731609344482, "learning_rate": 7.795037033708422e-06, "loss": 1.0033, "step": 4567 }, { "epoch": 0.21776750172812434, "grad_norm": 1.8273991346359253, "learning_rate": 7.791205907104816e-06, "loss": 0.906, "step": 4568 }, { "epoch": 0.21781517412342383, "grad_norm": 1.6398130655288696, "learning_rate": 7.78737512129539e-06, "loss": 0.8026, "step": 4569 }, { "epoch": 0.21786284651872334, "grad_norm": 1.1766680479049683, "learning_rate": 7.783544676871202e-06, "loss": 0.6919, "step": 4570 }, { "epoch": 0.21791051891402283, "grad_norm": 3.8314054012298584, "learning_rate": 7.779714574423241e-06, "loss": 0.5949, "step": 4571 }, { "epoch": 0.21795819130932234, "grad_norm": 2.741455078125, "learning_rate": 7.775884814542457e-06, "loss": 0.5654, "step": 4572 }, { "epoch": 0.21800586370462183, "grad_norm": 1.0760807991027832, "learning_rate": 7.772055397819745e-06, "loss": 0.4576, "step": 4573 }, { "epoch": 0.21805353609992134, "grad_norm": 4.20237922668457, "learning_rate": 7.768226324845942e-06, "loss": 0.1334, "step": 4574 }, { "epoch": 0.21810120849522086, "grad_norm": 1.9570916891098022, "learning_rate": 7.76439759621183e-06, "loss": 0.7007, "step": 4575 }, { "epoch": 0.21814888089052034, "grad_norm": 1.4350074529647827, "learning_rate": 7.76056921250815e-06, "loss": 0.6344, "step": 4576 }, { "epoch": 0.21819655328581986, "grad_norm": 4.5398969650268555, "learning_rate": 7.756741174325578e-06, "loss": 0.9005, "step": 4577 }, { "epoch": 0.21824422568111934, "grad_norm": 1.3192641735076904, "learning_rate": 7.75291348225474e-06, "loss": 0.5944, "step": 4578 }, { "epoch": 0.21829189807641886, "grad_norm": 1.124534249305725, "learning_rate": 7.749086136886215e-06, "loss": 0.5157, "step": 4579 }, { "epoch": 0.21833957047171834, "grad_norm": 1.8843111991882324, "learning_rate": 7.745259138810514e-06, "loss": 0.9855, "step": 4580 }, { "epoch": 0.21838724286701786, "grad_norm": 1.7668653726577759, "learning_rate": 7.741432488618112e-06, "loss": 0.8871, "step": 4581 }, { "epoch": 0.21843491526231734, "grad_norm": 2.983565092086792, "learning_rate": 7.737606186899417e-06, "loss": 0.6264, "step": 4582 }, { "epoch": 0.21848258765761686, "grad_norm": 3.1347360610961914, "learning_rate": 7.733780234244792e-06, "loss": 0.722, "step": 4583 }, { "epoch": 0.21853026005291637, "grad_norm": 1.5097100734710693, "learning_rate": 7.729954631244536e-06, "loss": 0.8209, "step": 4584 }, { "epoch": 0.21857793244821586, "grad_norm": 1.6965235471725464, "learning_rate": 7.726129378488907e-06, "loss": 0.7019, "step": 4585 }, { "epoch": 0.21862560484351537, "grad_norm": 4.577078819274902, "learning_rate": 7.722304476568095e-06, "loss": 0.3699, "step": 4586 }, { "epoch": 0.21867327723881486, "grad_norm": 2.4130613803863525, "learning_rate": 7.718479926072244e-06, "loss": 1.2491, "step": 4587 }, { "epoch": 0.21872094963411437, "grad_norm": 2.268475294113159, "learning_rate": 7.714655727591452e-06, "loss": 0.9074, "step": 4588 }, { "epoch": 0.21876862202941386, "grad_norm": 1.489576816558838, "learning_rate": 7.710831881715742e-06, "loss": 0.6937, "step": 4589 }, { "epoch": 0.21881629442471337, "grad_norm": 1.1661494970321655, "learning_rate": 7.707008389035102e-06, "loss": 0.5234, "step": 4590 }, { "epoch": 0.21886396682001288, "grad_norm": 1.961639404296875, "learning_rate": 7.703185250139455e-06, "loss": 0.6295, "step": 4591 }, { "epoch": 0.21891163921531237, "grad_norm": 2.7637739181518555, "learning_rate": 7.699362465618667e-06, "loss": 0.8126, "step": 4592 }, { "epoch": 0.21895931161061188, "grad_norm": 2.8779118061065674, "learning_rate": 7.695540036062559e-06, "loss": 0.5957, "step": 4593 }, { "epoch": 0.21900698400591137, "grad_norm": 1.3385227918624878, "learning_rate": 7.691717962060892e-06, "loss": 0.7985, "step": 4594 }, { "epoch": 0.21905465640121088, "grad_norm": 1.152004599571228, "learning_rate": 7.687896244203377e-06, "loss": 0.7074, "step": 4595 }, { "epoch": 0.21910232879651037, "grad_norm": 1.147142767906189, "learning_rate": 7.68407488307966e-06, "loss": 0.7639, "step": 4596 }, { "epoch": 0.21915000119180988, "grad_norm": 1.3513474464416504, "learning_rate": 7.680253879279335e-06, "loss": 0.7616, "step": 4597 }, { "epoch": 0.21919767358710937, "grad_norm": 1.3045693635940552, "learning_rate": 7.676433233391955e-06, "loss": 0.5044, "step": 4598 }, { "epoch": 0.21924534598240888, "grad_norm": 2.069242477416992, "learning_rate": 7.672612946006992e-06, "loss": 0.7143, "step": 4599 }, { "epoch": 0.2192930183777084, "grad_norm": 1.8670201301574707, "learning_rate": 7.668793017713886e-06, "loss": 0.3993, "step": 4600 }, { "epoch": 0.21934069077300788, "grad_norm": 1.47934091091156, "learning_rate": 7.664973449102013e-06, "loss": 0.8515, "step": 4601 }, { "epoch": 0.2193883631683074, "grad_norm": 1.5718576908111572, "learning_rate": 7.661154240760687e-06, "loss": 0.5841, "step": 4602 }, { "epoch": 0.21943603556360688, "grad_norm": 2.0619256496429443, "learning_rate": 7.657335393279179e-06, "loss": 0.7204, "step": 4603 }, { "epoch": 0.2194837079589064, "grad_norm": 1.712097406387329, "learning_rate": 7.653516907246696e-06, "loss": 0.8824, "step": 4604 }, { "epoch": 0.21953138035420589, "grad_norm": 1.91115140914917, "learning_rate": 7.649698783252388e-06, "loss": 0.7008, "step": 4605 }, { "epoch": 0.2195790527495054, "grad_norm": 1.3914875984191895, "learning_rate": 7.645881021885353e-06, "loss": 0.8336, "step": 4606 }, { "epoch": 0.2196267251448049, "grad_norm": 1.1526566743850708, "learning_rate": 7.642063623734638e-06, "loss": 0.8268, "step": 4607 }, { "epoch": 0.2196743975401044, "grad_norm": 2.9590299129486084, "learning_rate": 7.63824658938922e-06, "loss": 0.2128, "step": 4608 }, { "epoch": 0.2197220699354039, "grad_norm": 1.3247638940811157, "learning_rate": 7.63442991943803e-06, "loss": 0.691, "step": 4609 }, { "epoch": 0.2197697423307034, "grad_norm": 1.7840033769607544, "learning_rate": 7.630613614469948e-06, "loss": 1.1023, "step": 4610 }, { "epoch": 0.2198174147260029, "grad_norm": 1.563821792602539, "learning_rate": 7.626797675073783e-06, "loss": 0.4949, "step": 4611 }, { "epoch": 0.2198650871213024, "grad_norm": 1.431036353111267, "learning_rate": 7.6229821018382965e-06, "loss": 0.4389, "step": 4612 }, { "epoch": 0.2199127595166019, "grad_norm": 1.946836233139038, "learning_rate": 7.619166895352197e-06, "loss": 0.6832, "step": 4613 }, { "epoch": 0.21996043191190143, "grad_norm": 1.4851797819137573, "learning_rate": 7.615352056204124e-06, "loss": 0.8744, "step": 4614 }, { "epoch": 0.2200081043072009, "grad_norm": 1.1410810947418213, "learning_rate": 7.61153758498267e-06, "loss": 0.5084, "step": 4615 }, { "epoch": 0.22005577670250043, "grad_norm": 1.4116249084472656, "learning_rate": 7.607723482276375e-06, "loss": 0.8347, "step": 4616 }, { "epoch": 0.2201034490977999, "grad_norm": 1.4600305557250977, "learning_rate": 7.6039097486737075e-06, "loss": 0.7696, "step": 4617 }, { "epoch": 0.22015112149309943, "grad_norm": 1.9674410820007324, "learning_rate": 7.600096384763093e-06, "loss": 0.6783, "step": 4618 }, { "epoch": 0.2201987938883989, "grad_norm": 2.6363725662231445, "learning_rate": 7.596283391132892e-06, "loss": 0.5615, "step": 4619 }, { "epoch": 0.22024646628369843, "grad_norm": 4.3245625495910645, "learning_rate": 7.592470768371409e-06, "loss": 0.6746, "step": 4620 }, { "epoch": 0.22029413867899791, "grad_norm": 7.218118190765381, "learning_rate": 7.588658517066893e-06, "loss": 1.7143, "step": 4621 }, { "epoch": 0.22034181107429743, "grad_norm": 1.9416069984436035, "learning_rate": 7.5848466378075395e-06, "loss": 0.7704, "step": 4622 }, { "epoch": 0.22038948346959694, "grad_norm": 2.2492318153381348, "learning_rate": 7.581035131181473e-06, "loss": 0.6563, "step": 4623 }, { "epoch": 0.22043715586489643, "grad_norm": 1.7324265241622925, "learning_rate": 7.577223997776777e-06, "loss": 0.5469, "step": 4624 }, { "epoch": 0.22048482826019594, "grad_norm": 1.3318227529525757, "learning_rate": 7.573413238181473e-06, "loss": 0.7346, "step": 4625 }, { "epoch": 0.22053250065549543, "grad_norm": 1.5449528694152832, "learning_rate": 7.569602852983511e-06, "loss": 0.54, "step": 4626 }, { "epoch": 0.22058017305079494, "grad_norm": 1.5341260433197021, "learning_rate": 7.565792842770805e-06, "loss": 0.9212, "step": 4627 }, { "epoch": 0.22062784544609443, "grad_norm": 0.9264002442359924, "learning_rate": 7.561983208131196e-06, "loss": 0.457, "step": 4628 }, { "epoch": 0.22067551784139394, "grad_norm": 1.8454240560531616, "learning_rate": 7.558173949652468e-06, "loss": 1.1494, "step": 4629 }, { "epoch": 0.22072319023669346, "grad_norm": 1.0589171648025513, "learning_rate": 7.554365067922353e-06, "loss": 0.5758, "step": 4630 }, { "epoch": 0.22077086263199294, "grad_norm": 1.4372797012329102, "learning_rate": 7.550556563528524e-06, "loss": 0.9499, "step": 4631 }, { "epoch": 0.22081853502729246, "grad_norm": 1.6198995113372803, "learning_rate": 7.546748437058596e-06, "loss": 0.9306, "step": 4632 }, { "epoch": 0.22086620742259194, "grad_norm": 1.3839938640594482, "learning_rate": 7.542940689100117e-06, "loss": 0.5114, "step": 4633 }, { "epoch": 0.22091387981789146, "grad_norm": 1.7403932809829712, "learning_rate": 7.539133320240589e-06, "loss": 0.6705, "step": 4634 }, { "epoch": 0.22096155221319094, "grad_norm": 1.2243397235870361, "learning_rate": 7.53532633106745e-06, "loss": 0.5372, "step": 4635 }, { "epoch": 0.22100922460849046, "grad_norm": 1.1937931776046753, "learning_rate": 7.531519722168072e-06, "loss": 0.554, "step": 4636 }, { "epoch": 0.22105689700378994, "grad_norm": 4.210997581481934, "learning_rate": 7.527713494129781e-06, "loss": 0.6425, "step": 4637 }, { "epoch": 0.22110456939908946, "grad_norm": 1.3754652738571167, "learning_rate": 7.523907647539841e-06, "loss": 0.6326, "step": 4638 }, { "epoch": 0.22115224179438897, "grad_norm": 2.4554967880249023, "learning_rate": 7.520102182985449e-06, "loss": 0.8312, "step": 4639 }, { "epoch": 0.22119991418968846, "grad_norm": 1.6669466495513916, "learning_rate": 7.516297101053754e-06, "loss": 0.9231, "step": 4640 }, { "epoch": 0.22124758658498797, "grad_norm": 1.7525601387023926, "learning_rate": 7.51249240233184e-06, "loss": 0.7735, "step": 4641 }, { "epoch": 0.22129525898028746, "grad_norm": 1.7552474737167358, "learning_rate": 7.508688087406731e-06, "loss": 0.5516, "step": 4642 }, { "epoch": 0.22134293137558697, "grad_norm": 1.4849187135696411, "learning_rate": 7.504884156865393e-06, "loss": 0.8757, "step": 4643 }, { "epoch": 0.22139060377088646, "grad_norm": 1.1317781209945679, "learning_rate": 7.501080611294739e-06, "loss": 0.6283, "step": 4644 }, { "epoch": 0.22143827616618597, "grad_norm": 1.029887080192566, "learning_rate": 7.497277451281609e-06, "loss": 0.6628, "step": 4645 }, { "epoch": 0.22148594856148549, "grad_norm": 1.9708335399627686, "learning_rate": 7.493474677412795e-06, "loss": 0.7249, "step": 4646 }, { "epoch": 0.22153362095678497, "grad_norm": 3.476801633834839, "learning_rate": 7.48967229027503e-06, "loss": 1.4375, "step": 4647 }, { "epoch": 0.22158129335208449, "grad_norm": 1.4689197540283203, "learning_rate": 7.485870290454974e-06, "loss": 0.6284, "step": 4648 }, { "epoch": 0.22162896574738397, "grad_norm": 2.1239750385284424, "learning_rate": 7.482068678539245e-06, "loss": 0.7225, "step": 4649 }, { "epoch": 0.22167663814268349, "grad_norm": 1.2528247833251953, "learning_rate": 7.478267455114391e-06, "loss": 0.5359, "step": 4650 }, { "epoch": 0.22172431053798297, "grad_norm": 6.784343242645264, "learning_rate": 7.474466620766896e-06, "loss": 1.3655, "step": 4651 }, { "epoch": 0.22177198293328249, "grad_norm": 1.987514853477478, "learning_rate": 7.470666176083193e-06, "loss": 0.9451, "step": 4652 }, { "epoch": 0.22181965532858197, "grad_norm": 1.7384040355682373, "learning_rate": 7.466866121649656e-06, "loss": 1.1158, "step": 4653 }, { "epoch": 0.22186732772388149, "grad_norm": 1.190354824066162, "learning_rate": 7.463066458052586e-06, "loss": 0.7798, "step": 4654 }, { "epoch": 0.221915000119181, "grad_norm": 1.2690644264221191, "learning_rate": 7.4592671858782365e-06, "loss": 0.6525, "step": 4655 }, { "epoch": 0.22196267251448049, "grad_norm": 0.7563767433166504, "learning_rate": 7.455468305712801e-06, "loss": 0.1129, "step": 4656 }, { "epoch": 0.22201034490978, "grad_norm": 1.5915089845657349, "learning_rate": 7.451669818142398e-06, "loss": 0.505, "step": 4657 }, { "epoch": 0.22205801730507949, "grad_norm": 1.7025084495544434, "learning_rate": 7.447871723753098e-06, "loss": 1.2199, "step": 4658 }, { "epoch": 0.222105689700379, "grad_norm": 1.5906728506088257, "learning_rate": 7.444074023130914e-06, "loss": 0.639, "step": 4659 }, { "epoch": 0.22215336209567849, "grad_norm": 1.7473750114440918, "learning_rate": 7.440276716861783e-06, "loss": 0.6684, "step": 4660 }, { "epoch": 0.222201034490978, "grad_norm": 1.0748573541641235, "learning_rate": 7.436479805531595e-06, "loss": 0.2557, "step": 4661 }, { "epoch": 0.22224870688627751, "grad_norm": 1.3791710138320923, "learning_rate": 7.432683289726177e-06, "loss": 0.7866, "step": 4662 }, { "epoch": 0.222296379281577, "grad_norm": 2.493863821029663, "learning_rate": 7.428887170031285e-06, "loss": 1.1457, "step": 4663 }, { "epoch": 0.22234405167687651, "grad_norm": 2.577347993850708, "learning_rate": 7.425091447032629e-06, "loss": 0.6169, "step": 4664 }, { "epoch": 0.222391724072176, "grad_norm": 2.4438793659210205, "learning_rate": 7.421296121315844e-06, "loss": 1.3597, "step": 4665 }, { "epoch": 0.22243939646747551, "grad_norm": 1.0990134477615356, "learning_rate": 7.417501193466513e-06, "loss": 0.8178, "step": 4666 }, { "epoch": 0.222487068862775, "grad_norm": 1.9476672410964966, "learning_rate": 7.413706664070151e-06, "loss": 0.8329, "step": 4667 }, { "epoch": 0.22253474125807451, "grad_norm": 2.050792694091797, "learning_rate": 7.409912533712218e-06, "loss": 1.0559, "step": 4668 }, { "epoch": 0.222582413653374, "grad_norm": 2.358335494995117, "learning_rate": 7.406118802978111e-06, "loss": 0.497, "step": 4669 }, { "epoch": 0.22263008604867351, "grad_norm": 2.9558937549591064, "learning_rate": 7.402325472453158e-06, "loss": 0.3128, "step": 4670 }, { "epoch": 0.22267775844397303, "grad_norm": 1.6250839233398438, "learning_rate": 7.398532542722635e-06, "loss": 0.6273, "step": 4671 }, { "epoch": 0.22272543083927251, "grad_norm": 1.8294248580932617, "learning_rate": 7.394740014371753e-06, "loss": 0.3294, "step": 4672 }, { "epoch": 0.22277310323457203, "grad_norm": 1.7346724271774292, "learning_rate": 7.390947887985654e-06, "loss": 0.8005, "step": 4673 }, { "epoch": 0.22282077562987151, "grad_norm": 3.926581621170044, "learning_rate": 7.387156164149427e-06, "loss": 0.8396, "step": 4674 }, { "epoch": 0.22286844802517103, "grad_norm": 1.3853797912597656, "learning_rate": 7.383364843448102e-06, "loss": 0.7356, "step": 4675 }, { "epoch": 0.22291612042047051, "grad_norm": 1.0769068002700806, "learning_rate": 7.379573926466631e-06, "loss": 0.7911, "step": 4676 }, { "epoch": 0.22296379281577003, "grad_norm": 1.4001976251602173, "learning_rate": 7.375783413789918e-06, "loss": 0.9862, "step": 4677 }, { "epoch": 0.22301146521106954, "grad_norm": 2.245253801345825, "learning_rate": 7.371993306002804e-06, "loss": 0.3392, "step": 4678 }, { "epoch": 0.22305913760636903, "grad_norm": 1.1093387603759766, "learning_rate": 7.368203603690057e-06, "loss": 0.3007, "step": 4679 }, { "epoch": 0.22310681000166854, "grad_norm": 1.3087025880813599, "learning_rate": 7.36441430743639e-06, "loss": 0.5277, "step": 4680 }, { "epoch": 0.22315448239696803, "grad_norm": 1.7940627336502075, "learning_rate": 7.360625417826459e-06, "loss": 0.5785, "step": 4681 }, { "epoch": 0.22320215479226754, "grad_norm": 1.332318902015686, "learning_rate": 7.356836935444841e-06, "loss": 0.5831, "step": 4682 }, { "epoch": 0.22324982718756703, "grad_norm": 1.940137505531311, "learning_rate": 7.3530488608760645e-06, "loss": 0.6727, "step": 4683 }, { "epoch": 0.22329749958286654, "grad_norm": 1.1690236330032349, "learning_rate": 7.349261194704596e-06, "loss": 0.82, "step": 4684 }, { "epoch": 0.22334517197816603, "grad_norm": 1.2394042015075684, "learning_rate": 7.345473937514822e-06, "loss": 0.7523, "step": 4685 }, { "epoch": 0.22339284437346554, "grad_norm": 2.288142442703247, "learning_rate": 7.341687089891085e-06, "loss": 1.1392, "step": 4686 }, { "epoch": 0.22344051676876506, "grad_norm": 3.9231040477752686, "learning_rate": 7.337900652417656e-06, "loss": 0.4054, "step": 4687 }, { "epoch": 0.22348818916406454, "grad_norm": 2.15238356590271, "learning_rate": 7.334114625678741e-06, "loss": 0.6911, "step": 4688 }, { "epoch": 0.22353586155936406, "grad_norm": 2.52760910987854, "learning_rate": 7.330329010258483e-06, "loss": 1.0212, "step": 4689 }, { "epoch": 0.22358353395466354, "grad_norm": 3.931504249572754, "learning_rate": 7.3265438067409725e-06, "loss": 0.8517, "step": 4690 }, { "epoch": 0.22363120634996306, "grad_norm": 1.1890994310379028, "learning_rate": 7.3227590157102165e-06, "loss": 0.5002, "step": 4691 }, { "epoch": 0.22367887874526254, "grad_norm": 1.3411054611206055, "learning_rate": 7.318974637750174e-06, "loss": 0.765, "step": 4692 }, { "epoch": 0.22372655114056206, "grad_norm": 2.177300214767456, "learning_rate": 7.31519067344474e-06, "loss": 0.1258, "step": 4693 }, { "epoch": 0.22377422353586157, "grad_norm": 2.613393545150757, "learning_rate": 7.311407123377734e-06, "loss": 0.738, "step": 4694 }, { "epoch": 0.22382189593116106, "grad_norm": 5.053560733795166, "learning_rate": 7.307623988132921e-06, "loss": 1.2962, "step": 4695 }, { "epoch": 0.22386956832646057, "grad_norm": 2.4341542720794678, "learning_rate": 7.303841268294004e-06, "loss": 0.4851, "step": 4696 }, { "epoch": 0.22391724072176006, "grad_norm": 2.830789089202881, "learning_rate": 7.30005896444461e-06, "loss": 0.7733, "step": 4697 }, { "epoch": 0.22396491311705957, "grad_norm": 2.6923956871032715, "learning_rate": 7.2962770771683144e-06, "loss": 0.5332, "step": 4698 }, { "epoch": 0.22401258551235906, "grad_norm": 2.32131028175354, "learning_rate": 7.292495607048626e-06, "loss": 0.8116, "step": 4699 }, { "epoch": 0.22406025790765857, "grad_norm": 2.7807183265686035, "learning_rate": 7.28871455466898e-06, "loss": 0.7625, "step": 4700 }, { "epoch": 0.22410793030295806, "grad_norm": 1.4085941314697266, "learning_rate": 7.284933920612759e-06, "loss": 0.73, "step": 4701 }, { "epoch": 0.22415560269825757, "grad_norm": 2.79194974899292, "learning_rate": 7.281153705463275e-06, "loss": 0.7639, "step": 4702 }, { "epoch": 0.2242032750935571, "grad_norm": 1.8848598003387451, "learning_rate": 7.277373909803774e-06, "loss": 0.7303, "step": 4703 }, { "epoch": 0.22425094748885657, "grad_norm": 1.2991321086883545, "learning_rate": 7.273594534217441e-06, "loss": 0.6501, "step": 4704 }, { "epoch": 0.2242986198841561, "grad_norm": 2.308899164199829, "learning_rate": 7.269815579287398e-06, "loss": 0.7138, "step": 4705 }, { "epoch": 0.22434629227945557, "grad_norm": 1.8376412391662598, "learning_rate": 7.266037045596692e-06, "loss": 0.6759, "step": 4706 }, { "epoch": 0.2243939646747551, "grad_norm": 2.57595157623291, "learning_rate": 7.262258933728314e-06, "loss": 1.0017, "step": 4707 }, { "epoch": 0.22444163707005457, "grad_norm": 1.2827507257461548, "learning_rate": 7.258481244265193e-06, "loss": 0.6921, "step": 4708 }, { "epoch": 0.2244893094653541, "grad_norm": 1.3698151111602783, "learning_rate": 7.254703977790183e-06, "loss": 0.8405, "step": 4709 }, { "epoch": 0.2245369818606536, "grad_norm": 0.9687865376472473, "learning_rate": 7.2509271348860785e-06, "loss": 0.5002, "step": 4710 }, { "epoch": 0.2245846542559531, "grad_norm": 0.9799315333366394, "learning_rate": 7.247150716135605e-06, "loss": 0.542, "step": 4711 }, { "epoch": 0.2246323266512526, "grad_norm": 1.283604383468628, "learning_rate": 7.243374722121431e-06, "loss": 0.6338, "step": 4712 }, { "epoch": 0.2246799990465521, "grad_norm": 1.9069286584854126, "learning_rate": 7.2395991534261456e-06, "loss": 0.89, "step": 4713 }, { "epoch": 0.2247276714418516, "grad_norm": 2.6499481201171875, "learning_rate": 7.235824010632284e-06, "loss": 0.755, "step": 4714 }, { "epoch": 0.2247753438371511, "grad_norm": 1.6619880199432373, "learning_rate": 7.232049294322316e-06, "loss": 0.7238, "step": 4715 }, { "epoch": 0.2248230162324506, "grad_norm": 2.6634881496429443, "learning_rate": 7.2282750050786374e-06, "loss": 0.8087, "step": 4716 }, { "epoch": 0.22487068862775011, "grad_norm": 1.4497495889663696, "learning_rate": 7.2245011434835775e-06, "loss": 0.7627, "step": 4717 }, { "epoch": 0.2249183610230496, "grad_norm": 1.2503986358642578, "learning_rate": 7.220727710119415e-06, "loss": 0.6057, "step": 4718 }, { "epoch": 0.22496603341834912, "grad_norm": 1.8429253101348877, "learning_rate": 7.216954705568342e-06, "loss": 0.5762, "step": 4719 }, { "epoch": 0.2250137058136486, "grad_norm": 1.2374719381332397, "learning_rate": 7.2131821304124974e-06, "loss": 0.5254, "step": 4720 }, { "epoch": 0.22506137820894812, "grad_norm": 6.295538902282715, "learning_rate": 7.209409985233955e-06, "loss": 0.9477, "step": 4721 }, { "epoch": 0.2251090506042476, "grad_norm": 2.023261785507202, "learning_rate": 7.20563827061471e-06, "loss": 0.7076, "step": 4722 }, { "epoch": 0.22515672299954712, "grad_norm": 2.349370241165161, "learning_rate": 7.201866987136706e-06, "loss": 0.6833, "step": 4723 }, { "epoch": 0.2252043953948466, "grad_norm": 2.554931163787842, "learning_rate": 7.198096135381811e-06, "loss": 0.5366, "step": 4724 }, { "epoch": 0.22525206779014612, "grad_norm": 1.7870793342590332, "learning_rate": 7.1943257159318295e-06, "loss": 0.7195, "step": 4725 }, { "epoch": 0.22529974018544563, "grad_norm": 1.7255315780639648, "learning_rate": 7.190555729368492e-06, "loss": 0.8099, "step": 4726 }, { "epoch": 0.22534741258074512, "grad_norm": 1.6862258911132812, "learning_rate": 7.18678617627348e-06, "loss": 0.6703, "step": 4727 }, { "epoch": 0.22539508497604463, "grad_norm": 2.1234540939331055, "learning_rate": 7.183017057228386e-06, "loss": 0.6835, "step": 4728 }, { "epoch": 0.22544275737134412, "grad_norm": 2.145744562149048, "learning_rate": 7.179248372814751e-06, "loss": 0.9435, "step": 4729 }, { "epoch": 0.22549042976664363, "grad_norm": 2.782910108566284, "learning_rate": 7.175480123614048e-06, "loss": 0.089, "step": 4730 }, { "epoch": 0.22553810216194312, "grad_norm": 1.0296188592910767, "learning_rate": 7.17171231020767e-06, "loss": 0.5136, "step": 4731 }, { "epoch": 0.22558577455724263, "grad_norm": 2.7136995792388916, "learning_rate": 7.16794493317696e-06, "loss": 0.4068, "step": 4732 }, { "epoch": 0.22563344695254214, "grad_norm": 1.6317429542541504, "learning_rate": 7.164177993103185e-06, "loss": 0.6417, "step": 4733 }, { "epoch": 0.22568111934784163, "grad_norm": 6.330113887786865, "learning_rate": 7.160411490567536e-06, "loss": 1.1275, "step": 4734 }, { "epoch": 0.22572879174314114, "grad_norm": 3.2850711345672607, "learning_rate": 7.156645426151154e-06, "loss": 0.2795, "step": 4735 }, { "epoch": 0.22577646413844063, "grad_norm": 1.601914644241333, "learning_rate": 7.152879800435104e-06, "loss": 0.5566, "step": 4736 }, { "epoch": 0.22582413653374014, "grad_norm": 2.2140674591064453, "learning_rate": 7.149114614000378e-06, "loss": 1.1239, "step": 4737 }, { "epoch": 0.22587180892903963, "grad_norm": 1.208021640777588, "learning_rate": 7.145349867427911e-06, "loss": 0.935, "step": 4738 }, { "epoch": 0.22591948132433914, "grad_norm": 3.035499334335327, "learning_rate": 7.141585561298563e-06, "loss": 0.7962, "step": 4739 }, { "epoch": 0.22596715371963863, "grad_norm": 2.8254246711730957, "learning_rate": 7.137821696193126e-06, "loss": 1.1237, "step": 4740 }, { "epoch": 0.22601482611493814, "grad_norm": 1.474790334701538, "learning_rate": 7.1340582726923235e-06, "loss": 0.778, "step": 4741 }, { "epoch": 0.22606249851023766, "grad_norm": 1.1334351301193237, "learning_rate": 7.1302952913768205e-06, "loss": 0.6643, "step": 4742 }, { "epoch": 0.22611017090553714, "grad_norm": 1.326080560684204, "learning_rate": 7.1265327528272e-06, "loss": 0.7613, "step": 4743 }, { "epoch": 0.22615784330083666, "grad_norm": 0.923225462436676, "learning_rate": 7.122770657623982e-06, "loss": 0.3463, "step": 4744 }, { "epoch": 0.22620551569613614, "grad_norm": 1.6519964933395386, "learning_rate": 7.119009006347625e-06, "loss": 0.6567, "step": 4745 }, { "epoch": 0.22625318809143566, "grad_norm": 1.6232659816741943, "learning_rate": 7.1152477995785095e-06, "loss": 0.9483, "step": 4746 }, { "epoch": 0.22630086048673514, "grad_norm": 1.9227045774459839, "learning_rate": 7.111487037896951e-06, "loss": 0.6603, "step": 4747 }, { "epoch": 0.22634853288203466, "grad_norm": 2.5088133811950684, "learning_rate": 7.107726721883196e-06, "loss": 0.9401, "step": 4748 }, { "epoch": 0.22639620527733417, "grad_norm": 8.9476957321167, "learning_rate": 7.1039668521174256e-06, "loss": 0.7122, "step": 4749 }, { "epoch": 0.22644387767263366, "grad_norm": 1.6333867311477661, "learning_rate": 7.100207429179744e-06, "loss": 0.7738, "step": 4750 }, { "epoch": 0.22649155006793317, "grad_norm": 1.4185580015182495, "learning_rate": 7.096448453650193e-06, "loss": 0.6116, "step": 4751 }, { "epoch": 0.22653922246323266, "grad_norm": 2.734687328338623, "learning_rate": 7.092689926108749e-06, "loss": 0.7373, "step": 4752 }, { "epoch": 0.22658689485853217, "grad_norm": 2.821073293685913, "learning_rate": 7.088931847135305e-06, "loss": 0.3472, "step": 4753 }, { "epoch": 0.22663456725383166, "grad_norm": 2.1299304962158203, "learning_rate": 7.085174217309703e-06, "loss": 0.7028, "step": 4754 }, { "epoch": 0.22668223964913117, "grad_norm": 1.0289126634597778, "learning_rate": 7.081417037211702e-06, "loss": 0.5725, "step": 4755 }, { "epoch": 0.22672991204443066, "grad_norm": 1.468214988708496, "learning_rate": 7.077660307420995e-06, "loss": 0.7818, "step": 4756 }, { "epoch": 0.22677758443973017, "grad_norm": 1.4464809894561768, "learning_rate": 7.073904028517207e-06, "loss": 0.6624, "step": 4757 }, { "epoch": 0.2268252568350297, "grad_norm": 2.84568452835083, "learning_rate": 7.070148201079898e-06, "loss": 1.1413, "step": 4758 }, { "epoch": 0.22687292923032917, "grad_norm": 2.1240897178649902, "learning_rate": 7.066392825688546e-06, "loss": 1.0717, "step": 4759 }, { "epoch": 0.2269206016256287, "grad_norm": 1.1136767864227295, "learning_rate": 7.0626379029225735e-06, "loss": 0.714, "step": 4760 }, { "epoch": 0.22696827402092817, "grad_norm": 1.9035533666610718, "learning_rate": 7.058883433361323e-06, "loss": 0.6541, "step": 4761 }, { "epoch": 0.2270159464162277, "grad_norm": 2.2511048316955566, "learning_rate": 7.05512941758407e-06, "loss": 1.4494, "step": 4762 }, { "epoch": 0.22706361881152717, "grad_norm": 2.730553150177002, "learning_rate": 7.051375856170022e-06, "loss": 0.9317, "step": 4763 }, { "epoch": 0.2271112912068267, "grad_norm": 3.4477884769439697, "learning_rate": 7.047622749698317e-06, "loss": 1.2089, "step": 4764 }, { "epoch": 0.2271589636021262, "grad_norm": 1.2691444158554077, "learning_rate": 7.043870098748013e-06, "loss": 0.5804, "step": 4765 }, { "epoch": 0.2272066359974257, "grad_norm": 1.2987852096557617, "learning_rate": 7.040117903898112e-06, "loss": 0.5485, "step": 4766 }, { "epoch": 0.2272543083927252, "grad_norm": 3.465034246444702, "learning_rate": 7.036366165727542e-06, "loss": 0.7205, "step": 4767 }, { "epoch": 0.2273019807880247, "grad_norm": 2.7303783893585205, "learning_rate": 7.0326148848151485e-06, "loss": 0.7658, "step": 4768 }, { "epoch": 0.2273496531833242, "grad_norm": 2.511943817138672, "learning_rate": 7.028864061739722e-06, "loss": 0.7951, "step": 4769 }, { "epoch": 0.2273973255786237, "grad_norm": 1.3755884170532227, "learning_rate": 7.025113697079977e-06, "loss": 0.7083, "step": 4770 }, { "epoch": 0.2274449979739232, "grad_norm": 1.078394889831543, "learning_rate": 7.021363791414548e-06, "loss": 0.3954, "step": 4771 }, { "epoch": 0.2274926703692227, "grad_norm": 2.0816476345062256, "learning_rate": 7.017614345322012e-06, "loss": 0.6315, "step": 4772 }, { "epoch": 0.2275403427645222, "grad_norm": 2.6804473400115967, "learning_rate": 7.0138653593808736e-06, "loss": 0.4759, "step": 4773 }, { "epoch": 0.22758801515982172, "grad_norm": 1.5941988229751587, "learning_rate": 7.0101168341695556e-06, "loss": 0.8696, "step": 4774 }, { "epoch": 0.2276356875551212, "grad_norm": 1.244204044342041, "learning_rate": 7.006368770266421e-06, "loss": 0.6185, "step": 4775 }, { "epoch": 0.22768335995042072, "grad_norm": 2.2992420196533203, "learning_rate": 7.002621168249759e-06, "loss": 0.4614, "step": 4776 }, { "epoch": 0.2277310323457202, "grad_norm": 1.6140018701553345, "learning_rate": 6.998874028697782e-06, "loss": 0.3395, "step": 4777 }, { "epoch": 0.22777870474101972, "grad_norm": 3.8199462890625, "learning_rate": 6.995127352188635e-06, "loss": 0.8374, "step": 4778 }, { "epoch": 0.2278263771363192, "grad_norm": 2.1203508377075195, "learning_rate": 6.9913811393003985e-06, "loss": 1.0732, "step": 4779 }, { "epoch": 0.22787404953161872, "grad_norm": 1.535758137702942, "learning_rate": 6.987635390611065e-06, "loss": 0.7637, "step": 4780 }, { "epoch": 0.22792172192691823, "grad_norm": 1.904703140258789, "learning_rate": 6.983890106698567e-06, "loss": 1.0583, "step": 4781 }, { "epoch": 0.22796939432221772, "grad_norm": 1.5071921348571777, "learning_rate": 6.980145288140772e-06, "loss": 0.635, "step": 4782 }, { "epoch": 0.22801706671751723, "grad_norm": 1.9088568687438965, "learning_rate": 6.976400935515457e-06, "loss": 0.6964, "step": 4783 }, { "epoch": 0.22806473911281672, "grad_norm": 1.2707308530807495, "learning_rate": 6.972657049400342e-06, "loss": 0.8825, "step": 4784 }, { "epoch": 0.22811241150811623, "grad_norm": 1.8479267358779907, "learning_rate": 6.968913630373066e-06, "loss": 0.7153, "step": 4785 }, { "epoch": 0.22816008390341572, "grad_norm": 2.0549099445343018, "learning_rate": 6.965170679011207e-06, "loss": 0.9486, "step": 4786 }, { "epoch": 0.22820775629871523, "grad_norm": 1.1440856456756592, "learning_rate": 6.961428195892256e-06, "loss": 0.6683, "step": 4787 }, { "epoch": 0.22825542869401472, "grad_norm": 2.016913414001465, "learning_rate": 6.957686181593642e-06, "loss": 0.36, "step": 4788 }, { "epoch": 0.22830310108931423, "grad_norm": 3.395559549331665, "learning_rate": 6.953944636692727e-06, "loss": 1.0265, "step": 4789 }, { "epoch": 0.22835077348461374, "grad_norm": 1.9130994081497192, "learning_rate": 6.95020356176678e-06, "loss": 0.7371, "step": 4790 }, { "epoch": 0.22839844587991323, "grad_norm": 5.80994176864624, "learning_rate": 6.946462957393019e-06, "loss": 0.8319, "step": 4791 }, { "epoch": 0.22844611827521275, "grad_norm": 2.279386043548584, "learning_rate": 6.94272282414858e-06, "loss": 0.7302, "step": 4792 }, { "epoch": 0.22849379067051223, "grad_norm": 1.6140658855438232, "learning_rate": 6.938983162610522e-06, "loss": 0.6806, "step": 4793 }, { "epoch": 0.22854146306581175, "grad_norm": 1.4206973314285278, "learning_rate": 6.935243973355839e-06, "loss": 0.7579, "step": 4794 }, { "epoch": 0.22858913546111123, "grad_norm": 1.15697181224823, "learning_rate": 6.931505256961454e-06, "loss": 0.3138, "step": 4795 }, { "epoch": 0.22863680785641075, "grad_norm": 1.5886199474334717, "learning_rate": 6.9277670140042055e-06, "loss": 0.5657, "step": 4796 }, { "epoch": 0.22868448025171026, "grad_norm": 1.677321195602417, "learning_rate": 6.924029245060868e-06, "loss": 0.7028, "step": 4797 }, { "epoch": 0.22873215264700975, "grad_norm": 1.933491826057434, "learning_rate": 6.920291950708144e-06, "loss": 0.9893, "step": 4798 }, { "epoch": 0.22877982504230926, "grad_norm": 1.786095142364502, "learning_rate": 6.916555131522657e-06, "loss": 0.5504, "step": 4799 }, { "epoch": 0.22882749743760875, "grad_norm": 0.9630317687988281, "learning_rate": 6.9128187880809595e-06, "loss": 0.3825, "step": 4800 }, { "epoch": 0.22887516983290826, "grad_norm": 1.4966472387313843, "learning_rate": 6.909082920959534e-06, "loss": 0.9508, "step": 4801 }, { "epoch": 0.22892284222820775, "grad_norm": 1.7875064611434937, "learning_rate": 6.905347530734778e-06, "loss": 0.6317, "step": 4802 }, { "epoch": 0.22897051462350726, "grad_norm": 1.6654691696166992, "learning_rate": 6.90161261798303e-06, "loss": 0.9723, "step": 4803 }, { "epoch": 0.22901818701880677, "grad_norm": 1.6923191547393799, "learning_rate": 6.897878183280553e-06, "loss": 0.4879, "step": 4804 }, { "epoch": 0.22906585941410626, "grad_norm": 2.3821890354156494, "learning_rate": 6.894144227203521e-06, "loss": 0.2866, "step": 4805 }, { "epoch": 0.22911353180940577, "grad_norm": 1.1901254653930664, "learning_rate": 6.890410750328054e-06, "loss": 0.7887, "step": 4806 }, { "epoch": 0.22916120420470526, "grad_norm": 4.020439624786377, "learning_rate": 6.886677753230184e-06, "loss": 0.3801, "step": 4807 }, { "epoch": 0.22920887660000477, "grad_norm": 1.9434716701507568, "learning_rate": 6.8829452364858776e-06, "loss": 0.5932, "step": 4808 }, { "epoch": 0.22925654899530426, "grad_norm": 1.2128325700759888, "learning_rate": 6.8792132006710175e-06, "loss": 0.784, "step": 4809 }, { "epoch": 0.22930422139060377, "grad_norm": 2.2217557430267334, "learning_rate": 6.875481646361428e-06, "loss": 0.6519, "step": 4810 }, { "epoch": 0.22935189378590326, "grad_norm": 0.9565367102622986, "learning_rate": 6.871750574132841e-06, "loss": 0.5196, "step": 4811 }, { "epoch": 0.22939956618120277, "grad_norm": 1.2641178369522095, "learning_rate": 6.868019984560925e-06, "loss": 0.7952, "step": 4812 }, { "epoch": 0.2294472385765023, "grad_norm": 2.4499106407165527, "learning_rate": 6.864289878221275e-06, "loss": 0.655, "step": 4813 }, { "epoch": 0.22949491097180177, "grad_norm": 0.9127827882766724, "learning_rate": 6.8605602556894056e-06, "loss": 0.3018, "step": 4814 }, { "epoch": 0.2295425833671013, "grad_norm": 1.6209328174591064, "learning_rate": 6.8568311175407546e-06, "loss": 0.7411, "step": 4815 }, { "epoch": 0.22959025576240077, "grad_norm": 1.4945902824401855, "learning_rate": 6.853102464350698e-06, "loss": 0.7779, "step": 4816 }, { "epoch": 0.2296379281577003, "grad_norm": 2.6084814071655273, "learning_rate": 6.849374296694522e-06, "loss": 0.6965, "step": 4817 }, { "epoch": 0.22968560055299977, "grad_norm": 1.3233808279037476, "learning_rate": 6.845646615147445e-06, "loss": 0.563, "step": 4818 }, { "epoch": 0.2297332729482993, "grad_norm": 1.3300272226333618, "learning_rate": 6.841919420284618e-06, "loss": 0.2962, "step": 4819 }, { "epoch": 0.2297809453435988, "grad_norm": 1.5891857147216797, "learning_rate": 6.8381927126810965e-06, "loss": 0.709, "step": 4820 }, { "epoch": 0.2298286177388983, "grad_norm": 1.7181260585784912, "learning_rate": 6.834466492911882e-06, "loss": 0.5133, "step": 4821 }, { "epoch": 0.2298762901341978, "grad_norm": 1.556504726409912, "learning_rate": 6.8307407615518865e-06, "loss": 0.8262, "step": 4822 }, { "epoch": 0.2299239625294973, "grad_norm": 1.3471596240997314, "learning_rate": 6.827015519175958e-06, "loss": 0.7419, "step": 4823 }, { "epoch": 0.2299716349247968, "grad_norm": 2.29951548576355, "learning_rate": 6.823290766358857e-06, "loss": 0.4747, "step": 4824 }, { "epoch": 0.2300193073200963, "grad_norm": 1.8909499645233154, "learning_rate": 6.819566503675274e-06, "loss": 0.37, "step": 4825 }, { "epoch": 0.2300669797153958, "grad_norm": 1.6186764240264893, "learning_rate": 6.815842731699834e-06, "loss": 0.2767, "step": 4826 }, { "epoch": 0.2301146521106953, "grad_norm": 2.155027151107788, "learning_rate": 6.812119451007067e-06, "loss": 0.6642, "step": 4827 }, { "epoch": 0.2301623245059948, "grad_norm": 2.7167179584503174, "learning_rate": 6.808396662171439e-06, "loss": 0.7051, "step": 4828 }, { "epoch": 0.23020999690129432, "grad_norm": 3.2606234550476074, "learning_rate": 6.804674365767341e-06, "loss": 0.7588, "step": 4829 }, { "epoch": 0.2302576692965938, "grad_norm": 1.379020094871521, "learning_rate": 6.8009525623690805e-06, "loss": 0.8912, "step": 4830 }, { "epoch": 0.23030534169189332, "grad_norm": 1.795343041419983, "learning_rate": 6.797231252550895e-06, "loss": 0.6966, "step": 4831 }, { "epoch": 0.2303530140871928, "grad_norm": 1.3029494285583496, "learning_rate": 6.793510436886951e-06, "loss": 0.804, "step": 4832 }, { "epoch": 0.23040068648249232, "grad_norm": 1.163956880569458, "learning_rate": 6.78979011595132e-06, "loss": 0.9016, "step": 4833 }, { "epoch": 0.2304483588777918, "grad_norm": 1.42453134059906, "learning_rate": 6.7860702903180165e-06, "loss": 0.6412, "step": 4834 }, { "epoch": 0.23049603127309132, "grad_norm": 1.3722329139709473, "learning_rate": 6.782350960560973e-06, "loss": 0.7591, "step": 4835 }, { "epoch": 0.23054370366839083, "grad_norm": 1.4869694709777832, "learning_rate": 6.778632127254039e-06, "loss": 0.6528, "step": 4836 }, { "epoch": 0.23059137606369032, "grad_norm": 1.0891196727752686, "learning_rate": 6.774913790970994e-06, "loss": 0.5512, "step": 4837 }, { "epoch": 0.23063904845898983, "grad_norm": 1.421276569366455, "learning_rate": 6.771195952285541e-06, "loss": 0.2438, "step": 4838 }, { "epoch": 0.23068672085428932, "grad_norm": 1.6381943225860596, "learning_rate": 6.7674786117712985e-06, "loss": 1.0994, "step": 4839 }, { "epoch": 0.23073439324958883, "grad_norm": 1.414202094078064, "learning_rate": 6.763761770001817e-06, "loss": 0.6413, "step": 4840 }, { "epoch": 0.23078206564488832, "grad_norm": 1.2982137203216553, "learning_rate": 6.760045427550574e-06, "loss": 0.6958, "step": 4841 }, { "epoch": 0.23082973804018783, "grad_norm": 3.3846471309661865, "learning_rate": 6.75632958499095e-06, "loss": 1.2324, "step": 4842 }, { "epoch": 0.23087741043548732, "grad_norm": 1.6973671913146973, "learning_rate": 6.752614242896271e-06, "loss": 0.7026, "step": 4843 }, { "epoch": 0.23092508283078683, "grad_norm": 0.733379602432251, "learning_rate": 6.748899401839774e-06, "loss": 0.2149, "step": 4844 }, { "epoch": 0.23097275522608635, "grad_norm": 2.3345930576324463, "learning_rate": 6.745185062394617e-06, "loss": 0.6395, "step": 4845 }, { "epoch": 0.23102042762138583, "grad_norm": 2.2964675426483154, "learning_rate": 6.741471225133886e-06, "loss": 1.4405, "step": 4846 }, { "epoch": 0.23106810001668535, "grad_norm": 1.3115711212158203, "learning_rate": 6.737757890630593e-06, "loss": 0.9501, "step": 4847 }, { "epoch": 0.23111577241198483, "grad_norm": 1.46547269821167, "learning_rate": 6.734045059457658e-06, "loss": 0.8141, "step": 4848 }, { "epoch": 0.23116344480728435, "grad_norm": 1.3806743621826172, "learning_rate": 6.7303327321879375e-06, "loss": 0.6733, "step": 4849 }, { "epoch": 0.23121111720258383, "grad_norm": 1.0296831130981445, "learning_rate": 6.7266209093942104e-06, "loss": 0.6701, "step": 4850 }, { "epoch": 0.23125878959788335, "grad_norm": 2.1191413402557373, "learning_rate": 6.722909591649163e-06, "loss": 0.6614, "step": 4851 }, { "epoch": 0.23130646199318286, "grad_norm": 1.48680579662323, "learning_rate": 6.7191987795254195e-06, "loss": 0.765, "step": 4852 }, { "epoch": 0.23135413438848235, "grad_norm": 1.0230731964111328, "learning_rate": 6.715488473595522e-06, "loss": 0.6681, "step": 4853 }, { "epoch": 0.23140180678378186, "grad_norm": 1.825308918952942, "learning_rate": 6.7117786744319235e-06, "loss": 0.7449, "step": 4854 }, { "epoch": 0.23144947917908135, "grad_norm": 1.675789475440979, "learning_rate": 6.708069382607015e-06, "loss": 0.7855, "step": 4855 }, { "epoch": 0.23149715157438086, "grad_norm": 1.8206390142440796, "learning_rate": 6.704360598693103e-06, "loss": 0.541, "step": 4856 }, { "epoch": 0.23154482396968035, "grad_norm": 2.795661449432373, "learning_rate": 6.700652323262409e-06, "loss": 1.0867, "step": 4857 }, { "epoch": 0.23159249636497986, "grad_norm": 1.0484724044799805, "learning_rate": 6.696944556887086e-06, "loss": 0.41, "step": 4858 }, { "epoch": 0.23164016876027935, "grad_norm": 2.431704044342041, "learning_rate": 6.693237300139201e-06, "loss": 0.6234, "step": 4859 }, { "epoch": 0.23168784115557886, "grad_norm": 1.1742595434188843, "learning_rate": 6.6895305535907515e-06, "loss": 0.6184, "step": 4860 }, { "epoch": 0.23173551355087837, "grad_norm": 1.6406340599060059, "learning_rate": 6.6858243178136425e-06, "loss": 0.5902, "step": 4861 }, { "epoch": 0.23178318594617786, "grad_norm": 4.000977039337158, "learning_rate": 6.682118593379713e-06, "loss": 1.3003, "step": 4862 }, { "epoch": 0.23183085834147737, "grad_norm": 2.7752816677093506, "learning_rate": 6.67841338086072e-06, "loss": 0.9206, "step": 4863 }, { "epoch": 0.23187853073677686, "grad_norm": 1.4836485385894775, "learning_rate": 6.674708680828332e-06, "loss": 0.5691, "step": 4864 }, { "epoch": 0.23192620313207638, "grad_norm": 1.0134133100509644, "learning_rate": 6.671004493854154e-06, "loss": 0.624, "step": 4865 }, { "epoch": 0.23197387552737586, "grad_norm": 1.5512118339538574, "learning_rate": 6.6673008205097e-06, "loss": 0.5365, "step": 4866 }, { "epoch": 0.23202154792267538, "grad_norm": 1.4838773012161255, "learning_rate": 6.66359766136641e-06, "loss": 0.4869, "step": 4867 }, { "epoch": 0.2320692203179749, "grad_norm": 2.7635140419006348, "learning_rate": 6.659895016995639e-06, "loss": 0.4451, "step": 4868 }, { "epoch": 0.23211689271327438, "grad_norm": 2.415874719619751, "learning_rate": 6.656192887968675e-06, "loss": 1.0728, "step": 4869 }, { "epoch": 0.2321645651085739, "grad_norm": 1.112857699394226, "learning_rate": 6.652491274856711e-06, "loss": 0.5404, "step": 4870 }, { "epoch": 0.23221223750387338, "grad_norm": 2.001843214035034, "learning_rate": 6.6487901782308685e-06, "loss": 0.6603, "step": 4871 }, { "epoch": 0.2322599098991729, "grad_norm": 1.4128066301345825, "learning_rate": 6.645089598662197e-06, "loss": 0.8395, "step": 4872 }, { "epoch": 0.23230758229447238, "grad_norm": 1.0326107740402222, "learning_rate": 6.641389536721646e-06, "loss": 0.393, "step": 4873 }, { "epoch": 0.2323552546897719, "grad_norm": 1.463822364807129, "learning_rate": 6.637689992980105e-06, "loss": 0.6671, "step": 4874 }, { "epoch": 0.23240292708507138, "grad_norm": 1.4672478437423706, "learning_rate": 6.633990968008374e-06, "loss": 0.775, "step": 4875 }, { "epoch": 0.2324505994803709, "grad_norm": 1.0843429565429688, "learning_rate": 6.630292462377172e-06, "loss": 0.6534, "step": 4876 }, { "epoch": 0.2324982718756704, "grad_norm": 1.7352875471115112, "learning_rate": 6.62659447665714e-06, "loss": 0.6898, "step": 4877 }, { "epoch": 0.2325459442709699, "grad_norm": 1.6229850053787231, "learning_rate": 6.622897011418845e-06, "loss": 1.1481, "step": 4878 }, { "epoch": 0.2325936166662694, "grad_norm": 1.8289846181869507, "learning_rate": 6.619200067232758e-06, "loss": 0.3906, "step": 4879 }, { "epoch": 0.2326412890615689, "grad_norm": 1.3244634866714478, "learning_rate": 6.6155036446692895e-06, "loss": 0.812, "step": 4880 }, { "epoch": 0.2326889614568684, "grad_norm": 1.527397871017456, "learning_rate": 6.6118077442987545e-06, "loss": 0.8692, "step": 4881 }, { "epoch": 0.2327366338521679, "grad_norm": 2.2919743061065674, "learning_rate": 6.608112366691393e-06, "loss": 0.5707, "step": 4882 }, { "epoch": 0.2327843062474674, "grad_norm": 3.870281934738159, "learning_rate": 6.604417512417362e-06, "loss": 0.2523, "step": 4883 }, { "epoch": 0.23283197864276692, "grad_norm": 1.2963998317718506, "learning_rate": 6.600723182046744e-06, "loss": 0.6588, "step": 4884 }, { "epoch": 0.2328796510380664, "grad_norm": 1.6692534685134888, "learning_rate": 6.5970293761495305e-06, "loss": 0.7753, "step": 4885 }, { "epoch": 0.23292732343336592, "grad_norm": 1.7019098997116089, "learning_rate": 6.593336095295639e-06, "loss": 0.6004, "step": 4886 }, { "epoch": 0.2329749958286654, "grad_norm": 1.8671505451202393, "learning_rate": 6.589643340054911e-06, "loss": 0.9987, "step": 4887 }, { "epoch": 0.23302266822396492, "grad_norm": 1.5101096630096436, "learning_rate": 6.585951110997092e-06, "loss": 0.8756, "step": 4888 }, { "epoch": 0.2330703406192644, "grad_norm": 1.9507817029953003, "learning_rate": 6.58225940869186e-06, "loss": 1.2067, "step": 4889 }, { "epoch": 0.23311801301456392, "grad_norm": 3.053358554840088, "learning_rate": 6.5785682337088085e-06, "loss": 0.6946, "step": 4890 }, { "epoch": 0.23316568540986343, "grad_norm": 1.3668612241744995, "learning_rate": 6.574877586617439e-06, "loss": 0.5651, "step": 4891 }, { "epoch": 0.23321335780516292, "grad_norm": 1.4881539344787598, "learning_rate": 6.571187467987187e-06, "loss": 0.656, "step": 4892 }, { "epoch": 0.23326103020046243, "grad_norm": 1.0348079204559326, "learning_rate": 6.567497878387402e-06, "loss": 0.4377, "step": 4893 }, { "epoch": 0.23330870259576192, "grad_norm": 1.7877459526062012, "learning_rate": 6.563808818387342e-06, "loss": 0.793, "step": 4894 }, { "epoch": 0.23335637499106143, "grad_norm": 2.9496564865112305, "learning_rate": 6.560120288556197e-06, "loss": 0.5007, "step": 4895 }, { "epoch": 0.23340404738636092, "grad_norm": 1.8644105195999146, "learning_rate": 6.5564322894630705e-06, "loss": 0.7221, "step": 4896 }, { "epoch": 0.23345171978166043, "grad_norm": 2.8216748237609863, "learning_rate": 6.552744821676978e-06, "loss": 0.8326, "step": 4897 }, { "epoch": 0.23349939217695992, "grad_norm": 1.0786311626434326, "learning_rate": 6.549057885766859e-06, "loss": 0.6261, "step": 4898 }, { "epoch": 0.23354706457225943, "grad_norm": 1.6184810400009155, "learning_rate": 6.545371482301568e-06, "loss": 0.6217, "step": 4899 }, { "epoch": 0.23359473696755895, "grad_norm": 1.9159892797470093, "learning_rate": 6.5416856118498874e-06, "loss": 0.7225, "step": 4900 }, { "epoch": 0.23364240936285843, "grad_norm": 2.7812931537628174, "learning_rate": 6.538000274980498e-06, "loss": 0.7393, "step": 4901 }, { "epoch": 0.23369008175815795, "grad_norm": 2.1245625019073486, "learning_rate": 6.5343154722620174e-06, "loss": 0.7782, "step": 4902 }, { "epoch": 0.23373775415345743, "grad_norm": 2.2408251762390137, "learning_rate": 6.53063120426297e-06, "loss": 1.0819, "step": 4903 }, { "epoch": 0.23378542654875695, "grad_norm": 1.921726107597351, "learning_rate": 6.526947471551799e-06, "loss": 0.6288, "step": 4904 }, { "epoch": 0.23383309894405643, "grad_norm": 2.049384593963623, "learning_rate": 6.5232642746968655e-06, "loss": 0.6967, "step": 4905 }, { "epoch": 0.23388077133935595, "grad_norm": 1.054907202720642, "learning_rate": 6.519581614266456e-06, "loss": 0.8852, "step": 4906 }, { "epoch": 0.23392844373465546, "grad_norm": 1.905211091041565, "learning_rate": 6.515899490828758e-06, "loss": 0.5267, "step": 4907 }, { "epoch": 0.23397611612995495, "grad_norm": 2.7187602519989014, "learning_rate": 6.512217904951889e-06, "loss": 1.2232, "step": 4908 }, { "epoch": 0.23402378852525446, "grad_norm": 1.1157643795013428, "learning_rate": 6.508536857203884e-06, "loss": 0.4358, "step": 4909 }, { "epoch": 0.23407146092055395, "grad_norm": 1.4176524877548218, "learning_rate": 6.504856348152682e-06, "loss": 0.6752, "step": 4910 }, { "epoch": 0.23411913331585346, "grad_norm": 1.8605366945266724, "learning_rate": 6.5011763783661564e-06, "loss": 0.5545, "step": 4911 }, { "epoch": 0.23416680571115295, "grad_norm": 7.0309834480285645, "learning_rate": 6.497496948412085e-06, "loss": 0.5398, "step": 4912 }, { "epoch": 0.23421447810645246, "grad_norm": 1.5447821617126465, "learning_rate": 6.493818058858161e-06, "loss": 0.4848, "step": 4913 }, { "epoch": 0.23426215050175195, "grad_norm": 0.7650623321533203, "learning_rate": 6.490139710272005e-06, "loss": 0.2541, "step": 4914 }, { "epoch": 0.23430982289705146, "grad_norm": 3.0292134284973145, "learning_rate": 6.486461903221153e-06, "loss": 0.8596, "step": 4915 }, { "epoch": 0.23435749529235098, "grad_norm": 1.4401601552963257, "learning_rate": 6.482784638273041e-06, "loss": 0.3891, "step": 4916 }, { "epoch": 0.23440516768765046, "grad_norm": 1.506598711013794, "learning_rate": 6.479107915995038e-06, "loss": 0.7357, "step": 4917 }, { "epoch": 0.23445284008294998, "grad_norm": 1.2489334344863892, "learning_rate": 6.475431736954431e-06, "loss": 0.7784, "step": 4918 }, { "epoch": 0.23450051247824946, "grad_norm": 2.1433074474334717, "learning_rate": 6.471756101718408e-06, "loss": 0.6334, "step": 4919 }, { "epoch": 0.23454818487354898, "grad_norm": 2.1819376945495605, "learning_rate": 6.468081010854084e-06, "loss": 0.4933, "step": 4920 }, { "epoch": 0.23459585726884846, "grad_norm": 1.8046916723251343, "learning_rate": 6.46440646492849e-06, "loss": 0.4785, "step": 4921 }, { "epoch": 0.23464352966414798, "grad_norm": 1.495320439338684, "learning_rate": 6.460732464508567e-06, "loss": 0.9471, "step": 4922 }, { "epoch": 0.2346912020594475, "grad_norm": 1.4423713684082031, "learning_rate": 6.4570590101611765e-06, "loss": 0.7328, "step": 4923 }, { "epoch": 0.23473887445474698, "grad_norm": 2.900038719177246, "learning_rate": 6.453386102453099e-06, "loss": 0.8906, "step": 4924 }, { "epoch": 0.2347865468500465, "grad_norm": 1.3181345462799072, "learning_rate": 6.449713741951021e-06, "loss": 0.731, "step": 4925 }, { "epoch": 0.23483421924534598, "grad_norm": 1.8139119148254395, "learning_rate": 6.446041929221551e-06, "loss": 0.7712, "step": 4926 }, { "epoch": 0.2348818916406455, "grad_norm": 3.980682611465454, "learning_rate": 6.442370664831214e-06, "loss": 0.7389, "step": 4927 }, { "epoch": 0.23492956403594498, "grad_norm": 7.2360968589782715, "learning_rate": 6.438699949346446e-06, "loss": 1.4024, "step": 4928 }, { "epoch": 0.2349772364312445, "grad_norm": 5.406827449798584, "learning_rate": 6.435029783333599e-06, "loss": 1.0036, "step": 4929 }, { "epoch": 0.23502490882654398, "grad_norm": 6.298873424530029, "learning_rate": 6.431360167358951e-06, "loss": 0.6144, "step": 4930 }, { "epoch": 0.2350725812218435, "grad_norm": 2.6825435161590576, "learning_rate": 6.427691101988673e-06, "loss": 0.598, "step": 4931 }, { "epoch": 0.235120253617143, "grad_norm": 1.6035325527191162, "learning_rate": 6.424022587788872e-06, "loss": 0.6953, "step": 4932 }, { "epoch": 0.2351679260124425, "grad_norm": 2.0679221153259277, "learning_rate": 6.4203546253255635e-06, "loss": 1.0477, "step": 4933 }, { "epoch": 0.235215598407742, "grad_norm": 1.2404496669769287, "learning_rate": 6.416687215164671e-06, "loss": 0.565, "step": 4934 }, { "epoch": 0.2352632708030415, "grad_norm": 1.0773966312408447, "learning_rate": 6.413020357872038e-06, "loss": 0.3685, "step": 4935 }, { "epoch": 0.235310943198341, "grad_norm": 2.640958070755005, "learning_rate": 6.409354054013425e-06, "loss": 0.9201, "step": 4936 }, { "epoch": 0.2353586155936405, "grad_norm": 1.9813711643218994, "learning_rate": 6.405688304154509e-06, "loss": 0.9941, "step": 4937 }, { "epoch": 0.23540628798894, "grad_norm": 1.335095763206482, "learning_rate": 6.4020231088608695e-06, "loss": 0.7447, "step": 4938 }, { "epoch": 0.23545396038423952, "grad_norm": 2.280273914337158, "learning_rate": 6.398358468698013e-06, "loss": 0.4129, "step": 4939 }, { "epoch": 0.235501632779539, "grad_norm": 2.7066197395324707, "learning_rate": 6.394694384231358e-06, "loss": 0.4934, "step": 4940 }, { "epoch": 0.23554930517483852, "grad_norm": 1.5054798126220703, "learning_rate": 6.3910308560262305e-06, "loss": 0.5794, "step": 4941 }, { "epoch": 0.235596977570138, "grad_norm": 1.2112187147140503, "learning_rate": 6.387367884647875e-06, "loss": 0.606, "step": 4942 }, { "epoch": 0.23564464996543752, "grad_norm": 1.5958924293518066, "learning_rate": 6.383705470661456e-06, "loss": 0.7443, "step": 4943 }, { "epoch": 0.235692322360737, "grad_norm": 2.5258569717407227, "learning_rate": 6.380043614632037e-06, "loss": 0.8325, "step": 4944 }, { "epoch": 0.23573999475603652, "grad_norm": 1.2582625150680542, "learning_rate": 6.376382317124612e-06, "loss": 0.6065, "step": 4945 }, { "epoch": 0.235787667151336, "grad_norm": 1.5497052669525146, "learning_rate": 6.372721578704082e-06, "loss": 0.527, "step": 4946 }, { "epoch": 0.23583533954663552, "grad_norm": 2.164127826690674, "learning_rate": 6.369061399935255e-06, "loss": 0.8307, "step": 4947 }, { "epoch": 0.23588301194193503, "grad_norm": 2.7174575328826904, "learning_rate": 6.365401781382865e-06, "loss": 1.0457, "step": 4948 }, { "epoch": 0.23593068433723452, "grad_norm": 1.983340859413147, "learning_rate": 6.361742723611551e-06, "loss": 1.0276, "step": 4949 }, { "epoch": 0.23597835673253403, "grad_norm": 1.5497627258300781, "learning_rate": 6.358084227185866e-06, "loss": 0.6036, "step": 4950 }, { "epoch": 0.23602602912783352, "grad_norm": 1.4730205535888672, "learning_rate": 6.354426292670279e-06, "loss": 0.7759, "step": 4951 }, { "epoch": 0.23607370152313303, "grad_norm": 2.397778272628784, "learning_rate": 6.350768920629179e-06, "loss": 0.7611, "step": 4952 }, { "epoch": 0.23612137391843252, "grad_norm": 1.3542039394378662, "learning_rate": 6.3471121116268494e-06, "loss": 0.9697, "step": 4953 }, { "epoch": 0.23616904631373203, "grad_norm": 1.4054028987884521, "learning_rate": 6.343455866227504e-06, "loss": 0.8482, "step": 4954 }, { "epoch": 0.23621671870903155, "grad_norm": 0.9641540050506592, "learning_rate": 6.339800184995266e-06, "loss": 0.5462, "step": 4955 }, { "epoch": 0.23626439110433103, "grad_norm": 1.470791220664978, "learning_rate": 6.3361450684941664e-06, "loss": 0.5811, "step": 4956 }, { "epoch": 0.23631206349963055, "grad_norm": 2.747241973876953, "learning_rate": 6.332490517288148e-06, "loss": 0.6081, "step": 4957 }, { "epoch": 0.23635973589493003, "grad_norm": 1.2244296073913574, "learning_rate": 6.328836531941081e-06, "loss": 0.6703, "step": 4958 }, { "epoch": 0.23640740829022955, "grad_norm": 1.3257007598876953, "learning_rate": 6.3251831130167264e-06, "loss": 0.8026, "step": 4959 }, { "epoch": 0.23645508068552903, "grad_norm": 2.4073638916015625, "learning_rate": 6.321530261078774e-06, "loss": 0.9719, "step": 4960 }, { "epoch": 0.23650275308082855, "grad_norm": 2.8806135654449463, "learning_rate": 6.317877976690826e-06, "loss": 1.3026, "step": 4961 }, { "epoch": 0.23655042547612803, "grad_norm": 2.24585223197937, "learning_rate": 6.314226260416383e-06, "loss": 0.9502, "step": 4962 }, { "epoch": 0.23659809787142755, "grad_norm": 1.8365782499313354, "learning_rate": 6.3105751128188756e-06, "loss": 1.238, "step": 4963 }, { "epoch": 0.23664577026672706, "grad_norm": 2.0853514671325684, "learning_rate": 6.306924534461633e-06, "loss": 0.7174, "step": 4964 }, { "epoch": 0.23669344266202655, "grad_norm": 2.7662477493286133, "learning_rate": 6.303274525907903e-06, "loss": 0.6062, "step": 4965 }, { "epoch": 0.23674111505732606, "grad_norm": 1.472198724746704, "learning_rate": 6.299625087720844e-06, "loss": 0.9402, "step": 4966 }, { "epoch": 0.23678878745262555, "grad_norm": 1.3685739040374756, "learning_rate": 6.295976220463531e-06, "loss": 0.7949, "step": 4967 }, { "epoch": 0.23683645984792506, "grad_norm": 1.2950966358184814, "learning_rate": 6.2923279246989385e-06, "loss": 0.5695, "step": 4968 }, { "epoch": 0.23688413224322455, "grad_norm": 3.6497159004211426, "learning_rate": 6.288680200989967e-06, "loss": 1.0519, "step": 4969 }, { "epoch": 0.23693180463852406, "grad_norm": 1.731481909751892, "learning_rate": 6.2850330498994235e-06, "loss": 0.9411, "step": 4970 }, { "epoch": 0.23697947703382358, "grad_norm": 1.3376823663711548, "learning_rate": 6.281386471990021e-06, "loss": 0.7521, "step": 4971 }, { "epoch": 0.23702714942912306, "grad_norm": 1.3269504308700562, "learning_rate": 6.277740467824394e-06, "loss": 0.6383, "step": 4972 }, { "epoch": 0.23707482182442258, "grad_norm": 1.881067156791687, "learning_rate": 6.2740950379650775e-06, "loss": 0.6847, "step": 4973 }, { "epoch": 0.23712249421972206, "grad_norm": 1.7035162448883057, "learning_rate": 6.270450182974532e-06, "loss": 0.9515, "step": 4974 }, { "epoch": 0.23717016661502158, "grad_norm": 3.9568660259246826, "learning_rate": 6.266805903415112e-06, "loss": 1.1311, "step": 4975 }, { "epoch": 0.23721783901032106, "grad_norm": 1.3612909317016602, "learning_rate": 6.2631621998490965e-06, "loss": 0.8954, "step": 4976 }, { "epoch": 0.23726551140562058, "grad_norm": 1.2040951251983643, "learning_rate": 6.259519072838676e-06, "loss": 0.6272, "step": 4977 }, { "epoch": 0.23731318380092006, "grad_norm": 2.760911226272583, "learning_rate": 6.255876522945941e-06, "loss": 0.3456, "step": 4978 }, { "epoch": 0.23736085619621958, "grad_norm": 2.025914192199707, "learning_rate": 6.2522345507329e-06, "loss": 0.8398, "step": 4979 }, { "epoch": 0.2374085285915191, "grad_norm": 1.8682208061218262, "learning_rate": 6.248593156761477e-06, "loss": 0.8555, "step": 4980 }, { "epoch": 0.23745620098681858, "grad_norm": 1.2527092695236206, "learning_rate": 6.244952341593493e-06, "loss": 0.4326, "step": 4981 }, { "epoch": 0.2375038733821181, "grad_norm": 2.424462080001831, "learning_rate": 6.2413121057906934e-06, "loss": 0.8241, "step": 4982 }, { "epoch": 0.23755154577741758, "grad_norm": 1.5429975986480713, "learning_rate": 6.237672449914734e-06, "loss": 0.6146, "step": 4983 }, { "epoch": 0.2375992181727171, "grad_norm": 2.2488510608673096, "learning_rate": 6.234033374527166e-06, "loss": 0.8928, "step": 4984 }, { "epoch": 0.23764689056801658, "grad_norm": 1.3556400537490845, "learning_rate": 6.230394880189468e-06, "loss": 0.8557, "step": 4985 }, { "epoch": 0.2376945629633161, "grad_norm": 2.1810364723205566, "learning_rate": 6.226756967463023e-06, "loss": 0.7507, "step": 4986 }, { "epoch": 0.2377422353586156, "grad_norm": 2.3733267784118652, "learning_rate": 6.223119636909118e-06, "loss": 0.7715, "step": 4987 }, { "epoch": 0.2377899077539151, "grad_norm": 1.7758448123931885, "learning_rate": 6.219482889088959e-06, "loss": 0.7934, "step": 4988 }, { "epoch": 0.2378375801492146, "grad_norm": 2.1177680492401123, "learning_rate": 6.215846724563661e-06, "loss": 0.6231, "step": 4989 }, { "epoch": 0.2378852525445141, "grad_norm": 2.7689030170440674, "learning_rate": 6.21221114389424e-06, "loss": 0.7026, "step": 4990 }, { "epoch": 0.2379329249398136, "grad_norm": 1.3081248998641968, "learning_rate": 6.208576147641634e-06, "loss": 0.6489, "step": 4991 }, { "epoch": 0.2379805973351131, "grad_norm": 0.6183000206947327, "learning_rate": 6.204941736366688e-06, "loss": 0.2264, "step": 4992 }, { "epoch": 0.2380282697304126, "grad_norm": 1.402753472328186, "learning_rate": 6.2013079106301454e-06, "loss": 0.6537, "step": 4993 }, { "epoch": 0.23807594212571212, "grad_norm": 2.1279776096343994, "learning_rate": 6.1976746709926775e-06, "loss": 0.8205, "step": 4994 }, { "epoch": 0.2381236145210116, "grad_norm": 2.560328245162964, "learning_rate": 6.194042018014852e-06, "loss": 0.436, "step": 4995 }, { "epoch": 0.23817128691631112, "grad_norm": 1.2721821069717407, "learning_rate": 6.1904099522571445e-06, "loss": 0.6816, "step": 4996 }, { "epoch": 0.2382189593116106, "grad_norm": 2.703101396560669, "learning_rate": 6.186778474279951e-06, "loss": 0.5278, "step": 4997 }, { "epoch": 0.23826663170691012, "grad_norm": 1.4805877208709717, "learning_rate": 6.183147584643575e-06, "loss": 0.8678, "step": 4998 }, { "epoch": 0.2383143041022096, "grad_norm": 1.6941044330596924, "learning_rate": 6.179517283908217e-06, "loss": 0.7747, "step": 4999 }, { "epoch": 0.23836197649750912, "grad_norm": 1.2779852151870728, "learning_rate": 6.175887572633998e-06, "loss": 0.6392, "step": 5000 }, { "epoch": 0.2384096488928086, "grad_norm": 1.781387209892273, "learning_rate": 6.172258451380949e-06, "loss": 0.3088, "step": 5001 }, { "epoch": 0.23845732128810812, "grad_norm": 2.359119415283203, "learning_rate": 6.168629920709002e-06, "loss": 0.8953, "step": 5002 }, { "epoch": 0.23850499368340763, "grad_norm": 2.595407724380493, "learning_rate": 6.165001981178e-06, "loss": 0.9863, "step": 5003 }, { "epoch": 0.23855266607870712, "grad_norm": 2.4940426349639893, "learning_rate": 6.161374633347703e-06, "loss": 1.3752, "step": 5004 }, { "epoch": 0.23860033847400663, "grad_norm": 2.8280820846557617, "learning_rate": 6.157747877777766e-06, "loss": 1.2055, "step": 5005 }, { "epoch": 0.23864801086930612, "grad_norm": 1.4954427480697632, "learning_rate": 6.154121715027765e-06, "loss": 0.6753, "step": 5006 }, { "epoch": 0.23869568326460563, "grad_norm": 2.7824718952178955, "learning_rate": 6.150496145657183e-06, "loss": 0.5893, "step": 5007 }, { "epoch": 0.23874335565990512, "grad_norm": 1.3478643894195557, "learning_rate": 6.146871170225398e-06, "loss": 0.5745, "step": 5008 }, { "epoch": 0.23879102805520463, "grad_norm": 2.128234386444092, "learning_rate": 6.143246789291715e-06, "loss": 0.6522, "step": 5009 }, { "epoch": 0.23883870045050415, "grad_norm": 1.1305973529815674, "learning_rate": 6.139623003415336e-06, "loss": 0.781, "step": 5010 }, { "epoch": 0.23888637284580364, "grad_norm": 1.1940646171569824, "learning_rate": 6.135999813155371e-06, "loss": 0.6379, "step": 5011 }, { "epoch": 0.23893404524110315, "grad_norm": 1.6347004175186157, "learning_rate": 6.132377219070842e-06, "loss": 0.4579, "step": 5012 }, { "epoch": 0.23898171763640264, "grad_norm": 1.928120732307434, "learning_rate": 6.128755221720682e-06, "loss": 0.617, "step": 5013 }, { "epoch": 0.23902939003170215, "grad_norm": 1.2113068103790283, "learning_rate": 6.1251338216637255e-06, "loss": 0.5078, "step": 5014 }, { "epoch": 0.23907706242700164, "grad_norm": 1.2191598415374756, "learning_rate": 6.121513019458715e-06, "loss": 0.6691, "step": 5015 }, { "epoch": 0.23912473482230115, "grad_norm": 2.199414014816284, "learning_rate": 6.117892815664306e-06, "loss": 0.5423, "step": 5016 }, { "epoch": 0.23917240721760064, "grad_norm": 4.416726589202881, "learning_rate": 6.11427321083906e-06, "loss": 1.2491, "step": 5017 }, { "epoch": 0.23922007961290015, "grad_norm": 3.6274728775024414, "learning_rate": 6.110654205541438e-06, "loss": 0.538, "step": 5018 }, { "epoch": 0.23926775200819966, "grad_norm": 3.209824800491333, "learning_rate": 6.1070358003298215e-06, "loss": 1.2816, "step": 5019 }, { "epoch": 0.23931542440349915, "grad_norm": 1.406859040260315, "learning_rate": 6.103417995762493e-06, "loss": 0.7179, "step": 5020 }, { "epoch": 0.23936309679879866, "grad_norm": 1.953831672668457, "learning_rate": 6.099800792397636e-06, "loss": 0.2952, "step": 5021 }, { "epoch": 0.23941076919409815, "grad_norm": 2.9792706966400146, "learning_rate": 6.096184190793357e-06, "loss": 0.239, "step": 5022 }, { "epoch": 0.23945844158939766, "grad_norm": 1.1545838117599487, "learning_rate": 6.092568191507655e-06, "loss": 0.5336, "step": 5023 }, { "epoch": 0.23950611398469715, "grad_norm": 1.4073158502578735, "learning_rate": 6.088952795098442e-06, "loss": 0.8207, "step": 5024 }, { "epoch": 0.23955378637999666, "grad_norm": 2.970121145248413, "learning_rate": 6.085338002123534e-06, "loss": 0.6325, "step": 5025 }, { "epoch": 0.23960145877529618, "grad_norm": 1.9289273023605347, "learning_rate": 6.081723813140664e-06, "loss": 0.6894, "step": 5026 }, { "epoch": 0.23964913117059566, "grad_norm": 1.7830263376235962, "learning_rate": 6.078110228707454e-06, "loss": 0.8976, "step": 5027 }, { "epoch": 0.23969680356589518, "grad_norm": 1.3702656030654907, "learning_rate": 6.07449724938145e-06, "loss": 0.7642, "step": 5028 }, { "epoch": 0.23974447596119466, "grad_norm": 1.7167824506759644, "learning_rate": 6.0708848757200975e-06, "loss": 0.7334, "step": 5029 }, { "epoch": 0.23979214835649418, "grad_norm": 1.097110629081726, "learning_rate": 6.067273108280745e-06, "loss": 0.7213, "step": 5030 }, { "epoch": 0.23983982075179366, "grad_norm": 1.2784687280654907, "learning_rate": 6.0636619476206534e-06, "loss": 0.6287, "step": 5031 }, { "epoch": 0.23988749314709318, "grad_norm": 17.311893463134766, "learning_rate": 6.060051394296989e-06, "loss": 0.8498, "step": 5032 }, { "epoch": 0.23993516554239266, "grad_norm": 2.501488208770752, "learning_rate": 6.056441448866817e-06, "loss": 0.9281, "step": 5033 }, { "epoch": 0.23998283793769218, "grad_norm": 1.5192574262619019, "learning_rate": 6.052832111887117e-06, "loss": 0.7549, "step": 5034 }, { "epoch": 0.2400305103329917, "grad_norm": 1.5082474946975708, "learning_rate": 6.04922338391478e-06, "loss": 0.7535, "step": 5035 }, { "epoch": 0.24007818272829118, "grad_norm": 6.296762466430664, "learning_rate": 6.045615265506585e-06, "loss": 0.0771, "step": 5036 }, { "epoch": 0.2401258551235907, "grad_norm": 2.4316208362579346, "learning_rate": 6.0420077572192325e-06, "loss": 0.4872, "step": 5037 }, { "epoch": 0.24017352751889018, "grad_norm": 1.7005482912063599, "learning_rate": 6.038400859609327e-06, "loss": 0.668, "step": 5038 }, { "epoch": 0.2402211999141897, "grad_norm": 1.3777533769607544, "learning_rate": 6.034794573233371e-06, "loss": 0.2928, "step": 5039 }, { "epoch": 0.24026887230948918, "grad_norm": 2.814288854598999, "learning_rate": 6.031188898647776e-06, "loss": 0.5485, "step": 5040 }, { "epoch": 0.2403165447047887, "grad_norm": 6.852921485900879, "learning_rate": 6.027583836408868e-06, "loss": 0.6997, "step": 5041 }, { "epoch": 0.2403642171000882, "grad_norm": 1.5040916204452515, "learning_rate": 6.023979387072861e-06, "loss": 0.5527, "step": 5042 }, { "epoch": 0.2404118894953877, "grad_norm": 1.410093069076538, "learning_rate": 6.020375551195891e-06, "loss": 0.8848, "step": 5043 }, { "epoch": 0.2404595618906872, "grad_norm": 1.325937032699585, "learning_rate": 6.016772329333993e-06, "loss": 0.5595, "step": 5044 }, { "epoch": 0.2405072342859867, "grad_norm": 1.5293211936950684, "learning_rate": 6.013169722043104e-06, "loss": 0.6897, "step": 5045 }, { "epoch": 0.2405549066812862, "grad_norm": 1.9939907789230347, "learning_rate": 6.009567729879071e-06, "loss": 0.4316, "step": 5046 }, { "epoch": 0.2406025790765857, "grad_norm": 1.5865501165390015, "learning_rate": 6.005966353397643e-06, "loss": 0.9059, "step": 5047 }, { "epoch": 0.2406502514718852, "grad_norm": 1.3185622692108154, "learning_rate": 6.002365593154478e-06, "loss": 0.819, "step": 5048 }, { "epoch": 0.2406979238671847, "grad_norm": 1.858639121055603, "learning_rate": 5.998765449705131e-06, "loss": 0.5307, "step": 5049 }, { "epoch": 0.2407455962624842, "grad_norm": 1.3617935180664062, "learning_rate": 5.9951659236050695e-06, "loss": 0.8302, "step": 5050 }, { "epoch": 0.24079326865778372, "grad_norm": 1.3620930910110474, "learning_rate": 5.99156701540967e-06, "loss": 0.6854, "step": 5051 }, { "epoch": 0.2408409410530832, "grad_norm": 1.474213719367981, "learning_rate": 5.987968725674196e-06, "loss": 0.6022, "step": 5052 }, { "epoch": 0.24088861344838272, "grad_norm": 1.3608521223068237, "learning_rate": 5.9843710549538346e-06, "loss": 0.3048, "step": 5053 }, { "epoch": 0.2409362858436822, "grad_norm": 1.830669641494751, "learning_rate": 5.980774003803668e-06, "loss": 0.9192, "step": 5054 }, { "epoch": 0.24098395823898172, "grad_norm": 1.5334892272949219, "learning_rate": 5.977177572778679e-06, "loss": 0.5967, "step": 5055 }, { "epoch": 0.2410316306342812, "grad_norm": 3.262408971786499, "learning_rate": 5.973581762433763e-06, "loss": 0.5825, "step": 5056 }, { "epoch": 0.24107930302958072, "grad_norm": 1.8143086433410645, "learning_rate": 5.969986573323721e-06, "loss": 0.8515, "step": 5057 }, { "epoch": 0.24112697542488024, "grad_norm": 1.383774757385254, "learning_rate": 5.966392006003245e-06, "loss": 1.1035, "step": 5058 }, { "epoch": 0.24117464782017972, "grad_norm": 1.8638062477111816, "learning_rate": 5.9627980610269445e-06, "loss": 0.8606, "step": 5059 }, { "epoch": 0.24122232021547924, "grad_norm": 1.5183956623077393, "learning_rate": 5.959204738949334e-06, "loss": 0.7039, "step": 5060 }, { "epoch": 0.24126999261077872, "grad_norm": 2.2757411003112793, "learning_rate": 5.955612040324815e-06, "loss": 0.697, "step": 5061 }, { "epoch": 0.24131766500607824, "grad_norm": 1.1037594079971313, "learning_rate": 5.952019965707709e-06, "loss": 0.516, "step": 5062 }, { "epoch": 0.24136533740137772, "grad_norm": 2.3021318912506104, "learning_rate": 5.948428515652241e-06, "loss": 0.5897, "step": 5063 }, { "epoch": 0.24141300979667724, "grad_norm": 1.4584852457046509, "learning_rate": 5.944837690712524e-06, "loss": 0.8717, "step": 5064 }, { "epoch": 0.24146068219197672, "grad_norm": 1.1223253011703491, "learning_rate": 5.941247491442592e-06, "loss": 0.7506, "step": 5065 }, { "epoch": 0.24150835458727624, "grad_norm": 1.5056185722351074, "learning_rate": 5.9376579183963775e-06, "loss": 0.6489, "step": 5066 }, { "epoch": 0.24155602698257575, "grad_norm": 2.4089179039001465, "learning_rate": 5.9340689721277116e-06, "loss": 0.9086, "step": 5067 }, { "epoch": 0.24160369937787524, "grad_norm": 2.3049986362457275, "learning_rate": 5.930480653190331e-06, "loss": 0.4041, "step": 5068 }, { "epoch": 0.24165137177317475, "grad_norm": 1.4334743022918701, "learning_rate": 5.9268929621378805e-06, "loss": 0.6992, "step": 5069 }, { "epoch": 0.24169904416847424, "grad_norm": 1.4559268951416016, "learning_rate": 5.923305899523899e-06, "loss": 1.0898, "step": 5070 }, { "epoch": 0.24174671656377375, "grad_norm": 1.4106165170669556, "learning_rate": 5.919719465901834e-06, "loss": 0.5322, "step": 5071 }, { "epoch": 0.24179438895907324, "grad_norm": 1.2145572900772095, "learning_rate": 5.916133661825041e-06, "loss": 0.5322, "step": 5072 }, { "epoch": 0.24184206135437275, "grad_norm": 1.256256103515625, "learning_rate": 5.9125484878467635e-06, "loss": 0.5958, "step": 5073 }, { "epoch": 0.24188973374967226, "grad_norm": 3.8242619037628174, "learning_rate": 5.908963944520162e-06, "loss": 1.1024, "step": 5074 }, { "epoch": 0.24193740614497175, "grad_norm": 1.2726141214370728, "learning_rate": 5.9053800323982976e-06, "loss": 0.8752, "step": 5075 }, { "epoch": 0.24198507854027126, "grad_norm": 3.162296772003174, "learning_rate": 5.901796752034128e-06, "loss": 1.2186, "step": 5076 }, { "epoch": 0.24203275093557075, "grad_norm": 2.238673448562622, "learning_rate": 5.8982141039805115e-06, "loss": 0.7286, "step": 5077 }, { "epoch": 0.24208042333087026, "grad_norm": 1.8592641353607178, "learning_rate": 5.894632088790224e-06, "loss": 1.0557, "step": 5078 }, { "epoch": 0.24212809572616975, "grad_norm": 1.319214940071106, "learning_rate": 5.891050707015924e-06, "loss": 0.6095, "step": 5079 }, { "epoch": 0.24217576812146926, "grad_norm": 2.3480584621429443, "learning_rate": 5.887469959210186e-06, "loss": 0.4254, "step": 5080 }, { "epoch": 0.24222344051676878, "grad_norm": 1.9646015167236328, "learning_rate": 5.883889845925487e-06, "loss": 0.4465, "step": 5081 }, { "epoch": 0.24227111291206826, "grad_norm": 2.3036208152770996, "learning_rate": 5.880310367714192e-06, "loss": 0.7986, "step": 5082 }, { "epoch": 0.24231878530736778, "grad_norm": 2.5882246494293213, "learning_rate": 5.8767315251285854e-06, "loss": 0.7113, "step": 5083 }, { "epoch": 0.24236645770266727, "grad_norm": 2.2714028358459473, "learning_rate": 5.873153318720842e-06, "loss": 1.0477, "step": 5084 }, { "epoch": 0.24241413009796678, "grad_norm": 1.8316211700439453, "learning_rate": 5.869575749043044e-06, "loss": 0.793, "step": 5085 }, { "epoch": 0.24246180249326627, "grad_norm": 1.843741536140442, "learning_rate": 5.8659988166471715e-06, "loss": 1.0857, "step": 5086 }, { "epoch": 0.24250947488856578, "grad_norm": 3.6580004692077637, "learning_rate": 5.862422522085108e-06, "loss": 0.9047, "step": 5087 }, { "epoch": 0.24255714728386527, "grad_norm": 4.874394416809082, "learning_rate": 5.858846865908645e-06, "loss": 0.3122, "step": 5088 }, { "epoch": 0.24260481967916478, "grad_norm": 1.1394366025924683, "learning_rate": 5.855271848669462e-06, "loss": 0.6588, "step": 5089 }, { "epoch": 0.2426524920744643, "grad_norm": 1.784208059310913, "learning_rate": 5.851697470919151e-06, "loss": 0.4574, "step": 5090 }, { "epoch": 0.24270016446976378, "grad_norm": 2.3072550296783447, "learning_rate": 5.8481237332092014e-06, "loss": 0.5939, "step": 5091 }, { "epoch": 0.2427478368650633, "grad_norm": 1.3365013599395752, "learning_rate": 5.844550636091004e-06, "loss": 0.5374, "step": 5092 }, { "epoch": 0.24279550926036278, "grad_norm": 6.065027713775635, "learning_rate": 5.840978180115848e-06, "loss": 0.4395, "step": 5093 }, { "epoch": 0.2428431816556623, "grad_norm": 2.194348096847534, "learning_rate": 5.837406365834934e-06, "loss": 0.9389, "step": 5094 }, { "epoch": 0.24289085405096178, "grad_norm": 2.385526418685913, "learning_rate": 5.8338351937993476e-06, "loss": 0.7008, "step": 5095 }, { "epoch": 0.2429385264462613, "grad_norm": 1.4699710607528687, "learning_rate": 5.830264664560087e-06, "loss": 0.6011, "step": 5096 }, { "epoch": 0.2429861988415608, "grad_norm": 1.800408124923706, "learning_rate": 5.826694778668053e-06, "loss": 0.705, "step": 5097 }, { "epoch": 0.2430338712368603, "grad_norm": 1.5617139339447021, "learning_rate": 5.823125536674032e-06, "loss": 0.5671, "step": 5098 }, { "epoch": 0.2430815436321598, "grad_norm": 1.5853725671768188, "learning_rate": 5.81955693912873e-06, "loss": 0.973, "step": 5099 }, { "epoch": 0.2431292160274593, "grad_norm": 1.2248514890670776, "learning_rate": 5.815988986582745e-06, "loss": 0.5335, "step": 5100 }, { "epoch": 0.2431768884227588, "grad_norm": 1.5876938104629517, "learning_rate": 5.812421679586569e-06, "loss": 0.8773, "step": 5101 }, { "epoch": 0.2432245608180583, "grad_norm": 1.6629010438919067, "learning_rate": 5.808855018690607e-06, "loss": 0.8254, "step": 5102 }, { "epoch": 0.2432722332133578, "grad_norm": 2.62546706199646, "learning_rate": 5.805289004445155e-06, "loss": 0.4642, "step": 5103 }, { "epoch": 0.2433199056086573, "grad_norm": 1.9097936153411865, "learning_rate": 5.801723637400409e-06, "loss": 0.513, "step": 5104 }, { "epoch": 0.2433675780039568, "grad_norm": 2.2730860710144043, "learning_rate": 5.798158918106471e-06, "loss": 0.7113, "step": 5105 }, { "epoch": 0.24341525039925632, "grad_norm": 1.205796480178833, "learning_rate": 5.7945948471133466e-06, "loss": 0.729, "step": 5106 }, { "epoch": 0.2434629227945558, "grad_norm": 1.7572075128555298, "learning_rate": 5.791031424970926e-06, "loss": 0.8275, "step": 5107 }, { "epoch": 0.24351059518985532, "grad_norm": 1.4006179571151733, "learning_rate": 5.787468652229012e-06, "loss": 0.666, "step": 5108 }, { "epoch": 0.2435582675851548, "grad_norm": 1.7280313968658447, "learning_rate": 5.783906529437309e-06, "loss": 0.5724, "step": 5109 }, { "epoch": 0.24360593998045432, "grad_norm": 1.581921935081482, "learning_rate": 5.7803450571454066e-06, "loss": 0.8196, "step": 5110 }, { "epoch": 0.2436536123757538, "grad_norm": 2.0756044387817383, "learning_rate": 5.776784235902807e-06, "loss": 1.0043, "step": 5111 }, { "epoch": 0.24370128477105332, "grad_norm": 1.6152479648590088, "learning_rate": 5.773224066258913e-06, "loss": 0.6942, "step": 5112 }, { "epoch": 0.24374895716635284, "grad_norm": 1.7283798456192017, "learning_rate": 5.769664548763016e-06, "loss": 0.7455, "step": 5113 }, { "epoch": 0.24379662956165232, "grad_norm": 1.8666808605194092, "learning_rate": 5.766105683964314e-06, "loss": 0.868, "step": 5114 }, { "epoch": 0.24384430195695184, "grad_norm": 5.257474422454834, "learning_rate": 5.762547472411909e-06, "loss": 1.6388, "step": 5115 }, { "epoch": 0.24389197435225132, "grad_norm": 1.4893912076950073, "learning_rate": 5.758989914654787e-06, "loss": 0.305, "step": 5116 }, { "epoch": 0.24393964674755084, "grad_norm": 1.581531047821045, "learning_rate": 5.755433011241851e-06, "loss": 0.7188, "step": 5117 }, { "epoch": 0.24398731914285032, "grad_norm": 1.022304654121399, "learning_rate": 5.751876762721887e-06, "loss": 0.5168, "step": 5118 }, { "epoch": 0.24403499153814984, "grad_norm": 1.4325313568115234, "learning_rate": 5.748321169643596e-06, "loss": 0.7221, "step": 5119 }, { "epoch": 0.24408266393344932, "grad_norm": 3.6646790504455566, "learning_rate": 5.744766232555561e-06, "loss": 0.7195, "step": 5120 }, { "epoch": 0.24413033632874884, "grad_norm": 1.5659929513931274, "learning_rate": 5.741211952006278e-06, "loss": 0.5926, "step": 5121 }, { "epoch": 0.24417800872404835, "grad_norm": 1.4932897090911865, "learning_rate": 5.737658328544131e-06, "loss": 0.6188, "step": 5122 }, { "epoch": 0.24422568111934784, "grad_norm": 1.1666932106018066, "learning_rate": 5.73410536271741e-06, "loss": 0.5176, "step": 5123 }, { "epoch": 0.24427335351464735, "grad_norm": 2.5744693279266357, "learning_rate": 5.730553055074306e-06, "loss": 0.745, "step": 5124 }, { "epoch": 0.24432102590994684, "grad_norm": 1.3289794921875, "learning_rate": 5.7270014061628935e-06, "loss": 0.5933, "step": 5125 }, { "epoch": 0.24436869830524635, "grad_norm": 1.8544728755950928, "learning_rate": 5.7234504165311626e-06, "loss": 0.8654, "step": 5126 }, { "epoch": 0.24441637070054584, "grad_norm": 1.8545337915420532, "learning_rate": 5.71990008672699e-06, "loss": 0.783, "step": 5127 }, { "epoch": 0.24446404309584535, "grad_norm": 1.5942987203598022, "learning_rate": 5.716350417298163e-06, "loss": 0.6936, "step": 5128 }, { "epoch": 0.24451171549114487, "grad_norm": 1.5744047164916992, "learning_rate": 5.71280140879235e-06, "loss": 1.1856, "step": 5129 }, { "epoch": 0.24455938788644435, "grad_norm": 7.246407508850098, "learning_rate": 5.7092530617571284e-06, "loss": 1.1715, "step": 5130 }, { "epoch": 0.24460706028174387, "grad_norm": 1.7530832290649414, "learning_rate": 5.7057053767399784e-06, "loss": 0.8354, "step": 5131 }, { "epoch": 0.24465473267704335, "grad_norm": 1.4511467218399048, "learning_rate": 5.702158354288265e-06, "loss": 0.5385, "step": 5132 }, { "epoch": 0.24470240507234287, "grad_norm": 1.5028300285339355, "learning_rate": 5.698611994949257e-06, "loss": 0.7284, "step": 5133 }, { "epoch": 0.24475007746764235, "grad_norm": 1.682446837425232, "learning_rate": 5.6950662992701245e-06, "loss": 0.8775, "step": 5134 }, { "epoch": 0.24479774986294187, "grad_norm": 2.4769394397735596, "learning_rate": 5.691521267797926e-06, "loss": 1.1797, "step": 5135 }, { "epoch": 0.24484542225824135, "grad_norm": 1.847987413406372, "learning_rate": 5.687976901079626e-06, "loss": 0.5466, "step": 5136 }, { "epoch": 0.24489309465354087, "grad_norm": 1.2661330699920654, "learning_rate": 5.684433199662091e-06, "loss": 0.6113, "step": 5137 }, { "epoch": 0.24494076704884038, "grad_norm": 1.9135509729385376, "learning_rate": 5.680890164092065e-06, "loss": 0.6979, "step": 5138 }, { "epoch": 0.24498843944413987, "grad_norm": 1.2192476987838745, "learning_rate": 5.67734779491621e-06, "loss": 0.2719, "step": 5139 }, { "epoch": 0.24503611183943938, "grad_norm": 1.6965047121047974, "learning_rate": 5.67380609268108e-06, "loss": 0.7954, "step": 5140 }, { "epoch": 0.24508378423473887, "grad_norm": 1.8437730073928833, "learning_rate": 5.670265057933114e-06, "loss": 0.9539, "step": 5141 }, { "epoch": 0.24513145663003838, "grad_norm": 1.9941530227661133, "learning_rate": 5.666724691218663e-06, "loss": 1.1797, "step": 5142 }, { "epoch": 0.24517912902533787, "grad_norm": 3.011155128479004, "learning_rate": 5.663184993083971e-06, "loss": 0.7168, "step": 5143 }, { "epoch": 0.24522680142063738, "grad_norm": 3.826796293258667, "learning_rate": 5.65964596407517e-06, "loss": 1.4929, "step": 5144 }, { "epoch": 0.2452744738159369, "grad_norm": 1.3734126091003418, "learning_rate": 5.6561076047383e-06, "loss": 0.8289, "step": 5145 }, { "epoch": 0.24532214621123638, "grad_norm": 3.8718600273132324, "learning_rate": 5.652569915619297e-06, "loss": 0.388, "step": 5146 }, { "epoch": 0.2453698186065359, "grad_norm": 2.4398751258850098, "learning_rate": 5.649032897263986e-06, "loss": 1.4178, "step": 5147 }, { "epoch": 0.24541749100183538, "grad_norm": 1.8063709735870361, "learning_rate": 5.645496550218089e-06, "loss": 0.7274, "step": 5148 }, { "epoch": 0.2454651633971349, "grad_norm": 1.481279969215393, "learning_rate": 5.6419608750272355e-06, "loss": 0.7816, "step": 5149 }, { "epoch": 0.24551283579243438, "grad_norm": 1.690308928489685, "learning_rate": 5.638425872236937e-06, "loss": 0.562, "step": 5150 }, { "epoch": 0.2455605081877339, "grad_norm": 1.94339919090271, "learning_rate": 5.634891542392608e-06, "loss": 0.3454, "step": 5151 }, { "epoch": 0.24560818058303338, "grad_norm": 1.3810763359069824, "learning_rate": 5.631357886039568e-06, "loss": 0.8602, "step": 5152 }, { "epoch": 0.2456558529783329, "grad_norm": 1.471650242805481, "learning_rate": 5.627824903723014e-06, "loss": 0.822, "step": 5153 }, { "epoch": 0.2457035253736324, "grad_norm": 2.2222437858581543, "learning_rate": 5.624292595988052e-06, "loss": 0.5791, "step": 5154 }, { "epoch": 0.2457511977689319, "grad_norm": 1.2477086782455444, "learning_rate": 5.620760963379686e-06, "loss": 0.6096, "step": 5155 }, { "epoch": 0.2457988701642314, "grad_norm": 1.5124863386154175, "learning_rate": 5.617230006442802e-06, "loss": 0.6669, "step": 5156 }, { "epoch": 0.2458465425595309, "grad_norm": 1.1730290651321411, "learning_rate": 5.6136997257221946e-06, "loss": 0.3754, "step": 5157 }, { "epoch": 0.2458942149548304, "grad_norm": 3.3835866451263428, "learning_rate": 5.610170121762553e-06, "loss": 1.0491, "step": 5158 }, { "epoch": 0.2459418873501299, "grad_norm": 1.6068484783172607, "learning_rate": 5.60664119510845e-06, "loss": 0.561, "step": 5159 }, { "epoch": 0.2459895597454294, "grad_norm": 1.995864987373352, "learning_rate": 5.603112946304368e-06, "loss": 1.1287, "step": 5160 }, { "epoch": 0.24603723214072892, "grad_norm": 1.4355392456054688, "learning_rate": 5.599585375894684e-06, "loss": 0.855, "step": 5161 }, { "epoch": 0.2460849045360284, "grad_norm": 5.265583038330078, "learning_rate": 5.5960584844236565e-06, "loss": 1.7132, "step": 5162 }, { "epoch": 0.24613257693132792, "grad_norm": 1.4439984560012817, "learning_rate": 5.592532272435458e-06, "loss": 0.7141, "step": 5163 }, { "epoch": 0.2461802493266274, "grad_norm": 11.15322494506836, "learning_rate": 5.5890067404741365e-06, "loss": 0.6837, "step": 5164 }, { "epoch": 0.24622792172192692, "grad_norm": 4.258968353271484, "learning_rate": 5.585481889083655e-06, "loss": 1.7091, "step": 5165 }, { "epoch": 0.2462755941172264, "grad_norm": 1.1728631258010864, "learning_rate": 5.581957718807854e-06, "loss": 0.6019, "step": 5166 }, { "epoch": 0.24632326651252592, "grad_norm": 1.5525482892990112, "learning_rate": 5.578434230190478e-06, "loss": 0.7247, "step": 5167 }, { "epoch": 0.24637093890782544, "grad_norm": 1.144457459449768, "learning_rate": 5.574911423775173e-06, "loss": 0.6033, "step": 5168 }, { "epoch": 0.24641861130312492, "grad_norm": 1.0099642276763916, "learning_rate": 5.571389300105461e-06, "loss": 0.684, "step": 5169 }, { "epoch": 0.24646628369842444, "grad_norm": 1.2308775186538696, "learning_rate": 5.567867859724774e-06, "loss": 0.4182, "step": 5170 }, { "epoch": 0.24651395609372392, "grad_norm": 1.3812068700790405, "learning_rate": 5.5643471031764375e-06, "loss": 0.7686, "step": 5171 }, { "epoch": 0.24656162848902344, "grad_norm": 1.4897170066833496, "learning_rate": 5.560827031003661e-06, "loss": 0.7183, "step": 5172 }, { "epoch": 0.24660930088432292, "grad_norm": 2.8295412063598633, "learning_rate": 5.557307643749559e-06, "loss": 0.7561, "step": 5173 }, { "epoch": 0.24665697327962244, "grad_norm": 2.257028579711914, "learning_rate": 5.553788941957141e-06, "loss": 0.9433, "step": 5174 }, { "epoch": 0.24670464567492192, "grad_norm": 5.300502777099609, "learning_rate": 5.550270926169298e-06, "loss": 1.3377, "step": 5175 }, { "epoch": 0.24675231807022144, "grad_norm": 2.70894455909729, "learning_rate": 5.546753596928831e-06, "loss": 0.9223, "step": 5176 }, { "epoch": 0.24679999046552095, "grad_norm": 1.5173131227493286, "learning_rate": 5.543236954778421e-06, "loss": 0.686, "step": 5177 }, { "epoch": 0.24684766286082044, "grad_norm": 1.0150842666625977, "learning_rate": 5.539721000260658e-06, "loss": 0.4478, "step": 5178 }, { "epoch": 0.24689533525611995, "grad_norm": 1.1175934076309204, "learning_rate": 5.5362057339180075e-06, "loss": 0.7047, "step": 5179 }, { "epoch": 0.24694300765141944, "grad_norm": 26.487831115722656, "learning_rate": 5.532691156292849e-06, "loss": 0.3799, "step": 5180 }, { "epoch": 0.24699068004671895, "grad_norm": 2.0606038570404053, "learning_rate": 5.529177267927437e-06, "loss": 0.8949, "step": 5181 }, { "epoch": 0.24703835244201844, "grad_norm": 1.6512166261672974, "learning_rate": 5.52566406936393e-06, "loss": 0.5806, "step": 5182 }, { "epoch": 0.24708602483731795, "grad_norm": 2.8060967922210693, "learning_rate": 5.522151561144386e-06, "loss": 0.7323, "step": 5183 }, { "epoch": 0.24713369723261747, "grad_norm": 2.039449691772461, "learning_rate": 5.518639743810738e-06, "loss": 0.8048, "step": 5184 }, { "epoch": 0.24718136962791695, "grad_norm": 1.3262771368026733, "learning_rate": 5.51512861790483e-06, "loss": 0.696, "step": 5185 }, { "epoch": 0.24722904202321647, "grad_norm": 1.9851527214050293, "learning_rate": 5.5116181839683944e-06, "loss": 0.7769, "step": 5186 }, { "epoch": 0.24727671441851595, "grad_norm": 3.5180680751800537, "learning_rate": 5.508108442543048e-06, "loss": 0.3315, "step": 5187 }, { "epoch": 0.24732438681381547, "grad_norm": 1.4458363056182861, "learning_rate": 5.5045993941703094e-06, "loss": 0.8182, "step": 5188 }, { "epoch": 0.24737205920911495, "grad_norm": 2.6698710918426514, "learning_rate": 5.501091039391596e-06, "loss": 0.6063, "step": 5189 }, { "epoch": 0.24741973160441447, "grad_norm": 1.4538973569869995, "learning_rate": 5.497583378748201e-06, "loss": 0.6288, "step": 5190 }, { "epoch": 0.24746740399971395, "grad_norm": 1.2913610935211182, "learning_rate": 5.49407641278133e-06, "loss": 0.8647, "step": 5191 }, { "epoch": 0.24751507639501347, "grad_norm": 1.0243335962295532, "learning_rate": 5.490570142032061e-06, "loss": 0.4859, "step": 5192 }, { "epoch": 0.24756274879031298, "grad_norm": 1.1922807693481445, "learning_rate": 5.487064567041387e-06, "loss": 0.6223, "step": 5193 }, { "epoch": 0.24761042118561247, "grad_norm": 1.1932332515716553, "learning_rate": 5.48355968835017e-06, "loss": 0.1719, "step": 5194 }, { "epoch": 0.24765809358091198, "grad_norm": 1.3214937448501587, "learning_rate": 5.480055506499187e-06, "loss": 0.7055, "step": 5195 }, { "epoch": 0.24770576597621147, "grad_norm": 1.7870721817016602, "learning_rate": 5.476552022029089e-06, "loss": 0.847, "step": 5196 }, { "epoch": 0.24775343837151098, "grad_norm": 1.8140438795089722, "learning_rate": 5.473049235480432e-06, "loss": 0.8939, "step": 5197 }, { "epoch": 0.24780111076681047, "grad_norm": 20.688039779663086, "learning_rate": 5.4695471473936636e-06, "loss": 0.1567, "step": 5198 }, { "epoch": 0.24784878316210998, "grad_norm": 1.7145599126815796, "learning_rate": 5.466045758309111e-06, "loss": 0.8193, "step": 5199 }, { "epoch": 0.2478964555574095, "grad_norm": 1.3122328519821167, "learning_rate": 5.462545068767008e-06, "loss": 0.7377, "step": 5200 }, { "epoch": 0.24794412795270898, "grad_norm": 1.981058955192566, "learning_rate": 5.459045079307473e-06, "loss": 0.6913, "step": 5201 }, { "epoch": 0.2479918003480085, "grad_norm": 1.760473370552063, "learning_rate": 5.455545790470524e-06, "loss": 0.929, "step": 5202 }, { "epoch": 0.24803947274330798, "grad_norm": 1.4741325378417969, "learning_rate": 5.452047202796058e-06, "loss": 0.503, "step": 5203 }, { "epoch": 0.2480871451386075, "grad_norm": 1.8108981847763062, "learning_rate": 5.448549316823873e-06, "loss": 0.5086, "step": 5204 }, { "epoch": 0.24813481753390698, "grad_norm": 1.841205358505249, "learning_rate": 5.44505213309366e-06, "loss": 0.6914, "step": 5205 }, { "epoch": 0.2481824899292065, "grad_norm": 3.5677199363708496, "learning_rate": 5.4415556521449944e-06, "loss": 0.3478, "step": 5206 }, { "epoch": 0.24823016232450598, "grad_norm": 2.2605278491973877, "learning_rate": 5.4380598745173495e-06, "loss": 0.6591, "step": 5207 }, { "epoch": 0.2482778347198055, "grad_norm": 1.3539484739303589, "learning_rate": 5.434564800750091e-06, "loss": 0.669, "step": 5208 }, { "epoch": 0.248325507115105, "grad_norm": 1.6358530521392822, "learning_rate": 5.431070431382461e-06, "loss": 0.7204, "step": 5209 }, { "epoch": 0.2483731795104045, "grad_norm": 1.1210711002349854, "learning_rate": 5.427576766953615e-06, "loss": 0.209, "step": 5210 }, { "epoch": 0.248420851905704, "grad_norm": 1.661259651184082, "learning_rate": 5.424083808002591e-06, "loss": 0.7631, "step": 5211 }, { "epoch": 0.2484685243010035, "grad_norm": 1.5903949737548828, "learning_rate": 5.420591555068308e-06, "loss": 0.4729, "step": 5212 }, { "epoch": 0.248516196696303, "grad_norm": 2.0092742443084717, "learning_rate": 5.417100008689588e-06, "loss": 0.5033, "step": 5213 }, { "epoch": 0.2485638690916025, "grad_norm": 1.7281888723373413, "learning_rate": 5.413609169405148e-06, "loss": 0.7902, "step": 5214 }, { "epoch": 0.248611541486902, "grad_norm": 1.604560136795044, "learning_rate": 5.4101190377535785e-06, "loss": 0.4432, "step": 5215 }, { "epoch": 0.24865921388220152, "grad_norm": 1.378257393836975, "learning_rate": 5.406629614273373e-06, "loss": 0.7872, "step": 5216 }, { "epoch": 0.248706886277501, "grad_norm": 1.664696455001831, "learning_rate": 5.403140899502921e-06, "loss": 0.7958, "step": 5217 }, { "epoch": 0.24875455867280052, "grad_norm": 1.842162847518921, "learning_rate": 5.399652893980486e-06, "loss": 1.2515, "step": 5218 }, { "epoch": 0.2488022310681, "grad_norm": 1.205552101135254, "learning_rate": 5.396165598244234e-06, "loss": 0.5498, "step": 5219 }, { "epoch": 0.24884990346339952, "grad_norm": 2.4317197799682617, "learning_rate": 5.392679012832225e-06, "loss": 0.6628, "step": 5220 }, { "epoch": 0.248897575858699, "grad_norm": 1.5645935535430908, "learning_rate": 5.389193138282393e-06, "loss": 0.8413, "step": 5221 }, { "epoch": 0.24894524825399852, "grad_norm": 0.9529103636741638, "learning_rate": 5.385707975132582e-06, "loss": 0.5029, "step": 5222 }, { "epoch": 0.248992920649298, "grad_norm": 3.805121898651123, "learning_rate": 5.382223523920511e-06, "loss": 1.1281, "step": 5223 }, { "epoch": 0.24904059304459752, "grad_norm": 2.4295339584350586, "learning_rate": 5.378739785183794e-06, "loss": 0.6934, "step": 5224 }, { "epoch": 0.24908826543989704, "grad_norm": 1.1411960124969482, "learning_rate": 5.375256759459939e-06, "loss": 0.7916, "step": 5225 }, { "epoch": 0.24913593783519652, "grad_norm": 1.18546724319458, "learning_rate": 5.371774447286343e-06, "loss": 0.6397, "step": 5226 }, { "epoch": 0.24918361023049604, "grad_norm": 0.9702255725860596, "learning_rate": 5.368292849200285e-06, "loss": 0.3293, "step": 5227 }, { "epoch": 0.24923128262579552, "grad_norm": 1.247194766998291, "learning_rate": 5.364811965738943e-06, "loss": 0.5138, "step": 5228 }, { "epoch": 0.24927895502109504, "grad_norm": 2.6932270526885986, "learning_rate": 5.361331797439384e-06, "loss": 0.5032, "step": 5229 }, { "epoch": 0.24932662741639453, "grad_norm": 3.3012616634368896, "learning_rate": 5.357852344838557e-06, "loss": 0.986, "step": 5230 }, { "epoch": 0.24937429981169404, "grad_norm": 2.384352922439575, "learning_rate": 5.354373608473309e-06, "loss": 1.0236, "step": 5231 }, { "epoch": 0.24942197220699355, "grad_norm": 1.5132157802581787, "learning_rate": 5.350895588880376e-06, "loss": 0.746, "step": 5232 }, { "epoch": 0.24946964460229304, "grad_norm": 1.6053189039230347, "learning_rate": 5.347418286596372e-06, "loss": 0.8488, "step": 5233 }, { "epoch": 0.24951731699759255, "grad_norm": 1.4705743789672852, "learning_rate": 5.3439417021578154e-06, "loss": 0.8896, "step": 5234 }, { "epoch": 0.24956498939289204, "grad_norm": 2.051260471343994, "learning_rate": 5.340465836101109e-06, "loss": 0.5652, "step": 5235 }, { "epoch": 0.24961266178819155, "grad_norm": 1.3342063426971436, "learning_rate": 5.336990688962537e-06, "loss": 0.4792, "step": 5236 }, { "epoch": 0.24966033418349104, "grad_norm": 2.822033405303955, "learning_rate": 5.333516261278285e-06, "loss": 0.9676, "step": 5237 }, { "epoch": 0.24970800657879055, "grad_norm": 2.3854379653930664, "learning_rate": 5.330042553584416e-06, "loss": 1.1075, "step": 5238 }, { "epoch": 0.24975567897409004, "grad_norm": 1.594692349433899, "learning_rate": 5.3265695664168926e-06, "loss": 0.7578, "step": 5239 }, { "epoch": 0.24980335136938955, "grad_norm": 1.4988423585891724, "learning_rate": 5.323097300311553e-06, "loss": 0.8371, "step": 5240 }, { "epoch": 0.24985102376468907, "grad_norm": 1.9015144109725952, "learning_rate": 5.3196257558041386e-06, "loss": 0.9726, "step": 5241 }, { "epoch": 0.24989869615998855, "grad_norm": 1.7048072814941406, "learning_rate": 5.316154933430276e-06, "loss": 0.6322, "step": 5242 }, { "epoch": 0.24994636855528807, "grad_norm": 3.4977493286132812, "learning_rate": 5.312684833725468e-06, "loss": 1.444, "step": 5243 }, { "epoch": 0.24999404095058755, "grad_norm": 1.6783192157745361, "learning_rate": 5.309215457225121e-06, "loss": 0.5289, "step": 5244 }, { "epoch": 0.25004171334588704, "grad_norm": 2.484001874923706, "learning_rate": 5.305746804464526e-06, "loss": 0.2851, "step": 5245 }, { "epoch": 0.25008938574118655, "grad_norm": 1.3448243141174316, "learning_rate": 5.302278875978855e-06, "loss": 0.7044, "step": 5246 }, { "epoch": 0.25013705813648607, "grad_norm": 1.1596590280532837, "learning_rate": 5.298811672303174e-06, "loss": 0.6366, "step": 5247 }, { "epoch": 0.2501847305317856, "grad_norm": 1.43930983543396, "learning_rate": 5.295345193972445e-06, "loss": 0.6953, "step": 5248 }, { "epoch": 0.2502324029270851, "grad_norm": 2.0993826389312744, "learning_rate": 5.291879441521499e-06, "loss": 0.3127, "step": 5249 }, { "epoch": 0.25028007532238455, "grad_norm": 1.432981014251709, "learning_rate": 5.288414415485072e-06, "loss": 0.775, "step": 5250 }, { "epoch": 0.25032774771768407, "grad_norm": 2.131667137145996, "learning_rate": 5.2849501163977846e-06, "loss": 0.8331, "step": 5251 }, { "epoch": 0.2503754201129836, "grad_norm": 1.0865910053253174, "learning_rate": 5.281486544794139e-06, "loss": 0.452, "step": 5252 }, { "epoch": 0.2504230925082831, "grad_norm": 2.509237766265869, "learning_rate": 5.278023701208523e-06, "loss": 0.8609, "step": 5253 }, { "epoch": 0.25047076490358255, "grad_norm": 1.3018155097961426, "learning_rate": 5.274561586175226e-06, "loss": 0.4136, "step": 5254 }, { "epoch": 0.25051843729888207, "grad_norm": 3.1016173362731934, "learning_rate": 5.271100200228412e-06, "loss": 1.3016, "step": 5255 }, { "epoch": 0.2505661096941816, "grad_norm": 1.2012239694595337, "learning_rate": 5.2676395439021385e-06, "loss": 0.691, "step": 5256 }, { "epoch": 0.2506137820894811, "grad_norm": 1.8148443698883057, "learning_rate": 5.264179617730353e-06, "loss": 0.83, "step": 5257 }, { "epoch": 0.2506614544847806, "grad_norm": 2.251467227935791, "learning_rate": 5.260720422246879e-06, "loss": 0.7924, "step": 5258 }, { "epoch": 0.25070912688008007, "grad_norm": 2.4197072982788086, "learning_rate": 5.257261957985438e-06, "loss": 0.3565, "step": 5259 }, { "epoch": 0.2507567992753796, "grad_norm": 1.0452011823654175, "learning_rate": 5.253804225479642e-06, "loss": 0.6552, "step": 5260 }, { "epoch": 0.2508044716706791, "grad_norm": 1.5356444120407104, "learning_rate": 5.250347225262972e-06, "loss": 0.5288, "step": 5261 }, { "epoch": 0.2508521440659786, "grad_norm": 2.0456509590148926, "learning_rate": 5.246890957868813e-06, "loss": 0.9027, "step": 5262 }, { "epoch": 0.2508998164612781, "grad_norm": 5.187041282653809, "learning_rate": 5.243435423830436e-06, "loss": 0.8829, "step": 5263 }, { "epoch": 0.2509474888565776, "grad_norm": 1.9498354196548462, "learning_rate": 5.239980623680987e-06, "loss": 1.0658, "step": 5264 }, { "epoch": 0.2509951612518771, "grad_norm": 1.5801407098770142, "learning_rate": 5.236526557953508e-06, "loss": 1.1578, "step": 5265 }, { "epoch": 0.2510428336471766, "grad_norm": 4.058075904846191, "learning_rate": 5.233073227180932e-06, "loss": 0.4922, "step": 5266 }, { "epoch": 0.2510905060424761, "grad_norm": 1.8664467334747314, "learning_rate": 5.229620631896065e-06, "loss": 0.8485, "step": 5267 }, { "epoch": 0.2511381784377756, "grad_norm": 2.2463502883911133, "learning_rate": 5.226168772631606e-06, "loss": 0.538, "step": 5268 }, { "epoch": 0.2511858508330751, "grad_norm": 1.2202684879302979, "learning_rate": 5.22271764992015e-06, "loss": 0.665, "step": 5269 }, { "epoch": 0.2512335232283746, "grad_norm": 1.7239629030227661, "learning_rate": 5.219267264294159e-06, "loss": 0.5625, "step": 5270 }, { "epoch": 0.2512811956236741, "grad_norm": 1.5027018785476685, "learning_rate": 5.215817616285996e-06, "loss": 0.5832, "step": 5271 }, { "epoch": 0.25132886801897364, "grad_norm": 1.3608691692352295, "learning_rate": 5.212368706427913e-06, "loss": 1.0427, "step": 5272 }, { "epoch": 0.2513765404142731, "grad_norm": 1.505457878112793, "learning_rate": 5.20892053525203e-06, "loss": 0.7733, "step": 5273 }, { "epoch": 0.2514242128095726, "grad_norm": 1.5560054779052734, "learning_rate": 5.2054731032903704e-06, "loss": 0.7243, "step": 5274 }, { "epoch": 0.2514718852048721, "grad_norm": 1.2661142349243164, "learning_rate": 5.202026411074841e-06, "loss": 0.6936, "step": 5275 }, { "epoch": 0.25151955760017164, "grad_norm": 1.3761965036392212, "learning_rate": 5.198580459137224e-06, "loss": 0.6419, "step": 5276 }, { "epoch": 0.2515672299954711, "grad_norm": 1.3047969341278076, "learning_rate": 5.195135248009196e-06, "loss": 0.951, "step": 5277 }, { "epoch": 0.2516149023907706, "grad_norm": 1.3792136907577515, "learning_rate": 5.191690778222318e-06, "loss": 0.6444, "step": 5278 }, { "epoch": 0.2516625747860701, "grad_norm": 1.1357731819152832, "learning_rate": 5.188247050308042e-06, "loss": 0.7093, "step": 5279 }, { "epoch": 0.25171024718136964, "grad_norm": 1.6982618570327759, "learning_rate": 5.18480406479769e-06, "loss": 0.7102, "step": 5280 }, { "epoch": 0.25175791957666915, "grad_norm": 1.6509249210357666, "learning_rate": 5.181361822222488e-06, "loss": 0.5838, "step": 5281 }, { "epoch": 0.2518055919719686, "grad_norm": 1.6463545560836792, "learning_rate": 5.177920323113531e-06, "loss": 0.7904, "step": 5282 }, { "epoch": 0.2518532643672681, "grad_norm": 2.519686222076416, "learning_rate": 5.174479568001813e-06, "loss": 0.5023, "step": 5283 }, { "epoch": 0.25190093676256764, "grad_norm": 2.3719160556793213, "learning_rate": 5.1710395574182026e-06, "loss": 1.0992, "step": 5284 }, { "epoch": 0.25194860915786715, "grad_norm": 1.668278455734253, "learning_rate": 5.167600291893462e-06, "loss": 0.9031, "step": 5285 }, { "epoch": 0.2519962815531666, "grad_norm": 1.5020698308944702, "learning_rate": 5.16416177195823e-06, "loss": 0.6652, "step": 5286 }, { "epoch": 0.2520439539484661, "grad_norm": 1.4558539390563965, "learning_rate": 5.1607239981430355e-06, "loss": 0.6531, "step": 5287 }, { "epoch": 0.25209162634376564, "grad_norm": 1.1254277229309082, "learning_rate": 5.1572869709782965e-06, "loss": 0.5503, "step": 5288 }, { "epoch": 0.25213929873906515, "grad_norm": 2.740602731704712, "learning_rate": 5.153850690994306e-06, "loss": 0.7217, "step": 5289 }, { "epoch": 0.25218697113436467, "grad_norm": 1.6636793613433838, "learning_rate": 5.150415158721247e-06, "loss": 0.6611, "step": 5290 }, { "epoch": 0.2522346435296641, "grad_norm": 2.151529312133789, "learning_rate": 5.146980374689192e-06, "loss": 0.2378, "step": 5291 }, { "epoch": 0.25228231592496364, "grad_norm": 1.524881362915039, "learning_rate": 5.143546339428085e-06, "loss": 0.4307, "step": 5292 }, { "epoch": 0.25232998832026315, "grad_norm": 1.3537802696228027, "learning_rate": 5.140113053467765e-06, "loss": 0.5461, "step": 5293 }, { "epoch": 0.25237766071556267, "grad_norm": 1.3754298686981201, "learning_rate": 5.1366805173379575e-06, "loss": 0.5483, "step": 5294 }, { "epoch": 0.2524253331108622, "grad_norm": 2.7808735370635986, "learning_rate": 5.133248731568261e-06, "loss": 0.5615, "step": 5295 }, { "epoch": 0.25247300550616164, "grad_norm": 1.1643399000167847, "learning_rate": 5.1298176966881705e-06, "loss": 0.7063, "step": 5296 }, { "epoch": 0.25252067790146115, "grad_norm": 3.938659906387329, "learning_rate": 5.126387413227053e-06, "loss": 0.7151, "step": 5297 }, { "epoch": 0.25256835029676067, "grad_norm": 1.7053200006484985, "learning_rate": 5.122957881714172e-06, "loss": 0.7394, "step": 5298 }, { "epoch": 0.2526160226920602, "grad_norm": 1.0608376264572144, "learning_rate": 5.119529102678665e-06, "loss": 0.599, "step": 5299 }, { "epoch": 0.25266369508735964, "grad_norm": 1.681151032447815, "learning_rate": 5.116101076649559e-06, "loss": 0.8599, "step": 5300 }, { "epoch": 0.25271136748265915, "grad_norm": 1.4083402156829834, "learning_rate": 5.112673804155759e-06, "loss": 0.746, "step": 5301 }, { "epoch": 0.25275903987795867, "grad_norm": 1.378050684928894, "learning_rate": 5.109247285726062e-06, "loss": 0.5942, "step": 5302 }, { "epoch": 0.2528067122732582, "grad_norm": 1.2420076131820679, "learning_rate": 5.105821521889147e-06, "loss": 0.6261, "step": 5303 }, { "epoch": 0.2528543846685577, "grad_norm": 3.186793327331543, "learning_rate": 5.102396513173569e-06, "loss": 1.267, "step": 5304 }, { "epoch": 0.25290205706385716, "grad_norm": 1.5938936471939087, "learning_rate": 5.098972260107771e-06, "loss": 0.9206, "step": 5305 }, { "epoch": 0.25294972945915667, "grad_norm": 1.4803907871246338, "learning_rate": 5.0955487632200885e-06, "loss": 0.714, "step": 5306 }, { "epoch": 0.2529974018544562, "grad_norm": 1.4852310419082642, "learning_rate": 5.0921260230387195e-06, "loss": 0.7737, "step": 5307 }, { "epoch": 0.2530450742497557, "grad_norm": 3.1453282833099365, "learning_rate": 5.088704040091765e-06, "loss": 0.9843, "step": 5308 }, { "epoch": 0.25309274664505516, "grad_norm": 1.649553656578064, "learning_rate": 5.085282814907205e-06, "loss": 0.7155, "step": 5309 }, { "epoch": 0.25314041904035467, "grad_norm": 1.3235059976577759, "learning_rate": 5.081862348012892e-06, "loss": 0.9422, "step": 5310 }, { "epoch": 0.2531880914356542, "grad_norm": 1.7374777793884277, "learning_rate": 5.0784426399365725e-06, "loss": 0.6619, "step": 5311 }, { "epoch": 0.2532357638309537, "grad_norm": 1.2235597372055054, "learning_rate": 5.075023691205869e-06, "loss": 0.5919, "step": 5312 }, { "epoch": 0.2532834362262532, "grad_norm": 2.3804304599761963, "learning_rate": 5.071605502348297e-06, "loss": 0.5582, "step": 5313 }, { "epoch": 0.25333110862155267, "grad_norm": 4.60269021987915, "learning_rate": 5.068188073891238e-06, "loss": 0.601, "step": 5314 }, { "epoch": 0.2533787810168522, "grad_norm": 1.9278857707977295, "learning_rate": 5.064771406361973e-06, "loss": 0.371, "step": 5315 }, { "epoch": 0.2534264534121517, "grad_norm": 1.3372584581375122, "learning_rate": 5.06135550028766e-06, "loss": 0.7439, "step": 5316 }, { "epoch": 0.2534741258074512, "grad_norm": 2.1416614055633545, "learning_rate": 5.057940356195332e-06, "loss": 0.6466, "step": 5317 }, { "epoch": 0.25352179820275067, "grad_norm": 1.6502330303192139, "learning_rate": 5.054525974611913e-06, "loss": 0.8499, "step": 5318 }, { "epoch": 0.2535694705980502, "grad_norm": 1.8139033317565918, "learning_rate": 5.051112356064212e-06, "loss": 0.5642, "step": 5319 }, { "epoch": 0.2536171429933497, "grad_norm": 1.177715539932251, "learning_rate": 5.047699501078905e-06, "loss": 0.4434, "step": 5320 }, { "epoch": 0.2536648153886492, "grad_norm": 2.207502603530884, "learning_rate": 5.044287410182568e-06, "loss": 0.8216, "step": 5321 }, { "epoch": 0.2537124877839487, "grad_norm": 3.4811158180236816, "learning_rate": 5.040876083901654e-06, "loss": 1.068, "step": 5322 }, { "epoch": 0.2537601601792482, "grad_norm": 1.4357497692108154, "learning_rate": 5.037465522762486e-06, "loss": 0.342, "step": 5323 }, { "epoch": 0.2538078325745477, "grad_norm": 2.882291078567505, "learning_rate": 5.034055727291283e-06, "loss": 0.7484, "step": 5324 }, { "epoch": 0.2538555049698472, "grad_norm": 2.103181838989258, "learning_rate": 5.0306466980141475e-06, "loss": 0.9806, "step": 5325 }, { "epoch": 0.2539031773651467, "grad_norm": 3.2125372886657715, "learning_rate": 5.027238435457047e-06, "loss": 1.2797, "step": 5326 }, { "epoch": 0.25395084976044624, "grad_norm": 2.267927646636963, "learning_rate": 5.023830940145851e-06, "loss": 0.9789, "step": 5327 }, { "epoch": 0.2539985221557457, "grad_norm": 4.6167449951171875, "learning_rate": 5.0204242126062964e-06, "loss": 0.509, "step": 5328 }, { "epoch": 0.2540461945510452, "grad_norm": 1.4431474208831787, "learning_rate": 5.017018253364001e-06, "loss": 0.5063, "step": 5329 }, { "epoch": 0.2540938669463447, "grad_norm": 1.4122262001037598, "learning_rate": 5.0136130629444755e-06, "loss": 0.8872, "step": 5330 }, { "epoch": 0.25414153934164424, "grad_norm": 3.4316749572753906, "learning_rate": 5.010208641873109e-06, "loss": 1.3353, "step": 5331 }, { "epoch": 0.2541892117369437, "grad_norm": 1.7836222648620605, "learning_rate": 5.006804990675158e-06, "loss": 0.706, "step": 5332 }, { "epoch": 0.2542368841322432, "grad_norm": 3.4742655754089355, "learning_rate": 5.003402109875779e-06, "loss": 1.4159, "step": 5333 }, { "epoch": 0.2542845565275427, "grad_norm": 2.4239935874938965, "learning_rate": 5.000000000000003e-06, "loss": 0.7234, "step": 5334 }, { "epoch": 0.25433222892284224, "grad_norm": 0.8686856627464294, "learning_rate": 4.996598661572732e-06, "loss": 0.3915, "step": 5335 }, { "epoch": 0.25437990131814175, "grad_norm": 2.139826774597168, "learning_rate": 4.993198095118763e-06, "loss": 1.1792, "step": 5336 }, { "epoch": 0.2544275737134412, "grad_norm": 1.431489109992981, "learning_rate": 4.989798301162772e-06, "loss": 0.612, "step": 5337 }, { "epoch": 0.2544752461087407, "grad_norm": 2.7369344234466553, "learning_rate": 4.986399280229304e-06, "loss": 1.3041, "step": 5338 }, { "epoch": 0.25452291850404024, "grad_norm": 1.7966467142105103, "learning_rate": 4.983001032842797e-06, "loss": 0.6154, "step": 5339 }, { "epoch": 0.25457059089933975, "grad_norm": 1.8476978540420532, "learning_rate": 4.979603559527569e-06, "loss": 0.8012, "step": 5340 }, { "epoch": 0.2546182632946392, "grad_norm": 1.9659759998321533, "learning_rate": 4.976206860807808e-06, "loss": 0.8354, "step": 5341 }, { "epoch": 0.2546659356899387, "grad_norm": 2.0037567615509033, "learning_rate": 4.972810937207599e-06, "loss": 0.8945, "step": 5342 }, { "epoch": 0.25471360808523824, "grad_norm": 2.5204577445983887, "learning_rate": 4.96941578925089e-06, "loss": 0.7901, "step": 5343 }, { "epoch": 0.25476128048053776, "grad_norm": 3.08522629737854, "learning_rate": 4.9660214174615165e-06, "loss": 1.033, "step": 5344 }, { "epoch": 0.25480895287583727, "grad_norm": 1.8553155660629272, "learning_rate": 4.9626278223631985e-06, "loss": 0.9732, "step": 5345 }, { "epoch": 0.2548566252711367, "grad_norm": 2.0226073265075684, "learning_rate": 4.959235004479537e-06, "loss": 0.4234, "step": 5346 }, { "epoch": 0.25490429766643624, "grad_norm": 1.4804707765579224, "learning_rate": 4.955842964334e-06, "loss": 0.5954, "step": 5347 }, { "epoch": 0.25495197006173576, "grad_norm": 1.7300760746002197, "learning_rate": 4.952451702449949e-06, "loss": 0.8861, "step": 5348 }, { "epoch": 0.25499964245703527, "grad_norm": 1.476699709892273, "learning_rate": 4.949061219350624e-06, "loss": 0.597, "step": 5349 }, { "epoch": 0.2550473148523348, "grad_norm": 1.777708888053894, "learning_rate": 4.945671515559135e-06, "loss": 0.7481, "step": 5350 }, { "epoch": 0.25509498724763424, "grad_norm": 1.6463401317596436, "learning_rate": 4.942282591598481e-06, "loss": 0.9888, "step": 5351 }, { "epoch": 0.25514265964293376, "grad_norm": 1.3731558322906494, "learning_rate": 4.938894447991544e-06, "loss": 0.5485, "step": 5352 }, { "epoch": 0.25519033203823327, "grad_norm": 1.1557267904281616, "learning_rate": 4.935507085261069e-06, "loss": 0.6922, "step": 5353 }, { "epoch": 0.2552380044335328, "grad_norm": 1.6899571418762207, "learning_rate": 4.932120503929696e-06, "loss": 0.6992, "step": 5354 }, { "epoch": 0.25528567682883224, "grad_norm": 1.7605578899383545, "learning_rate": 4.928734704519945e-06, "loss": 0.9358, "step": 5355 }, { "epoch": 0.25533334922413176, "grad_norm": 1.504723072052002, "learning_rate": 4.925349687554201e-06, "loss": 0.7706, "step": 5356 }, { "epoch": 0.25538102161943127, "grad_norm": 0.941290020942688, "learning_rate": 4.921965453554747e-06, "loss": 0.3572, "step": 5357 }, { "epoch": 0.2554286940147308, "grad_norm": 1.3282690048217773, "learning_rate": 4.918582003043724e-06, "loss": 0.9303, "step": 5358 }, { "epoch": 0.2554763664100303, "grad_norm": 1.637980580329895, "learning_rate": 4.9151993365431735e-06, "loss": 0.4201, "step": 5359 }, { "epoch": 0.25552403880532976, "grad_norm": 1.318679690361023, "learning_rate": 4.911817454575e-06, "loss": 0.9315, "step": 5360 }, { "epoch": 0.25557171120062927, "grad_norm": 2.373147487640381, "learning_rate": 4.908436357660993e-06, "loss": 0.4076, "step": 5361 }, { "epoch": 0.2556193835959288, "grad_norm": 1.6679741144180298, "learning_rate": 4.905056046322828e-06, "loss": 0.3708, "step": 5362 }, { "epoch": 0.2556670559912283, "grad_norm": 1.6805018186569214, "learning_rate": 4.901676521082043e-06, "loss": 0.8703, "step": 5363 }, { "epoch": 0.25571472838652776, "grad_norm": 2.537540912628174, "learning_rate": 4.8982977824600685e-06, "loss": 0.4596, "step": 5364 }, { "epoch": 0.25576240078182727, "grad_norm": 1.9816516637802124, "learning_rate": 4.894919830978212e-06, "loss": 1.0274, "step": 5365 }, { "epoch": 0.2558100731771268, "grad_norm": 2.5843825340270996, "learning_rate": 4.89154266715765e-06, "loss": 1.0977, "step": 5366 }, { "epoch": 0.2558577455724263, "grad_norm": 1.8490378856658936, "learning_rate": 4.888166291519449e-06, "loss": 0.9713, "step": 5367 }, { "epoch": 0.2559054179677258, "grad_norm": 2.8207590579986572, "learning_rate": 4.884790704584549e-06, "loss": 0.7729, "step": 5368 }, { "epoch": 0.25595309036302527, "grad_norm": 1.501035451889038, "learning_rate": 4.881415906873763e-06, "loss": 0.473, "step": 5369 }, { "epoch": 0.2560007627583248, "grad_norm": 2.073540210723877, "learning_rate": 4.878041898907793e-06, "loss": 0.678, "step": 5370 }, { "epoch": 0.2560484351536243, "grad_norm": 1.812528133392334, "learning_rate": 4.874668681207215e-06, "loss": 0.9846, "step": 5371 }, { "epoch": 0.2560961075489238, "grad_norm": 1.4102495908737183, "learning_rate": 4.871296254292479e-06, "loss": 0.5277, "step": 5372 }, { "epoch": 0.25614377994422327, "grad_norm": 1.2758127450942993, "learning_rate": 4.867924618683911e-06, "loss": 0.8227, "step": 5373 }, { "epoch": 0.2561914523395228, "grad_norm": 1.5390113592147827, "learning_rate": 4.8645537749017295e-06, "loss": 0.8553, "step": 5374 }, { "epoch": 0.2562391247348223, "grad_norm": 1.7708622217178345, "learning_rate": 4.861183723466011e-06, "loss": 0.6534, "step": 5375 }, { "epoch": 0.2562867971301218, "grad_norm": 1.7983982563018799, "learning_rate": 4.857814464896724e-06, "loss": 0.7854, "step": 5376 }, { "epoch": 0.2563344695254213, "grad_norm": 8.485481262207031, "learning_rate": 4.854445999713715e-06, "loss": 0.7118, "step": 5377 }, { "epoch": 0.2563821419207208, "grad_norm": 1.3313298225402832, "learning_rate": 4.851078328436696e-06, "loss": 0.745, "step": 5378 }, { "epoch": 0.2564298143160203, "grad_norm": 6.466851234436035, "learning_rate": 4.847711451585266e-06, "loss": 1.2719, "step": 5379 }, { "epoch": 0.2564774867113198, "grad_norm": 2.2267777919769287, "learning_rate": 4.8443453696789055e-06, "loss": 1.1769, "step": 5380 }, { "epoch": 0.2565251591066193, "grad_norm": 1.290576696395874, "learning_rate": 4.840980083236958e-06, "loss": 0.9364, "step": 5381 }, { "epoch": 0.25657283150191884, "grad_norm": 1.5591100454330444, "learning_rate": 4.837615592778655e-06, "loss": 0.5752, "step": 5382 }, { "epoch": 0.2566205038972183, "grad_norm": 1.2779251337051392, "learning_rate": 4.834251898823108e-06, "loss": 0.5798, "step": 5383 }, { "epoch": 0.2566681762925178, "grad_norm": 3.384645700454712, "learning_rate": 4.8308890018892914e-06, "loss": 1.7959, "step": 5384 }, { "epoch": 0.2567158486878173, "grad_norm": 1.457976222038269, "learning_rate": 4.827526902496073e-06, "loss": 0.5435, "step": 5385 }, { "epoch": 0.25676352108311684, "grad_norm": 1.695383906364441, "learning_rate": 4.8241656011621886e-06, "loss": 0.4573, "step": 5386 }, { "epoch": 0.2568111934784163, "grad_norm": 5.1100287437438965, "learning_rate": 4.8208050984062515e-06, "loss": 0.7021, "step": 5387 }, { "epoch": 0.2568588658737158, "grad_norm": 1.2557580471038818, "learning_rate": 4.817445394746749e-06, "loss": 0.5739, "step": 5388 }, { "epoch": 0.2569065382690153, "grad_norm": 1.3719812631607056, "learning_rate": 4.814086490702056e-06, "loss": 0.4644, "step": 5389 }, { "epoch": 0.25695421066431484, "grad_norm": 1.3114820718765259, "learning_rate": 4.810728386790409e-06, "loss": 0.9014, "step": 5390 }, { "epoch": 0.25700188305961436, "grad_norm": 1.641396164894104, "learning_rate": 4.807371083529933e-06, "loss": 0.4296, "step": 5391 }, { "epoch": 0.2570495554549138, "grad_norm": 3.38991379737854, "learning_rate": 4.8040145814386245e-06, "loss": 1.0257, "step": 5392 }, { "epoch": 0.25709722785021333, "grad_norm": 2.380986452102661, "learning_rate": 4.800658881034362e-06, "loss": 0.2672, "step": 5393 }, { "epoch": 0.25714490024551284, "grad_norm": 2.6957061290740967, "learning_rate": 4.797303982834887e-06, "loss": 0.694, "step": 5394 }, { "epoch": 0.25719257264081236, "grad_norm": 1.5798816680908203, "learning_rate": 4.79394988735783e-06, "loss": 0.6173, "step": 5395 }, { "epoch": 0.2572402450361118, "grad_norm": 2.6076266765594482, "learning_rate": 4.790596595120699e-06, "loss": 1.404, "step": 5396 }, { "epoch": 0.25728791743141133, "grad_norm": 1.587837815284729, "learning_rate": 4.787244106640861e-06, "loss": 0.4521, "step": 5397 }, { "epoch": 0.25733558982671084, "grad_norm": 1.3771586418151855, "learning_rate": 4.783892422435577e-06, "loss": 0.719, "step": 5398 }, { "epoch": 0.25738326222201036, "grad_norm": 1.5652211904525757, "learning_rate": 4.780541543021981e-06, "loss": 0.6053, "step": 5399 }, { "epoch": 0.25743093461730987, "grad_norm": 1.33492910861969, "learning_rate": 4.7771914689170704e-06, "loss": 0.6448, "step": 5400 }, { "epoch": 0.25747860701260933, "grad_norm": 3.785646438598633, "learning_rate": 4.773842200637736e-06, "loss": 0.8005, "step": 5401 }, { "epoch": 0.25752627940790884, "grad_norm": 1.115527868270874, "learning_rate": 4.770493738700727e-06, "loss": 0.724, "step": 5402 }, { "epoch": 0.25757395180320836, "grad_norm": 1.87267005443573, "learning_rate": 4.7671460836226845e-06, "loss": 0.5195, "step": 5403 }, { "epoch": 0.25762162419850787, "grad_norm": 1.386061429977417, "learning_rate": 4.763799235920109e-06, "loss": 0.8242, "step": 5404 }, { "epoch": 0.25766929659380733, "grad_norm": 1.486469030380249, "learning_rate": 4.760453196109394e-06, "loss": 0.6666, "step": 5405 }, { "epoch": 0.25771696898910684, "grad_norm": 1.8709571361541748, "learning_rate": 4.757107964706788e-06, "loss": 0.8234, "step": 5406 }, { "epoch": 0.25776464138440636, "grad_norm": 1.7033472061157227, "learning_rate": 4.753763542228433e-06, "loss": 0.858, "step": 5407 }, { "epoch": 0.25781231377970587, "grad_norm": 2.236379861831665, "learning_rate": 4.750419929190342e-06, "loss": 1.2991, "step": 5408 }, { "epoch": 0.2578599861750054, "grad_norm": 2.5572237968444824, "learning_rate": 4.7470771261083915e-06, "loss": 0.869, "step": 5409 }, { "epoch": 0.25790765857030484, "grad_norm": 1.9142961502075195, "learning_rate": 4.743735133498346e-06, "loss": 1.0696, "step": 5410 }, { "epoch": 0.25795533096560436, "grad_norm": 1.1364426612854004, "learning_rate": 4.740393951875843e-06, "loss": 0.6, "step": 5411 }, { "epoch": 0.25800300336090387, "grad_norm": 2.2477362155914307, "learning_rate": 4.737053581756387e-06, "loss": 1.0807, "step": 5412 }, { "epoch": 0.2580506757562034, "grad_norm": 1.9631001949310303, "learning_rate": 4.733714023655366e-06, "loss": 0.791, "step": 5413 }, { "epoch": 0.2580983481515029, "grad_norm": 2.146395444869995, "learning_rate": 4.730375278088042e-06, "loss": 0.6613, "step": 5414 }, { "epoch": 0.25814602054680236, "grad_norm": 1.670370101928711, "learning_rate": 4.727037345569542e-06, "loss": 0.912, "step": 5415 }, { "epoch": 0.25819369294210187, "grad_norm": 1.6563982963562012, "learning_rate": 4.723700226614882e-06, "loss": 0.9118, "step": 5416 }, { "epoch": 0.2582413653374014, "grad_norm": 3.493528366088867, "learning_rate": 4.7203639217389385e-06, "loss": 1.1187, "step": 5417 }, { "epoch": 0.2582890377327009, "grad_norm": 1.8224412202835083, "learning_rate": 4.717028431456475e-06, "loss": 0.5434, "step": 5418 }, { "epoch": 0.25833671012800036, "grad_norm": 2.786318778991699, "learning_rate": 4.713693756282118e-06, "loss": 0.8327, "step": 5419 }, { "epoch": 0.25838438252329987, "grad_norm": 4.453581809997559, "learning_rate": 4.710359896730379e-06, "loss": 1.0295, "step": 5420 }, { "epoch": 0.2584320549185994, "grad_norm": 1.4712653160095215, "learning_rate": 4.7070268533156315e-06, "loss": 0.6225, "step": 5421 }, { "epoch": 0.2584797273138989, "grad_norm": 1.6703158617019653, "learning_rate": 4.7036946265521335e-06, "loss": 0.7127, "step": 5422 }, { "epoch": 0.2585273997091984, "grad_norm": 1.2520533800125122, "learning_rate": 4.700363216954017e-06, "loss": 0.6962, "step": 5423 }, { "epoch": 0.25857507210449787, "grad_norm": 1.633542537689209, "learning_rate": 4.697032625035277e-06, "loss": 0.5982, "step": 5424 }, { "epoch": 0.2586227444997974, "grad_norm": 1.5557509660720825, "learning_rate": 4.693702851309793e-06, "loss": 0.3589, "step": 5425 }, { "epoch": 0.2586704168950969, "grad_norm": 1.874098300933838, "learning_rate": 4.690373896291318e-06, "loss": 0.5059, "step": 5426 }, { "epoch": 0.2587180892903964, "grad_norm": 1.8717933893203735, "learning_rate": 4.687045760493468e-06, "loss": 0.3555, "step": 5427 }, { "epoch": 0.25876576168569587, "grad_norm": 1.6686381101608276, "learning_rate": 4.683718444429746e-06, "loss": 0.6425, "step": 5428 }, { "epoch": 0.2588134340809954, "grad_norm": 1.3108073472976685, "learning_rate": 4.680391948613523e-06, "loss": 0.7588, "step": 5429 }, { "epoch": 0.2588611064762949, "grad_norm": 1.8829936981201172, "learning_rate": 4.677066273558038e-06, "loss": 0.8808, "step": 5430 }, { "epoch": 0.2589087788715944, "grad_norm": 0.9224647283554077, "learning_rate": 4.673741419776414e-06, "loss": 0.3131, "step": 5431 }, { "epoch": 0.25895645126689393, "grad_norm": 1.2959907054901123, "learning_rate": 4.670417387781638e-06, "loss": 0.7307, "step": 5432 }, { "epoch": 0.2590041236621934, "grad_norm": 1.546425700187683, "learning_rate": 4.6670941780865765e-06, "loss": 0.8808, "step": 5433 }, { "epoch": 0.2590517960574929, "grad_norm": 4.931987762451172, "learning_rate": 4.663771791203961e-06, "loss": 0.2057, "step": 5434 }, { "epoch": 0.2590994684527924, "grad_norm": 1.8837611675262451, "learning_rate": 4.660450227646407e-06, "loss": 1.178, "step": 5435 }, { "epoch": 0.25914714084809193, "grad_norm": 2.875595808029175, "learning_rate": 4.657129487926398e-06, "loss": 1.1141, "step": 5436 }, { "epoch": 0.2591948132433914, "grad_norm": 1.40999436378479, "learning_rate": 4.653809572556286e-06, "loss": 0.7087, "step": 5437 }, { "epoch": 0.2592424856386909, "grad_norm": 2.65108323097229, "learning_rate": 4.650490482048302e-06, "loss": 1.5518, "step": 5438 }, { "epoch": 0.2592901580339904, "grad_norm": 2.2369801998138428, "learning_rate": 4.647172216914551e-06, "loss": 1.1065, "step": 5439 }, { "epoch": 0.25933783042928993, "grad_norm": 1.1635276079177856, "learning_rate": 4.643854777666998e-06, "loss": 0.8449, "step": 5440 }, { "epoch": 0.25938550282458944, "grad_norm": 3.396414041519165, "learning_rate": 4.6405381648174976e-06, "loss": 0.6044, "step": 5441 }, { "epoch": 0.2594331752198889, "grad_norm": 2.0113134384155273, "learning_rate": 4.637222378877768e-06, "loss": 0.6767, "step": 5442 }, { "epoch": 0.2594808476151884, "grad_norm": 1.8615201711654663, "learning_rate": 4.633907420359397e-06, "loss": 0.8034, "step": 5443 }, { "epoch": 0.25952852001048793, "grad_norm": 2.0538718700408936, "learning_rate": 4.630593289773852e-06, "loss": 0.731, "step": 5444 }, { "epoch": 0.25957619240578744, "grad_norm": 2.7262766361236572, "learning_rate": 4.62727998763247e-06, "loss": 1.2903, "step": 5445 }, { "epoch": 0.25962386480108696, "grad_norm": 2.282655715942383, "learning_rate": 4.623967514446455e-06, "loss": 1.0324, "step": 5446 }, { "epoch": 0.2596715371963864, "grad_norm": 1.3803765773773193, "learning_rate": 4.620655870726893e-06, "loss": 0.825, "step": 5447 }, { "epoch": 0.25971920959168593, "grad_norm": 1.0821335315704346, "learning_rate": 4.617345056984734e-06, "loss": 0.553, "step": 5448 }, { "epoch": 0.25976688198698544, "grad_norm": 1.1088868379592896, "learning_rate": 4.614035073730798e-06, "loss": 0.5277, "step": 5449 }, { "epoch": 0.25981455438228496, "grad_norm": 4.25206184387207, "learning_rate": 4.610725921475786e-06, "loss": 0.4931, "step": 5450 }, { "epoch": 0.2598622267775844, "grad_norm": 2.424816370010376, "learning_rate": 4.60741760073027e-06, "loss": 0.6865, "step": 5451 }, { "epoch": 0.25990989917288393, "grad_norm": 0.9026040434837341, "learning_rate": 4.60411011200468e-06, "loss": 0.3713, "step": 5452 }, { "epoch": 0.25995757156818344, "grad_norm": 1.8744639158248901, "learning_rate": 4.600803455809334e-06, "loss": 0.5632, "step": 5453 }, { "epoch": 0.26000524396348296, "grad_norm": 1.4494572877883911, "learning_rate": 4.597497632654416e-06, "loss": 0.8144, "step": 5454 }, { "epoch": 0.26005291635878247, "grad_norm": 3.379366159439087, "learning_rate": 4.594192643049976e-06, "loss": 0.7011, "step": 5455 }, { "epoch": 0.26010058875408193, "grad_norm": 1.1887792348861694, "learning_rate": 4.590888487505941e-06, "loss": 0.7537, "step": 5456 }, { "epoch": 0.26014826114938144, "grad_norm": 1.5102972984313965, "learning_rate": 4.587585166532115e-06, "loss": 0.5196, "step": 5457 }, { "epoch": 0.26019593354468096, "grad_norm": 1.0989429950714111, "learning_rate": 4.584282680638155e-06, "loss": 0.4849, "step": 5458 }, { "epoch": 0.26024360593998047, "grad_norm": 1.5081100463867188, "learning_rate": 4.580981030333606e-06, "loss": 0.8332, "step": 5459 }, { "epoch": 0.26029127833527993, "grad_norm": 1.6823418140411377, "learning_rate": 4.577680216127885e-06, "loss": 1.0103, "step": 5460 }, { "epoch": 0.26033895073057944, "grad_norm": 2.515145778656006, "learning_rate": 4.574380238530262e-06, "loss": 0.6499, "step": 5461 }, { "epoch": 0.26038662312587896, "grad_norm": 1.6356525421142578, "learning_rate": 4.5710810980498996e-06, "loss": 0.5256, "step": 5462 }, { "epoch": 0.26043429552117847, "grad_norm": 5.0318732261657715, "learning_rate": 4.567782795195816e-06, "loss": 0.7241, "step": 5463 }, { "epoch": 0.260481967916478, "grad_norm": 1.9279476404190063, "learning_rate": 4.564485330476903e-06, "loss": 0.5398, "step": 5464 }, { "epoch": 0.26052964031177744, "grad_norm": 1.2882212400436401, "learning_rate": 4.561188704401929e-06, "loss": 0.6823, "step": 5465 }, { "epoch": 0.26057731270707696, "grad_norm": 1.5207080841064453, "learning_rate": 4.557892917479532e-06, "loss": 0.6403, "step": 5466 }, { "epoch": 0.26062498510237647, "grad_norm": 0.802099347114563, "learning_rate": 4.5545979702182105e-06, "loss": 0.4393, "step": 5467 }, { "epoch": 0.260672657497676, "grad_norm": 1.274107813835144, "learning_rate": 4.551303863126346e-06, "loss": 0.7238, "step": 5468 }, { "epoch": 0.2607203298929755, "grad_norm": 3.5149638652801514, "learning_rate": 4.5480105967121855e-06, "loss": 0.6169, "step": 5469 }, { "epoch": 0.26076800228827496, "grad_norm": 1.603751301765442, "learning_rate": 4.544718171483849e-06, "loss": 0.7579, "step": 5470 }, { "epoch": 0.26081567468357447, "grad_norm": 1.1629642248153687, "learning_rate": 4.541426587949315e-06, "loss": 0.3967, "step": 5471 }, { "epoch": 0.260863347078874, "grad_norm": 1.7083592414855957, "learning_rate": 4.538135846616447e-06, "loss": 0.6814, "step": 5472 }, { "epoch": 0.2609110194741735, "grad_norm": 1.3178696632385254, "learning_rate": 4.534845947992975e-06, "loss": 0.8221, "step": 5473 }, { "epoch": 0.26095869186947296, "grad_norm": 2.870692253112793, "learning_rate": 4.53155689258649e-06, "loss": 1.0803, "step": 5474 }, { "epoch": 0.2610063642647725, "grad_norm": 1.6962885856628418, "learning_rate": 4.528268680904465e-06, "loss": 0.6986, "step": 5475 }, { "epoch": 0.261054036660072, "grad_norm": 2.3852999210357666, "learning_rate": 4.524981313454232e-06, "loss": 1.0803, "step": 5476 }, { "epoch": 0.2611017090553715, "grad_norm": 1.1798049211502075, "learning_rate": 4.521694790743003e-06, "loss": 0.6418, "step": 5477 }, { "epoch": 0.261149381450671, "grad_norm": 1.8000279664993286, "learning_rate": 4.51840911327785e-06, "loss": 0.9297, "step": 5478 }, { "epoch": 0.2611970538459705, "grad_norm": 1.672968864440918, "learning_rate": 4.515124281565724e-06, "loss": 0.3185, "step": 5479 }, { "epoch": 0.26124472624127, "grad_norm": 1.692679524421692, "learning_rate": 4.511840296113434e-06, "loss": 1.0013, "step": 5480 }, { "epoch": 0.2612923986365695, "grad_norm": 1.3779551982879639, "learning_rate": 4.50855715742767e-06, "loss": 0.7715, "step": 5481 }, { "epoch": 0.261340071031869, "grad_norm": 1.5197752714157104, "learning_rate": 4.505274866014989e-06, "loss": 0.7554, "step": 5482 }, { "epoch": 0.2613877434271685, "grad_norm": 1.719963788986206, "learning_rate": 4.501993422381807e-06, "loss": 0.821, "step": 5483 }, { "epoch": 0.261435415822468, "grad_norm": 2.0611915588378906, "learning_rate": 4.4987128270344224e-06, "loss": 0.7306, "step": 5484 }, { "epoch": 0.2614830882177675, "grad_norm": 1.6833240985870361, "learning_rate": 4.4954330804790004e-06, "loss": 0.3735, "step": 5485 }, { "epoch": 0.261530760613067, "grad_norm": 2.8871450424194336, "learning_rate": 4.492154183221565e-06, "loss": 0.9497, "step": 5486 }, { "epoch": 0.26157843300836653, "grad_norm": 2.9115235805511475, "learning_rate": 4.488876135768017e-06, "loss": 1.0516, "step": 5487 }, { "epoch": 0.261626105403666, "grad_norm": 2.3396236896514893, "learning_rate": 4.485598938624133e-06, "loss": 0.6537, "step": 5488 }, { "epoch": 0.2616737777989655, "grad_norm": 1.2771376371383667, "learning_rate": 4.482322592295541e-06, "loss": 0.8405, "step": 5489 }, { "epoch": 0.261721450194265, "grad_norm": 1.1913001537322998, "learning_rate": 4.479047097287752e-06, "loss": 0.5927, "step": 5490 }, { "epoch": 0.26176912258956453, "grad_norm": 1.5550049543380737, "learning_rate": 4.475772454106144e-06, "loss": 0.8143, "step": 5491 }, { "epoch": 0.261816794984864, "grad_norm": 1.6769016981124878, "learning_rate": 4.47249866325596e-06, "loss": 0.7834, "step": 5492 }, { "epoch": 0.2618644673801635, "grad_norm": 2.9642839431762695, "learning_rate": 4.469225725242304e-06, "loss": 0.5986, "step": 5493 }, { "epoch": 0.261912139775463, "grad_norm": 1.7533074617385864, "learning_rate": 4.465953640570167e-06, "loss": 1.1534, "step": 5494 }, { "epoch": 0.26195981217076253, "grad_norm": 13.53782844543457, "learning_rate": 4.462682409744391e-06, "loss": 1.3399, "step": 5495 }, { "epoch": 0.26200748456606204, "grad_norm": 1.1808326244354248, "learning_rate": 4.459412033269695e-06, "loss": 0.5596, "step": 5496 }, { "epoch": 0.2620551569613615, "grad_norm": 1.5229246616363525, "learning_rate": 4.456142511650669e-06, "loss": 0.6204, "step": 5497 }, { "epoch": 0.262102829356661, "grad_norm": 2.106015205383301, "learning_rate": 4.452873845391759e-06, "loss": 0.809, "step": 5498 }, { "epoch": 0.26215050175196053, "grad_norm": 3.06536602973938, "learning_rate": 4.44960603499729e-06, "loss": 0.6219, "step": 5499 }, { "epoch": 0.26219817414726004, "grad_norm": 1.0425934791564941, "learning_rate": 4.4463390809714566e-06, "loss": 0.608, "step": 5500 }, { "epoch": 0.26224584654255956, "grad_norm": 2.1033732891082764, "learning_rate": 4.4430729838183065e-06, "loss": 0.9811, "step": 5501 }, { "epoch": 0.262293518937859, "grad_norm": 1.278491735458374, "learning_rate": 4.43980774404177e-06, "loss": 0.3289, "step": 5502 }, { "epoch": 0.26234119133315853, "grad_norm": 2.7843470573425293, "learning_rate": 4.436543362145643e-06, "loss": 0.813, "step": 5503 }, { "epoch": 0.26238886372845804, "grad_norm": 1.3629486560821533, "learning_rate": 4.433279838633581e-06, "loss": 0.8361, "step": 5504 }, { "epoch": 0.26243653612375756, "grad_norm": 1.8491785526275635, "learning_rate": 4.430017174009111e-06, "loss": 0.7298, "step": 5505 }, { "epoch": 0.262484208519057, "grad_norm": 1.9057613611221313, "learning_rate": 4.426755368775637e-06, "loss": 0.8231, "step": 5506 }, { "epoch": 0.26253188091435653, "grad_norm": 2.157076597213745, "learning_rate": 4.423494423436415e-06, "loss": 0.9327, "step": 5507 }, { "epoch": 0.26257955330965604, "grad_norm": 2.145273447036743, "learning_rate": 4.420234338494574e-06, "loss": 0.9537, "step": 5508 }, { "epoch": 0.26262722570495556, "grad_norm": 2.3533525466918945, "learning_rate": 4.416975114453114e-06, "loss": 0.9066, "step": 5509 }, { "epoch": 0.26267489810025507, "grad_norm": 2.4520652294158936, "learning_rate": 4.4137167518149025e-06, "loss": 0.821, "step": 5510 }, { "epoch": 0.26272257049555453, "grad_norm": 2.2237584590911865, "learning_rate": 4.410459251082666e-06, "loss": 0.7576, "step": 5511 }, { "epoch": 0.26277024289085404, "grad_norm": 5.048548221588135, "learning_rate": 4.407202612759005e-06, "loss": 0.4592, "step": 5512 }, { "epoch": 0.26281791528615356, "grad_norm": 1.1977540254592896, "learning_rate": 4.40394683734639e-06, "loss": 0.607, "step": 5513 }, { "epoch": 0.2628655876814531, "grad_norm": 1.9406508207321167, "learning_rate": 4.400691925347147e-06, "loss": 0.9795, "step": 5514 }, { "epoch": 0.26291326007675253, "grad_norm": 1.7332086563110352, "learning_rate": 4.397437877263478e-06, "loss": 0.7935, "step": 5515 }, { "epoch": 0.26296093247205204, "grad_norm": 1.489671230316162, "learning_rate": 4.394184693597452e-06, "loss": 0.9046, "step": 5516 }, { "epoch": 0.26300860486735156, "grad_norm": 1.7054924964904785, "learning_rate": 4.390932374850996e-06, "loss": 0.6331, "step": 5517 }, { "epoch": 0.2630562772626511, "grad_norm": 1.5960817337036133, "learning_rate": 4.387680921525912e-06, "loss": 0.9622, "step": 5518 }, { "epoch": 0.2631039496579506, "grad_norm": 2.127215623855591, "learning_rate": 4.38443033412387e-06, "loss": 1.2882, "step": 5519 }, { "epoch": 0.26315162205325004, "grad_norm": 1.7925151586532593, "learning_rate": 4.381180613146396e-06, "loss": 0.8053, "step": 5520 }, { "epoch": 0.26319929444854956, "grad_norm": 1.0914833545684814, "learning_rate": 4.377931759094892e-06, "loss": 0.3883, "step": 5521 }, { "epoch": 0.2632469668438491, "grad_norm": 1.1829966306686401, "learning_rate": 4.374683772470619e-06, "loss": 0.7149, "step": 5522 }, { "epoch": 0.2632946392391486, "grad_norm": 2.2585456371307373, "learning_rate": 4.371436653774714e-06, "loss": 0.8343, "step": 5523 }, { "epoch": 0.26334231163444805, "grad_norm": 2.3260622024536133, "learning_rate": 4.368190403508167e-06, "loss": 0.531, "step": 5524 }, { "epoch": 0.26338998402974756, "grad_norm": 2.0901191234588623, "learning_rate": 4.364945022171847e-06, "loss": 0.6537, "step": 5525 }, { "epoch": 0.2634376564250471, "grad_norm": 4.0685505867004395, "learning_rate": 4.361700510266477e-06, "loss": 1.0741, "step": 5526 }, { "epoch": 0.2634853288203466, "grad_norm": 1.9662244319915771, "learning_rate": 4.3584568682926555e-06, "loss": 1.2344, "step": 5527 }, { "epoch": 0.2635330012156461, "grad_norm": 1.5806111097335815, "learning_rate": 4.355214096750846e-06, "loss": 0.7201, "step": 5528 }, { "epoch": 0.26358067361094556, "grad_norm": 2.1054883003234863, "learning_rate": 4.351972196141368e-06, "loss": 1.0543, "step": 5529 }, { "epoch": 0.2636283460062451, "grad_norm": 1.5422677993774414, "learning_rate": 4.348731166964415e-06, "loss": 0.8393, "step": 5530 }, { "epoch": 0.2636760184015446, "grad_norm": 3.49763822555542, "learning_rate": 4.345491009720052e-06, "loss": 0.7909, "step": 5531 }, { "epoch": 0.2637236907968441, "grad_norm": 3.4069478511810303, "learning_rate": 4.342251724908191e-06, "loss": 1.0201, "step": 5532 }, { "epoch": 0.2637713631921436, "grad_norm": 1.5217338800430298, "learning_rate": 4.339013313028626e-06, "loss": 0.3428, "step": 5533 }, { "epoch": 0.2638190355874431, "grad_norm": 2.437505006790161, "learning_rate": 4.3357757745810126e-06, "loss": 1.3109, "step": 5534 }, { "epoch": 0.2638667079827426, "grad_norm": 2.4502081871032715, "learning_rate": 4.332539110064864e-06, "loss": 0.8723, "step": 5535 }, { "epoch": 0.2639143803780421, "grad_norm": 1.987545371055603, "learning_rate": 4.329303319979571e-06, "loss": 1.0509, "step": 5536 }, { "epoch": 0.2639620527733416, "grad_norm": 2.6507227420806885, "learning_rate": 4.326068404824375e-06, "loss": 0.7547, "step": 5537 }, { "epoch": 0.2640097251686411, "grad_norm": 1.219498872756958, "learning_rate": 4.322834365098398e-06, "loss": 0.9566, "step": 5538 }, { "epoch": 0.2640573975639406, "grad_norm": 2.285651683807373, "learning_rate": 4.319601201300611e-06, "loss": 0.5146, "step": 5539 }, { "epoch": 0.2641050699592401, "grad_norm": 1.547282099723816, "learning_rate": 4.316368913929864e-06, "loss": 0.872, "step": 5540 }, { "epoch": 0.2641527423545396, "grad_norm": 1.4937717914581299, "learning_rate": 4.3131375034848624e-06, "loss": 0.3426, "step": 5541 }, { "epoch": 0.26420041474983913, "grad_norm": 3.798511505126953, "learning_rate": 4.30990697046418e-06, "loss": 0.4038, "step": 5542 }, { "epoch": 0.2642480871451386, "grad_norm": 2.3030643463134766, "learning_rate": 4.306677315366258e-06, "loss": 0.4771, "step": 5543 }, { "epoch": 0.2642957595404381, "grad_norm": 2.7596776485443115, "learning_rate": 4.303448538689393e-06, "loss": 0.8721, "step": 5544 }, { "epoch": 0.2643434319357376, "grad_norm": 2.4076898097991943, "learning_rate": 4.300220640931756e-06, "loss": 1.0586, "step": 5545 }, { "epoch": 0.26439110433103713, "grad_norm": 1.0083461999893188, "learning_rate": 4.296993622591377e-06, "loss": 0.5221, "step": 5546 }, { "epoch": 0.2644387767263366, "grad_norm": 3.0331618785858154, "learning_rate": 4.293767484166157e-06, "loss": 1.1717, "step": 5547 }, { "epoch": 0.2644864491216361, "grad_norm": 1.4020642042160034, "learning_rate": 4.290542226153847e-06, "loss": 0.7718, "step": 5548 }, { "epoch": 0.2645341215169356, "grad_norm": 1.8946406841278076, "learning_rate": 4.287317849052075e-06, "loss": 0.6381, "step": 5549 }, { "epoch": 0.26458179391223513, "grad_norm": 1.953478217124939, "learning_rate": 4.284094353358334e-06, "loss": 0.7938, "step": 5550 }, { "epoch": 0.26462946630753464, "grad_norm": 1.3801584243774414, "learning_rate": 4.280871739569972e-06, "loss": 0.7182, "step": 5551 }, { "epoch": 0.2646771387028341, "grad_norm": 2.052669048309326, "learning_rate": 4.277650008184201e-06, "loss": 0.9194, "step": 5552 }, { "epoch": 0.2647248110981336, "grad_norm": 2.213862895965576, "learning_rate": 4.274429159698109e-06, "loss": 0.4967, "step": 5553 }, { "epoch": 0.26477248349343313, "grad_norm": 1.3609672784805298, "learning_rate": 4.271209194608631e-06, "loss": 0.7002, "step": 5554 }, { "epoch": 0.26482015588873264, "grad_norm": 2.0467896461486816, "learning_rate": 4.26799011341258e-06, "loss": 0.6498, "step": 5555 }, { "epoch": 0.26486782828403216, "grad_norm": 1.2618590593338013, "learning_rate": 4.26477191660663e-06, "loss": 0.5392, "step": 5556 }, { "epoch": 0.2649155006793316, "grad_norm": 3.4503111839294434, "learning_rate": 4.261554604687308e-06, "loss": 0.5836, "step": 5557 }, { "epoch": 0.26496317307463113, "grad_norm": 3.568570613861084, "learning_rate": 4.2583381781510156e-06, "loss": 0.6466, "step": 5558 }, { "epoch": 0.26501084546993064, "grad_norm": 2.2931630611419678, "learning_rate": 4.255122637494018e-06, "loss": 1.0741, "step": 5559 }, { "epoch": 0.26505851786523016, "grad_norm": 1.4329898357391357, "learning_rate": 4.251907983212435e-06, "loss": 1.2203, "step": 5560 }, { "epoch": 0.2651061902605296, "grad_norm": 1.5274324417114258, "learning_rate": 4.248694215802254e-06, "loss": 0.7771, "step": 5561 }, { "epoch": 0.26515386265582913, "grad_norm": 2.9416232109069824, "learning_rate": 4.245481335759333e-06, "loss": 0.31, "step": 5562 }, { "epoch": 0.26520153505112865, "grad_norm": 2.5782439708709717, "learning_rate": 4.2422693435793785e-06, "loss": 0.2944, "step": 5563 }, { "epoch": 0.26524920744642816, "grad_norm": 1.7043336629867554, "learning_rate": 4.23905823975797e-06, "loss": 0.9789, "step": 5564 }, { "epoch": 0.2652968798417277, "grad_norm": 1.4189471006393433, "learning_rate": 4.2358480247905535e-06, "loss": 0.1828, "step": 5565 }, { "epoch": 0.26534455223702713, "grad_norm": 1.8899550437927246, "learning_rate": 4.2326386991724235e-06, "loss": 0.885, "step": 5566 }, { "epoch": 0.26539222463232665, "grad_norm": 1.5510401725769043, "learning_rate": 4.229430263398754e-06, "loss": 0.7526, "step": 5567 }, { "epoch": 0.26543989702762616, "grad_norm": 6.7725982666015625, "learning_rate": 4.2262227179645685e-06, "loss": 1.264, "step": 5568 }, { "epoch": 0.2654875694229257, "grad_norm": 1.9495378732681274, "learning_rate": 4.2230160633647565e-06, "loss": 0.5061, "step": 5569 }, { "epoch": 0.26553524181822513, "grad_norm": 1.143641471862793, "learning_rate": 4.2198103000940735e-06, "loss": 0.7493, "step": 5570 }, { "epoch": 0.26558291421352465, "grad_norm": 1.6225953102111816, "learning_rate": 4.216605428647141e-06, "loss": 0.51, "step": 5571 }, { "epoch": 0.26563058660882416, "grad_norm": 1.806950330734253, "learning_rate": 4.213401449518431e-06, "loss": 0.5473, "step": 5572 }, { "epoch": 0.2656782590041237, "grad_norm": 1.0339572429656982, "learning_rate": 4.210198363202286e-06, "loss": 0.422, "step": 5573 }, { "epoch": 0.2657259313994232, "grad_norm": 2.8314456939697266, "learning_rate": 4.206996170192913e-06, "loss": 0.6091, "step": 5574 }, { "epoch": 0.26577360379472265, "grad_norm": 1.6317073106765747, "learning_rate": 4.203794870984371e-06, "loss": 0.9413, "step": 5575 }, { "epoch": 0.26582127619002216, "grad_norm": 4.767714500427246, "learning_rate": 4.200594466070592e-06, "loss": 1.8776, "step": 5576 }, { "epoch": 0.2658689485853217, "grad_norm": 1.2538121938705444, "learning_rate": 4.197394955945368e-06, "loss": 0.9933, "step": 5577 }, { "epoch": 0.2659166209806212, "grad_norm": 1.630173683166504, "learning_rate": 4.1941963411023425e-06, "loss": 0.8027, "step": 5578 }, { "epoch": 0.26596429337592065, "grad_norm": 1.1768262386322021, "learning_rate": 4.190998622035034e-06, "loss": 0.5997, "step": 5579 }, { "epoch": 0.26601196577122016, "grad_norm": 1.3510141372680664, "learning_rate": 4.1878017992368205e-06, "loss": 0.6135, "step": 5580 }, { "epoch": 0.2660596381665197, "grad_norm": 1.6025506258010864, "learning_rate": 4.184605873200932e-06, "loss": 0.8922, "step": 5581 }, { "epoch": 0.2661073105618192, "grad_norm": 1.844774603843689, "learning_rate": 4.181410844420473e-06, "loss": 0.4993, "step": 5582 }, { "epoch": 0.2661549829571187, "grad_norm": 1.4028141498565674, "learning_rate": 4.1782167133883985e-06, "loss": 0.6382, "step": 5583 }, { "epoch": 0.26620265535241816, "grad_norm": 2.228483200073242, "learning_rate": 4.1750234805975355e-06, "loss": 0.4291, "step": 5584 }, { "epoch": 0.2662503277477177, "grad_norm": 1.0068373680114746, "learning_rate": 4.17183114654056e-06, "loss": 0.2973, "step": 5585 }, { "epoch": 0.2662980001430172, "grad_norm": 3.4178378582000732, "learning_rate": 4.168639711710019e-06, "loss": 0.5526, "step": 5586 }, { "epoch": 0.2663456725383167, "grad_norm": 1.0401500463485718, "learning_rate": 4.165449176598325e-06, "loss": 0.574, "step": 5587 }, { "epoch": 0.2663933449336162, "grad_norm": 2.2149927616119385, "learning_rate": 4.162259541697734e-06, "loss": 0.1844, "step": 5588 }, { "epoch": 0.2664410173289157, "grad_norm": 1.817918062210083, "learning_rate": 4.159070807500378e-06, "loss": 0.5699, "step": 5589 }, { "epoch": 0.2664886897242152, "grad_norm": 3.1390933990478516, "learning_rate": 4.155882974498251e-06, "loss": 0.4993, "step": 5590 }, { "epoch": 0.2665363621195147, "grad_norm": 1.4505109786987305, "learning_rate": 4.152696043183194e-06, "loss": 0.6816, "step": 5591 }, { "epoch": 0.2665840345148142, "grad_norm": 1.4374276399612427, "learning_rate": 4.149510014046922e-06, "loss": 0.7264, "step": 5592 }, { "epoch": 0.2666317069101137, "grad_norm": 1.9963061809539795, "learning_rate": 4.14632488758101e-06, "loss": 1.2008, "step": 5593 }, { "epoch": 0.2666793793054132, "grad_norm": 1.6760303974151611, "learning_rate": 4.143140664276884e-06, "loss": 0.6839, "step": 5594 }, { "epoch": 0.2667270517007127, "grad_norm": 2.7018444538116455, "learning_rate": 4.139957344625843e-06, "loss": 0.3376, "step": 5595 }, { "epoch": 0.2667747240960122, "grad_norm": 1.9463860988616943, "learning_rate": 4.136774929119033e-06, "loss": 0.6627, "step": 5596 }, { "epoch": 0.26682239649131173, "grad_norm": 1.260648250579834, "learning_rate": 4.133593418247474e-06, "loss": 0.6292, "step": 5597 }, { "epoch": 0.2668700688866112, "grad_norm": 1.5680603981018066, "learning_rate": 4.130412812502037e-06, "loss": 0.5229, "step": 5598 }, { "epoch": 0.2669177412819107, "grad_norm": 1.5625911951065063, "learning_rate": 4.12723311237346e-06, "loss": 0.6718, "step": 5599 }, { "epoch": 0.2669654136772102, "grad_norm": 3.8781931400299072, "learning_rate": 4.124054318352333e-06, "loss": 0.699, "step": 5600 }, { "epoch": 0.26701308607250973, "grad_norm": 3.590725898742676, "learning_rate": 4.120876430929115e-06, "loss": 1.5023, "step": 5601 }, { "epoch": 0.2670607584678092, "grad_norm": 5.640691757202148, "learning_rate": 4.117699450594122e-06, "loss": 0.4176, "step": 5602 }, { "epoch": 0.2671084308631087, "grad_norm": 2.2057461738586426, "learning_rate": 4.114523377837526e-06, "loss": 0.7682, "step": 5603 }, { "epoch": 0.2671561032584082, "grad_norm": 2.3418283462524414, "learning_rate": 4.1113482131493635e-06, "loss": 1.202, "step": 5604 }, { "epoch": 0.26720377565370773, "grad_norm": 1.4537042379379272, "learning_rate": 4.108173957019534e-06, "loss": 0.7473, "step": 5605 }, { "epoch": 0.26725144804900725, "grad_norm": 1.261121392250061, "learning_rate": 4.1050006099377846e-06, "loss": 0.6996, "step": 5606 }, { "epoch": 0.2672991204443067, "grad_norm": 2.5251898765563965, "learning_rate": 4.101828172393734e-06, "loss": 0.7216, "step": 5607 }, { "epoch": 0.2673467928396062, "grad_norm": 1.6198338270187378, "learning_rate": 4.098656644876863e-06, "loss": 0.7462, "step": 5608 }, { "epoch": 0.26739446523490573, "grad_norm": 2.2518017292022705, "learning_rate": 4.095486027876494e-06, "loss": 0.6152, "step": 5609 }, { "epoch": 0.26744213763020525, "grad_norm": 3.3212130069732666, "learning_rate": 4.0923163218818265e-06, "loss": 0.5339, "step": 5610 }, { "epoch": 0.2674898100255047, "grad_norm": 2.3660006523132324, "learning_rate": 4.089147527381917e-06, "loss": 0.6379, "step": 5611 }, { "epoch": 0.2675374824208042, "grad_norm": 1.339280605316162, "learning_rate": 4.085979644865674e-06, "loss": 0.6574, "step": 5612 }, { "epoch": 0.26758515481610373, "grad_norm": 1.9048956632614136, "learning_rate": 4.082812674821865e-06, "loss": 1.1436, "step": 5613 }, { "epoch": 0.26763282721140325, "grad_norm": 1.1836352348327637, "learning_rate": 4.079646617739129e-06, "loss": 0.4625, "step": 5614 }, { "epoch": 0.26768049960670276, "grad_norm": 1.1403794288635254, "learning_rate": 4.076481474105949e-06, "loss": 0.5532, "step": 5615 }, { "epoch": 0.2677281720020022, "grad_norm": 1.2208589315414429, "learning_rate": 4.073317244410677e-06, "loss": 0.6893, "step": 5616 }, { "epoch": 0.26777584439730173, "grad_norm": 1.7238335609436035, "learning_rate": 4.070153929141524e-06, "loss": 0.7698, "step": 5617 }, { "epoch": 0.26782351679260125, "grad_norm": 1.5462297201156616, "learning_rate": 4.066991528786551e-06, "loss": 0.5943, "step": 5618 }, { "epoch": 0.26787118918790076, "grad_norm": 1.7230587005615234, "learning_rate": 4.063830043833688e-06, "loss": 0.7638, "step": 5619 }, { "epoch": 0.2679188615832003, "grad_norm": 1.6850359439849854, "learning_rate": 4.060669474770716e-06, "loss": 0.5414, "step": 5620 }, { "epoch": 0.26796653397849973, "grad_norm": 1.662850260734558, "learning_rate": 4.057509822085286e-06, "loss": 0.5417, "step": 5621 }, { "epoch": 0.26801420637379925, "grad_norm": 1.8192800283432007, "learning_rate": 4.054351086264891e-06, "loss": 0.7306, "step": 5622 }, { "epoch": 0.26806187876909876, "grad_norm": 1.3098952770233154, "learning_rate": 4.051193267796894e-06, "loss": 0.7311, "step": 5623 }, { "epoch": 0.2681095511643983, "grad_norm": 1.6049048900604248, "learning_rate": 4.048036367168521e-06, "loss": 0.8549, "step": 5624 }, { "epoch": 0.26815722355969773, "grad_norm": 1.8448766469955444, "learning_rate": 4.0448803848668374e-06, "loss": 1.2292, "step": 5625 }, { "epoch": 0.26820489595499725, "grad_norm": 1.0000351667404175, "learning_rate": 4.0417253213787885e-06, "loss": 0.2914, "step": 5626 }, { "epoch": 0.26825256835029676, "grad_norm": 1.6012147665023804, "learning_rate": 4.038571177191164e-06, "loss": 0.6561, "step": 5627 }, { "epoch": 0.2683002407455963, "grad_norm": 2.003044605255127, "learning_rate": 4.035417952790613e-06, "loss": 0.7424, "step": 5628 }, { "epoch": 0.2683479131408958, "grad_norm": 1.7002331018447876, "learning_rate": 4.032265648663649e-06, "loss": 0.7706, "step": 5629 }, { "epoch": 0.26839558553619525, "grad_norm": 3.057119131088257, "learning_rate": 4.029114265296642e-06, "loss": 1.4352, "step": 5630 }, { "epoch": 0.26844325793149476, "grad_norm": 1.6593033075332642, "learning_rate": 4.025963803175813e-06, "loss": 0.5818, "step": 5631 }, { "epoch": 0.2684909303267943, "grad_norm": 1.637102484703064, "learning_rate": 4.022814262787248e-06, "loss": 0.8918, "step": 5632 }, { "epoch": 0.2685386027220938, "grad_norm": 1.545956015586853, "learning_rate": 4.0196656446168925e-06, "loss": 0.7567, "step": 5633 }, { "epoch": 0.26858627511739325, "grad_norm": 1.71291983127594, "learning_rate": 4.01651794915054e-06, "loss": 0.9417, "step": 5634 }, { "epoch": 0.26863394751269276, "grad_norm": 4.601542949676514, "learning_rate": 4.013371176873849e-06, "loss": 1.1523, "step": 5635 }, { "epoch": 0.2686816199079923, "grad_norm": 1.7521055936813354, "learning_rate": 4.0102253282723394e-06, "loss": 0.8477, "step": 5636 }, { "epoch": 0.2687292923032918, "grad_norm": 1.587024211883545, "learning_rate": 4.007080403831374e-06, "loss": 0.5925, "step": 5637 }, { "epoch": 0.2687769646985913, "grad_norm": 1.24383544921875, "learning_rate": 4.003936404036188e-06, "loss": 0.6706, "step": 5638 }, { "epoch": 0.26882463709389076, "grad_norm": 2.295158863067627, "learning_rate": 4.000793329371872e-06, "loss": 0.5453, "step": 5639 }, { "epoch": 0.2688723094891903, "grad_norm": 3.3943676948547363, "learning_rate": 3.99765118032336e-06, "loss": 1.2696, "step": 5640 }, { "epoch": 0.2689199818844898, "grad_norm": 2.7188167572021484, "learning_rate": 3.9945099573754635e-06, "loss": 1.3469, "step": 5641 }, { "epoch": 0.2689676542797893, "grad_norm": 1.985461950302124, "learning_rate": 3.991369661012831e-06, "loss": 0.3006, "step": 5642 }, { "epoch": 0.2690153266750888, "grad_norm": 1.2259762287139893, "learning_rate": 3.988230291719987e-06, "loss": 0.6294, "step": 5643 }, { "epoch": 0.2690629990703883, "grad_norm": 1.5396753549575806, "learning_rate": 3.9850918499812976e-06, "loss": 0.5473, "step": 5644 }, { "epoch": 0.2691106714656878, "grad_norm": 1.002305269241333, "learning_rate": 3.981954336280996e-06, "loss": 0.6129, "step": 5645 }, { "epoch": 0.2691583438609873, "grad_norm": 3.2131707668304443, "learning_rate": 3.978817751103163e-06, "loss": 0.8664, "step": 5646 }, { "epoch": 0.2692060162562868, "grad_norm": 1.183082938194275, "learning_rate": 3.975682094931747e-06, "loss": 0.2161, "step": 5647 }, { "epoch": 0.2692536886515863, "grad_norm": 1.8746246099472046, "learning_rate": 3.972547368250547e-06, "loss": 0.6282, "step": 5648 }, { "epoch": 0.2693013610468858, "grad_norm": 1.2480121850967407, "learning_rate": 3.969413571543214e-06, "loss": 0.7862, "step": 5649 }, { "epoch": 0.2693490334421853, "grad_norm": 1.6659002304077148, "learning_rate": 3.9662807052932625e-06, "loss": 1.1763, "step": 5650 }, { "epoch": 0.2693967058374848, "grad_norm": 1.125238299369812, "learning_rate": 3.963148769984069e-06, "loss": 0.6492, "step": 5651 }, { "epoch": 0.26944437823278433, "grad_norm": 2.436222553253174, "learning_rate": 3.960017766098847e-06, "loss": 1.1086, "step": 5652 }, { "epoch": 0.2694920506280838, "grad_norm": 1.7079180479049683, "learning_rate": 3.956887694120685e-06, "loss": 0.3463, "step": 5653 }, { "epoch": 0.2695397230233833, "grad_norm": 2.0221309661865234, "learning_rate": 3.953758554532523e-06, "loss": 0.9493, "step": 5654 }, { "epoch": 0.2695873954186828, "grad_norm": 1.6272809505462646, "learning_rate": 3.950630347817148e-06, "loss": 0.6734, "step": 5655 }, { "epoch": 0.26963506781398233, "grad_norm": 1.688293695449829, "learning_rate": 3.947503074457219e-06, "loss": 0.9494, "step": 5656 }, { "epoch": 0.2696827402092818, "grad_norm": 2.0697226524353027, "learning_rate": 3.9443767349352315e-06, "loss": 1.1549, "step": 5657 }, { "epoch": 0.2697304126045813, "grad_norm": 3.1395578384399414, "learning_rate": 3.9412513297335574e-06, "loss": 0.7576, "step": 5658 }, { "epoch": 0.2697780849998808, "grad_norm": 0.9031580090522766, "learning_rate": 3.938126859334407e-06, "loss": 0.6, "step": 5659 }, { "epoch": 0.26982575739518033, "grad_norm": 1.6896086931228638, "learning_rate": 3.935003324219856e-06, "loss": 0.3832, "step": 5660 }, { "epoch": 0.26987342979047985, "grad_norm": 2.8483963012695312, "learning_rate": 3.931880724871838e-06, "loss": 0.5517, "step": 5661 }, { "epoch": 0.2699211021857793, "grad_norm": 1.4200193881988525, "learning_rate": 3.928759061772132e-06, "loss": 0.6245, "step": 5662 }, { "epoch": 0.2699687745810788, "grad_norm": 3.688634157180786, "learning_rate": 3.9256383354023804e-06, "loss": 1.1761, "step": 5663 }, { "epoch": 0.27001644697637833, "grad_norm": 1.4372247457504272, "learning_rate": 3.922518546244084e-06, "loss": 0.5194, "step": 5664 }, { "epoch": 0.27006411937167785, "grad_norm": 2.5982208251953125, "learning_rate": 3.919399694778586e-06, "loss": 0.7174, "step": 5665 }, { "epoch": 0.2701117917669773, "grad_norm": 1.8281327486038208, "learning_rate": 3.916281781487098e-06, "loss": 0.9557, "step": 5666 }, { "epoch": 0.2701594641622768, "grad_norm": 1.798475980758667, "learning_rate": 3.913164806850683e-06, "loss": 0.9973, "step": 5667 }, { "epoch": 0.27020713655757633, "grad_norm": 1.6262587308883667, "learning_rate": 3.910048771350253e-06, "loss": 0.7448, "step": 5668 }, { "epoch": 0.27025480895287585, "grad_norm": 1.2724647521972656, "learning_rate": 3.906933675466584e-06, "loss": 0.8324, "step": 5669 }, { "epoch": 0.27030248134817536, "grad_norm": 1.2882195711135864, "learning_rate": 3.9038195196803055e-06, "loss": 0.3717, "step": 5670 }, { "epoch": 0.2703501537434748, "grad_norm": 2.791334867477417, "learning_rate": 3.900706304471896e-06, "loss": 0.5472, "step": 5671 }, { "epoch": 0.27039782613877433, "grad_norm": 1.3858510255813599, "learning_rate": 3.89759403032169e-06, "loss": 1.0581, "step": 5672 }, { "epoch": 0.27044549853407385, "grad_norm": 1.329193115234375, "learning_rate": 3.8944826977098856e-06, "loss": 0.662, "step": 5673 }, { "epoch": 0.27049317092937336, "grad_norm": 5.558101177215576, "learning_rate": 3.891372307116523e-06, "loss": 0.9928, "step": 5674 }, { "epoch": 0.2705408433246729, "grad_norm": 1.665878415107727, "learning_rate": 3.888262859021508e-06, "loss": 0.6122, "step": 5675 }, { "epoch": 0.27058851571997233, "grad_norm": 1.253283977508545, "learning_rate": 3.885154353904598e-06, "loss": 0.5924, "step": 5676 }, { "epoch": 0.27063618811527185, "grad_norm": 1.6796064376831055, "learning_rate": 3.882046792245395e-06, "loss": 0.8122, "step": 5677 }, { "epoch": 0.27068386051057136, "grad_norm": 1.6100490093231201, "learning_rate": 3.878940174523371e-06, "loss": 0.5877, "step": 5678 }, { "epoch": 0.2707315329058709, "grad_norm": 1.7721821069717407, "learning_rate": 3.875834501217847e-06, "loss": 1.0471, "step": 5679 }, { "epoch": 0.27077920530117033, "grad_norm": 1.3024680614471436, "learning_rate": 3.872729772807989e-06, "loss": 0.8119, "step": 5680 }, { "epoch": 0.27082687769646985, "grad_norm": 1.2156068086624146, "learning_rate": 3.869625989772828e-06, "loss": 0.5751, "step": 5681 }, { "epoch": 0.27087455009176936, "grad_norm": 4.203164100646973, "learning_rate": 3.8665231525912505e-06, "loss": 1.2568, "step": 5682 }, { "epoch": 0.2709222224870689, "grad_norm": 2.334987163543701, "learning_rate": 3.863421261741983e-06, "loss": 1.3586, "step": 5683 }, { "epoch": 0.2709698948823684, "grad_norm": 2.066728353500366, "learning_rate": 3.860320317703622e-06, "loss": 0.8259, "step": 5684 }, { "epoch": 0.27101756727766785, "grad_norm": 1.5613700151443481, "learning_rate": 3.857220320954612e-06, "loss": 0.5541, "step": 5685 }, { "epoch": 0.27106523967296736, "grad_norm": 2.7656795978546143, "learning_rate": 3.854121271973245e-06, "loss": 1.0216, "step": 5686 }, { "epoch": 0.2711129120682669, "grad_norm": 1.158886194229126, "learning_rate": 3.851023171237678e-06, "loss": 0.4278, "step": 5687 }, { "epoch": 0.2711605844635664, "grad_norm": 3.701150894165039, "learning_rate": 3.8479260192259135e-06, "loss": 1.0564, "step": 5688 }, { "epoch": 0.27120825685886585, "grad_norm": 1.521830677986145, "learning_rate": 3.844829816415808e-06, "loss": 1.1317, "step": 5689 }, { "epoch": 0.27125592925416536, "grad_norm": 2.0760726928710938, "learning_rate": 3.841734563285076e-06, "loss": 0.8162, "step": 5690 }, { "epoch": 0.2713036016494649, "grad_norm": 1.6996026039123535, "learning_rate": 3.8386402603112845e-06, "loss": 0.8117, "step": 5691 }, { "epoch": 0.2713512740447644, "grad_norm": 2.6809940338134766, "learning_rate": 3.835546907971849e-06, "loss": 0.7544, "step": 5692 }, { "epoch": 0.2713989464400639, "grad_norm": 13.870550155639648, "learning_rate": 3.832454506744043e-06, "loss": 0.1932, "step": 5693 }, { "epoch": 0.27144661883536336, "grad_norm": 2.608093500137329, "learning_rate": 3.829363057104998e-06, "loss": 0.523, "step": 5694 }, { "epoch": 0.2714942912306629, "grad_norm": 2.375333786010742, "learning_rate": 3.8262725595316845e-06, "loss": 0.833, "step": 5695 }, { "epoch": 0.2715419636259624, "grad_norm": 2.1272153854370117, "learning_rate": 3.823183014500937e-06, "loss": 0.8474, "step": 5696 }, { "epoch": 0.2715896360212619, "grad_norm": 2.915160894393921, "learning_rate": 3.820094422489442e-06, "loss": 1.1539, "step": 5697 }, { "epoch": 0.27163730841656136, "grad_norm": 1.6739840507507324, "learning_rate": 3.81700678397374e-06, "loss": 0.7797, "step": 5698 }, { "epoch": 0.2716849808118609, "grad_norm": 1.1327998638153076, "learning_rate": 3.813920099430215e-06, "loss": 0.6174, "step": 5699 }, { "epoch": 0.2717326532071604, "grad_norm": 1.7080106735229492, "learning_rate": 3.810834369335118e-06, "loss": 0.6037, "step": 5700 }, { "epoch": 0.2717803256024599, "grad_norm": 1.248401165008545, "learning_rate": 3.8077495941645392e-06, "loss": 0.2708, "step": 5701 }, { "epoch": 0.2718279979977594, "grad_norm": 1.445685863494873, "learning_rate": 3.8046657743944327e-06, "loss": 0.6664, "step": 5702 }, { "epoch": 0.2718756703930589, "grad_norm": 1.719103455543518, "learning_rate": 3.801582910500594e-06, "loss": 0.6413, "step": 5703 }, { "epoch": 0.2719233427883584, "grad_norm": 1.417418122291565, "learning_rate": 3.7985010029586856e-06, "loss": 0.7251, "step": 5704 }, { "epoch": 0.2719710151836579, "grad_norm": 3.2522764205932617, "learning_rate": 3.795420052244205e-06, "loss": 1.0004, "step": 5705 }, { "epoch": 0.2720186875789574, "grad_norm": 1.6385494470596313, "learning_rate": 3.7923400588325156e-06, "loss": 0.7247, "step": 5706 }, { "epoch": 0.27206635997425693, "grad_norm": 1.4474941492080688, "learning_rate": 3.7892610231988313e-06, "loss": 0.3648, "step": 5707 }, { "epoch": 0.2721140323695564, "grad_norm": 1.151902198791504, "learning_rate": 3.786182945818211e-06, "loss": 0.4966, "step": 5708 }, { "epoch": 0.2721617047648559, "grad_norm": 1.6308925151824951, "learning_rate": 3.7831058271655707e-06, "loss": 0.8752, "step": 5709 }, { "epoch": 0.2722093771601554, "grad_norm": 4.2674665451049805, "learning_rate": 3.7800296677156844e-06, "loss": 1.0404, "step": 5710 }, { "epoch": 0.27225704955545493, "grad_norm": 2.5260775089263916, "learning_rate": 3.7769544679431624e-06, "loss": 0.4908, "step": 5711 }, { "epoch": 0.2723047219507544, "grad_norm": 6.458588600158691, "learning_rate": 3.773880228322482e-06, "loss": 1.0295, "step": 5712 }, { "epoch": 0.2723523943460539, "grad_norm": 1.3752353191375732, "learning_rate": 3.7708069493279687e-06, "loss": 0.8503, "step": 5713 }, { "epoch": 0.2724000667413534, "grad_norm": 1.3675073385238647, "learning_rate": 3.7677346314337913e-06, "loss": 0.8296, "step": 5714 }, { "epoch": 0.27244773913665293, "grad_norm": 3.524670362472534, "learning_rate": 3.7646632751139844e-06, "loss": 0.9025, "step": 5715 }, { "epoch": 0.27249541153195245, "grad_norm": 2.0363285541534424, "learning_rate": 3.7615928808424184e-06, "loss": 0.6357, "step": 5716 }, { "epoch": 0.2725430839272519, "grad_norm": 2.8297040462493896, "learning_rate": 3.7585234490928313e-06, "loss": 0.6437, "step": 5717 }, { "epoch": 0.2725907563225514, "grad_norm": 1.991547703742981, "learning_rate": 3.7554549803387984e-06, "loss": 0.4893, "step": 5718 }, { "epoch": 0.27263842871785093, "grad_norm": 1.7219820022583008, "learning_rate": 3.7523874750537593e-06, "loss": 0.9517, "step": 5719 }, { "epoch": 0.27268610111315045, "grad_norm": 2.149207353591919, "learning_rate": 3.7493209337109904e-06, "loss": 0.7515, "step": 5720 }, { "epoch": 0.2727337735084499, "grad_norm": 1.328798532485962, "learning_rate": 3.7462553567836324e-06, "loss": 1.0679, "step": 5721 }, { "epoch": 0.2727814459037494, "grad_norm": 2.60494065284729, "learning_rate": 3.743190744744675e-06, "loss": 0.5135, "step": 5722 }, { "epoch": 0.27282911829904893, "grad_norm": 1.3431496620178223, "learning_rate": 3.740127098066949e-06, "loss": 0.6711, "step": 5723 }, { "epoch": 0.27287679069434845, "grad_norm": 1.6929900646209717, "learning_rate": 3.7370644172231485e-06, "loss": 0.5891, "step": 5724 }, { "epoch": 0.27292446308964796, "grad_norm": 3.088587999343872, "learning_rate": 3.734002702685816e-06, "loss": 0.6051, "step": 5725 }, { "epoch": 0.2729721354849474, "grad_norm": 1.6434996128082275, "learning_rate": 3.730941954927335e-06, "loss": 0.5684, "step": 5726 }, { "epoch": 0.27301980788024693, "grad_norm": 1.6733932495117188, "learning_rate": 3.7278821744199524e-06, "loss": 0.9915, "step": 5727 }, { "epoch": 0.27306748027554645, "grad_norm": 2.2283565998077393, "learning_rate": 3.7248233616357633e-06, "loss": 0.7064, "step": 5728 }, { "epoch": 0.27311515267084596, "grad_norm": 1.378372073173523, "learning_rate": 3.7217655170467035e-06, "loss": 0.8543, "step": 5729 }, { "epoch": 0.2731628250661455, "grad_norm": 1.9921668767929077, "learning_rate": 3.7187086411245723e-06, "loss": 0.421, "step": 5730 }, { "epoch": 0.27321049746144493, "grad_norm": 1.7613787651062012, "learning_rate": 3.715652734341015e-06, "loss": 0.8166, "step": 5731 }, { "epoch": 0.27325816985674445, "grad_norm": 0.9894343018531799, "learning_rate": 3.7125977971675264e-06, "loss": 0.3724, "step": 5732 }, { "epoch": 0.27330584225204396, "grad_norm": 1.6460390090942383, "learning_rate": 3.709543830075445e-06, "loss": 0.5897, "step": 5733 }, { "epoch": 0.2733535146473435, "grad_norm": 1.3652790784835815, "learning_rate": 3.7064908335359716e-06, "loss": 0.2567, "step": 5734 }, { "epoch": 0.27340118704264293, "grad_norm": 1.6392582654953003, "learning_rate": 3.7034388080201557e-06, "loss": 1.0321, "step": 5735 }, { "epoch": 0.27344885943794245, "grad_norm": 2.154822826385498, "learning_rate": 3.7003877539988866e-06, "loss": 0.4373, "step": 5736 }, { "epoch": 0.27349653183324196, "grad_norm": 1.1326566934585571, "learning_rate": 3.6973376719429134e-06, "loss": 0.6395, "step": 5737 }, { "epoch": 0.2735442042285415, "grad_norm": 1.2641526460647583, "learning_rate": 3.6942885623228353e-06, "loss": 0.6231, "step": 5738 }, { "epoch": 0.273591876623841, "grad_norm": 1.6372159719467163, "learning_rate": 3.691240425609093e-06, "loss": 0.6248, "step": 5739 }, { "epoch": 0.27363954901914045, "grad_norm": 3.5587074756622314, "learning_rate": 3.6881932622719853e-06, "loss": 1.3179, "step": 5740 }, { "epoch": 0.27368722141443996, "grad_norm": 2.114997148513794, "learning_rate": 3.6851470727816617e-06, "loss": 0.7741, "step": 5741 }, { "epoch": 0.2737348938097395, "grad_norm": 2.527776002883911, "learning_rate": 3.6821018576081114e-06, "loss": 0.7492, "step": 5742 }, { "epoch": 0.273782566205039, "grad_norm": 2.212334394454956, "learning_rate": 3.679057617221181e-06, "loss": 0.7236, "step": 5743 }, { "epoch": 0.27383023860033845, "grad_norm": 6.127382755279541, "learning_rate": 3.6760143520905724e-06, "loss": 0.5891, "step": 5744 }, { "epoch": 0.27387791099563796, "grad_norm": 1.1842920780181885, "learning_rate": 3.6729720626858213e-06, "loss": 0.8814, "step": 5745 }, { "epoch": 0.2739255833909375, "grad_norm": 1.8540352582931519, "learning_rate": 3.669930749476327e-06, "loss": 0.424, "step": 5746 }, { "epoch": 0.273973255786237, "grad_norm": 2.6331565380096436, "learning_rate": 3.666890412931332e-06, "loss": 0.7251, "step": 5747 }, { "epoch": 0.2740209281815365, "grad_norm": 5.96897029876709, "learning_rate": 3.6638510535199245e-06, "loss": 0.3368, "step": 5748 }, { "epoch": 0.27406860057683596, "grad_norm": 1.5467431545257568, "learning_rate": 3.660812671711049e-06, "loss": 0.4915, "step": 5749 }, { "epoch": 0.2741162729721355, "grad_norm": 1.412362813949585, "learning_rate": 3.6577752679735023e-06, "loss": 0.774, "step": 5750 }, { "epoch": 0.274163945367435, "grad_norm": 1.6401735544204712, "learning_rate": 3.6547388427759144e-06, "loss": 0.5427, "step": 5751 }, { "epoch": 0.2742116177627345, "grad_norm": 2.3528993129730225, "learning_rate": 3.651703396586781e-06, "loss": 0.6584, "step": 5752 }, { "epoch": 0.27425929015803396, "grad_norm": 1.5741047859191895, "learning_rate": 3.6486689298744406e-06, "loss": 0.9682, "step": 5753 }, { "epoch": 0.2743069625533335, "grad_norm": 1.327043056488037, "learning_rate": 3.645635443107076e-06, "loss": 0.7082, "step": 5754 }, { "epoch": 0.274354634948633, "grad_norm": 2.1352598667144775, "learning_rate": 3.642602936752724e-06, "loss": 0.778, "step": 5755 }, { "epoch": 0.2744023073439325, "grad_norm": 1.15536630153656, "learning_rate": 3.6395714112792744e-06, "loss": 0.8755, "step": 5756 }, { "epoch": 0.274449979739232, "grad_norm": 1.1322435140609741, "learning_rate": 3.6365408671544534e-06, "loss": 0.7989, "step": 5757 }, { "epoch": 0.2744976521345315, "grad_norm": 0.9391312599182129, "learning_rate": 3.633511304845845e-06, "loss": 0.5099, "step": 5758 }, { "epoch": 0.274545324529831, "grad_norm": 2.4861743450164795, "learning_rate": 3.630482724820884e-06, "loss": 1.2608, "step": 5759 }, { "epoch": 0.2745929969251305, "grad_norm": 2.128649950027466, "learning_rate": 3.627455127546842e-06, "loss": 0.8045, "step": 5760 }, { "epoch": 0.27464066932043, "grad_norm": 2.0263898372650146, "learning_rate": 3.6244285134908517e-06, "loss": 1.14, "step": 5761 }, { "epoch": 0.27468834171572953, "grad_norm": 1.3106884956359863, "learning_rate": 3.6214028831198833e-06, "loss": 0.7058, "step": 5762 }, { "epoch": 0.274736014111029, "grad_norm": 1.9334492683410645, "learning_rate": 3.618378236900767e-06, "loss": 0.8704, "step": 5763 }, { "epoch": 0.2747836865063285, "grad_norm": 1.6236391067504883, "learning_rate": 3.6153545753001663e-06, "loss": 0.4789, "step": 5764 }, { "epoch": 0.274831358901628, "grad_norm": 1.925201654434204, "learning_rate": 3.612331898784609e-06, "loss": 0.7876, "step": 5765 }, { "epoch": 0.27487903129692753, "grad_norm": 2.073216438293457, "learning_rate": 3.6093102078204566e-06, "loss": 0.8201, "step": 5766 }, { "epoch": 0.274926703692227, "grad_norm": 2.232060194015503, "learning_rate": 3.6062895028739287e-06, "loss": 0.9654, "step": 5767 }, { "epoch": 0.2749743760875265, "grad_norm": 8.443530082702637, "learning_rate": 3.6032697844110896e-06, "loss": 1.9204, "step": 5768 }, { "epoch": 0.275022048482826, "grad_norm": 2.865467071533203, "learning_rate": 3.6002510528978473e-06, "loss": 1.1493, "step": 5769 }, { "epoch": 0.27506972087812553, "grad_norm": 1.9604463577270508, "learning_rate": 3.5972333087999622e-06, "loss": 0.5984, "step": 5770 }, { "epoch": 0.27511739327342505, "grad_norm": 0.8359993696212769, "learning_rate": 3.594216552583045e-06, "loss": 0.3696, "step": 5771 }, { "epoch": 0.2751650656687245, "grad_norm": 2.306946277618408, "learning_rate": 3.591200784712543e-06, "loss": 0.7726, "step": 5772 }, { "epoch": 0.275212738064024, "grad_norm": 1.7575749158859253, "learning_rate": 3.588186005653763e-06, "loss": 0.5739, "step": 5773 }, { "epoch": 0.27526041045932353, "grad_norm": 2.016315221786499, "learning_rate": 3.5851722158718537e-06, "loss": 0.8791, "step": 5774 }, { "epoch": 0.27530808285462305, "grad_norm": 1.2995957136154175, "learning_rate": 3.582159415831814e-06, "loss": 0.5061, "step": 5775 }, { "epoch": 0.2753557552499225, "grad_norm": 2.3488354682922363, "learning_rate": 3.5791476059984866e-06, "loss": 0.6248, "step": 5776 }, { "epoch": 0.275403427645222, "grad_norm": 1.2884950637817383, "learning_rate": 3.576136786836557e-06, "loss": 0.845, "step": 5777 }, { "epoch": 0.27545110004052153, "grad_norm": 1.571522831916809, "learning_rate": 3.5731269588105723e-06, "loss": 0.5894, "step": 5778 }, { "epoch": 0.27549877243582105, "grad_norm": 2.178201913833618, "learning_rate": 3.57011812238491e-06, "loss": 0.5167, "step": 5779 }, { "epoch": 0.27554644483112056, "grad_norm": 1.9311840534210205, "learning_rate": 3.5671102780238066e-06, "loss": 0.4011, "step": 5780 }, { "epoch": 0.27559411722642, "grad_norm": 1.3858994245529175, "learning_rate": 3.5641034261913454e-06, "loss": 0.3302, "step": 5781 }, { "epoch": 0.27564178962171954, "grad_norm": 1.957581877708435, "learning_rate": 3.561097567351445e-06, "loss": 1.0391, "step": 5782 }, { "epoch": 0.27568946201701905, "grad_norm": 1.8633755445480347, "learning_rate": 3.5580927019678812e-06, "loss": 0.2314, "step": 5783 }, { "epoch": 0.27573713441231856, "grad_norm": 1.5867997407913208, "learning_rate": 3.5550888305042785e-06, "loss": 0.6767, "step": 5784 }, { "epoch": 0.275784806807618, "grad_norm": 2.5515241622924805, "learning_rate": 3.552085953424096e-06, "loss": 0.8169, "step": 5785 }, { "epoch": 0.27583247920291754, "grad_norm": 3.5643978118896484, "learning_rate": 3.5490840711906506e-06, "loss": 1.3515, "step": 5786 }, { "epoch": 0.27588015159821705, "grad_norm": 2.232090711593628, "learning_rate": 3.546083184267105e-06, "loss": 0.6899, "step": 5787 }, { "epoch": 0.27592782399351656, "grad_norm": 1.28727126121521, "learning_rate": 3.5430832931164584e-06, "loss": 0.6688, "step": 5788 }, { "epoch": 0.2759754963888161, "grad_norm": 1.2497484683990479, "learning_rate": 3.540084398201565e-06, "loss": 0.6052, "step": 5789 }, { "epoch": 0.27602316878411554, "grad_norm": 1.0611144304275513, "learning_rate": 3.5370864999851296e-06, "loss": 0.3656, "step": 5790 }, { "epoch": 0.27607084117941505, "grad_norm": 1.406622052192688, "learning_rate": 3.534089598929691e-06, "loss": 0.8445, "step": 5791 }, { "epoch": 0.27611851357471456, "grad_norm": 1.173233151435852, "learning_rate": 3.5310936954976383e-06, "loss": 0.6267, "step": 5792 }, { "epoch": 0.2761661859700141, "grad_norm": 1.7871273756027222, "learning_rate": 3.5280987901512142e-06, "loss": 0.3913, "step": 5793 }, { "epoch": 0.2762138583653136, "grad_norm": 1.7539689540863037, "learning_rate": 3.525104883352497e-06, "loss": 0.9235, "step": 5794 }, { "epoch": 0.27626153076061305, "grad_norm": 1.887739658355713, "learning_rate": 3.522111975563417e-06, "loss": 0.4819, "step": 5795 }, { "epoch": 0.27630920315591256, "grad_norm": 1.73068368434906, "learning_rate": 3.519120067245754e-06, "loss": 0.6554, "step": 5796 }, { "epoch": 0.2763568755512121, "grad_norm": 1.55825674533844, "learning_rate": 3.51612915886112e-06, "loss": 0.5789, "step": 5797 }, { "epoch": 0.2764045479465116, "grad_norm": 2.273411512374878, "learning_rate": 3.513139250870986e-06, "loss": 0.4424, "step": 5798 }, { "epoch": 0.27645222034181105, "grad_norm": 5.278494834899902, "learning_rate": 3.5101503437366678e-06, "loss": 0.7936, "step": 5799 }, { "epoch": 0.27649989273711056, "grad_norm": 4.1573805809021, "learning_rate": 3.507162437919316e-06, "loss": 0.587, "step": 5800 }, { "epoch": 0.2765475651324101, "grad_norm": 1.4418230056762695, "learning_rate": 3.5041755338799354e-06, "loss": 0.891, "step": 5801 }, { "epoch": 0.2765952375277096, "grad_norm": 1.2996065616607666, "learning_rate": 3.5011896320793802e-06, "loss": 0.3958, "step": 5802 }, { "epoch": 0.2766429099230091, "grad_norm": 3.44665265083313, "learning_rate": 3.4982047329783362e-06, "loss": 0.5077, "step": 5803 }, { "epoch": 0.27669058231830856, "grad_norm": 1.3354766368865967, "learning_rate": 3.4952208370373475e-06, "loss": 0.9142, "step": 5804 }, { "epoch": 0.2767382547136081, "grad_norm": 2.007514715194702, "learning_rate": 3.4922379447167997e-06, "loss": 1.032, "step": 5805 }, { "epoch": 0.2767859271089076, "grad_norm": 1.655014157295227, "learning_rate": 3.4892560564769164e-06, "loss": 0.3604, "step": 5806 }, { "epoch": 0.2768335995042071, "grad_norm": 2.406719923019409, "learning_rate": 3.48627517277778e-06, "loss": 0.4553, "step": 5807 }, { "epoch": 0.27688127189950656, "grad_norm": 4.124279499053955, "learning_rate": 3.4832952940793054e-06, "loss": 0.5604, "step": 5808 }, { "epoch": 0.2769289442948061, "grad_norm": 1.866684913635254, "learning_rate": 3.4803164208412543e-06, "loss": 1.1109, "step": 5809 }, { "epoch": 0.2769766166901056, "grad_norm": 2.047614812850952, "learning_rate": 3.4773385535232408e-06, "loss": 0.9086, "step": 5810 }, { "epoch": 0.2770242890854051, "grad_norm": 2.289114236831665, "learning_rate": 3.4743616925847167e-06, "loss": 0.7814, "step": 5811 }, { "epoch": 0.2770719614807046, "grad_norm": 1.2168200016021729, "learning_rate": 3.4713858384849873e-06, "loss": 0.5156, "step": 5812 }, { "epoch": 0.2771196338760041, "grad_norm": 1.6633448600769043, "learning_rate": 3.4684109916831866e-06, "loss": 0.5143, "step": 5813 }, { "epoch": 0.2771673062713036, "grad_norm": 1.4360078573226929, "learning_rate": 3.465437152638308e-06, "loss": 1.0772, "step": 5814 }, { "epoch": 0.2772149786666031, "grad_norm": 2.982320785522461, "learning_rate": 3.462464321809188e-06, "loss": 1.3989, "step": 5815 }, { "epoch": 0.2772626510619026, "grad_norm": 2.3763790130615234, "learning_rate": 3.4594924996544952e-06, "loss": 0.9511, "step": 5816 }, { "epoch": 0.27731032345720213, "grad_norm": 1.4426672458648682, "learning_rate": 3.4565216866327556e-06, "loss": 0.8974, "step": 5817 }, { "epoch": 0.2773579958525016, "grad_norm": 2.0337131023406982, "learning_rate": 3.4535518832023383e-06, "loss": 0.4508, "step": 5818 }, { "epoch": 0.2774056682478011, "grad_norm": 7.215719699859619, "learning_rate": 3.4505830898214466e-06, "loss": 1.5514, "step": 5819 }, { "epoch": 0.2774533406431006, "grad_norm": 2.007924795150757, "learning_rate": 3.447615306948142e-06, "loss": 1.0114, "step": 5820 }, { "epoch": 0.27750101303840014, "grad_norm": 5.832476615905762, "learning_rate": 3.4446485350403145e-06, "loss": 1.5196, "step": 5821 }, { "epoch": 0.2775486854336996, "grad_norm": 3.1122994422912598, "learning_rate": 3.441682774555716e-06, "loss": 0.4405, "step": 5822 }, { "epoch": 0.2775963578289991, "grad_norm": 2.6448862552642822, "learning_rate": 3.438718025951924e-06, "loss": 0.2901, "step": 5823 }, { "epoch": 0.2776440302242986, "grad_norm": 1.5366235971450806, "learning_rate": 3.435754289686375e-06, "loss": 0.678, "step": 5824 }, { "epoch": 0.27769170261959814, "grad_norm": 1.1536791324615479, "learning_rate": 3.432791566216338e-06, "loss": 0.3196, "step": 5825 }, { "epoch": 0.27773937501489765, "grad_norm": 2.690739631652832, "learning_rate": 3.429829855998933e-06, "loss": 1.1102, "step": 5826 }, { "epoch": 0.2777870474101971, "grad_norm": 2.7489328384399414, "learning_rate": 3.426869159491124e-06, "loss": 1.1726, "step": 5827 }, { "epoch": 0.2778347198054966, "grad_norm": 1.2362381219863892, "learning_rate": 3.4239094771497104e-06, "loss": 0.2277, "step": 5828 }, { "epoch": 0.27788239220079614, "grad_norm": 1.2275652885437012, "learning_rate": 3.420950809431345e-06, "loss": 0.6145, "step": 5829 }, { "epoch": 0.27793006459609565, "grad_norm": 2.2532424926757812, "learning_rate": 3.4179931567925216e-06, "loss": 0.7512, "step": 5830 }, { "epoch": 0.2779777369913951, "grad_norm": 1.3912450075149536, "learning_rate": 3.4150365196895686e-06, "loss": 0.6163, "step": 5831 }, { "epoch": 0.2780254093866946, "grad_norm": 0.9267414808273315, "learning_rate": 3.412080898578669e-06, "loss": 0.391, "step": 5832 }, { "epoch": 0.27807308178199414, "grad_norm": 1.1409859657287598, "learning_rate": 3.4091262939158477e-06, "loss": 0.6547, "step": 5833 }, { "epoch": 0.27812075417729365, "grad_norm": 4.11335563659668, "learning_rate": 3.406172706156963e-06, "loss": 0.4091, "step": 5834 }, { "epoch": 0.27816842657259316, "grad_norm": 2.016162633895874, "learning_rate": 3.4032201357577287e-06, "loss": 0.5227, "step": 5835 }, { "epoch": 0.2782160989678926, "grad_norm": 0.9952210187911987, "learning_rate": 3.4002685831736917e-06, "loss": 0.226, "step": 5836 }, { "epoch": 0.27826377136319214, "grad_norm": 1.124870777130127, "learning_rate": 3.3973180488602508e-06, "loss": 0.4222, "step": 5837 }, { "epoch": 0.27831144375849165, "grad_norm": 1.807302474975586, "learning_rate": 3.3943685332726385e-06, "loss": 0.8145, "step": 5838 }, { "epoch": 0.27835911615379116, "grad_norm": 1.9683709144592285, "learning_rate": 3.391420036865939e-06, "loss": 1.0791, "step": 5839 }, { "epoch": 0.2784067885490906, "grad_norm": 2.589303493499756, "learning_rate": 3.3884725600950687e-06, "loss": 1.1757, "step": 5840 }, { "epoch": 0.27845446094439014, "grad_norm": 1.9659584760665894, "learning_rate": 3.385526103414798e-06, "loss": 0.8821, "step": 5841 }, { "epoch": 0.27850213333968965, "grad_norm": 2.3086142539978027, "learning_rate": 3.3825806672797355e-06, "loss": 0.5679, "step": 5842 }, { "epoch": 0.27854980573498916, "grad_norm": 1.6591477394104004, "learning_rate": 3.379636252144328e-06, "loss": 0.7197, "step": 5843 }, { "epoch": 0.2785974781302887, "grad_norm": 2.003530263900757, "learning_rate": 3.37669285846287e-06, "loss": 0.984, "step": 5844 }, { "epoch": 0.27864515052558814, "grad_norm": 1.525742530822754, "learning_rate": 3.3737504866895e-06, "loss": 0.644, "step": 5845 }, { "epoch": 0.27869282292088765, "grad_norm": 1.3682410717010498, "learning_rate": 3.3708091372781893e-06, "loss": 0.7348, "step": 5846 }, { "epoch": 0.27874049531618716, "grad_norm": 1.2351282835006714, "learning_rate": 3.3678688106827616e-06, "loss": 0.6791, "step": 5847 }, { "epoch": 0.2787881677114867, "grad_norm": 2.5197176933288574, "learning_rate": 3.364929507356881e-06, "loss": 0.8547, "step": 5848 }, { "epoch": 0.2788358401067862, "grad_norm": 1.4743976593017578, "learning_rate": 3.361991227754048e-06, "loss": 0.7519, "step": 5849 }, { "epoch": 0.27888351250208565, "grad_norm": 0.8373667001724243, "learning_rate": 3.3590539723276083e-06, "loss": 0.632, "step": 5850 }, { "epoch": 0.27893118489738516, "grad_norm": 1.3392571210861206, "learning_rate": 3.3561177415307566e-06, "loss": 0.7121, "step": 5851 }, { "epoch": 0.2789788572926847, "grad_norm": 1.9579188823699951, "learning_rate": 3.3531825358165184e-06, "loss": 0.8392, "step": 5852 }, { "epoch": 0.2790265296879842, "grad_norm": 1.713509440422058, "learning_rate": 3.3502483556377628e-06, "loss": 0.4693, "step": 5853 }, { "epoch": 0.27907420208328365, "grad_norm": 4.4942097663879395, "learning_rate": 3.3473152014472064e-06, "loss": 0.6598, "step": 5854 }, { "epoch": 0.27912187447858317, "grad_norm": 1.400954008102417, "learning_rate": 3.344383073697408e-06, "loss": 0.5549, "step": 5855 }, { "epoch": 0.2791695468738827, "grad_norm": 2.1655681133270264, "learning_rate": 3.341451972840759e-06, "loss": 0.1788, "step": 5856 }, { "epoch": 0.2792172192691822, "grad_norm": 1.1546815633773804, "learning_rate": 3.338521899329501e-06, "loss": 0.3929, "step": 5857 }, { "epoch": 0.2792648916644817, "grad_norm": 1.9194657802581787, "learning_rate": 3.335592853615717e-06, "loss": 0.6697, "step": 5858 }, { "epoch": 0.27931256405978117, "grad_norm": 1.1898773908615112, "learning_rate": 3.3326648361513227e-06, "loss": 0.6485, "step": 5859 }, { "epoch": 0.2793602364550807, "grad_norm": 3.210023880004883, "learning_rate": 3.3297378473880836e-06, "loss": 1.209, "step": 5860 }, { "epoch": 0.2794079088503802, "grad_norm": 1.895900011062622, "learning_rate": 3.326811887777607e-06, "loss": 0.8399, "step": 5861 }, { "epoch": 0.2794555812456797, "grad_norm": 1.4736049175262451, "learning_rate": 3.323886957771333e-06, "loss": 0.9595, "step": 5862 }, { "epoch": 0.27950325364097917, "grad_norm": 2.3377034664154053, "learning_rate": 3.32096305782055e-06, "loss": 0.2446, "step": 5863 }, { "epoch": 0.2795509260362787, "grad_norm": 1.1960664987564087, "learning_rate": 3.31804018837639e-06, "loss": 0.81, "step": 5864 }, { "epoch": 0.2795985984315782, "grad_norm": 1.0660697221755981, "learning_rate": 3.3151183498898155e-06, "loss": 0.1664, "step": 5865 }, { "epoch": 0.2796462708268777, "grad_norm": 1.5719130039215088, "learning_rate": 3.3121975428116414e-06, "loss": 0.7434, "step": 5866 }, { "epoch": 0.2796939432221772, "grad_norm": 2.6992483139038086, "learning_rate": 3.3092777675925145e-06, "loss": 0.8804, "step": 5867 }, { "epoch": 0.2797416156174767, "grad_norm": 2.4621758460998535, "learning_rate": 3.306359024682925e-06, "loss": 0.7341, "step": 5868 }, { "epoch": 0.2797892880127762, "grad_norm": 1.3564366102218628, "learning_rate": 3.3034413145332065e-06, "loss": 0.6703, "step": 5869 }, { "epoch": 0.2798369604080757, "grad_norm": 1.1252259016036987, "learning_rate": 3.300524637593535e-06, "loss": 0.4572, "step": 5870 }, { "epoch": 0.2798846328033752, "grad_norm": 1.4198715686798096, "learning_rate": 3.297608994313918e-06, "loss": 0.5901, "step": 5871 }, { "epoch": 0.2799323051986747, "grad_norm": 1.6074761152267456, "learning_rate": 3.29469438514421e-06, "loss": 0.8219, "step": 5872 }, { "epoch": 0.2799799775939742, "grad_norm": 1.1399770975112915, "learning_rate": 3.291780810534112e-06, "loss": 0.2135, "step": 5873 }, { "epoch": 0.2800276499892737, "grad_norm": 1.8449188470840454, "learning_rate": 3.288868270933151e-06, "loss": 0.8644, "step": 5874 }, { "epoch": 0.2800753223845732, "grad_norm": 3.027866840362549, "learning_rate": 3.285956766790703e-06, "loss": 1.3406, "step": 5875 }, { "epoch": 0.28012299477987274, "grad_norm": 1.4485067129135132, "learning_rate": 3.2830462985559884e-06, "loss": 0.8413, "step": 5876 }, { "epoch": 0.2801706671751722, "grad_norm": 2.0163278579711914, "learning_rate": 3.2801368666780552e-06, "loss": 0.7529, "step": 5877 }, { "epoch": 0.2802183395704717, "grad_norm": 2.1220314502716064, "learning_rate": 3.2772284716058032e-06, "loss": 0.8258, "step": 5878 }, { "epoch": 0.2802660119657712, "grad_norm": 1.9134745597839355, "learning_rate": 3.2743211137879693e-06, "loss": 0.6428, "step": 5879 }, { "epoch": 0.28031368436107074, "grad_norm": 2.6584293842315674, "learning_rate": 3.2714147936731234e-06, "loss": 0.9394, "step": 5880 }, { "epoch": 0.28036135675637025, "grad_norm": 1.4212067127227783, "learning_rate": 3.268509511709688e-06, "loss": 0.7571, "step": 5881 }, { "epoch": 0.2804090291516697, "grad_norm": 1.792913556098938, "learning_rate": 3.2656052683459094e-06, "loss": 0.6554, "step": 5882 }, { "epoch": 0.2804567015469692, "grad_norm": 2.194751501083374, "learning_rate": 3.26270206402989e-06, "loss": 0.7524, "step": 5883 }, { "epoch": 0.28050437394226874, "grad_norm": 2.3831229209899902, "learning_rate": 3.259799899209559e-06, "loss": 0.8005, "step": 5884 }, { "epoch": 0.28055204633756825, "grad_norm": 1.1160222291946411, "learning_rate": 3.2568987743326964e-06, "loss": 0.6014, "step": 5885 }, { "epoch": 0.2805997187328677, "grad_norm": 1.176783561706543, "learning_rate": 3.2539986898469088e-06, "loss": 0.3688, "step": 5886 }, { "epoch": 0.2806473911281672, "grad_norm": 1.5147234201431274, "learning_rate": 3.2510996461996523e-06, "loss": 0.9379, "step": 5887 }, { "epoch": 0.28069506352346674, "grad_norm": 2.1405818462371826, "learning_rate": 3.2482016438382215e-06, "loss": 0.7928, "step": 5888 }, { "epoch": 0.28074273591876625, "grad_norm": 1.048101782798767, "learning_rate": 3.245304683209749e-06, "loss": 0.4306, "step": 5889 }, { "epoch": 0.28079040831406576, "grad_norm": 4.601113796234131, "learning_rate": 3.242408764761201e-06, "loss": 0.4628, "step": 5890 }, { "epoch": 0.2808380807093652, "grad_norm": 3.8754489421844482, "learning_rate": 3.2395138889393918e-06, "loss": 0.475, "step": 5891 }, { "epoch": 0.28088575310466474, "grad_norm": 1.3963401317596436, "learning_rate": 3.236620056190972e-06, "loss": 0.3811, "step": 5892 }, { "epoch": 0.28093342549996425, "grad_norm": 1.5309783220291138, "learning_rate": 3.233727266962425e-06, "loss": 0.5391, "step": 5893 }, { "epoch": 0.28098109789526377, "grad_norm": 2.2857825756073, "learning_rate": 3.230835521700083e-06, "loss": 0.7422, "step": 5894 }, { "epoch": 0.2810287702905632, "grad_norm": 1.5979682207107544, "learning_rate": 3.2279448208501128e-06, "loss": 0.8812, "step": 5895 }, { "epoch": 0.28107644268586274, "grad_norm": 1.2685645818710327, "learning_rate": 3.2250551648585194e-06, "loss": 0.4423, "step": 5896 }, { "epoch": 0.28112411508116225, "grad_norm": 1.7394298315048218, "learning_rate": 3.222166554171141e-06, "loss": 0.8087, "step": 5897 }, { "epoch": 0.28117178747646177, "grad_norm": 1.5419667959213257, "learning_rate": 3.2192789892336694e-06, "loss": 0.9385, "step": 5898 }, { "epoch": 0.2812194598717613, "grad_norm": 1.1576173305511475, "learning_rate": 3.216392470491618e-06, "loss": 0.6088, "step": 5899 }, { "epoch": 0.28126713226706074, "grad_norm": 5.5643768310546875, "learning_rate": 3.213506998390351e-06, "loss": 1.1993, "step": 5900 }, { "epoch": 0.28131480466236025, "grad_norm": 0.9182067513465881, "learning_rate": 3.2106225733750707e-06, "loss": 0.4358, "step": 5901 }, { "epoch": 0.28136247705765977, "grad_norm": 1.6869710683822632, "learning_rate": 3.2077391958908065e-06, "loss": 1.005, "step": 5902 }, { "epoch": 0.2814101494529593, "grad_norm": 2.2454309463500977, "learning_rate": 3.2048568663824375e-06, "loss": 0.6729, "step": 5903 }, { "epoch": 0.2814578218482588, "grad_norm": 1.4682514667510986, "learning_rate": 3.20197558529468e-06, "loss": 0.6519, "step": 5904 }, { "epoch": 0.28150549424355825, "grad_norm": 1.3484890460968018, "learning_rate": 3.199095353072081e-06, "loss": 0.9018, "step": 5905 }, { "epoch": 0.28155316663885777, "grad_norm": 1.8326994180679321, "learning_rate": 3.1962161701590342e-06, "loss": 1.1445, "step": 5906 }, { "epoch": 0.2816008390341573, "grad_norm": 1.8259936571121216, "learning_rate": 3.193338036999769e-06, "loss": 0.577, "step": 5907 }, { "epoch": 0.2816485114294568, "grad_norm": 1.3793479204177856, "learning_rate": 3.1904609540383467e-06, "loss": 0.5268, "step": 5908 }, { "epoch": 0.28169618382475625, "grad_norm": 7.305703639984131, "learning_rate": 3.187584921718675e-06, "loss": 0.8085, "step": 5909 }, { "epoch": 0.28174385622005577, "grad_norm": 1.557979941368103, "learning_rate": 3.1847099404844984e-06, "loss": 0.8441, "step": 5910 }, { "epoch": 0.2817915286153553, "grad_norm": 4.730111598968506, "learning_rate": 3.1818360107793933e-06, "loss": 0.3377, "step": 5911 }, { "epoch": 0.2818392010106548, "grad_norm": 1.458540439605713, "learning_rate": 3.178963133046776e-06, "loss": 0.7024, "step": 5912 }, { "epoch": 0.2818868734059543, "grad_norm": 1.733161449432373, "learning_rate": 3.1760913077299072e-06, "loss": 0.7407, "step": 5913 }, { "epoch": 0.28193454580125377, "grad_norm": 1.6189063787460327, "learning_rate": 3.173220535271874e-06, "loss": 0.9442, "step": 5914 }, { "epoch": 0.2819822181965533, "grad_norm": 1.5450819730758667, "learning_rate": 3.1703508161156095e-06, "loss": 0.4844, "step": 5915 }, { "epoch": 0.2820298905918528, "grad_norm": 1.3176461458206177, "learning_rate": 3.1674821507038857e-06, "loss": 0.7086, "step": 5916 }, { "epoch": 0.2820775629871523, "grad_norm": 1.8015121221542358, "learning_rate": 3.1646145394793017e-06, "loss": 0.7689, "step": 5917 }, { "epoch": 0.28212523538245177, "grad_norm": 1.266664743423462, "learning_rate": 3.1617479828843023e-06, "loss": 0.858, "step": 5918 }, { "epoch": 0.2821729077777513, "grad_norm": 2.7740542888641357, "learning_rate": 3.158882481361173e-06, "loss": 0.7104, "step": 5919 }, { "epoch": 0.2822205801730508, "grad_norm": 2.237379789352417, "learning_rate": 3.156018035352024e-06, "loss": 1.5049, "step": 5920 }, { "epoch": 0.2822682525683503, "grad_norm": 1.1840728521347046, "learning_rate": 3.1531546452988127e-06, "loss": 0.8268, "step": 5921 }, { "epoch": 0.2823159249636498, "grad_norm": 2.702145576477051, "learning_rate": 3.1502923116433324e-06, "loss": 0.9402, "step": 5922 }, { "epoch": 0.2823635973589493, "grad_norm": 1.544381856918335, "learning_rate": 3.1474310348272084e-06, "loss": 0.9294, "step": 5923 }, { "epoch": 0.2824112697542488, "grad_norm": 1.5606869459152222, "learning_rate": 3.1445708152919075e-06, "loss": 0.9205, "step": 5924 }, { "epoch": 0.2824589421495483, "grad_norm": 1.5174667835235596, "learning_rate": 3.141711653478736e-06, "loss": 0.9588, "step": 5925 }, { "epoch": 0.2825066145448478, "grad_norm": 1.5139796733856201, "learning_rate": 3.1388535498288265e-06, "loss": 0.8664, "step": 5926 }, { "epoch": 0.2825542869401473, "grad_norm": 1.7629603147506714, "learning_rate": 3.135996504783161e-06, "loss": 0.5387, "step": 5927 }, { "epoch": 0.2826019593354468, "grad_norm": 3.60149884223938, "learning_rate": 3.1331405187825457e-06, "loss": 1.7011, "step": 5928 }, { "epoch": 0.2826496317307463, "grad_norm": 4.884571075439453, "learning_rate": 3.130285592267638e-06, "loss": 0.5431, "step": 5929 }, { "epoch": 0.2826973041260458, "grad_norm": 2.810913324356079, "learning_rate": 3.1274317256789144e-06, "loss": 0.7907, "step": 5930 }, { "epoch": 0.28274497652134534, "grad_norm": 2.3158459663391113, "learning_rate": 3.1245789194567024e-06, "loss": 0.8285, "step": 5931 }, { "epoch": 0.2827926489166448, "grad_norm": 3.1197385787963867, "learning_rate": 3.1217271740411626e-06, "loss": 0.9111, "step": 5932 }, { "epoch": 0.2828403213119443, "grad_norm": 1.5556949377059937, "learning_rate": 3.1188764898722843e-06, "loss": 0.4844, "step": 5933 }, { "epoch": 0.2828879937072438, "grad_norm": 1.8289040327072144, "learning_rate": 3.116026867389903e-06, "loss": 0.7285, "step": 5934 }, { "epoch": 0.28293566610254334, "grad_norm": 0.6461400389671326, "learning_rate": 3.1131783070336872e-06, "loss": 0.288, "step": 5935 }, { "epoch": 0.28298333849784285, "grad_norm": 1.816648244857788, "learning_rate": 3.110330809243134e-06, "loss": 0.9826, "step": 5936 }, { "epoch": 0.2830310108931423, "grad_norm": 1.8873894214630127, "learning_rate": 3.1074843744575877e-06, "loss": 0.7973, "step": 5937 }, { "epoch": 0.2830786832884418, "grad_norm": 1.235554814338684, "learning_rate": 3.1046390031162265e-06, "loss": 0.5818, "step": 5938 }, { "epoch": 0.28312635568374134, "grad_norm": 1.8525989055633545, "learning_rate": 3.1017946956580557e-06, "loss": 0.4799, "step": 5939 }, { "epoch": 0.28317402807904085, "grad_norm": 3.0708417892456055, "learning_rate": 3.098951452521929e-06, "loss": 0.5568, "step": 5940 }, { "epoch": 0.2832217004743403, "grad_norm": 2.0866968631744385, "learning_rate": 3.0961092741465226e-06, "loss": 0.5011, "step": 5941 }, { "epoch": 0.2832693728696398, "grad_norm": 2.3611583709716797, "learning_rate": 3.093268160970362e-06, "loss": 0.5536, "step": 5942 }, { "epoch": 0.28331704526493934, "grad_norm": 1.2229896783828735, "learning_rate": 3.090428113431795e-06, "loss": 0.5604, "step": 5943 }, { "epoch": 0.28336471766023885, "grad_norm": 1.3708198070526123, "learning_rate": 3.0875891319690188e-06, "loss": 0.907, "step": 5944 }, { "epoch": 0.28341239005553837, "grad_norm": 2.049001932144165, "learning_rate": 3.0847512170200523e-06, "loss": 0.6424, "step": 5945 }, { "epoch": 0.2834600624508378, "grad_norm": 3.2906229496002197, "learning_rate": 3.0819143690227602e-06, "loss": 0.6675, "step": 5946 }, { "epoch": 0.28350773484613734, "grad_norm": 1.4666670560836792, "learning_rate": 3.0790785884148413e-06, "loss": 0.5831, "step": 5947 }, { "epoch": 0.28355540724143685, "grad_norm": 2.1309757232666016, "learning_rate": 3.0762438756338207e-06, "loss": 0.7127, "step": 5948 }, { "epoch": 0.28360307963673637, "grad_norm": 3.729311466217041, "learning_rate": 3.0734102311170697e-06, "loss": 0.2881, "step": 5949 }, { "epoch": 0.2836507520320358, "grad_norm": 1.0012531280517578, "learning_rate": 3.070577655301793e-06, "loss": 0.769, "step": 5950 }, { "epoch": 0.28369842442733534, "grad_norm": 1.3662431240081787, "learning_rate": 3.0677461486250226e-06, "loss": 0.7394, "step": 5951 }, { "epoch": 0.28374609682263485, "grad_norm": 2.267841339111328, "learning_rate": 3.0649157115236315e-06, "loss": 0.7723, "step": 5952 }, { "epoch": 0.28379376921793437, "grad_norm": 3.1676759719848633, "learning_rate": 3.062086344434333e-06, "loss": 0.7729, "step": 5953 }, { "epoch": 0.2838414416132339, "grad_norm": 1.3230173587799072, "learning_rate": 3.0592580477936606e-06, "loss": 0.6094, "step": 5954 }, { "epoch": 0.28388911400853334, "grad_norm": 1.7950643301010132, "learning_rate": 3.0564308220380003e-06, "loss": 0.8171, "step": 5955 }, { "epoch": 0.28393678640383285, "grad_norm": 1.1964622735977173, "learning_rate": 3.0536046676035546e-06, "loss": 0.7314, "step": 5956 }, { "epoch": 0.28398445879913237, "grad_norm": 1.3880852460861206, "learning_rate": 3.050779584926379e-06, "loss": 0.681, "step": 5957 }, { "epoch": 0.2840321311944319, "grad_norm": 1.5135201215744019, "learning_rate": 3.0479555744423463e-06, "loss": 0.5061, "step": 5958 }, { "epoch": 0.28407980358973134, "grad_norm": 1.1748199462890625, "learning_rate": 3.045132636587179e-06, "loss": 0.8499, "step": 5959 }, { "epoch": 0.28412747598503085, "grad_norm": 2.5828261375427246, "learning_rate": 3.042310771796423e-06, "loss": 0.8827, "step": 5960 }, { "epoch": 0.28417514838033037, "grad_norm": 1.8506451845169067, "learning_rate": 3.0394899805054635e-06, "loss": 1.094, "step": 5961 }, { "epoch": 0.2842228207756299, "grad_norm": 1.2826437950134277, "learning_rate": 3.0366702631495237e-06, "loss": 0.7094, "step": 5962 }, { "epoch": 0.2842704931709294, "grad_norm": 1.3929500579833984, "learning_rate": 3.0338516201636516e-06, "loss": 0.7033, "step": 5963 }, { "epoch": 0.28431816556622885, "grad_norm": 1.618619441986084, "learning_rate": 3.031034051982735e-06, "loss": 0.6553, "step": 5964 }, { "epoch": 0.28436583796152837, "grad_norm": 2.004574775695801, "learning_rate": 3.0282175590415e-06, "loss": 0.8983, "step": 5965 }, { "epoch": 0.2844135103568279, "grad_norm": 1.1381690502166748, "learning_rate": 3.0254021417745027e-06, "loss": 0.3409, "step": 5966 }, { "epoch": 0.2844611827521274, "grad_norm": 1.8637462854385376, "learning_rate": 3.022587800616127e-06, "loss": 0.8184, "step": 5967 }, { "epoch": 0.2845088551474269, "grad_norm": 2.8947017192840576, "learning_rate": 3.0197745360006004e-06, "loss": 0.9563, "step": 5968 }, { "epoch": 0.28455652754272637, "grad_norm": 2.7389771938323975, "learning_rate": 3.0169623483619824e-06, "loss": 1.2036, "step": 5969 }, { "epoch": 0.2846041999380259, "grad_norm": 1.9356426000595093, "learning_rate": 3.014151238134161e-06, "loss": 0.7647, "step": 5970 }, { "epoch": 0.2846518723333254, "grad_norm": 1.2176311016082764, "learning_rate": 3.011341205750866e-06, "loss": 0.6346, "step": 5971 }, { "epoch": 0.2846995447286249, "grad_norm": 1.3626267910003662, "learning_rate": 3.0085322516456537e-06, "loss": 0.7817, "step": 5972 }, { "epoch": 0.28474721712392437, "grad_norm": 1.1688299179077148, "learning_rate": 3.0057243762519137e-06, "loss": 0.8457, "step": 5973 }, { "epoch": 0.2847948895192239, "grad_norm": 2.106661796569824, "learning_rate": 3.002917580002875e-06, "loss": 0.4141, "step": 5974 }, { "epoch": 0.2848425619145234, "grad_norm": 1.1825395822525024, "learning_rate": 3.0001118633316018e-06, "loss": 0.6835, "step": 5975 }, { "epoch": 0.2848902343098229, "grad_norm": 1.325334906578064, "learning_rate": 2.997307226670979e-06, "loss": 0.5565, "step": 5976 }, { "epoch": 0.2849379067051224, "grad_norm": 1.230260968208313, "learning_rate": 2.9945036704537376e-06, "loss": 0.6952, "step": 5977 }, { "epoch": 0.2849855791004219, "grad_norm": 1.3571439981460571, "learning_rate": 2.991701195112441e-06, "loss": 0.3431, "step": 5978 }, { "epoch": 0.2850332514957214, "grad_norm": 1.8420449495315552, "learning_rate": 2.9888998010794745e-06, "loss": 0.7724, "step": 5979 }, { "epoch": 0.2850809238910209, "grad_norm": 1.6307415962219238, "learning_rate": 2.986099488787069e-06, "loss": 0.6341, "step": 5980 }, { "epoch": 0.2851285962863204, "grad_norm": 1.3189420700073242, "learning_rate": 2.9833002586672855e-06, "loss": 0.6408, "step": 5981 }, { "epoch": 0.2851762686816199, "grad_norm": 1.9311996698379517, "learning_rate": 2.9805021111520105e-06, "loss": 0.2892, "step": 5982 }, { "epoch": 0.2852239410769194, "grad_norm": 1.2384161949157715, "learning_rate": 2.977705046672974e-06, "loss": 0.7973, "step": 5983 }, { "epoch": 0.2852716134722189, "grad_norm": 2.065157175064087, "learning_rate": 2.9749090656617363e-06, "loss": 0.8093, "step": 5984 }, { "epoch": 0.2853192858675184, "grad_norm": 1.7547610998153687, "learning_rate": 2.9721141685496825e-06, "loss": 0.7952, "step": 5985 }, { "epoch": 0.28536695826281794, "grad_norm": 4.806229114532471, "learning_rate": 2.9693203557680415e-06, "loss": 1.0673, "step": 5986 }, { "epoch": 0.2854146306581174, "grad_norm": 2.534628391265869, "learning_rate": 2.9665276277478672e-06, "loss": 0.6868, "step": 5987 }, { "epoch": 0.2854623030534169, "grad_norm": 1.5216633081436157, "learning_rate": 2.9637359849200474e-06, "loss": 0.6742, "step": 5988 }, { "epoch": 0.2855099754487164, "grad_norm": 2.193467617034912, "learning_rate": 2.960945427715305e-06, "loss": 0.6474, "step": 5989 }, { "epoch": 0.28555764784401594, "grad_norm": 1.6416079998016357, "learning_rate": 2.9581559565641983e-06, "loss": 0.2511, "step": 5990 }, { "epoch": 0.2856053202393154, "grad_norm": 1.167321801185608, "learning_rate": 2.9553675718971065e-06, "loss": 0.7189, "step": 5991 }, { "epoch": 0.2856529926346149, "grad_norm": 1.3199819326400757, "learning_rate": 2.9525802741442532e-06, "loss": 0.8042, "step": 5992 }, { "epoch": 0.2857006650299144, "grad_norm": 1.2183806896209717, "learning_rate": 2.9497940637356924e-06, "loss": 0.8987, "step": 5993 }, { "epoch": 0.28574833742521394, "grad_norm": 1.8851501941680908, "learning_rate": 2.9470089411013014e-06, "loss": 0.3647, "step": 5994 }, { "epoch": 0.28579600982051345, "grad_norm": 1.4431147575378418, "learning_rate": 2.9442249066707993e-06, "loss": 0.7273, "step": 5995 }, { "epoch": 0.2858436822158129, "grad_norm": 1.4173214435577393, "learning_rate": 2.9414419608737366e-06, "loss": 0.9537, "step": 5996 }, { "epoch": 0.2858913546111124, "grad_norm": 2.244168996810913, "learning_rate": 2.938660104139487e-06, "loss": 0.9552, "step": 5997 }, { "epoch": 0.28593902700641194, "grad_norm": 1.6251845359802246, "learning_rate": 2.935879336897265e-06, "loss": 1.3154, "step": 5998 }, { "epoch": 0.28598669940171145, "grad_norm": 4.886594295501709, "learning_rate": 2.9330996595761184e-06, "loss": 1.2302, "step": 5999 }, { "epoch": 0.28603437179701097, "grad_norm": 1.7000819444656372, "learning_rate": 2.930321072604917e-06, "loss": 0.9731, "step": 6000 }, { "epoch": 0.2860820441923104, "grad_norm": 1.326775312423706, "learning_rate": 2.927543576412373e-06, "loss": 0.6907, "step": 6001 }, { "epoch": 0.28612971658760994, "grad_norm": 2.368781566619873, "learning_rate": 2.9247671714270198e-06, "loss": 0.2685, "step": 6002 }, { "epoch": 0.28617738898290945, "grad_norm": 1.3293931484222412, "learning_rate": 2.921991858077234e-06, "loss": 0.7275, "step": 6003 }, { "epoch": 0.28622506137820897, "grad_norm": 1.5075607299804688, "learning_rate": 2.919217636791213e-06, "loss": 0.6446, "step": 6004 }, { "epoch": 0.2862727337735084, "grad_norm": 11.684954643249512, "learning_rate": 2.916444507996993e-06, "loss": 0.5891, "step": 6005 }, { "epoch": 0.28632040616880794, "grad_norm": 1.08614182472229, "learning_rate": 2.9136724721224406e-06, "loss": 0.4969, "step": 6006 }, { "epoch": 0.28636807856410745, "grad_norm": 1.8808544874191284, "learning_rate": 2.910901529595248e-06, "loss": 1.0098, "step": 6007 }, { "epoch": 0.28641575095940697, "grad_norm": 2.4806671142578125, "learning_rate": 2.908131680842946e-06, "loss": 0.3836, "step": 6008 }, { "epoch": 0.2864634233547065, "grad_norm": 2.0469717979431152, "learning_rate": 2.9053629262928974e-06, "loss": 0.7834, "step": 6009 }, { "epoch": 0.28651109575000594, "grad_norm": 1.9556336402893066, "learning_rate": 2.9025952663722845e-06, "loss": 0.7412, "step": 6010 }, { "epoch": 0.28655876814530545, "grad_norm": 1.4475878477096558, "learning_rate": 2.899828701508133e-06, "loss": 0.1685, "step": 6011 }, { "epoch": 0.28660644054060497, "grad_norm": 1.3044952154159546, "learning_rate": 2.8970632321272983e-06, "loss": 1.0663, "step": 6012 }, { "epoch": 0.2866541129359045, "grad_norm": 1.0869470834732056, "learning_rate": 2.894298858656458e-06, "loss": 0.3633, "step": 6013 }, { "epoch": 0.28670178533120394, "grad_norm": 1.657095193862915, "learning_rate": 2.8915355815221293e-06, "loss": 0.5961, "step": 6014 }, { "epoch": 0.28674945772650345, "grad_norm": 2.124903678894043, "learning_rate": 2.88877340115066e-06, "loss": 0.5006, "step": 6015 }, { "epoch": 0.28679713012180297, "grad_norm": 1.6909003257751465, "learning_rate": 2.8860123179682244e-06, "loss": 0.6944, "step": 6016 }, { "epoch": 0.2868448025171025, "grad_norm": 1.5790278911590576, "learning_rate": 2.883252332400823e-06, "loss": 0.8222, "step": 6017 }, { "epoch": 0.286892474912402, "grad_norm": 1.340075969696045, "learning_rate": 2.8804934448743037e-06, "loss": 0.3271, "step": 6018 }, { "epoch": 0.28694014730770145, "grad_norm": 1.6675771474838257, "learning_rate": 2.8777356558143255e-06, "loss": 0.7405, "step": 6019 }, { "epoch": 0.28698781970300097, "grad_norm": 1.3836605548858643, "learning_rate": 2.87497896564639e-06, "loss": 0.614, "step": 6020 }, { "epoch": 0.2870354920983005, "grad_norm": 1.848780632019043, "learning_rate": 2.8722233747958295e-06, "loss": 0.5663, "step": 6021 }, { "epoch": 0.2870831644936, "grad_norm": 1.4647642374038696, "learning_rate": 2.869468883687798e-06, "loss": 0.6524, "step": 6022 }, { "epoch": 0.2871308368888995, "grad_norm": 2.593418598175049, "learning_rate": 2.8667154927472875e-06, "loss": 0.3415, "step": 6023 }, { "epoch": 0.28717850928419897, "grad_norm": 2.556431531906128, "learning_rate": 2.8639632023991204e-06, "loss": 0.9608, "step": 6024 }, { "epoch": 0.2872261816794985, "grad_norm": 1.3714070320129395, "learning_rate": 2.861212013067941e-06, "loss": 0.4729, "step": 6025 }, { "epoch": 0.287273854074798, "grad_norm": 1.4538246393203735, "learning_rate": 2.858461925178233e-06, "loss": 0.7408, "step": 6026 }, { "epoch": 0.2873215264700975, "grad_norm": 1.4156399965286255, "learning_rate": 2.855712939154309e-06, "loss": 0.5633, "step": 6027 }, { "epoch": 0.28736919886539697, "grad_norm": 1.7659258842468262, "learning_rate": 2.852965055420305e-06, "loss": 0.3994, "step": 6028 }, { "epoch": 0.2874168712606965, "grad_norm": 1.9925000667572021, "learning_rate": 2.8502182744001903e-06, "loss": 0.5116, "step": 6029 }, { "epoch": 0.287464543655996, "grad_norm": 2.9892420768737793, "learning_rate": 2.8474725965177717e-06, "loss": 0.8502, "step": 6030 }, { "epoch": 0.2875122160512955, "grad_norm": 1.7563321590423584, "learning_rate": 2.8447280221966754e-06, "loss": 1.0132, "step": 6031 }, { "epoch": 0.287559888446595, "grad_norm": 1.1654222011566162, "learning_rate": 2.841984551860356e-06, "loss": 0.4381, "step": 6032 }, { "epoch": 0.2876075608418945, "grad_norm": 5.544343948364258, "learning_rate": 2.8392421859321105e-06, "loss": 1.0829, "step": 6033 }, { "epoch": 0.287655233237194, "grad_norm": 1.2452433109283447, "learning_rate": 2.8365009248350515e-06, "loss": 0.6825, "step": 6034 }, { "epoch": 0.2877029056324935, "grad_norm": 3.3150670528411865, "learning_rate": 2.83376076899213e-06, "loss": 0.8066, "step": 6035 }, { "epoch": 0.287750578027793, "grad_norm": 4.442493915557861, "learning_rate": 2.831021718826126e-06, "loss": 0.2103, "step": 6036 }, { "epoch": 0.2877982504230925, "grad_norm": 2.108241319656372, "learning_rate": 2.8282837747596428e-06, "loss": 0.8832, "step": 6037 }, { "epoch": 0.287845922818392, "grad_norm": 4.231440544128418, "learning_rate": 2.8255469372151178e-06, "loss": 1.0792, "step": 6038 }, { "epoch": 0.2878935952136915, "grad_norm": 1.7912887334823608, "learning_rate": 2.8228112066148173e-06, "loss": 0.4263, "step": 6039 }, { "epoch": 0.287941267608991, "grad_norm": 1.8442174196243286, "learning_rate": 2.8200765833808406e-06, "loss": 0.4157, "step": 6040 }, { "epoch": 0.28798894000429054, "grad_norm": 1.9798344373703003, "learning_rate": 2.8173430679351055e-06, "loss": 0.7357, "step": 6041 }, { "epoch": 0.28803661239959, "grad_norm": 1.8778464794158936, "learning_rate": 2.8146106606993674e-06, "loss": 0.3929, "step": 6042 }, { "epoch": 0.2880842847948895, "grad_norm": 2.1753764152526855, "learning_rate": 2.8118793620952125e-06, "loss": 0.4436, "step": 6043 }, { "epoch": 0.288131957190189, "grad_norm": 1.4933431148529053, "learning_rate": 2.8091491725440454e-06, "loss": 0.8653, "step": 6044 }, { "epoch": 0.28817962958548854, "grad_norm": 1.1439440250396729, "learning_rate": 2.8064200924671137e-06, "loss": 0.8711, "step": 6045 }, { "epoch": 0.288227301980788, "grad_norm": 1.400071382522583, "learning_rate": 2.8036921222854776e-06, "loss": 0.6402, "step": 6046 }, { "epoch": 0.2882749743760875, "grad_norm": 1.5463074445724487, "learning_rate": 2.8009652624200436e-06, "loss": 0.3914, "step": 6047 }, { "epoch": 0.288322646771387, "grad_norm": 1.7884753942489624, "learning_rate": 2.7982395132915295e-06, "loss": 0.5871, "step": 6048 }, { "epoch": 0.28837031916668654, "grad_norm": 2.3855931758880615, "learning_rate": 2.7955148753204995e-06, "loss": 1.2087, "step": 6049 }, { "epoch": 0.28841799156198605, "grad_norm": 1.2375513315200806, "learning_rate": 2.7927913489273284e-06, "loss": 0.7577, "step": 6050 }, { "epoch": 0.2884656639572855, "grad_norm": 1.47097909450531, "learning_rate": 2.790068934532232e-06, "loss": 0.6823, "step": 6051 }, { "epoch": 0.288513336352585, "grad_norm": 1.063920497894287, "learning_rate": 2.7873476325552538e-06, "loss": 0.5109, "step": 6052 }, { "epoch": 0.28856100874788454, "grad_norm": 1.6459465026855469, "learning_rate": 2.784627443416258e-06, "loss": 0.4553, "step": 6053 }, { "epoch": 0.28860868114318405, "grad_norm": 1.9386135339736938, "learning_rate": 2.7819083675349436e-06, "loss": 0.8236, "step": 6054 }, { "epoch": 0.28865635353848357, "grad_norm": 2.30006742477417, "learning_rate": 2.779190405330838e-06, "loss": 0.4692, "step": 6055 }, { "epoch": 0.288704025933783, "grad_norm": 2.1627304553985596, "learning_rate": 2.7764735572232916e-06, "loss": 0.9222, "step": 6056 }, { "epoch": 0.28875169832908254, "grad_norm": 1.87648344039917, "learning_rate": 2.773757823631487e-06, "loss": 0.6527, "step": 6057 }, { "epoch": 0.28879937072438205, "grad_norm": 2.5315017700195312, "learning_rate": 2.7710432049744363e-06, "loss": 1.4156, "step": 6058 }, { "epoch": 0.28884704311968157, "grad_norm": 1.2173198461532593, "learning_rate": 2.768329701670972e-06, "loss": 0.664, "step": 6059 }, { "epoch": 0.288894715514981, "grad_norm": 1.634997844696045, "learning_rate": 2.765617314139767e-06, "loss": 0.4119, "step": 6060 }, { "epoch": 0.28894238791028054, "grad_norm": 4.035872936248779, "learning_rate": 2.7629060427993072e-06, "loss": 0.127, "step": 6061 }, { "epoch": 0.28899006030558005, "grad_norm": 2.1331849098205566, "learning_rate": 2.7601958880679204e-06, "loss": 0.8079, "step": 6062 }, { "epoch": 0.28903773270087957, "grad_norm": 1.4828959703445435, "learning_rate": 2.7574868503637496e-06, "loss": 0.3732, "step": 6063 }, { "epoch": 0.2890854050961791, "grad_norm": 1.6004363298416138, "learning_rate": 2.754778930104778e-06, "loss": 0.6978, "step": 6064 }, { "epoch": 0.28913307749147854, "grad_norm": 1.9005931615829468, "learning_rate": 2.7520721277088023e-06, "loss": 0.7738, "step": 6065 }, { "epoch": 0.28918074988677805, "grad_norm": 1.1089800596237183, "learning_rate": 2.7493664435934574e-06, "loss": 0.9114, "step": 6066 }, { "epoch": 0.28922842228207757, "grad_norm": 0.8061476945877075, "learning_rate": 2.7466618781762077e-06, "loss": 0.2514, "step": 6067 }, { "epoch": 0.2892760946773771, "grad_norm": 1.8242262601852417, "learning_rate": 2.743958431874332e-06, "loss": 0.8326, "step": 6068 }, { "epoch": 0.28932376707267654, "grad_norm": 2.1115779876708984, "learning_rate": 2.7412561051049468e-06, "loss": 0.8542, "step": 6069 }, { "epoch": 0.28937143946797605, "grad_norm": 1.728063941001892, "learning_rate": 2.7385548982849974e-06, "loss": 0.8097, "step": 6070 }, { "epoch": 0.28941911186327557, "grad_norm": 1.3411555290222168, "learning_rate": 2.7358548118312455e-06, "loss": 0.6236, "step": 6071 }, { "epoch": 0.2894667842585751, "grad_norm": 1.8006856441497803, "learning_rate": 2.7331558461602905e-06, "loss": 0.9321, "step": 6072 }, { "epoch": 0.2895144566538746, "grad_norm": 1.475117564201355, "learning_rate": 2.7304580016885564e-06, "loss": 0.6531, "step": 6073 }, { "epoch": 0.28956212904917406, "grad_norm": 1.6995662450790405, "learning_rate": 2.727761278832288e-06, "loss": 0.5523, "step": 6074 }, { "epoch": 0.28960980144447357, "grad_norm": 1.3316490650177002, "learning_rate": 2.725065678007568e-06, "loss": 0.6085, "step": 6075 }, { "epoch": 0.2896574738397731, "grad_norm": 1.3850513696670532, "learning_rate": 2.7223711996302935e-06, "loss": 0.4116, "step": 6076 }, { "epoch": 0.2897051462350726, "grad_norm": 2.90434193611145, "learning_rate": 2.719677844116202e-06, "loss": 1.1875, "step": 6077 }, { "epoch": 0.28975281863037206, "grad_norm": 1.1194543838500977, "learning_rate": 2.7169856118808414e-06, "loss": 0.7299, "step": 6078 }, { "epoch": 0.28980049102567157, "grad_norm": 1.3235876560211182, "learning_rate": 2.714294503339602e-06, "loss": 0.525, "step": 6079 }, { "epoch": 0.2898481634209711, "grad_norm": 13.684040069580078, "learning_rate": 2.7116045189076946e-06, "loss": 0.9644, "step": 6080 }, { "epoch": 0.2898958358162706, "grad_norm": 3.838590145111084, "learning_rate": 2.708915659000151e-06, "loss": 1.1306, "step": 6081 }, { "epoch": 0.2899435082115701, "grad_norm": 1.4317800998687744, "learning_rate": 2.706227924031838e-06, "loss": 0.9855, "step": 6082 }, { "epoch": 0.28999118060686957, "grad_norm": 1.6434540748596191, "learning_rate": 2.7035413144174472e-06, "loss": 0.4999, "step": 6083 }, { "epoch": 0.2900388530021691, "grad_norm": 1.2791095972061157, "learning_rate": 2.7008558305714905e-06, "loss": 0.8824, "step": 6084 }, { "epoch": 0.2900865253974686, "grad_norm": 4.311280250549316, "learning_rate": 2.698171472908312e-06, "loss": 0.7368, "step": 6085 }, { "epoch": 0.2901341977927681, "grad_norm": 1.7473076581954956, "learning_rate": 2.6954882418420836e-06, "loss": 1.4275, "step": 6086 }, { "epoch": 0.2901818701880676, "grad_norm": 1.281606674194336, "learning_rate": 2.6928061377867954e-06, "loss": 0.5668, "step": 6087 }, { "epoch": 0.2902295425833671, "grad_norm": 2.753815174102783, "learning_rate": 2.6901251611562695e-06, "loss": 0.6141, "step": 6088 }, { "epoch": 0.2902772149786666, "grad_norm": 2.2690324783325195, "learning_rate": 2.6874453123641585e-06, "loss": 1.1821, "step": 6089 }, { "epoch": 0.2903248873739661, "grad_norm": 1.9901797771453857, "learning_rate": 2.6847665918239273e-06, "loss": 1.2156, "step": 6090 }, { "epoch": 0.2903725597692656, "grad_norm": 1.8178085088729858, "learning_rate": 2.682088999948882e-06, "loss": 1.0537, "step": 6091 }, { "epoch": 0.2904202321645651, "grad_norm": 1.5463037490844727, "learning_rate": 2.679412537152143e-06, "loss": 0.8833, "step": 6092 }, { "epoch": 0.2904679045598646, "grad_norm": 1.2732456922531128, "learning_rate": 2.67673720384666e-06, "loss": 0.6408, "step": 6093 }, { "epoch": 0.2905155769551641, "grad_norm": 1.9538484811782837, "learning_rate": 2.6740630004452115e-06, "loss": 0.783, "step": 6094 }, { "epoch": 0.2905632493504636, "grad_norm": 2.7993714809417725, "learning_rate": 2.6713899273604027e-06, "loss": 0.3994, "step": 6095 }, { "epoch": 0.29061092174576314, "grad_norm": 1.2971073389053345, "learning_rate": 2.668717985004654e-06, "loss": 0.6819, "step": 6096 }, { "epoch": 0.2906585941410626, "grad_norm": 1.4017008543014526, "learning_rate": 2.6660471737902228e-06, "loss": 0.4674, "step": 6097 }, { "epoch": 0.2907062665363621, "grad_norm": 1.4421530961990356, "learning_rate": 2.6633774941291912e-06, "loss": 0.5829, "step": 6098 }, { "epoch": 0.2907539389316616, "grad_norm": 1.4370533227920532, "learning_rate": 2.6607089464334567e-06, "loss": 0.6989, "step": 6099 }, { "epoch": 0.29080161132696114, "grad_norm": 1.6371572017669678, "learning_rate": 2.658041531114751e-06, "loss": 0.845, "step": 6100 }, { "epoch": 0.2908492837222606, "grad_norm": 1.198086142539978, "learning_rate": 2.6553752485846327e-06, "loss": 0.4365, "step": 6101 }, { "epoch": 0.2908969561175601, "grad_norm": 1.3670740127563477, "learning_rate": 2.652710099254476e-06, "loss": 0.6445, "step": 6102 }, { "epoch": 0.2909446285128596, "grad_norm": 1.2577002048492432, "learning_rate": 2.650046083535489e-06, "loss": 0.6932, "step": 6103 }, { "epoch": 0.29099230090815914, "grad_norm": 1.0973515510559082, "learning_rate": 2.6473832018387034e-06, "loss": 0.5731, "step": 6104 }, { "epoch": 0.29103997330345865, "grad_norm": 1.904301643371582, "learning_rate": 2.64472145457497e-06, "loss": 0.7919, "step": 6105 }, { "epoch": 0.2910876456987581, "grad_norm": 1.3597952127456665, "learning_rate": 2.642060842154974e-06, "loss": 0.4704, "step": 6106 }, { "epoch": 0.2911353180940576, "grad_norm": 1.5735416412353516, "learning_rate": 2.639401364989218e-06, "loss": 0.5481, "step": 6107 }, { "epoch": 0.29118299048935714, "grad_norm": 1.0483876466751099, "learning_rate": 2.6367430234880286e-06, "loss": 0.5559, "step": 6108 }, { "epoch": 0.29123066288465665, "grad_norm": 2.730222225189209, "learning_rate": 2.634085818061565e-06, "loss": 0.6675, "step": 6109 }, { "epoch": 0.29127833527995617, "grad_norm": 1.515252947807312, "learning_rate": 2.631429749119807e-06, "loss": 0.8489, "step": 6110 }, { "epoch": 0.2913260076752556, "grad_norm": 1.6235744953155518, "learning_rate": 2.6287748170725545e-06, "loss": 0.6381, "step": 6111 }, { "epoch": 0.29137368007055514, "grad_norm": 1.1834958791732788, "learning_rate": 2.62612102232944e-06, "loss": 0.6782, "step": 6112 }, { "epoch": 0.29142135246585466, "grad_norm": 1.1149227619171143, "learning_rate": 2.6234683652999173e-06, "loss": 0.5681, "step": 6113 }, { "epoch": 0.29146902486115417, "grad_norm": 1.7449109554290771, "learning_rate": 2.6208168463932595e-06, "loss": 0.5997, "step": 6114 }, { "epoch": 0.2915166972564536, "grad_norm": 1.2367126941680908, "learning_rate": 2.618166466018571e-06, "loss": 0.3397, "step": 6115 }, { "epoch": 0.29156436965175314, "grad_norm": 1.2803328037261963, "learning_rate": 2.6155172245847793e-06, "loss": 0.6679, "step": 6116 }, { "epoch": 0.29161204204705266, "grad_norm": 1.7904669046401978, "learning_rate": 2.6128691225006376e-06, "loss": 0.6717, "step": 6117 }, { "epoch": 0.29165971444235217, "grad_norm": 2.405367612838745, "learning_rate": 2.6102221601747136e-06, "loss": 0.933, "step": 6118 }, { "epoch": 0.2917073868376517, "grad_norm": 2.131350517272949, "learning_rate": 2.607576338015414e-06, "loss": 0.84, "step": 6119 }, { "epoch": 0.29175505923295114, "grad_norm": 1.3268470764160156, "learning_rate": 2.6049316564309546e-06, "loss": 0.923, "step": 6120 }, { "epoch": 0.29180273162825066, "grad_norm": 1.7321757078170776, "learning_rate": 2.60228811582939e-06, "loss": 0.7449, "step": 6121 }, { "epoch": 0.29185040402355017, "grad_norm": 4.641642093658447, "learning_rate": 2.599645716618584e-06, "loss": 0.5682, "step": 6122 }, { "epoch": 0.2918980764188497, "grad_norm": 2.070122003555298, "learning_rate": 2.597004459206238e-06, "loss": 1.0381, "step": 6123 }, { "epoch": 0.29194574881414914, "grad_norm": 1.9797791242599487, "learning_rate": 2.5943643439998644e-06, "loss": 0.7218, "step": 6124 }, { "epoch": 0.29199342120944866, "grad_norm": 1.7723979949951172, "learning_rate": 2.5917253714068104e-06, "loss": 0.5357, "step": 6125 }, { "epoch": 0.29204109360474817, "grad_norm": 3.6121599674224854, "learning_rate": 2.589087541834243e-06, "loss": 0.3378, "step": 6126 }, { "epoch": 0.2920887660000477, "grad_norm": 1.2610667943954468, "learning_rate": 2.5864508556891475e-06, "loss": 0.5998, "step": 6127 }, { "epoch": 0.2921364383953472, "grad_norm": 1.3514364957809448, "learning_rate": 2.5838153133783405e-06, "loss": 0.5067, "step": 6128 }, { "epoch": 0.29218411079064666, "grad_norm": 2.05029034614563, "learning_rate": 2.581180915308461e-06, "loss": 0.2832, "step": 6129 }, { "epoch": 0.29223178318594617, "grad_norm": 1.4210838079452515, "learning_rate": 2.578547661885965e-06, "loss": 0.9129, "step": 6130 }, { "epoch": 0.2922794555812457, "grad_norm": 2.062899112701416, "learning_rate": 2.5759155535171388e-06, "loss": 0.9497, "step": 6131 }, { "epoch": 0.2923271279765452, "grad_norm": 1.331218957901001, "learning_rate": 2.5732845906080915e-06, "loss": 0.7308, "step": 6132 }, { "epoch": 0.29237480037184466, "grad_norm": 5.356071472167969, "learning_rate": 2.570654773564749e-06, "loss": 1.0458, "step": 6133 }, { "epoch": 0.29242247276714417, "grad_norm": 1.890952229499817, "learning_rate": 2.5680261027928676e-06, "loss": 0.6532, "step": 6134 }, { "epoch": 0.2924701451624437, "grad_norm": 1.5121945142745972, "learning_rate": 2.565398578698026e-06, "loss": 0.7223, "step": 6135 }, { "epoch": 0.2925178175577432, "grad_norm": 1.5600584745407104, "learning_rate": 2.5627722016856237e-06, "loss": 0.8201, "step": 6136 }, { "epoch": 0.2925654899530427, "grad_norm": 1.4974292516708374, "learning_rate": 2.5601469721608786e-06, "loss": 1.0114, "step": 6137 }, { "epoch": 0.29261316234834217, "grad_norm": 1.9437029361724854, "learning_rate": 2.557522890528842e-06, "loss": 0.8659, "step": 6138 }, { "epoch": 0.2926608347436417, "grad_norm": 1.3690195083618164, "learning_rate": 2.554899957194379e-06, "loss": 0.8612, "step": 6139 }, { "epoch": 0.2927085071389412, "grad_norm": 2.2465035915374756, "learning_rate": 2.5522781725621814e-06, "loss": 1.0236, "step": 6140 }, { "epoch": 0.2927561795342407, "grad_norm": 1.2816989421844482, "learning_rate": 2.549657537036769e-06, "loss": 0.3911, "step": 6141 }, { "epoch": 0.2928038519295402, "grad_norm": 0.9845332503318787, "learning_rate": 2.547038051022472e-06, "loss": 0.5499, "step": 6142 }, { "epoch": 0.2928515243248397, "grad_norm": 1.6904677152633667, "learning_rate": 2.544419714923454e-06, "loss": 1.073, "step": 6143 }, { "epoch": 0.2928991967201392, "grad_norm": 1.983976125717163, "learning_rate": 2.5418025291436976e-06, "loss": 0.635, "step": 6144 }, { "epoch": 0.2929468691154387, "grad_norm": 1.4890094995498657, "learning_rate": 2.539186494087005e-06, "loss": 0.7808, "step": 6145 }, { "epoch": 0.2929945415107382, "grad_norm": 1.6880234479904175, "learning_rate": 2.5365716101570036e-06, "loss": 0.9901, "step": 6146 }, { "epoch": 0.2930422139060377, "grad_norm": 5.635754585266113, "learning_rate": 2.533957877757148e-06, "loss": 0.5273, "step": 6147 }, { "epoch": 0.2930898863013372, "grad_norm": 3.41428804397583, "learning_rate": 2.5313452972907027e-06, "loss": 0.7776, "step": 6148 }, { "epoch": 0.2931375586966367, "grad_norm": 1.9443776607513428, "learning_rate": 2.5287338691607664e-06, "loss": 0.5359, "step": 6149 }, { "epoch": 0.2931852310919362, "grad_norm": 1.348767638206482, "learning_rate": 2.5261235937702576e-06, "loss": 0.574, "step": 6150 }, { "epoch": 0.29323290348723574, "grad_norm": 1.3055534362792969, "learning_rate": 2.523514471521913e-06, "loss": 0.6479, "step": 6151 }, { "epoch": 0.2932805758825352, "grad_norm": 1.907870888710022, "learning_rate": 2.520906502818289e-06, "loss": 0.7861, "step": 6152 }, { "epoch": 0.2933282482778347, "grad_norm": 4.606992244720459, "learning_rate": 2.518299688061772e-06, "loss": 0.0486, "step": 6153 }, { "epoch": 0.2933759206731342, "grad_norm": 1.4165542125701904, "learning_rate": 2.5156940276545692e-06, "loss": 0.4829, "step": 6154 }, { "epoch": 0.29342359306843374, "grad_norm": 1.6055858135223389, "learning_rate": 2.5130895219987015e-06, "loss": 0.6591, "step": 6155 }, { "epoch": 0.2934712654637332, "grad_norm": 1.3969099521636963, "learning_rate": 2.5104861714960207e-06, "loss": 0.7502, "step": 6156 }, { "epoch": 0.2935189378590327, "grad_norm": 2.8612236976623535, "learning_rate": 2.507883976548199e-06, "loss": 0.7497, "step": 6157 }, { "epoch": 0.2935666102543322, "grad_norm": 1.6466960906982422, "learning_rate": 2.5052829375567232e-06, "loss": 0.7623, "step": 6158 }, { "epoch": 0.29361428264963174, "grad_norm": 3.407519817352295, "learning_rate": 2.5026830549229097e-06, "loss": 0.796, "step": 6159 }, { "epoch": 0.29366195504493126, "grad_norm": 1.9417352676391602, "learning_rate": 2.500084329047896e-06, "loss": 0.9675, "step": 6160 }, { "epoch": 0.2937096274402307, "grad_norm": 1.7109984159469604, "learning_rate": 2.4974867603326337e-06, "loss": 0.933, "step": 6161 }, { "epoch": 0.29375729983553023, "grad_norm": 1.6186312437057495, "learning_rate": 2.4948903491779032e-06, "loss": 0.6186, "step": 6162 }, { "epoch": 0.29380497223082974, "grad_norm": 1.6657383441925049, "learning_rate": 2.492295095984306e-06, "loss": 0.8199, "step": 6163 }, { "epoch": 0.29385264462612926, "grad_norm": 1.8358724117279053, "learning_rate": 2.4897010011522595e-06, "loss": 0.7381, "step": 6164 }, { "epoch": 0.2939003170214287, "grad_norm": 3.0252318382263184, "learning_rate": 2.48710806508201e-06, "loss": 0.5331, "step": 6165 }, { "epoch": 0.29394798941672823, "grad_norm": 2.4788169860839844, "learning_rate": 2.484516288173615e-06, "loss": 0.9319, "step": 6166 }, { "epoch": 0.29399566181202774, "grad_norm": 0.9535640478134155, "learning_rate": 2.4819256708269655e-06, "loss": 0.4834, "step": 6167 }, { "epoch": 0.29404333420732726, "grad_norm": 2.011082887649536, "learning_rate": 2.47933621344176e-06, "loss": 0.7738, "step": 6168 }, { "epoch": 0.29409100660262677, "grad_norm": 2.474637031555176, "learning_rate": 2.4767479164175323e-06, "loss": 0.9388, "step": 6169 }, { "epoch": 0.29413867899792623, "grad_norm": 2.801311492919922, "learning_rate": 2.474160780153624e-06, "loss": 0.5579, "step": 6170 }, { "epoch": 0.29418635139322574, "grad_norm": 4.237105846405029, "learning_rate": 2.471574805049206e-06, "loss": 0.6163, "step": 6171 }, { "epoch": 0.29423402378852526, "grad_norm": 1.6840664148330688, "learning_rate": 2.468989991503271e-06, "loss": 0.659, "step": 6172 }, { "epoch": 0.29428169618382477, "grad_norm": 2.800570011138916, "learning_rate": 2.4664063399146232e-06, "loss": 0.6551, "step": 6173 }, { "epoch": 0.2943293685791243, "grad_norm": 1.232972502708435, "learning_rate": 2.4638238506818958e-06, "loss": 0.7185, "step": 6174 }, { "epoch": 0.29437704097442374, "grad_norm": 2.1023988723754883, "learning_rate": 2.4612425242035432e-06, "loss": 1.0223, "step": 6175 }, { "epoch": 0.29442471336972326, "grad_norm": 1.7870928049087524, "learning_rate": 2.4586623608778324e-06, "loss": 0.8333, "step": 6176 }, { "epoch": 0.29447238576502277, "grad_norm": 1.6844615936279297, "learning_rate": 2.456083361102858e-06, "loss": 0.6692, "step": 6177 }, { "epoch": 0.2945200581603223, "grad_norm": 4.017955303192139, "learning_rate": 2.453505525276537e-06, "loss": 0.7432, "step": 6178 }, { "epoch": 0.29456773055562174, "grad_norm": 2.203190565109253, "learning_rate": 2.450928853796597e-06, "loss": 0.5097, "step": 6179 }, { "epoch": 0.29461540295092126, "grad_norm": 1.717183232307434, "learning_rate": 2.4483533470605967e-06, "loss": 0.9019, "step": 6180 }, { "epoch": 0.29466307534622077, "grad_norm": 3.6508407592773438, "learning_rate": 2.4457790054659057e-06, "loss": 1.2841, "step": 6181 }, { "epoch": 0.2947107477415203, "grad_norm": 2.009955644607544, "learning_rate": 2.443205829409724e-06, "loss": 0.9626, "step": 6182 }, { "epoch": 0.2947584201368198, "grad_norm": 1.492617130279541, "learning_rate": 2.440633819289059e-06, "loss": 0.9316, "step": 6183 }, { "epoch": 0.29480609253211926, "grad_norm": 2.5129506587982178, "learning_rate": 2.4380629755007524e-06, "loss": 0.8387, "step": 6184 }, { "epoch": 0.29485376492741877, "grad_norm": 2.931331157684326, "learning_rate": 2.4354932984414527e-06, "loss": 0.6627, "step": 6185 }, { "epoch": 0.2949014373227183, "grad_norm": 1.0418578386306763, "learning_rate": 2.432924788507638e-06, "loss": 0.4271, "step": 6186 }, { "epoch": 0.2949491097180178, "grad_norm": 4.242718696594238, "learning_rate": 2.430357446095606e-06, "loss": 1.7003, "step": 6187 }, { "epoch": 0.29499678211331726, "grad_norm": 1.9347028732299805, "learning_rate": 2.427791271601465e-06, "loss": 0.6363, "step": 6188 }, { "epoch": 0.29504445450861677, "grad_norm": 1.222937822341919, "learning_rate": 2.425226265421151e-06, "loss": 0.5721, "step": 6189 }, { "epoch": 0.2950921269039163, "grad_norm": 1.9609845876693726, "learning_rate": 2.422662427950423e-06, "loss": 0.5534, "step": 6190 }, { "epoch": 0.2951397992992158, "grad_norm": 1.2237188816070557, "learning_rate": 2.4200997595848474e-06, "loss": 0.6456, "step": 6191 }, { "epoch": 0.2951874716945153, "grad_norm": 2.6468327045440674, "learning_rate": 2.4175382607198217e-06, "loss": 0.9785, "step": 6192 }, { "epoch": 0.29523514408981477, "grad_norm": 1.7278664112091064, "learning_rate": 2.4149779317505574e-06, "loss": 0.9488, "step": 6193 }, { "epoch": 0.2952828164851143, "grad_norm": 7.507183074951172, "learning_rate": 2.4124187730720916e-06, "loss": 0.3669, "step": 6194 }, { "epoch": 0.2953304888804138, "grad_norm": 1.6379116773605347, "learning_rate": 2.4098607850792712e-06, "loss": 0.3944, "step": 6195 }, { "epoch": 0.2953781612757133, "grad_norm": 1.0105853080749512, "learning_rate": 2.4073039681667653e-06, "loss": 0.3334, "step": 6196 }, { "epoch": 0.2954258336710128, "grad_norm": 2.0226950645446777, "learning_rate": 2.4047483227290715e-06, "loss": 0.6971, "step": 6197 }, { "epoch": 0.2954735060663123, "grad_norm": 1.2883707284927368, "learning_rate": 2.4021938491604912e-06, "loss": 0.6105, "step": 6198 }, { "epoch": 0.2955211784616118, "grad_norm": 0.8866240382194519, "learning_rate": 2.3996405478551586e-06, "loss": 0.3959, "step": 6199 }, { "epoch": 0.2955688508569113, "grad_norm": 1.6138339042663574, "learning_rate": 2.3970884192070232e-06, "loss": 0.5984, "step": 6200 }, { "epoch": 0.2956165232522108, "grad_norm": 1.867600440979004, "learning_rate": 2.3945374636098474e-06, "loss": 1.014, "step": 6201 }, { "epoch": 0.2956641956475103, "grad_norm": 2.4693214893341064, "learning_rate": 2.3919876814572197e-06, "loss": 0.8062, "step": 6202 }, { "epoch": 0.2957118680428098, "grad_norm": 2.3167264461517334, "learning_rate": 2.3894390731425486e-06, "loss": 0.7653, "step": 6203 }, { "epoch": 0.2957595404381093, "grad_norm": 1.9118690490722656, "learning_rate": 2.3868916390590524e-06, "loss": 0.7576, "step": 6204 }, { "epoch": 0.29580721283340883, "grad_norm": 1.512364387512207, "learning_rate": 2.384345379599775e-06, "loss": 0.6581, "step": 6205 }, { "epoch": 0.29585488522870834, "grad_norm": 3.281097888946533, "learning_rate": 2.3818002951575834e-06, "loss": 0.566, "step": 6206 }, { "epoch": 0.2959025576240078, "grad_norm": 1.3613389730453491, "learning_rate": 2.3792563861251506e-06, "loss": 0.7637, "step": 6207 }, { "epoch": 0.2959502300193073, "grad_norm": 1.4592924118041992, "learning_rate": 2.3767136528949797e-06, "loss": 0.8039, "step": 6208 }, { "epoch": 0.29599790241460683, "grad_norm": 1.6873000860214233, "learning_rate": 2.3741720958593896e-06, "loss": 0.6793, "step": 6209 }, { "epoch": 0.29604557480990634, "grad_norm": 3.2447853088378906, "learning_rate": 2.371631715410512e-06, "loss": 0.295, "step": 6210 }, { "epoch": 0.2960932472052058, "grad_norm": 3.141862392425537, "learning_rate": 2.3690925119403065e-06, "loss": 0.4293, "step": 6211 }, { "epoch": 0.2961409196005053, "grad_norm": 1.427262783050537, "learning_rate": 2.3665544858405433e-06, "loss": 0.7697, "step": 6212 }, { "epoch": 0.29618859199580483, "grad_norm": 1.0451107025146484, "learning_rate": 2.3640176375028103e-06, "loss": 0.3854, "step": 6213 }, { "epoch": 0.29623626439110434, "grad_norm": 1.004555583000183, "learning_rate": 2.361481967318521e-06, "loss": 0.3633, "step": 6214 }, { "epoch": 0.29628393678640386, "grad_norm": 1.7891265153884888, "learning_rate": 2.3589474756789045e-06, "loss": 0.4989, "step": 6215 }, { "epoch": 0.2963316091817033, "grad_norm": 4.669949531555176, "learning_rate": 2.3564141629750026e-06, "loss": 1.2666, "step": 6216 }, { "epoch": 0.29637928157700283, "grad_norm": 0.9578444361686707, "learning_rate": 2.3538820295976816e-06, "loss": 0.5031, "step": 6217 }, { "epoch": 0.29642695397230234, "grad_norm": 2.856482982635498, "learning_rate": 2.3513510759376266e-06, "loss": 0.53, "step": 6218 }, { "epoch": 0.29647462636760186, "grad_norm": 1.3544644117355347, "learning_rate": 2.3488213023853325e-06, "loss": 0.7535, "step": 6219 }, { "epoch": 0.2965222987629013, "grad_norm": 1.7900073528289795, "learning_rate": 2.3462927093311183e-06, "loss": 0.7144, "step": 6220 }, { "epoch": 0.29656997115820083, "grad_norm": 1.481199860572815, "learning_rate": 2.343765297165125e-06, "loss": 0.6453, "step": 6221 }, { "epoch": 0.29661764355350034, "grad_norm": 1.6059015989303589, "learning_rate": 2.341239066277299e-06, "loss": 0.5346, "step": 6222 }, { "epoch": 0.29666531594879986, "grad_norm": 2.2869980335235596, "learning_rate": 2.3387140170574154e-06, "loss": 0.6164, "step": 6223 }, { "epoch": 0.29671298834409937, "grad_norm": 2.2377076148986816, "learning_rate": 2.3361901498950656e-06, "loss": 0.4721, "step": 6224 }, { "epoch": 0.29676066073939883, "grad_norm": 1.2980705499649048, "learning_rate": 2.333667465179651e-06, "loss": 0.7414, "step": 6225 }, { "epoch": 0.29680833313469834, "grad_norm": 1.2358131408691406, "learning_rate": 2.3311459633004006e-06, "loss": 0.7175, "step": 6226 }, { "epoch": 0.29685600552999786, "grad_norm": 1.0507140159606934, "learning_rate": 2.328625644646355e-06, "loss": 0.2351, "step": 6227 }, { "epoch": 0.29690367792529737, "grad_norm": 1.6839592456817627, "learning_rate": 2.3261065096063696e-06, "loss": 1.3605, "step": 6228 }, { "epoch": 0.2969513503205969, "grad_norm": 2.320051908493042, "learning_rate": 2.3235885585691243e-06, "loss": 0.8316, "step": 6229 }, { "epoch": 0.29699902271589634, "grad_norm": 1.6431183815002441, "learning_rate": 2.3210717919231117e-06, "loss": 0.6969, "step": 6230 }, { "epoch": 0.29704669511119586, "grad_norm": 1.236509919166565, "learning_rate": 2.318556210056648e-06, "loss": 0.7098, "step": 6231 }, { "epoch": 0.29709436750649537, "grad_norm": 1.4272632598876953, "learning_rate": 2.3160418133578544e-06, "loss": 1.0649, "step": 6232 }, { "epoch": 0.2971420399017949, "grad_norm": 1.1121752262115479, "learning_rate": 2.3135286022146785e-06, "loss": 0.5511, "step": 6233 }, { "epoch": 0.29718971229709434, "grad_norm": 1.8114746809005737, "learning_rate": 2.3110165770148873e-06, "loss": 0.7566, "step": 6234 }, { "epoch": 0.29723738469239386, "grad_norm": 1.8374031782150269, "learning_rate": 2.308505738146055e-06, "loss": 0.6684, "step": 6235 }, { "epoch": 0.29728505708769337, "grad_norm": 2.0366532802581787, "learning_rate": 2.3059960859955798e-06, "loss": 0.7003, "step": 6236 }, { "epoch": 0.2973327294829929, "grad_norm": 2.616689920425415, "learning_rate": 2.303487620950677e-06, "loss": 0.6459, "step": 6237 }, { "epoch": 0.2973804018782924, "grad_norm": 1.5783463716506958, "learning_rate": 2.3009803433983744e-06, "loss": 0.585, "step": 6238 }, { "epoch": 0.29742807427359186, "grad_norm": 1.1030124425888062, "learning_rate": 2.2984742537255233e-06, "loss": 0.698, "step": 6239 }, { "epoch": 0.29747574666889137, "grad_norm": 1.8951327800750732, "learning_rate": 2.2959693523187808e-06, "loss": 0.8113, "step": 6240 }, { "epoch": 0.2975234190641909, "grad_norm": 1.2447819709777832, "learning_rate": 2.2934656395646336e-06, "loss": 0.3373, "step": 6241 }, { "epoch": 0.2975710914594904, "grad_norm": 1.1514697074890137, "learning_rate": 2.290963115849375e-06, "loss": 0.5508, "step": 6242 }, { "epoch": 0.29761876385478986, "grad_norm": 2.6068334579467773, "learning_rate": 2.2884617815591213e-06, "loss": 1.0128, "step": 6243 }, { "epoch": 0.2976664362500894, "grad_norm": 2.0510177612304688, "learning_rate": 2.285961637079799e-06, "loss": 0.8102, "step": 6244 }, { "epoch": 0.2977141086453889, "grad_norm": 1.5474066734313965, "learning_rate": 2.283462682797156e-06, "loss": 0.9855, "step": 6245 }, { "epoch": 0.2977617810406884, "grad_norm": 1.4880986213684082, "learning_rate": 2.2809649190967597e-06, "loss": 0.9896, "step": 6246 }, { "epoch": 0.2978094534359879, "grad_norm": 1.9403513669967651, "learning_rate": 2.2784683463639832e-06, "loss": 0.499, "step": 6247 }, { "epoch": 0.2978571258312874, "grad_norm": 1.157433271408081, "learning_rate": 2.2759729649840232e-06, "loss": 0.7567, "step": 6248 }, { "epoch": 0.2979047982265869, "grad_norm": 2.5560498237609863, "learning_rate": 2.2734787753418965e-06, "loss": 0.8808, "step": 6249 }, { "epoch": 0.2979524706218864, "grad_norm": 2.112027168273926, "learning_rate": 2.2709857778224244e-06, "loss": 1.1075, "step": 6250 }, { "epoch": 0.2980001430171859, "grad_norm": 1.8769203424453735, "learning_rate": 2.2684939728102528e-06, "loss": 0.696, "step": 6251 }, { "epoch": 0.2980478154124854, "grad_norm": 1.9187896251678467, "learning_rate": 2.2660033606898447e-06, "loss": 0.4853, "step": 6252 }, { "epoch": 0.2980954878077849, "grad_norm": 1.8584184646606445, "learning_rate": 2.263513941845471e-06, "loss": 0.8023, "step": 6253 }, { "epoch": 0.2981431602030844, "grad_norm": 7.00104284286499, "learning_rate": 2.261025716661225e-06, "loss": 0.8411, "step": 6254 }, { "epoch": 0.2981908325983839, "grad_norm": 1.5704630613327026, "learning_rate": 2.2585386855210177e-06, "loss": 0.6705, "step": 6255 }, { "epoch": 0.29823850499368343, "grad_norm": 1.3708139657974243, "learning_rate": 2.256052848808571e-06, "loss": 0.6181, "step": 6256 }, { "epoch": 0.2982861773889829, "grad_norm": 0.9261099696159363, "learning_rate": 2.2535682069074183e-06, "loss": 0.4078, "step": 6257 }, { "epoch": 0.2983338497842824, "grad_norm": 1.676835060119629, "learning_rate": 2.251084760200921e-06, "loss": 0.9565, "step": 6258 }, { "epoch": 0.2983815221795819, "grad_norm": 1.5448845624923706, "learning_rate": 2.248602509072245e-06, "loss": 0.7191, "step": 6259 }, { "epoch": 0.29842919457488143, "grad_norm": 2.3571763038635254, "learning_rate": 2.2461214539043773e-06, "loss": 1.0151, "step": 6260 }, { "epoch": 0.29847686697018094, "grad_norm": 1.806689739227295, "learning_rate": 2.2436415950801228e-06, "loss": 0.6715, "step": 6261 }, { "epoch": 0.2985245393654804, "grad_norm": 0.9933075308799744, "learning_rate": 2.241162932982093e-06, "loss": 0.2741, "step": 6262 }, { "epoch": 0.2985722117607799, "grad_norm": 1.2085294723510742, "learning_rate": 2.2386854679927215e-06, "loss": 0.444, "step": 6263 }, { "epoch": 0.29861988415607943, "grad_norm": 1.617403507232666, "learning_rate": 2.2362092004942583e-06, "loss": 0.8379, "step": 6264 }, { "epoch": 0.29866755655137894, "grad_norm": 1.9029408693313599, "learning_rate": 2.233734130868762e-06, "loss": 0.7259, "step": 6265 }, { "epoch": 0.2987152289466784, "grad_norm": 2.145231246948242, "learning_rate": 2.2312602594981126e-06, "loss": 0.6003, "step": 6266 }, { "epoch": 0.2987629013419779, "grad_norm": 1.2889387607574463, "learning_rate": 2.228787586764004e-06, "loss": 0.6624, "step": 6267 }, { "epoch": 0.29881057373727743, "grad_norm": 2.299807548522949, "learning_rate": 2.2263161130479405e-06, "loss": 0.7913, "step": 6268 }, { "epoch": 0.29885824613257694, "grad_norm": 2.704833984375, "learning_rate": 2.2238458387312476e-06, "loss": 1.0125, "step": 6269 }, { "epoch": 0.29890591852787646, "grad_norm": 1.1400604248046875, "learning_rate": 2.2213767641950658e-06, "loss": 0.4367, "step": 6270 }, { "epoch": 0.2989535909231759, "grad_norm": 2.1746912002563477, "learning_rate": 2.2189088898203446e-06, "loss": 0.6238, "step": 6271 }, { "epoch": 0.29900126331847543, "grad_norm": 1.5861088037490845, "learning_rate": 2.2164422159878496e-06, "loss": 0.8999, "step": 6272 }, { "epoch": 0.29904893571377494, "grad_norm": 1.3148717880249023, "learning_rate": 2.2139767430781654e-06, "loss": 0.6231, "step": 6273 }, { "epoch": 0.29909660810907446, "grad_norm": 1.5059646368026733, "learning_rate": 2.211512471471692e-06, "loss": 0.5306, "step": 6274 }, { "epoch": 0.2991442805043739, "grad_norm": 1.476387619972229, "learning_rate": 2.2090494015486354e-06, "loss": 0.727, "step": 6275 }, { "epoch": 0.29919195289967343, "grad_norm": 1.2458332777023315, "learning_rate": 2.206587533689025e-06, "loss": 0.8143, "step": 6276 }, { "epoch": 0.29923962529497294, "grad_norm": 2.3385074138641357, "learning_rate": 2.2041268682727034e-06, "loss": 0.9914, "step": 6277 }, { "epoch": 0.29928729769027246, "grad_norm": 1.990761637687683, "learning_rate": 2.2016674056793232e-06, "loss": 0.822, "step": 6278 }, { "epoch": 0.29933497008557197, "grad_norm": 1.5822852849960327, "learning_rate": 2.1992091462883537e-06, "loss": 0.4592, "step": 6279 }, { "epoch": 0.29938264248087143, "grad_norm": 1.595819354057312, "learning_rate": 2.196752090479083e-06, "loss": 0.7642, "step": 6280 }, { "epoch": 0.29943031487617094, "grad_norm": 2.5592379570007324, "learning_rate": 2.194296238630604e-06, "loss": 0.4591, "step": 6281 }, { "epoch": 0.29947798727147046, "grad_norm": 1.5908269882202148, "learning_rate": 2.1918415911218327e-06, "loss": 0.3705, "step": 6282 }, { "epoch": 0.29952565966676997, "grad_norm": 1.236393690109253, "learning_rate": 2.189388148331498e-06, "loss": 0.6536, "step": 6283 }, { "epoch": 0.2995733320620695, "grad_norm": 2.7467339038848877, "learning_rate": 2.186935910638136e-06, "loss": 1.3771, "step": 6284 }, { "epoch": 0.29962100445736894, "grad_norm": 1.7299925088882446, "learning_rate": 2.1844848784201067e-06, "loss": 0.9504, "step": 6285 }, { "epoch": 0.29966867685266846, "grad_norm": 1.920395851135254, "learning_rate": 2.182035052055573e-06, "loss": 0.5913, "step": 6286 }, { "epoch": 0.299716349247968, "grad_norm": 0.8589304089546204, "learning_rate": 2.1795864319225246e-06, "loss": 0.3922, "step": 6287 }, { "epoch": 0.2997640216432675, "grad_norm": 1.6341394186019897, "learning_rate": 2.177139018398752e-06, "loss": 1.0774, "step": 6288 }, { "epoch": 0.29981169403856694, "grad_norm": 1.6117527484893799, "learning_rate": 2.1746928118618717e-06, "loss": 0.7447, "step": 6289 }, { "epoch": 0.29985936643386646, "grad_norm": 1.3888081312179565, "learning_rate": 2.1722478126893022e-06, "loss": 0.7575, "step": 6290 }, { "epoch": 0.299907038829166, "grad_norm": 3.7830941677093506, "learning_rate": 2.1698040212582862e-06, "loss": 0.8668, "step": 6291 }, { "epoch": 0.2999547112244655, "grad_norm": 1.5668525695800781, "learning_rate": 2.167361437945876e-06, "loss": 0.9243, "step": 6292 }, { "epoch": 0.300002383619765, "grad_norm": 3.770625114440918, "learning_rate": 2.1649200631289322e-06, "loss": 0.2455, "step": 6293 }, { "epoch": 0.30005005601506446, "grad_norm": 2.0723373889923096, "learning_rate": 2.162479897184139e-06, "loss": 0.9181, "step": 6294 }, { "epoch": 0.300097728410364, "grad_norm": 3.0751869678497314, "learning_rate": 2.1600409404879875e-06, "loss": 0.5347, "step": 6295 }, { "epoch": 0.3001454008056635, "grad_norm": 1.8688127994537354, "learning_rate": 2.157603193416781e-06, "loss": 0.6948, "step": 6296 }, { "epoch": 0.300193073200963, "grad_norm": 1.7637250423431396, "learning_rate": 2.1551666563466413e-06, "loss": 0.674, "step": 6297 }, { "epoch": 0.30024074559626246, "grad_norm": 2.5037527084350586, "learning_rate": 2.152731329653502e-06, "loss": 0.6819, "step": 6298 }, { "epoch": 0.300288417991562, "grad_norm": 1.2383288145065308, "learning_rate": 2.150297213713105e-06, "loss": 0.6406, "step": 6299 }, { "epoch": 0.3003360903868615, "grad_norm": 9.533638000488281, "learning_rate": 2.1478643089010143e-06, "loss": 1.0306, "step": 6300 }, { "epoch": 0.300383762782161, "grad_norm": 3.421718120574951, "learning_rate": 2.1454326155925966e-06, "loss": 0.8606, "step": 6301 }, { "epoch": 0.3004314351774605, "grad_norm": 2.745079278945923, "learning_rate": 2.1430021341630424e-06, "loss": 0.4594, "step": 6302 }, { "epoch": 0.30047910757276, "grad_norm": 1.7435271739959717, "learning_rate": 2.1405728649873458e-06, "loss": 0.7596, "step": 6303 }, { "epoch": 0.3005267799680595, "grad_norm": 1.6694341897964478, "learning_rate": 2.138144808440321e-06, "loss": 0.8125, "step": 6304 }, { "epoch": 0.300574452363359, "grad_norm": 1.1263213157653809, "learning_rate": 2.13571796489659e-06, "loss": 0.6427, "step": 6305 }, { "epoch": 0.3006221247586585, "grad_norm": 7.258553504943848, "learning_rate": 2.133292334730589e-06, "loss": 0.6564, "step": 6306 }, { "epoch": 0.300669797153958, "grad_norm": 1.4790229797363281, "learning_rate": 2.1308679183165693e-06, "loss": 0.841, "step": 6307 }, { "epoch": 0.3007174695492575, "grad_norm": 2.84000301361084, "learning_rate": 2.128444716028597e-06, "loss": 1.2394, "step": 6308 }, { "epoch": 0.300765141944557, "grad_norm": 1.6635762453079224, "learning_rate": 2.12602272824054e-06, "loss": 0.8609, "step": 6309 }, { "epoch": 0.3008128143398565, "grad_norm": 1.3851314783096313, "learning_rate": 2.123601955326091e-06, "loss": 0.6644, "step": 6310 }, { "epoch": 0.30086048673515603, "grad_norm": 1.7767314910888672, "learning_rate": 2.1211823976587508e-06, "loss": 0.8282, "step": 6311 }, { "epoch": 0.3009081591304555, "grad_norm": 1.740729808807373, "learning_rate": 2.118764055611828e-06, "loss": 0.829, "step": 6312 }, { "epoch": 0.300955831525755, "grad_norm": 1.134680986404419, "learning_rate": 2.1163469295584504e-06, "loss": 0.4486, "step": 6313 }, { "epoch": 0.3010035039210545, "grad_norm": 1.057878017425537, "learning_rate": 2.113931019871559e-06, "loss": 0.4592, "step": 6314 }, { "epoch": 0.30105117631635403, "grad_norm": 2.1879589557647705, "learning_rate": 2.1115163269238992e-06, "loss": 1.0933, "step": 6315 }, { "epoch": 0.30109884871165354, "grad_norm": 2.1401851177215576, "learning_rate": 2.109102851088033e-06, "loss": 0.4979, "step": 6316 }, { "epoch": 0.301146521106953, "grad_norm": 1.6667444705963135, "learning_rate": 2.106690592736338e-06, "loss": 0.8235, "step": 6317 }, { "epoch": 0.3011941935022525, "grad_norm": 0.9434000253677368, "learning_rate": 2.1042795522409977e-06, "loss": 0.0076, "step": 6318 }, { "epoch": 0.30124186589755203, "grad_norm": 1.4988133907318115, "learning_rate": 2.101869729974011e-06, "loss": 0.7448, "step": 6319 }, { "epoch": 0.30128953829285154, "grad_norm": 1.2174264192581177, "learning_rate": 2.099461126307194e-06, "loss": 0.5325, "step": 6320 }, { "epoch": 0.301337210688151, "grad_norm": 1.5915915966033936, "learning_rate": 2.0970537416121617e-06, "loss": 0.795, "step": 6321 }, { "epoch": 0.3013848830834505, "grad_norm": 2.000627040863037, "learning_rate": 2.0946475762603525e-06, "loss": 0.5215, "step": 6322 }, { "epoch": 0.30143255547875003, "grad_norm": 1.5578094720840454, "learning_rate": 2.092242630623016e-06, "loss": 0.8091, "step": 6323 }, { "epoch": 0.30148022787404954, "grad_norm": 1.8483134508132935, "learning_rate": 2.0898389050712044e-06, "loss": 0.6688, "step": 6324 }, { "epoch": 0.30152790026934906, "grad_norm": 1.9468728303909302, "learning_rate": 2.0874363999757906e-06, "loss": 0.7893, "step": 6325 }, { "epoch": 0.3015755726646485, "grad_norm": 1.6190681457519531, "learning_rate": 2.08503511570746e-06, "loss": 0.8619, "step": 6326 }, { "epoch": 0.30162324505994803, "grad_norm": 2.691166400909424, "learning_rate": 2.0826350526367e-06, "loss": 1.1333, "step": 6327 }, { "epoch": 0.30167091745524754, "grad_norm": 2.630610466003418, "learning_rate": 2.0802362111338183e-06, "loss": 1.0579, "step": 6328 }, { "epoch": 0.30171858985054706, "grad_norm": 1.3825013637542725, "learning_rate": 2.0778385915689336e-06, "loss": 0.6147, "step": 6329 }, { "epoch": 0.3017662622458465, "grad_norm": 1.8687647581100464, "learning_rate": 2.0754421943119695e-06, "loss": 0.7765, "step": 6330 }, { "epoch": 0.30181393464114603, "grad_norm": 4.715604782104492, "learning_rate": 2.0730470197326702e-06, "loss": 1.0323, "step": 6331 }, { "epoch": 0.30186160703644555, "grad_norm": 1.8607902526855469, "learning_rate": 2.0706530682005833e-06, "loss": 0.441, "step": 6332 }, { "epoch": 0.30190927943174506, "grad_norm": 2.3508448600769043, "learning_rate": 2.06826034008507e-06, "loss": 0.3873, "step": 6333 }, { "epoch": 0.3019569518270446, "grad_norm": 5.49057674407959, "learning_rate": 2.0658688357553036e-06, "loss": 0.4204, "step": 6334 }, { "epoch": 0.30200462422234403, "grad_norm": 1.716123342514038, "learning_rate": 2.063478555580274e-06, "loss": 0.6531, "step": 6335 }, { "epoch": 0.30205229661764355, "grad_norm": 1.1376465559005737, "learning_rate": 2.06108949992877e-06, "loss": 0.6853, "step": 6336 }, { "epoch": 0.30209996901294306, "grad_norm": 1.116325855255127, "learning_rate": 2.0587016691694006e-06, "loss": 0.8409, "step": 6337 }, { "epoch": 0.3021476414082426, "grad_norm": 1.5302807092666626, "learning_rate": 2.0563150636705873e-06, "loss": 0.6467, "step": 6338 }, { "epoch": 0.30219531380354203, "grad_norm": 1.5142567157745361, "learning_rate": 2.053929683800553e-06, "loss": 0.9118, "step": 6339 }, { "epoch": 0.30224298619884155, "grad_norm": 3.894611120223999, "learning_rate": 2.05154552992734e-06, "loss": 0.9286, "step": 6340 }, { "epoch": 0.30229065859414106, "grad_norm": 1.4084447622299194, "learning_rate": 2.0491626024188005e-06, "loss": 0.5777, "step": 6341 }, { "epoch": 0.3023383309894406, "grad_norm": 2.5469160079956055, "learning_rate": 2.046780901642591e-06, "loss": 0.9876, "step": 6342 }, { "epoch": 0.3023860033847401, "grad_norm": 1.5295445919036865, "learning_rate": 2.0444004279661866e-06, "loss": 0.7402, "step": 6343 }, { "epoch": 0.30243367578003955, "grad_norm": 1.7182564735412598, "learning_rate": 2.0420211817568724e-06, "loss": 0.7154, "step": 6344 }, { "epoch": 0.30248134817533906, "grad_norm": 6.909396648406982, "learning_rate": 2.0396431633817348e-06, "loss": 0.459, "step": 6345 }, { "epoch": 0.3025290205706386, "grad_norm": 1.3105229139328003, "learning_rate": 2.0372663732076847e-06, "loss": 0.9715, "step": 6346 }, { "epoch": 0.3025766929659381, "grad_norm": 3.02335786819458, "learning_rate": 2.03489081160143e-06, "loss": 0.295, "step": 6347 }, { "epoch": 0.3026243653612376, "grad_norm": 1.4509479999542236, "learning_rate": 2.0325164789295004e-06, "loss": 0.8722, "step": 6348 }, { "epoch": 0.30267203775653706, "grad_norm": 3.1525590419769287, "learning_rate": 2.0301433755582266e-06, "loss": 0.6557, "step": 6349 }, { "epoch": 0.3027197101518366, "grad_norm": 1.3639476299285889, "learning_rate": 2.027771501853757e-06, "loss": 0.7237, "step": 6350 }, { "epoch": 0.3027673825471361, "grad_norm": 1.5575635433197021, "learning_rate": 2.025400858182048e-06, "loss": 0.6297, "step": 6351 }, { "epoch": 0.3028150549424356, "grad_norm": 1.4706881046295166, "learning_rate": 2.0230314449088626e-06, "loss": 0.7896, "step": 6352 }, { "epoch": 0.30286272733773506, "grad_norm": 1.6276911497116089, "learning_rate": 2.020663262399778e-06, "loss": 0.9242, "step": 6353 }, { "epoch": 0.3029103997330346, "grad_norm": 1.9563448429107666, "learning_rate": 2.0182963110201823e-06, "loss": 0.7302, "step": 6354 }, { "epoch": 0.3029580721283341, "grad_norm": 1.1066612005233765, "learning_rate": 2.0159305911352688e-06, "loss": 0.5124, "step": 6355 }, { "epoch": 0.3030057445236336, "grad_norm": 1.9852733612060547, "learning_rate": 2.013566103110045e-06, "loss": 0.6534, "step": 6356 }, { "epoch": 0.3030534169189331, "grad_norm": 3.199556350708008, "learning_rate": 2.0112028473093294e-06, "loss": 1.087, "step": 6357 }, { "epoch": 0.3031010893142326, "grad_norm": 1.620105266571045, "learning_rate": 2.008840824097743e-06, "loss": 0.4853, "step": 6358 }, { "epoch": 0.3031487617095321, "grad_norm": 0.9042674899101257, "learning_rate": 2.006480033839728e-06, "loss": 0.4665, "step": 6359 }, { "epoch": 0.3031964341048316, "grad_norm": 3.8102097511291504, "learning_rate": 2.0041204768995225e-06, "loss": 1.7765, "step": 6360 }, { "epoch": 0.3032441065001311, "grad_norm": 1.1288888454437256, "learning_rate": 2.001762153641189e-06, "loss": 0.6047, "step": 6361 }, { "epoch": 0.3032917788954306, "grad_norm": 1.6786080598831177, "learning_rate": 1.999405064428587e-06, "loss": 0.6556, "step": 6362 }, { "epoch": 0.3033394512907301, "grad_norm": 1.3288825750350952, "learning_rate": 1.9970492096253955e-06, "loss": 0.5948, "step": 6363 }, { "epoch": 0.3033871236860296, "grad_norm": 1.5415656566619873, "learning_rate": 1.9946945895950943e-06, "loss": 0.8503, "step": 6364 }, { "epoch": 0.3034347960813291, "grad_norm": 1.058382511138916, "learning_rate": 1.9923412047009794e-06, "loss": 0.5966, "step": 6365 }, { "epoch": 0.30348246847662863, "grad_norm": 1.4036825895309448, "learning_rate": 1.9899890553061565e-06, "loss": 0.7594, "step": 6366 }, { "epoch": 0.3035301408719281, "grad_norm": 1.6590728759765625, "learning_rate": 1.9876381417735312e-06, "loss": 0.6407, "step": 6367 }, { "epoch": 0.3035778132672276, "grad_norm": 1.2488852739334106, "learning_rate": 1.98528846446583e-06, "loss": 0.5549, "step": 6368 }, { "epoch": 0.3036254856625271, "grad_norm": 1.3086576461791992, "learning_rate": 1.9829400237455865e-06, "loss": 0.6662, "step": 6369 }, { "epoch": 0.30367315805782663, "grad_norm": 1.4790866374969482, "learning_rate": 1.9805928199751336e-06, "loss": 0.6447, "step": 6370 }, { "epoch": 0.30372083045312614, "grad_norm": 4.150673866271973, "learning_rate": 1.9782468535166253e-06, "loss": 0.4286, "step": 6371 }, { "epoch": 0.3037685028484256, "grad_norm": 1.3232091665267944, "learning_rate": 1.975902124732022e-06, "loss": 0.7318, "step": 6372 }, { "epoch": 0.3038161752437251, "grad_norm": 1.5176620483398438, "learning_rate": 1.973558633983087e-06, "loss": 1.1292, "step": 6373 }, { "epoch": 0.30386384763902463, "grad_norm": 2.3007333278656006, "learning_rate": 1.971216381631397e-06, "loss": 0.6538, "step": 6374 }, { "epoch": 0.30391152003432415, "grad_norm": 1.425936222076416, "learning_rate": 1.968875368038342e-06, "loss": 0.8703, "step": 6375 }, { "epoch": 0.3039591924296236, "grad_norm": 2.337597370147705, "learning_rate": 1.9665355935651133e-06, "loss": 0.3978, "step": 6376 }, { "epoch": 0.3040068648249231, "grad_norm": 1.9509705305099487, "learning_rate": 1.964197058572711e-06, "loss": 0.7741, "step": 6377 }, { "epoch": 0.30405453722022263, "grad_norm": 1.8793994188308716, "learning_rate": 1.961859763421953e-06, "loss": 0.6466, "step": 6378 }, { "epoch": 0.30410220961552215, "grad_norm": 3.5728471279144287, "learning_rate": 1.959523708473453e-06, "loss": 1.5575, "step": 6379 }, { "epoch": 0.30414988201082166, "grad_norm": 2.1158430576324463, "learning_rate": 1.9571888940876436e-06, "loss": 0.5644, "step": 6380 }, { "epoch": 0.3041975544061211, "grad_norm": 1.3865089416503906, "learning_rate": 1.9548553206247667e-06, "loss": 0.7426, "step": 6381 }, { "epoch": 0.30424522680142063, "grad_norm": 1.958948016166687, "learning_rate": 1.9525229884448624e-06, "loss": 0.905, "step": 6382 }, { "epoch": 0.30429289919672015, "grad_norm": 1.395885705947876, "learning_rate": 1.9501918979077874e-06, "loss": 0.6477, "step": 6383 }, { "epoch": 0.30434057159201966, "grad_norm": 1.6745498180389404, "learning_rate": 1.947862049373206e-06, "loss": 0.7298, "step": 6384 }, { "epoch": 0.3043882439873191, "grad_norm": 1.6946300268173218, "learning_rate": 1.945533443200591e-06, "loss": 0.5748, "step": 6385 }, { "epoch": 0.30443591638261863, "grad_norm": 1.435052752494812, "learning_rate": 1.9432060797492193e-06, "loss": 0.7157, "step": 6386 }, { "epoch": 0.30448358877791815, "grad_norm": 1.2148957252502441, "learning_rate": 1.94087995937818e-06, "loss": 0.768, "step": 6387 }, { "epoch": 0.30453126117321766, "grad_norm": 1.5300604104995728, "learning_rate": 1.9385550824463727e-06, "loss": 0.7784, "step": 6388 }, { "epoch": 0.3045789335685172, "grad_norm": 1.5830923318862915, "learning_rate": 1.9362314493124965e-06, "loss": 0.9742, "step": 6389 }, { "epoch": 0.30462660596381663, "grad_norm": 1.7905744314193726, "learning_rate": 1.9339090603350698e-06, "loss": 0.677, "step": 6390 }, { "epoch": 0.30467427835911615, "grad_norm": 1.3788014650344849, "learning_rate": 1.9315879158724106e-06, "loss": 0.5513, "step": 6391 }, { "epoch": 0.30472195075441566, "grad_norm": 2.3503267765045166, "learning_rate": 1.929268016282645e-06, "loss": 0.6906, "step": 6392 }, { "epoch": 0.3047696231497152, "grad_norm": 1.0267317295074463, "learning_rate": 1.9269493619237114e-06, "loss": 0.5549, "step": 6393 }, { "epoch": 0.30481729554501463, "grad_norm": 2.3521690368652344, "learning_rate": 1.9246319531533574e-06, "loss": 0.8716, "step": 6394 }, { "epoch": 0.30486496794031415, "grad_norm": 5.018568515777588, "learning_rate": 1.9223157903291313e-06, "loss": 0.613, "step": 6395 }, { "epoch": 0.30491264033561366, "grad_norm": 1.5714318752288818, "learning_rate": 1.920000873808394e-06, "loss": 0.633, "step": 6396 }, { "epoch": 0.3049603127309132, "grad_norm": 3.5810441970825195, "learning_rate": 1.917687203948316e-06, "loss": 1.115, "step": 6397 }, { "epoch": 0.3050079851262127, "grad_norm": 2.0197510719299316, "learning_rate": 1.91537478110587e-06, "loss": 0.6015, "step": 6398 }, { "epoch": 0.30505565752151215, "grad_norm": 1.6447203159332275, "learning_rate": 1.913063605637838e-06, "loss": 0.8743, "step": 6399 }, { "epoch": 0.30510332991681166, "grad_norm": 1.374510645866394, "learning_rate": 1.9107536779008153e-06, "loss": 0.4009, "step": 6400 }, { "epoch": 0.3051510023121112, "grad_norm": 1.245357871055603, "learning_rate": 1.908444998251194e-06, "loss": 0.6025, "step": 6401 }, { "epoch": 0.3051986747074107, "grad_norm": 1.340909481048584, "learning_rate": 1.9061375670451831e-06, "loss": 0.6918, "step": 6402 }, { "epoch": 0.3052463471027102, "grad_norm": 1.7540247440338135, "learning_rate": 1.903831384638798e-06, "loss": 0.8732, "step": 6403 }, { "epoch": 0.30529401949800966, "grad_norm": 1.3919272422790527, "learning_rate": 1.9015264513878528e-06, "loss": 0.6712, "step": 6404 }, { "epoch": 0.3053416918933092, "grad_norm": 2.0504024028778076, "learning_rate": 1.8992227676479803e-06, "loss": 0.7113, "step": 6405 }, { "epoch": 0.3053893642886087, "grad_norm": 2.407709836959839, "learning_rate": 1.8969203337746101e-06, "loss": 0.7342, "step": 6406 }, { "epoch": 0.3054370366839082, "grad_norm": 1.4397438764572144, "learning_rate": 1.8946191501229905e-06, "loss": 0.9156, "step": 6407 }, { "epoch": 0.30548470907920766, "grad_norm": 1.8526594638824463, "learning_rate": 1.892319217048163e-06, "loss": 0.1576, "step": 6408 }, { "epoch": 0.3055323814745072, "grad_norm": 1.4310426712036133, "learning_rate": 1.8900205349049904e-06, "loss": 0.8332, "step": 6409 }, { "epoch": 0.3055800538698067, "grad_norm": 1.1129051446914673, "learning_rate": 1.8877231040481302e-06, "loss": 0.498, "step": 6410 }, { "epoch": 0.3056277262651062, "grad_norm": 1.3999230861663818, "learning_rate": 1.8854269248320545e-06, "loss": 0.7744, "step": 6411 }, { "epoch": 0.3056753986604057, "grad_norm": 1.8585573434829712, "learning_rate": 1.883131997611043e-06, "loss": 0.6155, "step": 6412 }, { "epoch": 0.3057230710557052, "grad_norm": 1.9685242176055908, "learning_rate": 1.8808383227391747e-06, "loss": 0.8717, "step": 6413 }, { "epoch": 0.3057707434510047, "grad_norm": 1.6285715103149414, "learning_rate": 1.8785459005703411e-06, "loss": 0.6105, "step": 6414 }, { "epoch": 0.3058184158463042, "grad_norm": 1.0559170246124268, "learning_rate": 1.8762547314582435e-06, "loss": 0.593, "step": 6415 }, { "epoch": 0.3058660882416037, "grad_norm": 1.283506989479065, "learning_rate": 1.8739648157563794e-06, "loss": 0.4569, "step": 6416 }, { "epoch": 0.3059137606369032, "grad_norm": 1.752828598022461, "learning_rate": 1.8716761538180627e-06, "loss": 0.271, "step": 6417 }, { "epoch": 0.3059614330322027, "grad_norm": 2.224846839904785, "learning_rate": 1.8693887459964123e-06, "loss": 0.1549, "step": 6418 }, { "epoch": 0.3060091054275022, "grad_norm": 1.3899378776550293, "learning_rate": 1.8671025926443464e-06, "loss": 0.5116, "step": 6419 }, { "epoch": 0.3060567778228017, "grad_norm": 0.9118931293487549, "learning_rate": 1.8648176941146012e-06, "loss": 0.4819, "step": 6420 }, { "epoch": 0.30610445021810123, "grad_norm": 1.6946864128112793, "learning_rate": 1.8625340507597056e-06, "loss": 0.8566, "step": 6421 }, { "epoch": 0.3061521226134007, "grad_norm": 1.7777106761932373, "learning_rate": 1.86025166293201e-06, "loss": 0.8144, "step": 6422 }, { "epoch": 0.3061997950087002, "grad_norm": 3.625394344329834, "learning_rate": 1.8579705309836571e-06, "loss": 0.8559, "step": 6423 }, { "epoch": 0.3062474674039997, "grad_norm": 1.3907591104507446, "learning_rate": 1.8556906552666042e-06, "loss": 0.6744, "step": 6424 }, { "epoch": 0.30629513979929923, "grad_norm": 1.1208232641220093, "learning_rate": 1.8534120361326159e-06, "loss": 0.6212, "step": 6425 }, { "epoch": 0.3063428121945987, "grad_norm": 1.2047077417373657, "learning_rate": 1.8511346739332535e-06, "loss": 0.6538, "step": 6426 }, { "epoch": 0.3063904845898982, "grad_norm": 1.06620192527771, "learning_rate": 1.8488585690198946e-06, "loss": 0.5665, "step": 6427 }, { "epoch": 0.3064381569851977, "grad_norm": 0.919082522392273, "learning_rate": 1.8465837217437199e-06, "loss": 0.3602, "step": 6428 }, { "epoch": 0.30648582938049723, "grad_norm": 1.3466386795043945, "learning_rate": 1.8443101324557111e-06, "loss": 0.624, "step": 6429 }, { "epoch": 0.30653350177579675, "grad_norm": 3.57694935798645, "learning_rate": 1.842037801506661e-06, "loss": 1.2356, "step": 6430 }, { "epoch": 0.3065811741710962, "grad_norm": 2.216155767440796, "learning_rate": 1.839766729247171e-06, "loss": 0.9596, "step": 6431 }, { "epoch": 0.3066288465663957, "grad_norm": 0.8382368683815002, "learning_rate": 1.8374969160276368e-06, "loss": 0.3866, "step": 6432 }, { "epoch": 0.30667651896169523, "grad_norm": 1.6770670413970947, "learning_rate": 1.8352283621982713e-06, "loss": 1.1551, "step": 6433 }, { "epoch": 0.30672419135699475, "grad_norm": 1.2856043577194214, "learning_rate": 1.8329610681090914e-06, "loss": 0.279, "step": 6434 }, { "epoch": 0.30677186375229426, "grad_norm": 1.3624242544174194, "learning_rate": 1.8306950341099138e-06, "loss": 0.6864, "step": 6435 }, { "epoch": 0.3068195361475937, "grad_norm": 1.1764171123504639, "learning_rate": 1.8284302605503624e-06, "loss": 0.536, "step": 6436 }, { "epoch": 0.30686720854289323, "grad_norm": 1.6602095365524292, "learning_rate": 1.826166747779874e-06, "loss": 0.8386, "step": 6437 }, { "epoch": 0.30691488093819275, "grad_norm": 1.0563730001449585, "learning_rate": 1.8239044961476794e-06, "loss": 0.6951, "step": 6438 }, { "epoch": 0.30696255333349226, "grad_norm": 1.2942439317703247, "learning_rate": 1.8216435060028237e-06, "loss": 0.447, "step": 6439 }, { "epoch": 0.3070102257287917, "grad_norm": 1.9161652326583862, "learning_rate": 1.819383777694157e-06, "loss": 0.7364, "step": 6440 }, { "epoch": 0.30705789812409123, "grad_norm": 2.8646817207336426, "learning_rate": 1.817125311570327e-06, "loss": 0.7922, "step": 6441 }, { "epoch": 0.30710557051939075, "grad_norm": 1.85245680809021, "learning_rate": 1.8148681079797925e-06, "loss": 0.9882, "step": 6442 }, { "epoch": 0.30715324291469026, "grad_norm": 1.95215904712677, "learning_rate": 1.812612167270823e-06, "loss": 0.8003, "step": 6443 }, { "epoch": 0.3072009153099898, "grad_norm": 1.0741435289382935, "learning_rate": 1.810357489791479e-06, "loss": 0.4089, "step": 6444 }, { "epoch": 0.30724858770528923, "grad_norm": 1.8579949140548706, "learning_rate": 1.8081040758896361e-06, "loss": 0.4324, "step": 6445 }, { "epoch": 0.30729626010058875, "grad_norm": 2.041034698486328, "learning_rate": 1.805851925912978e-06, "loss": 1.3024, "step": 6446 }, { "epoch": 0.30734393249588826, "grad_norm": 1.525901198387146, "learning_rate": 1.803601040208981e-06, "loss": 0.7191, "step": 6447 }, { "epoch": 0.3073916048911878, "grad_norm": 1.84542715549469, "learning_rate": 1.801351419124938e-06, "loss": 0.6522, "step": 6448 }, { "epoch": 0.30743927728648723, "grad_norm": 1.6180534362792969, "learning_rate": 1.7991030630079431e-06, "loss": 0.649, "step": 6449 }, { "epoch": 0.30748694968178675, "grad_norm": 2.1256611347198486, "learning_rate": 1.7968559722048906e-06, "loss": 0.9058, "step": 6450 }, { "epoch": 0.30753462207708626, "grad_norm": 3.1187829971313477, "learning_rate": 1.7946101470624877e-06, "loss": 0.3534, "step": 6451 }, { "epoch": 0.3075822944723858, "grad_norm": 4.317934513092041, "learning_rate": 1.7923655879272395e-06, "loss": 0.1973, "step": 6452 }, { "epoch": 0.3076299668676853, "grad_norm": 1.0948309898376465, "learning_rate": 1.7901222951454566e-06, "loss": 0.4622, "step": 6453 }, { "epoch": 0.30767763926298475, "grad_norm": 1.3716005086898804, "learning_rate": 1.7878802690632579e-06, "loss": 0.7425, "step": 6454 }, { "epoch": 0.30772531165828426, "grad_norm": 4.103541374206543, "learning_rate": 1.785639510026569e-06, "loss": 0.7147, "step": 6455 }, { "epoch": 0.3077729840535838, "grad_norm": 2.311466693878174, "learning_rate": 1.7834000183811085e-06, "loss": 1.0938, "step": 6456 }, { "epoch": 0.3078206564488833, "grad_norm": 2.1428282260894775, "learning_rate": 1.7811617944724103e-06, "loss": 0.7077, "step": 6457 }, { "epoch": 0.3078683288441828, "grad_norm": 1.4838676452636719, "learning_rate": 1.7789248386458102e-06, "loss": 0.4631, "step": 6458 }, { "epoch": 0.30791600123948226, "grad_norm": 4.157537460327148, "learning_rate": 1.7766891512464491e-06, "loss": 0.7323, "step": 6459 }, { "epoch": 0.3079636736347818, "grad_norm": 2.3386669158935547, "learning_rate": 1.7744547326192662e-06, "loss": 0.2231, "step": 6460 }, { "epoch": 0.3080113460300813, "grad_norm": 1.9055111408233643, "learning_rate": 1.7722215831090106e-06, "loss": 0.7469, "step": 6461 }, { "epoch": 0.3080590184253808, "grad_norm": 0.92010098695755, "learning_rate": 1.7699897030602376e-06, "loss": 0.3568, "step": 6462 }, { "epoch": 0.30810669082068026, "grad_norm": 1.552308440208435, "learning_rate": 1.7677590928172994e-06, "loss": 0.5546, "step": 6463 }, { "epoch": 0.3081543632159798, "grad_norm": 1.7717546224594116, "learning_rate": 1.7655297527243587e-06, "loss": 0.9173, "step": 6464 }, { "epoch": 0.3082020356112793, "grad_norm": 1.4038217067718506, "learning_rate": 1.7633016831253757e-06, "loss": 0.6635, "step": 6465 }, { "epoch": 0.3082497080065788, "grad_norm": 2.149308919906616, "learning_rate": 1.7610748843641245e-06, "loss": 0.6601, "step": 6466 }, { "epoch": 0.3082973804018783, "grad_norm": 1.7162526845932007, "learning_rate": 1.7588493567841724e-06, "loss": 0.9118, "step": 6467 }, { "epoch": 0.3083450527971778, "grad_norm": 1.1733648777008057, "learning_rate": 1.7566251007288992e-06, "loss": 0.5365, "step": 6468 }, { "epoch": 0.3083927251924773, "grad_norm": 1.8050142526626587, "learning_rate": 1.7544021165414793e-06, "loss": 0.7149, "step": 6469 }, { "epoch": 0.3084403975877768, "grad_norm": 0.983949601650238, "learning_rate": 1.7521804045649005e-06, "loss": 0.5518, "step": 6470 }, { "epoch": 0.3084880699830763, "grad_norm": 1.735499620437622, "learning_rate": 1.7499599651419508e-06, "loss": 0.8126, "step": 6471 }, { "epoch": 0.3085357423783758, "grad_norm": 1.454580307006836, "learning_rate": 1.7477407986152174e-06, "loss": 0.8105, "step": 6472 }, { "epoch": 0.3085834147736753, "grad_norm": 1.1425175666809082, "learning_rate": 1.7455229053270973e-06, "loss": 0.3065, "step": 6473 }, { "epoch": 0.3086310871689748, "grad_norm": 0.9951463937759399, "learning_rate": 1.7433062856197902e-06, "loss": 0.4717, "step": 6474 }, { "epoch": 0.3086787595642743, "grad_norm": 2.938292980194092, "learning_rate": 1.7410909398352937e-06, "loss": 0.4628, "step": 6475 }, { "epoch": 0.30872643195957383, "grad_norm": 1.60874605178833, "learning_rate": 1.7388768683154145e-06, "loss": 0.7068, "step": 6476 }, { "epoch": 0.3087741043548733, "grad_norm": 4.252487659454346, "learning_rate": 1.7366640714017647e-06, "loss": 1.1043, "step": 6477 }, { "epoch": 0.3088217767501728, "grad_norm": 1.7481223344802856, "learning_rate": 1.734452549435749e-06, "loss": 0.8181, "step": 6478 }, { "epoch": 0.3088694491454723, "grad_norm": 1.2348800897598267, "learning_rate": 1.73224230275859e-06, "loss": 0.8256, "step": 6479 }, { "epoch": 0.30891712154077183, "grad_norm": 2.4656639099121094, "learning_rate": 1.7300333317112983e-06, "loss": 0.8434, "step": 6480 }, { "epoch": 0.3089647939360713, "grad_norm": 1.6827641725540161, "learning_rate": 1.7278256366347034e-06, "loss": 1.0099, "step": 6481 }, { "epoch": 0.3090124663313708, "grad_norm": 1.728666067123413, "learning_rate": 1.725619217869422e-06, "loss": 0.8378, "step": 6482 }, { "epoch": 0.3090601387266703, "grad_norm": 1.216159701347351, "learning_rate": 1.7234140757558892e-06, "loss": 0.5811, "step": 6483 }, { "epoch": 0.30910781112196983, "grad_norm": 1.6066782474517822, "learning_rate": 1.7212102106343287e-06, "loss": 0.5712, "step": 6484 }, { "epoch": 0.30915548351726935, "grad_norm": 2.0662741661071777, "learning_rate": 1.7190076228447782e-06, "loss": 0.3293, "step": 6485 }, { "epoch": 0.3092031559125688, "grad_norm": 1.6938399076461792, "learning_rate": 1.7168063127270762e-06, "loss": 1.0509, "step": 6486 }, { "epoch": 0.3092508283078683, "grad_norm": 3.1264359951019287, "learning_rate": 1.7146062806208573e-06, "loss": 0.6917, "step": 6487 }, { "epoch": 0.30929850070316783, "grad_norm": 1.0653722286224365, "learning_rate": 1.7124075268655672e-06, "loss": 0.7217, "step": 6488 }, { "epoch": 0.30934617309846735, "grad_norm": 1.3021435737609863, "learning_rate": 1.7102100518004517e-06, "loss": 0.6076, "step": 6489 }, { "epoch": 0.30939384549376686, "grad_norm": 2.4369189739227295, "learning_rate": 1.7080138557645543e-06, "loss": 0.3072, "step": 6490 }, { "epoch": 0.3094415178890663, "grad_norm": 5.125096321105957, "learning_rate": 1.7058189390967272e-06, "loss": 1.3237, "step": 6491 }, { "epoch": 0.30948919028436583, "grad_norm": 1.2146793603897095, "learning_rate": 1.7036253021356275e-06, "loss": 0.9807, "step": 6492 }, { "epoch": 0.30953686267966535, "grad_norm": 1.4144781827926636, "learning_rate": 1.7014329452197054e-06, "loss": 0.5788, "step": 6493 }, { "epoch": 0.30958453507496486, "grad_norm": 1.1346683502197266, "learning_rate": 1.6992418686872203e-06, "loss": 0.6866, "step": 6494 }, { "epoch": 0.3096322074702643, "grad_norm": 1.5384643077850342, "learning_rate": 1.6970520728762374e-06, "loss": 0.5163, "step": 6495 }, { "epoch": 0.30967987986556383, "grad_norm": 1.5738346576690674, "learning_rate": 1.6948635581246142e-06, "loss": 0.881, "step": 6496 }, { "epoch": 0.30972755226086335, "grad_norm": 0.8995949029922485, "learning_rate": 1.6926763247700163e-06, "loss": 0.3336, "step": 6497 }, { "epoch": 0.30977522465616286, "grad_norm": 1.5127801895141602, "learning_rate": 1.6904903731499122e-06, "loss": 0.608, "step": 6498 }, { "epoch": 0.3098228970514624, "grad_norm": 1.5991016626358032, "learning_rate": 1.688305703601575e-06, "loss": 0.554, "step": 6499 }, { "epoch": 0.30987056944676183, "grad_norm": 2.663707971572876, "learning_rate": 1.686122316462071e-06, "loss": 0.7762, "step": 6500 }, { "epoch": 0.30991824184206135, "grad_norm": 1.9118679761886597, "learning_rate": 1.6839402120682768e-06, "loss": 0.981, "step": 6501 }, { "epoch": 0.30996591423736086, "grad_norm": 1.5477867126464844, "learning_rate": 1.681759390756873e-06, "loss": 0.7308, "step": 6502 }, { "epoch": 0.3100135866326604, "grad_norm": 1.5888001918792725, "learning_rate": 1.6795798528643304e-06, "loss": 0.6634, "step": 6503 }, { "epoch": 0.31006125902795983, "grad_norm": 1.959020733833313, "learning_rate": 1.677401598726932e-06, "loss": 0.6814, "step": 6504 }, { "epoch": 0.31010893142325935, "grad_norm": 1.2314298152923584, "learning_rate": 1.6752246286807638e-06, "loss": 0.6576, "step": 6505 }, { "epoch": 0.31015660381855886, "grad_norm": 2.1980175971984863, "learning_rate": 1.6730489430617048e-06, "loss": 0.5543, "step": 6506 }, { "epoch": 0.3102042762138584, "grad_norm": 3.7818870544433594, "learning_rate": 1.670874542205443e-06, "loss": 0.726, "step": 6507 }, { "epoch": 0.3102519486091579, "grad_norm": 2.2076447010040283, "learning_rate": 1.6687014264474677e-06, "loss": 0.8583, "step": 6508 }, { "epoch": 0.31029962100445735, "grad_norm": 5.225788116455078, "learning_rate": 1.6665295961230644e-06, "loss": 1.1672, "step": 6509 }, { "epoch": 0.31034729339975686, "grad_norm": 1.57306969165802, "learning_rate": 1.664359051567328e-06, "loss": 0.6766, "step": 6510 }, { "epoch": 0.3103949657950564, "grad_norm": 1.4917519092559814, "learning_rate": 1.6621897931151498e-06, "loss": 0.8296, "step": 6511 }, { "epoch": 0.3104426381903559, "grad_norm": 2.040132761001587, "learning_rate": 1.660021821101222e-06, "loss": 0.582, "step": 6512 }, { "epoch": 0.31049031058565535, "grad_norm": 1.456680178642273, "learning_rate": 1.6578551358600415e-06, "loss": 0.8104, "step": 6513 }, { "epoch": 0.31053798298095486, "grad_norm": 2.2867209911346436, "learning_rate": 1.6556897377259085e-06, "loss": 0.9111, "step": 6514 }, { "epoch": 0.3105856553762544, "grad_norm": 2.4910178184509277, "learning_rate": 1.653525627032917e-06, "loss": 1.0848, "step": 6515 }, { "epoch": 0.3106333277715539, "grad_norm": 3.2589223384857178, "learning_rate": 1.6513628041149688e-06, "loss": 0.7544, "step": 6516 }, { "epoch": 0.3106810001668534, "grad_norm": 1.4992516040802002, "learning_rate": 1.649201269305768e-06, "loss": 0.9809, "step": 6517 }, { "epoch": 0.31072867256215286, "grad_norm": 1.5746660232543945, "learning_rate": 1.6470410229388134e-06, "loss": 0.8504, "step": 6518 }, { "epoch": 0.3107763449574524, "grad_norm": 1.524914264678955, "learning_rate": 1.6448820653474084e-06, "loss": 0.5504, "step": 6519 }, { "epoch": 0.3108240173527519, "grad_norm": 2.3994109630584717, "learning_rate": 1.6427243968646632e-06, "loss": 0.5918, "step": 6520 }, { "epoch": 0.3108716897480514, "grad_norm": 1.3910152912139893, "learning_rate": 1.6405680178234784e-06, "loss": 0.655, "step": 6521 }, { "epoch": 0.3109193621433509, "grad_norm": 1.3168928623199463, "learning_rate": 1.638412928556562e-06, "loss": 0.7157, "step": 6522 }, { "epoch": 0.3109670345386504, "grad_norm": 1.7572548389434814, "learning_rate": 1.6362591293964247e-06, "loss": 0.3562, "step": 6523 }, { "epoch": 0.3110147069339499, "grad_norm": 1.877868413925171, "learning_rate": 1.634106620675373e-06, "loss": 0.7416, "step": 6524 }, { "epoch": 0.3110623793292494, "grad_norm": 12.239848136901855, "learning_rate": 1.631955402725519e-06, "loss": 0.0823, "step": 6525 }, { "epoch": 0.3111100517245489, "grad_norm": 3.99135684967041, "learning_rate": 1.6298054758787707e-06, "loss": 1.5696, "step": 6526 }, { "epoch": 0.3111577241198484, "grad_norm": 2.9434146881103516, "learning_rate": 1.6276568404668425e-06, "loss": 1.2763, "step": 6527 }, { "epoch": 0.3112053965151479, "grad_norm": 0.8180282115936279, "learning_rate": 1.6255094968212436e-06, "loss": 0.604, "step": 6528 }, { "epoch": 0.3112530689104474, "grad_norm": 1.1855956315994263, "learning_rate": 1.6233634452732916e-06, "loss": 0.6412, "step": 6529 }, { "epoch": 0.3113007413057469, "grad_norm": 2.7904341220855713, "learning_rate": 1.6212186861540946e-06, "loss": 0.6747, "step": 6530 }, { "epoch": 0.31134841370104643, "grad_norm": 1.178819179534912, "learning_rate": 1.619075219794569e-06, "loss": 0.6261, "step": 6531 }, { "epoch": 0.3113960860963459, "grad_norm": 2.6483869552612305, "learning_rate": 1.616933046525433e-06, "loss": 1.2557, "step": 6532 }, { "epoch": 0.3114437584916454, "grad_norm": 1.7681313753128052, "learning_rate": 1.614792166677197e-06, "loss": 0.9145, "step": 6533 }, { "epoch": 0.3114914308869449, "grad_norm": 1.420824408531189, "learning_rate": 1.6126525805801786e-06, "loss": 0.8921, "step": 6534 }, { "epoch": 0.31153910328224443, "grad_norm": 2.487520456314087, "learning_rate": 1.610514288564493e-06, "loss": 0.8742, "step": 6535 }, { "epoch": 0.3115867756775439, "grad_norm": 1.4098377227783203, "learning_rate": 1.6083772909600614e-06, "loss": 1.0423, "step": 6536 }, { "epoch": 0.3116344480728434, "grad_norm": 1.9439759254455566, "learning_rate": 1.6062415880965932e-06, "loss": 0.7453, "step": 6537 }, { "epoch": 0.3116821204681429, "grad_norm": 1.2742236852645874, "learning_rate": 1.60410718030361e-06, "loss": 1.0144, "step": 6538 }, { "epoch": 0.31172979286344243, "grad_norm": 1.2759597301483154, "learning_rate": 1.6019740679104301e-06, "loss": 0.8142, "step": 6539 }, { "epoch": 0.31177746525874195, "grad_norm": 3.0621325969696045, "learning_rate": 1.5998422512461687e-06, "loss": 0.7816, "step": 6540 }, { "epoch": 0.3118251376540414, "grad_norm": 1.4142330884933472, "learning_rate": 1.5977117306397394e-06, "loss": 0.7853, "step": 6541 }, { "epoch": 0.3118728100493409, "grad_norm": 1.821967363357544, "learning_rate": 1.5955825064198671e-06, "loss": 0.8065, "step": 6542 }, { "epoch": 0.31192048244464043, "grad_norm": 2.012448310852051, "learning_rate": 1.5934545789150625e-06, "loss": 0.8001, "step": 6543 }, { "epoch": 0.31196815483993995, "grad_norm": 2.912635326385498, "learning_rate": 1.591327948453646e-06, "loss": 1.2998, "step": 6544 }, { "epoch": 0.3120158272352394, "grad_norm": 2.260390043258667, "learning_rate": 1.5892026153637363e-06, "loss": 1.197, "step": 6545 }, { "epoch": 0.3120634996305389, "grad_norm": 1.7087609767913818, "learning_rate": 1.5870785799732459e-06, "loss": 0.7234, "step": 6546 }, { "epoch": 0.31211117202583843, "grad_norm": 1.6463042497634888, "learning_rate": 1.5849558426098955e-06, "loss": 0.5573, "step": 6547 }, { "epoch": 0.31215884442113795, "grad_norm": 1.0706912279129028, "learning_rate": 1.5828344036012012e-06, "loss": 0.7365, "step": 6548 }, { "epoch": 0.31220651681643746, "grad_norm": 1.188310146331787, "learning_rate": 1.5807142632744776e-06, "loss": 0.7697, "step": 6549 }, { "epoch": 0.3122541892117369, "grad_norm": 2.6883914470672607, "learning_rate": 1.57859542195684e-06, "loss": 0.6252, "step": 6550 }, { "epoch": 0.31230186160703644, "grad_norm": 3.5239317417144775, "learning_rate": 1.5764778799752079e-06, "loss": 0.6836, "step": 6551 }, { "epoch": 0.31234953400233595, "grad_norm": 1.307760238647461, "learning_rate": 1.5743616376562921e-06, "loss": 0.8936, "step": 6552 }, { "epoch": 0.31239720639763546, "grad_norm": 1.4934606552124023, "learning_rate": 1.5722466953266068e-06, "loss": 0.7663, "step": 6553 }, { "epoch": 0.312444878792935, "grad_norm": 1.3050966262817383, "learning_rate": 1.5701330533124704e-06, "loss": 0.8025, "step": 6554 }, { "epoch": 0.31249255118823444, "grad_norm": 2.0975804328918457, "learning_rate": 1.5680207119399926e-06, "loss": 0.7393, "step": 6555 }, { "epoch": 0.31254022358353395, "grad_norm": 2.0402796268463135, "learning_rate": 1.5659096715350842e-06, "loss": 0.7473, "step": 6556 }, { "epoch": 0.31258789597883346, "grad_norm": 2.0081799030303955, "learning_rate": 1.563799932423462e-06, "loss": 0.745, "step": 6557 }, { "epoch": 0.312635568374133, "grad_norm": 2.052009105682373, "learning_rate": 1.5616914949306316e-06, "loss": 0.397, "step": 6558 }, { "epoch": 0.31268324076943244, "grad_norm": 1.552558422088623, "learning_rate": 1.559584359381906e-06, "loss": 0.6583, "step": 6559 }, { "epoch": 0.31273091316473195, "grad_norm": 1.081641435623169, "learning_rate": 1.557478526102396e-06, "loss": 0.6207, "step": 6560 }, { "epoch": 0.31277858556003146, "grad_norm": 1.64644455909729, "learning_rate": 1.5553739954170055e-06, "loss": 0.4479, "step": 6561 }, { "epoch": 0.312826257955331, "grad_norm": 1.3025169372558594, "learning_rate": 1.5532707676504455e-06, "loss": 0.6613, "step": 6562 }, { "epoch": 0.3128739303506305, "grad_norm": 1.5401815176010132, "learning_rate": 1.5511688431272242e-06, "loss": 0.9271, "step": 6563 }, { "epoch": 0.31292160274592995, "grad_norm": 2.020066738128662, "learning_rate": 1.5490682221716413e-06, "loss": 0.659, "step": 6564 }, { "epoch": 0.31296927514122946, "grad_norm": 1.3544154167175293, "learning_rate": 1.5469689051078041e-06, "loss": 0.6343, "step": 6565 }, { "epoch": 0.313016947536529, "grad_norm": 1.442132592201233, "learning_rate": 1.5448708922596178e-06, "loss": 0.961, "step": 6566 }, { "epoch": 0.3130646199318285, "grad_norm": 4.535737991333008, "learning_rate": 1.5427741839507804e-06, "loss": 1.6344, "step": 6567 }, { "epoch": 0.31311229232712795, "grad_norm": 2.2868452072143555, "learning_rate": 1.540678780504793e-06, "loss": 0.8728, "step": 6568 }, { "epoch": 0.31315996472242746, "grad_norm": 2.1181046962738037, "learning_rate": 1.538584682244958e-06, "loss": 0.6222, "step": 6569 }, { "epoch": 0.313207637117727, "grad_norm": 1.742482304573059, "learning_rate": 1.5364918894943682e-06, "loss": 0.7478, "step": 6570 }, { "epoch": 0.3132553095130265, "grad_norm": 1.5931150913238525, "learning_rate": 1.534400402575925e-06, "loss": 0.6558, "step": 6571 }, { "epoch": 0.313302981908326, "grad_norm": 1.9566168785095215, "learning_rate": 1.5323102218123186e-06, "loss": 0.4564, "step": 6572 }, { "epoch": 0.31335065430362546, "grad_norm": 2.7765796184539795, "learning_rate": 1.5302213475260475e-06, "loss": 0.5948, "step": 6573 }, { "epoch": 0.313398326698925, "grad_norm": 1.4216275215148926, "learning_rate": 1.528133780039397e-06, "loss": 0.641, "step": 6574 }, { "epoch": 0.3134459990942245, "grad_norm": 1.6456646919250488, "learning_rate": 1.5260475196744618e-06, "loss": 0.955, "step": 6575 }, { "epoch": 0.313493671489524, "grad_norm": 1.4795328378677368, "learning_rate": 1.5239625667531322e-06, "loss": 0.6831, "step": 6576 }, { "epoch": 0.3135413438848235, "grad_norm": 1.8460081815719604, "learning_rate": 1.5218789215970897e-06, "loss": 0.8814, "step": 6577 }, { "epoch": 0.313589016280123, "grad_norm": 1.3663274049758911, "learning_rate": 1.5197965845278217e-06, "loss": 0.5449, "step": 6578 }, { "epoch": 0.3136366886754225, "grad_norm": 1.4252835512161255, "learning_rate": 1.5177155558666135e-06, "loss": 0.2409, "step": 6579 }, { "epoch": 0.313684361070722, "grad_norm": 3.0009138584136963, "learning_rate": 1.5156358359345425e-06, "loss": 0.5513, "step": 6580 }, { "epoch": 0.3137320334660215, "grad_norm": 2.6511809825897217, "learning_rate": 1.5135574250524898e-06, "loss": 0.8639, "step": 6581 }, { "epoch": 0.313779705861321, "grad_norm": 0.9627076983451843, "learning_rate": 1.5114803235411346e-06, "loss": 0.4883, "step": 6582 }, { "epoch": 0.3138273782566205, "grad_norm": 1.0760561227798462, "learning_rate": 1.5094045317209493e-06, "loss": 0.4245, "step": 6583 }, { "epoch": 0.31387505065192, "grad_norm": 0.994811475276947, "learning_rate": 1.5073300499122113e-06, "loss": 0.4963, "step": 6584 }, { "epoch": 0.3139227230472195, "grad_norm": 1.3928691148757935, "learning_rate": 1.5052568784349852e-06, "loss": 0.6735, "step": 6585 }, { "epoch": 0.31397039544251903, "grad_norm": 1.8815174102783203, "learning_rate": 1.5031850176091467e-06, "loss": 0.757, "step": 6586 }, { "epoch": 0.3140180678378185, "grad_norm": 1.2746731042861938, "learning_rate": 1.5011144677543576e-06, "loss": 0.7011, "step": 6587 }, { "epoch": 0.314065740233118, "grad_norm": 1.5576210021972656, "learning_rate": 1.499045229190087e-06, "loss": 0.5558, "step": 6588 }, { "epoch": 0.3141134126284175, "grad_norm": 1.9429413080215454, "learning_rate": 1.4969773022355927e-06, "loss": 1.0802, "step": 6589 }, { "epoch": 0.31416108502371703, "grad_norm": 1.713625192642212, "learning_rate": 1.494910687209935e-06, "loss": 0.6591, "step": 6590 }, { "epoch": 0.3142087574190165, "grad_norm": 1.3166879415512085, "learning_rate": 1.4928453844319769e-06, "loss": 0.6429, "step": 6591 }, { "epoch": 0.314256429814316, "grad_norm": 2.6583194732666016, "learning_rate": 1.4907813942203652e-06, "loss": 0.8937, "step": 6592 }, { "epoch": 0.3143041022096155, "grad_norm": 2.1903364658355713, "learning_rate": 1.4887187168935579e-06, "loss": 0.4286, "step": 6593 }, { "epoch": 0.31435177460491504, "grad_norm": 3.6487207412719727, "learning_rate": 1.4866573527698047e-06, "loss": 0.5949, "step": 6594 }, { "epoch": 0.31439944700021455, "grad_norm": 1.8561707735061646, "learning_rate": 1.48459730216715e-06, "loss": 0.5996, "step": 6595 }, { "epoch": 0.314447119395514, "grad_norm": 1.1115163564682007, "learning_rate": 1.4825385654034386e-06, "loss": 0.7894, "step": 6596 }, { "epoch": 0.3144947917908135, "grad_norm": 4.910205364227295, "learning_rate": 1.4804811427963173e-06, "loss": 0.7863, "step": 6597 }, { "epoch": 0.31454246418611304, "grad_norm": 2.453122854232788, "learning_rate": 1.478425034663219e-06, "loss": 0.9238, "step": 6598 }, { "epoch": 0.31459013658141255, "grad_norm": 1.2896822690963745, "learning_rate": 1.4763702413213843e-06, "loss": 0.6852, "step": 6599 }, { "epoch": 0.314637808976712, "grad_norm": 1.6508816480636597, "learning_rate": 1.474316763087843e-06, "loss": 0.6804, "step": 6600 }, { "epoch": 0.3146854813720115, "grad_norm": 1.734481930732727, "learning_rate": 1.4722646002794294e-06, "loss": 0.6244, "step": 6601 }, { "epoch": 0.31473315376731104, "grad_norm": 0.9304144978523254, "learning_rate": 1.470213753212768e-06, "loss": 0.1639, "step": 6602 }, { "epoch": 0.31478082616261055, "grad_norm": 2.0326039791107178, "learning_rate": 1.468164222204287e-06, "loss": 0.6725, "step": 6603 }, { "epoch": 0.31482849855791006, "grad_norm": 1.7764116525650024, "learning_rate": 1.4661160075702018e-06, "loss": 0.3881, "step": 6604 }, { "epoch": 0.3148761709532095, "grad_norm": 1.1036642789840698, "learning_rate": 1.4640691096265358e-06, "loss": 0.6321, "step": 6605 }, { "epoch": 0.31492384334850904, "grad_norm": 3.8955821990966797, "learning_rate": 1.4620235286891049e-06, "loss": 1.0674, "step": 6606 }, { "epoch": 0.31497151574380855, "grad_norm": 1.4753177165985107, "learning_rate": 1.4599792650735179e-06, "loss": 0.5748, "step": 6607 }, { "epoch": 0.31501918813910806, "grad_norm": 1.3320003747940063, "learning_rate": 1.4579363190951845e-06, "loss": 0.7962, "step": 6608 }, { "epoch": 0.3150668605344076, "grad_norm": 1.2139029502868652, "learning_rate": 1.4558946910693127e-06, "loss": 0.7905, "step": 6609 }, { "epoch": 0.31511453292970704, "grad_norm": 1.4506020545959473, "learning_rate": 1.453854381310902e-06, "loss": 0.6546, "step": 6610 }, { "epoch": 0.31516220532500655, "grad_norm": 1.3403805494308472, "learning_rate": 1.451815390134751e-06, "loss": 0.7319, "step": 6611 }, { "epoch": 0.31520987772030606, "grad_norm": 1.6739354133605957, "learning_rate": 1.449777717855455e-06, "loss": 0.681, "step": 6612 }, { "epoch": 0.3152575501156056, "grad_norm": 1.7977980375289917, "learning_rate": 1.4477413647874106e-06, "loss": 1.1114, "step": 6613 }, { "epoch": 0.31530522251090504, "grad_norm": 2.4470624923706055, "learning_rate": 1.4457063312447995e-06, "loss": 0.9177, "step": 6614 }, { "epoch": 0.31535289490620455, "grad_norm": 2.64920973777771, "learning_rate": 1.4436726175416116e-06, "loss": 1.0572, "step": 6615 }, { "epoch": 0.31540056730150406, "grad_norm": 1.1840165853500366, "learning_rate": 1.4416402239916261e-06, "loss": 0.4655, "step": 6616 }, { "epoch": 0.3154482396968036, "grad_norm": 1.142012357711792, "learning_rate": 1.4396091509084175e-06, "loss": 0.6814, "step": 6617 }, { "epoch": 0.3154959120921031, "grad_norm": 2.9277257919311523, "learning_rate": 1.4375793986053622e-06, "loss": 1.0581, "step": 6618 }, { "epoch": 0.31554358448740255, "grad_norm": 2.0775747299194336, "learning_rate": 1.4355509673956313e-06, "loss": 0.7016, "step": 6619 }, { "epoch": 0.31559125688270206, "grad_norm": 1.4216697216033936, "learning_rate": 1.4335238575921884e-06, "loss": 0.7154, "step": 6620 }, { "epoch": 0.3156389292780016, "grad_norm": 2.9023005962371826, "learning_rate": 1.431498069507795e-06, "loss": 0.6624, "step": 6621 }, { "epoch": 0.3156866016733011, "grad_norm": 3.2430179119110107, "learning_rate": 1.429473603455015e-06, "loss": 0.8766, "step": 6622 }, { "epoch": 0.31573427406860055, "grad_norm": 1.7777272462844849, "learning_rate": 1.4274504597461946e-06, "loss": 0.7014, "step": 6623 }, { "epoch": 0.31578194646390007, "grad_norm": 1.048928141593933, "learning_rate": 1.425428638693489e-06, "loss": 0.5853, "step": 6624 }, { "epoch": 0.3158296188591996, "grad_norm": 2.6440327167510986, "learning_rate": 1.4234081406088463e-06, "loss": 0.7435, "step": 6625 }, { "epoch": 0.3158772912544991, "grad_norm": 1.7596545219421387, "learning_rate": 1.4213889658040026e-06, "loss": 0.7789, "step": 6626 }, { "epoch": 0.3159249636497986, "grad_norm": 1.6288104057312012, "learning_rate": 1.4193711145904988e-06, "loss": 0.9032, "step": 6627 }, { "epoch": 0.31597263604509807, "grad_norm": 2.528653144836426, "learning_rate": 1.4173545872796713e-06, "loss": 0.1689, "step": 6628 }, { "epoch": 0.3160203084403976, "grad_norm": 1.156921625137329, "learning_rate": 1.4153393841826446e-06, "loss": 0.7119, "step": 6629 }, { "epoch": 0.3160679808356971, "grad_norm": 1.2893879413604736, "learning_rate": 1.4133255056103478e-06, "loss": 0.4464, "step": 6630 }, { "epoch": 0.3161156532309966, "grad_norm": 1.8145041465759277, "learning_rate": 1.4113129518735002e-06, "loss": 0.7846, "step": 6631 }, { "epoch": 0.31616332562629607, "grad_norm": 2.8903684616088867, "learning_rate": 1.4093017232826155e-06, "loss": 0.5727, "step": 6632 }, { "epoch": 0.3162109980215956, "grad_norm": 3.1536953449249268, "learning_rate": 1.4072918201480078e-06, "loss": 0.6654, "step": 6633 }, { "epoch": 0.3162586704168951, "grad_norm": 2.0173208713531494, "learning_rate": 1.405283242779787e-06, "loss": 1.0501, "step": 6634 }, { "epoch": 0.3163063428121946, "grad_norm": 1.5940346717834473, "learning_rate": 1.4032759914878501e-06, "loss": 0.9135, "step": 6635 }, { "epoch": 0.3163540152074941, "grad_norm": 1.376216173171997, "learning_rate": 1.401270066581899e-06, "loss": 0.3843, "step": 6636 }, { "epoch": 0.3164016876027936, "grad_norm": 2.7516770362854004, "learning_rate": 1.3992654683714303e-06, "loss": 1.0433, "step": 6637 }, { "epoch": 0.3164493599980931, "grad_norm": 3.3411176204681396, "learning_rate": 1.397262197165725e-06, "loss": 0.5809, "step": 6638 }, { "epoch": 0.3164970323933926, "grad_norm": 2.1718568801879883, "learning_rate": 1.3952602532738734e-06, "loss": 0.497, "step": 6639 }, { "epoch": 0.3165447047886921, "grad_norm": 2.2344655990600586, "learning_rate": 1.3932596370047547e-06, "loss": 0.9163, "step": 6640 }, { "epoch": 0.31659237718399164, "grad_norm": 1.270484447479248, "learning_rate": 1.3912603486670396e-06, "loss": 0.8538, "step": 6641 }, { "epoch": 0.3166400495792911, "grad_norm": 1.4166371822357178, "learning_rate": 1.3892623885692003e-06, "loss": 0.6165, "step": 6642 }, { "epoch": 0.3166877219745906, "grad_norm": 1.5425513982772827, "learning_rate": 1.3872657570195025e-06, "loss": 0.4812, "step": 6643 }, { "epoch": 0.3167353943698901, "grad_norm": 1.6991596221923828, "learning_rate": 1.385270454326002e-06, "loss": 0.8697, "step": 6644 }, { "epoch": 0.31678306676518964, "grad_norm": 2.2527730464935303, "learning_rate": 1.3832764807965582e-06, "loss": 0.6057, "step": 6645 }, { "epoch": 0.3168307391604891, "grad_norm": 0.9060356616973877, "learning_rate": 1.3812838367388171e-06, "loss": 0.48, "step": 6646 }, { "epoch": 0.3168784115557886, "grad_norm": 1.4182647466659546, "learning_rate": 1.379292522460225e-06, "loss": 0.3352, "step": 6647 }, { "epoch": 0.3169260839510881, "grad_norm": 3.791928291320801, "learning_rate": 1.3773025382680195e-06, "loss": 0.8462, "step": 6648 }, { "epoch": 0.31697375634638764, "grad_norm": 1.4614535570144653, "learning_rate": 1.3753138844692348e-06, "loss": 0.6568, "step": 6649 }, { "epoch": 0.31702142874168715, "grad_norm": 1.1077797412872314, "learning_rate": 1.3733265613707037e-06, "loss": 0.4703, "step": 6650 }, { "epoch": 0.3170691011369866, "grad_norm": 8.790328025817871, "learning_rate": 1.3713405692790448e-06, "loss": 0.5406, "step": 6651 }, { "epoch": 0.3171167735322861, "grad_norm": 1.914686679840088, "learning_rate": 1.3693559085006768e-06, "loss": 0.6805, "step": 6652 }, { "epoch": 0.31716444592758564, "grad_norm": 1.6303683519363403, "learning_rate": 1.367372579341817e-06, "loss": 0.4999, "step": 6653 }, { "epoch": 0.31721211832288515, "grad_norm": 1.5619796514511108, "learning_rate": 1.3653905821084668e-06, "loss": 0.6573, "step": 6654 }, { "epoch": 0.3172597907181846, "grad_norm": 2.1431751251220703, "learning_rate": 1.3634099171064297e-06, "loss": 0.9527, "step": 6655 }, { "epoch": 0.3173074631134841, "grad_norm": 1.3966847658157349, "learning_rate": 1.3614305846413056e-06, "loss": 0.8538, "step": 6656 }, { "epoch": 0.31735513550878364, "grad_norm": 1.201562523841858, "learning_rate": 1.3594525850184803e-06, "loss": 0.8162, "step": 6657 }, { "epoch": 0.31740280790408315, "grad_norm": 2.9309046268463135, "learning_rate": 1.3574759185431408e-06, "loss": 0.4181, "step": 6658 }, { "epoch": 0.31745048029938266, "grad_norm": 2.211801052093506, "learning_rate": 1.3555005855202674e-06, "loss": 0.9168, "step": 6659 }, { "epoch": 0.3174981526946821, "grad_norm": 2.020312786102295, "learning_rate": 1.3535265862546333e-06, "loss": 0.9857, "step": 6660 }, { "epoch": 0.31754582508998164, "grad_norm": 1.5531432628631592, "learning_rate": 1.3515539210508033e-06, "loss": 0.5855, "step": 6661 }, { "epoch": 0.31759349748528115, "grad_norm": 2.2170095443725586, "learning_rate": 1.3495825902131443e-06, "loss": 0.8875, "step": 6662 }, { "epoch": 0.31764116988058066, "grad_norm": 1.1779119968414307, "learning_rate": 1.3476125940458062e-06, "loss": 0.284, "step": 6663 }, { "epoch": 0.3176888422758802, "grad_norm": 1.4040954113006592, "learning_rate": 1.3456439328527426e-06, "loss": 0.5142, "step": 6664 }, { "epoch": 0.31773651467117964, "grad_norm": 1.2396644353866577, "learning_rate": 1.3436766069377006e-06, "loss": 0.3101, "step": 6665 }, { "epoch": 0.31778418706647915, "grad_norm": 1.9658242464065552, "learning_rate": 1.3417106166042127e-06, "loss": 0.6576, "step": 6666 }, { "epoch": 0.31783185946177867, "grad_norm": 3.4210305213928223, "learning_rate": 1.339745962155613e-06, "loss": 0.3107, "step": 6667 }, { "epoch": 0.3178795318570782, "grad_norm": 1.7822257280349731, "learning_rate": 1.3377826438950315e-06, "loss": 0.6504, "step": 6668 }, { "epoch": 0.31792720425237764, "grad_norm": 1.1864222288131714, "learning_rate": 1.3358206621253812e-06, "loss": 0.4216, "step": 6669 }, { "epoch": 0.31797487664767715, "grad_norm": 2.1839072704315186, "learning_rate": 1.3338600171493787e-06, "loss": 0.7742, "step": 6670 }, { "epoch": 0.31802254904297667, "grad_norm": 2.5582492351531982, "learning_rate": 1.3319007092695346e-06, "loss": 1.1295, "step": 6671 }, { "epoch": 0.3180702214382762, "grad_norm": 1.7420909404754639, "learning_rate": 1.3299427387881436e-06, "loss": 0.7763, "step": 6672 }, { "epoch": 0.3181178938335757, "grad_norm": 1.87956702709198, "learning_rate": 1.327986106007305e-06, "loss": 0.9161, "step": 6673 }, { "epoch": 0.31816556622887515, "grad_norm": 2.6080970764160156, "learning_rate": 1.3260308112289066e-06, "loss": 1.44, "step": 6674 }, { "epoch": 0.31821323862417467, "grad_norm": 3.1158368587493896, "learning_rate": 1.3240768547546302e-06, "loss": 1.006, "step": 6675 }, { "epoch": 0.3182609110194742, "grad_norm": 1.2281715869903564, "learning_rate": 1.3221242368859489e-06, "loss": 0.7667, "step": 6676 }, { "epoch": 0.3183085834147737, "grad_norm": 1.4579689502716064, "learning_rate": 1.320172957924134e-06, "loss": 0.6034, "step": 6677 }, { "epoch": 0.31835625581007315, "grad_norm": 1.3187172412872314, "learning_rate": 1.318223018170245e-06, "loss": 0.3434, "step": 6678 }, { "epoch": 0.31840392820537267, "grad_norm": 1.7120447158813477, "learning_rate": 1.3162744179251396e-06, "loss": 0.6513, "step": 6679 }, { "epoch": 0.3184516006006722, "grad_norm": 4.401436805725098, "learning_rate": 1.3143271574894677e-06, "loss": 1.8345, "step": 6680 }, { "epoch": 0.3184992729959717, "grad_norm": 0.9458251595497131, "learning_rate": 1.3123812371636691e-06, "loss": 0.5839, "step": 6681 }, { "epoch": 0.3185469453912712, "grad_norm": 1.7610641717910767, "learning_rate": 1.3104366572479798e-06, "loss": 0.7357, "step": 6682 }, { "epoch": 0.31859461778657067, "grad_norm": 2.6764273643493652, "learning_rate": 1.3084934180424324e-06, "loss": 0.9112, "step": 6683 }, { "epoch": 0.3186422901818702, "grad_norm": 2.67384672164917, "learning_rate": 1.3065515198468425e-06, "loss": 0.8334, "step": 6684 }, { "epoch": 0.3186899625771697, "grad_norm": 2.8001441955566406, "learning_rate": 1.3046109629608273e-06, "loss": 0.9927, "step": 6685 }, { "epoch": 0.3187376349724692, "grad_norm": 1.3272708654403687, "learning_rate": 1.302671747683798e-06, "loss": 0.7976, "step": 6686 }, { "epoch": 0.31878530736776867, "grad_norm": 2.6087403297424316, "learning_rate": 1.3007338743149511e-06, "loss": 1.1775, "step": 6687 }, { "epoch": 0.3188329797630682, "grad_norm": 1.4888510704040527, "learning_rate": 1.2987973431532818e-06, "loss": 0.7254, "step": 6688 }, { "epoch": 0.3188806521583677, "grad_norm": 1.4774237871170044, "learning_rate": 1.296862154497579e-06, "loss": 0.6758, "step": 6689 }, { "epoch": 0.3189283245536672, "grad_norm": 1.4781259298324585, "learning_rate": 1.2949283086464192e-06, "loss": 0.6349, "step": 6690 }, { "epoch": 0.3189759969489667, "grad_norm": 0.810100257396698, "learning_rate": 1.2929958058981796e-06, "loss": 0.5273, "step": 6691 }, { "epoch": 0.3190236693442662, "grad_norm": 1.7063322067260742, "learning_rate": 1.291064646551019e-06, "loss": 0.8014, "step": 6692 }, { "epoch": 0.3190713417395657, "grad_norm": 2.5724058151245117, "learning_rate": 1.2891348309029005e-06, "loss": 0.5987, "step": 6693 }, { "epoch": 0.3191190141348652, "grad_norm": 2.300093173980713, "learning_rate": 1.2872063592515716e-06, "loss": 0.6093, "step": 6694 }, { "epoch": 0.3191666865301647, "grad_norm": 1.4376941919326782, "learning_rate": 1.2852792318945773e-06, "loss": 0.7723, "step": 6695 }, { "epoch": 0.31921435892546424, "grad_norm": 2.926440477371216, "learning_rate": 1.2833534491292554e-06, "loss": 1.0, "step": 6696 }, { "epoch": 0.3192620313207637, "grad_norm": 1.99180006980896, "learning_rate": 1.2814290112527295e-06, "loss": 0.6256, "step": 6697 }, { "epoch": 0.3193097037160632, "grad_norm": 2.6852056980133057, "learning_rate": 1.279505918561923e-06, "loss": 1.0226, "step": 6698 }, { "epoch": 0.3193573761113627, "grad_norm": 1.7871979475021362, "learning_rate": 1.2775841713535532e-06, "loss": 0.503, "step": 6699 }, { "epoch": 0.31940504850666224, "grad_norm": 2.6340057849884033, "learning_rate": 1.2756637699241181e-06, "loss": 1.2569, "step": 6700 }, { "epoch": 0.3194527209019617, "grad_norm": 2.8678417205810547, "learning_rate": 1.273744714569921e-06, "loss": 0.7783, "step": 6701 }, { "epoch": 0.3195003932972612, "grad_norm": 25.8961181640625, "learning_rate": 1.271827005587054e-06, "loss": 0.9727, "step": 6702 }, { "epoch": 0.3195480656925607, "grad_norm": 2.0242180824279785, "learning_rate": 1.2699106432713947e-06, "loss": 0.8678, "step": 6703 }, { "epoch": 0.31959573808786024, "grad_norm": 1.177913784980774, "learning_rate": 1.2679956279186234e-06, "loss": 0.6598, "step": 6704 }, { "epoch": 0.31964341048315975, "grad_norm": 1.3602626323699951, "learning_rate": 1.2660819598242013e-06, "loss": 0.4607, "step": 6705 }, { "epoch": 0.3196910828784592, "grad_norm": 2.882046937942505, "learning_rate": 1.2641696392833935e-06, "loss": 1.0607, "step": 6706 }, { "epoch": 0.3197387552737587, "grad_norm": 1.8627959489822388, "learning_rate": 1.262258666591246e-06, "loss": 0.7894, "step": 6707 }, { "epoch": 0.31978642766905824, "grad_norm": 3.0506558418273926, "learning_rate": 1.260349042042608e-06, "loss": 1.368, "step": 6708 }, { "epoch": 0.31983410006435775, "grad_norm": 1.4230533838272095, "learning_rate": 1.2584407659321086e-06, "loss": 0.807, "step": 6709 }, { "epoch": 0.3198817724596572, "grad_norm": 1.2494001388549805, "learning_rate": 1.2565338385541792e-06, "loss": 0.5562, "step": 6710 }, { "epoch": 0.3199294448549567, "grad_norm": 2.4897875785827637, "learning_rate": 1.2546282602030402e-06, "loss": 1.0848, "step": 6711 }, { "epoch": 0.31997711725025624, "grad_norm": 0.945253312587738, "learning_rate": 1.2527240311726985e-06, "loss": 0.2731, "step": 6712 }, { "epoch": 0.32002478964555575, "grad_norm": 1.7186779975891113, "learning_rate": 1.2508211517569592e-06, "loss": 0.9098, "step": 6713 }, { "epoch": 0.32007246204085527, "grad_norm": 1.7863162755966187, "learning_rate": 1.2489196222494193e-06, "loss": 0.535, "step": 6714 }, { "epoch": 0.3201201344361547, "grad_norm": 1.260798454284668, "learning_rate": 1.2470194429434601e-06, "loss": 0.818, "step": 6715 }, { "epoch": 0.32016780683145424, "grad_norm": 2.9051167964935303, "learning_rate": 1.2451206141322635e-06, "loss": 0.6785, "step": 6716 }, { "epoch": 0.32021547922675375, "grad_norm": 2.0958476066589355, "learning_rate": 1.243223136108801e-06, "loss": 0.9215, "step": 6717 }, { "epoch": 0.32026315162205327, "grad_norm": 2.3783397674560547, "learning_rate": 1.241327009165828e-06, "loss": 0.5876, "step": 6718 }, { "epoch": 0.3203108240173527, "grad_norm": 1.8721314668655396, "learning_rate": 1.239432233595903e-06, "loss": 0.9631, "step": 6719 }, { "epoch": 0.32035849641265224, "grad_norm": 3.1029164791107178, "learning_rate": 1.2375388096913666e-06, "loss": 0.2165, "step": 6720 }, { "epoch": 0.32040616880795175, "grad_norm": 2.7242085933685303, "learning_rate": 1.235646737744357e-06, "loss": 0.5013, "step": 6721 }, { "epoch": 0.32045384120325127, "grad_norm": 1.874029278755188, "learning_rate": 1.2337560180467988e-06, "loss": 0.65, "step": 6722 }, { "epoch": 0.3205015135985508, "grad_norm": 1.5420160293579102, "learning_rate": 1.2318666508904143e-06, "loss": 0.6374, "step": 6723 }, { "epoch": 0.32054918599385024, "grad_norm": 6.614696502685547, "learning_rate": 1.2299786365667088e-06, "loss": 0.3752, "step": 6724 }, { "epoch": 0.32059685838914975, "grad_norm": 4.222172260284424, "learning_rate": 1.2280919753669863e-06, "loss": 0.3931, "step": 6725 }, { "epoch": 0.32064453078444927, "grad_norm": 2.2805488109588623, "learning_rate": 1.226206667582338e-06, "loss": 0.8805, "step": 6726 }, { "epoch": 0.3206922031797488, "grad_norm": 2.7942402362823486, "learning_rate": 1.2243227135036517e-06, "loss": 1.2933, "step": 6727 }, { "epoch": 0.3207398755750483, "grad_norm": 2.5557541847229004, "learning_rate": 1.2224401134215957e-06, "loss": 0.6964, "step": 6728 }, { "epoch": 0.32078754797034775, "grad_norm": 1.5781320333480835, "learning_rate": 1.220558867626639e-06, "loss": 0.7256, "step": 6729 }, { "epoch": 0.32083522036564727, "grad_norm": 1.4425690174102783, "learning_rate": 1.2186789764090412e-06, "loss": 0.9856, "step": 6730 }, { "epoch": 0.3208828927609468, "grad_norm": 2.1655056476593018, "learning_rate": 1.216800440058844e-06, "loss": 0.8455, "step": 6731 }, { "epoch": 0.3209305651562463, "grad_norm": 1.1953462362289429, "learning_rate": 1.21492325886589e-06, "loss": 0.4475, "step": 6732 }, { "epoch": 0.32097823755154575, "grad_norm": 3.858290195465088, "learning_rate": 1.2130474331198106e-06, "loss": 0.1909, "step": 6733 }, { "epoch": 0.32102590994684527, "grad_norm": 1.1652231216430664, "learning_rate": 1.2111729631100211e-06, "loss": 0.4421, "step": 6734 }, { "epoch": 0.3210735823421448, "grad_norm": 1.5531299114227295, "learning_rate": 1.209299849125739e-06, "loss": 0.906, "step": 6735 }, { "epoch": 0.3211212547374443, "grad_norm": 1.4137942790985107, "learning_rate": 1.2074280914559634e-06, "loss": 0.6741, "step": 6736 }, { "epoch": 0.3211689271327438, "grad_norm": 2.3914706707000732, "learning_rate": 1.205557690389485e-06, "loss": 0.2413, "step": 6737 }, { "epoch": 0.32121659952804327, "grad_norm": 7.511162757873535, "learning_rate": 1.20368864621489e-06, "loss": 0.5593, "step": 6738 }, { "epoch": 0.3212642719233428, "grad_norm": 2.186293363571167, "learning_rate": 1.2018209592205542e-06, "loss": 0.1307, "step": 6739 }, { "epoch": 0.3213119443186423, "grad_norm": 1.970169186592102, "learning_rate": 1.1999546296946386e-06, "loss": 0.391, "step": 6740 }, { "epoch": 0.3213596167139418, "grad_norm": 1.7000867128372192, "learning_rate": 1.198089657925101e-06, "loss": 0.5987, "step": 6741 }, { "epoch": 0.32140728910924127, "grad_norm": 6.522705554962158, "learning_rate": 1.1962260441996888e-06, "loss": 0.6656, "step": 6742 }, { "epoch": 0.3214549615045408, "grad_norm": 1.88416588306427, "learning_rate": 1.1943637888059346e-06, "loss": 0.8614, "step": 6743 }, { "epoch": 0.3215026338998403, "grad_norm": 1.5290926694869995, "learning_rate": 1.1925028920311676e-06, "loss": 0.9489, "step": 6744 }, { "epoch": 0.3215503062951398, "grad_norm": 3.1635072231292725, "learning_rate": 1.1906433541625063e-06, "loss": 0.7795, "step": 6745 }, { "epoch": 0.3215979786904393, "grad_norm": 1.7384538650512695, "learning_rate": 1.1887851754868551e-06, "loss": 0.5209, "step": 6746 }, { "epoch": 0.3216456510857388, "grad_norm": 0.8195374011993408, "learning_rate": 1.1869283562909128e-06, "loss": 0.2465, "step": 6747 }, { "epoch": 0.3216933234810383, "grad_norm": 1.2265413999557495, "learning_rate": 1.1850728968611702e-06, "loss": 0.5326, "step": 6748 }, { "epoch": 0.3217409958763378, "grad_norm": 1.1900591850280762, "learning_rate": 1.1832187974839015e-06, "loss": 0.5931, "step": 6749 }, { "epoch": 0.3217886682716373, "grad_norm": 1.3428053855895996, "learning_rate": 1.181366058445179e-06, "loss": 0.5351, "step": 6750 }, { "epoch": 0.32183634066693684, "grad_norm": 1.355079174041748, "learning_rate": 1.17951468003086e-06, "loss": 0.8475, "step": 6751 }, { "epoch": 0.3218840130622363, "grad_norm": 2.929975986480713, "learning_rate": 1.1776646625265897e-06, "loss": 1.1311, "step": 6752 }, { "epoch": 0.3219316854575358, "grad_norm": 1.4964652061462402, "learning_rate": 1.1758160062178093e-06, "loss": 0.7599, "step": 6753 }, { "epoch": 0.3219793578528353, "grad_norm": 3.3741443157196045, "learning_rate": 1.1739687113897501e-06, "loss": 1.1203, "step": 6754 }, { "epoch": 0.32202703024813484, "grad_norm": 2.0097317695617676, "learning_rate": 1.1721227783274259e-06, "loss": 0.6591, "step": 6755 }, { "epoch": 0.3220747026434343, "grad_norm": 2.9421274662017822, "learning_rate": 1.1702782073156482e-06, "loss": 0.9512, "step": 6756 }, { "epoch": 0.3221223750387338, "grad_norm": 1.4162436723709106, "learning_rate": 1.1684349986390154e-06, "loss": 0.8846, "step": 6757 }, { "epoch": 0.3221700474340333, "grad_norm": 2.548219680786133, "learning_rate": 1.166593152581914e-06, "loss": 0.9899, "step": 6758 }, { "epoch": 0.32221771982933284, "grad_norm": 1.7047868967056274, "learning_rate": 1.1647526694285216e-06, "loss": 0.7265, "step": 6759 }, { "epoch": 0.32226539222463235, "grad_norm": 1.1510308980941772, "learning_rate": 1.1629135494628097e-06, "loss": 0.7055, "step": 6760 }, { "epoch": 0.3223130646199318, "grad_norm": 2.1071419715881348, "learning_rate": 1.1610757929685301e-06, "loss": 0.6876, "step": 6761 }, { "epoch": 0.3223607370152313, "grad_norm": 1.4728981256484985, "learning_rate": 1.1592394002292328e-06, "loss": 0.6636, "step": 6762 }, { "epoch": 0.32240840941053084, "grad_norm": 2.160444974899292, "learning_rate": 1.1574043715282557e-06, "loss": 0.951, "step": 6763 }, { "epoch": 0.32245608180583035, "grad_norm": 2.8908801078796387, "learning_rate": 1.155570707148721e-06, "loss": 1.1914, "step": 6764 }, { "epoch": 0.3225037542011298, "grad_norm": 1.327877402305603, "learning_rate": 1.153738407373548e-06, "loss": 0.6152, "step": 6765 }, { "epoch": 0.3225514265964293, "grad_norm": 1.8120874166488647, "learning_rate": 1.1519074724854373e-06, "loss": 0.6147, "step": 6766 }, { "epoch": 0.32259909899172884, "grad_norm": 1.7921767234802246, "learning_rate": 1.1500779027668885e-06, "loss": 0.7633, "step": 6767 }, { "epoch": 0.32264677138702835, "grad_norm": 1.6870146989822388, "learning_rate": 1.1482496985001812e-06, "loss": 0.7307, "step": 6768 }, { "epoch": 0.32269444378232787, "grad_norm": 1.9880839586257935, "learning_rate": 1.1464228599673889e-06, "loss": 0.9944, "step": 6769 }, { "epoch": 0.3227421161776273, "grad_norm": 1.5080907344818115, "learning_rate": 1.144597387450378e-06, "loss": 0.8737, "step": 6770 }, { "epoch": 0.32278978857292684, "grad_norm": 1.4312371015548706, "learning_rate": 1.1427732812307945e-06, "loss": 0.9552, "step": 6771 }, { "epoch": 0.32283746096822635, "grad_norm": 2.1981687545776367, "learning_rate": 1.1409505415900823e-06, "loss": 0.9216, "step": 6772 }, { "epoch": 0.32288513336352587, "grad_norm": 1.8863228559494019, "learning_rate": 1.139129168809473e-06, "loss": 0.9831, "step": 6773 }, { "epoch": 0.3229328057588253, "grad_norm": 0.7570465207099915, "learning_rate": 1.1373091631699817e-06, "loss": 0.1448, "step": 6774 }, { "epoch": 0.32298047815412484, "grad_norm": 2.2315726280212402, "learning_rate": 1.1354905249524184e-06, "loss": 0.274, "step": 6775 }, { "epoch": 0.32302815054942435, "grad_norm": 3.448110818862915, "learning_rate": 1.133673254437383e-06, "loss": 1.1285, "step": 6776 }, { "epoch": 0.32307582294472387, "grad_norm": 1.4017457962036133, "learning_rate": 1.1318573519052556e-06, "loss": 0.841, "step": 6777 }, { "epoch": 0.3231234953400234, "grad_norm": 2.547318935394287, "learning_rate": 1.1300428176362155e-06, "loss": 1.6176, "step": 6778 }, { "epoch": 0.32317116773532284, "grad_norm": 2.899115800857544, "learning_rate": 1.1282296519102277e-06, "loss": 0.3613, "step": 6779 }, { "epoch": 0.32321884013062235, "grad_norm": 1.8724870681762695, "learning_rate": 1.1264178550070427e-06, "loss": 0.7452, "step": 6780 }, { "epoch": 0.32326651252592187, "grad_norm": 1.8066425323486328, "learning_rate": 1.1246074272062012e-06, "loss": 0.7663, "step": 6781 }, { "epoch": 0.3233141849212214, "grad_norm": 1.3729069232940674, "learning_rate": 1.1227983687870358e-06, "loss": 0.6915, "step": 6782 }, { "epoch": 0.3233618573165209, "grad_norm": 1.281703233718872, "learning_rate": 1.120990680028663e-06, "loss": 0.7031, "step": 6783 }, { "epoch": 0.32340952971182035, "grad_norm": 2.4891021251678467, "learning_rate": 1.119184361209993e-06, "loss": 0.5118, "step": 6784 }, { "epoch": 0.32345720210711987, "grad_norm": 3.2797603607177734, "learning_rate": 1.1173794126097226e-06, "loss": 0.1452, "step": 6785 }, { "epoch": 0.3235048745024194, "grad_norm": 2.2303245067596436, "learning_rate": 1.1155758345063328e-06, "loss": 0.9356, "step": 6786 }, { "epoch": 0.3235525468977189, "grad_norm": 2.5446243286132812, "learning_rate": 1.1137736271781007e-06, "loss": 0.7394, "step": 6787 }, { "epoch": 0.32360021929301835, "grad_norm": 1.061785101890564, "learning_rate": 1.1119727909030897e-06, "loss": 0.5001, "step": 6788 }, { "epoch": 0.32364789168831787, "grad_norm": 3.6874563694000244, "learning_rate": 1.1101733259591453e-06, "loss": 0.4842, "step": 6789 }, { "epoch": 0.3236955640836174, "grad_norm": 1.8435956239700317, "learning_rate": 1.1083752326239094e-06, "loss": 0.7229, "step": 6790 }, { "epoch": 0.3237432364789169, "grad_norm": 3.200688362121582, "learning_rate": 1.1065785111748117e-06, "loss": 0.6165, "step": 6791 }, { "epoch": 0.3237909088742164, "grad_norm": 2.1181397438049316, "learning_rate": 1.1047831618890625e-06, "loss": 0.9953, "step": 6792 }, { "epoch": 0.32383858126951587, "grad_norm": 2.3873510360717773, "learning_rate": 1.1029891850436691e-06, "loss": 0.6344, "step": 6793 }, { "epoch": 0.3238862536648154, "grad_norm": 6.07340669631958, "learning_rate": 1.1011965809154245e-06, "loss": 1.1416, "step": 6794 }, { "epoch": 0.3239339260601149, "grad_norm": 1.9123923778533936, "learning_rate": 1.0994053497809077e-06, "loss": 0.8761, "step": 6795 }, { "epoch": 0.3239815984554144, "grad_norm": 3.459238052368164, "learning_rate": 1.097615491916485e-06, "loss": 0.8978, "step": 6796 }, { "epoch": 0.32402927085071387, "grad_norm": 2.479867935180664, "learning_rate": 1.0958270075983167e-06, "loss": 1.0981, "step": 6797 }, { "epoch": 0.3240769432460134, "grad_norm": 4.167263031005859, "learning_rate": 1.0940398971023447e-06, "loss": 0.5798, "step": 6798 }, { "epoch": 0.3241246156413129, "grad_norm": 1.8770060539245605, "learning_rate": 1.0922541607043024e-06, "loss": 1.0562, "step": 6799 }, { "epoch": 0.3241722880366124, "grad_norm": 2.877319812774658, "learning_rate": 1.0904697986797131e-06, "loss": 0.831, "step": 6800 }, { "epoch": 0.3242199604319119, "grad_norm": 0.9527406692504883, "learning_rate": 1.0886868113038817e-06, "loss": 0.3654, "step": 6801 }, { "epoch": 0.3242676328272114, "grad_norm": 1.6785608530044556, "learning_rate": 1.0869051988519063e-06, "loss": 0.7435, "step": 6802 }, { "epoch": 0.3243153052225109, "grad_norm": 1.3738272190093994, "learning_rate": 1.0851249615986715e-06, "loss": 0.653, "step": 6803 }, { "epoch": 0.3243629776178104, "grad_norm": 3.6544318199157715, "learning_rate": 1.0833460998188516e-06, "loss": 0.6799, "step": 6804 }, { "epoch": 0.3244106500131099, "grad_norm": 1.6429892778396606, "learning_rate": 1.081568613786903e-06, "loss": 1.0817, "step": 6805 }, { "epoch": 0.3244583224084094, "grad_norm": 1.6231443881988525, "learning_rate": 1.079792503777075e-06, "loss": 0.506, "step": 6806 }, { "epoch": 0.3245059948037089, "grad_norm": 1.4431722164154053, "learning_rate": 1.0780177700634053e-06, "loss": 0.6893, "step": 6807 }, { "epoch": 0.3245536671990084, "grad_norm": 1.261094331741333, "learning_rate": 1.0762444129197136e-06, "loss": 0.2204, "step": 6808 }, { "epoch": 0.3246013395943079, "grad_norm": 4.121915340423584, "learning_rate": 1.0744724326196133e-06, "loss": 0.9662, "step": 6809 }, { "epoch": 0.32464901198960744, "grad_norm": 1.5398530960083008, "learning_rate": 1.0727018294364999e-06, "loss": 0.5778, "step": 6810 }, { "epoch": 0.3246966843849069, "grad_norm": 1.2922009229660034, "learning_rate": 1.070932603643563e-06, "loss": 0.6491, "step": 6811 }, { "epoch": 0.3247443567802064, "grad_norm": 1.8126953840255737, "learning_rate": 1.0691647555137719e-06, "loss": 0.8161, "step": 6812 }, { "epoch": 0.3247920291755059, "grad_norm": 1.652815341949463, "learning_rate": 1.0673982853198906e-06, "loss": 0.7883, "step": 6813 }, { "epoch": 0.32483970157080544, "grad_norm": 1.5472791194915771, "learning_rate": 1.0656331933344643e-06, "loss": 0.7809, "step": 6814 }, { "epoch": 0.32488737396610495, "grad_norm": 1.0428762435913086, "learning_rate": 1.06386947982983e-06, "loss": 0.39, "step": 6815 }, { "epoch": 0.3249350463614044, "grad_norm": 1.4960932731628418, "learning_rate": 1.0621071450781118e-06, "loss": 0.9479, "step": 6816 }, { "epoch": 0.3249827187567039, "grad_norm": 1.3536639213562012, "learning_rate": 1.060346189351218e-06, "loss": 0.7426, "step": 6817 }, { "epoch": 0.32503039115200344, "grad_norm": 2.0714895725250244, "learning_rate": 1.0585866129208456e-06, "loss": 0.4976, "step": 6818 }, { "epoch": 0.32507806354730295, "grad_norm": 2.729680061340332, "learning_rate": 1.0568284160584818e-06, "loss": 0.8672, "step": 6819 }, { "epoch": 0.3251257359426024, "grad_norm": 2.2662999629974365, "learning_rate": 1.0550715990353955e-06, "loss": 1.1256, "step": 6820 }, { "epoch": 0.3251734083379019, "grad_norm": 0.9396215081214905, "learning_rate": 1.0533161621226463e-06, "loss": 0.3311, "step": 6821 }, { "epoch": 0.32522108073320144, "grad_norm": 1.3509668111801147, "learning_rate": 1.051562105591082e-06, "loss": 0.7507, "step": 6822 }, { "epoch": 0.32526875312850095, "grad_norm": 3.611696720123291, "learning_rate": 1.0498094297113314e-06, "loss": 0.1463, "step": 6823 }, { "epoch": 0.32531642552380047, "grad_norm": 1.558960199356079, "learning_rate": 1.0480581347538199e-06, "loss": 0.4143, "step": 6824 }, { "epoch": 0.3253640979190999, "grad_norm": 1.4471542835235596, "learning_rate": 1.0463082209887477e-06, "loss": 0.641, "step": 6825 }, { "epoch": 0.32541177031439944, "grad_norm": 2.056330919265747, "learning_rate": 1.0445596886861143e-06, "loss": 0.819, "step": 6826 }, { "epoch": 0.32545944270969895, "grad_norm": 2.165382146835327, "learning_rate": 1.0428125381156962e-06, "loss": 0.6796, "step": 6827 }, { "epoch": 0.32550711510499847, "grad_norm": 1.7400048971176147, "learning_rate": 1.0410667695470633e-06, "loss": 0.785, "step": 6828 }, { "epoch": 0.3255547875002979, "grad_norm": 2.4737558364868164, "learning_rate": 1.039322383249568e-06, "loss": 0.5966, "step": 6829 }, { "epoch": 0.32560245989559744, "grad_norm": 2.14703631401062, "learning_rate": 1.0375793794923505e-06, "loss": 0.5545, "step": 6830 }, { "epoch": 0.32565013229089695, "grad_norm": 2.659785509109497, "learning_rate": 1.0358377585443424e-06, "loss": 0.517, "step": 6831 }, { "epoch": 0.32569780468619647, "grad_norm": 1.8968234062194824, "learning_rate": 1.0340975206742531e-06, "loss": 0.6559, "step": 6832 }, { "epoch": 0.325745477081496, "grad_norm": 1.6274470090866089, "learning_rate": 1.0323586661505858e-06, "loss": 0.6857, "step": 6833 }, { "epoch": 0.32579314947679544, "grad_norm": 2.677960157394409, "learning_rate": 1.030621195241629e-06, "loss": 1.0955, "step": 6834 }, { "epoch": 0.32584082187209495, "grad_norm": 2.5976550579071045, "learning_rate": 1.0288851082154528e-06, "loss": 0.7696, "step": 6835 }, { "epoch": 0.32588849426739447, "grad_norm": 1.0764391422271729, "learning_rate": 1.0271504053399195e-06, "loss": 0.7978, "step": 6836 }, { "epoch": 0.325936166662694, "grad_norm": 1.5717785358428955, "learning_rate": 1.0254170868826796e-06, "loss": 1.0062, "step": 6837 }, { "epoch": 0.3259838390579935, "grad_norm": 1.081852912902832, "learning_rate": 1.0236851531111592e-06, "loss": 0.4168, "step": 6838 }, { "epoch": 0.32603151145329295, "grad_norm": 1.7853864431381226, "learning_rate": 1.0219546042925842e-06, "loss": 0.7182, "step": 6839 }, { "epoch": 0.32607918384859247, "grad_norm": 1.3578150272369385, "learning_rate": 1.020225440693956e-06, "loss": 0.8344, "step": 6840 }, { "epoch": 0.326126856243892, "grad_norm": 1.8048217296600342, "learning_rate": 1.0184976625820707e-06, "loss": 0.889, "step": 6841 }, { "epoch": 0.3261745286391915, "grad_norm": 1.670409917831421, "learning_rate": 1.0167712702235023e-06, "loss": 0.4762, "step": 6842 }, { "epoch": 0.32622220103449096, "grad_norm": 1.2257848978042603, "learning_rate": 1.015046263884617e-06, "loss": 0.5661, "step": 6843 }, { "epoch": 0.32626987342979047, "grad_norm": 1.3278084993362427, "learning_rate": 1.013322643831569e-06, "loss": 0.526, "step": 6844 }, { "epoch": 0.32631754582509, "grad_norm": 1.7151070833206177, "learning_rate": 1.011600410330289e-06, "loss": 1.1557, "step": 6845 }, { "epoch": 0.3263652182203895, "grad_norm": 1.1286561489105225, "learning_rate": 1.0098795636465042e-06, "loss": 0.7666, "step": 6846 }, { "epoch": 0.326412890615689, "grad_norm": 3.2658588886260986, "learning_rate": 1.0081601040457246e-06, "loss": 1.3919, "step": 6847 }, { "epoch": 0.32646056301098847, "grad_norm": 1.2767378091812134, "learning_rate": 1.00644203179324e-06, "loss": 0.7611, "step": 6848 }, { "epoch": 0.326508235406288, "grad_norm": 3.6402831077575684, "learning_rate": 1.004725347154134e-06, "loss": 0.1893, "step": 6849 }, { "epoch": 0.3265559078015875, "grad_norm": 1.2136021852493286, "learning_rate": 1.0030100503932761e-06, "loss": 0.583, "step": 6850 }, { "epoch": 0.326603580196887, "grad_norm": 1.1178150177001953, "learning_rate": 1.0012961417753142e-06, "loss": 0.8505, "step": 6851 }, { "epoch": 0.32665125259218647, "grad_norm": 1.1358115673065186, "learning_rate": 9.995836215646892e-07, "loss": 0.8125, "step": 6852 }, { "epoch": 0.326698924987486, "grad_norm": 2.1993634700775146, "learning_rate": 9.978724900256265e-07, "loss": 1.2156, "step": 6853 }, { "epoch": 0.3267465973827855, "grad_norm": 5.942317962646484, "learning_rate": 9.961627474221324e-07, "loss": 0.4882, "step": 6854 }, { "epoch": 0.326794269778085, "grad_norm": 1.7636799812316895, "learning_rate": 9.944543940180074e-07, "loss": 0.6522, "step": 6855 }, { "epoch": 0.3268419421733845, "grad_norm": 1.9472734928131104, "learning_rate": 9.927474300768303e-07, "loss": 0.6525, "step": 6856 }, { "epoch": 0.326889614568684, "grad_norm": 1.8993598222732544, "learning_rate": 9.91041855861965e-07, "loss": 1.3042, "step": 6857 }, { "epoch": 0.3269372869639835, "grad_norm": 1.998908519744873, "learning_rate": 9.893376716365677e-07, "loss": 0.8146, "step": 6858 }, { "epoch": 0.326984959359283, "grad_norm": 1.3649706840515137, "learning_rate": 9.87634877663578e-07, "loss": 0.6859, "step": 6859 }, { "epoch": 0.3270326317545825, "grad_norm": 1.9363383054733276, "learning_rate": 9.859334742057158e-07, "loss": 0.7996, "step": 6860 }, { "epoch": 0.327080304149882, "grad_norm": 1.5329622030258179, "learning_rate": 9.842334615254901e-07, "loss": 0.8392, "step": 6861 }, { "epoch": 0.3271279765451815, "grad_norm": 1.3983296155929565, "learning_rate": 9.825348398851998e-07, "loss": 0.5687, "step": 6862 }, { "epoch": 0.327175648940481, "grad_norm": 2.9042091369628906, "learning_rate": 9.808376095469196e-07, "loss": 0.3973, "step": 6863 }, { "epoch": 0.3272233213357805, "grad_norm": 1.4467089176177979, "learning_rate": 9.791417707725171e-07, "loss": 0.4728, "step": 6864 }, { "epoch": 0.32727099373108004, "grad_norm": 1.39895498752594, "learning_rate": 9.774473238236449e-07, "loss": 0.9346, "step": 6865 }, { "epoch": 0.3273186661263795, "grad_norm": 2.028965473175049, "learning_rate": 9.757542689617328e-07, "loss": 0.8223, "step": 6866 }, { "epoch": 0.327366338521679, "grad_norm": 1.8963208198547363, "learning_rate": 9.740626064480063e-07, "loss": 1.005, "step": 6867 }, { "epoch": 0.3274140109169785, "grad_norm": 1.631163239479065, "learning_rate": 9.723723365434722e-07, "loss": 0.7241, "step": 6868 }, { "epoch": 0.32746168331227804, "grad_norm": 1.5895754098892212, "learning_rate": 9.706834595089187e-07, "loss": 0.713, "step": 6869 }, { "epoch": 0.32750935570757755, "grad_norm": 1.2809149026870728, "learning_rate": 9.68995975604925e-07, "loss": 0.2593, "step": 6870 }, { "epoch": 0.327557028102877, "grad_norm": 1.493955373764038, "learning_rate": 9.673098850918506e-07, "loss": 0.9036, "step": 6871 }, { "epoch": 0.3276047004981765, "grad_norm": 1.843968391418457, "learning_rate": 9.656251882298394e-07, "loss": 1.0281, "step": 6872 }, { "epoch": 0.32765237289347604, "grad_norm": 1.4267148971557617, "learning_rate": 9.639418852788274e-07, "loss": 0.6061, "step": 6873 }, { "epoch": 0.32770004528877555, "grad_norm": 2.1960649490356445, "learning_rate": 9.622599764985297e-07, "loss": 1.0589, "step": 6874 }, { "epoch": 0.327747717684075, "grad_norm": 1.7546650171279907, "learning_rate": 9.605794621484455e-07, "loss": 0.5425, "step": 6875 }, { "epoch": 0.3277953900793745, "grad_norm": 0.9336969256401062, "learning_rate": 9.589003424878618e-07, "loss": 0.2844, "step": 6876 }, { "epoch": 0.32784306247467404, "grad_norm": 1.0836050510406494, "learning_rate": 9.572226177758514e-07, "loss": 0.7409, "step": 6877 }, { "epoch": 0.32789073486997355, "grad_norm": 1.8114019632339478, "learning_rate": 9.555462882712684e-07, "loss": 0.6375, "step": 6878 }, { "epoch": 0.32793840726527307, "grad_norm": 1.5859652757644653, "learning_rate": 9.538713542327527e-07, "loss": 0.582, "step": 6879 }, { "epoch": 0.3279860796605725, "grad_norm": 1.504881501197815, "learning_rate": 9.521978159187295e-07, "loss": 0.7675, "step": 6880 }, { "epoch": 0.32803375205587204, "grad_norm": 2.2370870113372803, "learning_rate": 9.505256735874113e-07, "loss": 0.8555, "step": 6881 }, { "epoch": 0.32808142445117155, "grad_norm": 1.9561861753463745, "learning_rate": 9.488549274967873e-07, "loss": 0.6115, "step": 6882 }, { "epoch": 0.32812909684647107, "grad_norm": 2.1800622940063477, "learning_rate": 9.471855779046424e-07, "loss": 0.5097, "step": 6883 }, { "epoch": 0.3281767692417705, "grad_norm": 2.48305344581604, "learning_rate": 9.455176250685338e-07, "loss": 1.3924, "step": 6884 }, { "epoch": 0.32822444163707004, "grad_norm": 1.8112921714782715, "learning_rate": 9.438510692458147e-07, "loss": 0.8998, "step": 6885 }, { "epoch": 0.32827211403236956, "grad_norm": 0.9631018042564392, "learning_rate": 9.421859106936138e-07, "loss": 0.748, "step": 6886 }, { "epoch": 0.32831978642766907, "grad_norm": 2.037118673324585, "learning_rate": 9.40522149668851e-07, "loss": 0.8314, "step": 6887 }, { "epoch": 0.3283674588229686, "grad_norm": 1.617737889289856, "learning_rate": 9.388597864282245e-07, "loss": 0.6754, "step": 6888 }, { "epoch": 0.32841513121826804, "grad_norm": 2.9875071048736572, "learning_rate": 9.371988212282212e-07, "loss": 1.4998, "step": 6889 }, { "epoch": 0.32846280361356756, "grad_norm": 2.425280809402466, "learning_rate": 9.355392543251119e-07, "loss": 0.8703, "step": 6890 }, { "epoch": 0.32851047600886707, "grad_norm": 1.9492497444152832, "learning_rate": 9.338810859749492e-07, "loss": 0.5463, "step": 6891 }, { "epoch": 0.3285581484041666, "grad_norm": 5.014599323272705, "learning_rate": 9.322243164335709e-07, "loss": 0.3037, "step": 6892 }, { "epoch": 0.32860582079946604, "grad_norm": 2.4838085174560547, "learning_rate": 9.305689459566025e-07, "loss": 1.0198, "step": 6893 }, { "epoch": 0.32865349319476556, "grad_norm": 5.7384934425354, "learning_rate": 9.289149747994475e-07, "loss": 0.4571, "step": 6894 }, { "epoch": 0.32870116559006507, "grad_norm": 1.1955310106277466, "learning_rate": 9.272624032172972e-07, "loss": 0.6792, "step": 6895 }, { "epoch": 0.3287488379853646, "grad_norm": 2.9281866550445557, "learning_rate": 9.2561123146513e-07, "loss": 1.546, "step": 6896 }, { "epoch": 0.3287965103806641, "grad_norm": 6.846766471862793, "learning_rate": 9.239614597976987e-07, "loss": 0.6851, "step": 6897 }, { "epoch": 0.32884418277596356, "grad_norm": 0.7876847386360168, "learning_rate": 9.223130884695486e-07, "loss": 0.3575, "step": 6898 }, { "epoch": 0.32889185517126307, "grad_norm": 2.3945159912109375, "learning_rate": 9.206661177350096e-07, "loss": 1.3135, "step": 6899 }, { "epoch": 0.3289395275665626, "grad_norm": 1.7309443950653076, "learning_rate": 9.190205478481895e-07, "loss": 0.9645, "step": 6900 }, { "epoch": 0.3289871999618621, "grad_norm": 2.170653820037842, "learning_rate": 9.173763790629808e-07, "loss": 0.4641, "step": 6901 }, { "epoch": 0.3290348723571616, "grad_norm": 1.2487143278121948, "learning_rate": 9.15733611633065e-07, "loss": 0.6012, "step": 6902 }, { "epoch": 0.32908254475246107, "grad_norm": 3.0707266330718994, "learning_rate": 9.140922458119028e-07, "loss": 0.2333, "step": 6903 }, { "epoch": 0.3291302171477606, "grad_norm": 1.6412420272827148, "learning_rate": 9.124522818527393e-07, "loss": 0.5698, "step": 6904 }, { "epoch": 0.3291778895430601, "grad_norm": 1.5062675476074219, "learning_rate": 9.108137200086076e-07, "loss": 0.6577, "step": 6905 }, { "epoch": 0.3292255619383596, "grad_norm": 2.7576942443847656, "learning_rate": 9.091765605323155e-07, "loss": 0.7413, "step": 6906 }, { "epoch": 0.32927323433365907, "grad_norm": 1.5102096796035767, "learning_rate": 9.075408036764633e-07, "loss": 0.6647, "step": 6907 }, { "epoch": 0.3293209067289586, "grad_norm": 1.599633812904358, "learning_rate": 9.059064496934333e-07, "loss": 0.6397, "step": 6908 }, { "epoch": 0.3293685791242581, "grad_norm": 1.797092080116272, "learning_rate": 9.042734988353841e-07, "loss": 0.4712, "step": 6909 }, { "epoch": 0.3294162515195576, "grad_norm": 1.429047703742981, "learning_rate": 9.026419513542673e-07, "loss": 0.7813, "step": 6910 }, { "epoch": 0.3294639239148571, "grad_norm": 2.5892581939697266, "learning_rate": 9.010118075018137e-07, "loss": 0.8682, "step": 6911 }, { "epoch": 0.3295115963101566, "grad_norm": 2.959779739379883, "learning_rate": 8.993830675295345e-07, "loss": 1.1225, "step": 6912 }, { "epoch": 0.3295592687054561, "grad_norm": 2.6877310276031494, "learning_rate": 8.977557316887309e-07, "loss": 0.5966, "step": 6913 }, { "epoch": 0.3296069411007556, "grad_norm": 2.6235692501068115, "learning_rate": 8.961298002304841e-07, "loss": 0.5057, "step": 6914 }, { "epoch": 0.3296546134960551, "grad_norm": 1.8561090230941772, "learning_rate": 8.945052734056581e-07, "loss": 0.8651, "step": 6915 }, { "epoch": 0.3297022858913546, "grad_norm": 2.475501537322998, "learning_rate": 8.928821514648977e-07, "loss": 0.8263, "step": 6916 }, { "epoch": 0.3297499582866541, "grad_norm": 1.637777328491211, "learning_rate": 8.912604346586362e-07, "loss": 0.9822, "step": 6917 }, { "epoch": 0.3297976306819536, "grad_norm": 1.7201780080795288, "learning_rate": 8.896401232370889e-07, "loss": 0.7964, "step": 6918 }, { "epoch": 0.3298453030772531, "grad_norm": 2.49106502532959, "learning_rate": 8.880212174502512e-07, "loss": 0.8571, "step": 6919 }, { "epoch": 0.32989297547255264, "grad_norm": 0.8593341112136841, "learning_rate": 8.864037175479034e-07, "loss": 0.4201, "step": 6920 }, { "epoch": 0.3299406478678521, "grad_norm": 3.763322353363037, "learning_rate": 8.847876237796127e-07, "loss": 0.6488, "step": 6921 }, { "epoch": 0.3299883202631516, "grad_norm": 1.6638611555099487, "learning_rate": 8.831729363947216e-07, "loss": 0.7638, "step": 6922 }, { "epoch": 0.3300359926584511, "grad_norm": 0.7484946846961975, "learning_rate": 8.815596556423611e-07, "loss": 0.2955, "step": 6923 }, { "epoch": 0.33008366505375064, "grad_norm": 1.2401622533798218, "learning_rate": 8.799477817714452e-07, "loss": 0.6298, "step": 6924 }, { "epoch": 0.33013133744905016, "grad_norm": 1.2771800756454468, "learning_rate": 8.783373150306663e-07, "loss": 0.6583, "step": 6925 }, { "epoch": 0.3301790098443496, "grad_norm": 1.4808845520019531, "learning_rate": 8.767282556685053e-07, "loss": 0.6071, "step": 6926 }, { "epoch": 0.3302266822396491, "grad_norm": 1.4974414110183716, "learning_rate": 8.75120603933225e-07, "loss": 0.9551, "step": 6927 }, { "epoch": 0.33027435463494864, "grad_norm": 1.3905906677246094, "learning_rate": 8.735143600728646e-07, "loss": 0.5626, "step": 6928 }, { "epoch": 0.33032202703024816, "grad_norm": 1.1375762224197388, "learning_rate": 8.71909524335256e-07, "loss": 0.5293, "step": 6929 }, { "epoch": 0.3303696994255476, "grad_norm": 2.1217849254608154, "learning_rate": 8.703060969680055e-07, "loss": 0.3791, "step": 6930 }, { "epoch": 0.33041737182084713, "grad_norm": 1.729552149772644, "learning_rate": 8.687040782185074e-07, "loss": 0.6536, "step": 6931 }, { "epoch": 0.33046504421614664, "grad_norm": 1.1514723300933838, "learning_rate": 8.671034683339352e-07, "loss": 0.757, "step": 6932 }, { "epoch": 0.33051271661144616, "grad_norm": 1.419379472732544, "learning_rate": 8.65504267561248e-07, "loss": 0.8308, "step": 6933 }, { "epoch": 0.33056038900674567, "grad_norm": 1.2579851150512695, "learning_rate": 8.639064761471838e-07, "loss": 0.9242, "step": 6934 }, { "epoch": 0.33060806140204513, "grad_norm": 1.2018386125564575, "learning_rate": 8.623100943382667e-07, "loss": 0.5524, "step": 6935 }, { "epoch": 0.33065573379734464, "grad_norm": 1.9304592609405518, "learning_rate": 8.607151223808041e-07, "loss": 0.903, "step": 6936 }, { "epoch": 0.33070340619264416, "grad_norm": 1.6968860626220703, "learning_rate": 8.591215605208791e-07, "loss": 0.7001, "step": 6937 }, { "epoch": 0.33075107858794367, "grad_norm": 2.054097890853882, "learning_rate": 8.575294090043651e-07, "loss": 0.6849, "step": 6938 }, { "epoch": 0.33079875098324313, "grad_norm": 0.950583279132843, "learning_rate": 8.559386680769166e-07, "loss": 0.6726, "step": 6939 }, { "epoch": 0.33084642337854264, "grad_norm": 1.2018409967422485, "learning_rate": 8.543493379839629e-07, "loss": 0.5115, "step": 6940 }, { "epoch": 0.33089409577384216, "grad_norm": 0.8816591501235962, "learning_rate": 8.527614189707245e-07, "loss": 0.3203, "step": 6941 }, { "epoch": 0.33094176816914167, "grad_norm": 1.4427143335342407, "learning_rate": 8.511749112822032e-07, "loss": 0.982, "step": 6942 }, { "epoch": 0.3309894405644412, "grad_norm": 1.4803297519683838, "learning_rate": 8.495898151631765e-07, "loss": 0.8211, "step": 6943 }, { "epoch": 0.33103711295974064, "grad_norm": 2.8168013095855713, "learning_rate": 8.480061308582122e-07, "loss": 0.4794, "step": 6944 }, { "epoch": 0.33108478535504016, "grad_norm": 1.8412179946899414, "learning_rate": 8.464238586116524e-07, "loss": 0.304, "step": 6945 }, { "epoch": 0.33113245775033967, "grad_norm": 1.322967767715454, "learning_rate": 8.448429986676298e-07, "loss": 0.5554, "step": 6946 }, { "epoch": 0.3311801301456392, "grad_norm": 1.396625280380249, "learning_rate": 8.432635512700505e-07, "loss": 0.391, "step": 6947 }, { "epoch": 0.33122780254093864, "grad_norm": 1.2411015033721924, "learning_rate": 8.416855166626114e-07, "loss": 0.8087, "step": 6948 }, { "epoch": 0.33127547493623816, "grad_norm": 1.0039184093475342, "learning_rate": 8.401088950887826e-07, "loss": 0.2497, "step": 6949 }, { "epoch": 0.33132314733153767, "grad_norm": 1.602081537246704, "learning_rate": 8.385336867918226e-07, "loss": 0.6764, "step": 6950 }, { "epoch": 0.3313708197268372, "grad_norm": 2.843942165374756, "learning_rate": 8.369598920147715e-07, "loss": 0.4815, "step": 6951 }, { "epoch": 0.3314184921221367, "grad_norm": 1.881803035736084, "learning_rate": 8.353875110004462e-07, "loss": 0.7745, "step": 6952 }, { "epoch": 0.33146616451743616, "grad_norm": 2.328432321548462, "learning_rate": 8.338165439914514e-07, "loss": 1.0419, "step": 6953 }, { "epoch": 0.33151383691273567, "grad_norm": 1.8506278991699219, "learning_rate": 8.3224699123017e-07, "loss": 0.6811, "step": 6954 }, { "epoch": 0.3315615093080352, "grad_norm": 1.2373707294464111, "learning_rate": 8.306788529587695e-07, "loss": 0.7214, "step": 6955 }, { "epoch": 0.3316091817033347, "grad_norm": 1.1605180501937866, "learning_rate": 8.291121294191951e-07, "loss": 0.6749, "step": 6956 }, { "epoch": 0.3316568540986342, "grad_norm": 1.6606429815292358, "learning_rate": 8.275468208531767e-07, "loss": 0.8797, "step": 6957 }, { "epoch": 0.33170452649393367, "grad_norm": 1.0410281419754028, "learning_rate": 8.25982927502228e-07, "loss": 0.5226, "step": 6958 }, { "epoch": 0.3317521988892332, "grad_norm": 3.8279051780700684, "learning_rate": 8.244204496076402e-07, "loss": 0.5769, "step": 6959 }, { "epoch": 0.3317998712845327, "grad_norm": 2.1523706912994385, "learning_rate": 8.22859387410484e-07, "loss": 1.017, "step": 6960 }, { "epoch": 0.3318475436798322, "grad_norm": 2.217524766921997, "learning_rate": 8.212997411516199e-07, "loss": 1.0985, "step": 6961 }, { "epoch": 0.33189521607513167, "grad_norm": 1.2286893129348755, "learning_rate": 8.197415110716822e-07, "loss": 0.7791, "step": 6962 }, { "epoch": 0.3319428884704312, "grad_norm": 1.9869701862335205, "learning_rate": 8.181846974110907e-07, "loss": 1.0268, "step": 6963 }, { "epoch": 0.3319905608657307, "grad_norm": 3.1760213375091553, "learning_rate": 8.166293004100478e-07, "loss": 1.088, "step": 6964 }, { "epoch": 0.3320382332610302, "grad_norm": 1.2840080261230469, "learning_rate": 8.150753203085315e-07, "loss": 0.7817, "step": 6965 }, { "epoch": 0.3320859056563297, "grad_norm": 1.7720043659210205, "learning_rate": 8.135227573463067e-07, "loss": 0.7465, "step": 6966 }, { "epoch": 0.3321335780516292, "grad_norm": 1.7507137060165405, "learning_rate": 8.119716117629206e-07, "loss": 0.6792, "step": 6967 }, { "epoch": 0.3321812504469287, "grad_norm": 2.602229118347168, "learning_rate": 8.10421883797694e-07, "loss": 1.1148, "step": 6968 }, { "epoch": 0.3322289228422282, "grad_norm": 2.1618120670318604, "learning_rate": 8.088735736897369e-07, "loss": 0.4714, "step": 6969 }, { "epoch": 0.3322765952375277, "grad_norm": 1.3942264318466187, "learning_rate": 8.07326681677938e-07, "loss": 0.758, "step": 6970 }, { "epoch": 0.3323242676328272, "grad_norm": 1.5200979709625244, "learning_rate": 8.057812080009641e-07, "loss": 0.6898, "step": 6971 }, { "epoch": 0.3323719400281267, "grad_norm": 1.68550443649292, "learning_rate": 8.042371528972681e-07, "loss": 0.6427, "step": 6972 }, { "epoch": 0.3324196124234262, "grad_norm": 1.460419774055481, "learning_rate": 8.026945166050837e-07, "loss": 0.5732, "step": 6973 }, { "epoch": 0.33246728481872573, "grad_norm": 1.9466084241867065, "learning_rate": 8.011532993624194e-07, "loss": 0.8939, "step": 6974 }, { "epoch": 0.33251495721402524, "grad_norm": 1.8210314512252808, "learning_rate": 7.996135014070727e-07, "loss": 1.0337, "step": 6975 }, { "epoch": 0.3325626296093247, "grad_norm": 3.750234842300415, "learning_rate": 7.98075122976617e-07, "loss": 0.8677, "step": 6976 }, { "epoch": 0.3326103020046242, "grad_norm": 2.1335513591766357, "learning_rate": 7.965381643084069e-07, "loss": 0.9474, "step": 6977 }, { "epoch": 0.33265797439992373, "grad_norm": 2.9676480293273926, "learning_rate": 7.950026256395804e-07, "loss": 0.8712, "step": 6978 }, { "epoch": 0.33270564679522324, "grad_norm": 2.2473292350769043, "learning_rate": 7.934685072070569e-07, "loss": 0.9024, "step": 6979 }, { "epoch": 0.3327533191905227, "grad_norm": 1.7809481620788574, "learning_rate": 7.919358092475326e-07, "loss": 0.5509, "step": 6980 }, { "epoch": 0.3328009915858222, "grad_norm": 3.3433854579925537, "learning_rate": 7.904045319974885e-07, "loss": 0.8467, "step": 6981 }, { "epoch": 0.33284866398112173, "grad_norm": 1.1155240535736084, "learning_rate": 7.888746756931865e-07, "loss": 0.8283, "step": 6982 }, { "epoch": 0.33289633637642124, "grad_norm": 1.5901422500610352, "learning_rate": 7.873462405706633e-07, "loss": 0.9563, "step": 6983 }, { "epoch": 0.33294400877172076, "grad_norm": 1.781674861907959, "learning_rate": 7.858192268657438e-07, "loss": 0.9848, "step": 6984 }, { "epoch": 0.3329916811670202, "grad_norm": 1.6295862197875977, "learning_rate": 7.842936348140317e-07, "loss": 0.9112, "step": 6985 }, { "epoch": 0.33303935356231973, "grad_norm": 1.6032387018203735, "learning_rate": 7.827694646509065e-07, "loss": 0.7037, "step": 6986 }, { "epoch": 0.33308702595761924, "grad_norm": 2.0810365676879883, "learning_rate": 7.812467166115334e-07, "loss": 0.514, "step": 6987 }, { "epoch": 0.33313469835291876, "grad_norm": 1.4824879169464111, "learning_rate": 7.797253909308588e-07, "loss": 0.7197, "step": 6988 }, { "epoch": 0.33318237074821827, "grad_norm": 4.814878463745117, "learning_rate": 7.782054878436051e-07, "loss": 0.3217, "step": 6989 }, { "epoch": 0.33323004314351773, "grad_norm": 3.686336040496826, "learning_rate": 7.766870075842792e-07, "loss": 0.7873, "step": 6990 }, { "epoch": 0.33327771553881724, "grad_norm": 2.2819459438323975, "learning_rate": 7.751699503871646e-07, "loss": 1.2062, "step": 6991 }, { "epoch": 0.33332538793411676, "grad_norm": 1.5199083089828491, "learning_rate": 7.736543164863319e-07, "loss": 0.7416, "step": 6992 }, { "epoch": 0.33337306032941627, "grad_norm": 3.20569109916687, "learning_rate": 7.721401061156231e-07, "loss": 0.4969, "step": 6993 }, { "epoch": 0.33342073272471573, "grad_norm": 3.330939531326294, "learning_rate": 7.706273195086667e-07, "loss": 1.5155, "step": 6994 }, { "epoch": 0.33346840512001524, "grad_norm": 13.722299575805664, "learning_rate": 7.691159568988727e-07, "loss": 0.0921, "step": 6995 }, { "epoch": 0.33351607751531476, "grad_norm": 1.3149386644363403, "learning_rate": 7.676060185194256e-07, "loss": 0.7787, "step": 6996 }, { "epoch": 0.33356374991061427, "grad_norm": 1.5102317333221436, "learning_rate": 7.660975046032948e-07, "loss": 0.5024, "step": 6997 }, { "epoch": 0.3336114223059138, "grad_norm": 1.3673630952835083, "learning_rate": 7.645904153832295e-07, "loss": 0.7202, "step": 6998 }, { "epoch": 0.33365909470121324, "grad_norm": 2.5012240409851074, "learning_rate": 7.63084751091755e-07, "loss": 0.0183, "step": 6999 }, { "epoch": 0.33370676709651276, "grad_norm": 1.5828908681869507, "learning_rate": 7.615805119611818e-07, "loss": 0.7323, "step": 7000 }, { "epoch": 0.33375443949181227, "grad_norm": 1.606491208076477, "learning_rate": 7.600776982235992e-07, "loss": 1.0206, "step": 7001 }, { "epoch": 0.3338021118871118, "grad_norm": 1.3894623517990112, "learning_rate": 7.585763101108746e-07, "loss": 0.8505, "step": 7002 }, { "epoch": 0.33384978428241124, "grad_norm": 2.2927088737487793, "learning_rate": 7.570763478546572e-07, "loss": 0.5769, "step": 7003 }, { "epoch": 0.33389745667771076, "grad_norm": 2.522141695022583, "learning_rate": 7.555778116863755e-07, "loss": 0.9323, "step": 7004 }, { "epoch": 0.33394512907301027, "grad_norm": 2.126577615737915, "learning_rate": 7.540807018372387e-07, "loss": 1.2636, "step": 7005 }, { "epoch": 0.3339928014683098, "grad_norm": 2.773179292678833, "learning_rate": 7.525850185382344e-07, "loss": 1.0102, "step": 7006 }, { "epoch": 0.3340404738636093, "grad_norm": 2.7864255905151367, "learning_rate": 7.510907620201335e-07, "loss": 1.0874, "step": 7007 }, { "epoch": 0.33408814625890876, "grad_norm": 1.3754595518112183, "learning_rate": 7.495979325134806e-07, "loss": 0.8089, "step": 7008 }, { "epoch": 0.33413581865420827, "grad_norm": 3.7733309268951416, "learning_rate": 7.481065302486057e-07, "loss": 0.8764, "step": 7009 }, { "epoch": 0.3341834910495078, "grad_norm": 1.4038552045822144, "learning_rate": 7.466165554556193e-07, "loss": 0.9035, "step": 7010 }, { "epoch": 0.3342311634448073, "grad_norm": 1.0519099235534668, "learning_rate": 7.451280083644052e-07, "loss": 0.2934, "step": 7011 }, { "epoch": 0.3342788358401068, "grad_norm": 1.473276138305664, "learning_rate": 7.436408892046321e-07, "loss": 0.5284, "step": 7012 }, { "epoch": 0.3343265082354063, "grad_norm": 2.0723464488983154, "learning_rate": 7.421551982057496e-07, "loss": 1.2248, "step": 7013 }, { "epoch": 0.3343741806307058, "grad_norm": 1.693808913230896, "learning_rate": 7.406709355969821e-07, "loss": 0.581, "step": 7014 }, { "epoch": 0.3344218530260053, "grad_norm": 2.9343862533569336, "learning_rate": 7.391881016073354e-07, "loss": 0.5937, "step": 7015 }, { "epoch": 0.3344695254213048, "grad_norm": 1.7389940023422241, "learning_rate": 7.377066964655987e-07, "loss": 0.7204, "step": 7016 }, { "epoch": 0.3345171978166043, "grad_norm": 1.8688080310821533, "learning_rate": 7.362267204003337e-07, "loss": 0.7314, "step": 7017 }, { "epoch": 0.3345648702119038, "grad_norm": 1.577251672744751, "learning_rate": 7.347481736398876e-07, "loss": 0.9979, "step": 7018 }, { "epoch": 0.3346125426072033, "grad_norm": 2.4060111045837402, "learning_rate": 7.332710564123869e-07, "loss": 0.2635, "step": 7019 }, { "epoch": 0.3346602150025028, "grad_norm": 0.9208526611328125, "learning_rate": 7.317953689457325e-07, "loss": 0.5298, "step": 7020 }, { "epoch": 0.33470788739780233, "grad_norm": 1.8221689462661743, "learning_rate": 7.303211114676067e-07, "loss": 0.8214, "step": 7021 }, { "epoch": 0.3347555597931018, "grad_norm": 1.2249267101287842, "learning_rate": 7.288482842054767e-07, "loss": 1.0256, "step": 7022 }, { "epoch": 0.3348032321884013, "grad_norm": 2.083839178085327, "learning_rate": 7.273768873865794e-07, "loss": 0.9193, "step": 7023 }, { "epoch": 0.3348509045837008, "grad_norm": 1.6141384840011597, "learning_rate": 7.259069212379399e-07, "loss": 0.6222, "step": 7024 }, { "epoch": 0.33489857697900033, "grad_norm": 1.3766950368881226, "learning_rate": 7.244383859863591e-07, "loss": 0.5655, "step": 7025 }, { "epoch": 0.3349462493742998, "grad_norm": 1.5624011754989624, "learning_rate": 7.229712818584134e-07, "loss": 0.9671, "step": 7026 }, { "epoch": 0.3349939217695993, "grad_norm": 25.637557983398438, "learning_rate": 7.215056090804651e-07, "loss": 0.8141, "step": 7027 }, { "epoch": 0.3350415941648988, "grad_norm": 2.3002846240997314, "learning_rate": 7.200413678786522e-07, "loss": 0.9535, "step": 7028 }, { "epoch": 0.33508926656019833, "grad_norm": 1.5308042764663696, "learning_rate": 7.185785584788896e-07, "loss": 0.2675, "step": 7029 }, { "epoch": 0.33513693895549784, "grad_norm": 2.349839210510254, "learning_rate": 7.171171811068744e-07, "loss": 0.7634, "step": 7030 }, { "epoch": 0.3351846113507973, "grad_norm": 1.5442345142364502, "learning_rate": 7.156572359880842e-07, "loss": 0.7412, "step": 7031 }, { "epoch": 0.3352322837460968, "grad_norm": 1.4362175464630127, "learning_rate": 7.141987233477732e-07, "loss": 0.9757, "step": 7032 }, { "epoch": 0.33527995614139633, "grad_norm": 2.019601583480835, "learning_rate": 7.127416434109724e-07, "loss": 0.862, "step": 7033 }, { "epoch": 0.33532762853669584, "grad_norm": 1.7395135164260864, "learning_rate": 7.112859964024977e-07, "loss": 0.7656, "step": 7034 }, { "epoch": 0.3353753009319953, "grad_norm": 0.9082579016685486, "learning_rate": 7.098317825469381e-07, "loss": 0.3733, "step": 7035 }, { "epoch": 0.3354229733272948, "grad_norm": 1.9751553535461426, "learning_rate": 7.083790020686632e-07, "loss": 0.7905, "step": 7036 }, { "epoch": 0.33547064572259433, "grad_norm": 1.4965424537658691, "learning_rate": 7.069276551918225e-07, "loss": 0.6882, "step": 7037 }, { "epoch": 0.33551831811789384, "grad_norm": 2.9734511375427246, "learning_rate": 7.054777421403469e-07, "loss": 0.7582, "step": 7038 }, { "epoch": 0.33556599051319336, "grad_norm": 1.3904688358306885, "learning_rate": 7.040292631379386e-07, "loss": 0.3592, "step": 7039 }, { "epoch": 0.3356136629084928, "grad_norm": 5.98747444152832, "learning_rate": 7.025822184080844e-07, "loss": 2.0757, "step": 7040 }, { "epoch": 0.33566133530379233, "grad_norm": 1.2837018966674805, "learning_rate": 7.011366081740512e-07, "loss": 0.7061, "step": 7041 }, { "epoch": 0.33570900769909184, "grad_norm": 1.4962846040725708, "learning_rate": 6.996924326588772e-07, "loss": 0.5799, "step": 7042 }, { "epoch": 0.33575668009439136, "grad_norm": 4.407690525054932, "learning_rate": 6.982496920853876e-07, "loss": 0.5775, "step": 7043 }, { "epoch": 0.33580435248969087, "grad_norm": 2.727773427963257, "learning_rate": 6.968083866761821e-07, "loss": 1.1992, "step": 7044 }, { "epoch": 0.33585202488499033, "grad_norm": 1.1141144037246704, "learning_rate": 6.953685166536361e-07, "loss": 0.5329, "step": 7045 }, { "epoch": 0.33589969728028984, "grad_norm": 1.6083498001098633, "learning_rate": 6.939300822399086e-07, "loss": 0.569, "step": 7046 }, { "epoch": 0.33594736967558936, "grad_norm": 3.362513780593872, "learning_rate": 6.924930836569377e-07, "loss": 0.8628, "step": 7047 }, { "epoch": 0.33599504207088887, "grad_norm": 2.602198362350464, "learning_rate": 6.910575211264336e-07, "loss": 0.5938, "step": 7048 }, { "epoch": 0.33604271446618833, "grad_norm": 1.5819512605667114, "learning_rate": 6.896233948698916e-07, "loss": 0.7165, "step": 7049 }, { "epoch": 0.33609038686148784, "grad_norm": 2.2062723636627197, "learning_rate": 6.881907051085801e-07, "loss": 0.8791, "step": 7050 }, { "epoch": 0.33613805925678736, "grad_norm": 1.5825045108795166, "learning_rate": 6.867594520635512e-07, "loss": 0.6868, "step": 7051 }, { "epoch": 0.33618573165208687, "grad_norm": 1.9158004522323608, "learning_rate": 6.853296359556294e-07, "loss": 0.7825, "step": 7052 }, { "epoch": 0.3362334040473864, "grad_norm": 1.7363232374191284, "learning_rate": 6.839012570054249e-07, "loss": 0.8877, "step": 7053 }, { "epoch": 0.33628107644268584, "grad_norm": 1.525704026222229, "learning_rate": 6.824743154333157e-07, "loss": 0.842, "step": 7054 }, { "epoch": 0.33632874883798536, "grad_norm": 1.6364781856536865, "learning_rate": 6.810488114594694e-07, "loss": 0.5471, "step": 7055 }, { "epoch": 0.3363764212332849, "grad_norm": 2.1185033321380615, "learning_rate": 6.796247453038252e-07, "loss": 0.848, "step": 7056 }, { "epoch": 0.3364240936285844, "grad_norm": 1.9405720233917236, "learning_rate": 6.782021171861008e-07, "loss": 0.7518, "step": 7057 }, { "epoch": 0.33647176602388384, "grad_norm": 1.6473499536514282, "learning_rate": 6.76780927325793e-07, "loss": 0.7728, "step": 7058 }, { "epoch": 0.33651943841918336, "grad_norm": 1.209946870803833, "learning_rate": 6.753611759421796e-07, "loss": 0.7729, "step": 7059 }, { "epoch": 0.3365671108144829, "grad_norm": 1.3223323822021484, "learning_rate": 6.739428632543099e-07, "loss": 0.7023, "step": 7060 }, { "epoch": 0.3366147832097824, "grad_norm": 1.9575542211532593, "learning_rate": 6.725259894810165e-07, "loss": 0.9558, "step": 7061 }, { "epoch": 0.3366624556050819, "grad_norm": 1.2555452585220337, "learning_rate": 6.711105548409103e-07, "loss": 0.6409, "step": 7062 }, { "epoch": 0.33671012800038136, "grad_norm": 1.4947121143341064, "learning_rate": 6.696965595523741e-07, "loss": 0.6378, "step": 7063 }, { "epoch": 0.3367578003956809, "grad_norm": 3.0194287300109863, "learning_rate": 6.682840038335781e-07, "loss": 0.4027, "step": 7064 }, { "epoch": 0.3368054727909804, "grad_norm": 2.0665180683135986, "learning_rate": 6.6687288790246e-07, "loss": 0.6601, "step": 7065 }, { "epoch": 0.3368531451862799, "grad_norm": 3.0973949432373047, "learning_rate": 6.654632119767446e-07, "loss": 0.5614, "step": 7066 }, { "epoch": 0.33690081758157936, "grad_norm": 1.4588905572891235, "learning_rate": 6.640549762739257e-07, "loss": 0.6588, "step": 7067 }, { "epoch": 0.3369484899768789, "grad_norm": 1.5161415338516235, "learning_rate": 6.62648181011284e-07, "loss": 0.6512, "step": 7068 }, { "epoch": 0.3369961623721784, "grad_norm": 1.6196343898773193, "learning_rate": 6.612428264058723e-07, "loss": 0.5214, "step": 7069 }, { "epoch": 0.3370438347674779, "grad_norm": 1.4347542524337769, "learning_rate": 6.598389126745209e-07, "loss": 0.7676, "step": 7070 }, { "epoch": 0.3370915071627774, "grad_norm": 2.336092233657837, "learning_rate": 6.584364400338395e-07, "loss": 0.6583, "step": 7071 }, { "epoch": 0.3371391795580769, "grad_norm": 2.322232723236084, "learning_rate": 6.570354087002173e-07, "loss": 0.6857, "step": 7072 }, { "epoch": 0.3371868519533764, "grad_norm": 1.3281875848770142, "learning_rate": 6.55635818889817e-07, "loss": 0.6177, "step": 7073 }, { "epoch": 0.3372345243486759, "grad_norm": 1.6594889163970947, "learning_rate": 6.542376708185816e-07, "loss": 0.6738, "step": 7074 }, { "epoch": 0.3372821967439754, "grad_norm": 2.193472385406494, "learning_rate": 6.528409647022316e-07, "loss": 1.0536, "step": 7075 }, { "epoch": 0.33732986913927493, "grad_norm": 1.5280077457427979, "learning_rate": 6.514457007562625e-07, "loss": 0.748, "step": 7076 }, { "epoch": 0.3373775415345744, "grad_norm": 1.2045466899871826, "learning_rate": 6.500518791959498e-07, "loss": 0.6522, "step": 7077 }, { "epoch": 0.3374252139298739, "grad_norm": 2.187865734100342, "learning_rate": 6.486595002363494e-07, "loss": 1.0395, "step": 7078 }, { "epoch": 0.3374728863251734, "grad_norm": 1.5545551776885986, "learning_rate": 6.47268564092286e-07, "loss": 0.5877, "step": 7079 }, { "epoch": 0.33752055872047293, "grad_norm": 4.3865532875061035, "learning_rate": 6.45879070978368e-07, "loss": 0.9466, "step": 7080 }, { "epoch": 0.3375682311157724, "grad_norm": 1.7052801847457886, "learning_rate": 6.444910211089827e-07, "loss": 0.477, "step": 7081 }, { "epoch": 0.3376159035110719, "grad_norm": 1.2200794219970703, "learning_rate": 6.431044146982868e-07, "loss": 0.5847, "step": 7082 }, { "epoch": 0.3376635759063714, "grad_norm": 1.0847972631454468, "learning_rate": 6.417192519602233e-07, "loss": 0.477, "step": 7083 }, { "epoch": 0.33771124830167093, "grad_norm": 3.3016562461853027, "learning_rate": 6.403355331085092e-07, "loss": 0.6994, "step": 7084 }, { "epoch": 0.33775892069697044, "grad_norm": 2.6363039016723633, "learning_rate": 6.389532583566338e-07, "loss": 1.2867, "step": 7085 }, { "epoch": 0.3378065930922699, "grad_norm": 2.580029010772705, "learning_rate": 6.375724279178719e-07, "loss": 0.8993, "step": 7086 }, { "epoch": 0.3378542654875694, "grad_norm": 4.548689365386963, "learning_rate": 6.361930420052709e-07, "loss": 1.0246, "step": 7087 }, { "epoch": 0.33790193788286893, "grad_norm": 1.1172839403152466, "learning_rate": 6.348151008316539e-07, "loss": 0.899, "step": 7088 }, { "epoch": 0.33794961027816844, "grad_norm": 1.4501323699951172, "learning_rate": 6.334386046096231e-07, "loss": 0.4915, "step": 7089 }, { "epoch": 0.3379972826734679, "grad_norm": 1.4368114471435547, "learning_rate": 6.320635535515607e-07, "loss": 0.811, "step": 7090 }, { "epoch": 0.3380449550687674, "grad_norm": 1.2407218217849731, "learning_rate": 6.306899478696193e-07, "loss": 0.4421, "step": 7091 }, { "epoch": 0.33809262746406693, "grad_norm": 2.2550017833709717, "learning_rate": 6.293177877757339e-07, "loss": 0.7693, "step": 7092 }, { "epoch": 0.33814029985936644, "grad_norm": 2.1144356727600098, "learning_rate": 6.279470734816162e-07, "loss": 0.8044, "step": 7093 }, { "epoch": 0.33818797225466596, "grad_norm": 1.921338677406311, "learning_rate": 6.265778051987492e-07, "loss": 0.7113, "step": 7094 }, { "epoch": 0.3382356446499654, "grad_norm": 1.4199682474136353, "learning_rate": 6.252099831384018e-07, "loss": 0.6867, "step": 7095 }, { "epoch": 0.33828331704526493, "grad_norm": 2.187979221343994, "learning_rate": 6.238436075116117e-07, "loss": 0.2012, "step": 7096 }, { "epoch": 0.33833098944056444, "grad_norm": 2.5692050457000732, "learning_rate": 6.22478678529197e-07, "loss": 1.2634, "step": 7097 }, { "epoch": 0.33837866183586396, "grad_norm": 1.5426928997039795, "learning_rate": 6.211151964017503e-07, "loss": 0.7481, "step": 7098 }, { "epoch": 0.3384263342311634, "grad_norm": 1.6039663553237915, "learning_rate": 6.197531613396479e-07, "loss": 0.6206, "step": 7099 }, { "epoch": 0.33847400662646293, "grad_norm": 3.3389742374420166, "learning_rate": 6.183925735530327e-07, "loss": 0.7603, "step": 7100 }, { "epoch": 0.33852167902176244, "grad_norm": 2.830254077911377, "learning_rate": 6.170334332518325e-07, "loss": 0.8227, "step": 7101 }, { "epoch": 0.33856935141706196, "grad_norm": 1.7223087549209595, "learning_rate": 6.156757406457481e-07, "loss": 1.0832, "step": 7102 }, { "epoch": 0.3386170238123615, "grad_norm": 3.009058713912964, "learning_rate": 6.143194959442566e-07, "loss": 1.1031, "step": 7103 }, { "epoch": 0.33866469620766093, "grad_norm": 6.602575778961182, "learning_rate": 6.129646993566118e-07, "loss": 0.2292, "step": 7104 }, { "epoch": 0.33871236860296045, "grad_norm": 3.132328510284424, "learning_rate": 6.116113510918476e-07, "loss": 0.3777, "step": 7105 }, { "epoch": 0.33876004099825996, "grad_norm": 1.1135035753250122, "learning_rate": 6.102594513587701e-07, "loss": 0.598, "step": 7106 }, { "epoch": 0.3388077133935595, "grad_norm": 1.2780905961990356, "learning_rate": 6.089090003659637e-07, "loss": 0.4626, "step": 7107 }, { "epoch": 0.338855385788859, "grad_norm": 1.6975395679473877, "learning_rate": 6.075599983217895e-07, "loss": 0.4814, "step": 7108 }, { "epoch": 0.33890305818415845, "grad_norm": 1.5749510526657104, "learning_rate": 6.062124454343832e-07, "loss": 0.631, "step": 7109 }, { "epoch": 0.33895073057945796, "grad_norm": 6.589550971984863, "learning_rate": 6.048663419116607e-07, "loss": 0.3112, "step": 7110 }, { "epoch": 0.3389984029747575, "grad_norm": 2.9055511951446533, "learning_rate": 6.035216879613082e-07, "loss": 0.5406, "step": 7111 }, { "epoch": 0.339046075370057, "grad_norm": 1.5288498401641846, "learning_rate": 6.021784837907962e-07, "loss": 0.3707, "step": 7112 }, { "epoch": 0.33909374776535645, "grad_norm": 1.8829283714294434, "learning_rate": 6.008367296073636e-07, "loss": 0.9509, "step": 7113 }, { "epoch": 0.33914142016065596, "grad_norm": 3.490995407104492, "learning_rate": 5.994964256180313e-07, "loss": 0.4782, "step": 7114 }, { "epoch": 0.3391890925559555, "grad_norm": 1.3826745748519897, "learning_rate": 5.981575720295963e-07, "loss": 0.9863, "step": 7115 }, { "epoch": 0.339236764951255, "grad_norm": 1.6758770942687988, "learning_rate": 5.968201690486252e-07, "loss": 0.5993, "step": 7116 }, { "epoch": 0.3392844373465545, "grad_norm": 1.2160645723342896, "learning_rate": 5.954842168814679e-07, "loss": 0.7267, "step": 7117 }, { "epoch": 0.33933210974185396, "grad_norm": 1.6106619834899902, "learning_rate": 5.941497157342502e-07, "loss": 0.7206, "step": 7118 }, { "epoch": 0.3393797821371535, "grad_norm": 2.9641261100769043, "learning_rate": 5.928166658128687e-07, "loss": 0.7746, "step": 7119 }, { "epoch": 0.339427454532453, "grad_norm": 1.2934762239456177, "learning_rate": 5.914850673229988e-07, "loss": 0.7819, "step": 7120 }, { "epoch": 0.3394751269277525, "grad_norm": 1.3298192024230957, "learning_rate": 5.901549204700974e-07, "loss": 0.7705, "step": 7121 }, { "epoch": 0.33952279932305196, "grad_norm": 3.0106053352355957, "learning_rate": 5.888262254593869e-07, "loss": 1.1803, "step": 7122 }, { "epoch": 0.3395704717183515, "grad_norm": 1.988847017288208, "learning_rate": 5.874989824958744e-07, "loss": 0.7232, "step": 7123 }, { "epoch": 0.339618144113651, "grad_norm": 1.336850643157959, "learning_rate": 5.861731917843383e-07, "loss": 0.4295, "step": 7124 }, { "epoch": 0.3396658165089505, "grad_norm": 2.070082187652588, "learning_rate": 5.848488535293362e-07, "loss": 0.5574, "step": 7125 }, { "epoch": 0.33971348890425, "grad_norm": 5.496239185333252, "learning_rate": 5.835259679351968e-07, "loss": 0.5636, "step": 7126 }, { "epoch": 0.3397611612995495, "grad_norm": 3.804555654525757, "learning_rate": 5.822045352060313e-07, "loss": 0.7409, "step": 7127 }, { "epoch": 0.339808833694849, "grad_norm": 2.286628484725952, "learning_rate": 5.808845555457198e-07, "loss": 0.9169, "step": 7128 }, { "epoch": 0.3398565060901485, "grad_norm": 1.5711488723754883, "learning_rate": 5.795660291579241e-07, "loss": 0.7299, "step": 7129 }, { "epoch": 0.339904178485448, "grad_norm": 1.5138362646102905, "learning_rate": 5.782489562460791e-07, "loss": 0.3377, "step": 7130 }, { "epoch": 0.33995185088074753, "grad_norm": 1.4402294158935547, "learning_rate": 5.769333370133933e-07, "loss": 1.0856, "step": 7131 }, { "epoch": 0.339999523276047, "grad_norm": 1.6308043003082275, "learning_rate": 5.756191716628556e-07, "loss": 0.6157, "step": 7132 }, { "epoch": 0.3400471956713465, "grad_norm": 2.3055005073547363, "learning_rate": 5.743064603972282e-07, "loss": 0.3677, "step": 7133 }, { "epoch": 0.340094868066646, "grad_norm": 1.1508523225784302, "learning_rate": 5.729952034190467e-07, "loss": 0.316, "step": 7134 }, { "epoch": 0.34014254046194553, "grad_norm": 1.9965412616729736, "learning_rate": 5.71685400930626e-07, "loss": 0.7114, "step": 7135 }, { "epoch": 0.340190212857245, "grad_norm": 2.4253644943237305, "learning_rate": 5.703770531340569e-07, "loss": 0.9511, "step": 7136 }, { "epoch": 0.3402378852525445, "grad_norm": 4.56763219833374, "learning_rate": 5.69070160231201e-07, "loss": 1.3591, "step": 7137 }, { "epoch": 0.340285557647844, "grad_norm": 3.3468735218048096, "learning_rate": 5.677647224236982e-07, "loss": 0.6144, "step": 7138 }, { "epoch": 0.34033323004314353, "grad_norm": 1.9734158515930176, "learning_rate": 5.664607399129684e-07, "loss": 0.2928, "step": 7139 }, { "epoch": 0.34038090243844304, "grad_norm": 1.6915615797042847, "learning_rate": 5.651582129001987e-07, "loss": 0.5172, "step": 7140 }, { "epoch": 0.3404285748337425, "grad_norm": 5.406301021575928, "learning_rate": 5.638571415863559e-07, "loss": 2.348, "step": 7141 }, { "epoch": 0.340476247229042, "grad_norm": 0.9483609795570374, "learning_rate": 5.625575261721838e-07, "loss": 0.5575, "step": 7142 }, { "epoch": 0.34052391962434153, "grad_norm": 1.6920337677001953, "learning_rate": 5.612593668581978e-07, "loss": 0.6074, "step": 7143 }, { "epoch": 0.34057159201964105, "grad_norm": 1.9882586002349854, "learning_rate": 5.599626638446898e-07, "loss": 0.7159, "step": 7144 }, { "epoch": 0.3406192644149405, "grad_norm": 2.160698175430298, "learning_rate": 5.586674173317308e-07, "loss": 0.947, "step": 7145 }, { "epoch": 0.34066693681024, "grad_norm": 1.9571664333343506, "learning_rate": 5.573736275191622e-07, "loss": 0.6159, "step": 7146 }, { "epoch": 0.34071460920553953, "grad_norm": 1.8392515182495117, "learning_rate": 5.560812946066029e-07, "loss": 0.9505, "step": 7147 }, { "epoch": 0.34076228160083905, "grad_norm": 1.9170050621032715, "learning_rate": 5.54790418793445e-07, "loss": 0.7896, "step": 7148 }, { "epoch": 0.34080995399613856, "grad_norm": 1.3238599300384521, "learning_rate": 5.53501000278861e-07, "loss": 1.0377, "step": 7149 }, { "epoch": 0.340857626391438, "grad_norm": 7.5345377922058105, "learning_rate": 5.522130392617908e-07, "loss": 0.4883, "step": 7150 }, { "epoch": 0.34090529878673753, "grad_norm": 1.5058772563934326, "learning_rate": 5.509265359409544e-07, "loss": 0.7266, "step": 7151 }, { "epoch": 0.34095297118203705, "grad_norm": 1.70972740650177, "learning_rate": 5.496414905148495e-07, "loss": 0.756, "step": 7152 }, { "epoch": 0.34100064357733656, "grad_norm": 1.86093008518219, "learning_rate": 5.48357903181741e-07, "loss": 0.6973, "step": 7153 }, { "epoch": 0.341048315972636, "grad_norm": 2.4055392742156982, "learning_rate": 5.47075774139676e-07, "loss": 1.2593, "step": 7154 }, { "epoch": 0.34109598836793553, "grad_norm": 5.15859317779541, "learning_rate": 5.457951035864729e-07, "loss": 0.4831, "step": 7155 }, { "epoch": 0.34114366076323505, "grad_norm": 1.8685381412506104, "learning_rate": 5.445158917197246e-07, "loss": 0.6988, "step": 7156 }, { "epoch": 0.34119133315853456, "grad_norm": 4.353698253631592, "learning_rate": 5.432381387368014e-07, "loss": 1.0267, "step": 7157 }, { "epoch": 0.3412390055538341, "grad_norm": 1.3330374956130981, "learning_rate": 5.419618448348485e-07, "loss": 0.9164, "step": 7158 }, { "epoch": 0.34128667794913353, "grad_norm": 1.1249141693115234, "learning_rate": 5.40687010210783e-07, "loss": 0.5959, "step": 7159 }, { "epoch": 0.34133435034443305, "grad_norm": 4.510800838470459, "learning_rate": 5.394136350613e-07, "loss": 0.7041, "step": 7160 }, { "epoch": 0.34138202273973256, "grad_norm": 2.3502354621887207, "learning_rate": 5.381417195828698e-07, "loss": 0.5844, "step": 7161 }, { "epoch": 0.3414296951350321, "grad_norm": 2.194605588912964, "learning_rate": 5.368712639717311e-07, "loss": 0.9033, "step": 7162 }, { "epoch": 0.3414773675303316, "grad_norm": 1.9759695529937744, "learning_rate": 5.35602268423906e-07, "loss": 0.734, "step": 7163 }, { "epoch": 0.34152503992563105, "grad_norm": 1.3834506273269653, "learning_rate": 5.343347331351878e-07, "loss": 0.6274, "step": 7164 }, { "epoch": 0.34157271232093056, "grad_norm": 1.0868414640426636, "learning_rate": 5.330686583011413e-07, "loss": 0.6798, "step": 7165 }, { "epoch": 0.3416203847162301, "grad_norm": 2.306903600692749, "learning_rate": 5.318040441171101e-07, "loss": 1.0728, "step": 7166 }, { "epoch": 0.3416680571115296, "grad_norm": 1.7810614109039307, "learning_rate": 5.305408907782128e-07, "loss": 0.7936, "step": 7167 }, { "epoch": 0.34171572950682905, "grad_norm": 1.8817362785339355, "learning_rate": 5.292791984793388e-07, "loss": 0.2779, "step": 7168 }, { "epoch": 0.34176340190212856, "grad_norm": 1.3534526824951172, "learning_rate": 5.280189674151559e-07, "loss": 0.6725, "step": 7169 }, { "epoch": 0.3418110742974281, "grad_norm": 1.385118007659912, "learning_rate": 5.267601977801018e-07, "loss": 0.558, "step": 7170 }, { "epoch": 0.3418587466927276, "grad_norm": 1.1696964502334595, "learning_rate": 5.255028897683956e-07, "loss": 0.9574, "step": 7171 }, { "epoch": 0.3419064190880271, "grad_norm": 0.910984992980957, "learning_rate": 5.242470435740232e-07, "loss": 0.7166, "step": 7172 }, { "epoch": 0.34195409148332656, "grad_norm": 1.6158665418624878, "learning_rate": 5.229926593907531e-07, "loss": 0.697, "step": 7173 }, { "epoch": 0.3420017638786261, "grad_norm": 1.2293812036514282, "learning_rate": 5.217397374121192e-07, "loss": 0.5826, "step": 7174 }, { "epoch": 0.3420494362739256, "grad_norm": 1.4363961219787598, "learning_rate": 5.204882778314358e-07, "loss": 0.4648, "step": 7175 }, { "epoch": 0.3420971086692251, "grad_norm": 2.4854228496551514, "learning_rate": 5.192382808417939e-07, "loss": 0.9834, "step": 7176 }, { "epoch": 0.34214478106452456, "grad_norm": 1.4465450048446655, "learning_rate": 5.179897466360495e-07, "loss": 0.9246, "step": 7177 }, { "epoch": 0.3421924534598241, "grad_norm": 1.1187666654586792, "learning_rate": 5.167426754068427e-07, "loss": 0.8393, "step": 7178 }, { "epoch": 0.3422401258551236, "grad_norm": 1.1752023696899414, "learning_rate": 5.154970673465831e-07, "loss": 0.2504, "step": 7179 }, { "epoch": 0.3422877982504231, "grad_norm": 1.0947926044464111, "learning_rate": 5.142529226474536e-07, "loss": 0.4632, "step": 7180 }, { "epoch": 0.3423354706457226, "grad_norm": 1.5549540519714355, "learning_rate": 5.130102415014137e-07, "loss": 0.8201, "step": 7181 }, { "epoch": 0.3423831430410221, "grad_norm": 3.417045831680298, "learning_rate": 5.11769024100196e-07, "loss": 0.8468, "step": 7182 }, { "epoch": 0.3424308154363216, "grad_norm": 1.71469247341156, "learning_rate": 5.105292706353093e-07, "loss": 0.9623, "step": 7183 }, { "epoch": 0.3424784878316211, "grad_norm": 3.3496880531311035, "learning_rate": 5.09290981298034e-07, "loss": 1.0611, "step": 7184 }, { "epoch": 0.3425261602269206, "grad_norm": 1.8131933212280273, "learning_rate": 5.080541562794239e-07, "loss": 0.8119, "step": 7185 }, { "epoch": 0.3425738326222201, "grad_norm": 1.277723789215088, "learning_rate": 5.068187957703097e-07, "loss": 0.6334, "step": 7186 }, { "epoch": 0.3426215050175196, "grad_norm": 1.1603819131851196, "learning_rate": 5.055848999612934e-07, "loss": 0.7065, "step": 7187 }, { "epoch": 0.3426691774128191, "grad_norm": 1.8818080425262451, "learning_rate": 5.043524690427537e-07, "loss": 0.5874, "step": 7188 }, { "epoch": 0.3427168498081186, "grad_norm": 1.458407998085022, "learning_rate": 5.031215032048431e-07, "loss": 0.8327, "step": 7189 }, { "epoch": 0.34276452220341813, "grad_norm": 1.8858898878097534, "learning_rate": 5.018920026374841e-07, "loss": 1.028, "step": 7190 }, { "epoch": 0.3428121945987176, "grad_norm": 0.910244345664978, "learning_rate": 5.006639675303781e-07, "loss": 0.4734, "step": 7191 }, { "epoch": 0.3428598669940171, "grad_norm": 2.176889657974243, "learning_rate": 4.994373980729983e-07, "loss": 0.9091, "step": 7192 }, { "epoch": 0.3429075393893166, "grad_norm": 3.477884292602539, "learning_rate": 4.982122944545908e-07, "loss": 0.936, "step": 7193 }, { "epoch": 0.34295521178461613, "grad_norm": 1.5500426292419434, "learning_rate": 4.969886568641757e-07, "loss": 0.5856, "step": 7194 }, { "epoch": 0.34300288417991565, "grad_norm": 1.4851431846618652, "learning_rate": 4.957664854905508e-07, "loss": 0.6471, "step": 7195 }, { "epoch": 0.3430505565752151, "grad_norm": 1.420636534690857, "learning_rate": 4.945457805222809e-07, "loss": 0.1592, "step": 7196 }, { "epoch": 0.3430982289705146, "grad_norm": 2.4242300987243652, "learning_rate": 4.933265421477096e-07, "loss": 0.5091, "step": 7197 }, { "epoch": 0.34314590136581413, "grad_norm": 1.2386995553970337, "learning_rate": 4.921087705549544e-07, "loss": 0.6226, "step": 7198 }, { "epoch": 0.34319357376111365, "grad_norm": 3.5894792079925537, "learning_rate": 4.908924659319037e-07, "loss": 0.7458, "step": 7199 }, { "epoch": 0.3432412461564131, "grad_norm": 3.0469913482666016, "learning_rate": 4.896776284662186e-07, "loss": 0.8975, "step": 7200 }, { "epoch": 0.3432889185517126, "grad_norm": 1.621962070465088, "learning_rate": 4.884642583453403e-07, "loss": 0.9469, "step": 7201 }, { "epoch": 0.34333659094701213, "grad_norm": 1.3070720434188843, "learning_rate": 4.872523557564756e-07, "loss": 0.6808, "step": 7202 }, { "epoch": 0.34338426334231165, "grad_norm": 2.817579507827759, "learning_rate": 4.860419208866096e-07, "loss": 0.698, "step": 7203 }, { "epoch": 0.34343193573761116, "grad_norm": 4.396624565124512, "learning_rate": 4.848329539225027e-07, "loss": 0.4839, "step": 7204 }, { "epoch": 0.3434796081329106, "grad_norm": 3.2510831356048584, "learning_rate": 4.836254550506814e-07, "loss": 0.7595, "step": 7205 }, { "epoch": 0.34352728052821013, "grad_norm": 1.2895548343658447, "learning_rate": 4.824194244574531e-07, "loss": 0.8418, "step": 7206 }, { "epoch": 0.34357495292350965, "grad_norm": 2.766206741333008, "learning_rate": 4.81214862328897e-07, "loss": 1.4206, "step": 7207 }, { "epoch": 0.34362262531880916, "grad_norm": 1.9874190092086792, "learning_rate": 4.80011768850862e-07, "loss": 1.0453, "step": 7208 }, { "epoch": 0.3436702977141086, "grad_norm": 2.1299922466278076, "learning_rate": 4.788101442089732e-07, "loss": 0.7787, "step": 7209 }, { "epoch": 0.34371797010940813, "grad_norm": 3.9610226154327393, "learning_rate": 4.77609988588632e-07, "loss": 1.3, "step": 7210 }, { "epoch": 0.34376564250470765, "grad_norm": 1.107771635055542, "learning_rate": 4.764113021750061e-07, "loss": 0.6033, "step": 7211 }, { "epoch": 0.34381331490000716, "grad_norm": 1.764366865158081, "learning_rate": 4.752140851530429e-07, "loss": 0.514, "step": 7212 }, { "epoch": 0.3438609872953067, "grad_norm": 2.4293935298919678, "learning_rate": 4.740183377074603e-07, "loss": 1.0554, "step": 7213 }, { "epoch": 0.34390865969060613, "grad_norm": 1.5060569047927856, "learning_rate": 4.728240600227496e-07, "loss": 0.5778, "step": 7214 }, { "epoch": 0.34395633208590565, "grad_norm": 1.059520959854126, "learning_rate": 4.7163125228317565e-07, "loss": 0.4218, "step": 7215 }, { "epoch": 0.34400400448120516, "grad_norm": 1.587288737297058, "learning_rate": 4.704399146727767e-07, "loss": 0.6765, "step": 7216 }, { "epoch": 0.3440516768765047, "grad_norm": 2.7077524662017822, "learning_rate": 4.692500473753625e-07, "loss": 0.8471, "step": 7217 }, { "epoch": 0.3440993492718042, "grad_norm": 3.374375343322754, "learning_rate": 4.6806165057451833e-07, "loss": 0.5849, "step": 7218 }, { "epoch": 0.34414702166710365, "grad_norm": 1.5658568143844604, "learning_rate": 4.6687472445360206e-07, "loss": 0.609, "step": 7219 }, { "epoch": 0.34419469406240316, "grad_norm": 1.7451024055480957, "learning_rate": 4.656892691957426e-07, "loss": 0.7883, "step": 7220 }, { "epoch": 0.3442423664577027, "grad_norm": 3.42445969581604, "learning_rate": 4.6450528498384493e-07, "loss": 1.556, "step": 7221 }, { "epoch": 0.3442900388530022, "grad_norm": 3.019050121307373, "learning_rate": 4.6332277200058397e-07, "loss": 1.5462, "step": 7222 }, { "epoch": 0.34433771124830165, "grad_norm": 1.2758978605270386, "learning_rate": 4.621417304284126e-07, "loss": 0.7431, "step": 7223 }, { "epoch": 0.34438538364360116, "grad_norm": 2.1080386638641357, "learning_rate": 4.609621604495507e-07, "loss": 0.9957, "step": 7224 }, { "epoch": 0.3444330560389007, "grad_norm": 2.187030792236328, "learning_rate": 4.597840622459937e-07, "loss": 0.8028, "step": 7225 }, { "epoch": 0.3444807284342002, "grad_norm": 1.9063327312469482, "learning_rate": 4.5860743599951186e-07, "loss": 0.9747, "step": 7226 }, { "epoch": 0.3445284008294997, "grad_norm": 1.3499354124069214, "learning_rate": 4.574322818916443e-07, "loss": 0.5217, "step": 7227 }, { "epoch": 0.34457607322479916, "grad_norm": 1.9013001918792725, "learning_rate": 4.5625860010370726e-07, "loss": 0.6729, "step": 7228 }, { "epoch": 0.3446237456200987, "grad_norm": 1.1611921787261963, "learning_rate": 4.550863908167846e-07, "loss": 0.4916, "step": 7229 }, { "epoch": 0.3446714180153982, "grad_norm": 1.9586031436920166, "learning_rate": 4.5391565421174065e-07, "loss": 0.8037, "step": 7230 }, { "epoch": 0.3447190904106977, "grad_norm": 1.3925200700759888, "learning_rate": 4.527463904692042e-07, "loss": 0.7632, "step": 7231 }, { "epoch": 0.34476676280599716, "grad_norm": 2.207425832748413, "learning_rate": 4.515785997695832e-07, "loss": 0.9885, "step": 7232 }, { "epoch": 0.3448144352012967, "grad_norm": 1.337417483329773, "learning_rate": 4.5041228229305343e-07, "loss": 0.6356, "step": 7233 }, { "epoch": 0.3448621075965962, "grad_norm": 1.5648466348648071, "learning_rate": 4.492474382195666e-07, "loss": 0.8432, "step": 7234 }, { "epoch": 0.3449097799918957, "grad_norm": 1.0981671810150146, "learning_rate": 4.480840677288478e-07, "loss": 0.3318, "step": 7235 }, { "epoch": 0.3449574523871952, "grad_norm": 1.4383518695831299, "learning_rate": 4.4692217100039013e-07, "loss": 0.8603, "step": 7236 }, { "epoch": 0.3450051247824947, "grad_norm": 1.3654465675354004, "learning_rate": 4.457617482134635e-07, "loss": 0.9705, "step": 7237 }, { "epoch": 0.3450527971777942, "grad_norm": 2.0730645656585693, "learning_rate": 4.446027995471114e-07, "loss": 0.8829, "step": 7238 }, { "epoch": 0.3451004695730937, "grad_norm": 1.2500892877578735, "learning_rate": 4.4344532518014405e-07, "loss": 0.8822, "step": 7239 }, { "epoch": 0.3451481419683932, "grad_norm": 2.913982629776001, "learning_rate": 4.4228932529114975e-07, "loss": 1.0263, "step": 7240 }, { "epoch": 0.3451958143636927, "grad_norm": 1.9293720722198486, "learning_rate": 4.411348000584881e-07, "loss": 0.5467, "step": 7241 }, { "epoch": 0.3452434867589922, "grad_norm": 1.5334426164627075, "learning_rate": 4.3998174966028875e-07, "loss": 0.7143, "step": 7242 }, { "epoch": 0.3452911591542917, "grad_norm": 1.1827961206436157, "learning_rate": 4.3883017427445717e-07, "loss": 0.669, "step": 7243 }, { "epoch": 0.3453388315495912, "grad_norm": 2.1932945251464844, "learning_rate": 4.3768007407866685e-07, "loss": 0.8165, "step": 7244 }, { "epoch": 0.34538650394489073, "grad_norm": 0.846015214920044, "learning_rate": 4.3653144925037025e-07, "loss": 0.5453, "step": 7245 }, { "epoch": 0.3454341763401902, "grad_norm": 1.4479868412017822, "learning_rate": 4.3538429996678567e-07, "loss": 0.8784, "step": 7246 }, { "epoch": 0.3454818487354897, "grad_norm": 1.8788586854934692, "learning_rate": 4.342386264049081e-07, "loss": 0.8506, "step": 7247 }, { "epoch": 0.3455295211307892, "grad_norm": 1.759360671043396, "learning_rate": 4.3309442874150063e-07, "loss": 0.724, "step": 7248 }, { "epoch": 0.34557719352608873, "grad_norm": 17.884838104248047, "learning_rate": 4.319517071531021e-07, "loss": 0.9161, "step": 7249 }, { "epoch": 0.34562486592138825, "grad_norm": 1.5192292928695679, "learning_rate": 4.3081046181602583e-07, "loss": 0.5112, "step": 7250 }, { "epoch": 0.3456725383166877, "grad_norm": 5.040566444396973, "learning_rate": 4.296706929063499e-07, "loss": 0.3561, "step": 7251 }, { "epoch": 0.3457202107119872, "grad_norm": 2.5041663646698, "learning_rate": 4.285324005999303e-07, "loss": 0.5347, "step": 7252 }, { "epoch": 0.34576788310728673, "grad_norm": 2.3320980072021484, "learning_rate": 4.2739558507239543e-07, "loss": 1.0952, "step": 7253 }, { "epoch": 0.34581555550258625, "grad_norm": 2.034547805786133, "learning_rate": 4.2626024649914275e-07, "loss": 0.3327, "step": 7254 }, { "epoch": 0.3458632278978857, "grad_norm": 1.6025238037109375, "learning_rate": 4.251263850553433e-07, "loss": 0.6959, "step": 7255 }, { "epoch": 0.3459109002931852, "grad_norm": 1.8562968969345093, "learning_rate": 4.2399400091594154e-07, "loss": 0.679, "step": 7256 }, { "epoch": 0.34595857268848473, "grad_norm": 2.029865264892578, "learning_rate": 4.2286309425564997e-07, "loss": 0.6528, "step": 7257 }, { "epoch": 0.34600624508378425, "grad_norm": 0.9812089800834656, "learning_rate": 4.2173366524895787e-07, "loss": 0.5244, "step": 7258 }, { "epoch": 0.34605391747908376, "grad_norm": 1.5033107995986938, "learning_rate": 4.2060571407012583e-07, "loss": 0.6437, "step": 7259 }, { "epoch": 0.3461015898743832, "grad_norm": 1.2605870962142944, "learning_rate": 4.1947924089318247e-07, "loss": 0.8642, "step": 7260 }, { "epoch": 0.34614926226968273, "grad_norm": 3.5696067810058594, "learning_rate": 4.1835424589193096e-07, "loss": 0.6863, "step": 7261 }, { "epoch": 0.34619693466498225, "grad_norm": 2.35353684425354, "learning_rate": 4.17230729239948e-07, "loss": 1.0326, "step": 7262 }, { "epoch": 0.34624460706028176, "grad_norm": 2.9793784618377686, "learning_rate": 4.161086911105816e-07, "loss": 1.1812, "step": 7263 }, { "epoch": 0.3462922794555812, "grad_norm": 1.5557667016983032, "learning_rate": 4.1498813167694776e-07, "loss": 1.0101, "step": 7264 }, { "epoch": 0.34633995185088073, "grad_norm": 1.3478249311447144, "learning_rate": 4.138690511119381e-07, "loss": 0.74, "step": 7265 }, { "epoch": 0.34638762424618025, "grad_norm": 2.1713931560516357, "learning_rate": 4.127514495882168e-07, "loss": 0.5199, "step": 7266 }, { "epoch": 0.34643529664147976, "grad_norm": 1.6194018125534058, "learning_rate": 4.1163532727821696e-07, "loss": 0.8982, "step": 7267 }, { "epoch": 0.3464829690367793, "grad_norm": 2.487565279006958, "learning_rate": 4.1052068435414426e-07, "loss": 0.8187, "step": 7268 }, { "epoch": 0.34653064143207873, "grad_norm": 1.365309476852417, "learning_rate": 4.094075209879789e-07, "loss": 0.7434, "step": 7269 }, { "epoch": 0.34657831382737825, "grad_norm": 2.3606138229370117, "learning_rate": 4.082958373514689e-07, "loss": 1.1335, "step": 7270 }, { "epoch": 0.34662598622267776, "grad_norm": 1.2571296691894531, "learning_rate": 4.0718563361613396e-07, "loss": 0.7632, "step": 7271 }, { "epoch": 0.3466736586179773, "grad_norm": 1.0391098260879517, "learning_rate": 4.060769099532713e-07, "loss": 0.3259, "step": 7272 }, { "epoch": 0.34672133101327673, "grad_norm": 1.6397120952606201, "learning_rate": 4.04969666533942e-07, "loss": 0.6466, "step": 7273 }, { "epoch": 0.34676900340857625, "grad_norm": 2.494081974029541, "learning_rate": 4.0386390352898376e-07, "loss": 0.9244, "step": 7274 }, { "epoch": 0.34681667580387576, "grad_norm": 1.679502010345459, "learning_rate": 4.0275962110900455e-07, "loss": 0.8833, "step": 7275 }, { "epoch": 0.3468643481991753, "grad_norm": 1.6045907735824585, "learning_rate": 4.016568194443826e-07, "loss": 0.6828, "step": 7276 }, { "epoch": 0.3469120205944748, "grad_norm": 1.292438268661499, "learning_rate": 4.0055549870526955e-07, "loss": 0.6701, "step": 7277 }, { "epoch": 0.34695969298977425, "grad_norm": 1.9425338506698608, "learning_rate": 3.9945565906158833e-07, "loss": 0.9244, "step": 7278 }, { "epoch": 0.34700736538507376, "grad_norm": 1.5062963962554932, "learning_rate": 3.9835730068303215e-07, "loss": 0.5208, "step": 7279 }, { "epoch": 0.3470550377803733, "grad_norm": 1.8522506952285767, "learning_rate": 3.9726042373906536e-07, "loss": 0.6404, "step": 7280 }, { "epoch": 0.3471027101756728, "grad_norm": 1.5433088541030884, "learning_rate": 3.961650283989282e-07, "loss": 0.5705, "step": 7281 }, { "epoch": 0.3471503825709723, "grad_norm": 3.9353222846984863, "learning_rate": 3.9507111483162554e-07, "loss": 1.1046, "step": 7282 }, { "epoch": 0.34719805496627176, "grad_norm": 2.1075096130371094, "learning_rate": 3.939786832059389e-07, "loss": 0.9925, "step": 7283 }, { "epoch": 0.3472457273615713, "grad_norm": 2.5660717487335205, "learning_rate": 3.928877336904191e-07, "loss": 1.3565, "step": 7284 }, { "epoch": 0.3472933997568708, "grad_norm": 3.5466341972351074, "learning_rate": 3.9179826645338594e-07, "loss": 0.2571, "step": 7285 }, { "epoch": 0.3473410721521703, "grad_norm": 1.2514286041259766, "learning_rate": 3.90710281662936e-07, "loss": 0.706, "step": 7286 }, { "epoch": 0.34738874454746976, "grad_norm": 1.27994966506958, "learning_rate": 3.8962377948693395e-07, "loss": 0.8089, "step": 7287 }, { "epoch": 0.3474364169427693, "grad_norm": 1.498083472251892, "learning_rate": 3.885387600930135e-07, "loss": 0.5761, "step": 7288 }, { "epoch": 0.3474840893380688, "grad_norm": 3.6953487396240234, "learning_rate": 3.8745522364858513e-07, "loss": 0.9285, "step": 7289 }, { "epoch": 0.3475317617333683, "grad_norm": 1.5337369441986084, "learning_rate": 3.86373170320824e-07, "loss": 0.8, "step": 7290 }, { "epoch": 0.3475794341286678, "grad_norm": 2.4488043785095215, "learning_rate": 3.8529260027668325e-07, "loss": 0.4423, "step": 7291 }, { "epoch": 0.3476271065239673, "grad_norm": 1.5359479188919067, "learning_rate": 3.842135136828806e-07, "loss": 0.5747, "step": 7292 }, { "epoch": 0.3476747789192668, "grad_norm": 2.4388535022735596, "learning_rate": 3.831359107059096e-07, "loss": 0.6063, "step": 7293 }, { "epoch": 0.3477224513145663, "grad_norm": 3.1549928188323975, "learning_rate": 3.8205979151203274e-07, "loss": 1.0557, "step": 7294 }, { "epoch": 0.3477701237098658, "grad_norm": 1.7099775075912476, "learning_rate": 3.809851562672839e-07, "loss": 0.4977, "step": 7295 }, { "epoch": 0.3478177961051653, "grad_norm": 1.210395336151123, "learning_rate": 3.799120051374694e-07, "loss": 0.4558, "step": 7296 }, { "epoch": 0.3478654685004648, "grad_norm": 1.3912314176559448, "learning_rate": 3.7884033828816556e-07, "loss": 0.7312, "step": 7297 }, { "epoch": 0.3479131408957643, "grad_norm": 2.0125679969787598, "learning_rate": 3.77770155884718e-07, "loss": 0.609, "step": 7298 }, { "epoch": 0.3479608132910638, "grad_norm": 3.271801233291626, "learning_rate": 3.7670145809224567e-07, "loss": 0.5904, "step": 7299 }, { "epoch": 0.34800848568636333, "grad_norm": 1.4114795923233032, "learning_rate": 3.7563424507563785e-07, "loss": 0.751, "step": 7300 }, { "epoch": 0.3480561580816628, "grad_norm": 1.7076895236968994, "learning_rate": 3.745685169995539e-07, "loss": 0.9077, "step": 7301 }, { "epoch": 0.3481038304769623, "grad_norm": 1.2552075386047363, "learning_rate": 3.7350427402842446e-07, "loss": 0.5939, "step": 7302 }, { "epoch": 0.3481515028722618, "grad_norm": 1.6350072622299194, "learning_rate": 3.7244151632645387e-07, "loss": 0.8362, "step": 7303 }, { "epoch": 0.34819917526756133, "grad_norm": 3.420283555984497, "learning_rate": 3.7138024405761197e-07, "loss": 0.512, "step": 7304 }, { "epoch": 0.34824684766286085, "grad_norm": 2.005946159362793, "learning_rate": 3.7032045738564114e-07, "loss": 0.4772, "step": 7305 }, { "epoch": 0.3482945200581603, "grad_norm": 1.7450584173202515, "learning_rate": 3.692621564740584e-07, "loss": 1.0413, "step": 7306 }, { "epoch": 0.3483421924534598, "grad_norm": 2.4184982776641846, "learning_rate": 3.682053414861475e-07, "loss": 0.9437, "step": 7307 }, { "epoch": 0.34838986484875933, "grad_norm": 1.6064496040344238, "learning_rate": 3.6715001258496365e-07, "loss": 0.3939, "step": 7308 }, { "epoch": 0.34843753724405885, "grad_norm": 1.8300976753234863, "learning_rate": 3.660961699333343e-07, "loss": 0.8987, "step": 7309 }, { "epoch": 0.3484852096393583, "grad_norm": 1.73401939868927, "learning_rate": 3.65043813693855e-07, "loss": 0.7753, "step": 7310 }, { "epoch": 0.3485328820346578, "grad_norm": 2.3060131072998047, "learning_rate": 3.6399294402889473e-07, "loss": 0.6279, "step": 7311 }, { "epoch": 0.34858055442995733, "grad_norm": 2.206881284713745, "learning_rate": 3.629435611005916e-07, "loss": 0.6554, "step": 7312 }, { "epoch": 0.34862822682525685, "grad_norm": 3.581489086151123, "learning_rate": 3.618956650708549e-07, "loss": 1.2522, "step": 7313 }, { "epoch": 0.34867589922055636, "grad_norm": 1.3536385297775269, "learning_rate": 3.608492561013632e-07, "loss": 0.4236, "step": 7314 }, { "epoch": 0.3487235716158558, "grad_norm": 1.4357553720474243, "learning_rate": 3.598043343535673e-07, "loss": 0.7097, "step": 7315 }, { "epoch": 0.34877124401115533, "grad_norm": 1.215166687965393, "learning_rate": 3.5876089998868825e-07, "loss": 0.6834, "step": 7316 }, { "epoch": 0.34881891640645485, "grad_norm": 1.62375009059906, "learning_rate": 3.577189531677161e-07, "loss": 0.4891, "step": 7317 }, { "epoch": 0.34886658880175436, "grad_norm": 4.09118127822876, "learning_rate": 3.566784940514145e-07, "loss": 0.6916, "step": 7318 }, { "epoch": 0.3489142611970538, "grad_norm": 1.5236778259277344, "learning_rate": 3.55639522800314e-07, "loss": 0.5245, "step": 7319 }, { "epoch": 0.34896193359235333, "grad_norm": 2.2369823455810547, "learning_rate": 3.546020395747163e-07, "loss": 0.8088, "step": 7320 }, { "epoch": 0.34900960598765285, "grad_norm": 4.75441312789917, "learning_rate": 3.5356604453469665e-07, "loss": 0.412, "step": 7321 }, { "epoch": 0.34905727838295236, "grad_norm": 2.548539400100708, "learning_rate": 3.525315378400962e-07, "loss": 0.7128, "step": 7322 }, { "epoch": 0.3491049507782519, "grad_norm": 2.4372811317443848, "learning_rate": 3.514985196505305e-07, "loss": 0.6168, "step": 7323 }, { "epoch": 0.34915262317355134, "grad_norm": 1.3216582536697388, "learning_rate": 3.504669901253832e-07, "loss": 0.7532, "step": 7324 }, { "epoch": 0.34920029556885085, "grad_norm": 1.550697922706604, "learning_rate": 3.4943694942380704e-07, "loss": 0.6972, "step": 7325 }, { "epoch": 0.34924796796415036, "grad_norm": 2.0736286640167236, "learning_rate": 3.484083977047281e-07, "loss": 0.9249, "step": 7326 }, { "epoch": 0.3492956403594499, "grad_norm": 1.6073224544525146, "learning_rate": 3.473813351268429e-07, "loss": 0.8632, "step": 7327 }, { "epoch": 0.34934331275474934, "grad_norm": 1.408355474472046, "learning_rate": 3.463557618486135e-07, "loss": 0.2324, "step": 7328 }, { "epoch": 0.34939098515004885, "grad_norm": 1.9746421575546265, "learning_rate": 3.453316780282767e-07, "loss": 0.7641, "step": 7329 }, { "epoch": 0.34943865754534836, "grad_norm": 1.434739351272583, "learning_rate": 3.4430908382383944e-07, "loss": 0.7244, "step": 7330 }, { "epoch": 0.3494863299406479, "grad_norm": 1.6043431758880615, "learning_rate": 3.4328797939307435e-07, "loss": 0.7228, "step": 7331 }, { "epoch": 0.3495340023359474, "grad_norm": 2.1351242065429688, "learning_rate": 3.4226836489352987e-07, "loss": 0.3226, "step": 7332 }, { "epoch": 0.34958167473124685, "grad_norm": 1.3221118450164795, "learning_rate": 3.412502404825224e-07, "loss": 0.791, "step": 7333 }, { "epoch": 0.34962934712654636, "grad_norm": 1.9221763610839844, "learning_rate": 3.402336063171352e-07, "loss": 0.6204, "step": 7334 }, { "epoch": 0.3496770195218459, "grad_norm": 1.3127278089523315, "learning_rate": 3.392184625542283e-07, "loss": 0.7301, "step": 7335 }, { "epoch": 0.3497246919171454, "grad_norm": 2.440467357635498, "learning_rate": 3.382048093504242e-07, "loss": 0.9175, "step": 7336 }, { "epoch": 0.3497723643124449, "grad_norm": 2.429494857788086, "learning_rate": 3.371926468621212e-07, "loss": 0.8546, "step": 7337 }, { "epoch": 0.34982003670774436, "grad_norm": 2.4686343669891357, "learning_rate": 3.3618197524548534e-07, "loss": 0.7445, "step": 7338 }, { "epoch": 0.3498677091030439, "grad_norm": 1.4753676652908325, "learning_rate": 3.3517279465645204e-07, "loss": 0.5532, "step": 7339 }, { "epoch": 0.3499153814983434, "grad_norm": 5.123486042022705, "learning_rate": 3.3416510525072886e-07, "loss": 1.1981, "step": 7340 }, { "epoch": 0.3499630538936429, "grad_norm": 1.894282579421997, "learning_rate": 3.331589071837904e-07, "loss": 0.7651, "step": 7341 }, { "epoch": 0.35001072628894236, "grad_norm": 1.8569226264953613, "learning_rate": 3.3215420061088245e-07, "loss": 0.8511, "step": 7342 }, { "epoch": 0.3500583986842419, "grad_norm": 2.3166210651397705, "learning_rate": 3.311509856870243e-07, "loss": 0.7973, "step": 7343 }, { "epoch": 0.3501060710795414, "grad_norm": 1.7932738065719604, "learning_rate": 3.3014926256699665e-07, "loss": 0.6125, "step": 7344 }, { "epoch": 0.3501537434748409, "grad_norm": 1.515670895576477, "learning_rate": 3.2914903140535914e-07, "loss": 0.8435, "step": 7345 }, { "epoch": 0.3502014158701404, "grad_norm": 3.0000510215759277, "learning_rate": 3.2815029235643505e-07, "loss": 0.6665, "step": 7346 }, { "epoch": 0.3502490882654399, "grad_norm": 1.9593795537948608, "learning_rate": 3.2715304557431994e-07, "loss": 0.493, "step": 7347 }, { "epoch": 0.3502967606607394, "grad_norm": 1.7829817533493042, "learning_rate": 3.261572912128796e-07, "loss": 0.2872, "step": 7348 }, { "epoch": 0.3503444330560389, "grad_norm": 1.2360681295394897, "learning_rate": 3.2516302942574794e-07, "loss": 0.4964, "step": 7349 }, { "epoch": 0.3503921054513384, "grad_norm": 1.6936216354370117, "learning_rate": 3.241702603663288e-07, "loss": 0.888, "step": 7350 }, { "epoch": 0.3504397778466379, "grad_norm": 2.7934439182281494, "learning_rate": 3.2317898418779634e-07, "loss": 0.7825, "step": 7351 }, { "epoch": 0.3504874502419374, "grad_norm": 2.769547939300537, "learning_rate": 3.2218920104309605e-07, "loss": 0.62, "step": 7352 }, { "epoch": 0.3505351226372369, "grad_norm": 1.5975010395050049, "learning_rate": 3.212009110849379e-07, "loss": 1.0273, "step": 7353 }, { "epoch": 0.3505827950325364, "grad_norm": 1.4424394369125366, "learning_rate": 3.2021411446580774e-07, "loss": 0.8914, "step": 7354 }, { "epoch": 0.35063046742783593, "grad_norm": 2.261087656021118, "learning_rate": 3.1922881133795827e-07, "loss": 0.9779, "step": 7355 }, { "epoch": 0.3506781398231354, "grad_norm": 6.544918537139893, "learning_rate": 3.182450018534089e-07, "loss": 0.6098, "step": 7356 }, { "epoch": 0.3507258122184349, "grad_norm": 1.3822834491729736, "learning_rate": 3.1726268616395273e-07, "loss": 0.8185, "step": 7357 }, { "epoch": 0.3507734846137344, "grad_norm": 1.5750081539154053, "learning_rate": 3.1628186442115294e-07, "loss": 0.7656, "step": 7358 }, { "epoch": 0.35082115700903393, "grad_norm": 1.9999806880950928, "learning_rate": 3.1530253677633625e-07, "loss": 0.874, "step": 7359 }, { "epoch": 0.3508688294043334, "grad_norm": 1.9656327962875366, "learning_rate": 3.143247033806063e-07, "loss": 1.2236, "step": 7360 }, { "epoch": 0.3509165017996329, "grad_norm": 1.8565436601638794, "learning_rate": 3.133483643848323e-07, "loss": 0.8105, "step": 7361 }, { "epoch": 0.3509641741949324, "grad_norm": 2.700535774230957, "learning_rate": 3.123735199396516e-07, "loss": 1.1901, "step": 7362 }, { "epoch": 0.35101184659023194, "grad_norm": 1.5758708715438843, "learning_rate": 3.1140017019547385e-07, "loss": 0.5274, "step": 7363 }, { "epoch": 0.35105951898553145, "grad_norm": 2.287595748901367, "learning_rate": 3.1042831530247566e-07, "loss": 0.7682, "step": 7364 }, { "epoch": 0.3511071913808309, "grad_norm": 2.339594602584839, "learning_rate": 3.0945795541060696e-07, "loss": 0.8061, "step": 7365 }, { "epoch": 0.3511548637761304, "grad_norm": 3.9156291484832764, "learning_rate": 3.0848909066958035e-07, "loss": 0.433, "step": 7366 }, { "epoch": 0.35120253617142994, "grad_norm": 1.3135124444961548, "learning_rate": 3.07521721228885e-07, "loss": 0.8699, "step": 7367 }, { "epoch": 0.35125020856672945, "grad_norm": 3.769801616668701, "learning_rate": 3.06555847237775e-07, "loss": 0.4134, "step": 7368 }, { "epoch": 0.35129788096202896, "grad_norm": 2.105705738067627, "learning_rate": 3.0559146884527324e-07, "loss": 0.7116, "step": 7369 }, { "epoch": 0.3513455533573284, "grad_norm": 1.5594096183776855, "learning_rate": 3.0462858620017633e-07, "loss": 0.7299, "step": 7370 }, { "epoch": 0.35139322575262794, "grad_norm": 2.128023386001587, "learning_rate": 3.0366719945104427e-07, "loss": 1.5402, "step": 7371 }, { "epoch": 0.35144089814792745, "grad_norm": 1.2129205465316772, "learning_rate": 3.027073087462107e-07, "loss": 0.4955, "step": 7372 }, { "epoch": 0.35148857054322696, "grad_norm": 1.6877939701080322, "learning_rate": 3.0174891423377595e-07, "loss": 0.6331, "step": 7373 }, { "epoch": 0.3515362429385264, "grad_norm": 1.6392323970794678, "learning_rate": 3.007920160616129e-07, "loss": 0.7552, "step": 7374 }, { "epoch": 0.35158391533382594, "grad_norm": 2.563945770263672, "learning_rate": 2.998366143773579e-07, "loss": 0.6561, "step": 7375 }, { "epoch": 0.35163158772912545, "grad_norm": 5.188019752502441, "learning_rate": 2.988827093284219e-07, "loss": 0.9177, "step": 7376 }, { "epoch": 0.35167926012442496, "grad_norm": 1.811829924583435, "learning_rate": 2.9793030106198164e-07, "loss": 0.6108, "step": 7377 }, { "epoch": 0.3517269325197245, "grad_norm": 1.7487602233886719, "learning_rate": 2.9697938972498287e-07, "loss": 0.6295, "step": 7378 }, { "epoch": 0.35177460491502394, "grad_norm": 1.8587628602981567, "learning_rate": 2.960299754641438e-07, "loss": 0.7385, "step": 7379 }, { "epoch": 0.35182227731032345, "grad_norm": 2.0606086254119873, "learning_rate": 2.9508205842594727e-07, "loss": 0.5784, "step": 7380 }, { "epoch": 0.35186994970562296, "grad_norm": 1.1561492681503296, "learning_rate": 2.941356387566474e-07, "loss": 0.5408, "step": 7381 }, { "epoch": 0.3519176221009225, "grad_norm": 1.6568877696990967, "learning_rate": 2.9319071660226737e-07, "loss": 0.8053, "step": 7382 }, { "epoch": 0.35196529449622194, "grad_norm": 1.2379618883132935, "learning_rate": 2.922472921086006e-07, "loss": 0.5469, "step": 7383 }, { "epoch": 0.35201296689152145, "grad_norm": 4.19793176651001, "learning_rate": 2.913053654212039e-07, "loss": 0.3475, "step": 7384 }, { "epoch": 0.35206063928682096, "grad_norm": 1.9430595636367798, "learning_rate": 2.9036493668541e-07, "loss": 0.9096, "step": 7385 }, { "epoch": 0.3521083116821205, "grad_norm": 3.407219171524048, "learning_rate": 2.894260060463172e-07, "loss": 0.6603, "step": 7386 }, { "epoch": 0.35215598407742, "grad_norm": 1.756489634513855, "learning_rate": 2.884885736487919e-07, "loss": 0.7108, "step": 7387 }, { "epoch": 0.35220365647271945, "grad_norm": 2.1382393836975098, "learning_rate": 2.875526396374695e-07, "loss": 0.1815, "step": 7388 }, { "epoch": 0.35225132886801896, "grad_norm": 1.0845798254013062, "learning_rate": 2.866182041567567e-07, "loss": 0.5873, "step": 7389 }, { "epoch": 0.3522990012633185, "grad_norm": 1.7142635583877563, "learning_rate": 2.856852673508259e-07, "loss": 0.9046, "step": 7390 }, { "epoch": 0.352346673658618, "grad_norm": 0.9403621554374695, "learning_rate": 2.8475382936362095e-07, "loss": 0.3357, "step": 7391 }, { "epoch": 0.3523943460539175, "grad_norm": 1.4498497247695923, "learning_rate": 2.838238903388524e-07, "loss": 0.8775, "step": 7392 }, { "epoch": 0.35244201844921696, "grad_norm": 1.181903600692749, "learning_rate": 2.828954504199999e-07, "loss": 0.7466, "step": 7393 }, { "epoch": 0.3524896908445165, "grad_norm": 1.3346874713897705, "learning_rate": 2.819685097503133e-07, "loss": 0.7473, "step": 7394 }, { "epoch": 0.352537363239816, "grad_norm": 3.614489793777466, "learning_rate": 2.810430684728094e-07, "loss": 1.2224, "step": 7395 }, { "epoch": 0.3525850356351155, "grad_norm": 1.2386598587036133, "learning_rate": 2.8011912673027274e-07, "loss": 0.6399, "step": 7396 }, { "epoch": 0.35263270803041497, "grad_norm": 2.0606119632720947, "learning_rate": 2.791966846652594e-07, "loss": 1.0436, "step": 7397 }, { "epoch": 0.3526803804257145, "grad_norm": 3.0070900917053223, "learning_rate": 2.7827574242009434e-07, "loss": 0.9309, "step": 7398 }, { "epoch": 0.352728052821014, "grad_norm": 1.8840301036834717, "learning_rate": 2.773563001368673e-07, "loss": 0.6555, "step": 7399 }, { "epoch": 0.3527757252163135, "grad_norm": 10.14394474029541, "learning_rate": 2.764383579574381e-07, "loss": 0.2678, "step": 7400 }, { "epoch": 0.352823397611613, "grad_norm": 1.8069679737091064, "learning_rate": 2.75521916023439e-07, "loss": 0.7415, "step": 7401 }, { "epoch": 0.3528710700069125, "grad_norm": 1.3674018383026123, "learning_rate": 2.7460697447626363e-07, "loss": 0.6104, "step": 7402 }, { "epoch": 0.352918742402212, "grad_norm": 1.916283130645752, "learning_rate": 2.7369353345708006e-07, "loss": 0.8912, "step": 7403 }, { "epoch": 0.3529664147975115, "grad_norm": 3.7305352687835693, "learning_rate": 2.727815931068234e-07, "loss": 0.6913, "step": 7404 }, { "epoch": 0.353014087192811, "grad_norm": 1.9718384742736816, "learning_rate": 2.7187115356619553e-07, "loss": 0.6206, "step": 7405 }, { "epoch": 0.3530617595881105, "grad_norm": 1.2205933332443237, "learning_rate": 2.7096221497566853e-07, "loss": 0.6263, "step": 7406 }, { "epoch": 0.35310943198341, "grad_norm": 2.4745607376098633, "learning_rate": 2.7005477747548245e-07, "loss": 0.9387, "step": 7407 }, { "epoch": 0.3531571043787095, "grad_norm": 0.9382553696632385, "learning_rate": 2.691488412056442e-07, "loss": 0.3974, "step": 7408 }, { "epoch": 0.353204776774009, "grad_norm": 2.874303102493286, "learning_rate": 2.682444063059331e-07, "loss": 0.952, "step": 7409 }, { "epoch": 0.35325244916930854, "grad_norm": 1.4257711172103882, "learning_rate": 2.6734147291589075e-07, "loss": 0.6688, "step": 7410 }, { "epoch": 0.353300121564608, "grad_norm": 2.292304754257202, "learning_rate": 2.6644004117483357e-07, "loss": 0.7714, "step": 7411 }, { "epoch": 0.3533477939599075, "grad_norm": 1.4404491186141968, "learning_rate": 2.655401112218403e-07, "loss": 0.4795, "step": 7412 }, { "epoch": 0.353395466355207, "grad_norm": 1.6066272258758545, "learning_rate": 2.646416831957621e-07, "loss": 0.7284, "step": 7413 }, { "epoch": 0.35344313875050654, "grad_norm": 1.872521996498108, "learning_rate": 2.637447572352192e-07, "loss": 0.8242, "step": 7414 }, { "epoch": 0.353490811145806, "grad_norm": 1.0502004623413086, "learning_rate": 2.6284933347859534e-07, "loss": 0.573, "step": 7415 }, { "epoch": 0.3535384835411055, "grad_norm": 4.0295586585998535, "learning_rate": 2.619554120640455e-07, "loss": 1.4406, "step": 7416 }, { "epoch": 0.353586155936405, "grad_norm": 2.506178617477417, "learning_rate": 2.610629931294939e-07, "loss": 0.6825, "step": 7417 }, { "epoch": 0.35363382833170454, "grad_norm": 1.5094168186187744, "learning_rate": 2.6017207681263033e-07, "loss": 0.689, "step": 7418 }, { "epoch": 0.35368150072700405, "grad_norm": 1.7175939083099365, "learning_rate": 2.5928266325091377e-07, "loss": 1.0043, "step": 7419 }, { "epoch": 0.3537291731223035, "grad_norm": 2.555274248123169, "learning_rate": 2.583947525815733e-07, "loss": 0.4071, "step": 7420 }, { "epoch": 0.353776845517603, "grad_norm": 1.615445613861084, "learning_rate": 2.575083449416038e-07, "loss": 0.5633, "step": 7421 }, { "epoch": 0.35382451791290254, "grad_norm": 3.040313482284546, "learning_rate": 2.5662344046776697e-07, "loss": 0.2669, "step": 7422 }, { "epoch": 0.35387219030820205, "grad_norm": 2.82405424118042, "learning_rate": 2.5574003929659697e-07, "loss": 0.6578, "step": 7423 }, { "epoch": 0.35391986270350156, "grad_norm": 1.4721612930297852, "learning_rate": 2.548581415643936e-07, "loss": 0.5962, "step": 7424 }, { "epoch": 0.353967535098801, "grad_norm": 1.5721673965454102, "learning_rate": 2.5397774740722134e-07, "loss": 0.4572, "step": 7425 }, { "epoch": 0.35401520749410054, "grad_norm": 1.8483844995498657, "learning_rate": 2.5309885696091943e-07, "loss": 0.7054, "step": 7426 }, { "epoch": 0.35406287988940005, "grad_norm": 1.6586484909057617, "learning_rate": 2.5222147036108925e-07, "loss": 0.695, "step": 7427 }, { "epoch": 0.35411055228469956, "grad_norm": 1.7140986919403076, "learning_rate": 2.513455877431037e-07, "loss": 0.7036, "step": 7428 }, { "epoch": 0.354158224679999, "grad_norm": 1.3403490781784058, "learning_rate": 2.5047120924210243e-07, "loss": 0.8366, "step": 7429 }, { "epoch": 0.35420589707529854, "grad_norm": 1.1276532411575317, "learning_rate": 2.4959833499299314e-07, "loss": 0.6857, "step": 7430 }, { "epoch": 0.35425356947059805, "grad_norm": 2.1477253437042236, "learning_rate": 2.4872696513045025e-07, "loss": 0.8715, "step": 7431 }, { "epoch": 0.35430124186589756, "grad_norm": 1.7946161031723022, "learning_rate": 2.478570997889185e-07, "loss": 0.5852, "step": 7432 }, { "epoch": 0.3543489142611971, "grad_norm": 1.7223455905914307, "learning_rate": 2.4698873910260824e-07, "loss": 0.3981, "step": 7433 }, { "epoch": 0.35439658665649654, "grad_norm": 1.918836236000061, "learning_rate": 2.46121883205499e-07, "loss": 0.7247, "step": 7434 }, { "epoch": 0.35444425905179605, "grad_norm": 2.0484137535095215, "learning_rate": 2.452565322313383e-07, "loss": 0.6482, "step": 7435 }, { "epoch": 0.35449193144709557, "grad_norm": 1.259306788444519, "learning_rate": 2.4439268631363924e-07, "loss": 0.5333, "step": 7436 }, { "epoch": 0.3545396038423951, "grad_norm": 1.4508843421936035, "learning_rate": 2.435303455856863e-07, "loss": 1.1334, "step": 7437 }, { "epoch": 0.35458727623769454, "grad_norm": 2.5092461109161377, "learning_rate": 2.426695101805288e-07, "loss": 0.9257, "step": 7438 }, { "epoch": 0.35463494863299405, "grad_norm": 1.3373315334320068, "learning_rate": 2.418101802309847e-07, "loss": 0.7458, "step": 7439 }, { "epoch": 0.35468262102829357, "grad_norm": 1.5031764507293701, "learning_rate": 2.4095235586963916e-07, "loss": 0.7301, "step": 7440 }, { "epoch": 0.3547302934235931, "grad_norm": 3.019455671310425, "learning_rate": 2.4009603722884745e-07, "loss": 0.8141, "step": 7441 }, { "epoch": 0.3547779658188926, "grad_norm": 1.7121657133102417, "learning_rate": 2.392412244407294e-07, "loss": 0.7722, "step": 7442 }, { "epoch": 0.35482563821419205, "grad_norm": 1.6741019487380981, "learning_rate": 2.3838791763717283e-07, "loss": 0.7177, "step": 7443 }, { "epoch": 0.35487331060949157, "grad_norm": 2.3587350845336914, "learning_rate": 2.3753611694983693e-07, "loss": 1.0375, "step": 7444 }, { "epoch": 0.3549209830047911, "grad_norm": 2.358966827392578, "learning_rate": 2.3668582251014316e-07, "loss": 1.105, "step": 7445 }, { "epoch": 0.3549686554000906, "grad_norm": 1.7531901597976685, "learning_rate": 2.3583703444928442e-07, "loss": 0.6243, "step": 7446 }, { "epoch": 0.35501632779539005, "grad_norm": 1.5743778944015503, "learning_rate": 2.3498975289822035e-07, "loss": 0.6565, "step": 7447 }, { "epoch": 0.35506400019068957, "grad_norm": 1.841613531112671, "learning_rate": 2.341439779876775e-07, "loss": 0.3976, "step": 7448 }, { "epoch": 0.3551116725859891, "grad_norm": 1.8073867559432983, "learning_rate": 2.3329970984814932e-07, "loss": 1.0816, "step": 7449 }, { "epoch": 0.3551593449812886, "grad_norm": 5.236608982086182, "learning_rate": 2.324569486098982e-07, "loss": 1.1874, "step": 7450 }, { "epoch": 0.3552070173765881, "grad_norm": 2.4420433044433594, "learning_rate": 2.3161569440295462e-07, "loss": 1.1185, "step": 7451 }, { "epoch": 0.35525468977188757, "grad_norm": 1.7360776662826538, "learning_rate": 2.307759473571136e-07, "loss": 0.7692, "step": 7452 }, { "epoch": 0.3553023621671871, "grad_norm": 0.9339656233787537, "learning_rate": 2.2993770760194044e-07, "loss": 0.2735, "step": 7453 }, { "epoch": 0.3553500345624866, "grad_norm": 1.9637995958328247, "learning_rate": 2.2910097526676723e-07, "loss": 0.8263, "step": 7454 }, { "epoch": 0.3553977069577861, "grad_norm": 1.7706027030944824, "learning_rate": 2.2826575048069287e-07, "loss": 0.784, "step": 7455 }, { "epoch": 0.3554453793530856, "grad_norm": 1.8194026947021484, "learning_rate": 2.2743203337258323e-07, "loss": 0.9633, "step": 7456 }, { "epoch": 0.3554930517483851, "grad_norm": 1.6137317419052124, "learning_rate": 2.2659982407107427e-07, "loss": 0.9154, "step": 7457 }, { "epoch": 0.3555407241436846, "grad_norm": 1.2820813655853271, "learning_rate": 2.2576912270456442e-07, "loss": 0.6613, "step": 7458 }, { "epoch": 0.3555883965389841, "grad_norm": 1.43150794506073, "learning_rate": 2.2493992940122334e-07, "loss": 0.8796, "step": 7459 }, { "epoch": 0.3556360689342836, "grad_norm": 1.364145040512085, "learning_rate": 2.241122442889887e-07, "loss": 0.9293, "step": 7460 }, { "epoch": 0.3556837413295831, "grad_norm": 2.3687691688537598, "learning_rate": 2.232860674955617e-07, "loss": 0.6651, "step": 7461 }, { "epoch": 0.3557314137248826, "grad_norm": 1.84059739112854, "learning_rate": 2.224613991484148e-07, "loss": 0.7915, "step": 7462 }, { "epoch": 0.3557790861201821, "grad_norm": 1.2504510879516602, "learning_rate": 2.2163823937478512e-07, "loss": 1.0127, "step": 7463 }, { "epoch": 0.3558267585154816, "grad_norm": 4.887526035308838, "learning_rate": 2.2081658830167552e-07, "loss": 0.745, "step": 7464 }, { "epoch": 0.35587443091078114, "grad_norm": 2.2528183460235596, "learning_rate": 2.1999644605586122e-07, "loss": 0.8482, "step": 7465 }, { "epoch": 0.3559221033060806, "grad_norm": 1.9308404922485352, "learning_rate": 2.1917781276388217e-07, "loss": 0.7044, "step": 7466 }, { "epoch": 0.3559697757013801, "grad_norm": 3.6642274856567383, "learning_rate": 2.1836068855204174e-07, "loss": 1.0077, "step": 7467 }, { "epoch": 0.3560174480966796, "grad_norm": 1.5573298931121826, "learning_rate": 2.1754507354641686e-07, "loss": 0.9672, "step": 7468 }, { "epoch": 0.35606512049197914, "grad_norm": 1.3317711353302002, "learning_rate": 2.1673096787284686e-07, "loss": 0.8061, "step": 7469 }, { "epoch": 0.3561127928872786, "grad_norm": 1.3915892839431763, "learning_rate": 2.1591837165694018e-07, "loss": 0.5843, "step": 7470 }, { "epoch": 0.3561604652825781, "grad_norm": 2.037578582763672, "learning_rate": 2.1510728502407206e-07, "loss": 0.8099, "step": 7471 }, { "epoch": 0.3562081376778776, "grad_norm": 1.5721982717514038, "learning_rate": 2.1429770809938577e-07, "loss": 0.8496, "step": 7472 }, { "epoch": 0.35625581007317714, "grad_norm": 2.135315179824829, "learning_rate": 2.1348964100778914e-07, "loss": 0.7808, "step": 7473 }, { "epoch": 0.35630348246847665, "grad_norm": 1.717445731163025, "learning_rate": 2.1268308387395908e-07, "loss": 0.7428, "step": 7474 }, { "epoch": 0.3563511548637761, "grad_norm": 7.698898792266846, "learning_rate": 2.1187803682234055e-07, "loss": 1.355, "step": 7475 }, { "epoch": 0.3563988272590756, "grad_norm": 1.9036109447479248, "learning_rate": 2.110744999771419e-07, "loss": 0.7164, "step": 7476 }, { "epoch": 0.35644649965437514, "grad_norm": 1.195804238319397, "learning_rate": 2.102724734623407e-07, "loss": 0.7788, "step": 7477 }, { "epoch": 0.35649417204967465, "grad_norm": 1.4893684387207031, "learning_rate": 2.0947195740168347e-07, "loss": 0.4109, "step": 7478 }, { "epoch": 0.35654184444497417, "grad_norm": 1.3249890804290771, "learning_rate": 2.086729519186803e-07, "loss": 0.7046, "step": 7479 }, { "epoch": 0.3565895168402736, "grad_norm": 1.275384783744812, "learning_rate": 2.0787545713660817e-07, "loss": 0.6835, "step": 7480 }, { "epoch": 0.35663718923557314, "grad_norm": 1.1132960319519043, "learning_rate": 2.0707947317851528e-07, "loss": 0.5525, "step": 7481 }, { "epoch": 0.35668486163087265, "grad_norm": 3.5884432792663574, "learning_rate": 2.062850001672112e-07, "loss": 0.7763, "step": 7482 }, { "epoch": 0.35673253402617217, "grad_norm": 2.164539098739624, "learning_rate": 2.0549203822527675e-07, "loss": 0.716, "step": 7483 }, { "epoch": 0.3567802064214716, "grad_norm": 1.525444507598877, "learning_rate": 2.0470058747505516e-07, "loss": 0.6758, "step": 7484 }, { "epoch": 0.35682787881677114, "grad_norm": 1.6143829822540283, "learning_rate": 2.0391064803866213e-07, "loss": 0.6231, "step": 7485 }, { "epoch": 0.35687555121207065, "grad_norm": 1.754177451133728, "learning_rate": 2.0312222003797565e-07, "loss": 0.3647, "step": 7486 }, { "epoch": 0.35692322360737017, "grad_norm": 1.4544939994812012, "learning_rate": 2.0233530359464183e-07, "loss": 0.3695, "step": 7487 }, { "epoch": 0.3569708960026697, "grad_norm": 1.8666043281555176, "learning_rate": 2.0154989883007458e-07, "loss": 0.4506, "step": 7488 }, { "epoch": 0.35701856839796914, "grad_norm": 3.311408042907715, "learning_rate": 2.007660058654537e-07, "loss": 0.7648, "step": 7489 }, { "epoch": 0.35706624079326865, "grad_norm": 2.4828367233276367, "learning_rate": 1.9998362482172462e-07, "loss": 0.5591, "step": 7490 }, { "epoch": 0.35711391318856817, "grad_norm": 3.6499619483947754, "learning_rate": 1.9920275581960303e-07, "loss": 0.3009, "step": 7491 }, { "epoch": 0.3571615855838677, "grad_norm": 2.7179312705993652, "learning_rate": 1.9842339897956585e-07, "loss": 0.6891, "step": 7492 }, { "epoch": 0.35720925797916714, "grad_norm": 2.9030332565307617, "learning_rate": 1.976455544218625e-07, "loss": 0.8243, "step": 7493 }, { "epoch": 0.35725693037446665, "grad_norm": 10.5089693069458, "learning_rate": 1.9686922226650584e-07, "loss": 0.7606, "step": 7494 }, { "epoch": 0.35730460276976617, "grad_norm": 1.1255024671554565, "learning_rate": 1.960944026332745e-07, "loss": 0.6785, "step": 7495 }, { "epoch": 0.3573522751650657, "grad_norm": 1.5652834177017212, "learning_rate": 1.953210956417162e-07, "loss": 0.776, "step": 7496 }, { "epoch": 0.3573999475603652, "grad_norm": 1.469054937362671, "learning_rate": 1.9454930141114546e-07, "loss": 0.88, "step": 7497 }, { "epoch": 0.35744761995566465, "grad_norm": 1.9115630388259888, "learning_rate": 1.9377902006063932e-07, "loss": 0.6954, "step": 7498 }, { "epoch": 0.35749529235096417, "grad_norm": 1.575408697128296, "learning_rate": 1.930102517090471e-07, "loss": 0.7336, "step": 7499 }, { "epoch": 0.3575429647462637, "grad_norm": 2.5314137935638428, "learning_rate": 1.9224299647498058e-07, "loss": 0.6655, "step": 7500 }, { "epoch": 0.3575906371415632, "grad_norm": 1.3152841329574585, "learning_rate": 1.9147725447681841e-07, "loss": 0.2922, "step": 7501 }, { "epoch": 0.35763830953686265, "grad_norm": 1.0879100561141968, "learning_rate": 1.9071302583270724e-07, "loss": 0.8709, "step": 7502 }, { "epoch": 0.35768598193216217, "grad_norm": 5.678489685058594, "learning_rate": 1.8995031066056157e-07, "loss": 0.6401, "step": 7503 }, { "epoch": 0.3577336543274617, "grad_norm": 2.0973715782165527, "learning_rate": 1.8918910907805733e-07, "loss": 0.6481, "step": 7504 }, { "epoch": 0.3577813267227612, "grad_norm": 1.4970685243606567, "learning_rate": 1.8842942120264272e-07, "loss": 0.8114, "step": 7505 }, { "epoch": 0.3578289991180607, "grad_norm": 1.6479661464691162, "learning_rate": 1.8767124715152962e-07, "loss": 0.7586, "step": 7506 }, { "epoch": 0.35787667151336017, "grad_norm": 1.0549038648605347, "learning_rate": 1.8691458704169442e-07, "loss": 0.7447, "step": 7507 }, { "epoch": 0.3579243439086597, "grad_norm": 2.5356662273406982, "learning_rate": 1.861594409898826e-07, "loss": 0.4792, "step": 7508 }, { "epoch": 0.3579720163039592, "grad_norm": 2.0688745975494385, "learning_rate": 1.8540580911260764e-07, "loss": 1.3452, "step": 7509 }, { "epoch": 0.3580196886992587, "grad_norm": 2.8586885929107666, "learning_rate": 1.846536915261443e-07, "loss": 0.87, "step": 7510 }, { "epoch": 0.3580673610945582, "grad_norm": 1.8569843769073486, "learning_rate": 1.839030883465387e-07, "loss": 0.7651, "step": 7511 }, { "epoch": 0.3581150334898577, "grad_norm": 1.5996110439300537, "learning_rate": 1.8315399968960036e-07, "loss": 1.2527, "step": 7512 }, { "epoch": 0.3581627058851572, "grad_norm": 1.7045109272003174, "learning_rate": 1.824064256709046e-07, "loss": 0.8132, "step": 7513 }, { "epoch": 0.3582103782804567, "grad_norm": 1.5336089134216309, "learning_rate": 1.8166036640579697e-07, "loss": 0.8086, "step": 7514 }, { "epoch": 0.3582580506757562, "grad_norm": 1.3751543760299683, "learning_rate": 1.8091582200938652e-07, "loss": 0.72, "step": 7515 }, { "epoch": 0.3583057230710557, "grad_norm": 2.3323755264282227, "learning_rate": 1.8017279259654574e-07, "loss": 0.8241, "step": 7516 }, { "epoch": 0.3583533954663552, "grad_norm": 2.1635584831237793, "learning_rate": 1.7943127828191852e-07, "loss": 0.2504, "step": 7517 }, { "epoch": 0.3584010678616547, "grad_norm": 2.0600357055664062, "learning_rate": 1.7869127917991446e-07, "loss": 0.9565, "step": 7518 }, { "epoch": 0.3584487402569542, "grad_norm": 2.327176094055176, "learning_rate": 1.7795279540470446e-07, "loss": 0.5631, "step": 7519 }, { "epoch": 0.35849641265225374, "grad_norm": 1.288893699645996, "learning_rate": 1.7721582707023065e-07, "loss": 0.5399, "step": 7520 }, { "epoch": 0.3585440850475532, "grad_norm": 1.3500419855117798, "learning_rate": 1.7648037429019993e-07, "loss": 0.7824, "step": 7521 }, { "epoch": 0.3585917574428527, "grad_norm": 1.9134793281555176, "learning_rate": 1.7574643717808483e-07, "loss": 0.4218, "step": 7522 }, { "epoch": 0.3586394298381522, "grad_norm": 1.9522933959960938, "learning_rate": 1.7501401584712475e-07, "loss": 0.7751, "step": 7523 }, { "epoch": 0.35868710223345174, "grad_norm": 1.2565008401870728, "learning_rate": 1.7428311041032264e-07, "loss": 0.3415, "step": 7524 }, { "epoch": 0.3587347746287512, "grad_norm": 2.849976062774658, "learning_rate": 1.7355372098045274e-07, "loss": 0.7996, "step": 7525 }, { "epoch": 0.3587824470240507, "grad_norm": 2.3293449878692627, "learning_rate": 1.7282584767005062e-07, "loss": 0.8856, "step": 7526 }, { "epoch": 0.3588301194193502, "grad_norm": 1.8338991403579712, "learning_rate": 1.7209949059142084e-07, "loss": 0.7683, "step": 7527 }, { "epoch": 0.35887779181464974, "grad_norm": 1.9646183252334595, "learning_rate": 1.7137464985663045e-07, "loss": 0.6699, "step": 7528 }, { "epoch": 0.35892546420994925, "grad_norm": 1.0771929025650024, "learning_rate": 1.7065132557751662e-07, "loss": 0.2353, "step": 7529 }, { "epoch": 0.3589731366052487, "grad_norm": 1.8474793434143066, "learning_rate": 1.6992951786568123e-07, "loss": 0.8408, "step": 7530 }, { "epoch": 0.3590208090005482, "grad_norm": 1.6009588241577148, "learning_rate": 1.6920922683249076e-07, "loss": 1.0103, "step": 7531 }, { "epoch": 0.35906848139584774, "grad_norm": 3.1210105419158936, "learning_rate": 1.6849045258907848e-07, "loss": 0.7174, "step": 7532 }, { "epoch": 0.35911615379114725, "grad_norm": 2.085829496383667, "learning_rate": 1.677731952463446e-07, "loss": 0.9586, "step": 7533 }, { "epoch": 0.3591638261864467, "grad_norm": 1.3121978044509888, "learning_rate": 1.6705745491495394e-07, "loss": 0.6006, "step": 7534 }, { "epoch": 0.3592114985817462, "grad_norm": 2.7522873878479004, "learning_rate": 1.6634323170533928e-07, "loss": 0.6786, "step": 7535 }, { "epoch": 0.35925917097704574, "grad_norm": 1.1993236541748047, "learning_rate": 1.6563052572769578e-07, "loss": 0.1711, "step": 7536 }, { "epoch": 0.35930684337234525, "grad_norm": 1.1851755380630493, "learning_rate": 1.649193370919888e-07, "loss": 0.7118, "step": 7537 }, { "epoch": 0.35935451576764477, "grad_norm": 1.2754336595535278, "learning_rate": 1.6420966590794617e-07, "loss": 0.5486, "step": 7538 }, { "epoch": 0.3594021881629442, "grad_norm": 1.8204423189163208, "learning_rate": 1.6350151228506251e-07, "loss": 0.8391, "step": 7539 }, { "epoch": 0.35944986055824374, "grad_norm": 1.747872233390808, "learning_rate": 1.6279487633259926e-07, "loss": 0.7039, "step": 7540 }, { "epoch": 0.35949753295354325, "grad_norm": 1.7669453620910645, "learning_rate": 1.620897581595826e-07, "loss": 1.0443, "step": 7541 }, { "epoch": 0.35954520534884277, "grad_norm": 3.060089111328125, "learning_rate": 1.613861578748066e-07, "loss": 1.1797, "step": 7542 }, { "epoch": 0.3595928777441423, "grad_norm": 2.1141858100891113, "learning_rate": 1.6068407558682775e-07, "loss": 0.42, "step": 7543 }, { "epoch": 0.35964055013944174, "grad_norm": 4.877460956573486, "learning_rate": 1.599835114039705e-07, "loss": 1.0632, "step": 7544 }, { "epoch": 0.35968822253474125, "grad_norm": 2.6571197509765625, "learning_rate": 1.5928446543432507e-07, "loss": 0.5854, "step": 7545 }, { "epoch": 0.35973589493004077, "grad_norm": 1.6897437572479248, "learning_rate": 1.585869377857474e-07, "loss": 1.1083, "step": 7546 }, { "epoch": 0.3597835673253403, "grad_norm": 2.7866263389587402, "learning_rate": 1.5789092856585697e-07, "loss": 0.6052, "step": 7547 }, { "epoch": 0.35983123972063974, "grad_norm": 1.2658756971359253, "learning_rate": 1.571964378820434e-07, "loss": 0.2581, "step": 7548 }, { "epoch": 0.35987891211593925, "grad_norm": 3.687223196029663, "learning_rate": 1.565034658414577e-07, "loss": 1.6637, "step": 7549 }, { "epoch": 0.35992658451123877, "grad_norm": 1.7222639322280884, "learning_rate": 1.5581201255101874e-07, "loss": 0.3713, "step": 7550 }, { "epoch": 0.3599742569065383, "grad_norm": 1.7825167179107666, "learning_rate": 1.551220781174101e-07, "loss": 0.6362, "step": 7551 }, { "epoch": 0.3600219293018378, "grad_norm": 0.963318943977356, "learning_rate": 1.5443366264708326e-07, "loss": 0.3692, "step": 7552 }, { "epoch": 0.36006960169713725, "grad_norm": 1.2702791690826416, "learning_rate": 1.5374676624625218e-07, "loss": 0.4363, "step": 7553 }, { "epoch": 0.36011727409243677, "grad_norm": 3.5927670001983643, "learning_rate": 1.5306138902089763e-07, "loss": 1.5814, "step": 7554 }, { "epoch": 0.3601649464877363, "grad_norm": 1.0734039545059204, "learning_rate": 1.5237753107676721e-07, "loss": 0.3872, "step": 7555 }, { "epoch": 0.3602126188830358, "grad_norm": 1.6010364294052124, "learning_rate": 1.5169519251937325e-07, "loss": 0.9508, "step": 7556 }, { "epoch": 0.36026029127833525, "grad_norm": 2.6194820404052734, "learning_rate": 1.5101437345399262e-07, "loss": 0.6893, "step": 7557 }, { "epoch": 0.36030796367363477, "grad_norm": 1.6924545764923096, "learning_rate": 1.5033507398567017e-07, "loss": 0.4745, "step": 7558 }, { "epoch": 0.3603556360689343, "grad_norm": 1.4772309064865112, "learning_rate": 1.4965729421921425e-07, "loss": 0.6517, "step": 7559 }, { "epoch": 0.3604033084642338, "grad_norm": 2.222808837890625, "learning_rate": 1.4898103425919687e-07, "loss": 0.7971, "step": 7560 }, { "epoch": 0.3604509808595333, "grad_norm": 1.4040281772613525, "learning_rate": 1.4830629420996222e-07, "loss": 0.6371, "step": 7561 }, { "epoch": 0.36049865325483277, "grad_norm": 1.7208927869796753, "learning_rate": 1.4763307417561157e-07, "loss": 0.7422, "step": 7562 }, { "epoch": 0.3605463256501323, "grad_norm": 4.075765609741211, "learning_rate": 1.4696137426001844e-07, "loss": 0.9115, "step": 7563 }, { "epoch": 0.3605939980454318, "grad_norm": 1.319954752922058, "learning_rate": 1.4629119456681884e-07, "loss": 0.4583, "step": 7564 }, { "epoch": 0.3606416704407313, "grad_norm": 1.3423572778701782, "learning_rate": 1.456225351994156e-07, "loss": 0.79, "step": 7565 }, { "epoch": 0.3606893428360308, "grad_norm": 1.6340581178665161, "learning_rate": 1.4495539626097289e-07, "loss": 0.925, "step": 7566 }, { "epoch": 0.3607370152313303, "grad_norm": 1.685856819152832, "learning_rate": 1.44289777854425e-07, "loss": 1.08, "step": 7567 }, { "epoch": 0.3607846876266298, "grad_norm": 1.3200937509536743, "learning_rate": 1.4362568008247202e-07, "loss": 0.5431, "step": 7568 }, { "epoch": 0.3608323600219293, "grad_norm": 1.620684266090393, "learning_rate": 1.4296310304757423e-07, "loss": 0.449, "step": 7569 }, { "epoch": 0.3608800324172288, "grad_norm": 1.924071192741394, "learning_rate": 1.4230204685196202e-07, "loss": 0.6961, "step": 7570 }, { "epoch": 0.3609277048125283, "grad_norm": 1.5358649492263794, "learning_rate": 1.4164251159762944e-07, "loss": 0.8894, "step": 7571 }, { "epoch": 0.3609753772078278, "grad_norm": 5.417590141296387, "learning_rate": 1.4098449738633614e-07, "loss": 0.243, "step": 7572 }, { "epoch": 0.3610230496031273, "grad_norm": 2.6760740280151367, "learning_rate": 1.4032800431960647e-07, "loss": 0.737, "step": 7573 }, { "epoch": 0.3610707219984268, "grad_norm": 3.250913143157959, "learning_rate": 1.3967303249873053e-07, "loss": 0.5661, "step": 7574 }, { "epoch": 0.36111839439372634, "grad_norm": 2.3834540843963623, "learning_rate": 1.390195820247653e-07, "loss": 0.9905, "step": 7575 }, { "epoch": 0.3611660667890258, "grad_norm": 0.9095104932785034, "learning_rate": 1.3836765299852894e-07, "loss": 0.2751, "step": 7576 }, { "epoch": 0.3612137391843253, "grad_norm": 1.722669243812561, "learning_rate": 1.3771724552060885e-07, "loss": 0.928, "step": 7577 }, { "epoch": 0.3612614115796248, "grad_norm": 1.4960683584213257, "learning_rate": 1.3706835969135467e-07, "loss": 0.5281, "step": 7578 }, { "epoch": 0.36130908397492434, "grad_norm": 7.444228649139404, "learning_rate": 1.3642099561088528e-07, "loss": 0.0836, "step": 7579 }, { "epoch": 0.3613567563702238, "grad_norm": 0.9885214567184448, "learning_rate": 1.3577515337908076e-07, "loss": 0.435, "step": 7580 }, { "epoch": 0.3614044287655233, "grad_norm": 1.0258896350860596, "learning_rate": 1.3513083309558806e-07, "loss": 0.4495, "step": 7581 }, { "epoch": 0.3614521011608228, "grad_norm": 1.1365811824798584, "learning_rate": 1.3448803485981986e-07, "loss": 0.8391, "step": 7582 }, { "epoch": 0.36149977355612234, "grad_norm": 1.7019453048706055, "learning_rate": 1.3384675877095244e-07, "loss": 0.8294, "step": 7583 }, { "epoch": 0.36154744595142185, "grad_norm": 1.7103431224822998, "learning_rate": 1.3320700492792771e-07, "loss": 0.6849, "step": 7584 }, { "epoch": 0.3615951183467213, "grad_norm": 1.6652214527130127, "learning_rate": 1.3256877342945452e-07, "loss": 0.6364, "step": 7585 }, { "epoch": 0.3616427907420208, "grad_norm": 1.4972835779190063, "learning_rate": 1.319320643740052e-07, "loss": 0.8808, "step": 7586 }, { "epoch": 0.36169046313732034, "grad_norm": 1.9461411237716675, "learning_rate": 1.312968778598167e-07, "loss": 1.2322, "step": 7587 }, { "epoch": 0.36173813553261985, "grad_norm": 1.602824330329895, "learning_rate": 1.3066321398489178e-07, "loss": 0.7117, "step": 7588 }, { "epoch": 0.3617858079279193, "grad_norm": 1.647096037864685, "learning_rate": 1.3003107284699777e-07, "loss": 0.6963, "step": 7589 }, { "epoch": 0.3618334803232188, "grad_norm": 2.3326754570007324, "learning_rate": 1.294004545436689e-07, "loss": 0.7142, "step": 7590 }, { "epoch": 0.36188115271851834, "grad_norm": 0.892280638217926, "learning_rate": 1.2877135917220173e-07, "loss": 0.492, "step": 7591 }, { "epoch": 0.36192882511381785, "grad_norm": 3.4787204265594482, "learning_rate": 1.281437868296609e-07, "loss": 1.3923, "step": 7592 }, { "epoch": 0.36197649750911737, "grad_norm": 3.67329478263855, "learning_rate": 1.2751773761287333e-07, "loss": 1.7699, "step": 7593 }, { "epoch": 0.3620241699044168, "grad_norm": 1.018886923789978, "learning_rate": 1.2689321161843071e-07, "loss": 0.6659, "step": 7594 }, { "epoch": 0.36207184229971634, "grad_norm": 1.7219388484954834, "learning_rate": 1.262702089426926e-07, "loss": 0.6259, "step": 7595 }, { "epoch": 0.36211951469501585, "grad_norm": 0.9731652736663818, "learning_rate": 1.256487296817821e-07, "loss": 0.6641, "step": 7596 }, { "epoch": 0.36216718709031537, "grad_norm": 2.942791700363159, "learning_rate": 1.2502877393158587e-07, "loss": 0.738, "step": 7597 }, { "epoch": 0.3622148594856149, "grad_norm": 1.5527029037475586, "learning_rate": 1.2441034178775735e-07, "loss": 1.2505, "step": 7598 }, { "epoch": 0.36226253188091434, "grad_norm": 1.733080267906189, "learning_rate": 1.237934333457147e-07, "loss": 0.8964, "step": 7599 }, { "epoch": 0.36231020427621385, "grad_norm": 1.1859900951385498, "learning_rate": 1.2317804870063954e-07, "loss": 0.6388, "step": 7600 }, { "epoch": 0.36235787667151337, "grad_norm": 1.1545878648757935, "learning_rate": 1.2256418794747925e-07, "loss": 0.5054, "step": 7601 }, { "epoch": 0.3624055490668129, "grad_norm": 2.0713119506835938, "learning_rate": 1.219518511809481e-07, "loss": 0.7267, "step": 7602 }, { "epoch": 0.36245322146211234, "grad_norm": 2.233213424682617, "learning_rate": 1.213410384955227e-07, "loss": 0.7532, "step": 7603 }, { "epoch": 0.36250089385741185, "grad_norm": 1.4822250604629517, "learning_rate": 1.2073174998544323e-07, "loss": 0.8033, "step": 7604 }, { "epoch": 0.36254856625271137, "grad_norm": 1.719848394393921, "learning_rate": 1.2012398574471785e-07, "loss": 0.6197, "step": 7605 }, { "epoch": 0.3625962386480109, "grad_norm": 1.4159936904907227, "learning_rate": 1.1951774586711927e-07, "loss": 0.5608, "step": 7606 }, { "epoch": 0.3626439110433104, "grad_norm": 2.1619670391082764, "learning_rate": 1.1891303044618275e-07, "loss": 0.9584, "step": 7607 }, { "epoch": 0.36269158343860985, "grad_norm": 3.1314663887023926, "learning_rate": 1.1830983957521024e-07, "loss": 1.42, "step": 7608 }, { "epoch": 0.36273925583390937, "grad_norm": 1.7015684843063354, "learning_rate": 1.1770817334726736e-07, "loss": 0.8257, "step": 7609 }, { "epoch": 0.3627869282292089, "grad_norm": 2.1228387355804443, "learning_rate": 1.1710803185518537e-07, "loss": 0.3531, "step": 7610 }, { "epoch": 0.3628346006245084, "grad_norm": 2.5854666233062744, "learning_rate": 1.1650941519156023e-07, "loss": 0.7416, "step": 7611 }, { "epoch": 0.36288227301980785, "grad_norm": 4.184905529022217, "learning_rate": 1.1591232344875248e-07, "loss": 0.601, "step": 7612 }, { "epoch": 0.36292994541510737, "grad_norm": 1.911863923072815, "learning_rate": 1.1531675671888621e-07, "loss": 0.8122, "step": 7613 }, { "epoch": 0.3629776178104069, "grad_norm": 6.793514728546143, "learning_rate": 1.1472271509385235e-07, "loss": 0.8045, "step": 7614 }, { "epoch": 0.3630252902057064, "grad_norm": 3.0839173793792725, "learning_rate": 1.1413019866530429e-07, "loss": 0.5305, "step": 7615 }, { "epoch": 0.3630729626010059, "grad_norm": 2.522343635559082, "learning_rate": 1.1353920752466219e-07, "loss": 1.1702, "step": 7616 }, { "epoch": 0.36312063499630537, "grad_norm": 2.205889940261841, "learning_rate": 1.129497417631098e-07, "loss": 0.8711, "step": 7617 }, { "epoch": 0.3631683073916049, "grad_norm": 7.297919273376465, "learning_rate": 1.1236180147159437e-07, "loss": 0.1326, "step": 7618 }, { "epoch": 0.3632159797869044, "grad_norm": 1.5162936449050903, "learning_rate": 1.117753867408311e-07, "loss": 0.6353, "step": 7619 }, { "epoch": 0.3632636521822039, "grad_norm": 1.6964155435562134, "learning_rate": 1.1119049766129652e-07, "loss": 0.9892, "step": 7620 }, { "epoch": 0.36331132457750337, "grad_norm": 2.7673709392547607, "learning_rate": 1.1060713432323288e-07, "loss": 0.6602, "step": 7621 }, { "epoch": 0.3633589969728029, "grad_norm": 2.292043685913086, "learning_rate": 1.1002529681664598e-07, "loss": 1.2501, "step": 7622 }, { "epoch": 0.3634066693681024, "grad_norm": 1.9295915365219116, "learning_rate": 1.0944498523131064e-07, "loss": 0.7653, "step": 7623 }, { "epoch": 0.3634543417634019, "grad_norm": 8.562410354614258, "learning_rate": 1.0886619965676082e-07, "loss": 0.5469, "step": 7624 }, { "epoch": 0.3635020141587014, "grad_norm": 2.2834036350250244, "learning_rate": 1.0828894018229619e-07, "loss": 0.5907, "step": 7625 }, { "epoch": 0.3635496865540009, "grad_norm": 1.7092336416244507, "learning_rate": 1.0771320689698439e-07, "loss": 0.8352, "step": 7626 }, { "epoch": 0.3635973589493004, "grad_norm": 1.4992412328720093, "learning_rate": 1.0713899988965326e-07, "loss": 0.7809, "step": 7627 }, { "epoch": 0.3636450313445999, "grad_norm": 1.4399573802947998, "learning_rate": 1.0656631924889749e-07, "loss": 0.8174, "step": 7628 }, { "epoch": 0.3636927037398994, "grad_norm": 1.6076726913452148, "learning_rate": 1.059951650630775e-07, "loss": 0.5906, "step": 7629 }, { "epoch": 0.36374037613519894, "grad_norm": 2.066838502883911, "learning_rate": 1.0542553742031392e-07, "loss": 0.6151, "step": 7630 }, { "epoch": 0.3637880485304984, "grad_norm": 1.2221015691757202, "learning_rate": 1.0485743640849533e-07, "loss": 0.7351, "step": 7631 }, { "epoch": 0.3638357209257979, "grad_norm": 1.4276305437088013, "learning_rate": 1.0429086211527385e-07, "loss": 0.6746, "step": 7632 }, { "epoch": 0.3638833933210974, "grad_norm": 2.0552382469177246, "learning_rate": 1.037258146280673e-07, "loss": 1.1447, "step": 7633 }, { "epoch": 0.36393106571639694, "grad_norm": 5.339925765991211, "learning_rate": 1.0316229403405487e-07, "loss": 0.8708, "step": 7634 }, { "epoch": 0.3639787381116964, "grad_norm": 1.6786081790924072, "learning_rate": 1.0260030042018365e-07, "loss": 0.6252, "step": 7635 }, { "epoch": 0.3640264105069959, "grad_norm": 1.6302331686019897, "learning_rate": 1.0203983387316097e-07, "loss": 1.0601, "step": 7636 }, { "epoch": 0.3640740829022954, "grad_norm": 1.5396068096160889, "learning_rate": 1.0148089447946319e-07, "loss": 0.6368, "step": 7637 }, { "epoch": 0.36412175529759494, "grad_norm": 1.9746489524841309, "learning_rate": 1.0092348232532911e-07, "loss": 0.5671, "step": 7638 }, { "epoch": 0.36416942769289445, "grad_norm": 1.757285475730896, "learning_rate": 1.0036759749676106e-07, "loss": 0.57, "step": 7639 }, { "epoch": 0.3642171000881939, "grad_norm": 5.034051895141602, "learning_rate": 9.981324007952486e-08, "loss": 0.388, "step": 7640 }, { "epoch": 0.3642647724834934, "grad_norm": 1.8331844806671143, "learning_rate": 9.926041015915434e-08, "loss": 0.8112, "step": 7641 }, { "epoch": 0.36431244487879294, "grad_norm": 1.561575174331665, "learning_rate": 9.870910782094456e-08, "loss": 0.6361, "step": 7642 }, { "epoch": 0.36436011727409245, "grad_norm": 4.549339771270752, "learning_rate": 9.81593331499564e-08, "loss": 0.858, "step": 7643 }, { "epoch": 0.3644077896693919, "grad_norm": 1.0300893783569336, "learning_rate": 9.761108623101312e-08, "loss": 0.5238, "step": 7644 }, { "epoch": 0.3644554620646914, "grad_norm": 2.3920083045959473, "learning_rate": 9.706436714870482e-08, "loss": 0.1993, "step": 7645 }, { "epoch": 0.36450313445999094, "grad_norm": 1.635974407196045, "learning_rate": 9.651917598738402e-08, "loss": 0.6545, "step": 7646 }, { "epoch": 0.36455080685529045, "grad_norm": 1.5133309364318848, "learning_rate": 9.597551283116901e-08, "loss": 1.3502, "step": 7647 }, { "epoch": 0.36459847925058997, "grad_norm": 1.1626709699630737, "learning_rate": 9.543337776393936e-08, "loss": 0.735, "step": 7648 }, { "epoch": 0.3646461516458894, "grad_norm": 1.8082032203674316, "learning_rate": 9.489277086934257e-08, "loss": 0.7641, "step": 7649 }, { "epoch": 0.36469382404118894, "grad_norm": 1.3511099815368652, "learning_rate": 9.435369223078861e-08, "loss": 0.6228, "step": 7650 }, { "epoch": 0.36474149643648845, "grad_norm": 1.3195937871932983, "learning_rate": 9.381614193145206e-08, "loss": 0.7517, "step": 7651 }, { "epoch": 0.36478916883178797, "grad_norm": 2.796147108078003, "learning_rate": 9.32801200542699e-08, "loss": 0.3394, "step": 7652 }, { "epoch": 0.3648368412270875, "grad_norm": 0.8492860794067383, "learning_rate": 9.274562668194598e-08, "loss": 0.1851, "step": 7653 }, { "epoch": 0.36488451362238694, "grad_norm": 1.235487699508667, "learning_rate": 9.221266189694767e-08, "loss": 0.6891, "step": 7654 }, { "epoch": 0.36493218601768646, "grad_norm": 1.2746144533157349, "learning_rate": 9.168122578150363e-08, "loss": 0.7241, "step": 7655 }, { "epoch": 0.36497985841298597, "grad_norm": 1.5769391059875488, "learning_rate": 9.11513184176116e-08, "loss": 0.9307, "step": 7656 }, { "epoch": 0.3650275308082855, "grad_norm": 1.8162281513214111, "learning_rate": 9.062293988702953e-08, "loss": 0.6523, "step": 7657 }, { "epoch": 0.36507520320358494, "grad_norm": 3.195460319519043, "learning_rate": 9.009609027128108e-08, "loss": 1.0291, "step": 7658 }, { "epoch": 0.36512287559888446, "grad_norm": 2.0703372955322266, "learning_rate": 8.957076965165234e-08, "loss": 1.1073, "step": 7659 }, { "epoch": 0.36517054799418397, "grad_norm": 2.353217124938965, "learning_rate": 8.904697810919848e-08, "loss": 0.5342, "step": 7660 }, { "epoch": 0.3652182203894835, "grad_norm": 1.5540796518325806, "learning_rate": 8.852471572473153e-08, "loss": 0.6694, "step": 7661 }, { "epoch": 0.365265892784783, "grad_norm": 1.4673951864242554, "learning_rate": 8.800398257883146e-08, "loss": 0.7309, "step": 7662 }, { "epoch": 0.36531356518008246, "grad_norm": 1.7575181722640991, "learning_rate": 8.748477875184514e-08, "loss": 0.7067, "step": 7663 }, { "epoch": 0.36536123757538197, "grad_norm": 1.5961129665374756, "learning_rate": 8.696710432387733e-08, "loss": 0.4317, "step": 7664 }, { "epoch": 0.3654089099706815, "grad_norm": 1.01649010181427, "learning_rate": 8.645095937480086e-08, "loss": 0.3314, "step": 7665 }, { "epoch": 0.365456582365981, "grad_norm": 1.9757106304168701, "learning_rate": 8.593634398425199e-08, "loss": 0.7827, "step": 7666 }, { "epoch": 0.36550425476128046, "grad_norm": 1.3617305755615234, "learning_rate": 8.542325823162945e-08, "loss": 0.7212, "step": 7667 }, { "epoch": 0.36555192715657997, "grad_norm": 2.044442892074585, "learning_rate": 8.491170219609767e-08, "loss": 0.356, "step": 7668 }, { "epoch": 0.3655995995518795, "grad_norm": 1.8485454320907593, "learning_rate": 8.440167595658577e-08, "loss": 0.8628, "step": 7669 }, { "epoch": 0.365647271947179, "grad_norm": 1.822192907333374, "learning_rate": 8.3893179591783e-08, "loss": 0.7577, "step": 7670 }, { "epoch": 0.3656949443424785, "grad_norm": 1.7000517845153809, "learning_rate": 8.338621318014662e-08, "loss": 0.7554, "step": 7671 }, { "epoch": 0.36574261673777797, "grad_norm": 1.1006102561950684, "learning_rate": 8.288077679989737e-08, "loss": 0.6658, "step": 7672 }, { "epoch": 0.3657902891330775, "grad_norm": 1.5818777084350586, "learning_rate": 8.237687052901622e-08, "loss": 0.8453, "step": 7673 }, { "epoch": 0.365837961528377, "grad_norm": 3.3108766078948975, "learning_rate": 8.187449444525319e-08, "loss": 0.6104, "step": 7674 }, { "epoch": 0.3658856339236765, "grad_norm": 1.7844374179840088, "learning_rate": 8.137364862611851e-08, "loss": 0.4677, "step": 7675 }, { "epoch": 0.36593330631897597, "grad_norm": 1.8921712636947632, "learning_rate": 8.087433314888815e-08, "loss": 0.9077, "step": 7676 }, { "epoch": 0.3659809787142755, "grad_norm": 1.867358922958374, "learning_rate": 8.037654809059937e-08, "loss": 0.9268, "step": 7677 }, { "epoch": 0.366028651109575, "grad_norm": 1.1022671461105347, "learning_rate": 7.988029352805849e-08, "loss": 0.6435, "step": 7678 }, { "epoch": 0.3660763235048745, "grad_norm": 1.313672661781311, "learning_rate": 7.938556953783095e-08, "loss": 0.6473, "step": 7679 }, { "epoch": 0.366123995900174, "grad_norm": 2.8724846839904785, "learning_rate": 7.889237619624679e-08, "loss": 0.9885, "step": 7680 }, { "epoch": 0.3661716682954735, "grad_norm": 2.8557121753692627, "learning_rate": 7.840071357940072e-08, "loss": 0.749, "step": 7681 }, { "epoch": 0.366219340690773, "grad_norm": 1.3713778257369995, "learning_rate": 7.791058176315313e-08, "loss": 0.5679, "step": 7682 }, { "epoch": 0.3662670130860725, "grad_norm": 1.297159194946289, "learning_rate": 7.742198082312357e-08, "loss": 0.4253, "step": 7683 }, { "epoch": 0.366314685481372, "grad_norm": 3.1610944271087646, "learning_rate": 7.693491083470062e-08, "loss": 1.438, "step": 7684 }, { "epoch": 0.36636235787667154, "grad_norm": 2.023857831954956, "learning_rate": 7.644937187303303e-08, "loss": 0.4606, "step": 7685 }, { "epoch": 0.366410030271971, "grad_norm": 3.6449878215789795, "learning_rate": 7.596536401303422e-08, "loss": 0.6084, "step": 7686 }, { "epoch": 0.3664577026672705, "grad_norm": 2.608428716659546, "learning_rate": 7.548288732938225e-08, "loss": 0.5937, "step": 7687 }, { "epoch": 0.36650537506257, "grad_norm": 1.8767002820968628, "learning_rate": 7.500194189651866e-08, "loss": 0.62, "step": 7688 }, { "epoch": 0.36655304745786954, "grad_norm": 1.3451383113861084, "learning_rate": 7.452252778864632e-08, "loss": 0.7668, "step": 7689 }, { "epoch": 0.366600719853169, "grad_norm": 2.4728777408599854, "learning_rate": 7.404464507973608e-08, "loss": 0.3595, "step": 7690 }, { "epoch": 0.3666483922484685, "grad_norm": 1.694606900215149, "learning_rate": 7.356829384351893e-08, "loss": 0.3838, "step": 7691 }, { "epoch": 0.366696064643768, "grad_norm": 1.0287492275238037, "learning_rate": 7.309347415349278e-08, "loss": 0.2514, "step": 7692 }, { "epoch": 0.36674373703906754, "grad_norm": 1.0105295181274414, "learning_rate": 7.262018608291566e-08, "loss": 0.5265, "step": 7693 }, { "epoch": 0.36679140943436706, "grad_norm": 1.980093002319336, "learning_rate": 7.214842970481139e-08, "loss": 1.0236, "step": 7694 }, { "epoch": 0.3668390818296665, "grad_norm": 2.581512451171875, "learning_rate": 7.167820509196732e-08, "loss": 0.5972, "step": 7695 }, { "epoch": 0.366886754224966, "grad_norm": 1.16861093044281, "learning_rate": 7.12095123169343e-08, "loss": 0.5906, "step": 7696 }, { "epoch": 0.36693442662026554, "grad_norm": 3.0507681369781494, "learning_rate": 7.074235145202668e-08, "loss": 0.5603, "step": 7697 }, { "epoch": 0.36698209901556506, "grad_norm": 1.3943735361099243, "learning_rate": 7.027672256932238e-08, "loss": 0.7102, "step": 7698 }, { "epoch": 0.3670297714108645, "grad_norm": 2.4650938510894775, "learning_rate": 6.981262574066395e-08, "loss": 1.1235, "step": 7699 }, { "epoch": 0.367077443806164, "grad_norm": 2.6008851528167725, "learning_rate": 6.93500610376563e-08, "loss": 1.0542, "step": 7700 }, { "epoch": 0.36712511620146354, "grad_norm": 1.6996532678604126, "learning_rate": 6.88890285316679e-08, "loss": 0.7084, "step": 7701 }, { "epoch": 0.36717278859676306, "grad_norm": 1.408275842666626, "learning_rate": 6.842952829383187e-08, "loss": 0.7608, "step": 7702 }, { "epoch": 0.36722046099206257, "grad_norm": 1.5945616960525513, "learning_rate": 6.797156039504482e-08, "loss": 0.9326, "step": 7703 }, { "epoch": 0.36726813338736203, "grad_norm": 1.2416167259216309, "learning_rate": 6.751512490596467e-08, "loss": 0.82, "step": 7704 }, { "epoch": 0.36731580578266154, "grad_norm": 3.048807382583618, "learning_rate": 6.706022189701622e-08, "loss": 1.0169, "step": 7705 }, { "epoch": 0.36736347817796106, "grad_norm": 1.791715383529663, "learning_rate": 6.660685143838664e-08, "loss": 1.1747, "step": 7706 }, { "epoch": 0.36741115057326057, "grad_norm": 2.7997539043426514, "learning_rate": 6.615501360002552e-08, "loss": 0.8987, "step": 7707 }, { "epoch": 0.36745882296856003, "grad_norm": 1.8299788236618042, "learning_rate": 6.570470845164712e-08, "loss": 1.0327, "step": 7708 }, { "epoch": 0.36750649536385954, "grad_norm": 2.873364210128784, "learning_rate": 6.525593606272917e-08, "loss": 0.2513, "step": 7709 }, { "epoch": 0.36755416775915906, "grad_norm": 2.3427250385284424, "learning_rate": 6.480869650251187e-08, "loss": 0.837, "step": 7710 }, { "epoch": 0.36760184015445857, "grad_norm": 2.1455252170562744, "learning_rate": 6.436298983999889e-08, "loss": 1.3452, "step": 7711 }, { "epoch": 0.3676495125497581, "grad_norm": 1.2994019985198975, "learning_rate": 6.391881614396078e-08, "loss": 0.2553, "step": 7712 }, { "epoch": 0.36769718494505754, "grad_norm": 1.720689058303833, "learning_rate": 6.347617548292717e-08, "loss": 1.2418, "step": 7713 }, { "epoch": 0.36774485734035706, "grad_norm": 1.196730375289917, "learning_rate": 6.303506792519232e-08, "loss": 0.5118, "step": 7714 }, { "epoch": 0.36779252973565657, "grad_norm": 13.853625297546387, "learning_rate": 6.259549353881623e-08, "loss": 0.902, "step": 7715 }, { "epoch": 0.3678402021309561, "grad_norm": 2.442233085632324, "learning_rate": 6.215745239162018e-08, "loss": 0.9641, "step": 7716 }, { "epoch": 0.3678878745262556, "grad_norm": 1.2612642049789429, "learning_rate": 6.172094455118904e-08, "loss": 0.782, "step": 7717 }, { "epoch": 0.36793554692155506, "grad_norm": 1.414482831954956, "learning_rate": 6.128597008487225e-08, "loss": 0.7849, "step": 7718 }, { "epoch": 0.36798321931685457, "grad_norm": 3.647113561630249, "learning_rate": 6.085252905978056e-08, "loss": 0.6292, "step": 7719 }, { "epoch": 0.3680308917121541, "grad_norm": 2.5834009647369385, "learning_rate": 6.042062154279049e-08, "loss": 1.1189, "step": 7720 }, { "epoch": 0.3680785641074536, "grad_norm": 1.6807844638824463, "learning_rate": 5.999024760054095e-08, "loss": 0.6304, "step": 7721 }, { "epoch": 0.36812623650275306, "grad_norm": 1.5649101734161377, "learning_rate": 5.9561407299433274e-08, "loss": 0.8199, "step": 7722 }, { "epoch": 0.36817390889805257, "grad_norm": 1.6391302347183228, "learning_rate": 5.9134100705634525e-08, "loss": 0.8994, "step": 7723 }, { "epoch": 0.3682215812933521, "grad_norm": 1.3756425380706787, "learning_rate": 5.8708327885071966e-08, "loss": 0.6368, "step": 7724 }, { "epoch": 0.3682692536886516, "grad_norm": 1.3108423948287964, "learning_rate": 5.8284088903439726e-08, "loss": 0.8382, "step": 7725 }, { "epoch": 0.3683169260839511, "grad_norm": 1.292011022567749, "learning_rate": 5.786138382619322e-08, "loss": 0.7141, "step": 7726 }, { "epoch": 0.36836459847925057, "grad_norm": 2.530026435852051, "learning_rate": 5.744021271854916e-08, "loss": 0.5991, "step": 7727 }, { "epoch": 0.3684122708745501, "grad_norm": 1.1009055376052856, "learning_rate": 5.702057564549335e-08, "loss": 0.481, "step": 7728 }, { "epoch": 0.3684599432698496, "grad_norm": 1.625542163848877, "learning_rate": 5.660247267176844e-08, "loss": 0.9559, "step": 7729 }, { "epoch": 0.3685076156651491, "grad_norm": 1.6544519662857056, "learning_rate": 5.618590386188616e-08, "loss": 0.7864, "step": 7730 }, { "epoch": 0.36855528806044857, "grad_norm": 1.352328896522522, "learning_rate": 5.577086928011732e-08, "loss": 0.7382, "step": 7731 }, { "epoch": 0.3686029604557481, "grad_norm": 1.5142983198165894, "learning_rate": 5.535736899049626e-08, "loss": 0.6611, "step": 7732 }, { "epoch": 0.3686506328510476, "grad_norm": 1.2521287202835083, "learning_rate": 5.4945403056824164e-08, "loss": 0.8079, "step": 7733 }, { "epoch": 0.3686983052463471, "grad_norm": 3.0747945308685303, "learning_rate": 5.453497154266241e-08, "loss": 0.6277, "step": 7734 }, { "epoch": 0.3687459776416466, "grad_norm": 2.9783384799957275, "learning_rate": 5.412607451133478e-08, "loss": 0.5548, "step": 7735 }, { "epoch": 0.3687936500369461, "grad_norm": 5.556096076965332, "learning_rate": 5.371871202593193e-08, "loss": 0.3213, "step": 7736 }, { "epoch": 0.3688413224322456, "grad_norm": 2.926870584487915, "learning_rate": 5.33128841493058e-08, "loss": 0.5459, "step": 7737 }, { "epoch": 0.3688889948275451, "grad_norm": 1.4682596921920776, "learning_rate": 5.290859094406964e-08, "loss": 0.7457, "step": 7738 }, { "epoch": 0.3689366672228446, "grad_norm": 1.818982720375061, "learning_rate": 5.250583247260355e-08, "loss": 0.9037, "step": 7739 }, { "epoch": 0.3689843396181441, "grad_norm": 1.2708405256271362, "learning_rate": 5.2104608797047816e-08, "loss": 0.5607, "step": 7740 }, { "epoch": 0.3690320120134436, "grad_norm": 1.9208078384399414, "learning_rate": 5.170491997930627e-08, "loss": 0.5822, "step": 7741 }, { "epoch": 0.3690796844087431, "grad_norm": 2.572831392288208, "learning_rate": 5.1306766081048456e-08, "loss": 0.721, "step": 7742 }, { "epoch": 0.36912735680404263, "grad_norm": 1.138107419013977, "learning_rate": 5.091014716370524e-08, "loss": 0.519, "step": 7743 }, { "epoch": 0.36917502919934214, "grad_norm": 4.329532146453857, "learning_rate": 5.0515063288471e-08, "loss": 1.4959, "step": 7744 }, { "epoch": 0.3692227015946416, "grad_norm": 1.845920205116272, "learning_rate": 5.012151451630143e-08, "loss": 0.9106, "step": 7745 }, { "epoch": 0.3692703739899411, "grad_norm": 2.029707908630371, "learning_rate": 4.972950090791906e-08, "loss": 0.945, "step": 7746 }, { "epoch": 0.36931804638524063, "grad_norm": 2.0834524631500244, "learning_rate": 4.933902252380662e-08, "loss": 0.392, "step": 7747 }, { "epoch": 0.36936571878054014, "grad_norm": 1.1270073652267456, "learning_rate": 4.8950079424211484e-08, "loss": 0.4158, "step": 7748 }, { "epoch": 0.36941339117583966, "grad_norm": 4.812641143798828, "learning_rate": 4.8562671669142304e-08, "loss": 0.4369, "step": 7749 }, { "epoch": 0.3694610635711391, "grad_norm": 1.43878972530365, "learning_rate": 4.8176799318373494e-08, "loss": 0.6074, "step": 7750 }, { "epoch": 0.36950873596643863, "grad_norm": 1.7358713150024414, "learning_rate": 4.7792462431439643e-08, "loss": 0.5948, "step": 7751 }, { "epoch": 0.36955640836173814, "grad_norm": 1.4982954263687134, "learning_rate": 4.740966106764222e-08, "loss": 0.6141, "step": 7752 }, { "epoch": 0.36960408075703766, "grad_norm": 1.4248689413070679, "learning_rate": 4.702839528604064e-08, "loss": 0.5833, "step": 7753 }, { "epoch": 0.3696517531523371, "grad_norm": 1.2393172979354858, "learning_rate": 4.66486651454634e-08, "loss": 0.7083, "step": 7754 }, { "epoch": 0.36969942554763663, "grad_norm": 1.2555162906646729, "learning_rate": 4.627047070449697e-08, "loss": 0.6786, "step": 7755 }, { "epoch": 0.36974709794293614, "grad_norm": 1.9703701734542847, "learning_rate": 4.589381202149357e-08, "loss": 0.9608, "step": 7756 }, { "epoch": 0.36979477033823566, "grad_norm": 1.3421674966812134, "learning_rate": 4.55186891545667e-08, "loss": 0.7369, "step": 7757 }, { "epoch": 0.36984244273353517, "grad_norm": 1.944890022277832, "learning_rate": 4.514510216159562e-08, "loss": 0.8617, "step": 7758 }, { "epoch": 0.36989011512883463, "grad_norm": 1.885948657989502, "learning_rate": 4.4773051100219787e-08, "loss": 0.6023, "step": 7759 }, { "epoch": 0.36993778752413414, "grad_norm": 1.1965134143829346, "learning_rate": 4.440253602784328e-08, "loss": 0.3921, "step": 7760 }, { "epoch": 0.36998545991943366, "grad_norm": 1.602170705795288, "learning_rate": 4.4033557001631475e-08, "loss": 0.9863, "step": 7761 }, { "epoch": 0.37003313231473317, "grad_norm": 2.1995394229888916, "learning_rate": 4.366611407851662e-08, "loss": 0.6922, "step": 7762 }, { "epoch": 0.37008080471003263, "grad_norm": 2.047574996948242, "learning_rate": 4.3300207315190026e-08, "loss": 0.9125, "step": 7763 }, { "epoch": 0.37012847710533214, "grad_norm": 1.2888671159744263, "learning_rate": 4.293583676810653e-08, "loss": 0.8809, "step": 7764 }, { "epoch": 0.37017614950063166, "grad_norm": 1.490317463874817, "learning_rate": 4.257300249348562e-08, "loss": 0.6894, "step": 7765 }, { "epoch": 0.37022382189593117, "grad_norm": 6.8432841300964355, "learning_rate": 4.221170454730916e-08, "loss": 0.2892, "step": 7766 }, { "epoch": 0.3702714942912307, "grad_norm": 1.2396644353866577, "learning_rate": 4.185194298532147e-08, "loss": 0.8975, "step": 7767 }, { "epoch": 0.37031916668653014, "grad_norm": 1.871390700340271, "learning_rate": 4.149371786302925e-08, "loss": 0.6949, "step": 7768 }, { "epoch": 0.37036683908182966, "grad_norm": 1.211944580078125, "learning_rate": 4.113702923570384e-08, "loss": 0.7685, "step": 7769 }, { "epoch": 0.37041451147712917, "grad_norm": 0.8955574035644531, "learning_rate": 4.0781877158377894e-08, "loss": 0.5975, "step": 7770 }, { "epoch": 0.3704621838724287, "grad_norm": 1.8945472240447998, "learning_rate": 4.042826168584868e-08, "loss": 0.7167, "step": 7771 }, { "epoch": 0.3705098562677282, "grad_norm": 2.005890130996704, "learning_rate": 4.0076182872674785e-08, "loss": 0.6805, "step": 7772 }, { "epoch": 0.37055752866302766, "grad_norm": 1.0825318098068237, "learning_rate": 3.972564077317831e-08, "loss": 0.7917, "step": 7773 }, { "epoch": 0.37060520105832717, "grad_norm": 2.385758638381958, "learning_rate": 3.9376635441444874e-08, "loss": 0.8841, "step": 7774 }, { "epoch": 0.3706528734536267, "grad_norm": 1.4121153354644775, "learning_rate": 3.9029166931322524e-08, "loss": 0.6241, "step": 7775 }, { "epoch": 0.3707005458489262, "grad_norm": 1.2842817306518555, "learning_rate": 3.86832352964206e-08, "loss": 0.8474, "step": 7776 }, { "epoch": 0.37074821824422566, "grad_norm": 1.3103934526443481, "learning_rate": 3.833884059011417e-08, "loss": 0.7723, "step": 7777 }, { "epoch": 0.37079589063952517, "grad_norm": 1.2943569421768188, "learning_rate": 3.7995982865539624e-08, "loss": 0.7501, "step": 7778 }, { "epoch": 0.3708435630348247, "grad_norm": 3.371055841445923, "learning_rate": 3.765466217559577e-08, "loss": 1.0109, "step": 7779 }, { "epoch": 0.3708912354301242, "grad_norm": 2.4353268146514893, "learning_rate": 3.731487857294491e-08, "loss": 1.1428, "step": 7780 }, { "epoch": 0.3709389078254237, "grad_norm": 2.3010752201080322, "learning_rate": 3.69766321100129e-08, "loss": 0.6788, "step": 7781 }, { "epoch": 0.37098658022072317, "grad_norm": 2.7268319129943848, "learning_rate": 3.663992283898687e-08, "loss": 0.674, "step": 7782 }, { "epoch": 0.3710342526160227, "grad_norm": 5.783518314361572, "learning_rate": 3.630475081181861e-08, "loss": 0.9479, "step": 7783 }, { "epoch": 0.3710819250113222, "grad_norm": 1.2917919158935547, "learning_rate": 3.597111608022119e-08, "loss": 0.9517, "step": 7784 }, { "epoch": 0.3711295974066217, "grad_norm": 1.5208823680877686, "learning_rate": 3.56390186956701e-08, "loss": 0.6202, "step": 7785 }, { "epoch": 0.3711772698019212, "grad_norm": 1.5522223711013794, "learning_rate": 3.530845870940658e-08, "loss": 0.7482, "step": 7786 }, { "epoch": 0.3712249421972207, "grad_norm": 2.059873104095459, "learning_rate": 3.497943617242983e-08, "loss": 0.7872, "step": 7787 }, { "epoch": 0.3712726145925202, "grad_norm": 3.6424009799957275, "learning_rate": 3.465195113550701e-08, "loss": 0.3467, "step": 7788 }, { "epoch": 0.3713202869878197, "grad_norm": 1.4958759546279907, "learning_rate": 3.43260036491655e-08, "loss": 0.8711, "step": 7789 }, { "epoch": 0.37136795938311923, "grad_norm": 2.646958112716675, "learning_rate": 3.400159376369394e-08, "loss": 0.6322, "step": 7790 }, { "epoch": 0.3714156317784187, "grad_norm": 3.456530809402466, "learning_rate": 3.367872152914675e-08, "loss": 1.3635, "step": 7791 }, { "epoch": 0.3714633041737182, "grad_norm": 1.3279197216033936, "learning_rate": 3.335738699533964e-08, "loss": 0.6843, "step": 7792 }, { "epoch": 0.3715109765690177, "grad_norm": 4.989992618560791, "learning_rate": 3.3037590211851823e-08, "loss": 0.0866, "step": 7793 }, { "epoch": 0.37155864896431723, "grad_norm": 2.787041425704956, "learning_rate": 3.271933122802273e-08, "loss": 1.0779, "step": 7794 }, { "epoch": 0.3716063213596167, "grad_norm": 1.2259422540664673, "learning_rate": 3.240261009295864e-08, "loss": 0.7496, "step": 7795 }, { "epoch": 0.3716539937549162, "grad_norm": 1.8526363372802734, "learning_rate": 3.208742685552602e-08, "loss": 0.4648, "step": 7796 }, { "epoch": 0.3717016661502157, "grad_norm": 1.7674260139465332, "learning_rate": 3.1773781564352625e-08, "loss": 0.6233, "step": 7797 }, { "epoch": 0.37174933854551523, "grad_norm": 1.2214933633804321, "learning_rate": 3.146167426783198e-08, "loss": 0.5241, "step": 7798 }, { "epoch": 0.37179701094081474, "grad_norm": 5.145979881286621, "learning_rate": 3.1151105014119995e-08, "loss": 1.8501, "step": 7799 }, { "epoch": 0.3718446833361142, "grad_norm": 1.1357462406158447, "learning_rate": 3.084207385113169e-08, "loss": 0.7689, "step": 7800 }, { "epoch": 0.3718923557314137, "grad_norm": 1.7186403274536133, "learning_rate": 3.053458082655003e-08, "loss": 0.8398, "step": 7801 }, { "epoch": 0.37194002812671323, "grad_norm": 1.5030573606491089, "learning_rate": 3.0228625987817064e-08, "loss": 0.7932, "step": 7802 }, { "epoch": 0.37198770052201274, "grad_norm": 1.028721809387207, "learning_rate": 2.992420938213725e-08, "loss": 0.7488, "step": 7803 }, { "epoch": 0.37203537291731226, "grad_norm": 2.824293375015259, "learning_rate": 2.9621331056480796e-08, "loss": 0.3448, "step": 7804 }, { "epoch": 0.3720830453126117, "grad_norm": 3.2116827964782715, "learning_rate": 2.931999105757699e-08, "loss": 0.3277, "step": 7805 }, { "epoch": 0.37213071770791123, "grad_norm": 1.632785677909851, "learning_rate": 2.9020189431920865e-08, "loss": 0.7778, "step": 7806 }, { "epoch": 0.37217839010321074, "grad_norm": 1.9857631921768188, "learning_rate": 2.8721926225768748e-08, "loss": 0.7373, "step": 7807 }, { "epoch": 0.37222606249851026, "grad_norm": 1.6291468143463135, "learning_rate": 2.8425201485139387e-08, "loss": 0.4981, "step": 7808 }, { "epoch": 0.3722737348938097, "grad_norm": 2.2106332778930664, "learning_rate": 2.8130015255812827e-08, "loss": 0.59, "step": 7809 }, { "epoch": 0.37232140728910923, "grad_norm": 4.437781810760498, "learning_rate": 2.7836367583335967e-08, "loss": 0.3724, "step": 7810 }, { "epoch": 0.37236907968440874, "grad_norm": 2.524610996246338, "learning_rate": 2.7544258513013678e-08, "loss": 0.8235, "step": 7811 }, { "epoch": 0.37241675207970826, "grad_norm": 1.413041591644287, "learning_rate": 2.7253688089915466e-08, "loss": 0.5837, "step": 7812 }, { "epoch": 0.37246442447500777, "grad_norm": 1.1875849962234497, "learning_rate": 2.6964656358874353e-08, "loss": 0.5419, "step": 7813 }, { "epoch": 0.37251209687030723, "grad_norm": 4.12817907333374, "learning_rate": 2.667716336448356e-08, "loss": 1.4011, "step": 7814 }, { "epoch": 0.37255976926560674, "grad_norm": 1.5169075727462769, "learning_rate": 2.639120915110094e-08, "loss": 0.6373, "step": 7815 }, { "epoch": 0.37260744166090626, "grad_norm": 2.392352819442749, "learning_rate": 2.6106793762847858e-08, "loss": 0.9112, "step": 7816 }, { "epoch": 0.37265511405620577, "grad_norm": 3.835285186767578, "learning_rate": 2.5823917243603668e-08, "loss": 0.8103, "step": 7817 }, { "epoch": 0.37270278645150523, "grad_norm": 2.305617332458496, "learning_rate": 2.5542579637015675e-08, "loss": 0.7076, "step": 7818 }, { "epoch": 0.37275045884680474, "grad_norm": 1.2971391677856445, "learning_rate": 2.5262780986491375e-08, "loss": 0.3653, "step": 7819 }, { "epoch": 0.37279813124210426, "grad_norm": 1.72105872631073, "learning_rate": 2.4984521335198464e-08, "loss": 0.6844, "step": 7820 }, { "epoch": 0.37284580363740377, "grad_norm": 2.416649103164673, "learning_rate": 2.4707800726072594e-08, "loss": 0.961, "step": 7821 }, { "epoch": 0.3728934760327033, "grad_norm": 3.378453254699707, "learning_rate": 2.4432619201806283e-08, "loss": 0.9911, "step": 7822 }, { "epoch": 0.37294114842800274, "grad_norm": 2.2994165420532227, "learning_rate": 2.4158976804858903e-08, "loss": 1.026, "step": 7823 }, { "epoch": 0.37298882082330226, "grad_norm": 2.1853156089782715, "learning_rate": 2.3886873577450008e-08, "loss": 0.7153, "step": 7824 }, { "epoch": 0.3730364932186018, "grad_norm": 2.522794485092163, "learning_rate": 2.3616309561562688e-08, "loss": 0.8567, "step": 7825 }, { "epoch": 0.3730841656139013, "grad_norm": 2.4500808715820312, "learning_rate": 2.3347284798941327e-08, "loss": 1.0215, "step": 7826 }, { "epoch": 0.37313183800920074, "grad_norm": 2.114877939224243, "learning_rate": 2.3079799331094943e-08, "loss": 0.3447, "step": 7827 }, { "epoch": 0.37317951040450026, "grad_norm": 1.5318366289138794, "learning_rate": 2.2813853199292745e-08, "loss": 0.6472, "step": 7828 }, { "epoch": 0.3732271827997998, "grad_norm": 1.7850221395492554, "learning_rate": 2.2549446444567468e-08, "loss": 0.6343, "step": 7829 }, { "epoch": 0.3732748551950993, "grad_norm": 1.692326307296753, "learning_rate": 2.2286579107716476e-08, "loss": 0.772, "step": 7830 }, { "epoch": 0.3733225275903988, "grad_norm": 2.600684881210327, "learning_rate": 2.2025251229293997e-08, "loss": 0.5936, "step": 7831 }, { "epoch": 0.37337019998569826, "grad_norm": 1.0904062986373901, "learning_rate": 2.176546284962222e-08, "loss": 0.4863, "step": 7832 }, { "epoch": 0.3734178723809978, "grad_norm": 1.6171411275863647, "learning_rate": 2.1507214008783527e-08, "loss": 1.1358, "step": 7833 }, { "epoch": 0.3734655447762973, "grad_norm": 1.8634965419769287, "learning_rate": 2.1250504746623822e-08, "loss": 0.6109, "step": 7834 }, { "epoch": 0.3735132171715968, "grad_norm": 1.39500892162323, "learning_rate": 2.0995335102749204e-08, "loss": 0.5307, "step": 7835 }, { "epoch": 0.3735608895668963, "grad_norm": 1.882065773010254, "learning_rate": 2.0741705116531507e-08, "loss": 0.7883, "step": 7836 }, { "epoch": 0.3736085619621958, "grad_norm": 3.0995166301727295, "learning_rate": 2.0489614827101656e-08, "loss": 0.8638, "step": 7837 }, { "epoch": 0.3736562343574953, "grad_norm": 2.771300792694092, "learning_rate": 2.02390642733552e-08, "loss": 0.3219, "step": 7838 }, { "epoch": 0.3737039067527948, "grad_norm": 1.7595773935317993, "learning_rate": 1.9990053493949003e-08, "loss": 0.6482, "step": 7839 }, { "epoch": 0.3737515791480943, "grad_norm": 1.6298259496688843, "learning_rate": 1.9742582527303433e-08, "loss": 0.918, "step": 7840 }, { "epoch": 0.3737992515433938, "grad_norm": 1.8266521692276, "learning_rate": 1.9496651411601285e-08, "loss": 0.8181, "step": 7841 }, { "epoch": 0.3738469239386933, "grad_norm": 4.214056015014648, "learning_rate": 1.9252260184786652e-08, "loss": 0.3962, "step": 7842 }, { "epoch": 0.3738945963339928, "grad_norm": 1.3586162328720093, "learning_rate": 1.900940888456604e-08, "loss": 0.7629, "step": 7843 }, { "epoch": 0.3739422687292923, "grad_norm": 1.6599266529083252, "learning_rate": 1.876809754840836e-08, "loss": 0.8508, "step": 7844 }, { "epoch": 0.37398994112459183, "grad_norm": 1.5912233591079712, "learning_rate": 1.8528326213548276e-08, "loss": 0.9134, "step": 7845 }, { "epoch": 0.3740376135198913, "grad_norm": 2.298814535140991, "learning_rate": 1.829009491697731e-08, "loss": 0.8547, "step": 7846 }, { "epoch": 0.3740852859151908, "grad_norm": 2.1455953121185303, "learning_rate": 1.805340369545272e-08, "loss": 1.0501, "step": 7847 }, { "epoch": 0.3741329583104903, "grad_norm": 2.4369187355041504, "learning_rate": 1.781825258549419e-08, "loss": 0.9414, "step": 7848 }, { "epoch": 0.37418063070578983, "grad_norm": 2.005153179168701, "learning_rate": 1.7584641623381583e-08, "loss": 0.8507, "step": 7849 }, { "epoch": 0.3742283031010893, "grad_norm": 3.369074583053589, "learning_rate": 1.735257084516051e-08, "loss": 0.8743, "step": 7850 }, { "epoch": 0.3742759754963888, "grad_norm": 2.91791033744812, "learning_rate": 1.7122040286636775e-08, "loss": 1.1068, "step": 7851 }, { "epoch": 0.3743236478916883, "grad_norm": 2.0598747730255127, "learning_rate": 1.6893049983378597e-08, "loss": 0.1639, "step": 7852 }, { "epoch": 0.37437132028698783, "grad_norm": 0.9947049617767334, "learning_rate": 1.6665599970715484e-08, "loss": 0.5559, "step": 7853 }, { "epoch": 0.37441899268228734, "grad_norm": 1.0994200706481934, "learning_rate": 1.6439690283742704e-08, "loss": 0.6166, "step": 7854 }, { "epoch": 0.3744666650775868, "grad_norm": 1.6171073913574219, "learning_rate": 1.6215320957315707e-08, "loss": 0.7666, "step": 7855 }, { "epoch": 0.3745143374728863, "grad_norm": 1.652296543121338, "learning_rate": 1.5992492026050134e-08, "loss": 0.4892, "step": 7856 }, { "epoch": 0.37456200986818583, "grad_norm": 1.50570809841156, "learning_rate": 1.5771203524328483e-08, "loss": 0.4487, "step": 7857 }, { "epoch": 0.37460968226348534, "grad_norm": 1.7177133560180664, "learning_rate": 1.5551455486292333e-08, "loss": 0.7813, "step": 7858 }, { "epoch": 0.37465735465878486, "grad_norm": 2.157076358795166, "learning_rate": 1.5333247945846787e-08, "loss": 0.9098, "step": 7859 }, { "epoch": 0.3747050270540843, "grad_norm": 2.961280107498169, "learning_rate": 1.5116580936658242e-08, "loss": 0.6165, "step": 7860 }, { "epoch": 0.37475269944938383, "grad_norm": 2.552809953689575, "learning_rate": 1.4901454492157742e-08, "loss": 0.6839, "step": 7861 }, { "epoch": 0.37480037184468334, "grad_norm": 1.4613701105117798, "learning_rate": 1.4687868645535398e-08, "loss": 0.4725, "step": 7862 }, { "epoch": 0.37484804423998286, "grad_norm": 2.817723512649536, "learning_rate": 1.4475823429747071e-08, "loss": 0.635, "step": 7863 }, { "epoch": 0.3748957166352823, "grad_norm": 2.326052188873291, "learning_rate": 1.4265318877507705e-08, "loss": 1.053, "step": 7864 }, { "epoch": 0.37494338903058183, "grad_norm": 3.5985987186431885, "learning_rate": 1.4056355021295764e-08, "loss": 0.4344, "step": 7865 }, { "epoch": 0.37499106142588134, "grad_norm": 1.4357486963272095, "learning_rate": 1.3848931893353235e-08, "loss": 1.0112, "step": 7866 }, { "epoch": 0.37503873382118086, "grad_norm": 1.815887212753296, "learning_rate": 1.3643049525683405e-08, "loss": 0.5912, "step": 7867 }, { "epoch": 0.3750864062164804, "grad_norm": 1.2245608568191528, "learning_rate": 1.3438707950051978e-08, "loss": 0.6778, "step": 7868 }, { "epoch": 0.37513407861177983, "grad_norm": 1.2549368143081665, "learning_rate": 1.3235907197984843e-08, "loss": 0.349, "step": 7869 }, { "epoch": 0.37518175100707934, "grad_norm": 1.3882089853286743, "learning_rate": 1.303464730077475e-08, "loss": 0.7041, "step": 7870 }, { "epoch": 0.37522942340237886, "grad_norm": 1.3840465545654297, "learning_rate": 1.2834928289472415e-08, "loss": 0.8589, "step": 7871 }, { "epoch": 0.3752770957976784, "grad_norm": 1.1382063627243042, "learning_rate": 1.2636750194892078e-08, "loss": 0.195, "step": 7872 }, { "epoch": 0.37532476819297783, "grad_norm": 1.5908664464950562, "learning_rate": 1.2440113047611502e-08, "loss": 0.7147, "step": 7873 }, { "epoch": 0.37537244058827735, "grad_norm": 1.399793267250061, "learning_rate": 1.224501687796975e-08, "loss": 0.5627, "step": 7874 }, { "epoch": 0.37542011298357686, "grad_norm": 1.5485873222351074, "learning_rate": 1.2051461716068302e-08, "loss": 0.8783, "step": 7875 }, { "epoch": 0.3754677853788764, "grad_norm": 2.1138932704925537, "learning_rate": 1.1859447591769934e-08, "loss": 0.8431, "step": 7876 }, { "epoch": 0.3755154577741759, "grad_norm": 1.1249518394470215, "learning_rate": 1.166897453470095e-08, "loss": 0.5855, "step": 7877 }, { "epoch": 0.37556313016947535, "grad_norm": 3.097949743270874, "learning_rate": 1.148004257424895e-08, "loss": 0.5385, "step": 7878 }, { "epoch": 0.37561080256477486, "grad_norm": 2.025190830230713, "learning_rate": 1.1292651739565063e-08, "loss": 0.9114, "step": 7879 }, { "epoch": 0.3756584749600744, "grad_norm": 3.2264833450317383, "learning_rate": 1.1106802059560607e-08, "loss": 0.0937, "step": 7880 }, { "epoch": 0.3757061473553739, "grad_norm": 1.4958502054214478, "learning_rate": 1.092249356291042e-08, "loss": 0.7134, "step": 7881 }, { "epoch": 0.37575381975067335, "grad_norm": 1.5587936639785767, "learning_rate": 1.0739726278052864e-08, "loss": 0.639, "step": 7882 }, { "epoch": 0.37580149214597286, "grad_norm": 1.427049160003662, "learning_rate": 1.0558500233186498e-08, "loss": 0.7456, "step": 7883 }, { "epoch": 0.3758491645412724, "grad_norm": 1.3080843687057495, "learning_rate": 1.0378815456271174e-08, "loss": 0.589, "step": 7884 }, { "epoch": 0.3758968369365719, "grad_norm": 2.67425799369812, "learning_rate": 1.0200671975031384e-08, "loss": 1.0191, "step": 7885 }, { "epoch": 0.3759445093318714, "grad_norm": 1.9029661417007446, "learning_rate": 1.002406981695292e-08, "loss": 0.7613, "step": 7886 }, { "epoch": 0.37599218172717086, "grad_norm": 1.3547945022583008, "learning_rate": 9.849009009285093e-09, "loss": 0.8375, "step": 7887 }, { "epoch": 0.3760398541224704, "grad_norm": 1.3373538255691528, "learning_rate": 9.675489579035191e-09, "loss": 0.6045, "step": 7888 }, { "epoch": 0.3760875265177699, "grad_norm": 2.000943660736084, "learning_rate": 9.503511552977351e-09, "loss": 0.9951, "step": 7889 }, { "epoch": 0.3761351989130694, "grad_norm": 7.923572540283203, "learning_rate": 9.333074957644795e-09, "loss": 2.2791, "step": 7890 }, { "epoch": 0.3761828713083689, "grad_norm": 1.4883496761322021, "learning_rate": 9.164179819335373e-09, "loss": 0.7747, "step": 7891 }, { "epoch": 0.3762305437036684, "grad_norm": 3.276047468185425, "learning_rate": 8.996826164107131e-09, "loss": 0.5226, "step": 7892 }, { "epoch": 0.3762782160989679, "grad_norm": 1.549985647201538, "learning_rate": 8.831014017780526e-09, "loss": 0.6295, "step": 7893 }, { "epoch": 0.3763258884942674, "grad_norm": 1.7809767723083496, "learning_rate": 8.666743405940647e-09, "loss": 0.8068, "step": 7894 }, { "epoch": 0.3763735608895669, "grad_norm": 3.950303077697754, "learning_rate": 8.504014353930557e-09, "loss": 1.8445, "step": 7895 }, { "epoch": 0.3764212332848664, "grad_norm": 1.5578914880752563, "learning_rate": 8.342826886857946e-09, "loss": 0.8914, "step": 7896 }, { "epoch": 0.3764689056801659, "grad_norm": 8.306102752685547, "learning_rate": 8.183181029594034e-09, "loss": 0.6091, "step": 7897 }, { "epoch": 0.3765165780754654, "grad_norm": 1.7017185688018799, "learning_rate": 8.025076806769117e-09, "loss": 0.7321, "step": 7898 }, { "epoch": 0.3765642504707649, "grad_norm": 2.048006534576416, "learning_rate": 7.868514242777015e-09, "loss": 0.761, "step": 7899 }, { "epoch": 0.37661192286606443, "grad_norm": 1.3152148723602295, "learning_rate": 7.71349336177507e-09, "loss": 0.2279, "step": 7900 }, { "epoch": 0.3766595952613639, "grad_norm": 1.1558787822723389, "learning_rate": 7.56001418767971e-09, "loss": 0.8182, "step": 7901 }, { "epoch": 0.3767072676566634, "grad_norm": 1.4645040035247803, "learning_rate": 7.408076744171988e-09, "loss": 0.6101, "step": 7902 }, { "epoch": 0.3767549400519629, "grad_norm": 1.4637107849121094, "learning_rate": 7.257681054695375e-09, "loss": 0.7785, "step": 7903 }, { "epoch": 0.37680261244726243, "grad_norm": 1.8055284023284912, "learning_rate": 7.108827142452423e-09, "loss": 0.5899, "step": 7904 }, { "epoch": 0.3768502848425619, "grad_norm": 1.9140489101409912, "learning_rate": 6.961515030410315e-09, "loss": 0.7417, "step": 7905 }, { "epoch": 0.3768979572378614, "grad_norm": 1.172397255897522, "learning_rate": 6.8157447412975365e-09, "loss": 0.6935, "step": 7906 }, { "epoch": 0.3769456296331609, "grad_norm": 1.1835538148880005, "learning_rate": 6.671516297606095e-09, "loss": 1.0454, "step": 7907 }, { "epoch": 0.37699330202846043, "grad_norm": 1.076431393623352, "learning_rate": 6.528829721588193e-09, "loss": 0.547, "step": 7908 }, { "epoch": 0.37704097442375994, "grad_norm": 2.151554584503174, "learning_rate": 6.38768503525955e-09, "loss": 0.5472, "step": 7909 }, { "epoch": 0.3770886468190594, "grad_norm": 1.8558036088943481, "learning_rate": 6.2480822603960825e-09, "loss": 0.342, "step": 7910 }, { "epoch": 0.3771363192143589, "grad_norm": 1.9963477849960327, "learning_rate": 6.110021418538337e-09, "loss": 0.6691, "step": 7911 }, { "epoch": 0.37718399160965843, "grad_norm": 1.7131110429763794, "learning_rate": 5.973502530987052e-09, "loss": 0.6267, "step": 7912 }, { "epoch": 0.37723166400495795, "grad_norm": 1.512924313545227, "learning_rate": 5.83852561880538e-09, "loss": 0.5045, "step": 7913 }, { "epoch": 0.3772793364002574, "grad_norm": 1.1113539934158325, "learning_rate": 5.705090702819993e-09, "loss": 0.4778, "step": 7914 }, { "epoch": 0.3773270087955569, "grad_norm": 1.4719964265823364, "learning_rate": 5.573197803616648e-09, "loss": 1.0057, "step": 7915 }, { "epoch": 0.37737468119085643, "grad_norm": 1.2357630729675293, "learning_rate": 5.442846941546842e-09, "loss": 0.7613, "step": 7916 }, { "epoch": 0.37742235358615595, "grad_norm": 1.7606688737869263, "learning_rate": 5.314038136722266e-09, "loss": 0.8544, "step": 7917 }, { "epoch": 0.37747002598145546, "grad_norm": 1.3121763467788696, "learning_rate": 5.1867714090148016e-09, "loss": 0.5183, "step": 7918 }, { "epoch": 0.3775176983767549, "grad_norm": 1.782693862915039, "learning_rate": 5.061046778063183e-09, "loss": 0.918, "step": 7919 }, { "epoch": 0.37756537077205443, "grad_norm": 1.8094278573989868, "learning_rate": 4.936864263264119e-09, "loss": 0.9862, "step": 7920 }, { "epoch": 0.37761304316735395, "grad_norm": 1.605290412902832, "learning_rate": 4.814223883776725e-09, "loss": 1.0203, "step": 7921 }, { "epoch": 0.37766071556265346, "grad_norm": 1.2488490343093872, "learning_rate": 4.693125658524755e-09, "loss": 0.7673, "step": 7922 }, { "epoch": 0.377708387957953, "grad_norm": 1.2578039169311523, "learning_rate": 4.573569606191042e-09, "loss": 0.5042, "step": 7923 }, { "epoch": 0.37775606035325243, "grad_norm": 1.093462347984314, "learning_rate": 4.45555574522305e-09, "loss": 0.6331, "step": 7924 }, { "epoch": 0.37780373274855195, "grad_norm": 1.4226394891738892, "learning_rate": 4.339084093828438e-09, "loss": 1.0386, "step": 7925 }, { "epoch": 0.37785140514385146, "grad_norm": 1.2890040874481201, "learning_rate": 4.224154669978386e-09, "loss": 0.7303, "step": 7926 }, { "epoch": 0.377899077539151, "grad_norm": 2.2560782432556152, "learning_rate": 4.1107674914042665e-09, "loss": 0.357, "step": 7927 }, { "epoch": 0.37794674993445043, "grad_norm": 1.2863467931747437, "learning_rate": 3.998922575600972e-09, "loss": 0.6226, "step": 7928 }, { "epoch": 0.37799442232974995, "grad_norm": 5.049374580383301, "learning_rate": 3.8886199398247005e-09, "loss": 0.4091, "step": 7929 }, { "epoch": 0.37804209472504946, "grad_norm": 3.6122961044311523, "learning_rate": 3.77985960109517e-09, "loss": 1.4993, "step": 7930 }, { "epoch": 0.378089767120349, "grad_norm": 1.6522828340530396, "learning_rate": 3.6726415761911826e-09, "loss": 0.9534, "step": 7931 }, { "epoch": 0.3781374395156485, "grad_norm": 1.0905990600585938, "learning_rate": 3.5669658816572803e-09, "loss": 0.434, "step": 7932 }, { "epoch": 0.37818511191094795, "grad_norm": 1.8408175706863403, "learning_rate": 3.462832533795979e-09, "loss": 0.6149, "step": 7933 }, { "epoch": 0.37823278430624746, "grad_norm": 3.0016887187957764, "learning_rate": 3.360241548676646e-09, "loss": 0.9752, "step": 7934 }, { "epoch": 0.378280456701547, "grad_norm": 1.4469727277755737, "learning_rate": 3.259192942125511e-09, "loss": 0.3066, "step": 7935 }, { "epoch": 0.3783281290968465, "grad_norm": 3.8612887859344482, "learning_rate": 3.1596867297345457e-09, "loss": 1.1331, "step": 7936 }, { "epoch": 0.37837580149214595, "grad_norm": 1.598137378692627, "learning_rate": 3.0617229268570248e-09, "loss": 0.3896, "step": 7937 }, { "epoch": 0.37842347388744546, "grad_norm": 1.9002468585968018, "learning_rate": 2.9653015486064143e-09, "loss": 0.6132, "step": 7938 }, { "epoch": 0.378471146282745, "grad_norm": 1.2709959745407104, "learning_rate": 2.8704226098597023e-09, "loss": 0.624, "step": 7939 }, { "epoch": 0.3785188186780445, "grad_norm": 1.3499418497085571, "learning_rate": 2.7770861252574e-09, "loss": 0.8452, "step": 7940 }, { "epoch": 0.378566491073344, "grad_norm": 4.122645378112793, "learning_rate": 2.6852921091991e-09, "loss": 0.8923, "step": 7941 }, { "epoch": 0.37861416346864346, "grad_norm": 0.9164470434188843, "learning_rate": 2.595040575846808e-09, "loss": 0.564, "step": 7942 }, { "epoch": 0.378661835863943, "grad_norm": 1.244675636291504, "learning_rate": 2.5063315391271605e-09, "loss": 0.5326, "step": 7943 }, { "epoch": 0.3787095082592425, "grad_norm": 1.5859483480453491, "learning_rate": 2.4191650127269873e-09, "loss": 0.7998, "step": 7944 }, { "epoch": 0.378757180654542, "grad_norm": 2.027064323425293, "learning_rate": 2.3335410100933096e-09, "loss": 0.2413, "step": 7945 }, { "epoch": 0.3788048530498415, "grad_norm": 1.0773860216140747, "learning_rate": 2.249459544438892e-09, "loss": 0.5909, "step": 7946 }, { "epoch": 0.378852525445141, "grad_norm": 1.8932267427444458, "learning_rate": 2.1669206287355803e-09, "loss": 0.66, "step": 7947 }, { "epoch": 0.3789001978404405, "grad_norm": 1.4737772941589355, "learning_rate": 2.0859242757187425e-09, "loss": 1.037, "step": 7948 }, { "epoch": 0.37894787023574, "grad_norm": 1.2279188632965088, "learning_rate": 2.006470497885049e-09, "loss": 0.8493, "step": 7949 }, { "epoch": 0.3789955426310395, "grad_norm": 1.3951424360275269, "learning_rate": 1.9285593074935826e-09, "loss": 0.9716, "step": 7950 }, { "epoch": 0.379043215026339, "grad_norm": 1.5984203815460205, "learning_rate": 1.8521907165658382e-09, "loss": 0.7161, "step": 7951 }, { "epoch": 0.3790908874216385, "grad_norm": 1.5910447835922241, "learning_rate": 1.7773647368835023e-09, "loss": 0.3812, "step": 7952 }, { "epoch": 0.379138559816938, "grad_norm": 1.8767844438552856, "learning_rate": 1.7040813799917844e-09, "loss": 0.3736, "step": 7953 }, { "epoch": 0.3791862322122375, "grad_norm": 7.024414539337158, "learning_rate": 1.6323406571983058e-09, "loss": 0.4531, "step": 7954 }, { "epoch": 0.37923390460753703, "grad_norm": 1.522065281867981, "learning_rate": 1.56214257957088e-09, "loss": 0.7135, "step": 7955 }, { "epoch": 0.3792815770028365, "grad_norm": 1.781845211982727, "learning_rate": 1.4934871579408428e-09, "loss": 0.9186, "step": 7956 }, { "epoch": 0.379329249398136, "grad_norm": 1.024221658706665, "learning_rate": 1.4263744029019422e-09, "loss": 0.7709, "step": 7957 }, { "epoch": 0.3793769217934355, "grad_norm": 1.4704159498214722, "learning_rate": 1.360804324807008e-09, "loss": 0.6995, "step": 7958 }, { "epoch": 0.37942459418873503, "grad_norm": 1.6574232578277588, "learning_rate": 1.2967769337746128e-09, "loss": 0.7073, "step": 7959 }, { "epoch": 0.3794722665840345, "grad_norm": 1.122309684753418, "learning_rate": 1.2342922396824108e-09, "loss": 0.8538, "step": 7960 }, { "epoch": 0.379519938979334, "grad_norm": 1.404675841331482, "learning_rate": 1.173350252171579e-09, "loss": 0.6119, "step": 7961 }, { "epoch": 0.3795676113746335, "grad_norm": 2.6873176097869873, "learning_rate": 1.113950980645706e-09, "loss": 0.7034, "step": 7962 }, { "epoch": 0.37961528376993303, "grad_norm": 1.3230880498886108, "learning_rate": 1.0560944342674627e-09, "loss": 0.6361, "step": 7963 }, { "epoch": 0.37966295616523255, "grad_norm": 1.7919119596481323, "learning_rate": 9.997806219652628e-10, "loss": 0.4311, "step": 7964 }, { "epoch": 0.379710628560532, "grad_norm": 1.6529461145401, "learning_rate": 9.450095524266012e-10, "loss": 0.8398, "step": 7965 }, { "epoch": 0.3797583009558315, "grad_norm": 1.2438713312149048, "learning_rate": 8.917812341024956e-10, "loss": 0.3345, "step": 7966 }, { "epoch": 0.37980597335113103, "grad_norm": 1.432558298110962, "learning_rate": 8.400956752063761e-10, "loss": 0.8017, "step": 7967 }, { "epoch": 0.37985364574643055, "grad_norm": 0.9874585270881653, "learning_rate": 7.899528837118642e-10, "loss": 0.6413, "step": 7968 }, { "epoch": 0.37990131814173, "grad_norm": 1.2063425779342651, "learning_rate": 7.413528673549941e-10, "loss": 0.4471, "step": 7969 }, { "epoch": 0.3799489905370295, "grad_norm": 1.660598635673523, "learning_rate": 6.942956336353224e-10, "loss": 1.0374, "step": 7970 }, { "epoch": 0.37999666293232903, "grad_norm": 2.485856294631958, "learning_rate": 6.487811898137075e-10, "loss": 0.9347, "step": 7971 }, { "epoch": 0.38004433532762855, "grad_norm": 2.2753517627716064, "learning_rate": 6.048095429111999e-10, "loss": 0.8869, "step": 7972 }, { "epoch": 0.38009200772292806, "grad_norm": 2.6413257122039795, "learning_rate": 5.623806997123726e-10, "loss": 1.1618, "step": 7973 }, { "epoch": 0.3801396801182275, "grad_norm": 1.2967902421951294, "learning_rate": 5.214946667642106e-10, "loss": 0.7868, "step": 7974 }, { "epoch": 0.38018735251352703, "grad_norm": 1.7386435270309448, "learning_rate": 4.821514503750013e-10, "loss": 0.6167, "step": 7975 }, { "epoch": 0.38023502490882655, "grad_norm": 2.0070996284484863, "learning_rate": 4.4435105661433387e-10, "loss": 0.7583, "step": 7976 }, { "epoch": 0.38028269730412606, "grad_norm": 1.1420079469680786, "learning_rate": 4.0809349131420984e-10, "loss": 0.5281, "step": 7977 }, { "epoch": 0.3803303696994256, "grad_norm": 1.9545698165893555, "learning_rate": 3.7337876007015325e-10, "loss": 0.7945, "step": 7978 }, { "epoch": 0.38037804209472503, "grad_norm": 1.9201194047927856, "learning_rate": 3.4020686823788007e-10, "loss": 0.5952, "step": 7979 }, { "epoch": 0.38042571449002455, "grad_norm": 0.9844833612442017, "learning_rate": 3.0857782093440813e-10, "loss": 0.6198, "step": 7980 }, { "epoch": 0.38047338688532406, "grad_norm": 1.3375505208969116, "learning_rate": 2.784916230402779e-10, "loss": 1.0815, "step": 7981 }, { "epoch": 0.3805210592806236, "grad_norm": 2.765907049179077, "learning_rate": 2.49948279198442e-10, "loss": 1.1167, "step": 7982 }, { "epoch": 0.38056873167592303, "grad_norm": 1.541002869606018, "learning_rate": 2.2294779381204502e-10, "loss": 0.6469, "step": 7983 }, { "epoch": 0.38061640407122255, "grad_norm": 3.628169536590576, "learning_rate": 1.974901710466437e-10, "loss": 1.2214, "step": 7984 }, { "epoch": 0.38066407646652206, "grad_norm": 2.3656258583068848, "learning_rate": 1.7357541483020712e-10, "loss": 0.727, "step": 7985 }, { "epoch": 0.3807117488618216, "grad_norm": 1.9193719625473022, "learning_rate": 1.5120352885311663e-10, "loss": 0.785, "step": 7986 }, { "epoch": 0.3807594212571211, "grad_norm": 3.311270236968994, "learning_rate": 1.3037451656705558e-10, "loss": 0.6728, "step": 7987 }, { "epoch": 0.38080709365242055, "grad_norm": 1.8771862983703613, "learning_rate": 1.1108838118500942e-10, "loss": 0.7411, "step": 7988 }, { "epoch": 0.38085476604772006, "grad_norm": 2.5218522548675537, "learning_rate": 9.334512568348608e-11, "loss": 0.9595, "step": 7989 }, { "epoch": 0.3809024384430196, "grad_norm": 1.7077536582946777, "learning_rate": 7.714475279918531e-11, "loss": 0.731, "step": 7990 }, { "epoch": 0.3809501108383191, "grad_norm": 1.424237847328186, "learning_rate": 6.248726503232938e-11, "loss": 0.6481, "step": 7991 }, { "epoch": 0.38099778323361855, "grad_norm": 1.0298527479171753, "learning_rate": 4.937266464444257e-11, "loss": 0.1631, "step": 7992 }, { "epoch": 0.38104545562891806, "grad_norm": 3.5515754222869873, "learning_rate": 3.7800953658351236e-11, "loss": 0.4999, "step": 7993 }, { "epoch": 0.3810931280242176, "grad_norm": 1.9379377365112305, "learning_rate": 2.7772133860404227e-11, "loss": 0.7415, "step": 7994 }, { "epoch": 0.3811408004195171, "grad_norm": 3.0354959964752197, "learning_rate": 1.9286206797142214e-11, "loss": 1.2463, "step": 7995 }, { "epoch": 0.3811884728148166, "grad_norm": 4.965470314025879, "learning_rate": 1.234317377862837e-11, "loss": 0.9729, "step": 7996 }, { "epoch": 0.38123614521011606, "grad_norm": 1.3164920806884766, "learning_rate": 6.943035875117688e-12, "loss": 0.6675, "step": 7997 }, { "epoch": 0.3812838176054156, "grad_norm": 1.4870984554290771, "learning_rate": 3.0857939203876584e-12, "loss": 0.8669, "step": 7998 }, { "epoch": 0.3813314900007151, "grad_norm": 1.6616401672363281, "learning_rate": 7.714485095178248e-13, "loss": 0.5931, "step": 7999 }, { "epoch": 0.3813791623960146, "grad_norm": 1.6352343559265137, "learning_rate": 0.0, "loss": 0.7391, "step": 8000 }, { "epoch": 0.3813791623960146, "eval_loss": 0.38078346848487854, "eval_runtime": 4278.4295, "eval_samples_per_second": 1.396, "eval_steps_per_second": 1.396, "step": 8000 } ], "logging_steps": 1, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.2236011257856e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }