diff --git "a/checkpoint-8000/trainer_state.json" "b/checkpoint-8000/trainer_state.json"
new file mode 100644
--- /dev/null
+++ "b/checkpoint-8000/trainer_state.json"
@@ -0,0 +1,56041 @@
+{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3813791623960146, + "eval_steps": 500, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.7672395299501825e-05, + "grad_norm": 18.2243709564209, + "learning_rate": 1e-05, + "loss": 4.1865, + "step": 1 + }, + { + "epoch": 9.534479059900365e-05, + "grad_norm": 5.30253791809082, + "learning_rate": 2e-05, + "loss": 1.7704, + "step": 2 + }, + { + "epoch": 0.00014301718589850547, + "grad_norm": 4.576740264892578, + "learning_rate": 1.999999922855149e-05, + "loss": 2.3123, + "step": 3 + }, + { + "epoch": 0.0001906895811980073, + "grad_norm": 14.161197662353516, + "learning_rate": 1.9999996914206083e-05, + "loss": 2.3138, + "step": 4 + }, + { + "epoch": 0.0002383619764975091, + "grad_norm": 4.212440490722656, + "learning_rate": 1.9999993056964127e-05, + "loss": 0.7261, + "step": 5 + }, + { + "epoch": 0.00028603437179701094, + "grad_norm": 30.711898803710938, + "learning_rate": 1.9999987656826223e-05, + "loss": 2.9592, + "step": 6 + }, + { + "epoch": 0.00033370676709651274, + "grad_norm": 17.282690048217773, + "learning_rate": 1.9999980713793205e-05, + "loss": 1.661, + "step": 7 + }, + { + "epoch": 0.0003813791623960146, + "grad_norm": 18.53205108642578, + "learning_rate": 1.9999972227866142e-05, + "loss": 1.9224, + "step": 8 + }, + { + "epoch": 0.0004290515576955164, + "grad_norm": 37.968788146972656, + "learning_rate": 1.9999962199046343e-05, + "loss": 1.9465, + "step": 9 + }, + { + "epoch": 0.0004767239529950182, + "grad_norm": 2.2596421241760254, + "learning_rate": 1.9999950627335357e-05, + "loss": 1.1301, + "step": 10 + }, + { + "epoch": 0.0005243963482945201, + "grad_norm": 4.727980136871338, + "learning_rate": 1.9999937512734968e-05, + "loss": 1.2272, + "step": 11 + }, + { + "epoch": 0.0005720687435940219, + "grad_norm": 2.9042630195617676, + "learning_rate": 1.9999922855247203e-05, + "loss": 1.3935, + "step": 12 + }, + { + "epoch": 0.0006197411388935237, + "grad_norm": 13.45541000366211, + "learning_rate": 1.999990665487432e-05, + "loss": 1.6829, + "step": 13 + }, + { + "epoch": 0.0006674135341930255, + "grad_norm": 5.85912561416626, + "learning_rate": 1.9999888911618815e-05, + "loss": 0.889, + "step": 14 + }, + { + "epoch": 0.0007150859294925274, + "grad_norm": 3.5429627895355225, + "learning_rate": 1.9999869625483433e-05, + "loss": 1.1139, + "step": 15 + }, + { + "epoch": 0.0007627583247920292, + "grad_norm": 6.764256477355957, + "learning_rate": 1.9999848796471148e-05, + "loss": 1.4913, + "step": 16 + }, + { + "epoch": 0.000810430720091531, + "grad_norm": 22.398008346557617, + "learning_rate": 1.999982642458517e-05, + "loss": 2.3193, + "step": 17 + }, + { + "epoch": 0.0008581031153910328, + "grad_norm": 22.866546630859375, + "learning_rate": 1.9999802509828955e-05, + "loss": 1.3897, + "step": 18 + }, + { + "epoch": 0.0009057755106905346, + "grad_norm": 5.639863967895508, + "learning_rate": 1.999977705220619e-05, + "loss": 1.4171, + "step": 19 + }, + { + "epoch": 0.0009534479059900364, + "grad_norm": 3.3981564044952393, + "learning_rate": 1.9999750051720802e-05, + "loss": 1.0695, + "step": 20 + }, + { + "epoch": 0.0010011203012895383, + "grad_norm": 12.594033241271973, + "learning_rate": 1.9999721508376962e-05, + "loss": 1.9061, 
+ "step": 21 + }, + { + "epoch": 0.0010487926965890401, + "grad_norm": 4.31193733215332, + "learning_rate": 1.9999691422179066e-05, + "loss": 1.7075, + "step": 22 + }, + { + "epoch": 0.001096465091888542, + "grad_norm": 5.237526893615723, + "learning_rate": 1.9999659793131764e-05, + "loss": 1.7333, + "step": 23 + }, + { + "epoch": 0.0011441374871880437, + "grad_norm": 5.530689716339111, + "learning_rate": 1.9999626621239932e-05, + "loss": 2.1986, + "step": 24 + }, + { + "epoch": 0.0011918098824875456, + "grad_norm": 9.018251419067383, + "learning_rate": 1.9999591906508686e-05, + "loss": 0.506, + "step": 25 + }, + { + "epoch": 0.0012394822777870474, + "grad_norm": 2.500774383544922, + "learning_rate": 1.9999555648943387e-05, + "loss": 0.8808, + "step": 26 + }, + { + "epoch": 0.0012871546730865492, + "grad_norm": 4.035057544708252, + "learning_rate": 1.9999517848549628e-05, + "loss": 1.2847, + "step": 27 + }, + { + "epoch": 0.001334827068386051, + "grad_norm": 4.671477794647217, + "learning_rate": 1.9999478505333236e-05, + "loss": 1.183, + "step": 28 + }, + { + "epoch": 0.001382499463685553, + "grad_norm": 4.124985694885254, + "learning_rate": 1.999943761930029e-05, + "loss": 1.4332, + "step": 29 + }, + { + "epoch": 0.0014301718589850548, + "grad_norm": 42.83099365234375, + "learning_rate": 1.9999395190457093e-05, + "loss": 2.6162, + "step": 30 + }, + { + "epoch": 0.0014778442542845566, + "grad_norm": 15.408764839172363, + "learning_rate": 1.999935121881019e-05, + "loss": 1.5032, + "step": 31 + }, + { + "epoch": 0.0015255166495840584, + "grad_norm": 4.75313663482666, + "learning_rate": 1.999930570436637e-05, + "loss": 1.1455, + "step": 32 + }, + { + "epoch": 0.0015731890448835602, + "grad_norm": 2.505239486694336, + "learning_rate": 1.9999258647132645e-05, + "loss": 1.2099, + "step": 33 + }, + { + "epoch": 0.001620861440183062, + "grad_norm": 2.3381943702697754, + "learning_rate": 1.999921004711629e-05, + "loss": 0.977, + "step": 34 + }, + { + "epoch": 0.0016685338354825638, + "grad_norm": 4.889766693115234, + "learning_rate": 1.9999159904324793e-05, + "loss": 1.7548, + "step": 35 + }, + { + "epoch": 0.0017162062307820656, + "grad_norm": 2.6877353191375732, + "learning_rate": 1.9999108218765898e-05, + "loss": 1.0806, + "step": 36 + }, + { + "epoch": 0.0017638786260815674, + "grad_norm": 6.849803924560547, + "learning_rate": 1.9999054990447576e-05, + "loss": 1.5614, + "step": 37 + }, + { + "epoch": 0.0018115510213810692, + "grad_norm": 2.4336485862731934, + "learning_rate": 1.9999000219378036e-05, + "loss": 0.7156, + "step": 38 + }, + { + "epoch": 0.001859223416680571, + "grad_norm": 8.13830280303955, + "learning_rate": 1.9998943905565733e-05, + "loss": 1.1239, + "step": 39 + }, + { + "epoch": 0.0019068958119800728, + "grad_norm": 1.631527066230774, + "learning_rate": 1.9998886049019356e-05, + "loss": 0.3686, + "step": 40 + }, + { + "epoch": 0.001954568207279575, + "grad_norm": 39.150115966796875, + "learning_rate": 1.999882664974783e-05, + "loss": 1.166, + "step": 41 + }, + { + "epoch": 0.0020022406025790767, + "grad_norm": 3.3509507179260254, + "learning_rate": 1.999876570776032e-05, + "loss": 0.5439, + "step": 42 + }, + { + "epoch": 0.0020499129978785785, + "grad_norm": 6.455533981323242, + "learning_rate": 1.999870322306623e-05, + "loss": 1.2737, + "step": 43 + }, + { + "epoch": 0.0020975853931780803, + "grad_norm": 4.2040205001831055, + "learning_rate": 1.9998639195675197e-05, + "loss": 0.9851, + "step": 44 + }, + { + "epoch": 0.002145257788477582, + "grad_norm": 6.704083442687988, + 
"learning_rate": 1.99985736255971e-05, + "loss": 1.5998, + "step": 45 + }, + { + "epoch": 0.002192930183777084, + "grad_norm": 2.616666078567505, + "learning_rate": 1.9998506512842063e-05, + "loss": 1.0957, + "step": 46 + }, + { + "epoch": 0.0022406025790765857, + "grad_norm": 3.079622745513916, + "learning_rate": 1.999843785742043e-05, + "loss": 1.069, + "step": 47 + }, + { + "epoch": 0.0022882749743760875, + "grad_norm": 1.947128415107727, + "learning_rate": 1.9998367659342804e-05, + "loss": 0.7664, + "step": 48 + }, + { + "epoch": 0.0023359473696755893, + "grad_norm": 6.57976770401001, + "learning_rate": 1.999829591862001e-05, + "loss": 1.2904, + "step": 49 + }, + { + "epoch": 0.002383619764975091, + "grad_norm": 4.031615257263184, + "learning_rate": 1.9998222635263118e-05, + "loss": 1.4492, + "step": 50 + }, + { + "epoch": 0.002431292160274593, + "grad_norm": 4.04160213470459, + "learning_rate": 1.9998147809283436e-05, + "loss": 1.1642, + "step": 51 + }, + { + "epoch": 0.0024789645555740947, + "grad_norm": 3.6480441093444824, + "learning_rate": 1.9998071440692508e-05, + "loss": 1.5053, + "step": 52 + }, + { + "epoch": 0.0025266369508735965, + "grad_norm": 2.6463139057159424, + "learning_rate": 1.9997993529502116e-05, + "loss": 0.7771, + "step": 53 + }, + { + "epoch": 0.0025743093461730983, + "grad_norm": 1.7695711851119995, + "learning_rate": 1.9997914075724283e-05, + "loss": 0.7755, + "step": 54 + }, + { + "epoch": 0.0026219817414726, + "grad_norm": 2.9251327514648438, + "learning_rate": 1.9997833079371263e-05, + "loss": 1.116, + "step": 55 + }, + { + "epoch": 0.002669654136772102, + "grad_norm": 1.9323079586029053, + "learning_rate": 1.9997750540455562e-05, + "loss": 0.4896, + "step": 56 + }, + { + "epoch": 0.0027173265320716037, + "grad_norm": 1.6657203435897827, + "learning_rate": 1.999766645898991e-05, + "loss": 0.8084, + "step": 57 + }, + { + "epoch": 0.002764998927371106, + "grad_norm": 3.26259183883667, + "learning_rate": 1.9997580834987277e-05, + "loss": 1.2315, + "step": 58 + }, + { + "epoch": 0.0028126713226706078, + "grad_norm": 1.8272356986999512, + "learning_rate": 1.9997493668460876e-05, + "loss": 0.8441, + "step": 59 + }, + { + "epoch": 0.0028603437179701096, + "grad_norm": 1.9205732345581055, + "learning_rate": 1.9997404959424153e-05, + "loss": 1.1089, + "step": 60 + }, + { + "epoch": 0.0029080161132696114, + "grad_norm": 2.3295199871063232, + "learning_rate": 1.9997314707890802e-05, + "loss": 0.7738, + "step": 61 + }, + { + "epoch": 0.002955688508569113, + "grad_norm": 3.4693026542663574, + "learning_rate": 1.9997222913874745e-05, + "loss": 1.2245, + "step": 62 + }, + { + "epoch": 0.003003360903868615, + "grad_norm": 1.4701534509658813, + "learning_rate": 1.999712957739014e-05, + "loss": 0.829, + "step": 63 + }, + { + "epoch": 0.003051033299168117, + "grad_norm": 3.402221918106079, + "learning_rate": 1.9997034698451396e-05, + "loss": 1.3311, + "step": 64 + }, + { + "epoch": 0.0030987056944676186, + "grad_norm": 2.006373167037964, + "learning_rate": 1.9996938277073146e-05, + "loss": 0.8882, + "step": 65 + }, + { + "epoch": 0.0031463780897671204, + "grad_norm": 1.5337969064712524, + "learning_rate": 1.9996840313270268e-05, + "loss": 0.8829, + "step": 66 + }, + { + "epoch": 0.003194050485066622, + "grad_norm": 2.446826457977295, + "learning_rate": 1.999674080705788e-05, + "loss": 0.9319, + "step": 67 + }, + { + "epoch": 0.003241722880366124, + "grad_norm": 5.346660614013672, + "learning_rate": 1.9996639758451323e-05, + "loss": 1.5478, + "step": 68 + }, + { + "epoch": 
0.003289395275665626, + "grad_norm": 6.304278373718262, + "learning_rate": 1.9996537167466205e-05, + "loss": 0.9192, + "step": 69 + }, + { + "epoch": 0.0033370676709651276, + "grad_norm": 60.73661422729492, + "learning_rate": 1.9996433034118342e-05, + "loss": 1.0926, + "step": 70 + }, + { + "epoch": 0.0033847400662646294, + "grad_norm": 5.363586902618408, + "learning_rate": 1.9996327358423812e-05, + "loss": 1.2102, + "step": 71 + }, + { + "epoch": 0.0034324124615641312, + "grad_norm": 16.80270004272461, + "learning_rate": 1.9996220140398907e-05, + "loss": 0.9231, + "step": 72 + }, + { + "epoch": 0.003480084856863633, + "grad_norm": 2.3622937202453613, + "learning_rate": 1.9996111380060177e-05, + "loss": 0.7771, + "step": 73 + }, + { + "epoch": 0.003527757252163135, + "grad_norm": 3.1270358562469482, + "learning_rate": 1.99960010774244e-05, + "loss": 0.9786, + "step": 74 + }, + { + "epoch": 0.0035754296474626367, + "grad_norm": 4.343189239501953, + "learning_rate": 1.9995889232508595e-05, + "loss": 0.8568, + "step": 75 + }, + { + "epoch": 0.0036231020427621385, + "grad_norm": 1.7904366254806519, + "learning_rate": 1.9995775845330022e-05, + "loss": 0.6424, + "step": 76 + }, + { + "epoch": 0.0036707744380616403, + "grad_norm": 2.8427789211273193, + "learning_rate": 1.999566091590617e-05, + "loss": 0.8684, + "step": 77 + }, + { + "epoch": 0.003718446833361142, + "grad_norm": 1.8012349605560303, + "learning_rate": 1.9995544444254777e-05, + "loss": 0.8509, + "step": 78 + }, + { + "epoch": 0.003766119228660644, + "grad_norm": 2.595200777053833, + "learning_rate": 1.9995426430393808e-05, + "loss": 0.9471, + "step": 79 + }, + { + "epoch": 0.0038137916239601457, + "grad_norm": 4.616239547729492, + "learning_rate": 1.9995306874341477e-05, + "loss": 1.3471, + "step": 80 + }, + { + "epoch": 0.003861464019259648, + "grad_norm": 6.4607343673706055, + "learning_rate": 1.9995185776116225e-05, + "loss": 1.5601, + "step": 81 + }, + { + "epoch": 0.00390913641455915, + "grad_norm": 4.7140936851501465, + "learning_rate": 1.9995063135736735e-05, + "loss": 0.3913, + "step": 82 + }, + { + "epoch": 0.0039568088098586515, + "grad_norm": 6.646099090576172, + "learning_rate": 1.999493895322194e-05, + "loss": 1.1003, + "step": 83 + }, + { + "epoch": 0.004004481205158153, + "grad_norm": 2.548227310180664, + "learning_rate": 1.9994813228590986e-05, + "loss": 0.8912, + "step": 84 + }, + { + "epoch": 0.004052153600457655, + "grad_norm": 4.441709995269775, + "learning_rate": 1.999468596186328e-05, + "loss": 0.8949, + "step": 85 + }, + { + "epoch": 0.004099825995757157, + "grad_norm": 90.19632720947266, + "learning_rate": 1.9994557153058456e-05, + "loss": 3.2277, + "step": 86 + }, + { + "epoch": 0.004147498391056659, + "grad_norm": 1.6170494556427002, + "learning_rate": 1.9994426802196384e-05, + "loss": 0.7079, + "step": 87 + }, + { + "epoch": 0.0041951707863561605, + "grad_norm": 1.2660447359085083, + "learning_rate": 1.999429490929718e-05, + "loss": 0.2052, + "step": 88 + }, + { + "epoch": 0.004242843181655662, + "grad_norm": 7.228525638580322, + "learning_rate": 1.9994161474381198e-05, + "loss": 1.056, + "step": 89 + }, + { + "epoch": 0.004290515576955164, + "grad_norm": 2.0865042209625244, + "learning_rate": 1.9994026497469016e-05, + "loss": 0.9977, + "step": 90 + }, + { + "epoch": 0.004338187972254666, + "grad_norm": 16.161222457885742, + "learning_rate": 1.9993889978581462e-05, + "loss": 0.9176, + "step": 91 + }, + { + "epoch": 0.004385860367554168, + "grad_norm": 1.9300395250320435, + "learning_rate": 
1.9993751917739606e-05, + "loss": 0.9029, + "step": 92 + }, + { + "epoch": 0.00443353276285367, + "grad_norm": 2.299837350845337, + "learning_rate": 1.999361231496474e-05, + "loss": 1.1272, + "step": 93 + }, + { + "epoch": 0.004481205158153171, + "grad_norm": 1.8956444263458252, + "learning_rate": 1.9993471170278415e-05, + "loss": 0.8191, + "step": 94 + }, + { + "epoch": 0.004528877553452673, + "grad_norm": 1.5688307285308838, + "learning_rate": 1.9993328483702393e-05, + "loss": 0.867, + "step": 95 + }, + { + "epoch": 0.004576549948752175, + "grad_norm": 2.408003807067871, + "learning_rate": 1.9993184255258705e-05, + "loss": 0.9292, + "step": 96 + }, + { + "epoch": 0.004624222344051677, + "grad_norm": 3.870917797088623, + "learning_rate": 1.9993038484969592e-05, + "loss": 0.6028, + "step": 97 + }, + { + "epoch": 0.004671894739351179, + "grad_norm": 1.854192852973938, + "learning_rate": 1.9992891172857552e-05, + "loss": 1.0358, + "step": 98 + }, + { + "epoch": 0.00471956713465068, + "grad_norm": 4.5664591789245605, + "learning_rate": 1.9992742318945307e-05, + "loss": 0.6993, + "step": 99 + }, + { + "epoch": 0.004767239529950182, + "grad_norm": 1.7928482294082642, + "learning_rate": 1.999259192325583e-05, + "loss": 0.7773, + "step": 100 + }, + { + "epoch": 0.004814911925249684, + "grad_norm": 4.041717052459717, + "learning_rate": 1.999243998581232e-05, + "loss": 0.7349, + "step": 101 + }, + { + "epoch": 0.004862584320549186, + "grad_norm": 2.914194107055664, + "learning_rate": 1.9992286506638226e-05, + "loss": 1.0398, + "step": 102 + }, + { + "epoch": 0.004910256715848688, + "grad_norm": 1.4151759147644043, + "learning_rate": 1.9992131485757223e-05, + "loss": 0.7231, + "step": 103 + }, + { + "epoch": 0.004957929111148189, + "grad_norm": 1.6251767873764038, + "learning_rate": 1.9991974923193234e-05, + "loss": 0.8153, + "step": 104 + }, + { + "epoch": 0.005005601506447691, + "grad_norm": 2.348764181137085, + "learning_rate": 1.9991816818970408e-05, + "loss": 0.8527, + "step": 105 + }, + { + "epoch": 0.005053273901747193, + "grad_norm": 2.0763232707977295, + "learning_rate": 1.9991657173113144e-05, + "loss": 0.8315, + "step": 106 + }, + { + "epoch": 0.005100946297046695, + "grad_norm": 2.3932998180389404, + "learning_rate": 1.999149598564607e-05, + "loss": 0.6754, + "step": 107 + }, + { + "epoch": 0.005148618692346197, + "grad_norm": 3.879709482192993, + "learning_rate": 1.9991333256594062e-05, + "loss": 1.2123, + "step": 108 + }, + { + "epoch": 0.0051962910876456984, + "grad_norm": 1.5737781524658203, + "learning_rate": 1.9991168985982223e-05, + "loss": 0.6245, + "step": 109 + }, + { + "epoch": 0.0052439634829452, + "grad_norm": 1.3460618257522583, + "learning_rate": 1.9991003173835898e-05, + "loss": 0.5251, + "step": 110 + }, + { + "epoch": 0.005291635878244702, + "grad_norm": 1.8699207305908203, + "learning_rate": 1.9990835820180665e-05, + "loss": 0.632, + "step": 111 + }, + { + "epoch": 0.005339308273544204, + "grad_norm": 3.712545394897461, + "learning_rate": 1.9990666925042356e-05, + "loss": 0.8691, + "step": 112 + }, + { + "epoch": 0.005386980668843706, + "grad_norm": 4.666072845458984, + "learning_rate": 1.9990496488447024e-05, + "loss": 0.9637, + "step": 113 + }, + { + "epoch": 0.0054346530641432075, + "grad_norm": 30.670461654663086, + "learning_rate": 1.9990324510420966e-05, + "loss": 0.3979, + "step": 114 + }, + { + "epoch": 0.005482325459442709, + "grad_norm": 2.244295597076416, + "learning_rate": 1.9990150990990717e-05, + "loss": 0.7061, + "step": 115 + }, + { + "epoch": 
0.005529997854742212, + "grad_norm": 1.6453338861465454, + "learning_rate": 1.998997593018305e-05, + "loss": 0.7932, + "step": 116 + }, + { + "epoch": 0.005577670250041714, + "grad_norm": 1.4517056941986084, + "learning_rate": 1.998979932802497e-05, + "loss": 0.7247, + "step": 117 + }, + { + "epoch": 0.0056253426453412156, + "grad_norm": 1.9602091312408447, + "learning_rate": 1.998962118454373e-05, + "loss": 0.7311, + "step": 118 + }, + { + "epoch": 0.005673015040640717, + "grad_norm": 2.30653715133667, + "learning_rate": 1.9989441499766814e-05, + "loss": 0.7201, + "step": 119 + }, + { + "epoch": 0.005720687435940219, + "grad_norm": 1.9459872245788574, + "learning_rate": 1.998926027372195e-05, + "loss": 1.0758, + "step": 120 + }, + { + "epoch": 0.005768359831239721, + "grad_norm": 6.566103458404541, + "learning_rate": 1.998907750643709e-05, + "loss": 1.4042, + "step": 121 + }, + { + "epoch": 0.005816032226539223, + "grad_norm": 5.204738616943359, + "learning_rate": 1.998889319794044e-05, + "loss": 0.598, + "step": 122 + }, + { + "epoch": 0.005863704621838725, + "grad_norm": 2.0690536499023438, + "learning_rate": 1.998870734826044e-05, + "loss": 1.0878, + "step": 123 + }, + { + "epoch": 0.005911377017138226, + "grad_norm": 6.710436820983887, + "learning_rate": 1.9988519957425754e-05, + "loss": 1.5044, + "step": 124 + }, + { + "epoch": 0.005959049412437728, + "grad_norm": 1.4392977952957153, + "learning_rate": 1.9988331025465298e-05, + "loss": 0.8179, + "step": 125 + }, + { + "epoch": 0.00600672180773723, + "grad_norm": 3.523577928543091, + "learning_rate": 1.998814055240823e-05, + "loss": 1.1138, + "step": 126 + }, + { + "epoch": 0.006054394203036732, + "grad_norm": 2.1186182498931885, + "learning_rate": 1.9987948538283932e-05, + "loss": 1.0725, + "step": 127 + }, + { + "epoch": 0.006102066598336234, + "grad_norm": 2.1686816215515137, + "learning_rate": 1.998775498312203e-05, + "loss": 1.1928, + "step": 128 + }, + { + "epoch": 0.006149738993635735, + "grad_norm": 1.6701370477676392, + "learning_rate": 1.998755988695239e-05, + "loss": 0.6307, + "step": 129 + }, + { + "epoch": 0.006197411388935237, + "grad_norm": 1.5671544075012207, + "learning_rate": 1.998736324980511e-05, + "loss": 0.4937, + "step": 130 + }, + { + "epoch": 0.006245083784234739, + "grad_norm": 1.2592086791992188, + "learning_rate": 1.998716507171053e-05, + "loss": 0.5423, + "step": 131 + }, + { + "epoch": 0.006292756179534241, + "grad_norm": 2.133965253829956, + "learning_rate": 1.9986965352699225e-05, + "loss": 0.9762, + "step": 132 + }, + { + "epoch": 0.006340428574833743, + "grad_norm": 2.673516273498535, + "learning_rate": 1.9986764092802015e-05, + "loss": 0.8452, + "step": 133 + }, + { + "epoch": 0.006388100970133244, + "grad_norm": 2.2198140621185303, + "learning_rate": 1.998656129204995e-05, + "loss": 1.1588, + "step": 134 + }, + { + "epoch": 0.006435773365432746, + "grad_norm": 2.097494125366211, + "learning_rate": 1.998635695047432e-05, + "loss": 0.7734, + "step": 135 + }, + { + "epoch": 0.006483445760732248, + "grad_norm": 81.89967346191406, + "learning_rate": 1.998615106810665e-05, + "loss": 1.4706, + "step": 136 + }, + { + "epoch": 0.00653111815603175, + "grad_norm": 4.292656898498535, + "learning_rate": 1.9985943644978705e-05, + "loss": 0.9319, + "step": 137 + }, + { + "epoch": 0.006578790551331252, + "grad_norm": 2.725637912750244, + "learning_rate": 1.9985734681122494e-05, + "loss": 0.8047, + "step": 138 + }, + { + "epoch": 0.0066264629466307534, + "grad_norm": 4.857945442199707, + "learning_rate": 
1.9985524176570255e-05, + "loss": 1.1442, + "step": 139 + }, + { + "epoch": 0.006674135341930255, + "grad_norm": 14.465811729431152, + "learning_rate": 1.9985312131354467e-05, + "loss": 0.9426, + "step": 140 + }, + { + "epoch": 0.006721807737229757, + "grad_norm": 2.2471907138824463, + "learning_rate": 1.9985098545507843e-05, + "loss": 0.6077, + "step": 141 + }, + { + "epoch": 0.006769480132529259, + "grad_norm": 3.9241859912872314, + "learning_rate": 1.9984883419063343e-05, + "loss": 1.3541, + "step": 142 + }, + { + "epoch": 0.006817152527828761, + "grad_norm": 2.1206214427948, + "learning_rate": 1.9984666752054152e-05, + "loss": 0.7002, + "step": 143 + }, + { + "epoch": 0.0068648249231282625, + "grad_norm": 1.9665372371673584, + "learning_rate": 1.998444854451371e-05, + "loss": 0.9509, + "step": 144 + }, + { + "epoch": 0.006912497318427764, + "grad_norm": 2.6428842544555664, + "learning_rate": 1.9984228796475672e-05, + "loss": 1.2551, + "step": 145 + }, + { + "epoch": 0.006960169713727266, + "grad_norm": 1.7541123628616333, + "learning_rate": 1.9984007507973952e-05, + "loss": 0.7907, + "step": 146 + }, + { + "epoch": 0.007007842109026768, + "grad_norm": 1.826343059539795, + "learning_rate": 1.9983784679042685e-05, + "loss": 1.2216, + "step": 147 + }, + { + "epoch": 0.00705551450432627, + "grad_norm": 1.8894635438919067, + "learning_rate": 1.998356030971626e-05, + "loss": 0.9068, + "step": 148 + }, + { + "epoch": 0.0071031868996257715, + "grad_norm": 5.640469074249268, + "learning_rate": 1.9983334400029285e-05, + "loss": 0.9861, + "step": 149 + }, + { + "epoch": 0.007150859294925273, + "grad_norm": 2.9244861602783203, + "learning_rate": 1.998310695001662e-05, + "loss": 0.9534, + "step": 150 + }, + { + "epoch": 0.007198531690224775, + "grad_norm": 1.9021021127700806, + "learning_rate": 1.9982877959713366e-05, + "loss": 0.8217, + "step": 151 + }, + { + "epoch": 0.007246204085524277, + "grad_norm": 1.8560668230056763, + "learning_rate": 1.9982647429154843e-05, + "loss": 0.8384, + "step": 152 + }, + { + "epoch": 0.007293876480823779, + "grad_norm": 1.9364761114120483, + "learning_rate": 1.9982415358376623e-05, + "loss": 1.0554, + "step": 153 + }, + { + "epoch": 0.0073415488761232805, + "grad_norm": 5.366004467010498, + "learning_rate": 1.9982181747414508e-05, + "loss": 1.3124, + "step": 154 + }, + { + "epoch": 0.007389221271422782, + "grad_norm": 2.3039839267730713, + "learning_rate": 1.998194659630455e-05, + "loss": 0.9447, + "step": 155 + }, + { + "epoch": 0.007436893666722284, + "grad_norm": 3.2765309810638428, + "learning_rate": 1.9981709905083026e-05, + "loss": 1.0979, + "step": 156 + }, + { + "epoch": 0.007484566062021786, + "grad_norm": 2.7673983573913574, + "learning_rate": 1.998147167378645e-05, + "loss": 0.9274, + "step": 157 + }, + { + "epoch": 0.007532238457321288, + "grad_norm": 1.6349414587020874, + "learning_rate": 1.9981231902451595e-05, + "loss": 0.8723, + "step": 158 + }, + { + "epoch": 0.0075799108526207895, + "grad_norm": 1.4526450634002686, + "learning_rate": 1.9980990591115437e-05, + "loss": 0.8086, + "step": 159 + }, + { + "epoch": 0.007627583247920291, + "grad_norm": 2.0796725749969482, + "learning_rate": 1.9980747739815217e-05, + "loss": 0.7309, + "step": 160 + }, + { + "epoch": 0.007675255643219793, + "grad_norm": 1.8629250526428223, + "learning_rate": 1.99805033485884e-05, + "loss": 0.943, + "step": 161 + }, + { + "epoch": 0.007722928038519296, + "grad_norm": 3.6882741451263428, + "learning_rate": 1.99802574174727e-05, + "loss": 1.3126, + "step": 162 + }, + { + 
"epoch": 0.007770600433818798, + "grad_norm": 1.8233232498168945, + "learning_rate": 1.9980009946506053e-05, + "loss": 0.9655, + "step": 163 + }, + { + "epoch": 0.0078182728291183, + "grad_norm": 1.6087697744369507, + "learning_rate": 1.9979760935726647e-05, + "loss": 0.7855, + "step": 164 + }, + { + "epoch": 0.007865945224417801, + "grad_norm": 2.790602684020996, + "learning_rate": 1.99795103851729e-05, + "loss": 1.0484, + "step": 165 + }, + { + "epoch": 0.007913617619717303, + "grad_norm": 1.838362455368042, + "learning_rate": 1.997925829488347e-05, + "loss": 0.7105, + "step": 166 + }, + { + "epoch": 0.007961290015016805, + "grad_norm": 9.126262664794922, + "learning_rate": 1.9979004664897252e-05, + "loss": 1.3776, + "step": 167 + }, + { + "epoch": 0.008008962410316307, + "grad_norm": 2.9251487255096436, + "learning_rate": 1.9978749495253378e-05, + "loss": 0.9252, + "step": 168 + }, + { + "epoch": 0.008056634805615808, + "grad_norm": 7.816552639007568, + "learning_rate": 1.9978492785991216e-05, + "loss": 1.0545, + "step": 169 + }, + { + "epoch": 0.00810430720091531, + "grad_norm": 3.071329355239868, + "learning_rate": 1.997823453715038e-05, + "loss": 1.0586, + "step": 170 + }, + { + "epoch": 0.008151979596214812, + "grad_norm": 2.9898781776428223, + "learning_rate": 1.9977974748770708e-05, + "loss": 0.8904, + "step": 171 + }, + { + "epoch": 0.008199651991514314, + "grad_norm": 1.7111653089523315, + "learning_rate": 1.9977713420892287e-05, + "loss": 0.6194, + "step": 172 + }, + { + "epoch": 0.008247324386813816, + "grad_norm": 2.2094221115112305, + "learning_rate": 1.9977450553555434e-05, + "loss": 1.0304, + "step": 173 + }, + { + "epoch": 0.008294996782113317, + "grad_norm": 2.236921548843384, + "learning_rate": 1.9977186146800707e-05, + "loss": 1.0642, + "step": 174 + }, + { + "epoch": 0.00834266917741282, + "grad_norm": 1.5356533527374268, + "learning_rate": 1.997692020066891e-05, + "loss": 0.7384, + "step": 175 + }, + { + "epoch": 0.008390341572712321, + "grad_norm": 1.6347790956497192, + "learning_rate": 1.997665271520106e-05, + "loss": 0.916, + "step": 176 + }, + { + "epoch": 0.008438013968011823, + "grad_norm": 2.9392073154449463, + "learning_rate": 1.997638369043844e-05, + "loss": 1.1304, + "step": 177 + }, + { + "epoch": 0.008485686363311325, + "grad_norm": 3.1310737133026123, + "learning_rate": 1.9976113126422553e-05, + "loss": 1.297, + "step": 178 + }, + { + "epoch": 0.008533358758610827, + "grad_norm": 1.5780115127563477, + "learning_rate": 1.997584102319514e-05, + "loss": 0.694, + "step": 179 + }, + { + "epoch": 0.008581031153910328, + "grad_norm": 3.4661989212036133, + "learning_rate": 1.9975567380798195e-05, + "loss": 1.4794, + "step": 180 + }, + { + "epoch": 0.00862870354920983, + "grad_norm": 1.9178662300109863, + "learning_rate": 1.997529219927393e-05, + "loss": 0.6866, + "step": 181 + }, + { + "epoch": 0.008676375944509332, + "grad_norm": 2.1521716117858887, + "learning_rate": 1.9975015478664802e-05, + "loss": 0.9348, + "step": 182 + }, + { + "epoch": 0.008724048339808834, + "grad_norm": 2.1723456382751465, + "learning_rate": 1.9974737219013513e-05, + "loss": 1.0957, + "step": 183 + }, + { + "epoch": 0.008771720735108336, + "grad_norm": 2.081552505493164, + "learning_rate": 1.9974457420362986e-05, + "loss": 0.66, + "step": 184 + }, + { + "epoch": 0.008819393130407837, + "grad_norm": 0.943188488483429, + "learning_rate": 1.9974176082756397e-05, + "loss": 0.3902, + "step": 185 + }, + { + "epoch": 0.00886706552570734, + "grad_norm": 1.5033038854599, + "learning_rate": 
1.9973893206237154e-05, + "loss": 0.3994, + "step": 186 + }, + { + "epoch": 0.008914737921006841, + "grad_norm": 1.846114158630371, + "learning_rate": 1.99736087908489e-05, + "loss": 0.9327, + "step": 187 + }, + { + "epoch": 0.008962410316306343, + "grad_norm": 1.4994962215423584, + "learning_rate": 1.9973322836635517e-05, + "loss": 0.6214, + "step": 188 + }, + { + "epoch": 0.009010082711605845, + "grad_norm": 2.1606719493865967, + "learning_rate": 1.9973035343641127e-05, + "loss": 0.7063, + "step": 189 + }, + { + "epoch": 0.009057755106905346, + "grad_norm": 1.8727173805236816, + "learning_rate": 1.9972746311910086e-05, + "loss": 0.8753, + "step": 190 + }, + { + "epoch": 0.009105427502204848, + "grad_norm": 3.3258538246154785, + "learning_rate": 1.997245574148699e-05, + "loss": 0.8881, + "step": 191 + }, + { + "epoch": 0.00915309989750435, + "grad_norm": 1.8385637998580933, + "learning_rate": 1.9972163632416666e-05, + "loss": 0.9557, + "step": 192 + }, + { + "epoch": 0.009200772292803852, + "grad_norm": 3.061753034591675, + "learning_rate": 1.997186998474419e-05, + "loss": 1.3106, + "step": 193 + }, + { + "epoch": 0.009248444688103354, + "grad_norm": 3.6829543113708496, + "learning_rate": 1.9971574798514862e-05, + "loss": 0.7096, + "step": 194 + }, + { + "epoch": 0.009296117083402855, + "grad_norm": 3.9840047359466553, + "learning_rate": 1.997127807377423e-05, + "loss": 0.6233, + "step": 195 + }, + { + "epoch": 0.009343789478702357, + "grad_norm": 1.8962153196334839, + "learning_rate": 1.9970979810568082e-05, + "loss": 0.7132, + "step": 196 + }, + { + "epoch": 0.009391461874001859, + "grad_norm": 1.8558872938156128, + "learning_rate": 1.9970680008942425e-05, + "loss": 0.8061, + "step": 197 + }, + { + "epoch": 0.00943913426930136, + "grad_norm": 3.1907379627227783, + "learning_rate": 1.9970378668943522e-05, + "loss": 0.7062, + "step": 198 + }, + { + "epoch": 0.009486806664600863, + "grad_norm": 3.308406352996826, + "learning_rate": 1.9970075790617865e-05, + "loss": 0.8985, + "step": 199 + }, + { + "epoch": 0.009534479059900364, + "grad_norm": 1.7010345458984375, + "learning_rate": 1.9969771374012186e-05, + "loss": 0.8402, + "step": 200 + }, + { + "epoch": 0.009582151455199866, + "grad_norm": 1.7002482414245605, + "learning_rate": 1.996946541917345e-05, + "loss": 0.7318, + "step": 201 + }, + { + "epoch": 0.009629823850499368, + "grad_norm": 2.6917359828948975, + "learning_rate": 1.996915792614887e-05, + "loss": 1.0584, + "step": 202 + }, + { + "epoch": 0.00967749624579887, + "grad_norm": 1.425291657447815, + "learning_rate": 1.9968848894985884e-05, + "loss": 0.5775, + "step": 203 + }, + { + "epoch": 0.009725168641098372, + "grad_norm": 2.461945056915283, + "learning_rate": 1.996853832573217e-05, + "loss": 0.7251, + "step": 204 + }, + { + "epoch": 0.009772841036397873, + "grad_norm": 3.177412748336792, + "learning_rate": 1.996822621843565e-05, + "loss": 0.8469, + "step": 205 + }, + { + "epoch": 0.009820513431697375, + "grad_norm": 2.5107016563415527, + "learning_rate": 1.9967912573144476e-05, + "loss": 1.1968, + "step": 206 + }, + { + "epoch": 0.009868185826996877, + "grad_norm": 3.7646501064300537, + "learning_rate": 1.9967597389907043e-05, + "loss": 1.5301, + "step": 207 + }, + { + "epoch": 0.009915858222296379, + "grad_norm": 2.7186832427978516, + "learning_rate": 1.9967280668771977e-05, + "loss": 0.8036, + "step": 208 + }, + { + "epoch": 0.00996353061759588, + "grad_norm": 1.9227368831634521, + "learning_rate": 1.996696240978815e-05, + "loss": 1.1097, + "step": 209 + }, + { + "epoch": 
0.010011203012895382, + "grad_norm": 1.5649751424789429, + "learning_rate": 1.9966642613004664e-05, + "loss": 0.8102, + "step": 210 + }, + { + "epoch": 0.010058875408194884, + "grad_norm": 4.642465591430664, + "learning_rate": 1.9966321278470856e-05, + "loss": 0.4425, + "step": 211 + }, + { + "epoch": 0.010106547803494386, + "grad_norm": 15.997318267822266, + "learning_rate": 1.9965998406236306e-05, + "loss": 0.9282, + "step": 212 + }, + { + "epoch": 0.010154220198793888, + "grad_norm": 3.695793867111206, + "learning_rate": 1.9965673996350836e-05, + "loss": 1.3906, + "step": 213 + }, + { + "epoch": 0.01020189259409339, + "grad_norm": 2.3274667263031006, + "learning_rate": 1.9965348048864495e-05, + "loss": 0.4954, + "step": 214 + }, + { + "epoch": 0.010249564989392891, + "grad_norm": 5.78277063369751, + "learning_rate": 1.9965020563827574e-05, + "loss": 0.9774, + "step": 215 + }, + { + "epoch": 0.010297237384692393, + "grad_norm": 1.8216116428375244, + "learning_rate": 1.99646915412906e-05, + "loss": 1.106, + "step": 216 + }, + { + "epoch": 0.010344909779991895, + "grad_norm": 1.2086372375488281, + "learning_rate": 1.996436098130433e-05, + "loss": 0.6089, + "step": 217 + }, + { + "epoch": 0.010392582175291397, + "grad_norm": 1.2094674110412598, + "learning_rate": 1.9964028883919783e-05, + "loss": 0.6924, + "step": 218 + }, + { + "epoch": 0.010440254570590899, + "grad_norm": 1.7790417671203613, + "learning_rate": 1.9963695249188185e-05, + "loss": 0.7085, + "step": 219 + }, + { + "epoch": 0.0104879269658904, + "grad_norm": 1.6460531949996948, + "learning_rate": 1.9963360077161015e-05, + "loss": 0.7923, + "step": 220 + }, + { + "epoch": 0.010535599361189902, + "grad_norm": 3.221555471420288, + "learning_rate": 1.996302336788999e-05, + "loss": 1.0185, + "step": 221 + }, + { + "epoch": 0.010583271756489404, + "grad_norm": 3.7141506671905518, + "learning_rate": 1.9962685121427055e-05, + "loss": 0.5976, + "step": 222 + }, + { + "epoch": 0.010630944151788906, + "grad_norm": 2.564969301223755, + "learning_rate": 1.9962345337824404e-05, + "loss": 0.5263, + "step": 223 + }, + { + "epoch": 0.010678616547088408, + "grad_norm": 3.3699798583984375, + "learning_rate": 1.996200401713446e-05, + "loss": 0.5765, + "step": 224 + }, + { + "epoch": 0.01072628894238791, + "grad_norm": 1.953573226928711, + "learning_rate": 1.9961661159409885e-05, + "loss": 1.0328, + "step": 225 + }, + { + "epoch": 0.010773961337687411, + "grad_norm": 2.079005479812622, + "learning_rate": 1.9961316764703583e-05, + "loss": 0.9661, + "step": 226 + }, + { + "epoch": 0.010821633732986913, + "grad_norm": 1.4127886295318604, + "learning_rate": 1.996097083306868e-05, + "loss": 0.8102, + "step": 227 + }, + { + "epoch": 0.010869306128286415, + "grad_norm": 5.974330902099609, + "learning_rate": 1.9960623364558555e-05, + "loss": 0.6324, + "step": 228 + }, + { + "epoch": 0.010916978523585917, + "grad_norm": 6.547445297241211, + "learning_rate": 1.9960274359226824e-05, + "loss": 0.3014, + "step": 229 + }, + { + "epoch": 0.010964650918885419, + "grad_norm": 3.30557918548584, + "learning_rate": 1.9959923817127326e-05, + "loss": 0.681, + "step": 230 + }, + { + "epoch": 0.011012323314184922, + "grad_norm": 4.872275352478027, + "learning_rate": 1.9959571738314153e-05, + "loss": 1.187, + "step": 231 + }, + { + "epoch": 0.011059995709484424, + "grad_norm": 3.176126480102539, + "learning_rate": 1.9959218122841624e-05, + "loss": 0.5215, + "step": 232 + }, + { + "epoch": 0.011107668104783926, + "grad_norm": 15.39186954498291, + "learning_rate": 
1.99588629707643e-05, + "loss": 1.0125, + "step": 233 + }, + { + "epoch": 0.011155340500083427, + "grad_norm": 5.650630950927734, + "learning_rate": 1.995850628213697e-05, + "loss": 1.4208, + "step": 234 + }, + { + "epoch": 0.01120301289538293, + "grad_norm": 5.125373363494873, + "learning_rate": 1.995814805701468e-05, + "loss": 0.4974, + "step": 235 + }, + { + "epoch": 0.011250685290682431, + "grad_norm": 3.3534553050994873, + "learning_rate": 1.9957788295452693e-05, + "loss": 0.7318, + "step": 236 + }, + { + "epoch": 0.011298357685981933, + "grad_norm": 1.9568144083023071, + "learning_rate": 1.9957426997506518e-05, + "loss": 0.8864, + "step": 237 + }, + { + "epoch": 0.011346030081281435, + "grad_norm": 2.102306842803955, + "learning_rate": 1.9957064163231896e-05, + "loss": 0.8255, + "step": 238 + }, + { + "epoch": 0.011393702476580937, + "grad_norm": 1.5542311668395996, + "learning_rate": 1.9956699792684812e-05, + "loss": 0.8405, + "step": 239 + }, + { + "epoch": 0.011441374871880438, + "grad_norm": 2.799365758895874, + "learning_rate": 1.9956333885921488e-05, + "loss": 0.5947, + "step": 240 + }, + { + "epoch": 0.01148904726717994, + "grad_norm": 5.494631290435791, + "learning_rate": 1.995596644299837e-05, + "loss": 1.1096, + "step": 241 + }, + { + "epoch": 0.011536719662479442, + "grad_norm": 3.8105666637420654, + "learning_rate": 1.9955597463972157e-05, + "loss": 1.1307, + "step": 242 + }, + { + "epoch": 0.011584392057778944, + "grad_norm": 1.5918257236480713, + "learning_rate": 1.9955226948899782e-05, + "loss": 0.6992, + "step": 243 + }, + { + "epoch": 0.011632064453078446, + "grad_norm": 2.063843250274658, + "learning_rate": 1.995485489783841e-05, + "loss": 0.6998, + "step": 244 + }, + { + "epoch": 0.011679736848377947, + "grad_norm": 1.2544373273849487, + "learning_rate": 1.9954481310845437e-05, + "loss": 0.6251, + "step": 245 + }, + { + "epoch": 0.01172740924367745, + "grad_norm": 3.872133731842041, + "learning_rate": 1.9954106187978507e-05, + "loss": 1.2245, + "step": 246 + }, + { + "epoch": 0.011775081638976951, + "grad_norm": 5.4492106437683105, + "learning_rate": 1.9953729529295504e-05, + "loss": 0.8282, + "step": 247 + }, + { + "epoch": 0.011822754034276453, + "grad_norm": 5.580592155456543, + "learning_rate": 1.9953351334854537e-05, + "loss": 1.1402, + "step": 248 + }, + { + "epoch": 0.011870426429575955, + "grad_norm": 1.815596580505371, + "learning_rate": 1.9952971604713963e-05, + "loss": 1.3938, + "step": 249 + }, + { + "epoch": 0.011918098824875456, + "grad_norm": 19.696008682250977, + "learning_rate": 1.995259033893236e-05, + "loss": 0.5092, + "step": 250 + }, + { + "epoch": 0.011965771220174958, + "grad_norm": 1.3136402368545532, + "learning_rate": 1.9952207537568563e-05, + "loss": 0.3227, + "step": 251 + }, + { + "epoch": 0.01201344361547446, + "grad_norm": 4.327956199645996, + "learning_rate": 1.9951823200681628e-05, + "loss": 1.2593, + "step": 252 + }, + { + "epoch": 0.012061116010773962, + "grad_norm": 1.2282180786132812, + "learning_rate": 1.995143732833086e-05, + "loss": 0.3331, + "step": 253 + }, + { + "epoch": 0.012108788406073464, + "grad_norm": 2.6325550079345703, + "learning_rate": 1.995104992057579e-05, + "loss": 0.5151, + "step": 254 + }, + { + "epoch": 0.012156460801372965, + "grad_norm": 2.4884750843048096, + "learning_rate": 1.9950660977476196e-05, + "loss": 0.6286, + "step": 255 + }, + { + "epoch": 0.012204133196672467, + "grad_norm": 6.9264655113220215, + "learning_rate": 1.9950270499092083e-05, + "loss": 0.7175, + "step": 256 + }, + { + "epoch": 
0.012251805591971969, + "grad_norm": 2.0482723712921143, + "learning_rate": 1.99498784854837e-05, + "loss": 0.8469, + "step": 257 + }, + { + "epoch": 0.01229947798727147, + "grad_norm": 3.3197004795074463, + "learning_rate": 1.994948493671153e-05, + "loss": 0.6618, + "step": 258 + }, + { + "epoch": 0.012347150382570973, + "grad_norm": 1.6784948110580444, + "learning_rate": 1.9949089852836297e-05, + "loss": 0.9257, + "step": 259 + }, + { + "epoch": 0.012394822777870474, + "grad_norm": 3.88934326171875, + "learning_rate": 1.994869323391895e-05, + "loss": 0.8041, + "step": 260 + }, + { + "epoch": 0.012442495173169976, + "grad_norm": 14.89253044128418, + "learning_rate": 1.9948295080020696e-05, + "loss": 1.1327, + "step": 261 + }, + { + "epoch": 0.012490167568469478, + "grad_norm": 2.403461217880249, + "learning_rate": 1.9947895391202955e-05, + "loss": 0.9128, + "step": 262 + }, + { + "epoch": 0.01253783996376898, + "grad_norm": 3.1274027824401855, + "learning_rate": 1.9947494167527398e-05, + "loss": 0.9623, + "step": 263 + }, + { + "epoch": 0.012585512359068482, + "grad_norm": 2.862778425216675, + "learning_rate": 1.9947091409055933e-05, + "loss": 1.8018, + "step": 264 + }, + { + "epoch": 0.012633184754367983, + "grad_norm": 2.613210678100586, + "learning_rate": 1.9946687115850696e-05, + "loss": 1.1379, + "step": 265 + }, + { + "epoch": 0.012680857149667485, + "grad_norm": 2.2426257133483887, + "learning_rate": 1.994628128797407e-05, + "loss": 0.5249, + "step": 266 + }, + { + "epoch": 0.012728529544966987, + "grad_norm": 1.4983394145965576, + "learning_rate": 1.9945873925488667e-05, + "loss": 0.9067, + "step": 267 + }, + { + "epoch": 0.012776201940266489, + "grad_norm": 1.6490086317062378, + "learning_rate": 1.9945465028457337e-05, + "loss": 0.9039, + "step": 268 + }, + { + "epoch": 0.01282387433556599, + "grad_norm": 1.835891604423523, + "learning_rate": 1.9945054596943177e-05, + "loss": 0.8388, + "step": 269 + }, + { + "epoch": 0.012871546730865492, + "grad_norm": 1.9135104417800903, + "learning_rate": 1.9944642631009507e-05, + "loss": 0.7224, + "step": 270 + }, + { + "epoch": 0.012919219126164994, + "grad_norm": 1.6289864778518677, + "learning_rate": 1.9944229130719885e-05, + "loss": 0.778, + "step": 271 + }, + { + "epoch": 0.012966891521464496, + "grad_norm": 1.39491868019104, + "learning_rate": 1.9943814096138116e-05, + "loss": 0.7036, + "step": 272 + }, + { + "epoch": 0.013014563916763998, + "grad_norm": 2.2777199745178223, + "learning_rate": 1.9943397527328233e-05, + "loss": 0.6181, + "step": 273 + }, + { + "epoch": 0.0130622363120635, + "grad_norm": 3.851339340209961, + "learning_rate": 1.9942979424354506e-05, + "loss": 0.4939, + "step": 274 + }, + { + "epoch": 0.013109908707363001, + "grad_norm": 2.2557308673858643, + "learning_rate": 1.9942559787281453e-05, + "loss": 0.518, + "step": 275 + }, + { + "epoch": 0.013157581102662503, + "grad_norm": 2.261568307876587, + "learning_rate": 1.994213861617381e-05, + "loss": 0.9162, + "step": 276 + }, + { + "epoch": 0.013205253497962005, + "grad_norm": 1.7167255878448486, + "learning_rate": 1.9941715911096563e-05, + "loss": 0.7062, + "step": 277 + }, + { + "epoch": 0.013252925893261507, + "grad_norm": 4.64394474029541, + "learning_rate": 1.9941291672114928e-05, + "loss": 1.1303, + "step": 278 + }, + { + "epoch": 0.013300598288561009, + "grad_norm": 1.3798185586929321, + "learning_rate": 1.9940865899294367e-05, + "loss": 0.4313, + "step": 279 + }, + { + "epoch": 0.01334827068386051, + "grad_norm": 1.8215328454971313, + "learning_rate": 
1.9940438592700568e-05, + "loss": 0.7731, + "step": 280 + }, + { + "epoch": 0.013395943079160012, + "grad_norm": 2.367222309112549, + "learning_rate": 1.9940009752399462e-05, + "loss": 0.358, + "step": 281 + }, + { + "epoch": 0.013443615474459514, + "grad_norm": 3.4352400302886963, + "learning_rate": 1.993957937845721e-05, + "loss": 0.5845, + "step": 282 + }, + { + "epoch": 0.013491287869759016, + "grad_norm": 3.778900384902954, + "learning_rate": 1.993914747094022e-05, + "loss": 0.9492, + "step": 283 + }, + { + "epoch": 0.013538960265058518, + "grad_norm": 1.7573721408843994, + "learning_rate": 1.9938714029915128e-05, + "loss": 0.8815, + "step": 284 + }, + { + "epoch": 0.01358663266035802, + "grad_norm": 2.7110109329223633, + "learning_rate": 1.9938279055448814e-05, + "loss": 1.0121, + "step": 285 + }, + { + "epoch": 0.013634305055657521, + "grad_norm": 1.6014918088912964, + "learning_rate": 1.993784254760838e-05, + "loss": 0.6007, + "step": 286 + }, + { + "epoch": 0.013681977450957023, + "grad_norm": 2.3209445476531982, + "learning_rate": 1.9937404506461187e-05, + "loss": 0.7738, + "step": 287 + }, + { + "epoch": 0.013729649846256525, + "grad_norm": 1.5373198986053467, + "learning_rate": 1.993696493207481e-05, + "loss": 0.6923, + "step": 288 + }, + { + "epoch": 0.013777322241556027, + "grad_norm": 3.516077756881714, + "learning_rate": 1.9936523824517074e-05, + "loss": 1.1835, + "step": 289 + }, + { + "epoch": 0.013824994636855529, + "grad_norm": 1.887039065361023, + "learning_rate": 1.993608118385604e-05, + "loss": 1.3008, + "step": 290 + }, + { + "epoch": 0.01387266703215503, + "grad_norm": 1.584210991859436, + "learning_rate": 1.993563701016e-05, + "loss": 0.6803, + "step": 291 + }, + { + "epoch": 0.013920339427454532, + "grad_norm": 2.204319477081299, + "learning_rate": 1.993519130349749e-05, + "loss": 0.5211, + "step": 292 + }, + { + "epoch": 0.013968011822754034, + "grad_norm": 1.4902498722076416, + "learning_rate": 1.9934744063937273e-05, + "loss": 0.8433, + "step": 293 + }, + { + "epoch": 0.014015684218053536, + "grad_norm": 1.8343721628189087, + "learning_rate": 1.9934295291548357e-05, + "loss": 0.7311, + "step": 294 + }, + { + "epoch": 0.014063356613353038, + "grad_norm": 2.033521890640259, + "learning_rate": 1.9933844986399977e-05, + "loss": 0.892, + "step": 295 + }, + { + "epoch": 0.01411102900865254, + "grad_norm": 1.453253149986267, + "learning_rate": 1.9933393148561616e-05, + "loss": 0.9422, + "step": 296 + }, + { + "epoch": 0.014158701403952041, + "grad_norm": 2.0837976932525635, + "learning_rate": 1.9932939778102985e-05, + "loss": 0.9896, + "step": 297 + }, + { + "epoch": 0.014206373799251543, + "grad_norm": 2.659848213195801, + "learning_rate": 1.9932484875094036e-05, + "loss": 0.9624, + "step": 298 + }, + { + "epoch": 0.014254046194551045, + "grad_norm": 3.5798799991607666, + "learning_rate": 1.9932028439604958e-05, + "loss": 0.6397, + "step": 299 + }, + { + "epoch": 0.014301718589850547, + "grad_norm": 4.200873851776123, + "learning_rate": 1.993157047170617e-05, + "loss": 1.1005, + "step": 300 + }, + { + "epoch": 0.014349390985150048, + "grad_norm": 2.4092493057250977, + "learning_rate": 1.9931110971468332e-05, + "loss": 0.6148, + "step": 301 + }, + { + "epoch": 0.01439706338044955, + "grad_norm": 1.8427973985671997, + "learning_rate": 1.9930649938962344e-05, + "loss": 0.7859, + "step": 302 + }, + { + "epoch": 0.014444735775749052, + "grad_norm": 3.5022478103637695, + "learning_rate": 1.9930187374259338e-05, + "loss": 0.7012, + "step": 303 + }, + { + "epoch": 
0.014492408171048554, + "grad_norm": 2.042717218399048, + "learning_rate": 1.992972327743068e-05, + "loss": 0.7806, + "step": 304 + }, + { + "epoch": 0.014540080566348056, + "grad_norm": 2.3085291385650635, + "learning_rate": 1.9929257648547976e-05, + "loss": 0.7839, + "step": 305 + }, + { + "epoch": 0.014587752961647557, + "grad_norm": 2.7668402194976807, + "learning_rate": 1.992879048768307e-05, + "loss": 0.7127, + "step": 306 + }, + { + "epoch": 0.01463542535694706, + "grad_norm": 4.952498912811279, + "learning_rate": 1.9928321794908035e-05, + "loss": 1.4776, + "step": 307 + }, + { + "epoch": 0.014683097752246561, + "grad_norm": 3.2454190254211426, + "learning_rate": 1.992785157029519e-05, + "loss": 0.8308, + "step": 308 + }, + { + "epoch": 0.014730770147546063, + "grad_norm": 2.0695793628692627, + "learning_rate": 1.9927379813917087e-05, + "loss": 0.678, + "step": 309 + }, + { + "epoch": 0.014778442542845565, + "grad_norm": 26.816200256347656, + "learning_rate": 1.992690652584651e-05, + "loss": 1.2626, + "step": 310 + }, + { + "epoch": 0.014826114938145066, + "grad_norm": 6.531989574432373, + "learning_rate": 1.992643170615648e-05, + "loss": 2.0021, + "step": 311 + }, + { + "epoch": 0.014873787333444568, + "grad_norm": 3.469572067260742, + "learning_rate": 1.9925955354920265e-05, + "loss": 0.8604, + "step": 312 + }, + { + "epoch": 0.01492145972874407, + "grad_norm": 2.2434329986572266, + "learning_rate": 1.9925477472211356e-05, + "loss": 0.8491, + "step": 313 + }, + { + "epoch": 0.014969132124043572, + "grad_norm": 3.034557342529297, + "learning_rate": 1.9924998058103483e-05, + "loss": 1.5331, + "step": 314 + }, + { + "epoch": 0.015016804519343074, + "grad_norm": 1.6433539390563965, + "learning_rate": 1.9924517112670617e-05, + "loss": 0.7086, + "step": 315 + }, + { + "epoch": 0.015064476914642575, + "grad_norm": 1.574570894241333, + "learning_rate": 1.9924034635986968e-05, + "loss": 0.7385, + "step": 316 + }, + { + "epoch": 0.015112149309942077, + "grad_norm": 3.6537985801696777, + "learning_rate": 1.992355062812697e-05, + "loss": 0.8102, + "step": 317 + }, + { + "epoch": 0.015159821705241579, + "grad_norm": 2.8240246772766113, + "learning_rate": 1.99230650891653e-05, + "loss": 0.8773, + "step": 318 + }, + { + "epoch": 0.015207494100541081, + "grad_norm": 2.016932487487793, + "learning_rate": 1.9922578019176878e-05, + "loss": 0.8061, + "step": 319 + }, + { + "epoch": 0.015255166495840583, + "grad_norm": 1.351644515991211, + "learning_rate": 1.992208941823685e-05, + "loss": 0.7521, + "step": 320 + }, + { + "epoch": 0.015302838891140084, + "grad_norm": 1.7155174016952515, + "learning_rate": 1.99215992864206e-05, + "loss": 0.8785, + "step": 321 + }, + { + "epoch": 0.015350511286439586, + "grad_norm": 2.364560842514038, + "learning_rate": 1.9921107623803757e-05, + "loss": 0.7609, + "step": 322 + }, + { + "epoch": 0.01539818368173909, + "grad_norm": 2.028265953063965, + "learning_rate": 1.9920614430462173e-05, + "loss": 1.0037, + "step": 323 + }, + { + "epoch": 0.015445856077038592, + "grad_norm": 2.755866527557373, + "learning_rate": 1.9920119706471944e-05, + "loss": 0.6831, + "step": 324 + }, + { + "epoch": 0.015493528472338093, + "grad_norm": 1.3015339374542236, + "learning_rate": 1.9919623451909402e-05, + "loss": 0.6163, + "step": 325 + }, + { + "epoch": 0.015541200867637595, + "grad_norm": 2.2389609813690186, + "learning_rate": 1.9919125666851115e-05, + "loss": 0.4706, + "step": 326 + }, + { + "epoch": 0.015588873262937097, + "grad_norm": 2.114124298095703, + "learning_rate": 
1.9918626351373885e-05, + "loss": 0.8427, + "step": 327 + }, + { + "epoch": 0.0156365456582366, + "grad_norm": 3.373286008834839, + "learning_rate": 1.991812550555475e-05, + "loss": 0.8408, + "step": 328 + }, + { + "epoch": 0.0156842180535361, + "grad_norm": 5.1343913078308105, + "learning_rate": 1.9917623129470985e-05, + "loss": 1.4614, + "step": 329 + }, + { + "epoch": 0.015731890448835602, + "grad_norm": 1.4291434288024902, + "learning_rate": 1.99171192232001e-05, + "loss": 0.5323, + "step": 330 + }, + { + "epoch": 0.015779562844135103, + "grad_norm": 1.5069769620895386, + "learning_rate": 1.9916613786819856e-05, + "loss": 0.7634, + "step": 331 + }, + { + "epoch": 0.015827235239434606, + "grad_norm": 1.5601898431777954, + "learning_rate": 1.991610682040822e-05, + "loss": 0.8062, + "step": 332 + }, + { + "epoch": 0.015874907634734106, + "grad_norm": 2.073275327682495, + "learning_rate": 1.9915598324043415e-05, + "loss": 1.0942, + "step": 333 + }, + { + "epoch": 0.01592258003003361, + "grad_norm": 3.8580596446990967, + "learning_rate": 1.9915088297803905e-05, + "loss": 1.0956, + "step": 334 + }, + { + "epoch": 0.01597025242533311, + "grad_norm": 2.2373106479644775, + "learning_rate": 1.9914576741768373e-05, + "loss": 0.8886, + "step": 335 + }, + { + "epoch": 0.016017924820632613, + "grad_norm": 1.4013243913650513, + "learning_rate": 1.991406365601575e-05, + "loss": 0.8137, + "step": 336 + }, + { + "epoch": 0.016065597215932113, + "grad_norm": 1.8992758989334106, + "learning_rate": 1.99135490406252e-05, + "loss": 0.9923, + "step": 337 + }, + { + "epoch": 0.016113269611231617, + "grad_norm": 2.6627261638641357, + "learning_rate": 1.9913032895676126e-05, + "loss": 0.5913, + "step": 338 + }, + { + "epoch": 0.016160942006531117, + "grad_norm": 2.40303897857666, + "learning_rate": 1.9912515221248157e-05, + "loss": 0.861, + "step": 339 + }, + { + "epoch": 0.01620861440183062, + "grad_norm": 1.565407633781433, + "learning_rate": 1.9911996017421168e-05, + "loss": 0.7945, + "step": 340 + }, + { + "epoch": 0.01625628679713012, + "grad_norm": 1.8255527019500732, + "learning_rate": 1.991147528427527e-05, + "loss": 0.9651, + "step": 341 + }, + { + "epoch": 0.016303959192429624, + "grad_norm": 3.6867518424987793, + "learning_rate": 1.9910953021890802e-05, + "loss": 0.7636, + "step": 342 + }, + { + "epoch": 0.016351631587729124, + "grad_norm": 1.8927056789398193, + "learning_rate": 1.9910429230348348e-05, + "loss": 0.7816, + "step": 343 + }, + { + "epoch": 0.016399303983028628, + "grad_norm": 2.944861650466919, + "learning_rate": 1.9909903909728722e-05, + "loss": 1.089, + "step": 344 + }, + { + "epoch": 0.016446976378328128, + "grad_norm": 3.066411018371582, + "learning_rate": 1.9909377060112973e-05, + "loss": 1.1133, + "step": 345 + }, + { + "epoch": 0.01649464877362763, + "grad_norm": 7.740360260009766, + "learning_rate": 1.990884868158239e-05, + "loss": 1.1003, + "step": 346 + }, + { + "epoch": 0.01654232116892713, + "grad_norm": 4.239732265472412, + "learning_rate": 1.9908318774218498e-05, + "loss": 0.8084, + "step": 347 + }, + { + "epoch": 0.016589993564226635, + "grad_norm": 2.522313117980957, + "learning_rate": 1.9907787338103054e-05, + "loss": 0.9969, + "step": 348 + }, + { + "epoch": 0.016637665959526135, + "grad_norm": 2.228370189666748, + "learning_rate": 1.9907254373318054e-05, + "loss": 0.4467, + "step": 349 + }, + { + "epoch": 0.01668533835482564, + "grad_norm": 1.6454826593399048, + "learning_rate": 1.9906719879945733e-05, + "loss": 0.9234, + "step": 350 + }, + { + "epoch": 
0.01673301075012514, + "grad_norm": 3.1763508319854736, + "learning_rate": 1.990618385806855e-05, + "loss": 0.5725, + "step": 351 + }, + { + "epoch": 0.016780683145424642, + "grad_norm": 2.071532964706421, + "learning_rate": 1.9905646307769212e-05, + "loss": 0.6487, + "step": 352 + }, + { + "epoch": 0.016828355540724142, + "grad_norm": 2.020214796066284, + "learning_rate": 1.990510722913066e-05, + "loss": 0.9457, + "step": 353 + }, + { + "epoch": 0.016876027936023646, + "grad_norm": 2.9650723934173584, + "learning_rate": 1.9904566622236064e-05, + "loss": 1.0136, + "step": 354 + }, + { + "epoch": 0.016923700331323146, + "grad_norm": 48.10707473754883, + "learning_rate": 1.9904024487168835e-05, + "loss": 0.7203, + "step": 355 + }, + { + "epoch": 0.01697137272662265, + "grad_norm": 3.6566269397735596, + "learning_rate": 1.9903480824012617e-05, + "loss": 0.6368, + "step": 356 + }, + { + "epoch": 0.01701904512192215, + "grad_norm": 2.7905960083007812, + "learning_rate": 1.9902935632851296e-05, + "loss": 0.6772, + "step": 357 + }, + { + "epoch": 0.017066717517221653, + "grad_norm": 1.4937987327575684, + "learning_rate": 1.9902388913768987e-05, + "loss": 0.7707, + "step": 358 + }, + { + "epoch": 0.017114389912521153, + "grad_norm": 2.0287539958953857, + "learning_rate": 1.9901840666850045e-05, + "loss": 1.1541, + "step": 359 + }, + { + "epoch": 0.017162062307820657, + "grad_norm": 1.8322936296463013, + "learning_rate": 1.9901290892179056e-05, + "loss": 0.7406, + "step": 360 + }, + { + "epoch": 0.017209734703120157, + "grad_norm": 2.9989206790924072, + "learning_rate": 1.9900739589840846e-05, + "loss": 0.6528, + "step": 361 + }, + { + "epoch": 0.01725740709841966, + "grad_norm": 2.342498540878296, + "learning_rate": 1.9900186759920475e-05, + "loss": 0.8852, + "step": 362 + }, + { + "epoch": 0.01730507949371916, + "grad_norm": 2.000173330307007, + "learning_rate": 1.9899632402503242e-05, + "loss": 0.7001, + "step": 363 + }, + { + "epoch": 0.017352751889018664, + "grad_norm": 3.8555989265441895, + "learning_rate": 1.9899076517674674e-05, + "loss": 1.108, + "step": 364 + }, + { + "epoch": 0.017400424284318164, + "grad_norm": 2.1533634662628174, + "learning_rate": 1.9898519105520537e-05, + "loss": 0.746, + "step": 365 + }, + { + "epoch": 0.017448096679617667, + "grad_norm": 1.9343632459640503, + "learning_rate": 1.989796016612684e-05, + "loss": 0.7821, + "step": 366 + }, + { + "epoch": 0.017495769074917168, + "grad_norm": 1.9814963340759277, + "learning_rate": 1.989739969957982e-05, + "loss": 0.9643, + "step": 367 + }, + { + "epoch": 0.01754344147021667, + "grad_norm": 2.1211135387420654, + "learning_rate": 1.9896837705965946e-05, + "loss": 0.7709, + "step": 368 + }, + { + "epoch": 0.017591113865516175, + "grad_norm": 3.451181173324585, + "learning_rate": 1.9896274185371934e-05, + "loss": 1.0099, + "step": 369 + }, + { + "epoch": 0.017638786260815675, + "grad_norm": 1.3221015930175781, + "learning_rate": 1.9895709137884727e-05, + "loss": 0.8877, + "step": 370 + }, + { + "epoch": 0.017686458656115178, + "grad_norm": 2.346433401107788, + "learning_rate": 1.989514256359151e-05, + "loss": 0.9619, + "step": 371 + }, + { + "epoch": 0.01773413105141468, + "grad_norm": 2.7488558292388916, + "learning_rate": 1.9894574462579688e-05, + "loss": 1.2417, + "step": 372 + }, + { + "epoch": 0.017781803446714182, + "grad_norm": 2.576575517654419, + "learning_rate": 1.9894004834936924e-05, + "loss": 1.07, + "step": 373 + }, + { + "epoch": 0.017829475842013682, + "grad_norm": 1.4878088235855103, + "learning_rate": 
1.9893433680751105e-05, + "loss": 0.6564, + "step": 374 + }, + { + "epoch": 0.017877148237313185, + "grad_norm": 1.783627986907959, + "learning_rate": 1.989286100011035e-05, + "loss": 0.4787, + "step": 375 + }, + { + "epoch": 0.017924820632612685, + "grad_norm": 1.658076524734497, + "learning_rate": 1.9892286793103018e-05, + "loss": 0.6887, + "step": 376 + }, + { + "epoch": 0.01797249302791219, + "grad_norm": 1.9088085889816284, + "learning_rate": 1.9891711059817705e-05, + "loss": 1.2357, + "step": 377 + }, + { + "epoch": 0.01802016542321169, + "grad_norm": 1.7909817695617676, + "learning_rate": 1.9891133800343245e-05, + "loss": 0.5678, + "step": 378 + }, + { + "epoch": 0.018067837818511193, + "grad_norm": 2.40408992767334, + "learning_rate": 1.989055501476869e-05, + "loss": 0.7371, + "step": 379 + }, + { + "epoch": 0.018115510213810693, + "grad_norm": 2.4356093406677246, + "learning_rate": 1.9889974703183354e-05, + "loss": 1.007, + "step": 380 + }, + { + "epoch": 0.018163182609110196, + "grad_norm": 1.4276548624038696, + "learning_rate": 1.988939286567677e-05, + "loss": 0.8721, + "step": 381 + }, + { + "epoch": 0.018210855004409696, + "grad_norm": 1.4086828231811523, + "learning_rate": 1.9888809502338706e-05, + "loss": 0.9469, + "step": 382 + }, + { + "epoch": 0.0182585273997092, + "grad_norm": 3.424661636352539, + "learning_rate": 1.988822461325917e-05, + "loss": 0.2993, + "step": 383 + }, + { + "epoch": 0.0183061997950087, + "grad_norm": 1.3863457441329956, + "learning_rate": 1.988763819852841e-05, + "loss": 1.0035, + "step": 384 + }, + { + "epoch": 0.018353872190308203, + "grad_norm": 1.849429726600647, + "learning_rate": 1.9887050258236894e-05, + "loss": 0.6809, + "step": 385 + }, + { + "epoch": 0.018401544585607704, + "grad_norm": 1.7745966911315918, + "learning_rate": 1.988646079247534e-05, + "loss": 0.6901, + "step": 386 + }, + { + "epoch": 0.018449216980907207, + "grad_norm": 1.6295166015625, + "learning_rate": 1.9885869801334697e-05, + "loss": 0.8989, + "step": 387 + }, + { + "epoch": 0.018496889376206707, + "grad_norm": 6.392057418823242, + "learning_rate": 1.988527728490615e-05, + "loss": 0.5829, + "step": 388 + }, + { + "epoch": 0.01854456177150621, + "grad_norm": 6.590545177459717, + "learning_rate": 1.9884683243281117e-05, + "loss": 1.1739, + "step": 389 + }, + { + "epoch": 0.01859223416680571, + "grad_norm": 1.6761903762817383, + "learning_rate": 1.988408767655125e-05, + "loss": 0.9709, + "step": 390 + }, + { + "epoch": 0.018639906562105214, + "grad_norm": 1.8666620254516602, + "learning_rate": 1.9883490584808443e-05, + "loss": 0.5188, + "step": 391 + }, + { + "epoch": 0.018687578957404714, + "grad_norm": 1.5060175657272339, + "learning_rate": 1.9882891968144816e-05, + "loss": 0.7627, + "step": 392 + }, + { + "epoch": 0.018735251352704218, + "grad_norm": 2.627070903778076, + "learning_rate": 1.9882291826652735e-05, + "loss": 0.5643, + "step": 393 + }, + { + "epoch": 0.018782923748003718, + "grad_norm": 3.0624523162841797, + "learning_rate": 1.988169016042479e-05, + "loss": 0.9113, + "step": 394 + }, + { + "epoch": 0.01883059614330322, + "grad_norm": 1.9177439212799072, + "learning_rate": 1.988108696955382e-05, + "loss": 0.9923, + "step": 395 + }, + { + "epoch": 0.01887826853860272, + "grad_norm": 1.3925445079803467, + "learning_rate": 1.988048225413288e-05, + "loss": 1.0391, + "step": 396 + }, + { + "epoch": 0.018925940933902225, + "grad_norm": 1.8818647861480713, + "learning_rate": 1.9879876014255283e-05, + "loss": 0.8457, + "step": 397 + }, + { + "epoch": 
0.018973613329201725, + "grad_norm": 15.9513578414917, + "learning_rate": 1.9879268250014558e-05, + "loss": 1.0194, + "step": 398 + }, + { + "epoch": 0.01902128572450123, + "grad_norm": 1.5039842128753662, + "learning_rate": 1.987865896150448e-05, + "loss": 0.3371, + "step": 399 + }, + { + "epoch": 0.01906895811980073, + "grad_norm": 1.3381415605545044, + "learning_rate": 1.9878048148819054e-05, + "loss": 0.7649, + "step": 400 + }, + { + "epoch": 0.019116630515100232, + "grad_norm": 1.379574179649353, + "learning_rate": 1.9877435812052522e-05, + "loss": 0.6547, + "step": 401 + }, + { + "epoch": 0.019164302910399732, + "grad_norm": 1.1685925722122192, + "learning_rate": 1.9876821951299362e-05, + "loss": 0.2342, + "step": 402 + }, + { + "epoch": 0.019211975305699236, + "grad_norm": 12.052519798278809, + "learning_rate": 1.9876206566654285e-05, + "loss": 0.3376, + "step": 403 + }, + { + "epoch": 0.019259647700998736, + "grad_norm": 1.135711908340454, + "learning_rate": 1.9875589658212244e-05, + "loss": 0.4995, + "step": 404 + }, + { + "epoch": 0.01930732009629824, + "grad_norm": 1.9455606937408447, + "learning_rate": 1.9874971226068417e-05, + "loss": 0.7776, + "step": 405 + }, + { + "epoch": 0.01935499249159774, + "grad_norm": 1.3894046545028687, + "learning_rate": 1.987435127031822e-05, + "loss": 0.3752, + "step": 406 + }, + { + "epoch": 0.019402664886897243, + "grad_norm": 2.79687237739563, + "learning_rate": 1.987372979105731e-05, + "loss": 0.9352, + "step": 407 + }, + { + "epoch": 0.019450337282196743, + "grad_norm": 2.8667871952056885, + "learning_rate": 1.987310678838157e-05, + "loss": 1.3754, + "step": 408 + }, + { + "epoch": 0.019498009677496247, + "grad_norm": 1.4679111242294312, + "learning_rate": 1.9872482262387128e-05, + "loss": 0.5814, + "step": 409 + }, + { + "epoch": 0.019545682072795747, + "grad_norm": 1.8989754915237427, + "learning_rate": 1.987185621317034e-05, + "loss": 0.8112, + "step": 410 + }, + { + "epoch": 0.01959335446809525, + "grad_norm": 2.3282973766326904, + "learning_rate": 1.98712286408278e-05, + "loss": 1.0023, + "step": 411 + }, + { + "epoch": 0.01964102686339475, + "grad_norm": 1.5745048522949219, + "learning_rate": 1.9870599545456333e-05, + "loss": 0.8833, + "step": 412 + }, + { + "epoch": 0.019688699258694254, + "grad_norm": 4.777143478393555, + "learning_rate": 1.9869968927153005e-05, + "loss": 1.5177, + "step": 413 + }, + { + "epoch": 0.019736371653993754, + "grad_norm": 2.690695285797119, + "learning_rate": 1.986933678601511e-05, + "loss": 1.1467, + "step": 414 + }, + { + "epoch": 0.019784044049293258, + "grad_norm": 2.625542402267456, + "learning_rate": 1.9868703122140186e-05, + "loss": 0.4268, + "step": 415 + }, + { + "epoch": 0.019831716444592758, + "grad_norm": 1.2323603630065918, + "learning_rate": 1.9868067935625997e-05, + "loss": 0.5527, + "step": 416 + }, + { + "epoch": 0.01987938883989226, + "grad_norm": 8.139171600341797, + "learning_rate": 1.9867431226570546e-05, + "loss": 1.183, + "step": 417 + }, + { + "epoch": 0.01992706123519176, + "grad_norm": 1.7020472288131714, + "learning_rate": 1.9866792995072073e-05, + "loss": 0.9839, + "step": 418 + }, + { + "epoch": 0.019974733630491265, + "grad_norm": 1.2943341732025146, + "learning_rate": 1.986615324122905e-05, + "loss": 0.4531, + "step": 419 + }, + { + "epoch": 0.020022406025790765, + "grad_norm": 1.480353832244873, + "learning_rate": 1.986551196514018e-05, + "loss": 0.8439, + "step": 420 + }, + { + "epoch": 0.02007007842109027, + "grad_norm": 2.9385576248168945, + "learning_rate": 
1.9864869166904412e-05, + "loss": 0.7533, + "step": 421 + }, + { + "epoch": 0.02011775081638977, + "grad_norm": 1.6832324266433716, + "learning_rate": 1.986422484662092e-05, + "loss": 0.8662, + "step": 422 + }, + { + "epoch": 0.020165423211689272, + "grad_norm": 1.4676250219345093, + "learning_rate": 1.9863579004389115e-05, + "loss": 0.9441, + "step": 423 + }, + { + "epoch": 0.020213095606988772, + "grad_norm": 1.6102615594863892, + "learning_rate": 1.9862931640308648e-05, + "loss": 0.651, + "step": 424 + }, + { + "epoch": 0.020260768002288276, + "grad_norm": 3.2582168579101562, + "learning_rate": 1.9862282754479394e-05, + "loss": 1.0099, + "step": 425 + }, + { + "epoch": 0.020308440397587776, + "grad_norm": 1.4158923625946045, + "learning_rate": 1.9861632347001474e-05, + "loss": 0.7244, + "step": 426 + }, + { + "epoch": 0.02035611279288728, + "grad_norm": 2.4489970207214355, + "learning_rate": 1.986098041797524e-05, + "loss": 0.9127, + "step": 427 + }, + { + "epoch": 0.02040378518818678, + "grad_norm": 1.574802279472351, + "learning_rate": 1.986032696750127e-05, + "loss": 0.9991, + "step": 428 + }, + { + "epoch": 0.020451457583486283, + "grad_norm": 5.254917144775391, + "learning_rate": 1.9859671995680395e-05, + "loss": 0.4747, + "step": 429 + }, + { + "epoch": 0.020499129978785783, + "grad_norm": 1.2564741373062134, + "learning_rate": 1.9859015502613666e-05, + "loss": 0.5081, + "step": 430 + }, + { + "epoch": 0.020546802374085286, + "grad_norm": 1.4141570329666138, + "learning_rate": 1.9858357488402374e-05, + "loss": 1.0678, + "step": 431 + }, + { + "epoch": 0.020594474769384787, + "grad_norm": 1.6442327499389648, + "learning_rate": 1.985769795314804e-05, + "loss": 0.6662, + "step": 432 + }, + { + "epoch": 0.02064214716468429, + "grad_norm": 1.1709163188934326, + "learning_rate": 1.985703689695243e-05, + "loss": 0.635, + "step": 433 + }, + { + "epoch": 0.02068981955998379, + "grad_norm": 1.7995291948318481, + "learning_rate": 1.9856374319917528e-05, + "loss": 0.6015, + "step": 434 + }, + { + "epoch": 0.020737491955283294, + "grad_norm": 1.5249440670013428, + "learning_rate": 1.9855710222145576e-05, + "loss": 0.604, + "step": 435 + }, + { + "epoch": 0.020785164350582794, + "grad_norm": 5.639601230621338, + "learning_rate": 1.985504460373903e-05, + "loss": 0.7747, + "step": 436 + }, + { + "epoch": 0.020832836745882297, + "grad_norm": 1.7618529796600342, + "learning_rate": 1.9854377464800586e-05, + "loss": 0.8751, + "step": 437 + }, + { + "epoch": 0.020880509141181797, + "grad_norm": 3.759244680404663, + "learning_rate": 1.9853708805433182e-05, + "loss": 1.1403, + "step": 438 + }, + { + "epoch": 0.0209281815364813, + "grad_norm": 1.4865508079528809, + "learning_rate": 1.985303862573998e-05, + "loss": 0.4212, + "step": 439 + }, + { + "epoch": 0.0209758539317808, + "grad_norm": 1.1901158094406128, + "learning_rate": 1.9852366925824393e-05, + "loss": 0.49, + "step": 440 + }, + { + "epoch": 0.021023526327080305, + "grad_norm": 1.6764826774597168, + "learning_rate": 1.985169370579004e-05, + "loss": 0.8045, + "step": 441 + }, + { + "epoch": 0.021071198722379805, + "grad_norm": 1.5856084823608398, + "learning_rate": 1.9851018965740806e-05, + "loss": 0.6927, + "step": 442 + }, + { + "epoch": 0.021118871117679308, + "grad_norm": 3.426955223083496, + "learning_rate": 1.9850342705780788e-05, + "loss": 1.0903, + "step": 443 + }, + { + "epoch": 0.021166543512978808, + "grad_norm": 1.8891679048538208, + "learning_rate": 1.984966492601433e-05, + "loss": 1.126, + "step": 444 + }, + { + "epoch": 
0.021214215908278312, + "grad_norm": 2.5558974742889404, + "learning_rate": 1.984898562654601e-05, + "loss": 1.1514, + "step": 445 + }, + { + "epoch": 0.021261888303577812, + "grad_norm": 1.8330786228179932, + "learning_rate": 1.984830480748063e-05, + "loss": 0.8547, + "step": 446 + }, + { + "epoch": 0.021309560698877315, + "grad_norm": 2.5390944480895996, + "learning_rate": 1.9847622468923236e-05, + "loss": 1.0727, + "step": 447 + }, + { + "epoch": 0.021357233094176815, + "grad_norm": 8.325060844421387, + "learning_rate": 1.9846938610979104e-05, + "loss": 0.5672, + "step": 448 + }, + { + "epoch": 0.02140490548947632, + "grad_norm": 5.6124043464660645, + "learning_rate": 1.984625323375375e-05, + "loss": 0.9308, + "step": 449 + }, + { + "epoch": 0.02145257788477582, + "grad_norm": 2.8409430980682373, + "learning_rate": 1.984556633735292e-05, + "loss": 0.817, + "step": 450 + }, + { + "epoch": 0.021500250280075323, + "grad_norm": 2.745298385620117, + "learning_rate": 1.9844877921882593e-05, + "loss": 1.0862, + "step": 451 + }, + { + "epoch": 0.021547922675374823, + "grad_norm": 1.320275902748108, + "learning_rate": 1.9844187987448984e-05, + "loss": 0.6576, + "step": 452 + }, + { + "epoch": 0.021595595070674326, + "grad_norm": 1.8817437887191772, + "learning_rate": 1.9843496534158543e-05, + "loss": 1.1033, + "step": 453 + }, + { + "epoch": 0.021643267465973826, + "grad_norm": 1.684211015701294, + "learning_rate": 1.984280356211796e-05, + "loss": 1.0972, + "step": 454 + }, + { + "epoch": 0.02169093986127333, + "grad_norm": 1.3767002820968628, + "learning_rate": 1.9842109071434143e-05, + "loss": 0.9344, + "step": 455 + }, + { + "epoch": 0.02173861225657283, + "grad_norm": 2.4937679767608643, + "learning_rate": 1.9841413062214253e-05, + "loss": 0.764, + "step": 456 + }, + { + "epoch": 0.021786284651872333, + "grad_norm": 1.7059651613235474, + "learning_rate": 1.9840715534565677e-05, + "loss": 0.8478, + "step": 457 + }, + { + "epoch": 0.021833957047171833, + "grad_norm": 1.4310321807861328, + "learning_rate": 1.984001648859603e-05, + "loss": 0.7032, + "step": 458 + }, + { + "epoch": 0.021881629442471337, + "grad_norm": 1.4786193370819092, + "learning_rate": 1.9839315924413174e-05, + "loss": 0.7041, + "step": 459 + }, + { + "epoch": 0.021929301837770837, + "grad_norm": 1.9900134801864624, + "learning_rate": 1.9838613842125193e-05, + "loss": 0.298, + "step": 460 + }, + { + "epoch": 0.02197697423307034, + "grad_norm": 1.3430676460266113, + "learning_rate": 1.9837910241840418e-05, + "loss": 0.459, + "step": 461 + }, + { + "epoch": 0.022024646628369844, + "grad_norm": 1.3910713195800781, + "learning_rate": 1.9837205123667404e-05, + "loss": 0.6807, + "step": 462 + }, + { + "epoch": 0.022072319023669344, + "grad_norm": 2.3101654052734375, + "learning_rate": 1.983649848771494e-05, + "loss": 0.4995, + "step": 463 + }, + { + "epoch": 0.022119991418968848, + "grad_norm": 1.8263473510742188, + "learning_rate": 1.9835790334092054e-05, + "loss": 0.5541, + "step": 464 + }, + { + "epoch": 0.022167663814268348, + "grad_norm": 1.215735912322998, + "learning_rate": 1.9835080662908013e-05, + "loss": 0.6172, + "step": 465 + }, + { + "epoch": 0.02221533620956785, + "grad_norm": 7.440627574920654, + "learning_rate": 1.9834369474272307e-05, + "loss": 0.5222, + "step": 466 + }, + { + "epoch": 0.02226300860486735, + "grad_norm": 1.4894155263900757, + "learning_rate": 1.983365676829466e-05, + "loss": 0.5461, + "step": 467 + }, + { + "epoch": 0.022310681000166855, + "grad_norm": 1.6445199251174927, + "learning_rate": 
1.9832942545085047e-05, + "loss": 0.6309, + "step": 468 + }, + { + "epoch": 0.022358353395466355, + "grad_norm": 5.644519805908203, + "learning_rate": 1.9832226804753658e-05, + "loss": 0.8083, + "step": 469 + }, + { + "epoch": 0.02240602579076586, + "grad_norm": 4.1564812660217285, + "learning_rate": 1.9831509547410922e-05, + "loss": 1.613, + "step": 470 + }, + { + "epoch": 0.02245369818606536, + "grad_norm": 1.6427197456359863, + "learning_rate": 1.9830790773167513e-05, + "loss": 1.0581, + "step": 471 + }, + { + "epoch": 0.022501370581364862, + "grad_norm": 5.313671588897705, + "learning_rate": 1.983007048213432e-05, + "loss": 0.5421, + "step": 472 + }, + { + "epoch": 0.022549042976664362, + "grad_norm": 1.734161376953125, + "learning_rate": 1.9829348674422488e-05, + "loss": 0.9461, + "step": 473 + }, + { + "epoch": 0.022596715371963866, + "grad_norm": 2.149139642715454, + "learning_rate": 1.982862535014337e-05, + "loss": 0.6638, + "step": 474 + }, + { + "epoch": 0.022644387767263366, + "grad_norm": 2.1884307861328125, + "learning_rate": 1.9827900509408583e-05, + "loss": 0.9279, + "step": 475 + }, + { + "epoch": 0.02269206016256287, + "grad_norm": 1.2274335622787476, + "learning_rate": 1.9827174152329952e-05, + "loss": 0.8374, + "step": 476 + }, + { + "epoch": 0.02273973255786237, + "grad_norm": 1.497936725616455, + "learning_rate": 1.9826446279019547e-05, + "loss": 0.9778, + "step": 477 + }, + { + "epoch": 0.022787404953161873, + "grad_norm": 1.2841224670410156, + "learning_rate": 1.9825716889589678e-05, + "loss": 0.5275, + "step": 478 + }, + { + "epoch": 0.022835077348461373, + "grad_norm": 1.717750072479248, + "learning_rate": 1.9824985984152877e-05, + "loss": 0.687, + "step": 479 + }, + { + "epoch": 0.022882749743760877, + "grad_norm": 3.095571517944336, + "learning_rate": 1.9824253562821915e-05, + "loss": 0.4429, + "step": 480 + }, + { + "epoch": 0.022930422139060377, + "grad_norm": 1.8677228689193726, + "learning_rate": 1.98235196257098e-05, + "loss": 0.8407, + "step": 481 + }, + { + "epoch": 0.02297809453435988, + "grad_norm": 1.6857329607009888, + "learning_rate": 1.982278417292977e-05, + "loss": 0.8925, + "step": 482 + }, + { + "epoch": 0.02302576692965938, + "grad_norm": 1.5242455005645752, + "learning_rate": 1.98220472045953e-05, + "loss": 0.6419, + "step": 483 + }, + { + "epoch": 0.023073439324958884, + "grad_norm": 2.109609603881836, + "learning_rate": 1.9821308720820086e-05, + "loss": 1.2286, + "step": 484 + }, + { + "epoch": 0.023121111720258384, + "grad_norm": 1.9739404916763306, + "learning_rate": 1.9820568721718082e-05, + "loss": 0.9189, + "step": 485 + }, + { + "epoch": 0.023168784115557887, + "grad_norm": 2.0055463314056396, + "learning_rate": 1.9819827207403458e-05, + "loss": 1.1078, + "step": 486 + }, + { + "epoch": 0.023216456510857388, + "grad_norm": 2.384929656982422, + "learning_rate": 1.9819084177990615e-05, + "loss": 0.5499, + "step": 487 + }, + { + "epoch": 0.02326412890615689, + "grad_norm": 4.874091625213623, + "learning_rate": 1.9818339633594203e-05, + "loss": 0.9564, + "step": 488 + }, + { + "epoch": 0.02331180130145639, + "grad_norm": 9.882637023925781, + "learning_rate": 1.9817593574329096e-05, + "loss": 1.3838, + "step": 489 + }, + { + "epoch": 0.023359473696755895, + "grad_norm": 1.7439748048782349, + "learning_rate": 1.9816846000310403e-05, + "loss": 0.8163, + "step": 490 + }, + { + "epoch": 0.023407146092055395, + "grad_norm": 1.8394163846969604, + "learning_rate": 1.981609691165346e-05, + "loss": 0.775, + "step": 491 + }, + { + "epoch": 
0.0234548184873549, + "grad_norm": 2.234813928604126, + "learning_rate": 1.9815346308473857e-05, + "loss": 1.2283, + "step": 492 + }, + { + "epoch": 0.0235024908826544, + "grad_norm": 1.298910140991211, + "learning_rate": 1.9814594190887394e-05, + "loss": 0.5041, + "step": 493 + }, + { + "epoch": 0.023550163277953902, + "grad_norm": 2.6392388343811035, + "learning_rate": 1.9813840559010116e-05, + "loss": 0.8703, + "step": 494 + }, + { + "epoch": 0.023597835673253402, + "grad_norm": 3.647735118865967, + "learning_rate": 1.9813085412958307e-05, + "loss": 0.9736, + "step": 495 + }, + { + "epoch": 0.023645508068552906, + "grad_norm": 1.6532799005508423, + "learning_rate": 1.9812328752848474e-05, + "loss": 0.6817, + "step": 496 + }, + { + "epoch": 0.023693180463852406, + "grad_norm": 1.6136335134506226, + "learning_rate": 1.981157057879736e-05, + "loss": 1.0334, + "step": 497 + }, + { + "epoch": 0.02374085285915191, + "grad_norm": 2.0291175842285156, + "learning_rate": 1.9810810890921943e-05, + "loss": 0.6898, + "step": 498 + }, + { + "epoch": 0.02378852525445141, + "grad_norm": 1.876394510269165, + "learning_rate": 1.981004968933944e-05, + "loss": 0.8983, + "step": 499 + }, + { + "epoch": 0.023836197649750913, + "grad_norm": 1.8519922494888306, + "learning_rate": 1.9809286974167296e-05, + "loss": 0.762, + "step": 500 + }, + { + "epoch": 0.023883870045050413, + "grad_norm": 1.415461778640747, + "learning_rate": 1.9808522745523186e-05, + "loss": 0.5235, + "step": 501 + }, + { + "epoch": 0.023931542440349916, + "grad_norm": 2.0346901416778564, + "learning_rate": 1.9807757003525022e-05, + "loss": 0.9127, + "step": 502 + }, + { + "epoch": 0.023979214835649416, + "grad_norm": 1.8818345069885254, + "learning_rate": 1.9806989748290954e-05, + "loss": 0.7813, + "step": 503 + }, + { + "epoch": 0.02402688723094892, + "grad_norm": 4.557500839233398, + "learning_rate": 1.980622097993936e-05, + "loss": 0.6555, + "step": 504 + }, + { + "epoch": 0.02407455962624842, + "grad_norm": 3.821648359298706, + "learning_rate": 1.9805450698588856e-05, + "loss": 0.8257, + "step": 505 + }, + { + "epoch": 0.024122232021547924, + "grad_norm": 2.6666078567504883, + "learning_rate": 1.9804678904358284e-05, + "loss": 1.1717, + "step": 506 + }, + { + "epoch": 0.024169904416847424, + "grad_norm": 2.618318557739258, + "learning_rate": 1.9803905597366726e-05, + "loss": 0.9528, + "step": 507 + }, + { + "epoch": 0.024217576812146927, + "grad_norm": 2.3368048667907715, + "learning_rate": 1.9803130777733494e-05, + "loss": 1.1081, + "step": 508 + }, + { + "epoch": 0.024265249207446427, + "grad_norm": 4.741371154785156, + "learning_rate": 1.9802354445578137e-05, + "loss": 1.2031, + "step": 509 + }, + { + "epoch": 0.02431292160274593, + "grad_norm": 2.2617404460906982, + "learning_rate": 1.9801576601020435e-05, + "loss": 0.7604, + "step": 510 + }, + { + "epoch": 0.02436059399804543, + "grad_norm": 6.097237586975098, + "learning_rate": 1.98007972441804e-05, + "loss": 0.7608, + "step": 511 + }, + { + "epoch": 0.024408266393344934, + "grad_norm": 1.6063331365585327, + "learning_rate": 1.9800016375178276e-05, + "loss": 0.7393, + "step": 512 + }, + { + "epoch": 0.024455938788644434, + "grad_norm": 1.7038710117340088, + "learning_rate": 1.979923399413455e-05, + "loss": 0.6155, + "step": 513 + }, + { + "epoch": 0.024503611183943938, + "grad_norm": 3.836108922958374, + "learning_rate": 1.9798450101169927e-05, + "loss": 0.8108, + "step": 514 + }, + { + "epoch": 0.024551283579243438, + "grad_norm": 1.718099594116211, + "learning_rate": 
1.979766469640536e-05, + "loss": 0.9674, + "step": 515 + }, + { + "epoch": 0.02459895597454294, + "grad_norm": 2.81150221824646, + "learning_rate": 1.9796877779962026e-05, + "loss": 0.4572, + "step": 516 + }, + { + "epoch": 0.02464662836984244, + "grad_norm": 3.344743013381958, + "learning_rate": 1.9796089351961338e-05, + "loss": 1.3236, + "step": 517 + }, + { + "epoch": 0.024694300765141945, + "grad_norm": 0.9414018392562866, + "learning_rate": 1.9795299412524948e-05, + "loss": 0.316, + "step": 518 + }, + { + "epoch": 0.024741973160441445, + "grad_norm": 1.675338625907898, + "learning_rate": 1.9794507961774725e-05, + "loss": 0.9532, + "step": 519 + }, + { + "epoch": 0.02478964555574095, + "grad_norm": 1.8848239183425903, + "learning_rate": 1.979371499983279e-05, + "loss": 0.6513, + "step": 520 + }, + { + "epoch": 0.02483731795104045, + "grad_norm": 2.5809998512268066, + "learning_rate": 1.9792920526821486e-05, + "loss": 0.6065, + "step": 521 + }, + { + "epoch": 0.024884990346339952, + "grad_norm": 2.36930775642395, + "learning_rate": 1.9792124542863394e-05, + "loss": 1.2801, + "step": 522 + }, + { + "epoch": 0.024932662741639453, + "grad_norm": 1.0927647352218628, + "learning_rate": 1.9791327048081322e-05, + "loss": 0.5992, + "step": 523 + }, + { + "epoch": 0.024980335136938956, + "grad_norm": 1.4300240278244019, + "learning_rate": 1.9790528042598316e-05, + "loss": 0.4546, + "step": 524 + }, + { + "epoch": 0.025028007532238456, + "grad_norm": 2.574460744857788, + "learning_rate": 1.978972752653766e-05, + "loss": 0.2686, + "step": 525 + }, + { + "epoch": 0.02507567992753796, + "grad_norm": 2.383345127105713, + "learning_rate": 1.978892550002286e-05, + "loss": 0.6973, + "step": 526 + }, + { + "epoch": 0.02512335232283746, + "grad_norm": 15.231221199035645, + "learning_rate": 1.9788121963177663e-05, + "loss": 0.6649, + "step": 527 + }, + { + "epoch": 0.025171024718136963, + "grad_norm": 1.9268141984939575, + "learning_rate": 1.978731691612604e-05, + "loss": 0.771, + "step": 528 + }, + { + "epoch": 0.025218697113436463, + "grad_norm": 0.9012970328330994, + "learning_rate": 1.9786510358992213e-05, + "loss": 0.3601, + "step": 529 + }, + { + "epoch": 0.025266369508735967, + "grad_norm": 1.5469326972961426, + "learning_rate": 1.9785702291900616e-05, + "loss": 0.8453, + "step": 530 + }, + { + "epoch": 0.025314041904035467, + "grad_norm": 2.350041627883911, + "learning_rate": 1.978489271497593e-05, + "loss": 0.6276, + "step": 531 + }, + { + "epoch": 0.02536171429933497, + "grad_norm": 1.523940920829773, + "learning_rate": 1.978408162834306e-05, + "loss": 0.6375, + "step": 532 + }, + { + "epoch": 0.02540938669463447, + "grad_norm": 1.175184965133667, + "learning_rate": 1.9783269032127156e-05, + "loss": 0.5706, + "step": 533 + }, + { + "epoch": 0.025457059089933974, + "grad_norm": 1.4187142848968506, + "learning_rate": 1.9782454926453585e-05, + "loss": 1.2135, + "step": 534 + }, + { + "epoch": 0.025504731485233474, + "grad_norm": 1.718551754951477, + "learning_rate": 1.978163931144796e-05, + "loss": 0.6412, + "step": 535 + }, + { + "epoch": 0.025552403880532978, + "grad_norm": 3.0916621685028076, + "learning_rate": 1.978082218723612e-05, + "loss": 1.0229, + "step": 536 + }, + { + "epoch": 0.025600076275832478, + "grad_norm": 1.2344852685928345, + "learning_rate": 1.978000355394414e-05, + "loss": 0.391, + "step": 537 + }, + { + "epoch": 0.02564774867113198, + "grad_norm": 2.242340087890625, + "learning_rate": 1.9779183411698327e-05, + "loss": 1.0096, + "step": 538 + }, + { + "epoch": 
0.02569542106643148, + "grad_norm": 2.2608070373535156, + "learning_rate": 1.977836176062522e-05, + "loss": 0.6205, + "step": 539 + }, + { + "epoch": 0.025743093461730985, + "grad_norm": 1.1044822931289673, + "learning_rate": 1.977753860085159e-05, + "loss": 0.7183, + "step": 540 + }, + { + "epoch": 0.025790765857030485, + "grad_norm": 1.8876334428787231, + "learning_rate": 1.977671393250444e-05, + "loss": 1.0274, + "step": 541 + }, + { + "epoch": 0.02583843825232999, + "grad_norm": 1.5824873447418213, + "learning_rate": 1.977588775571101e-05, + "loss": 0.9702, + "step": 542 + }, + { + "epoch": 0.02588611064762949, + "grad_norm": 2.3319790363311768, + "learning_rate": 1.9775060070598777e-05, + "loss": 0.9352, + "step": 543 + }, + { + "epoch": 0.025933783042928992, + "grad_norm": 1.3221014738082886, + "learning_rate": 1.977423087729544e-05, + "loss": 0.5927, + "step": 544 + }, + { + "epoch": 0.025981455438228492, + "grad_norm": 1.501172661781311, + "learning_rate": 1.977340017592893e-05, + "loss": 0.7887, + "step": 545 + }, + { + "epoch": 0.026029127833527996, + "grad_norm": 3.196305751800537, + "learning_rate": 1.9772567966627417e-05, + "loss": 0.9204, + "step": 546 + }, + { + "epoch": 0.026076800228827496, + "grad_norm": 1.732845664024353, + "learning_rate": 1.9771734249519307e-05, + "loss": 0.7779, + "step": 547 + }, + { + "epoch": 0.026124472624127, + "grad_norm": 7.03700065612793, + "learning_rate": 1.9770899024733235e-05, + "loss": 0.3785, + "step": 548 + }, + { + "epoch": 0.0261721450194265, + "grad_norm": 4.985541343688965, + "learning_rate": 1.9770062292398062e-05, + "loss": 0.3287, + "step": 549 + }, + { + "epoch": 0.026219817414726003, + "grad_norm": 3.5213537216186523, + "learning_rate": 1.9769224052642887e-05, + "loss": 0.8526, + "step": 550 + }, + { + "epoch": 0.026267489810025503, + "grad_norm": 2.2162954807281494, + "learning_rate": 1.9768384305597048e-05, + "loss": 1.0172, + "step": 551 + }, + { + "epoch": 0.026315162205325007, + "grad_norm": 2.0130722522735596, + "learning_rate": 1.9767543051390103e-05, + "loss": 0.5717, + "step": 552 + }, + { + "epoch": 0.026362834600624507, + "grad_norm": 2.2378265857696533, + "learning_rate": 1.9766700290151853e-05, + "loss": 0.7639, + "step": 553 + }, + { + "epoch": 0.02641050699592401, + "grad_norm": 1.358456015586853, + "learning_rate": 1.9765856022012326e-05, + "loss": 0.6762, + "step": 554 + }, + { + "epoch": 0.026458179391223514, + "grad_norm": 5.6245951652526855, + "learning_rate": 1.9765010247101783e-05, + "loss": 0.9299, + "step": 555 + }, + { + "epoch": 0.026505851786523014, + "grad_norm": 1.4390608072280884, + "learning_rate": 1.9764162965550718e-05, + "loss": 0.7357, + "step": 556 + }, + { + "epoch": 0.026553524181822517, + "grad_norm": 1.5818909406661987, + "learning_rate": 1.9763314177489858e-05, + "loss": 0.7468, + "step": 557 + }, + { + "epoch": 0.026601196577122017, + "grad_norm": 1.5696935653686523, + "learning_rate": 1.9762463883050165e-05, + "loss": 0.8834, + "step": 558 + }, + { + "epoch": 0.02664886897242152, + "grad_norm": 1.6481987237930298, + "learning_rate": 1.9761612082362828e-05, + "loss": 0.7375, + "step": 559 + }, + { + "epoch": 0.02669654136772102, + "grad_norm": 5.267855167388916, + "learning_rate": 1.9760758775559275e-05, + "loss": 0.4963, + "step": 560 + }, + { + "epoch": 0.026744213763020525, + "grad_norm": 1.7277523279190063, + "learning_rate": 1.9759903962771155e-05, + "loss": 0.8688, + "step": 561 + }, + { + "epoch": 0.026791886158320025, + "grad_norm": 3.814375162124634, + "learning_rate": 
1.9759047644130362e-05, + "loss": 1.1375, + "step": 562 + }, + { + "epoch": 0.026839558553619528, + "grad_norm": 1.6285319328308105, + "learning_rate": 1.9758189819769017e-05, + "loss": 0.6798, + "step": 563 + }, + { + "epoch": 0.026887230948919028, + "grad_norm": 1.6149711608886719, + "learning_rate": 1.9757330489819472e-05, + "loss": 1.0756, + "step": 564 + }, + { + "epoch": 0.026934903344218532, + "grad_norm": 1.523033618927002, + "learning_rate": 1.9756469654414316e-05, + "loss": 0.6719, + "step": 565 + }, + { + "epoch": 0.026982575739518032, + "grad_norm": 2.452061653137207, + "learning_rate": 1.9755607313686363e-05, + "loss": 0.3536, + "step": 566 + }, + { + "epoch": 0.027030248134817535, + "grad_norm": 1.6935207843780518, + "learning_rate": 1.9754743467768663e-05, + "loss": 1.0144, + "step": 567 + }, + { + "epoch": 0.027077920530117035, + "grad_norm": 1.7408450841903687, + "learning_rate": 1.9753878116794504e-05, + "loss": 0.9138, + "step": 568 + }, + { + "epoch": 0.02712559292541654, + "grad_norm": 1.9482660293579102, + "learning_rate": 1.9753011260897392e-05, + "loss": 0.8797, + "step": 569 + }, + { + "epoch": 0.02717326532071604, + "grad_norm": 1.166355848312378, + "learning_rate": 1.9752142900211084e-05, + "loss": 0.568, + "step": 570 + }, + { + "epoch": 0.027220937716015543, + "grad_norm": 1.862307071685791, + "learning_rate": 1.9751273034869552e-05, + "loss": 0.777, + "step": 571 + }, + { + "epoch": 0.027268610111315043, + "grad_norm": 1.4859095811843872, + "learning_rate": 1.975040166500701e-05, + "loss": 0.6208, + "step": 572 + }, + { + "epoch": 0.027316282506614546, + "grad_norm": 1.7215423583984375, + "learning_rate": 1.97495287907579e-05, + "loss": 0.8189, + "step": 573 + }, + { + "epoch": 0.027363954901914046, + "grad_norm": 3.2154273986816406, + "learning_rate": 1.97486544122569e-05, + "loss": 0.7525, + "step": 574 + }, + { + "epoch": 0.02741162729721355, + "grad_norm": 1.9213719367980957, + "learning_rate": 1.974777852963891e-05, + "loss": 0.9057, + "step": 575 + }, + { + "epoch": 0.02745929969251305, + "grad_norm": 11.46835994720459, + "learning_rate": 1.9746901143039082e-05, + "loss": 1.1341, + "step": 576 + }, + { + "epoch": 0.027506972087812553, + "grad_norm": 1.9643545150756836, + "learning_rate": 1.974602225259278e-05, + "loss": 0.6747, + "step": 577 + }, + { + "epoch": 0.027554644483112053, + "grad_norm": 3.3582406044006348, + "learning_rate": 1.9745141858435607e-05, + "loss": 0.5221, + "step": 578 + }, + { + "epoch": 0.027602316878411557, + "grad_norm": 2.9419772624969482, + "learning_rate": 1.9744259960703405e-05, + "loss": 0.7216, + "step": 579 + }, + { + "epoch": 0.027649989273711057, + "grad_norm": 3.037371873855591, + "learning_rate": 1.9743376559532234e-05, + "loss": 0.6489, + "step": 580 + }, + { + "epoch": 0.02769766166901056, + "grad_norm": 1.3546128273010254, + "learning_rate": 1.9742491655058396e-05, + "loss": 0.679, + "step": 581 + }, + { + "epoch": 0.02774533406431006, + "grad_norm": 1.8137181997299194, + "learning_rate": 1.974160524741843e-05, + "loss": 0.8666, + "step": 582 + }, + { + "epoch": 0.027793006459609564, + "grad_norm": 0.9405362606048584, + "learning_rate": 1.974071733674909e-05, + "loss": 0.3082, + "step": 583 + }, + { + "epoch": 0.027840678854909064, + "grad_norm": 1.7752642631530762, + "learning_rate": 1.973982792318737e-05, + "loss": 0.6208, + "step": 584 + }, + { + "epoch": 0.027888351250208568, + "grad_norm": 2.4724225997924805, + "learning_rate": 1.9738937006870507e-05, + "loss": 0.7182, + "step": 585 + }, + { + "epoch": 
0.027936023645508068, + "grad_norm": 1.5407928228378296, + "learning_rate": 1.9738044587935957e-05, + "loss": 0.8679, + "step": 586 + }, + { + "epoch": 0.02798369604080757, + "grad_norm": 1.9845658540725708, + "learning_rate": 1.9737150666521408e-05, + "loss": 0.9731, + "step": 587 + }, + { + "epoch": 0.02803136843610707, + "grad_norm": 2.7778193950653076, + "learning_rate": 1.9736255242764782e-05, + "loss": 0.9185, + "step": 588 + }, + { + "epoch": 0.028079040831406575, + "grad_norm": 2.502527952194214, + "learning_rate": 1.973535831680424e-05, + "loss": 0.7861, + "step": 589 + }, + { + "epoch": 0.028126713226706075, + "grad_norm": 2.2385640144348145, + "learning_rate": 1.973445988877816e-05, + "loss": 0.6714, + "step": 590 + }, + { + "epoch": 0.02817438562200558, + "grad_norm": 2.457892894744873, + "learning_rate": 1.9733559958825167e-05, + "loss": 0.5676, + "step": 591 + }, + { + "epoch": 0.02822205801730508, + "grad_norm": 1.67270827293396, + "learning_rate": 1.973265852708411e-05, + "loss": 0.8599, + "step": 592 + }, + { + "epoch": 0.028269730412604582, + "grad_norm": 1.2847950458526611, + "learning_rate": 1.973175559369407e-05, + "loss": 0.4154, + "step": 593 + }, + { + "epoch": 0.028317402807904082, + "grad_norm": 2.8948774337768555, + "learning_rate": 1.9730851158794358e-05, + "loss": 1.462, + "step": 594 + }, + { + "epoch": 0.028365075203203586, + "grad_norm": 1.836553931236267, + "learning_rate": 1.972994522252452e-05, + "loss": 1.0466, + "step": 595 + }, + { + "epoch": 0.028412747598503086, + "grad_norm": 1.6009653806686401, + "learning_rate": 1.9729037785024333e-05, + "loss": 0.4853, + "step": 596 + }, + { + "epoch": 0.02846041999380259, + "grad_norm": 1.9315605163574219, + "learning_rate": 1.972812884643381e-05, + "loss": 0.9313, + "step": 597 + }, + { + "epoch": 0.02850809238910209, + "grad_norm": 1.5604279041290283, + "learning_rate": 1.9727218406893177e-05, + "loss": 0.7766, + "step": 598 + }, + { + "epoch": 0.028555764784401593, + "grad_norm": 1.1914689540863037, + "learning_rate": 1.9726306466542923e-05, + "loss": 0.6603, + "step": 599 + }, + { + "epoch": 0.028603437179701093, + "grad_norm": 1.6006221771240234, + "learning_rate": 1.972539302552374e-05, + "loss": 0.9719, + "step": 600 + }, + { + "epoch": 0.028651109575000597, + "grad_norm": 1.6553188562393188, + "learning_rate": 1.9724478083976565e-05, + "loss": 0.7604, + "step": 601 + }, + { + "epoch": 0.028698781970300097, + "grad_norm": 1.3023021221160889, + "learning_rate": 1.9723561642042563e-05, + "loss": 0.7185, + "step": 602 + }, + { + "epoch": 0.0287464543655996, + "grad_norm": 1.863294243812561, + "learning_rate": 1.9722643699863135e-05, + "loss": 0.9524, + "step": 603 + }, + { + "epoch": 0.0287941267608991, + "grad_norm": 3.4323480129241943, + "learning_rate": 1.9721724257579907e-05, + "loss": 1.1218, + "step": 604 + }, + { + "epoch": 0.028841799156198604, + "grad_norm": 1.5496827363967896, + "learning_rate": 1.972080331533474e-05, + "loss": 0.7628, + "step": 605 + }, + { + "epoch": 0.028889471551498104, + "grad_norm": 1.1948590278625488, + "learning_rate": 1.971988087326973e-05, + "loss": 0.6586, + "step": 606 + }, + { + "epoch": 0.028937143946797608, + "grad_norm": 1.685185432434082, + "learning_rate": 1.9718956931527193e-05, + "loss": 0.7733, + "step": 607 + }, + { + "epoch": 0.028984816342097108, + "grad_norm": 1.624538540840149, + "learning_rate": 1.9718031490249688e-05, + "loss": 0.7968, + "step": 608 + }, + { + "epoch": 0.02903248873739661, + "grad_norm": 2.243673086166382, + "learning_rate": 
1.9717104549580003e-05, + "loss": 0.9498, + "step": 609 + }, + { + "epoch": 0.02908016113269611, + "grad_norm": 2.4444425106048584, + "learning_rate": 1.9716176109661148e-05, + "loss": 1.3126, + "step": 610 + }, + { + "epoch": 0.029127833527995615, + "grad_norm": 1.7432293891906738, + "learning_rate": 1.9715246170636383e-05, + "loss": 0.7163, + "step": 611 + }, + { + "epoch": 0.029175505923295115, + "grad_norm": 10.065876007080078, + "learning_rate": 1.9714314732649174e-05, + "loss": 1.1127, + "step": 612 + }, + { + "epoch": 0.02922317831859462, + "grad_norm": 3.2807443141937256, + "learning_rate": 1.9713381795843244e-05, + "loss": 1.0007, + "step": 613 + }, + { + "epoch": 0.02927085071389412, + "grad_norm": 2.5899031162261963, + "learning_rate": 1.9712447360362534e-05, + "loss": 1.2204, + "step": 614 + }, + { + "epoch": 0.029318523109193622, + "grad_norm": 1.3960285186767578, + "learning_rate": 1.971151142635121e-05, + "loss": 0.7094, + "step": 615 + }, + { + "epoch": 0.029366195504493122, + "grad_norm": 2.9793753623962402, + "learning_rate": 1.9710573993953685e-05, + "loss": 0.6854, + "step": 616 + }, + { + "epoch": 0.029413867899792626, + "grad_norm": 1.1436901092529297, + "learning_rate": 1.9709635063314592e-05, + "loss": 0.6775, + "step": 617 + }, + { + "epoch": 0.029461540295092126, + "grad_norm": 9.052082061767578, + "learning_rate": 1.97086946345788e-05, + "loss": 0.9781, + "step": 618 + }, + { + "epoch": 0.02950921269039163, + "grad_norm": 1.6794508695602417, + "learning_rate": 1.9707752707891404e-05, + "loss": 0.8579, + "step": 619 + }, + { + "epoch": 0.02955688508569113, + "grad_norm": 2.077829122543335, + "learning_rate": 1.9706809283397733e-05, + "loss": 0.6141, + "step": 620 + }, + { + "epoch": 0.029604557480990633, + "grad_norm": 1.7477926015853882, + "learning_rate": 1.9705864361243355e-05, + "loss": 0.7191, + "step": 621 + }, + { + "epoch": 0.029652229876290133, + "grad_norm": 1.7621954679489136, + "learning_rate": 1.9704917941574053e-05, + "loss": 0.9054, + "step": 622 + }, + { + "epoch": 0.029699902271589636, + "grad_norm": 2.0777807235717773, + "learning_rate": 1.9703970024535855e-05, + "loss": 0.7892, + "step": 623 + }, + { + "epoch": 0.029747574666889137, + "grad_norm": 2.7849626541137695, + "learning_rate": 1.970302061027502e-05, + "loss": 0.8543, + "step": 624 + }, + { + "epoch": 0.02979524706218864, + "grad_norm": 2.029613494873047, + "learning_rate": 1.970206969893802e-05, + "loss": 0.9293, + "step": 625 + }, + { + "epoch": 0.02984291945748814, + "grad_norm": 3.735873222351074, + "learning_rate": 1.970111729067158e-05, + "loss": 1.4074, + "step": 626 + }, + { + "epoch": 0.029890591852787644, + "grad_norm": 1.1065034866333008, + "learning_rate": 1.9700163385622642e-05, + "loss": 0.4194, + "step": 627 + }, + { + "epoch": 0.029938264248087144, + "grad_norm": 1.4932465553283691, + "learning_rate": 1.969920798393839e-05, + "loss": 0.8679, + "step": 628 + }, + { + "epoch": 0.029985936643386647, + "grad_norm": 1.1497528553009033, + "learning_rate": 1.9698251085766226e-05, + "loss": 0.4865, + "step": 629 + }, + { + "epoch": 0.030033609038686147, + "grad_norm": 1.2771016359329224, + "learning_rate": 1.969729269125379e-05, + "loss": 0.79, + "step": 630 + }, + { + "epoch": 0.03008128143398565, + "grad_norm": 1.7814726829528809, + "learning_rate": 1.969633280054896e-05, + "loss": 0.964, + "step": 631 + }, + { + "epoch": 0.03012895382928515, + "grad_norm": 1.6634962558746338, + "learning_rate": 1.9695371413799825e-05, + "loss": 0.8258, + "step": 632 + }, + { + "epoch": 
0.030176626224584654, + "grad_norm": 7.164558410644531, + "learning_rate": 1.9694408531154728e-05, + "loss": 0.374, + "step": 633 + }, + { + "epoch": 0.030224298619884155, + "grad_norm": 1.5273154973983765, + "learning_rate": 1.969344415276223e-05, + "loss": 0.8219, + "step": 634 + }, + { + "epoch": 0.030271971015183658, + "grad_norm": 1.0765341520309448, + "learning_rate": 1.9692478278771118e-05, + "loss": 0.3298, + "step": 635 + }, + { + "epoch": 0.030319643410483158, + "grad_norm": 5.0884013175964355, + "learning_rate": 1.969151090933042e-05, + "loss": 0.9095, + "step": 636 + }, + { + "epoch": 0.03036731580578266, + "grad_norm": 15.906243324279785, + "learning_rate": 1.9690542044589395e-05, + "loss": 0.1833, + "step": 637 + }, + { + "epoch": 0.030414988201082162, + "grad_norm": 2.3113362789154053, + "learning_rate": 1.9689571684697527e-05, + "loss": 0.8799, + "step": 638 + }, + { + "epoch": 0.030462660596381665, + "grad_norm": 1.4296796321868896, + "learning_rate": 1.9688599829804528e-05, + "loss": 0.9618, + "step": 639 + }, + { + "epoch": 0.030510332991681165, + "grad_norm": 2.759082078933716, + "learning_rate": 1.968762648006035e-05, + "loss": 1.0813, + "step": 640 + }, + { + "epoch": 0.03055800538698067, + "grad_norm": 1.532196044921875, + "learning_rate": 1.9686651635615172e-05, + "loss": 0.7161, + "step": 641 + }, + { + "epoch": 0.03060567778228017, + "grad_norm": 2.626645088195801, + "learning_rate": 1.9685675296619397e-05, + "loss": 1.155, + "step": 642 + }, + { + "epoch": 0.030653350177579673, + "grad_norm": 0.9614362120628357, + "learning_rate": 1.9684697463223664e-05, + "loss": 0.4707, + "step": 643 + }, + { + "epoch": 0.030701022572879173, + "grad_norm": 1.7256051301956177, + "learning_rate": 1.968371813557885e-05, + "loss": 1.0067, + "step": 644 + }, + { + "epoch": 0.030748694968178676, + "grad_norm": 1.5617140531539917, + "learning_rate": 1.968273731383605e-05, + "loss": 0.3811, + "step": 645 + }, + { + "epoch": 0.03079636736347818, + "grad_norm": 2.0715904235839844, + "learning_rate": 1.9681754998146592e-05, + "loss": 0.8022, + "step": 646 + }, + { + "epoch": 0.03084403975877768, + "grad_norm": 2.6158509254455566, + "learning_rate": 1.9680771188662044e-05, + "loss": 0.8486, + "step": 647 + }, + { + "epoch": 0.030891712154077183, + "grad_norm": 4.827423572540283, + "learning_rate": 1.9679785885534196e-05, + "loss": 0.4986, + "step": 648 + }, + { + "epoch": 0.030939384549376683, + "grad_norm": 1.2536042928695679, + "learning_rate": 1.9678799088915064e-05, + "loss": 0.7874, + "step": 649 + }, + { + "epoch": 0.030987056944676187, + "grad_norm": 1.6905231475830078, + "learning_rate": 1.9677810798956906e-05, + "loss": 0.7333, + "step": 650 + }, + { + "epoch": 0.031034729339975687, + "grad_norm": 3.6590423583984375, + "learning_rate": 1.9676821015812203e-05, + "loss": 1.2042, + "step": 651 + }, + { + "epoch": 0.03108240173527519, + "grad_norm": 2.787641763687134, + "learning_rate": 1.967582973963367e-05, + "loss": 0.6705, + "step": 652 + }, + { + "epoch": 0.03113007413057469, + "grad_norm": 1.9217485189437866, + "learning_rate": 1.9674836970574253e-05, + "loss": 0.7618, + "step": 653 + }, + { + "epoch": 0.031177746525874194, + "grad_norm": 1.7533555030822754, + "learning_rate": 1.967384270878712e-05, + "loss": 0.9996, + "step": 654 + }, + { + "epoch": 0.031225418921173694, + "grad_norm": 2.0206172466278076, + "learning_rate": 1.967284695442568e-05, + "loss": 1.3232, + "step": 655 + }, + { + "epoch": 0.0312730913164732, + "grad_norm": 1.805066704750061, + "learning_rate": 
1.9671849707643567e-05, + "loss": 0.6891, + "step": 656 + }, + { + "epoch": 0.0313207637117727, + "grad_norm": 1.5261296033859253, + "learning_rate": 1.9670850968594642e-05, + "loss": 0.9105, + "step": 657 + }, + { + "epoch": 0.0313684361070722, + "grad_norm": 1.3711668252944946, + "learning_rate": 1.9669850737433002e-05, + "loss": 0.8131, + "step": 658 + }, + { + "epoch": 0.031416108502371705, + "grad_norm": 8.398372650146484, + "learning_rate": 1.9668849014312978e-05, + "loss": 0.412, + "step": 659 + }, + { + "epoch": 0.031463780897671205, + "grad_norm": 3.589878797531128, + "learning_rate": 1.9667845799389117e-05, + "loss": 1.141, + "step": 660 + }, + { + "epoch": 0.031511453292970705, + "grad_norm": 1.4862463474273682, + "learning_rate": 1.9666841092816212e-05, + "loss": 0.6138, + "step": 661 + }, + { + "epoch": 0.031559125688270205, + "grad_norm": 1.5963002443313599, + "learning_rate": 1.9665834894749275e-05, + "loss": 0.7301, + "step": 662 + }, + { + "epoch": 0.03160679808356971, + "grad_norm": 1.64510178565979, + "learning_rate": 1.966482720534355e-05, + "loss": 0.5707, + "step": 663 + }, + { + "epoch": 0.03165447047886921, + "grad_norm": 1.807691216468811, + "learning_rate": 1.9663818024754516e-05, + "loss": 0.6151, + "step": 664 + }, + { + "epoch": 0.03170214287416871, + "grad_norm": 1.6394729614257812, + "learning_rate": 1.966280735313788e-05, + "loss": 0.6204, + "step": 665 + }, + { + "epoch": 0.03174981526946821, + "grad_norm": 1.5085763931274414, + "learning_rate": 1.9661795190649578e-05, + "loss": 0.7999, + "step": 666 + }, + { + "epoch": 0.03179748766476772, + "grad_norm": 2.1185641288757324, + "learning_rate": 1.9660781537445774e-05, + "loss": 0.4124, + "step": 667 + }, + { + "epoch": 0.03184516006006722, + "grad_norm": 1.159080982208252, + "learning_rate": 1.9659766393682867e-05, + "loss": 0.4458, + "step": 668 + }, + { + "epoch": 0.03189283245536672, + "grad_norm": 1.7421625852584839, + "learning_rate": 1.965874975951748e-05, + "loss": 0.4852, + "step": 669 + }, + { + "epoch": 0.03194050485066622, + "grad_norm": 3.620832920074463, + "learning_rate": 1.965773163510647e-05, + "loss": 0.2355, + "step": 670 + }, + { + "epoch": 0.03198817724596573, + "grad_norm": 2.30461049079895, + "learning_rate": 1.9656712020606926e-05, + "loss": 1.0058, + "step": 671 + }, + { + "epoch": 0.03203584964126523, + "grad_norm": 6.597954273223877, + "learning_rate": 1.9655690916176164e-05, + "loss": 0.4127, + "step": 672 + }, + { + "epoch": 0.03208352203656473, + "grad_norm": 4.061854839324951, + "learning_rate": 1.9654668321971724e-05, + "loss": 0.8925, + "step": 673 + }, + { + "epoch": 0.03213119443186423, + "grad_norm": 1.1764100790023804, + "learning_rate": 1.965364423815139e-05, + "loss": 0.4638, + "step": 674 + }, + { + "epoch": 0.032178866827163734, + "grad_norm": 2.0967013835906982, + "learning_rate": 1.965261866487316e-05, + "loss": 0.975, + "step": 675 + }, + { + "epoch": 0.032226539222463234, + "grad_norm": 9.140423774719238, + "learning_rate": 1.9651591602295275e-05, + "loss": 0.6945, + "step": 676 + }, + { + "epoch": 0.032274211617762734, + "grad_norm": 1.62010657787323, + "learning_rate": 1.9650563050576195e-05, + "loss": 0.8773, + "step": 677 + }, + { + "epoch": 0.032321884013062234, + "grad_norm": 1.2245630025863647, + "learning_rate": 1.964953300987462e-05, + "loss": 0.4385, + "step": 678 + }, + { + "epoch": 0.03236955640836174, + "grad_norm": 1.9532051086425781, + "learning_rate": 1.9648501480349473e-05, + "loss": 0.784, + "step": 679 + }, + { + "epoch": 0.03241722880366124, + 
"grad_norm": 4.412245750427246, + "learning_rate": 1.9647468462159906e-05, + "loss": 0.1273, + "step": 680 + }, + { + "epoch": 0.03246490119896074, + "grad_norm": 1.2344199419021606, + "learning_rate": 1.9646433955465307e-05, + "loss": 0.5754, + "step": 681 + }, + { + "epoch": 0.03251257359426024, + "grad_norm": 1.2929292917251587, + "learning_rate": 1.9645397960425287e-05, + "loss": 0.5648, + "step": 682 + }, + { + "epoch": 0.03256024598955975, + "grad_norm": 2.8150668144226074, + "learning_rate": 1.964436047719969e-05, + "loss": 0.9021, + "step": 683 + }, + { + "epoch": 0.03260791838485925, + "grad_norm": 2.1372106075286865, + "learning_rate": 1.9643321505948588e-05, + "loss": 0.885, + "step": 684 + }, + { + "epoch": 0.03265559078015875, + "grad_norm": 2.2482917308807373, + "learning_rate": 1.9642281046832287e-05, + "loss": 0.9908, + "step": 685 + }, + { + "epoch": 0.03270326317545825, + "grad_norm": 1.6600946187973022, + "learning_rate": 1.9641239100011312e-05, + "loss": 0.7879, + "step": 686 + }, + { + "epoch": 0.032750935570757755, + "grad_norm": 1.259423851966858, + "learning_rate": 1.9640195665646434e-05, + "loss": 0.8581, + "step": 687 + }, + { + "epoch": 0.032798607966057255, + "grad_norm": 1.942671298980713, + "learning_rate": 1.963915074389864e-05, + "loss": 0.8663, + "step": 688 + }, + { + "epoch": 0.032846280361356756, + "grad_norm": 1.6967015266418457, + "learning_rate": 1.9638104334929145e-05, + "loss": 0.5918, + "step": 689 + }, + { + "epoch": 0.032893952756656256, + "grad_norm": 2.120243549346924, + "learning_rate": 1.963705643889941e-05, + "loss": 1.2791, + "step": 690 + }, + { + "epoch": 0.03294162515195576, + "grad_norm": 2.302832841873169, + "learning_rate": 1.9636007055971106e-05, + "loss": 1.1948, + "step": 691 + }, + { + "epoch": 0.03298929754725526, + "grad_norm": 3.452000856399536, + "learning_rate": 1.9634956186306147e-05, + "loss": 0.6068, + "step": 692 + }, + { + "epoch": 0.03303696994255476, + "grad_norm": 1.3084638118743896, + "learning_rate": 1.963390383006667e-05, + "loss": 0.3356, + "step": 693 + }, + { + "epoch": 0.03308464233785426, + "grad_norm": 1.6580809354782104, + "learning_rate": 1.9632849987415038e-05, + "loss": 1.0911, + "step": 694 + }, + { + "epoch": 0.03313231473315377, + "grad_norm": 1.4388517141342163, + "learning_rate": 1.9631794658513853e-05, + "loss": 0.5739, + "step": 695 + }, + { + "epoch": 0.03317998712845327, + "grad_norm": 1.9334794282913208, + "learning_rate": 1.9630737843525946e-05, + "loss": 0.7706, + "step": 696 + }, + { + "epoch": 0.03322765952375277, + "grad_norm": 4.213309288024902, + "learning_rate": 1.9629679542614363e-05, + "loss": 1.1035, + "step": 697 + }, + { + "epoch": 0.03327533191905227, + "grad_norm": 2.349947929382324, + "learning_rate": 1.962861975594239e-05, + "loss": 0.9024, + "step": 698 + }, + { + "epoch": 0.03332300431435178, + "grad_norm": 1.374952793121338, + "learning_rate": 1.9627558483673546e-05, + "loss": 0.5061, + "step": 699 + }, + { + "epoch": 0.03337067670965128, + "grad_norm": 2.7832841873168945, + "learning_rate": 1.962649572597158e-05, + "loss": 0.7869, + "step": 700 + }, + { + "epoch": 0.03341834910495078, + "grad_norm": 2.1559762954711914, + "learning_rate": 1.9625431483000448e-05, + "loss": 0.752, + "step": 701 + }, + { + "epoch": 0.03346602150025028, + "grad_norm": 2.687613010406494, + "learning_rate": 1.9624365754924364e-05, + "loss": 0.7765, + "step": 702 + }, + { + "epoch": 0.033513693895549784, + "grad_norm": 9.63939380645752, + "learning_rate": 1.9623298541907756e-05, + "loss": 1.0306, + 
"step": 703 + }, + { + "epoch": 0.033561366290849284, + "grad_norm": 2.1271538734436035, + "learning_rate": 1.9622229844115284e-05, + "loss": 1.0692, + "step": 704 + }, + { + "epoch": 0.033609038686148784, + "grad_norm": 4.3584442138671875, + "learning_rate": 1.9621159661711834e-05, + "loss": 1.4418, + "step": 705 + }, + { + "epoch": 0.033656711081448284, + "grad_norm": 1.646856427192688, + "learning_rate": 1.9620087994862534e-05, + "loss": 0.9123, + "step": 706 + }, + { + "epoch": 0.03370438347674779, + "grad_norm": 2.322556734085083, + "learning_rate": 1.961901484373272e-05, + "loss": 1.0779, + "step": 707 + }, + { + "epoch": 0.03375205587204729, + "grad_norm": 1.4996213912963867, + "learning_rate": 1.9617940208487968e-05, + "loss": 0.5601, + "step": 708 + }, + { + "epoch": 0.03379972826734679, + "grad_norm": 3.1180505752563477, + "learning_rate": 1.9616864089294095e-05, + "loss": 1.0012, + "step": 709 + }, + { + "epoch": 0.03384740066264629, + "grad_norm": 4.107066631317139, + "learning_rate": 1.9615786486317124e-05, + "loss": 0.8514, + "step": 710 + }, + { + "epoch": 0.0338950730579458, + "grad_norm": 4.024191379547119, + "learning_rate": 1.9614707399723318e-05, + "loss": 0.7237, + "step": 711 + }, + { + "epoch": 0.0339427454532453, + "grad_norm": 3.5964674949645996, + "learning_rate": 1.9613626829679176e-05, + "loss": 1.4357, + "step": 712 + }, + { + "epoch": 0.0339904178485448, + "grad_norm": 2.3197476863861084, + "learning_rate": 1.9612544776351415e-05, + "loss": 0.4594, + "step": 713 + }, + { + "epoch": 0.0340380902438443, + "grad_norm": 2.2602896690368652, + "learning_rate": 1.961146123990699e-05, + "loss": 0.7322, + "step": 714 + }, + { + "epoch": 0.034085762639143806, + "grad_norm": 2.804417610168457, + "learning_rate": 1.9610376220513067e-05, + "loss": 1.5153, + "step": 715 + }, + { + "epoch": 0.034133435034443306, + "grad_norm": 2.9005067348480225, + "learning_rate": 1.9609289718337067e-05, + "loss": 0.95, + "step": 716 + }, + { + "epoch": 0.034181107429742806, + "grad_norm": 7.997469902038574, + "learning_rate": 1.9608201733546615e-05, + "loss": 1.3378, + "step": 717 + }, + { + "epoch": 0.034228779825042306, + "grad_norm": 3.4590885639190674, + "learning_rate": 1.9607112266309585e-05, + "loss": 1.5682, + "step": 718 + }, + { + "epoch": 0.03427645222034181, + "grad_norm": 2.0878725051879883, + "learning_rate": 1.9606021316794065e-05, + "loss": 0.9559, + "step": 719 + }, + { + "epoch": 0.03432412461564131, + "grad_norm": 1.4294086694717407, + "learning_rate": 1.9604928885168376e-05, + "loss": 0.6045, + "step": 720 + }, + { + "epoch": 0.03437179701094081, + "grad_norm": 1.5459392070770264, + "learning_rate": 1.9603834971601075e-05, + "loss": 0.7154, + "step": 721 + }, + { + "epoch": 0.03441946940624031, + "grad_norm": 1.3652042150497437, + "learning_rate": 1.9602739576260937e-05, + "loss": 0.7979, + "step": 722 + }, + { + "epoch": 0.03446714180153982, + "grad_norm": 2.1262052059173584, + "learning_rate": 1.9601642699316968e-05, + "loss": 0.8921, + "step": 723 + }, + { + "epoch": 0.03451481419683932, + "grad_norm": 2.031620740890503, + "learning_rate": 1.9600544340938415e-05, + "loss": 0.9675, + "step": 724 + }, + { + "epoch": 0.03456248659213882, + "grad_norm": 1.8469467163085938, + "learning_rate": 1.9599444501294733e-05, + "loss": 0.7833, + "step": 725 + }, + { + "epoch": 0.03461015898743832, + "grad_norm": 1.959503173828125, + "learning_rate": 1.959834318055562e-05, + "loss": 1.132, + "step": 726 + }, + { + "epoch": 0.03465783138273783, + "grad_norm": 1.5650908946990967, + 
"learning_rate": 1.9597240378891e-05, + "loss": 0.7967, + "step": 727 + }, + { + "epoch": 0.03470550377803733, + "grad_norm": 2.6273374557495117, + "learning_rate": 1.959613609647102e-05, + "loss": 1.2453, + "step": 728 + }, + { + "epoch": 0.03475317617333683, + "grad_norm": 1.9722366333007812, + "learning_rate": 1.959503033346606e-05, + "loss": 0.8889, + "step": 729 + }, + { + "epoch": 0.03480084856863633, + "grad_norm": 1.6233927011489868, + "learning_rate": 1.959392309004673e-05, + "loss": 0.8429, + "step": 730 + }, + { + "epoch": 0.034848520963935835, + "grad_norm": 1.907275915145874, + "learning_rate": 1.959281436638387e-05, + "loss": 0.9834, + "step": 731 + }, + { + "epoch": 0.034896193359235335, + "grad_norm": 2.6624152660369873, + "learning_rate": 1.9591704162648532e-05, + "loss": 0.9018, + "step": 732 + }, + { + "epoch": 0.034943865754534835, + "grad_norm": 2.5180978775024414, + "learning_rate": 1.9590592479012022e-05, + "loss": 1.1962, + "step": 733 + }, + { + "epoch": 0.034991538149834335, + "grad_norm": 2.042426824569702, + "learning_rate": 1.9589479315645857e-05, + "loss": 0.6018, + "step": 734 + }, + { + "epoch": 0.03503921054513384, + "grad_norm": 1.856363296508789, + "learning_rate": 1.9588364672721785e-05, + "loss": 0.989, + "step": 735 + }, + { + "epoch": 0.03508688294043334, + "grad_norm": 3.38390851020813, + "learning_rate": 1.9587248550411786e-05, + "loss": 0.6093, + "step": 736 + }, + { + "epoch": 0.03513455533573284, + "grad_norm": 1.378178358078003, + "learning_rate": 1.9586130948888064e-05, + "loss": 0.5984, + "step": 737 + }, + { + "epoch": 0.03518222773103235, + "grad_norm": 4.037237644195557, + "learning_rate": 1.9585011868323052e-05, + "loss": 0.7504, + "step": 738 + }, + { + "epoch": 0.03522990012633185, + "grad_norm": 1.9365379810333252, + "learning_rate": 1.958389130888942e-05, + "loss": 0.3903, + "step": 739 + }, + { + "epoch": 0.03527757252163135, + "grad_norm": 1.7285670042037964, + "learning_rate": 1.9582769270760055e-05, + "loss": 0.9453, + "step": 740 + }, + { + "epoch": 0.03532524491693085, + "grad_norm": 1.756253719329834, + "learning_rate": 1.958164575410807e-05, + "loss": 0.8585, + "step": 741 + }, + { + "epoch": 0.035372917312230356, + "grad_norm": 4.003931522369385, + "learning_rate": 1.958052075910682e-05, + "loss": 0.8641, + "step": 742 + }, + { + "epoch": 0.035420589707529856, + "grad_norm": 3.024296760559082, + "learning_rate": 1.9579394285929877e-05, + "loss": 1.0505, + "step": 743 + }, + { + "epoch": 0.03546826210282936, + "grad_norm": 1.138126015663147, + "learning_rate": 1.9578266334751045e-05, + "loss": 0.339, + "step": 744 + }, + { + "epoch": 0.03551593449812886, + "grad_norm": 2.1584057807922363, + "learning_rate": 1.9577136905744353e-05, + "loss": 0.7884, + "step": 745 + }, + { + "epoch": 0.035563606893428364, + "grad_norm": 1.7360907793045044, + "learning_rate": 1.957600599908406e-05, + "loss": 0.6847, + "step": 746 + }, + { + "epoch": 0.035611279288727864, + "grad_norm": 1.7327625751495361, + "learning_rate": 1.9574873614944657e-05, + "loss": 0.3045, + "step": 747 + }, + { + "epoch": 0.035658951684027364, + "grad_norm": 1.9865176677703857, + "learning_rate": 1.9573739753500857e-05, + "loss": 1.1515, + "step": 748 + }, + { + "epoch": 0.035706624079326864, + "grad_norm": 1.4304611682891846, + "learning_rate": 1.9572604414927604e-05, + "loss": 0.9681, + "step": 749 + }, + { + "epoch": 0.03575429647462637, + "grad_norm": 1.925475835800171, + "learning_rate": 1.957146759940007e-05, + "loss": 0.7573, + "step": 750 + }, + { + "epoch": 
0.03580196886992587, + "grad_norm": 2.9961085319519043, + "learning_rate": 1.9570329307093652e-05, + "loss": 1.0703, + "step": 751 + }, + { + "epoch": 0.03584964126522537, + "grad_norm": 1.74911367893219, + "learning_rate": 1.9569189538183978e-05, + "loss": 0.5074, + "step": 752 + }, + { + "epoch": 0.03589731366052487, + "grad_norm": 3.323850393295288, + "learning_rate": 1.95680482928469e-05, + "loss": 0.698, + "step": 753 + }, + { + "epoch": 0.03594498605582438, + "grad_norm": 2.003148078918457, + "learning_rate": 1.9566905571258502e-05, + "loss": 1.0439, + "step": 754 + }, + { + "epoch": 0.03599265845112388, + "grad_norm": 3.6319541931152344, + "learning_rate": 1.9565761373595094e-05, + "loss": 0.7797, + "step": 755 + }, + { + "epoch": 0.03604033084642338, + "grad_norm": 1.8461650609970093, + "learning_rate": 1.9564615700033215e-05, + "loss": 0.788, + "step": 756 + }, + { + "epoch": 0.03608800324172288, + "grad_norm": 1.9516270160675049, + "learning_rate": 1.956346855074963e-05, + "loss": 0.9934, + "step": 757 + }, + { + "epoch": 0.036135675637022385, + "grad_norm": 2.031405448913574, + "learning_rate": 1.9562319925921333e-05, + "loss": 0.7105, + "step": 758 + }, + { + "epoch": 0.036183348032321885, + "grad_norm": 1.166115403175354, + "learning_rate": 1.9561169825725546e-05, + "loss": 0.7417, + "step": 759 + }, + { + "epoch": 0.036231020427621385, + "grad_norm": 2.524076461791992, + "learning_rate": 1.9560018250339712e-05, + "loss": 1.2142, + "step": 760 + }, + { + "epoch": 0.036278692822920885, + "grad_norm": 1.542688012123108, + "learning_rate": 1.9558865199941515e-05, + "loss": 0.2566, + "step": 761 + }, + { + "epoch": 0.03632636521822039, + "grad_norm": 3.6229541301727295, + "learning_rate": 1.9557710674708853e-05, + "loss": 0.5391, + "step": 762 + }, + { + "epoch": 0.03637403761351989, + "grad_norm": 2.812225818634033, + "learning_rate": 1.955655467481986e-05, + "loss": 0.7649, + "step": 763 + }, + { + "epoch": 0.03642171000881939, + "grad_norm": 1.541874647140503, + "learning_rate": 1.9555397200452892e-05, + "loss": 0.8504, + "step": 764 + }, + { + "epoch": 0.03646938240411889, + "grad_norm": 1.5217840671539307, + "learning_rate": 1.9554238251786538e-05, + "loss": 0.8975, + "step": 765 + }, + { + "epoch": 0.0365170547994184, + "grad_norm": 2.1495580673217773, + "learning_rate": 1.9553077828999614e-05, + "loss": 0.9806, + "step": 766 + }, + { + "epoch": 0.0365647271947179, + "grad_norm": 1.58120596408844, + "learning_rate": 1.9551915932271156e-05, + "loss": 0.576, + "step": 767 + }, + { + "epoch": 0.0366123995900174, + "grad_norm": 1.8964204788208008, + "learning_rate": 1.9550752561780434e-05, + "loss": 0.5773, + "step": 768 + }, + { + "epoch": 0.0366600719853169, + "grad_norm": 2.6379222869873047, + "learning_rate": 1.9549587717706952e-05, + "loss": 0.6676, + "step": 769 + }, + { + "epoch": 0.03670774438061641, + "grad_norm": 1.5131341218948364, + "learning_rate": 1.9548421400230418e-05, + "loss": 0.6419, + "step": 770 + }, + { + "epoch": 0.03675541677591591, + "grad_norm": 4.6081414222717285, + "learning_rate": 1.9547253609530797e-05, + "loss": 0.2493, + "step": 771 + }, + { + "epoch": 0.03680308917121541, + "grad_norm": 1.161028504371643, + "learning_rate": 1.954608434578826e-05, + "loss": 0.3664, + "step": 772 + }, + { + "epoch": 0.03685076156651491, + "grad_norm": 1.630754828453064, + "learning_rate": 1.9544913609183214e-05, + "loss": 0.9671, + "step": 773 + }, + { + "epoch": 0.036898433961814414, + "grad_norm": 6.992397308349609, + "learning_rate": 1.9543741399896295e-05, + 
"loss": 0.1969, + "step": 774 + }, + { + "epoch": 0.036946106357113914, + "grad_norm": 1.7193506956100464, + "learning_rate": 1.9542567718108357e-05, + "loss": 0.3565, + "step": 775 + }, + { + "epoch": 0.036993778752413414, + "grad_norm": 32.21617126464844, + "learning_rate": 1.954139256400049e-05, + "loss": 0.9591, + "step": 776 + }, + { + "epoch": 0.037041451147712914, + "grad_norm": 1.2193129062652588, + "learning_rate": 1.954021593775401e-05, + "loss": 0.6884, + "step": 777 + }, + { + "epoch": 0.03708912354301242, + "grad_norm": 1.4926356077194214, + "learning_rate": 1.953903783955045e-05, + "loss": 0.7923, + "step": 778 + }, + { + "epoch": 0.03713679593831192, + "grad_norm": 5.782951354980469, + "learning_rate": 1.953785826957159e-05, + "loss": 0.8947, + "step": 779 + }, + { + "epoch": 0.03718446833361142, + "grad_norm": 1.9362940788269043, + "learning_rate": 1.9536677227999415e-05, + "loss": 0.7882, + "step": 780 + }, + { + "epoch": 0.03723214072891092, + "grad_norm": 2.3215765953063965, + "learning_rate": 1.953549471501616e-05, + "loss": 0.5856, + "step": 781 + }, + { + "epoch": 0.03727981312421043, + "grad_norm": 1.9120577573776245, + "learning_rate": 1.953431073080426e-05, + "loss": 0.6924, + "step": 782 + }, + { + "epoch": 0.03732748551950993, + "grad_norm": 6.096733093261719, + "learning_rate": 1.95331252755464e-05, + "loss": 0.4019, + "step": 783 + }, + { + "epoch": 0.03737515791480943, + "grad_norm": 1.5409770011901855, + "learning_rate": 1.9531938349425484e-05, + "loss": 0.8029, + "step": 784 + }, + { + "epoch": 0.03742283031010893, + "grad_norm": 2.147329092025757, + "learning_rate": 1.953074995262464e-05, + "loss": 1.0116, + "step": 785 + }, + { + "epoch": 0.037470502705408436, + "grad_norm": 2.0393097400665283, + "learning_rate": 1.9529560085327227e-05, + "loss": 1.0883, + "step": 786 + }, + { + "epoch": 0.037518175100707936, + "grad_norm": 1.857731819152832, + "learning_rate": 1.9528368747716827e-05, + "loss": 0.8214, + "step": 787 + }, + { + "epoch": 0.037565847496007436, + "grad_norm": 7.41095495223999, + "learning_rate": 1.9527175939977252e-05, + "loss": 1.1811, + "step": 788 + }, + { + "epoch": 0.037613519891306936, + "grad_norm": 1.9293097257614136, + "learning_rate": 1.952598166229254e-05, + "loss": 0.5862, + "step": 789 + }, + { + "epoch": 0.03766119228660644, + "grad_norm": 6.753727436065674, + "learning_rate": 1.9524785914846956e-05, + "loss": 0.5971, + "step": 790 + }, + { + "epoch": 0.03770886468190594, + "grad_norm": 1.517684817314148, + "learning_rate": 1.9523588697824995e-05, + "loss": 0.7392, + "step": 791 + }, + { + "epoch": 0.03775653707720544, + "grad_norm": 1.3403767347335815, + "learning_rate": 1.952239001141137e-05, + "loss": 0.6231, + "step": 792 + }, + { + "epoch": 0.03780420947250494, + "grad_norm": 5.754158020019531, + "learning_rate": 1.9521189855791026e-05, + "loss": 1.4619, + "step": 793 + }, + { + "epoch": 0.03785188186780445, + "grad_norm": 1.7080199718475342, + "learning_rate": 1.9519988231149142e-05, + "loss": 0.84, + "step": 794 + }, + { + "epoch": 0.03789955426310395, + "grad_norm": 2.198056936264038, + "learning_rate": 1.9518785137671107e-05, + "loss": 0.7139, + "step": 795 + }, + { + "epoch": 0.03794722665840345, + "grad_norm": 1.9409888982772827, + "learning_rate": 1.9517580575542546e-05, + "loss": 0.8135, + "step": 796 + }, + { + "epoch": 0.03799489905370295, + "grad_norm": 1.7660938501358032, + "learning_rate": 1.951637454494932e-05, + "loss": 0.7041, + "step": 797 + }, + { + "epoch": 0.03804257144900246, + "grad_norm": 
1.3157745599746704, + "learning_rate": 1.95151670460775e-05, + "loss": 0.6999, + "step": 798 + }, + { + "epoch": 0.03809024384430196, + "grad_norm": 1.2755836248397827, + "learning_rate": 1.951395807911339e-05, + "loss": 0.8245, + "step": 799 + }, + { + "epoch": 0.03813791623960146, + "grad_norm": 1.8460443019866943, + "learning_rate": 1.9512747644243525e-05, + "loss": 1.016, + "step": 800 + }, + { + "epoch": 0.03818558863490096, + "grad_norm": 1.0960500240325928, + "learning_rate": 1.9511535741654663e-05, + "loss": 0.6018, + "step": 801 + }, + { + "epoch": 0.038233261030200465, + "grad_norm": 1.4150505065917969, + "learning_rate": 1.9510322371533783e-05, + "loss": 0.9056, + "step": 802 + }, + { + "epoch": 0.038280933425499965, + "grad_norm": 2.163686513900757, + "learning_rate": 1.95091075340681e-05, + "loss": 0.8344, + "step": 803 + }, + { + "epoch": 0.038328605820799465, + "grad_norm": 4.431417465209961, + "learning_rate": 1.950789122944505e-05, + "loss": 0.1334, + "step": 804 + }, + { + "epoch": 0.038376278216098965, + "grad_norm": 2.1695454120635986, + "learning_rate": 1.9506673457852293e-05, + "loss": 0.9056, + "step": 805 + }, + { + "epoch": 0.03842395061139847, + "grad_norm": 2.3549680709838867, + "learning_rate": 1.9505454219477718e-05, + "loss": 1.0862, + "step": 806 + }, + { + "epoch": 0.03847162300669797, + "grad_norm": 1.5020617246627808, + "learning_rate": 1.950423351450945e-05, + "loss": 0.8241, + "step": 807 + }, + { + "epoch": 0.03851929540199747, + "grad_norm": 1.5390362739562988, + "learning_rate": 1.9503011343135828e-05, + "loss": 0.7707, + "step": 808 + }, + { + "epoch": 0.03856696779729697, + "grad_norm": 2.2688674926757812, + "learning_rate": 1.9501787705545412e-05, + "loss": 0.9565, + "step": 809 + }, + { + "epoch": 0.03861464019259648, + "grad_norm": 1.713032841682434, + "learning_rate": 1.9500562601927003e-05, + "loss": 0.6219, + "step": 810 + }, + { + "epoch": 0.03866231258789598, + "grad_norm": 1.7460881471633911, + "learning_rate": 1.9499336032469626e-05, + "loss": 0.8585, + "step": 811 + }, + { + "epoch": 0.03870998498319548, + "grad_norm": 2.7170376777648926, + "learning_rate": 1.949810799736252e-05, + "loss": 1.1543, + "step": 812 + }, + { + "epoch": 0.03875765737849498, + "grad_norm": 1.212214469909668, + "learning_rate": 1.949687849679516e-05, + "loss": 0.6487, + "step": 813 + }, + { + "epoch": 0.038805329773794486, + "grad_norm": 4.563180446624756, + "learning_rate": 1.949564753095725e-05, + "loss": 1.3672, + "step": 814 + }, + { + "epoch": 0.038853002169093986, + "grad_norm": 1.9729410409927368, + "learning_rate": 1.949441510003871e-05, + "loss": 0.998, + "step": 815 + }, + { + "epoch": 0.038900674564393486, + "grad_norm": 1.927474021911621, + "learning_rate": 1.9493181204229696e-05, + "loss": 0.96, + "step": 816 + }, + { + "epoch": 0.03894834695969299, + "grad_norm": 1.0984028577804565, + "learning_rate": 1.949194584372058e-05, + "loss": 0.6064, + "step": 817 + }, + { + "epoch": 0.038996019354992494, + "grad_norm": 2.086822748184204, + "learning_rate": 1.9490709018701967e-05, + "loss": 1.0785, + "step": 818 + }, + { + "epoch": 0.039043691750291994, + "grad_norm": 2.5809688568115234, + "learning_rate": 1.9489470729364694e-05, + "loss": 0.7195, + "step": 819 + }, + { + "epoch": 0.039091364145591494, + "grad_norm": 1.8625272512435913, + "learning_rate": 1.9488230975899804e-05, + "loss": 0.9258, + "step": 820 + }, + { + "epoch": 0.039139036540890994, + "grad_norm": 1.8708604574203491, + "learning_rate": 1.948698975849859e-05, + "loss": 1.1211, + "step": 821 
+ }, + { + "epoch": 0.0391867089361905, + "grad_norm": 1.9733126163482666, + "learning_rate": 1.9485747077352547e-05, + "loss": 1.0904, + "step": 822 + }, + { + "epoch": 0.03923438133149, + "grad_norm": 1.703894019126892, + "learning_rate": 1.948450293265342e-05, + "loss": 0.6765, + "step": 823 + }, + { + "epoch": 0.0392820537267895, + "grad_norm": 1.9978395700454712, + "learning_rate": 1.948325732459316e-05, + "loss": 0.539, + "step": 824 + }, + { + "epoch": 0.039329726122089, + "grad_norm": 1.8047763109207153, + "learning_rate": 1.948201025336395e-05, + "loss": 0.9598, + "step": 825 + }, + { + "epoch": 0.03937739851738851, + "grad_norm": 2.928663969039917, + "learning_rate": 1.9480761719158208e-05, + "loss": 0.7686, + "step": 826 + }, + { + "epoch": 0.03942507091268801, + "grad_norm": 1.6891142129898071, + "learning_rate": 1.9479511722168567e-05, + "loss": 0.696, + "step": 827 + }, + { + "epoch": 0.03947274330798751, + "grad_norm": 1.5037572383880615, + "learning_rate": 1.947826026258788e-05, + "loss": 0.9345, + "step": 828 + }, + { + "epoch": 0.03952041570328701, + "grad_norm": 1.8177145719528198, + "learning_rate": 1.947700734060925e-05, + "loss": 0.7538, + "step": 829 + }, + { + "epoch": 0.039568088098586515, + "grad_norm": 1.433455467224121, + "learning_rate": 1.9475752956425978e-05, + "loss": 0.7786, + "step": 830 + }, + { + "epoch": 0.039615760493886015, + "grad_norm": 3.665915012359619, + "learning_rate": 1.9474497110231607e-05, + "loss": 1.3014, + "step": 831 + }, + { + "epoch": 0.039663432889185515, + "grad_norm": 3.8191745281219482, + "learning_rate": 1.94732398022199e-05, + "loss": 0.9444, + "step": 832 + }, + { + "epoch": 0.03971110528448502, + "grad_norm": 1.385881781578064, + "learning_rate": 1.9471981032584846e-05, + "loss": 0.5207, + "step": 833 + }, + { + "epoch": 0.03975877767978452, + "grad_norm": 1.8778563737869263, + "learning_rate": 1.9470720801520665e-05, + "loss": 0.624, + "step": 834 + }, + { + "epoch": 0.03980645007508402, + "grad_norm": 2.0841329097747803, + "learning_rate": 1.946945910922179e-05, + "loss": 0.4488, + "step": 835 + }, + { + "epoch": 0.03985412247038352, + "grad_norm": 3.347536087036133, + "learning_rate": 1.9468195955882892e-05, + "loss": 0.8075, + "step": 836 + }, + { + "epoch": 0.03990179486568303, + "grad_norm": 3.5374181270599365, + "learning_rate": 1.946693134169886e-05, + "loss": 0.7185, + "step": 837 + }, + { + "epoch": 0.03994946726098253, + "grad_norm": 1.4362339973449707, + "learning_rate": 1.9465665266864815e-05, + "loss": 0.6004, + "step": 838 + }, + { + "epoch": 0.03999713965628203, + "grad_norm": 2.940203905105591, + "learning_rate": 1.9464397731576093e-05, + "loss": 0.9549, + "step": 839 + }, + { + "epoch": 0.04004481205158153, + "grad_norm": 1.480191946029663, + "learning_rate": 1.946312873602827e-05, + "loss": 0.6108, + "step": 840 + }, + { + "epoch": 0.04009248444688104, + "grad_norm": 1.2709978818893433, + "learning_rate": 1.9461858280417134e-05, + "loss": 0.6637, + "step": 841 + }, + { + "epoch": 0.04014015684218054, + "grad_norm": 1.8085658550262451, + "learning_rate": 1.94605863649387e-05, + "loss": 1.2645, + "step": 842 + }, + { + "epoch": 0.04018782923748004, + "grad_norm": 1.4298356771469116, + "learning_rate": 1.945931298978922e-05, + "loss": 0.2238, + "step": 843 + }, + { + "epoch": 0.04023550163277954, + "grad_norm": 9.08450698852539, + "learning_rate": 1.9458038155165157e-05, + "loss": 2.0859, + "step": 844 + }, + { + "epoch": 0.040283174028079044, + "grad_norm": 2.0305638313293457, + "learning_rate": 
1.94567618612632e-05, + "loss": 0.7403, + "step": 845 + }, + { + "epoch": 0.040330846423378544, + "grad_norm": 2.2413032054901123, + "learning_rate": 1.9455484108280277e-05, + "loss": 0.8184, + "step": 846 + }, + { + "epoch": 0.040378518818678044, + "grad_norm": 1.2897911071777344, + "learning_rate": 1.945420489641353e-05, + "loss": 0.4618, + "step": 847 + }, + { + "epoch": 0.040426191213977544, + "grad_norm": 3.2265336513519287, + "learning_rate": 1.945292422586033e-05, + "loss": 1.0359, + "step": 848 + }, + { + "epoch": 0.04047386360927705, + "grad_norm": 1.4464894533157349, + "learning_rate": 1.9451642096818258e-05, + "loss": 0.662, + "step": 849 + }, + { + "epoch": 0.04052153600457655, + "grad_norm": 1.6860398054122925, + "learning_rate": 1.9450358509485152e-05, + "loss": 1.149, + "step": 850 + }, + { + "epoch": 0.04056920839987605, + "grad_norm": 1.7164076566696167, + "learning_rate": 1.9449073464059048e-05, + "loss": 0.7881, + "step": 851 + }, + { + "epoch": 0.04061688079517555, + "grad_norm": 1.583937644958496, + "learning_rate": 1.9447786960738212e-05, + "loss": 0.7219, + "step": 852 + }, + { + "epoch": 0.04066455319047506, + "grad_norm": 2.1447505950927734, + "learning_rate": 1.944649899972114e-05, + "loss": 0.5416, + "step": 853 + }, + { + "epoch": 0.04071222558577456, + "grad_norm": 2.40384840965271, + "learning_rate": 1.9445209581206557e-05, + "loss": 0.9789, + "step": 854 + }, + { + "epoch": 0.04075989798107406, + "grad_norm": 1.6884760856628418, + "learning_rate": 1.94439187053934e-05, + "loss": 0.9452, + "step": 855 + }, + { + "epoch": 0.04080757037637356, + "grad_norm": 1.678084373474121, + "learning_rate": 1.9442626372480838e-05, + "loss": 0.9266, + "step": 856 + }, + { + "epoch": 0.040855242771673066, + "grad_norm": 1.510408878326416, + "learning_rate": 1.944133258266827e-05, + "loss": 0.5553, + "step": 857 + }, + { + "epoch": 0.040902915166972566, + "grad_norm": 1.5903170108795166, + "learning_rate": 1.944003733615531e-05, + "loss": 0.8193, + "step": 858 + }, + { + "epoch": 0.040950587562272066, + "grad_norm": 1.9291753768920898, + "learning_rate": 1.9438740633141804e-05, + "loss": 0.7051, + "step": 859 + }, + { + "epoch": 0.040998259957571566, + "grad_norm": 1.2462096214294434, + "learning_rate": 1.9437442473827818e-05, + "loss": 0.4045, + "step": 860 + }, + { + "epoch": 0.04104593235287107, + "grad_norm": 1.6083624362945557, + "learning_rate": 1.9436142858413648e-05, + "loss": 0.6697, + "step": 861 + }, + { + "epoch": 0.04109360474817057, + "grad_norm": 1.9976297616958618, + "learning_rate": 1.9434841787099804e-05, + "loss": 0.8119, + "step": 862 + }, + { + "epoch": 0.04114127714347007, + "grad_norm": 1.580064058303833, + "learning_rate": 1.9433539260087033e-05, + "loss": 1.0078, + "step": 863 + }, + { + "epoch": 0.04118894953876957, + "grad_norm": 1.811970829963684, + "learning_rate": 1.9432235277576304e-05, + "loss": 0.8918, + "step": 864 + }, + { + "epoch": 0.04123662193406908, + "grad_norm": 2.8056108951568604, + "learning_rate": 1.9430929839768803e-05, + "loss": 0.4796, + "step": 865 + }, + { + "epoch": 0.04128429432936858, + "grad_norm": 3.3034982681274414, + "learning_rate": 1.9429622946865946e-05, + "loss": 0.9444, + "step": 866 + }, + { + "epoch": 0.04133196672466808, + "grad_norm": 1.4448636770248413, + "learning_rate": 1.9428314599069375e-05, + "loss": 0.7058, + "step": 867 + }, + { + "epoch": 0.04137963911996758, + "grad_norm": 1.519383192062378, + "learning_rate": 1.9427004796580954e-05, + "loss": 0.7638, + "step": 868 + }, + { + "epoch": 
0.04142731151526709, + "grad_norm": 4.433587074279785, + "learning_rate": 1.9425693539602773e-05, + "loss": 0.8145, + "step": 869 + }, + { + "epoch": 0.04147498391056659, + "grad_norm": 1.7333942651748657, + "learning_rate": 1.9424380828337146e-05, + "loss": 0.7182, + "step": 870 + }, + { + "epoch": 0.04152265630586609, + "grad_norm": 1.652495265007019, + "learning_rate": 1.9423066662986607e-05, + "loss": 0.3631, + "step": 871 + }, + { + "epoch": 0.04157032870116559, + "grad_norm": 1.9444339275360107, + "learning_rate": 1.942175104375392e-05, + "loss": 1.16, + "step": 872 + }, + { + "epoch": 0.041618001096465095, + "grad_norm": 2.744253396987915, + "learning_rate": 1.9420433970842078e-05, + "loss": 0.545, + "step": 873 + }, + { + "epoch": 0.041665673491764595, + "grad_norm": 1.701715111732483, + "learning_rate": 1.941911544445428e-05, + "loss": 0.8751, + "step": 874 + }, + { + "epoch": 0.041713345887064095, + "grad_norm": 1.4953023195266724, + "learning_rate": 1.941779546479397e-05, + "loss": 0.8856, + "step": 875 + }, + { + "epoch": 0.041761018282363595, + "grad_norm": 1.8698402643203735, + "learning_rate": 1.9416474032064803e-05, + "loss": 0.6765, + "step": 876 + }, + { + "epoch": 0.0418086906776631, + "grad_norm": 3.711998224258423, + "learning_rate": 1.9415151146470665e-05, + "loss": 0.8479, + "step": 877 + }, + { + "epoch": 0.0418563630729626, + "grad_norm": 2.123957633972168, + "learning_rate": 1.9413826808215665e-05, + "loss": 0.6198, + "step": 878 + }, + { + "epoch": 0.0419040354682621, + "grad_norm": 1.6366348266601562, + "learning_rate": 1.941250101750413e-05, + "loss": 0.7139, + "step": 879 + }, + { + "epoch": 0.0419517078635616, + "grad_norm": 1.9562444686889648, + "learning_rate": 1.9411173774540616e-05, + "loss": 0.6294, + "step": 880 + }, + { + "epoch": 0.04199938025886111, + "grad_norm": 3.5185775756835938, + "learning_rate": 1.9409845079529907e-05, + "loss": 0.9229, + "step": 881 + }, + { + "epoch": 0.04204705265416061, + "grad_norm": 1.930294394493103, + "learning_rate": 1.9408514932677e-05, + "loss": 0.8511, + "step": 882 + }, + { + "epoch": 0.04209472504946011, + "grad_norm": 1.7541227340698242, + "learning_rate": 1.9407183334187132e-05, + "loss": 0.8352, + "step": 883 + }, + { + "epoch": 0.04214239744475961, + "grad_norm": 1.5182242393493652, + "learning_rate": 1.940585028426575e-05, + "loss": 0.6217, + "step": 884 + }, + { + "epoch": 0.042190069840059116, + "grad_norm": 1.8734807968139648, + "learning_rate": 1.9404515783118533e-05, + "loss": 0.6807, + "step": 885 + }, + { + "epoch": 0.042237742235358616, + "grad_norm": 2.094998598098755, + "learning_rate": 1.9403179830951376e-05, + "loss": 0.4722, + "step": 886 + }, + { + "epoch": 0.042285414630658116, + "grad_norm": 1.5760083198547363, + "learning_rate": 1.9401842427970406e-05, + "loss": 0.9682, + "step": 887 + }, + { + "epoch": 0.042333087025957616, + "grad_norm": 1.9804329872131348, + "learning_rate": 1.940050357438197e-05, + "loss": 0.2387, + "step": 888 + }, + { + "epoch": 0.04238075942125712, + "grad_norm": 1.821393609046936, + "learning_rate": 1.9399163270392637e-05, + "loss": 0.9595, + "step": 889 + }, + { + "epoch": 0.042428431816556623, + "grad_norm": 14.376471519470215, + "learning_rate": 1.9397821516209207e-05, + "loss": 0.4811, + "step": 890 + }, + { + "epoch": 0.042476104211856124, + "grad_norm": 3.35683012008667, + "learning_rate": 1.9396478312038694e-05, + "loss": 1.1819, + "step": 891 + }, + { + "epoch": 0.042523776607155624, + "grad_norm": 2.020669937133789, + "learning_rate": 1.9395133658088344e-05, 
+ "loss": 1.2711, + "step": 892 + }, + { + "epoch": 0.04257144900245513, + "grad_norm": 1.6300127506256104, + "learning_rate": 1.9393787554565618e-05, + "loss": 1.1336, + "step": 893 + }, + { + "epoch": 0.04261912139775463, + "grad_norm": 2.556307554244995, + "learning_rate": 1.9392440001678213e-05, + "loss": 1.2215, + "step": 894 + }, + { + "epoch": 0.04266679379305413, + "grad_norm": 1.5415953397750854, + "learning_rate": 1.9391090999634038e-05, + "loss": 1.1214, + "step": 895 + }, + { + "epoch": 0.04271446618835363, + "grad_norm": 1.6759424209594727, + "learning_rate": 1.9389740548641232e-05, + "loss": 0.6674, + "step": 896 + }, + { + "epoch": 0.04276213858365314, + "grad_norm": 1.8798645734786987, + "learning_rate": 1.9388388648908156e-05, + "loss": 0.8414, + "step": 897 + }, + { + "epoch": 0.04280981097895264, + "grad_norm": 1.722214698791504, + "learning_rate": 1.9387035300643392e-05, + "loss": 0.8246, + "step": 898 + }, + { + "epoch": 0.04285748337425214, + "grad_norm": 1.241234302520752, + "learning_rate": 1.9385680504055746e-05, + "loss": 0.7037, + "step": 899 + }, + { + "epoch": 0.04290515576955164, + "grad_norm": 1.4477009773254395, + "learning_rate": 1.9384324259354254e-05, + "loss": 0.5023, + "step": 900 + }, + { + "epoch": 0.042952828164851145, + "grad_norm": 1.56638503074646, + "learning_rate": 1.938296656674817e-05, + "loss": 0.7424, + "step": 901 + }, + { + "epoch": 0.043000500560150645, + "grad_norm": 3.264298677444458, + "learning_rate": 1.938160742644697e-05, + "loss": 0.6612, + "step": 902 + }, + { + "epoch": 0.043048172955450145, + "grad_norm": 1.6142951250076294, + "learning_rate": 1.9380246838660356e-05, + "loss": 0.9295, + "step": 903 + }, + { + "epoch": 0.043095845350749645, + "grad_norm": 2.484809637069702, + "learning_rate": 1.937888480359825e-05, + "loss": 1.0406, + "step": 904 + }, + { + "epoch": 0.04314351774604915, + "grad_norm": 1.3895137310028076, + "learning_rate": 1.9377521321470806e-05, + "loss": 1.0379, + "step": 905 + }, + { + "epoch": 0.04319119014134865, + "grad_norm": 1.5292794704437256, + "learning_rate": 1.937615639248839e-05, + "loss": 0.7214, + "step": 906 + }, + { + "epoch": 0.04323886253664815, + "grad_norm": 1.3762729167938232, + "learning_rate": 1.93747900168616e-05, + "loss": 0.6806, + "step": 907 + }, + { + "epoch": 0.04328653493194765, + "grad_norm": 4.4749755859375, + "learning_rate": 1.937342219480125e-05, + "loss": 0.621, + "step": 908 + }, + { + "epoch": 0.04333420732724716, + "grad_norm": 1.5729974508285522, + "learning_rate": 1.9372052926518386e-05, + "loss": 0.8714, + "step": 909 + }, + { + "epoch": 0.04338187972254666, + "grad_norm": 1.6284842491149902, + "learning_rate": 1.937068221222427e-05, + "loss": 0.6602, + "step": 910 + }, + { + "epoch": 0.04342955211784616, + "grad_norm": 4.40969181060791, + "learning_rate": 1.936931005213038e-05, + "loss": 1.0772, + "step": 911 + }, + { + "epoch": 0.04347722451314566, + "grad_norm": 3.7697627544403076, + "learning_rate": 1.936793644644844e-05, + "loss": 1.2258, + "step": 912 + }, + { + "epoch": 0.04352489690844517, + "grad_norm": 1.448448896408081, + "learning_rate": 1.936656139539038e-05, + "loss": 0.7728, + "step": 913 + }, + { + "epoch": 0.04357256930374467, + "grad_norm": 1.5610464811325073, + "learning_rate": 1.936518489916835e-05, + "loss": 0.7313, + "step": 914 + }, + { + "epoch": 0.04362024169904417, + "grad_norm": 7.224313735961914, + "learning_rate": 1.936380695799473e-05, + "loss": 0.9441, + "step": 915 + }, + { + "epoch": 0.04366791409434367, + "grad_norm": 
1.8962578773498535, + "learning_rate": 1.936242757208213e-05, + "loss": 0.9251, + "step": 916 + }, + { + "epoch": 0.043715586489643174, + "grad_norm": 4.338345527648926, + "learning_rate": 1.936104674164337e-05, + "loss": 1.07, + "step": 917 + }, + { + "epoch": 0.043763258884942674, + "grad_norm": 1.7759220600128174, + "learning_rate": 1.9359664466891495e-05, + "loss": 0.9401, + "step": 918 + }, + { + "epoch": 0.043810931280242174, + "grad_norm": 1.3625247478485107, + "learning_rate": 1.9358280748039776e-05, + "loss": 0.6422, + "step": 919 + }, + { + "epoch": 0.043858603675541674, + "grad_norm": 5.836887359619141, + "learning_rate": 1.9356895585301715e-05, + "loss": 1.2319, + "step": 920 + }, + { + "epoch": 0.04390627607084118, + "grad_norm": 2.191404342651367, + "learning_rate": 1.935550897889102e-05, + "loss": 1.1688, + "step": 921 + }, + { + "epoch": 0.04395394846614068, + "grad_norm": 1.1320335865020752, + "learning_rate": 1.9354120929021633e-05, + "loss": 0.8847, + "step": 922 + }, + { + "epoch": 0.04400162086144018, + "grad_norm": 1.4333887100219727, + "learning_rate": 1.9352731435907715e-05, + "loss": 0.702, + "step": 923 + }, + { + "epoch": 0.04404929325673969, + "grad_norm": 1.5401864051818848, + "learning_rate": 1.9351340499763654e-05, + "loss": 0.8675, + "step": 924 + }, + { + "epoch": 0.04409696565203919, + "grad_norm": 1.810653805732727, + "learning_rate": 1.934994812080405e-05, + "loss": 0.7717, + "step": 925 + }, + { + "epoch": 0.04414463804733869, + "grad_norm": 1.6668885946273804, + "learning_rate": 1.9348554299243737e-05, + "loss": 0.6602, + "step": 926 + }, + { + "epoch": 0.04419231044263819, + "grad_norm": 0.8640025854110718, + "learning_rate": 1.934715903529777e-05, + "loss": 0.372, + "step": 927 + }, + { + "epoch": 0.044239982837937696, + "grad_norm": 6.436641693115234, + "learning_rate": 1.934576232918142e-05, + "loss": 0.4044, + "step": 928 + }, + { + "epoch": 0.044287655233237196, + "grad_norm": 3.303285598754883, + "learning_rate": 1.9344364181110185e-05, + "loss": 1.3519, + "step": 929 + }, + { + "epoch": 0.044335327628536696, + "grad_norm": 2.375943183898926, + "learning_rate": 1.9342964591299785e-05, + "loss": 0.4268, + "step": 930 + }, + { + "epoch": 0.044383000023836196, + "grad_norm": 2.2730588912963867, + "learning_rate": 1.934156355996616e-05, + "loss": 0.5779, + "step": 931 + }, + { + "epoch": 0.0444306724191357, + "grad_norm": 3.625521183013916, + "learning_rate": 1.9340161087325483e-05, + "loss": 0.8949, + "step": 932 + }, + { + "epoch": 0.0444783448144352, + "grad_norm": 1.8681919574737549, + "learning_rate": 1.9338757173594128e-05, + "loss": 1.163, + "step": 933 + }, + { + "epoch": 0.0445260172097347, + "grad_norm": 1.3595960140228271, + "learning_rate": 1.9337351818988718e-05, + "loss": 0.6256, + "step": 934 + }, + { + "epoch": 0.0445736896050342, + "grad_norm": 1.5224964618682861, + "learning_rate": 1.9335945023726076e-05, + "loss": 0.7347, + "step": 935 + }, + { + "epoch": 0.04462136200033371, + "grad_norm": 1.216257095336914, + "learning_rate": 1.933453678802326e-05, + "loss": 0.537, + "step": 936 + }, + { + "epoch": 0.04466903439563321, + "grad_norm": 1.4969592094421387, + "learning_rate": 1.9333127112097543e-05, + "loss": 0.6825, + "step": 937 + }, + { + "epoch": 0.04471670679093271, + "grad_norm": 1.4242290258407593, + "learning_rate": 1.9331715996166424e-05, + "loss": 0.5725, + "step": 938 + }, + { + "epoch": 0.04476437918623221, + "grad_norm": 1.623018503189087, + "learning_rate": 1.9330303440447627e-05, + "loss": 0.6613, + "step": 939 + }, + 
{ + "epoch": 0.04481205158153172, + "grad_norm": 1.5217169523239136, + "learning_rate": 1.9328889445159094e-05, + "loss": 0.6965, + "step": 940 + }, + { + "epoch": 0.04485972397683122, + "grad_norm": 2.455193281173706, + "learning_rate": 1.9327474010518983e-05, + "loss": 0.7261, + "step": 941 + }, + { + "epoch": 0.04490739637213072, + "grad_norm": 3.400991201400757, + "learning_rate": 1.932605713674569e-05, + "loss": 0.9366, + "step": 942 + }, + { + "epoch": 0.04495506876743022, + "grad_norm": 3.395486354827881, + "learning_rate": 1.932463882405782e-05, + "loss": 0.8198, + "step": 943 + }, + { + "epoch": 0.045002741162729724, + "grad_norm": 3.2462515830993652, + "learning_rate": 1.9323219072674207e-05, + "loss": 1.3122, + "step": 944 + }, + { + "epoch": 0.045050413558029224, + "grad_norm": 2.7918636798858643, + "learning_rate": 1.9321797882813903e-05, + "loss": 1.2664, + "step": 945 + }, + { + "epoch": 0.045098085953328725, + "grad_norm": 9.712040901184082, + "learning_rate": 1.9320375254696177e-05, + "loss": 0.6191, + "step": 946 + }, + { + "epoch": 0.045145758348628225, + "grad_norm": 2.626828908920288, + "learning_rate": 1.9318951188540534e-05, + "loss": 0.3006, + "step": 947 + }, + { + "epoch": 0.04519343074392773, + "grad_norm": 1.2030017375946045, + "learning_rate": 1.9317525684566686e-05, + "loss": 0.4254, + "step": 948 + }, + { + "epoch": 0.04524110313922723, + "grad_norm": 1.4220699071884155, + "learning_rate": 1.9316098742994578e-05, + "loss": 0.7015, + "step": 949 + }, + { + "epoch": 0.04528877553452673, + "grad_norm": 1.5147393941879272, + "learning_rate": 1.9314670364044374e-05, + "loss": 0.6388, + "step": 950 + }, + { + "epoch": 0.04533644792982623, + "grad_norm": 1.8807706832885742, + "learning_rate": 1.931324054793645e-05, + "loss": 0.7108, + "step": 951 + }, + { + "epoch": 0.04538412032512574, + "grad_norm": 1.460456132888794, + "learning_rate": 1.9311809294891422e-05, + "loss": 0.7543, + "step": 952 + }, + { + "epoch": 0.04543179272042524, + "grad_norm": 1.7707245349884033, + "learning_rate": 1.931037660513011e-05, + "loss": 1.1482, + "step": 953 + }, + { + "epoch": 0.04547946511572474, + "grad_norm": 1.3123273849487305, + "learning_rate": 1.930894247887357e-05, + "loss": 0.728, + "step": 954 + }, + { + "epoch": 0.04552713751102424, + "grad_norm": 0.8915164470672607, + "learning_rate": 1.9307506916343066e-05, + "loss": 0.3218, + "step": 955 + }, + { + "epoch": 0.045574809906323746, + "grad_norm": 2.099499225616455, + "learning_rate": 1.930606991776009e-05, + "loss": 0.9247, + "step": 956 + }, + { + "epoch": 0.045622482301623246, + "grad_norm": 3.0598840713500977, + "learning_rate": 1.9304631483346364e-05, + "loss": 0.6489, + "step": 957 + }, + { + "epoch": 0.045670154696922746, + "grad_norm": 1.5167876482009888, + "learning_rate": 1.930319161332382e-05, + "loss": 0.7977, + "step": 958 + }, + { + "epoch": 0.045717827092222246, + "grad_norm": 6.17631721496582, + "learning_rate": 1.930175030791461e-05, + "loss": 1.1156, + "step": 959 + }, + { + "epoch": 0.04576549948752175, + "grad_norm": 1.7657543420791626, + "learning_rate": 1.9300307567341124e-05, + "loss": 0.7452, + "step": 960 + }, + { + "epoch": 0.04581317188282125, + "grad_norm": 1.6475632190704346, + "learning_rate": 1.9298863391825954e-05, + "loss": 0.4463, + "step": 961 + }, + { + "epoch": 0.04586084427812075, + "grad_norm": 4.049325942993164, + "learning_rate": 1.929741778159192e-05, + "loss": 0.4196, + "step": 962 + }, + { + "epoch": 0.045908516673420253, + "grad_norm": 4.7456865310668945, + "learning_rate": 
1.9295970736862063e-05, + "loss": 0.7544, + "step": 963 + }, + { + "epoch": 0.04595618906871976, + "grad_norm": 1.6404321193695068, + "learning_rate": 1.9294522257859655e-05, + "loss": 0.8724, + "step": 964 + }, + { + "epoch": 0.04600386146401926, + "grad_norm": 1.412407398223877, + "learning_rate": 1.929307234480818e-05, + "loss": 0.9744, + "step": 965 + }, + { + "epoch": 0.04605153385931876, + "grad_norm": 3.00014591217041, + "learning_rate": 1.929162099793134e-05, + "loss": 0.9947, + "step": 966 + }, + { + "epoch": 0.04609920625461826, + "grad_norm": 1.8120754957199097, + "learning_rate": 1.9290168217453066e-05, + "loss": 1.1211, + "step": 967 + }, + { + "epoch": 0.04614687864991777, + "grad_norm": 1.742107629776001, + "learning_rate": 1.9288714003597504e-05, + "loss": 1.0559, + "step": 968 + }, + { + "epoch": 0.04619455104521727, + "grad_norm": 1.3778812885284424, + "learning_rate": 1.928725835658903e-05, + "loss": 0.5452, + "step": 969 + }, + { + "epoch": 0.04624222344051677, + "grad_norm": 1.5840113162994385, + "learning_rate": 1.9285801276652226e-05, + "loss": 0.7015, + "step": 970 + }, + { + "epoch": 0.04628989583581627, + "grad_norm": 2.1550915241241455, + "learning_rate": 1.9284342764011917e-05, + "loss": 0.838, + "step": 971 + }, + { + "epoch": 0.046337568231115775, + "grad_norm": 3.1609108448028564, + "learning_rate": 1.9282882818893126e-05, + "loss": 0.9829, + "step": 972 + }, + { + "epoch": 0.046385240626415275, + "grad_norm": 4.29646635055542, + "learning_rate": 1.9281421441521113e-05, + "loss": 1.0196, + "step": 973 + }, + { + "epoch": 0.046432913021714775, + "grad_norm": 1.2184792757034302, + "learning_rate": 1.927995863212135e-05, + "loss": 0.5431, + "step": 974 + }, + { + "epoch": 0.046480585417014275, + "grad_norm": 2.432730197906494, + "learning_rate": 1.9278494390919538e-05, + "loss": 0.7913, + "step": 975 + }, + { + "epoch": 0.04652825781231378, + "grad_norm": 1.3183215856552124, + "learning_rate": 1.927702871814159e-05, + "loss": 0.5146, + "step": 976 + }, + { + "epoch": 0.04657593020761328, + "grad_norm": 1.6891658306121826, + "learning_rate": 1.9275561614013644e-05, + "loss": 0.9708, + "step": 977 + }, + { + "epoch": 0.04662360260291278, + "grad_norm": 1.7222672700881958, + "learning_rate": 1.9274093078762063e-05, + "loss": 0.4512, + "step": 978 + }, + { + "epoch": 0.04667127499821228, + "grad_norm": 1.4556636810302734, + "learning_rate": 1.9272623112613425e-05, + "loss": 0.8314, + "step": 979 + }, + { + "epoch": 0.04671894739351179, + "grad_norm": 1.9452040195465088, + "learning_rate": 1.927115171579453e-05, + "loss": 1.1497, + "step": 980 + }, + { + "epoch": 0.04676661978881129, + "grad_norm": 1.5137896537780762, + "learning_rate": 1.9269678888532394e-05, + "loss": 0.8747, + "step": 981 + }, + { + "epoch": 0.04681429218411079, + "grad_norm": 1.9581245183944702, + "learning_rate": 1.926820463105427e-05, + "loss": 0.5568, + "step": 982 + }, + { + "epoch": 0.04686196457941029, + "grad_norm": 1.609912395477295, + "learning_rate": 1.9266728943587615e-05, + "loss": 0.6783, + "step": 983 + }, + { + "epoch": 0.0469096369747098, + "grad_norm": 1.1952351331710815, + "learning_rate": 1.926525182636011e-05, + "loss": 0.5354, + "step": 984 + }, + { + "epoch": 0.0469573093700093, + "grad_norm": 2.4685332775115967, + "learning_rate": 1.926377327959967e-05, + "loss": 0.801, + "step": 985 + }, + { + "epoch": 0.0470049817653088, + "grad_norm": 1.0144199132919312, + "learning_rate": 1.9262293303534403e-05, + "loss": 0.3081, + "step": 986 + }, + { + "epoch": 0.0470526541606083, + 
"grad_norm": 1.8896009922027588, + "learning_rate": 1.9260811898392665e-05, + "loss": 0.8822, + "step": 987 + }, + { + "epoch": 0.047100326555907804, + "grad_norm": 1.4001494646072388, + "learning_rate": 1.925932906440302e-05, + "loss": 0.5165, + "step": 988 + }, + { + "epoch": 0.047147998951207304, + "grad_norm": 2.435675621032715, + "learning_rate": 1.9257844801794253e-05, + "loss": 0.8977, + "step": 989 + }, + { + "epoch": 0.047195671346506804, + "grad_norm": 1.3700307607650757, + "learning_rate": 1.925635911079537e-05, + "loss": 0.7306, + "step": 990 + }, + { + "epoch": 0.047243343741806304, + "grad_norm": 2.761582851409912, + "learning_rate": 1.9254871991635598e-05, + "loss": 1.0047, + "step": 991 + }, + { + "epoch": 0.04729101613710581, + "grad_norm": 1.5248297452926636, + "learning_rate": 1.9253383444544386e-05, + "loss": 0.6427, + "step": 992 + }, + { + "epoch": 0.04733868853240531, + "grad_norm": 1.0941801071166992, + "learning_rate": 1.9251893469751396e-05, + "loss": 0.3129, + "step": 993 + }, + { + "epoch": 0.04738636092770481, + "grad_norm": 1.0071507692337036, + "learning_rate": 1.9250402067486523e-05, + "loss": 0.3861, + "step": 994 + }, + { + "epoch": 0.04743403332300431, + "grad_norm": 1.5965173244476318, + "learning_rate": 1.924890923797987e-05, + "loss": 1.0344, + "step": 995 + }, + { + "epoch": 0.04748170571830382, + "grad_norm": 1.3915090560913086, + "learning_rate": 1.9247414981461768e-05, + "loss": 1.0315, + "step": 996 + }, + { + "epoch": 0.04752937811360332, + "grad_norm": 1.3044167757034302, + "learning_rate": 1.9245919298162763e-05, + "loss": 0.6425, + "step": 997 + }, + { + "epoch": 0.04757705050890282, + "grad_norm": 1.6621586084365845, + "learning_rate": 1.9244422188313624e-05, + "loss": 0.8937, + "step": 998 + }, + { + "epoch": 0.04762472290420232, + "grad_norm": 2.1778156757354736, + "learning_rate": 1.9242923652145345e-05, + "loss": 1.0258, + "step": 999 + }, + { + "epoch": 0.047672395299501825, + "grad_norm": 1.8101787567138672, + "learning_rate": 1.9241423689889126e-05, + "loss": 0.7544, + "step": 1000 + }, + { + "epoch": 0.047720067694801326, + "grad_norm": 1.024300217628479, + "learning_rate": 1.9239922301776404e-05, + "loss": 0.4062, + "step": 1001 + }, + { + "epoch": 0.047767740090100826, + "grad_norm": 1.6066406965255737, + "learning_rate": 1.923841948803882e-05, + "loss": 1.1287, + "step": 1002 + }, + { + "epoch": 0.047815412485400326, + "grad_norm": 1.7720218896865845, + "learning_rate": 1.9236915248908244e-05, + "loss": 1.1275, + "step": 1003 + }, + { + "epoch": 0.04786308488069983, + "grad_norm": 2.4469680786132812, + "learning_rate": 1.9235409584616774e-05, + "loss": 0.9508, + "step": 1004 + }, + { + "epoch": 0.04791075727599933, + "grad_norm": 2.878873825073242, + "learning_rate": 1.9233902495396707e-05, + "loss": 0.6744, + "step": 1005 + }, + { + "epoch": 0.04795842967129883, + "grad_norm": 1.3485652208328247, + "learning_rate": 1.9232393981480576e-05, + "loss": 0.531, + "step": 1006 + }, + { + "epoch": 0.04800610206659833, + "grad_norm": 1.3326756954193115, + "learning_rate": 1.923088404310113e-05, + "loss": 0.3869, + "step": 1007 + }, + { + "epoch": 0.04805377446189784, + "grad_norm": 2.376333475112915, + "learning_rate": 1.9229372680491334e-05, + "loss": 0.7226, + "step": 1008 + }, + { + "epoch": 0.04810144685719734, + "grad_norm": 1.694778323173523, + "learning_rate": 1.922785989388438e-05, + "loss": 1.0696, + "step": 1009 + }, + { + "epoch": 0.04814911925249684, + "grad_norm": 1.7380653619766235, + "learning_rate": 1.922634568351367e-05, + 
"loss": 0.3589, + "step": 1010 + }, + { + "epoch": 0.04819679164779634, + "grad_norm": 1.8209391832351685, + "learning_rate": 1.922483004961284e-05, + "loss": 0.7724, + "step": 1011 + }, + { + "epoch": 0.04824446404309585, + "grad_norm": 4.721229553222656, + "learning_rate": 1.9223312992415723e-05, + "loss": 0.5786, + "step": 1012 + }, + { + "epoch": 0.04829213643839535, + "grad_norm": 1.639879822731018, + "learning_rate": 1.9221794512156394e-05, + "loss": 0.8323, + "step": 1013 + }, + { + "epoch": 0.04833980883369485, + "grad_norm": 1.9955905675888062, + "learning_rate": 1.9220274609069143e-05, + "loss": 0.9613, + "step": 1014 + }, + { + "epoch": 0.048387481228994354, + "grad_norm": 1.4991236925125122, + "learning_rate": 1.921875328338847e-05, + "loss": 0.6176, + "step": 1015 + }, + { + "epoch": 0.048435153624293854, + "grad_norm": 1.8712117671966553, + "learning_rate": 1.9217230535349097e-05, + "loss": 0.7881, + "step": 1016 + }, + { + "epoch": 0.048482826019593354, + "grad_norm": 1.6780242919921875, + "learning_rate": 1.9215706365185973e-05, + "loss": 0.9323, + "step": 1017 + }, + { + "epoch": 0.048530498414892854, + "grad_norm": 1.497131586074829, + "learning_rate": 1.9214180773134257e-05, + "loss": 0.6449, + "step": 1018 + }, + { + "epoch": 0.04857817081019236, + "grad_norm": 1.2731022834777832, + "learning_rate": 1.921265375942934e-05, + "loss": 0.5281, + "step": 1019 + }, + { + "epoch": 0.04862584320549186, + "grad_norm": 1.69632089138031, + "learning_rate": 1.9211125324306816e-05, + "loss": 0.6828, + "step": 1020 + }, + { + "epoch": 0.04867351560079136, + "grad_norm": 2.5650973320007324, + "learning_rate": 1.9209595468002515e-05, + "loss": 0.4365, + "step": 1021 + }, + { + "epoch": 0.04872118799609086, + "grad_norm": 1.9558252096176147, + "learning_rate": 1.920806419075247e-05, + "loss": 1.0096, + "step": 1022 + }, + { + "epoch": 0.04876886039139037, + "grad_norm": 1.3596681356430054, + "learning_rate": 1.9206531492792945e-05, + "loss": 0.3994, + "step": 1023 + }, + { + "epoch": 0.04881653278668987, + "grad_norm": 0.8595488667488098, + "learning_rate": 1.9204997374360423e-05, + "loss": 0.3795, + "step": 1024 + }, + { + "epoch": 0.04886420518198937, + "grad_norm": 2.1809303760528564, + "learning_rate": 1.9203461835691596e-05, + "loss": 0.5414, + "step": 1025 + }, + { + "epoch": 0.04891187757728887, + "grad_norm": 3.2933461666107178, + "learning_rate": 1.9201924877023388e-05, + "loss": 1.125, + "step": 1026 + }, + { + "epoch": 0.048959549972588376, + "grad_norm": 1.5086666345596313, + "learning_rate": 1.9200386498592932e-05, + "loss": 0.6402, + "step": 1027 + }, + { + "epoch": 0.049007222367887876, + "grad_norm": 2.256472587585449, + "learning_rate": 1.9198846700637582e-05, + "loss": 0.8795, + "step": 1028 + }, + { + "epoch": 0.049054894763187376, + "grad_norm": 1.6830346584320068, + "learning_rate": 1.9197305483394917e-05, + "loss": 0.7627, + "step": 1029 + }, + { + "epoch": 0.049102567158486876, + "grad_norm": 6.12313175201416, + "learning_rate": 1.9195762847102732e-05, + "loss": 1.9816, + "step": 1030 + }, + { + "epoch": 0.04915023955378638, + "grad_norm": 1.7161015272140503, + "learning_rate": 1.9194218791999037e-05, + "loss": 0.9729, + "step": 1031 + }, + { + "epoch": 0.04919791194908588, + "grad_norm": 5.440549373626709, + "learning_rate": 1.9192673318322062e-05, + "loss": 0.4963, + "step": 1032 + }, + { + "epoch": 0.04924558434438538, + "grad_norm": 1.7278692722320557, + "learning_rate": 1.9191126426310264e-05, + "loss": 1.2882, + "step": 1033 + }, + { + "epoch": 
0.04929325673968488, + "grad_norm": 1.1383099555969238, + "learning_rate": 1.918957811620231e-05, + "loss": 0.4966, + "step": 1034 + }, + { + "epoch": 0.04934092913498439, + "grad_norm": 2.120032787322998, + "learning_rate": 1.9188028388237084e-05, + "loss": 0.3744, + "step": 1035 + }, + { + "epoch": 0.04938860153028389, + "grad_norm": 2.3618710041046143, + "learning_rate": 1.9186477242653693e-05, + "loss": 0.4978, + "step": 1036 + }, + { + "epoch": 0.04943627392558339, + "grad_norm": 3.8753726482391357, + "learning_rate": 1.9184924679691474e-05, + "loss": 0.1962, + "step": 1037 + }, + { + "epoch": 0.04948394632088289, + "grad_norm": 1.73308265209198, + "learning_rate": 1.9183370699589954e-05, + "loss": 0.7439, + "step": 1038 + }, + { + "epoch": 0.0495316187161824, + "grad_norm": 4.969413757324219, + "learning_rate": 1.918181530258891e-05, + "loss": 1.3968, + "step": 1039 + }, + { + "epoch": 0.0495792911114819, + "grad_norm": 1.4473387002944946, + "learning_rate": 1.918025848892832e-05, + "loss": 1.0092, + "step": 1040 + }, + { + "epoch": 0.0496269635067814, + "grad_norm": 1.799941897392273, + "learning_rate": 1.9178700258848383e-05, + "loss": 0.7403, + "step": 1041 + }, + { + "epoch": 0.0496746359020809, + "grad_norm": 1.4860204458236694, + "learning_rate": 1.9177140612589517e-05, + "loss": 0.5133, + "step": 1042 + }, + { + "epoch": 0.049722308297380405, + "grad_norm": 2.0992960929870605, + "learning_rate": 1.9175579550392362e-05, + "loss": 0.2272, + "step": 1043 + }, + { + "epoch": 0.049769980692679905, + "grad_norm": 8.47647476196289, + "learning_rate": 1.9174017072497773e-05, + "loss": 0.5295, + "step": 1044 + }, + { + "epoch": 0.049817653087979405, + "grad_norm": 1.3025389909744263, + "learning_rate": 1.9172453179146822e-05, + "loss": 0.5968, + "step": 1045 + }, + { + "epoch": 0.049865325483278905, + "grad_norm": 2.5897867679595947, + "learning_rate": 1.9170887870580806e-05, + "loss": 1.284, + "step": 1046 + }, + { + "epoch": 0.04991299787857841, + "grad_norm": 1.8203742504119873, + "learning_rate": 1.9169321147041234e-05, + "loss": 0.79, + "step": 1047 + }, + { + "epoch": 0.04996067027387791, + "grad_norm": 1.2220009565353394, + "learning_rate": 1.916775300876983e-05, + "loss": 0.7042, + "step": 1048 + }, + { + "epoch": 0.05000834266917741, + "grad_norm": 1.4266115427017212, + "learning_rate": 1.916618345600855e-05, + "loss": 0.7694, + "step": 1049 + }, + { + "epoch": 0.05005601506447691, + "grad_norm": 1.64155113697052, + "learning_rate": 1.9164612488999556e-05, + "loss": 0.8167, + "step": 1050 + }, + { + "epoch": 0.05010368745977642, + "grad_norm": 1.6256442070007324, + "learning_rate": 1.916304010798523e-05, + "loss": 0.6897, + "step": 1051 + }, + { + "epoch": 0.05015135985507592, + "grad_norm": 3.0482420921325684, + "learning_rate": 1.916146631320818e-05, + "loss": 0.7566, + "step": 1052 + }, + { + "epoch": 0.05019903225037542, + "grad_norm": 1.2535357475280762, + "learning_rate": 1.915989110491122e-05, + "loss": 0.3171, + "step": 1053 + }, + { + "epoch": 0.05024670464567492, + "grad_norm": 1.5403010845184326, + "learning_rate": 1.9158314483337394e-05, + "loss": 0.669, + "step": 1054 + }, + { + "epoch": 0.050294377040974426, + "grad_norm": 1.7651960849761963, + "learning_rate": 1.9156736448729952e-05, + "loss": 1.1459, + "step": 1055 + }, + { + "epoch": 0.05034204943627393, + "grad_norm": 2.693230152130127, + "learning_rate": 1.9155157001332374e-05, + "loss": 0.6859, + "step": 1056 + }, + { + "epoch": 0.05038972183157343, + "grad_norm": 1.4504109621047974, + "learning_rate": 
1.915357614138835e-05, + "loss": 0.7047, + "step": 1057 + }, + { + "epoch": 0.05043739422687293, + "grad_norm": 1.6796247959136963, + "learning_rate": 1.915199386914179e-05, + "loss": 0.7399, + "step": 1058 + }, + { + "epoch": 0.050485066622172434, + "grad_norm": 3.7206671237945557, + "learning_rate": 1.9150410184836826e-05, + "loss": 1.3115, + "step": 1059 + }, + { + "epoch": 0.050532739017471934, + "grad_norm": 2.770829916000366, + "learning_rate": 1.91488250887178e-05, + "loss": 1.1037, + "step": 1060 + }, + { + "epoch": 0.050580411412771434, + "grad_norm": 1.524247169494629, + "learning_rate": 1.9147238581029276e-05, + "loss": 0.752, + "step": 1061 + }, + { + "epoch": 0.050628083808070934, + "grad_norm": 2.784879684448242, + "learning_rate": 1.914565066201604e-05, + "loss": 0.5708, + "step": 1062 + }, + { + "epoch": 0.05067575620337044, + "grad_norm": 3.14414381980896, + "learning_rate": 1.9144061331923086e-05, + "loss": 0.5818, + "step": 1063 + }, + { + "epoch": 0.05072342859866994, + "grad_norm": 2.0844810009002686, + "learning_rate": 1.9142470590995636e-05, + "loss": 0.662, + "step": 1064 + }, + { + "epoch": 0.05077110099396944, + "grad_norm": 1.2482565641403198, + "learning_rate": 1.9140878439479123e-05, + "loss": 0.5516, + "step": 1065 + }, + { + "epoch": 0.05081877338926894, + "grad_norm": 1.4249321222305298, + "learning_rate": 1.9139284877619196e-05, + "loss": 0.4251, + "step": 1066 + }, + { + "epoch": 0.05086644578456845, + "grad_norm": 1.3449572324752808, + "learning_rate": 1.9137689905661733e-05, + "loss": 1.008, + "step": 1067 + }, + { + "epoch": 0.05091411817986795, + "grad_norm": 1.894120216369629, + "learning_rate": 1.9136093523852817e-05, + "loss": 0.9094, + "step": 1068 + }, + { + "epoch": 0.05096179057516745, + "grad_norm": 2.2801637649536133, + "learning_rate": 1.9134495732438755e-05, + "loss": 1.0731, + "step": 1069 + }, + { + "epoch": 0.05100946297046695, + "grad_norm": 2.6421236991882324, + "learning_rate": 1.9132896531666067e-05, + "loss": 1.0985, + "step": 1070 + }, + { + "epoch": 0.051057135365766455, + "grad_norm": 4.324747562408447, + "learning_rate": 1.9131295921781495e-05, + "loss": 0.7025, + "step": 1071 + }, + { + "epoch": 0.051104807761065955, + "grad_norm": 2.1336493492126465, + "learning_rate": 1.9129693903031995e-05, + "loss": 0.9208, + "step": 1072 + }, + { + "epoch": 0.051152480156365455, + "grad_norm": 1.2150462865829468, + "learning_rate": 1.9128090475664748e-05, + "loss": 0.8818, + "step": 1073 + }, + { + "epoch": 0.051200152551664956, + "grad_norm": 1.9461040496826172, + "learning_rate": 1.9126485639927137e-05, + "loss": 0.7112, + "step": 1074 + }, + { + "epoch": 0.05124782494696446, + "grad_norm": 1.807349681854248, + "learning_rate": 1.9124879396066778e-05, + "loss": 0.9164, + "step": 1075 + }, + { + "epoch": 0.05129549734226396, + "grad_norm": 3.4449281692504883, + "learning_rate": 1.9123271744331494e-05, + "loss": 1.3733, + "step": 1076 + }, + { + "epoch": 0.05134316973756346, + "grad_norm": 1.3120534420013428, + "learning_rate": 1.9121662684969337e-05, + "loss": 0.883, + "step": 1077 + }, + { + "epoch": 0.05139084213286296, + "grad_norm": 1.1500074863433838, + "learning_rate": 1.9120052218228558e-05, + "loss": 0.6904, + "step": 1078 + }, + { + "epoch": 0.05143851452816247, + "grad_norm": 1.375466227531433, + "learning_rate": 1.911844034435764e-05, + "loss": 0.6998, + "step": 1079 + }, + { + "epoch": 0.05148618692346197, + "grad_norm": 5.473826885223389, + "learning_rate": 1.911682706360528e-05, + "loss": 0.5018, + "step": 1080 + }, + { + 
"epoch": 0.05153385931876147, + "grad_norm": 1.468106985092163, + "learning_rate": 1.9115212376220392e-05, + "loss": 0.8559, + "step": 1081 + }, + { + "epoch": 0.05158153171406097, + "grad_norm": 7.54047155380249, + "learning_rate": 1.91135962824521e-05, + "loss": 1.0628, + "step": 1082 + }, + { + "epoch": 0.05162920410936048, + "grad_norm": 2.1193039417266846, + "learning_rate": 1.911197878254975e-05, + "loss": 0.7618, + "step": 1083 + }, + { + "epoch": 0.05167687650465998, + "grad_norm": 1.572967290878296, + "learning_rate": 1.9110359876762913e-05, + "loss": 0.8235, + "step": 1084 + }, + { + "epoch": 0.05172454889995948, + "grad_norm": 1.452972173690796, + "learning_rate": 1.9108739565341365e-05, + "loss": 0.7287, + "step": 1085 + }, + { + "epoch": 0.05177222129525898, + "grad_norm": 2.0626370906829834, + "learning_rate": 1.9107117848535105e-05, + "loss": 0.8926, + "step": 1086 + }, + { + "epoch": 0.051819893690558484, + "grad_norm": 2.210892915725708, + "learning_rate": 1.9105494726594344e-05, + "loss": 1.0116, + "step": 1087 + }, + { + "epoch": 0.051867566085857984, + "grad_norm": 2.8380939960479736, + "learning_rate": 1.910387019976952e-05, + "loss": 0.6746, + "step": 1088 + }, + { + "epoch": 0.051915238481157484, + "grad_norm": 1.2834552526474, + "learning_rate": 1.910224426831127e-05, + "loss": 0.6688, + "step": 1089 + }, + { + "epoch": 0.051962910876456984, + "grad_norm": 2.0969395637512207, + "learning_rate": 1.910061693247047e-05, + "loss": 0.7519, + "step": 1090 + }, + { + "epoch": 0.05201058327175649, + "grad_norm": 2.172556161880493, + "learning_rate": 1.909898819249819e-05, + "loss": 0.717, + "step": 1091 + }, + { + "epoch": 0.05205825566705599, + "grad_norm": 1.823603630065918, + "learning_rate": 1.9097358048645732e-05, + "loss": 0.4221, + "step": 1092 + }, + { + "epoch": 0.05210592806235549, + "grad_norm": 2.516322612762451, + "learning_rate": 1.9095726501164616e-05, + "loss": 0.4237, + "step": 1093 + }, + { + "epoch": 0.05215360045765499, + "grad_norm": 1.9163011312484741, + "learning_rate": 1.909409355030657e-05, + "loss": 1.0891, + "step": 1094 + }, + { + "epoch": 0.0522012728529545, + "grad_norm": 1.3037455081939697, + "learning_rate": 1.909245919632354e-05, + "loss": 0.5999, + "step": 1095 + }, + { + "epoch": 0.052248945248254, + "grad_norm": 1.5745213031768799, + "learning_rate": 1.9090823439467686e-05, + "loss": 0.5965, + "step": 1096 + }, + { + "epoch": 0.0522966176435535, + "grad_norm": 1.977568507194519, + "learning_rate": 1.9089186279991398e-05, + "loss": 0.7052, + "step": 1097 + }, + { + "epoch": 0.052344290038853, + "grad_norm": 2.6339094638824463, + "learning_rate": 1.908754771814726e-05, + "loss": 1.0466, + "step": 1098 + }, + { + "epoch": 0.052391962434152506, + "grad_norm": 1.6164226531982422, + "learning_rate": 1.90859077541881e-05, + "loss": 0.6806, + "step": 1099 + }, + { + "epoch": 0.052439634829452006, + "grad_norm": 1.2467166185379028, + "learning_rate": 1.9084266388366937e-05, + "loss": 0.5265, + "step": 1100 + }, + { + "epoch": 0.052487307224751506, + "grad_norm": 1.7109839916229248, + "learning_rate": 1.9082623620937023e-05, + "loss": 0.8662, + "step": 1101 + }, + { + "epoch": 0.052534979620051006, + "grad_norm": 1.3563034534454346, + "learning_rate": 1.9080979452151813e-05, + "loss": 0.9214, + "step": 1102 + }, + { + "epoch": 0.05258265201535051, + "grad_norm": 2.4167439937591553, + "learning_rate": 1.9079333882264994e-05, + "loss": 1.1707, + "step": 1103 + }, + { + "epoch": 0.05263032441065001, + "grad_norm": 1.283447265625, + "learning_rate": 
1.907768691153045e-05, + "loss": 0.7402, + "step": 1104 + }, + { + "epoch": 0.05267799680594951, + "grad_norm": 2.731553316116333, + "learning_rate": 1.90760385402023e-05, + "loss": 0.7584, + "step": 1105 + }, + { + "epoch": 0.05272566920124901, + "grad_norm": 3.888896942138672, + "learning_rate": 1.9074388768534872e-05, + "loss": 1.5162, + "step": 1106 + }, + { + "epoch": 0.05277334159654852, + "grad_norm": 1.4903335571289062, + "learning_rate": 1.9072737596782703e-05, + "loss": 0.5361, + "step": 1107 + }, + { + "epoch": 0.05282101399184802, + "grad_norm": 1.0833760499954224, + "learning_rate": 1.9071085025200555e-05, + "loss": 0.5811, + "step": 1108 + }, + { + "epoch": 0.05286868638714752, + "grad_norm": 1.0817785263061523, + "learning_rate": 1.9069431054043398e-05, + "loss": 0.5059, + "step": 1109 + }, + { + "epoch": 0.05291635878244703, + "grad_norm": 2.820789337158203, + "learning_rate": 1.9067775683566433e-05, + "loss": 0.9822, + "step": 1110 + }, + { + "epoch": 0.05296403117774653, + "grad_norm": 4.069901943206787, + "learning_rate": 1.9066118914025054e-05, + "loss": 0.8095, + "step": 1111 + }, + { + "epoch": 0.05301170357304603, + "grad_norm": 2.1332006454467773, + "learning_rate": 1.906446074567489e-05, + "loss": 0.7816, + "step": 1112 + }, + { + "epoch": 0.05305937596834553, + "grad_norm": 1.5364291667938232, + "learning_rate": 1.906280117877178e-05, + "loss": 0.7996, + "step": 1113 + }, + { + "epoch": 0.053107048363645035, + "grad_norm": 4.230134010314941, + "learning_rate": 1.9061140213571777e-05, + "loss": 0.9161, + "step": 1114 + }, + { + "epoch": 0.053154720758944535, + "grad_norm": 2.7443134784698486, + "learning_rate": 1.905947785033115e-05, + "loss": 0.7761, + "step": 1115 + }, + { + "epoch": 0.053202393154244035, + "grad_norm": 1.4390854835510254, + "learning_rate": 1.9057814089306388e-05, + "loss": 0.7214, + "step": 1116 + }, + { + "epoch": 0.053250065549543535, + "grad_norm": 3.119563341140747, + "learning_rate": 1.905614893075419e-05, + "loss": 0.236, + "step": 1117 + }, + { + "epoch": 0.05329773794484304, + "grad_norm": 3.512956380844116, + "learning_rate": 1.905448237493147e-05, + "loss": 1.1971, + "step": 1118 + }, + { + "epoch": 0.05334541034014254, + "grad_norm": 1.7571995258331299, + "learning_rate": 1.905281442209536e-05, + "loss": 0.9151, + "step": 1119 + }, + { + "epoch": 0.05339308273544204, + "grad_norm": 1.2232495546340942, + "learning_rate": 1.9051145072503216e-05, + "loss": 0.7435, + "step": 1120 + }, + { + "epoch": 0.05344075513074154, + "grad_norm": 3.606189489364624, + "learning_rate": 1.9049474326412593e-05, + "loss": 0.3272, + "step": 1121 + }, + { + "epoch": 0.05348842752604105, + "grad_norm": 0.963822603225708, + "learning_rate": 1.904780218408127e-05, + "loss": 0.5257, + "step": 1122 + }, + { + "epoch": 0.05353609992134055, + "grad_norm": 2.12300181388855, + "learning_rate": 1.9046128645767247e-05, + "loss": 0.5798, + "step": 1123 + }, + { + "epoch": 0.05358377231664005, + "grad_norm": 2.374272108078003, + "learning_rate": 1.9044453711728733e-05, + "loss": 0.3698, + "step": 1124 + }, + { + "epoch": 0.05363144471193955, + "grad_norm": 1.8397048711776733, + "learning_rate": 1.904277738222415e-05, + "loss": 0.7794, + "step": 1125 + }, + { + "epoch": 0.053679117107239056, + "grad_norm": 3.58266282081604, + "learning_rate": 1.9041099657512138e-05, + "loss": 0.7956, + "step": 1126 + }, + { + "epoch": 0.053726789502538556, + "grad_norm": 1.8295494318008423, + "learning_rate": 1.903942053785156e-05, + "loss": 0.7855, + "step": 1127 + }, + { + "epoch": 
0.053774461897838056, + "grad_norm": 1.737149953842163, + "learning_rate": 1.9037740023501473e-05, + "loss": 0.622, + "step": 1128 + }, + { + "epoch": 0.05382213429313756, + "grad_norm": 1.806960105895996, + "learning_rate": 1.9036058114721174e-05, + "loss": 0.5765, + "step": 1129 + }, + { + "epoch": 0.053869806688437064, + "grad_norm": 1.7252026796340942, + "learning_rate": 1.9034374811770163e-05, + "loss": 1.071, + "step": 1130 + }, + { + "epoch": 0.053917479083736564, + "grad_norm": 1.6096805334091187, + "learning_rate": 1.9032690114908155e-05, + "loss": 0.8907, + "step": 1131 + }, + { + "epoch": 0.053965151479036064, + "grad_norm": 2.1335132122039795, + "learning_rate": 1.903100402439508e-05, + "loss": 0.8775, + "step": 1132 + }, + { + "epoch": 0.054012823874335564, + "grad_norm": 1.6958810091018677, + "learning_rate": 1.902931654049108e-05, + "loss": 0.731, + "step": 1133 + }, + { + "epoch": 0.05406049626963507, + "grad_norm": 2.1730849742889404, + "learning_rate": 1.9027627663456528e-05, + "loss": 0.8711, + "step": 1134 + }, + { + "epoch": 0.05410816866493457, + "grad_norm": 2.0798728466033936, + "learning_rate": 1.9025937393551993e-05, + "loss": 0.8086, + "step": 1135 + }, + { + "epoch": 0.05415584106023407, + "grad_norm": 2.0575478076934814, + "learning_rate": 1.902424573103827e-05, + "loss": 0.9894, + "step": 1136 + }, + { + "epoch": 0.05420351345553357, + "grad_norm": 1.999036192893982, + "learning_rate": 1.9022552676176358e-05, + "loss": 1.1162, + "step": 1137 + }, + { + "epoch": 0.05425118585083308, + "grad_norm": 1.2748388051986694, + "learning_rate": 1.9020858229227483e-05, + "loss": 0.4934, + "step": 1138 + }, + { + "epoch": 0.05429885824613258, + "grad_norm": 2.557769298553467, + "learning_rate": 1.901916239045308e-05, + "loss": 1.2284, + "step": 1139 + }, + { + "epoch": 0.05434653064143208, + "grad_norm": 1.9223003387451172, + "learning_rate": 1.9017465160114804e-05, + "loss": 0.7192, + "step": 1140 + }, + { + "epoch": 0.05439420303673158, + "grad_norm": 1.5044459104537964, + "learning_rate": 1.901576653847451e-05, + "loss": 0.4125, + "step": 1141 + }, + { + "epoch": 0.054441875432031085, + "grad_norm": 3.2889225482940674, + "learning_rate": 1.9014066525794284e-05, + "loss": 0.6339, + "step": 1142 + }, + { + "epoch": 0.054489547827330585, + "grad_norm": 4.039918899536133, + "learning_rate": 1.9012365122336425e-05, + "loss": 1.2816, + "step": 1143 + }, + { + "epoch": 0.054537220222630085, + "grad_norm": 2.790966272354126, + "learning_rate": 1.9010662328363435e-05, + "loss": 1.1504, + "step": 1144 + }, + { + "epoch": 0.054584892617929585, + "grad_norm": 1.5610294342041016, + "learning_rate": 1.900895814413804e-05, + "loss": 0.689, + "step": 1145 + }, + { + "epoch": 0.05463256501322909, + "grad_norm": 1.8879177570343018, + "learning_rate": 1.9007252569923173e-05, + "loss": 0.8787, + "step": 1146 + }, + { + "epoch": 0.05468023740852859, + "grad_norm": 1.4097404479980469, + "learning_rate": 1.9005545605981996e-05, + "loss": 0.7708, + "step": 1147 + }, + { + "epoch": 0.05472790980382809, + "grad_norm": 1.7411558628082275, + "learning_rate": 1.900383725257787e-05, + "loss": 0.6831, + "step": 1148 + }, + { + "epoch": 0.05477558219912759, + "grad_norm": 4.240265846252441, + "learning_rate": 1.9002127509974376e-05, + "loss": 1.2961, + "step": 1149 + }, + { + "epoch": 0.0548232545944271, + "grad_norm": 2.1117870807647705, + "learning_rate": 1.9000416378435312e-05, + "loss": 0.8604, + "step": 1150 + }, + { + "epoch": 0.0548709269897266, + "grad_norm": 2.5202038288116455, + 
"learning_rate": 1.899870385822469e-05, + "loss": 0.9414, + "step": 1151 + }, + { + "epoch": 0.0549185993850261, + "grad_norm": 1.6602882146835327, + "learning_rate": 1.8996989949606724e-05, + "loss": 0.5965, + "step": 1152 + }, + { + "epoch": 0.0549662717803256, + "grad_norm": 2.404963493347168, + "learning_rate": 1.8995274652845867e-05, + "loss": 0.5455, + "step": 1153 + }, + { + "epoch": 0.05501394417562511, + "grad_norm": 2.229182004928589, + "learning_rate": 1.8993557968206763e-05, + "loss": 0.6388, + "step": 1154 + }, + { + "epoch": 0.05506161657092461, + "grad_norm": 1.6714191436767578, + "learning_rate": 1.8991839895954277e-05, + "loss": 0.9903, + "step": 1155 + }, + { + "epoch": 0.05510928896622411, + "grad_norm": 1.0693182945251465, + "learning_rate": 1.8990120436353496e-05, + "loss": 0.5146, + "step": 1156 + }, + { + "epoch": 0.05515696136152361, + "grad_norm": 2.527358055114746, + "learning_rate": 1.898839958966971e-05, + "loss": 0.4123, + "step": 1157 + }, + { + "epoch": 0.055204633756823114, + "grad_norm": 2.7824013233184814, + "learning_rate": 1.8986677356168433e-05, + "loss": 0.4801, + "step": 1158 + }, + { + "epoch": 0.055252306152122614, + "grad_norm": 1.8934613466262817, + "learning_rate": 1.8984953736115382e-05, + "loss": 0.8082, + "step": 1159 + }, + { + "epoch": 0.055299978547422114, + "grad_norm": 2.3900678157806396, + "learning_rate": 1.89832287297765e-05, + "loss": 0.496, + "step": 1160 + }, + { + "epoch": 0.055347650942721614, + "grad_norm": 12.552268981933594, + "learning_rate": 1.8981502337417933e-05, + "loss": 1.0013, + "step": 1161 + }, + { + "epoch": 0.05539532333802112, + "grad_norm": 1.6940616369247437, + "learning_rate": 1.8979774559306046e-05, + "loss": 0.9139, + "step": 1162 + }, + { + "epoch": 0.05544299573332062, + "grad_norm": 1.5934605598449707, + "learning_rate": 1.897804539570742e-05, + "loss": 0.8557, + "step": 1163 + }, + { + "epoch": 0.05549066812862012, + "grad_norm": 1.645617961883545, + "learning_rate": 1.8976314846888845e-05, + "loss": 0.5171, + "step": 1164 + }, + { + "epoch": 0.05553834052391962, + "grad_norm": 1.3355939388275146, + "learning_rate": 1.8974582913117323e-05, + "loss": 0.8613, + "step": 1165 + }, + { + "epoch": 0.05558601291921913, + "grad_norm": 2.7546401023864746, + "learning_rate": 1.897284959466008e-05, + "loss": 0.7212, + "step": 1166 + }, + { + "epoch": 0.05563368531451863, + "grad_norm": 2.5125839710235596, + "learning_rate": 1.897111489178455e-05, + "loss": 0.6046, + "step": 1167 + }, + { + "epoch": 0.05568135770981813, + "grad_norm": 3.391326665878296, + "learning_rate": 1.8969378804758375e-05, + "loss": 1.043, + "step": 1168 + }, + { + "epoch": 0.05572903010511763, + "grad_norm": 3.3391637802124023, + "learning_rate": 1.8967641333849417e-05, + "loss": 0.7025, + "step": 1169 + }, + { + "epoch": 0.055776702500417136, + "grad_norm": 1.606366753578186, + "learning_rate": 1.896590247932575e-05, + "loss": 0.6304, + "step": 1170 + }, + { + "epoch": 0.055824374895716636, + "grad_norm": 3.1777889728546143, + "learning_rate": 1.8964162241455662e-05, + "loss": 0.4038, + "step": 1171 + }, + { + "epoch": 0.055872047291016136, + "grad_norm": 1.2329596281051636, + "learning_rate": 1.896242062050765e-05, + "loss": 0.5807, + "step": 1172 + }, + { + "epoch": 0.055919719686315636, + "grad_norm": 2.3602383136749268, + "learning_rate": 1.8960677616750435e-05, + "loss": 1.1764, + "step": 1173 + }, + { + "epoch": 0.05596739208161514, + "grad_norm": 1.634207010269165, + "learning_rate": 1.8958933230452938e-05, + "loss": 0.8832, + "step": 
1174 + }, + { + "epoch": 0.05601506447691464, + "grad_norm": 1.593907117843628, + "learning_rate": 1.8957187461884308e-05, + "loss": 0.716, + "step": 1175 + }, + { + "epoch": 0.05606273687221414, + "grad_norm": 1.369978666305542, + "learning_rate": 1.895544031131389e-05, + "loss": 0.708, + "step": 1176 + }, + { + "epoch": 0.05611040926751364, + "grad_norm": 2.046199083328247, + "learning_rate": 1.8953691779011255e-05, + "loss": 0.6917, + "step": 1177 + }, + { + "epoch": 0.05615808166281315, + "grad_norm": 2.0883688926696777, + "learning_rate": 1.895194186524618e-05, + "loss": 0.9998, + "step": 1178 + }, + { + "epoch": 0.05620575405811265, + "grad_norm": 1.8095883131027222, + "learning_rate": 1.895019057028867e-05, + "loss": 0.8894, + "step": 1179 + }, + { + "epoch": 0.05625342645341215, + "grad_norm": 1.895460605621338, + "learning_rate": 1.894843789440892e-05, + "loss": 0.7409, + "step": 1180 + }, + { + "epoch": 0.05630109884871165, + "grad_norm": 1.6952366828918457, + "learning_rate": 1.8946683837877354e-05, + "loss": 0.6736, + "step": 1181 + }, + { + "epoch": 0.05634877124401116, + "grad_norm": 1.567142128944397, + "learning_rate": 1.8944928400964606e-05, + "loss": 1.0353, + "step": 1182 + }, + { + "epoch": 0.05639644363931066, + "grad_norm": 1.685753583908081, + "learning_rate": 1.894317158394152e-05, + "loss": 0.8358, + "step": 1183 + }, + { + "epoch": 0.05644411603461016, + "grad_norm": 1.324472427368164, + "learning_rate": 1.8941413387079156e-05, + "loss": 0.5607, + "step": 1184 + }, + { + "epoch": 0.05649178842990966, + "grad_norm": 5.449647426605225, + "learning_rate": 1.8939653810648785e-05, + "loss": 0.8004, + "step": 1185 + }, + { + "epoch": 0.056539460825209165, + "grad_norm": 1.8052685260772705, + "learning_rate": 1.8937892854921892e-05, + "loss": 0.9804, + "step": 1186 + }, + { + "epoch": 0.056587133220508665, + "grad_norm": 1.7985178232192993, + "learning_rate": 1.8936130520170172e-05, + "loss": 0.9373, + "step": 1187 + }, + { + "epoch": 0.056634805615808165, + "grad_norm": 2.4171130657196045, + "learning_rate": 1.893436680666554e-05, + "loss": 0.6786, + "step": 1188 + }, + { + "epoch": 0.056682478011107665, + "grad_norm": 1.7155203819274902, + "learning_rate": 1.893260171468011e-05, + "loss": 0.6025, + "step": 1189 + }, + { + "epoch": 0.05673015040640717, + "grad_norm": 1.4676045179367065, + "learning_rate": 1.8930835244486232e-05, + "loss": 0.4387, + "step": 1190 + }, + { + "epoch": 0.05677782280170667, + "grad_norm": 2.474015951156616, + "learning_rate": 1.892906739635644e-05, + "loss": 1.1813, + "step": 1191 + }, + { + "epoch": 0.05682549519700617, + "grad_norm": 1.848732829093933, + "learning_rate": 1.8927298170563503e-05, + "loss": 1.0332, + "step": 1192 + }, + { + "epoch": 0.05687316759230567, + "grad_norm": 3.2035136222839355, + "learning_rate": 1.892552756738039e-05, + "loss": 0.9814, + "step": 1193 + }, + { + "epoch": 0.05692083998760518, + "grad_norm": 3.143311023712158, + "learning_rate": 1.8923755587080288e-05, + "loss": 0.8789, + "step": 1194 + }, + { + "epoch": 0.05696851238290468, + "grad_norm": 1.4218758344650269, + "learning_rate": 1.8921982229936597e-05, + "loss": 0.7748, + "step": 1195 + }, + { + "epoch": 0.05701618477820418, + "grad_norm": 1.3497434854507446, + "learning_rate": 1.8920207496222924e-05, + "loss": 0.7783, + "step": 1196 + }, + { + "epoch": 0.05706385717350368, + "grad_norm": 2.282148838043213, + "learning_rate": 1.89184313862131e-05, + "loss": 0.8524, + "step": 1197 + }, + { + "epoch": 0.057111529568803186, + "grad_norm": 
1.9011353254318237, + "learning_rate": 1.891665390018115e-05, + "loss": 0.902, + "step": 1198 + }, + { + "epoch": 0.057159201964102686, + "grad_norm": 2.767425060272217, + "learning_rate": 1.891487503840133e-05, + "loss": 1.0341, + "step": 1199 + }, + { + "epoch": 0.057206874359402186, + "grad_norm": 1.7280144691467285, + "learning_rate": 1.8913094801148096e-05, + "loss": 0.6495, + "step": 1200 + }, + { + "epoch": 0.05725454675470169, + "grad_norm": 1.4182579517364502, + "learning_rate": 1.891131318869612e-05, + "loss": 1.0231, + "step": 1201 + }, + { + "epoch": 0.057302219150001193, + "grad_norm": 1.7430015802383423, + "learning_rate": 1.8909530201320288e-05, + "loss": 0.8205, + "step": 1202 + }, + { + "epoch": 0.057349891545300694, + "grad_norm": 1.7929915189743042, + "learning_rate": 1.89077458392957e-05, + "loss": 1.0896, + "step": 1203 + }, + { + "epoch": 0.057397563940600194, + "grad_norm": 2.15480637550354, + "learning_rate": 1.890596010289766e-05, + "loss": 0.9131, + "step": 1204 + }, + { + "epoch": 0.0574452363358997, + "grad_norm": 1.5927636623382568, + "learning_rate": 1.8904172992401685e-05, + "loss": 0.6557, + "step": 1205 + }, + { + "epoch": 0.0574929087311992, + "grad_norm": 2.2707607746124268, + "learning_rate": 1.8902384508083518e-05, + "loss": 0.9403, + "step": 1206 + }, + { + "epoch": 0.0575405811264987, + "grad_norm": 2.514889717102051, + "learning_rate": 1.8900594650219096e-05, + "loss": 0.5346, + "step": 1207 + }, + { + "epoch": 0.0575882535217982, + "grad_norm": 2.6210289001464844, + "learning_rate": 1.8898803419084578e-05, + "loss": 0.9477, + "step": 1208 + }, + { + "epoch": 0.05763592591709771, + "grad_norm": 1.4959616661071777, + "learning_rate": 1.889701081495633e-05, + "loss": 0.5462, + "step": 1209 + }, + { + "epoch": 0.05768359831239721, + "grad_norm": 4.131924152374268, + "learning_rate": 1.8895216838110938e-05, + "loss": 0.7199, + "step": 1210 + }, + { + "epoch": 0.05773127070769671, + "grad_norm": 1.0754896402359009, + "learning_rate": 1.889342148882519e-05, + "loss": 0.5139, + "step": 1211 + }, + { + "epoch": 0.05777894310299621, + "grad_norm": 3.7189688682556152, + "learning_rate": 1.889162476737609e-05, + "loss": 0.6986, + "step": 1212 + }, + { + "epoch": 0.057826615498295715, + "grad_norm": 4.594300270080566, + "learning_rate": 1.8889826674040855e-05, + "loss": 0.514, + "step": 1213 + }, + { + "epoch": 0.057874287893595215, + "grad_norm": 1.001029372215271, + "learning_rate": 1.8888027209096913e-05, + "loss": 0.2689, + "step": 1214 + }, + { + "epoch": 0.057921960288894715, + "grad_norm": 2.0562429428100586, + "learning_rate": 1.88862263728219e-05, + "loss": 0.8285, + "step": 1215 + }, + { + "epoch": 0.057969632684194215, + "grad_norm": 1.5957159996032715, + "learning_rate": 1.888442416549367e-05, + "loss": 0.6524, + "step": 1216 + }, + { + "epoch": 0.05801730507949372, + "grad_norm": 1.7283976078033447, + "learning_rate": 1.888262058739028e-05, + "loss": 0.6009, + "step": 1217 + }, + { + "epoch": 0.05806497747479322, + "grad_norm": 1.5523749589920044, + "learning_rate": 1.888081563879001e-05, + "loss": 0.7249, + "step": 1218 + }, + { + "epoch": 0.05811264987009272, + "grad_norm": 1.2984051704406738, + "learning_rate": 1.887900931997134e-05, + "loss": 0.7523, + "step": 1219 + }, + { + "epoch": 0.05816032226539222, + "grad_norm": 3.114650011062622, + "learning_rate": 1.8877201631212966e-05, + "loss": 0.354, + "step": 1220 + }, + { + "epoch": 0.05820799466069173, + "grad_norm": 1.8985135555267334, + "learning_rate": 1.88753925727938e-05, + "loss": 0.9879, 
+ "step": 1221 + }, + { + "epoch": 0.05825566705599123, + "grad_norm": 1.6196322441101074, + "learning_rate": 1.887358214499296e-05, + "loss": 0.4475, + "step": 1222 + }, + { + "epoch": 0.05830333945129073, + "grad_norm": 1.1466352939605713, + "learning_rate": 1.8871770348089774e-05, + "loss": 0.6485, + "step": 1223 + }, + { + "epoch": 0.05835101184659023, + "grad_norm": 2.748375654220581, + "learning_rate": 1.8869957182363784e-05, + "loss": 0.9876, + "step": 1224 + }, + { + "epoch": 0.05839868424188974, + "grad_norm": 2.3730061054229736, + "learning_rate": 1.8868142648094745e-05, + "loss": 1.2395, + "step": 1225 + }, + { + "epoch": 0.05844635663718924, + "grad_norm": 2.2381269931793213, + "learning_rate": 1.886632674556262e-05, + "loss": 0.7707, + "step": 1226 + }, + { + "epoch": 0.05849402903248874, + "grad_norm": 1.3274270296096802, + "learning_rate": 1.8864509475047583e-05, + "loss": 0.5131, + "step": 1227 + }, + { + "epoch": 0.05854170142778824, + "grad_norm": 2.31314754486084, + "learning_rate": 1.886269083683002e-05, + "loss": 0.9866, + "step": 1228 + }, + { + "epoch": 0.058589373823087744, + "grad_norm": 2.0299339294433594, + "learning_rate": 1.886087083119053e-05, + "loss": 0.8491, + "step": 1229 + }, + { + "epoch": 0.058637046218387244, + "grad_norm": 3.188847541809082, + "learning_rate": 1.885904945840992e-05, + "loss": 0.6063, + "step": 1230 + }, + { + "epoch": 0.058684718613686744, + "grad_norm": 1.6119741201400757, + "learning_rate": 1.885722671876921e-05, + "loss": 0.5036, + "step": 1231 + }, + { + "epoch": 0.058732391008986244, + "grad_norm": 2.852628231048584, + "learning_rate": 1.8855402612549624e-05, + "loss": 0.5136, + "step": 1232 + }, + { + "epoch": 0.05878006340428575, + "grad_norm": 1.553695559501648, + "learning_rate": 1.8853577140032614e-05, + "loss": 0.6988, + "step": 1233 + }, + { + "epoch": 0.05882773579958525, + "grad_norm": 2.5913333892822266, + "learning_rate": 1.885175030149982e-05, + "loss": 1.0885, + "step": 1234 + }, + { + "epoch": 0.05887540819488475, + "grad_norm": 1.3405808210372925, + "learning_rate": 1.8849922097233115e-05, + "loss": 0.8781, + "step": 1235 + }, + { + "epoch": 0.05892308059018425, + "grad_norm": 1.288750171661377, + "learning_rate": 1.8848092527514564e-05, + "loss": 0.7509, + "step": 1236 + }, + { + "epoch": 0.05897075298548376, + "grad_norm": 2.9218556880950928, + "learning_rate": 1.8846261592626455e-05, + "loss": 0.7463, + "step": 1237 + }, + { + "epoch": 0.05901842538078326, + "grad_norm": 1.2847224473953247, + "learning_rate": 1.8844429292851282e-05, + "loss": 0.4134, + "step": 1238 + }, + { + "epoch": 0.05906609777608276, + "grad_norm": 1.4831727743148804, + "learning_rate": 1.8842595628471746e-05, + "loss": 0.691, + "step": 1239 + }, + { + "epoch": 0.05911377017138226, + "grad_norm": 2.680004835128784, + "learning_rate": 1.884076059977077e-05, + "loss": 0.7823, + "step": 1240 + }, + { + "epoch": 0.059161442566681766, + "grad_norm": 1.819541573524475, + "learning_rate": 1.8838924207031474e-05, + "loss": 0.8122, + "step": 1241 + }, + { + "epoch": 0.059209114961981266, + "grad_norm": 6.2091593742370605, + "learning_rate": 1.8837086450537195e-05, + "loss": 0.964, + "step": 1242 + }, + { + "epoch": 0.059256787357280766, + "grad_norm": 1.8609195947647095, + "learning_rate": 1.883524733057148e-05, + "loss": 0.9592, + "step": 1243 + }, + { + "epoch": 0.059304459752580266, + "grad_norm": 3.6004459857940674, + "learning_rate": 1.8833406847418088e-05, + "loss": 1.033, + "step": 1244 + }, + { + "epoch": 0.05935213214787977, + "grad_norm": 
2.253506660461426, + "learning_rate": 1.8831565001360987e-05, + "loss": 0.3874, + "step": 1245 + }, + { + "epoch": 0.05939980454317927, + "grad_norm": 2.738685131072998, + "learning_rate": 1.8829721792684353e-05, + "loss": 0.6196, + "step": 1246 + }, + { + "epoch": 0.05944747693847877, + "grad_norm": 1.635756254196167, + "learning_rate": 1.8827877221672578e-05, + "loss": 0.7863, + "step": 1247 + }, + { + "epoch": 0.05949514933377827, + "grad_norm": 0.8421286344528198, + "learning_rate": 1.8826031288610255e-05, + "loss": 0.3319, + "step": 1248 + }, + { + "epoch": 0.05954282172907778, + "grad_norm": 1.600510597229004, + "learning_rate": 1.8824183993782193e-05, + "loss": 0.9212, + "step": 1249 + }, + { + "epoch": 0.05959049412437728, + "grad_norm": 1.8550633192062378, + "learning_rate": 1.8822335337473413e-05, + "loss": 0.6204, + "step": 1250 + }, + { + "epoch": 0.05963816651967678, + "grad_norm": 1.3988538980484009, + "learning_rate": 1.8820485319969145e-05, + "loss": 0.6403, + "step": 1251 + }, + { + "epoch": 0.05968583891497628, + "grad_norm": 2.387962818145752, + "learning_rate": 1.881863394155482e-05, + "loss": 0.7924, + "step": 1252 + }, + { + "epoch": 0.05973351131027579, + "grad_norm": 1.4238137006759644, + "learning_rate": 1.88167812025161e-05, + "loss": 0.3732, + "step": 1253 + }, + { + "epoch": 0.05978118370557529, + "grad_norm": 1.3570970296859741, + "learning_rate": 1.881492710313883e-05, + "loss": 0.8902, + "step": 1254 + }, + { + "epoch": 0.05982885610087479, + "grad_norm": 1.7247503995895386, + "learning_rate": 1.8813071643709087e-05, + "loss": 0.6211, + "step": 1255 + }, + { + "epoch": 0.05987652849617429, + "grad_norm": 2.3720498085021973, + "learning_rate": 1.8811214824513145e-05, + "loss": 1.2221, + "step": 1256 + }, + { + "epoch": 0.059924200891473794, + "grad_norm": 1.7537853717803955, + "learning_rate": 1.8809356645837495e-05, + "loss": 0.6426, + "step": 1257 + }, + { + "epoch": 0.059971873286773295, + "grad_norm": 1.4743638038635254, + "learning_rate": 1.8807497107968834e-05, + "loss": 0.5793, + "step": 1258 + }, + { + "epoch": 0.060019545682072795, + "grad_norm": 1.7508403062820435, + "learning_rate": 1.8805636211194066e-05, + "loss": 0.7718, + "step": 1259 + }, + { + "epoch": 0.060067218077372295, + "grad_norm": 11.860549926757812, + "learning_rate": 1.8803773955800313e-05, + "loss": 0.6367, + "step": 1260 + }, + { + "epoch": 0.0601148904726718, + "grad_norm": 1.9697636365890503, + "learning_rate": 1.88019103420749e-05, + "loss": 0.9679, + "step": 1261 + }, + { + "epoch": 0.0601625628679713, + "grad_norm": 5.275759220123291, + "learning_rate": 1.8800045370305365e-05, + "loss": 0.9563, + "step": 1262 + }, + { + "epoch": 0.0602102352632708, + "grad_norm": 4.440974235534668, + "learning_rate": 1.879817904077945e-05, + "loss": 0.8436, + "step": 1263 + }, + { + "epoch": 0.0602579076585703, + "grad_norm": 2.7934744358062744, + "learning_rate": 1.879631135378511e-05, + "loss": 1.5814, + "step": 1264 + }, + { + "epoch": 0.06030558005386981, + "grad_norm": 1.363739013671875, + "learning_rate": 1.8794442309610518e-05, + "loss": 0.8923, + "step": 1265 + }, + { + "epoch": 0.06035325244916931, + "grad_norm": 2.2420594692230225, + "learning_rate": 1.879257190854404e-05, + "loss": 0.8111, + "step": 1266 + }, + { + "epoch": 0.06040092484446881, + "grad_norm": 1.7087407112121582, + "learning_rate": 1.879070015087426e-05, + "loss": 0.9341, + "step": 1267 + }, + { + "epoch": 0.06044859723976831, + "grad_norm": 3.1180970668792725, + "learning_rate": 1.8788827036889978e-05, + "loss": 
0.8685, + "step": 1268 + }, + { + "epoch": 0.060496269635067816, + "grad_norm": 0.9729019403457642, + "learning_rate": 1.8786952566880192e-05, + "loss": 0.5399, + "step": 1269 + }, + { + "epoch": 0.060543942030367316, + "grad_norm": 2.52308988571167, + "learning_rate": 1.878507674113411e-05, + "loss": 1.035, + "step": 1270 + }, + { + "epoch": 0.060591614425666816, + "grad_norm": 0.9263543486595154, + "learning_rate": 1.878319955994116e-05, + "loss": 0.5863, + "step": 1271 + }, + { + "epoch": 0.060639286820966316, + "grad_norm": 1.5016515254974365, + "learning_rate": 1.8781321023590962e-05, + "loss": 0.9493, + "step": 1272 + }, + { + "epoch": 0.06068695921626582, + "grad_norm": 2.0149483680725098, + "learning_rate": 1.877944113237336e-05, + "loss": 0.8444, + "step": 1273 + }, + { + "epoch": 0.06073463161156532, + "grad_norm": 1.6895701885223389, + "learning_rate": 1.8777559886578407e-05, + "loss": 0.6487, + "step": 1274 + }, + { + "epoch": 0.060782304006864823, + "grad_norm": 1.780248761177063, + "learning_rate": 1.877567728649635e-05, + "loss": 1.1951, + "step": 1275 + }, + { + "epoch": 0.060829976402164324, + "grad_norm": 1.427687406539917, + "learning_rate": 1.8773793332417664e-05, + "loss": 0.7353, + "step": 1276 + }, + { + "epoch": 0.06087764879746383, + "grad_norm": 4.466861724853516, + "learning_rate": 1.8771908024633017e-05, + "loss": 1.0419, + "step": 1277 + }, + { + "epoch": 0.06092532119276333, + "grad_norm": 3.2163162231445312, + "learning_rate": 1.8770021363433295e-05, + "loss": 0.4114, + "step": 1278 + }, + { + "epoch": 0.06097299358806283, + "grad_norm": 1.3906358480453491, + "learning_rate": 1.876813334910959e-05, + "loss": 0.991, + "step": 1279 + }, + { + "epoch": 0.06102066598336233, + "grad_norm": 2.0710666179656982, + "learning_rate": 1.8766243981953204e-05, + "loss": 0.6592, + "step": 1280 + }, + { + "epoch": 0.06106833837866184, + "grad_norm": 2.256863832473755, + "learning_rate": 1.876435326225565e-05, + "loss": 0.7951, + "step": 1281 + }, + { + "epoch": 0.06111601077396134, + "grad_norm": 1.7044591903686523, + "learning_rate": 1.8762461190308637e-05, + "loss": 0.8216, + "step": 1282 + }, + { + "epoch": 0.06116368316926084, + "grad_norm": 3.1906909942626953, + "learning_rate": 1.8760567766404102e-05, + "loss": 1.1828, + "step": 1283 + }, + { + "epoch": 0.06121135556456034, + "grad_norm": 2.333390951156616, + "learning_rate": 1.8758672990834172e-05, + "loss": 0.7787, + "step": 1284 + }, + { + "epoch": 0.061259027959859845, + "grad_norm": 1.56510329246521, + "learning_rate": 1.87567768638912e-05, + "loss": 1.0974, + "step": 1285 + }, + { + "epoch": 0.061306700355159345, + "grad_norm": 2.0056633949279785, + "learning_rate": 1.8754879385867738e-05, + "loss": 0.6958, + "step": 1286 + }, + { + "epoch": 0.061354372750458845, + "grad_norm": 2.849614143371582, + "learning_rate": 1.875298055705654e-05, + "loss": 1.1496, + "step": 1287 + }, + { + "epoch": 0.061402045145758345, + "grad_norm": 2.2136480808258057, + "learning_rate": 1.8751080377750585e-05, + "loss": 0.7184, + "step": 1288 + }, + { + "epoch": 0.06144971754105785, + "grad_norm": 1.6875826120376587, + "learning_rate": 1.8749178848243042e-05, + "loss": 0.5162, + "step": 1289 + }, + { + "epoch": 0.06149738993635735, + "grad_norm": 8.317864418029785, + "learning_rate": 1.8747275968827304e-05, + "loss": 0.5979, + "step": 1290 + }, + { + "epoch": 0.06154506233165685, + "grad_norm": 2.540807008743286, + "learning_rate": 1.8745371739796962e-05, + "loss": 0.8541, + "step": 1291 + }, + { + "epoch": 0.06159273472695636, + 
"grad_norm": 3.1549360752105713, + "learning_rate": 1.8743466161445823e-05, + "loss": 0.5086, + "step": 1292 + }, + { + "epoch": 0.06164040712225586, + "grad_norm": 1.5895482301712036, + "learning_rate": 1.8741559234067893e-05, + "loss": 0.7999, + "step": 1293 + }, + { + "epoch": 0.06168807951755536, + "grad_norm": 1.2056721448898315, + "learning_rate": 1.8739650957957396e-05, + "loss": 0.5553, + "step": 1294 + }, + { + "epoch": 0.06173575191285486, + "grad_norm": 1.6600650548934937, + "learning_rate": 1.8737741333408757e-05, + "loss": 0.885, + "step": 1295 + }, + { + "epoch": 0.06178342430815437, + "grad_norm": 2.5915048122406006, + "learning_rate": 1.873583036071661e-05, + "loss": 0.4749, + "step": 1296 + }, + { + "epoch": 0.06183109670345387, + "grad_norm": 1.690521478652954, + "learning_rate": 1.87339180401758e-05, + "loss": 0.6927, + "step": 1297 + }, + { + "epoch": 0.06187876909875337, + "grad_norm": 1.1942768096923828, + "learning_rate": 1.873200437208138e-05, + "loss": 0.6184, + "step": 1298 + }, + { + "epoch": 0.06192644149405287, + "grad_norm": 4.745242118835449, + "learning_rate": 1.8730089356728605e-05, + "loss": 0.4487, + "step": 1299 + }, + { + "epoch": 0.061974113889352374, + "grad_norm": 2.1504414081573486, + "learning_rate": 1.8728172994412948e-05, + "loss": 0.6687, + "step": 1300 + }, + { + "epoch": 0.062021786284651874, + "grad_norm": 2.5773351192474365, + "learning_rate": 1.872625528543008e-05, + "loss": 0.8053, + "step": 1301 + }, + { + "epoch": 0.062069458679951374, + "grad_norm": 1.972769021987915, + "learning_rate": 1.8724336230075885e-05, + "loss": 0.8092, + "step": 1302 + }, + { + "epoch": 0.062117131075250874, + "grad_norm": 2.7558534145355225, + "learning_rate": 1.872241582864645e-05, + "loss": 0.6769, + "step": 1303 + }, + { + "epoch": 0.06216480347055038, + "grad_norm": 4.685482025146484, + "learning_rate": 1.872049408143808e-05, + "loss": 1.0463, + "step": 1304 + }, + { + "epoch": 0.06221247586584988, + "grad_norm": 6.796249866485596, + "learning_rate": 1.871857098874727e-05, + "loss": 0.2981, + "step": 1305 + }, + { + "epoch": 0.06226014826114938, + "grad_norm": 1.691559910774231, + "learning_rate": 1.8716646550870746e-05, + "loss": 0.738, + "step": 1306 + }, + { + "epoch": 0.06230782065644888, + "grad_norm": 2.390437364578247, + "learning_rate": 1.8714720768105425e-05, + "loss": 1.4487, + "step": 1307 + }, + { + "epoch": 0.06235549305174839, + "grad_norm": 1.676360845565796, + "learning_rate": 1.8712793640748433e-05, + "loss": 0.6378, + "step": 1308 + }, + { + "epoch": 0.06240316544704789, + "grad_norm": 1.6796358823776245, + "learning_rate": 1.8710865169097102e-05, + "loss": 0.667, + "step": 1309 + }, + { + "epoch": 0.06245083784234739, + "grad_norm": 2.0294792652130127, + "learning_rate": 1.8708935353448982e-05, + "loss": 0.7727, + "step": 1310 + }, + { + "epoch": 0.06249851023764689, + "grad_norm": 2.0255770683288574, + "learning_rate": 1.8707004194101825e-05, + "loss": 1.3701, + "step": 1311 + }, + { + "epoch": 0.0625461826329464, + "grad_norm": 1.7787353992462158, + "learning_rate": 1.8705071691353583e-05, + "loss": 0.745, + "step": 1312 + }, + { + "epoch": 0.0625938550282459, + "grad_norm": 2.5972752571105957, + "learning_rate": 1.870313784550242e-05, + "loss": 0.4273, + "step": 1313 + }, + { + "epoch": 0.0626415274235454, + "grad_norm": 1.857809066772461, + "learning_rate": 1.8701202656846717e-05, + "loss": 0.641, + "step": 1314 + }, + { + "epoch": 0.0626891998188449, + "grad_norm": 3.2368533611297607, + "learning_rate": 1.8699266125685052e-05, + 
"loss": 0.2691, + "step": 1315 + }, + { + "epoch": 0.0627368722141444, + "grad_norm": 2.417619466781616, + "learning_rate": 1.8697328252316205e-05, + "loss": 0.6198, + "step": 1316 + }, + { + "epoch": 0.0627845446094439, + "grad_norm": 4.069061279296875, + "learning_rate": 1.8695389037039172e-05, + "loss": 0.4663, + "step": 1317 + }, + { + "epoch": 0.06283221700474341, + "grad_norm": 3.2209033966064453, + "learning_rate": 1.869344848015316e-05, + "loss": 0.5283, + "step": 1318 + }, + { + "epoch": 0.06287988940004291, + "grad_norm": 6.288067817687988, + "learning_rate": 1.869150658195757e-05, + "loss": 0.2191, + "step": 1319 + }, + { + "epoch": 0.06292756179534241, + "grad_norm": 1.6023699045181274, + "learning_rate": 1.868956334275202e-05, + "loss": 1.0093, + "step": 1320 + }, + { + "epoch": 0.06297523419064191, + "grad_norm": 1.6245163679122925, + "learning_rate": 1.8687618762836334e-05, + "loss": 0.7076, + "step": 1321 + }, + { + "epoch": 0.06302290658594141, + "grad_norm": 1.7596030235290527, + "learning_rate": 1.8685672842510536e-05, + "loss": 0.475, + "step": 1322 + }, + { + "epoch": 0.06307057898124091, + "grad_norm": 1.1735817193984985, + "learning_rate": 1.8683725582074862e-05, + "loss": 0.6964, + "step": 1323 + }, + { + "epoch": 0.06311825137654041, + "grad_norm": 1.7039296627044678, + "learning_rate": 1.868177698182976e-05, + "loss": 0.5079, + "step": 1324 + }, + { + "epoch": 0.06316592377183991, + "grad_norm": 4.908930778503418, + "learning_rate": 1.867982704207587e-05, + "loss": 0.2962, + "step": 1325 + }, + { + "epoch": 0.06321359616713942, + "grad_norm": 1.541994333267212, + "learning_rate": 1.8677875763114054e-05, + "loss": 0.6977, + "step": 1326 + }, + { + "epoch": 0.06326126856243892, + "grad_norm": 2.8934872150421143, + "learning_rate": 1.8675923145245373e-05, + "loss": 0.8652, + "step": 1327 + }, + { + "epoch": 0.06330894095773842, + "grad_norm": 0.9850158095359802, + "learning_rate": 1.8673969188771094e-05, + "loss": 0.5087, + "step": 1328 + }, + { + "epoch": 0.06335661335303792, + "grad_norm": 1.6392902135849, + "learning_rate": 1.8672013893992697e-05, + "loss": 0.5989, + "step": 1329 + }, + { + "epoch": 0.06340428574833742, + "grad_norm": 3.292081117630005, + "learning_rate": 1.8670057261211857e-05, + "loss": 0.8183, + "step": 1330 + }, + { + "epoch": 0.06345195814363692, + "grad_norm": 1.6214336156845093, + "learning_rate": 1.8668099290730468e-05, + "loss": 0.9003, + "step": 1331 + }, + { + "epoch": 0.06349963053893642, + "grad_norm": 1.6833115816116333, + "learning_rate": 1.8666139982850626e-05, + "loss": 0.7086, + "step": 1332 + }, + { + "epoch": 0.06354730293423592, + "grad_norm": 1.3699870109558105, + "learning_rate": 1.8664179337874618e-05, + "loss": 0.7194, + "step": 1333 + }, + { + "epoch": 0.06359497532953544, + "grad_norm": 2.6119396686553955, + "learning_rate": 1.866221735610497e-05, + "loss": 0.5412, + "step": 1334 + }, + { + "epoch": 0.06364264772483494, + "grad_norm": 2.3003079891204834, + "learning_rate": 1.866025403784439e-05, + "loss": 0.9531, + "step": 1335 + }, + { + "epoch": 0.06369032012013444, + "grad_norm": 1.4991408586502075, + "learning_rate": 1.865828938339579e-05, + "loss": 0.4751, + "step": 1336 + }, + { + "epoch": 0.06373799251543394, + "grad_norm": 1.6355737447738647, + "learning_rate": 1.86563233930623e-05, + "loss": 0.8553, + "step": 1337 + }, + { + "epoch": 0.06378566491073344, + "grad_norm": 1.9629335403442383, + "learning_rate": 1.8654356067147258e-05, + "loss": 1.0921, + "step": 1338 + }, + { + "epoch": 0.06383333730603294, + 
"grad_norm": 1.5107303857803345, + "learning_rate": 1.8652387405954196e-05, + "loss": 0.3482, + "step": 1339 + }, + { + "epoch": 0.06388100970133244, + "grad_norm": 2.5146117210388184, + "learning_rate": 1.865041740978686e-05, + "loss": 0.6937, + "step": 1340 + }, + { + "epoch": 0.06392868209663194, + "grad_norm": 1.4010186195373535, + "learning_rate": 1.86484460789492e-05, + "loss": 0.9053, + "step": 1341 + }, + { + "epoch": 0.06397635449193145, + "grad_norm": 2.845616102218628, + "learning_rate": 1.864647341374537e-05, + "loss": 1.1672, + "step": 1342 + }, + { + "epoch": 0.06402402688723095, + "grad_norm": 4.536278247833252, + "learning_rate": 1.8644499414479735e-05, + "loss": 0.5932, + "step": 1343 + }, + { + "epoch": 0.06407169928253045, + "grad_norm": 2.0457026958465576, + "learning_rate": 1.864252408145686e-05, + "loss": 0.9775, + "step": 1344 + }, + { + "epoch": 0.06411937167782995, + "grad_norm": 1.767301082611084, + "learning_rate": 1.8640547414981523e-05, + "loss": 0.7896, + "step": 1345 + }, + { + "epoch": 0.06416704407312945, + "grad_norm": 5.789084434509277, + "learning_rate": 1.8638569415358696e-05, + "loss": 0.7714, + "step": 1346 + }, + { + "epoch": 0.06421471646842895, + "grad_norm": 1.4148178100585938, + "learning_rate": 1.863659008289357e-05, + "loss": 0.6687, + "step": 1347 + }, + { + "epoch": 0.06426238886372845, + "grad_norm": 1.5837879180908203, + "learning_rate": 1.8634609417891535e-05, + "loss": 0.7297, + "step": 1348 + }, + { + "epoch": 0.06431006125902795, + "grad_norm": 2.070500135421753, + "learning_rate": 1.8632627420658184e-05, + "loss": 0.5979, + "step": 1349 + }, + { + "epoch": 0.06435773365432747, + "grad_norm": 1.9625242948532104, + "learning_rate": 1.8630644091499322e-05, + "loss": 0.5878, + "step": 1350 + }, + { + "epoch": 0.06440540604962697, + "grad_norm": 4.076782703399658, + "learning_rate": 1.8628659430720958e-05, + "loss": 1.4679, + "step": 1351 + }, + { + "epoch": 0.06445307844492647, + "grad_norm": 1.3314266204833984, + "learning_rate": 1.86266734386293e-05, + "loss": 0.7836, + "step": 1352 + }, + { + "epoch": 0.06450075084022597, + "grad_norm": 2.396193504333496, + "learning_rate": 1.8624686115530767e-05, + "loss": 1.1108, + "step": 1353 + }, + { + "epoch": 0.06454842323552547, + "grad_norm": 1.7645889520645142, + "learning_rate": 1.8622697461731983e-05, + "loss": 1.1186, + "step": 1354 + }, + { + "epoch": 0.06459609563082497, + "grad_norm": 1.8896069526672363, + "learning_rate": 1.8620707477539776e-05, + "loss": 0.9623, + "step": 1355 + }, + { + "epoch": 0.06464376802612447, + "grad_norm": 1.6902443170547485, + "learning_rate": 1.8618716163261185e-05, + "loss": 0.5137, + "step": 1356 + }, + { + "epoch": 0.06469144042142397, + "grad_norm": 1.171393632888794, + "learning_rate": 1.8616723519203445e-05, + "loss": 0.7234, + "step": 1357 + }, + { + "epoch": 0.06473911281672348, + "grad_norm": 1.263073444366455, + "learning_rate": 1.8614729545674e-05, + "loss": 0.4881, + "step": 1358 + }, + { + "epoch": 0.06478678521202298, + "grad_norm": 2.569489002227783, + "learning_rate": 1.86127342429805e-05, + "loss": 0.9636, + "step": 1359 + }, + { + "epoch": 0.06483445760732248, + "grad_norm": 1.6446890830993652, + "learning_rate": 1.86107376114308e-05, + "loss": 0.8164, + "step": 1360 + }, + { + "epoch": 0.06488213000262198, + "grad_norm": 1.5528117418289185, + "learning_rate": 1.8608739651332965e-05, + "loss": 0.8567, + "step": 1361 + }, + { + "epoch": 0.06492980239792148, + "grad_norm": 2.500760793685913, + "learning_rate": 1.8606740362995247e-05, + 
"loss": 0.7141, + "step": 1362 + }, + { + "epoch": 0.06497747479322098, + "grad_norm": 1.1972280740737915, + "learning_rate": 1.8604739746726128e-05, + "loss": 0.7305, + "step": 1363 + }, + { + "epoch": 0.06502514718852048, + "grad_norm": 1.8277705907821655, + "learning_rate": 1.8602737802834275e-05, + "loss": 0.8047, + "step": 1364 + }, + { + "epoch": 0.06507281958381998, + "grad_norm": 2.029378652572632, + "learning_rate": 1.8600734531628573e-05, + "loss": 0.8269, + "step": 1365 + }, + { + "epoch": 0.0651204919791195, + "grad_norm": 1.831621766090393, + "learning_rate": 1.8598729933418102e-05, + "loss": 0.7176, + "step": 1366 + }, + { + "epoch": 0.065168164374419, + "grad_norm": 1.9126170873641968, + "learning_rate": 1.8596724008512153e-05, + "loss": 0.7685, + "step": 1367 + }, + { + "epoch": 0.0652158367697185, + "grad_norm": 1.3067179918289185, + "learning_rate": 1.8594716757220218e-05, + "loss": 0.4599, + "step": 1368 + }, + { + "epoch": 0.065263509165018, + "grad_norm": 1.5223742723464966, + "learning_rate": 1.8592708179851994e-05, + "loss": 0.9997, + "step": 1369 + }, + { + "epoch": 0.0653111815603175, + "grad_norm": 1.269461750984192, + "learning_rate": 1.8590698276717386e-05, + "loss": 0.8008, + "step": 1370 + }, + { + "epoch": 0.065358853955617, + "grad_norm": 1.3915373086929321, + "learning_rate": 1.8588687048126503e-05, + "loss": 0.5982, + "step": 1371 + }, + { + "epoch": 0.0654065263509165, + "grad_norm": 2.4347476959228516, + "learning_rate": 1.8586674494389653e-05, + "loss": 0.4196, + "step": 1372 + }, + { + "epoch": 0.065454198746216, + "grad_norm": 1.9680578708648682, + "learning_rate": 1.858466061581736e-05, + "loss": 0.4175, + "step": 1373 + }, + { + "epoch": 0.06550187114151551, + "grad_norm": 3.6322875022888184, + "learning_rate": 1.858264541272033e-05, + "loss": 0.655, + "step": 1374 + }, + { + "epoch": 0.06554954353681501, + "grad_norm": 1.1171382665634155, + "learning_rate": 1.8580628885409502e-05, + "loss": 0.2657, + "step": 1375 + }, + { + "epoch": 0.06559721593211451, + "grad_norm": 2.2409839630126953, + "learning_rate": 1.8578611034196e-05, + "loss": 0.9376, + "step": 1376 + }, + { + "epoch": 0.06564488832741401, + "grad_norm": 1.258958339691162, + "learning_rate": 1.8576591859391158e-05, + "loss": 0.9122, + "step": 1377 + }, + { + "epoch": 0.06569256072271351, + "grad_norm": 1.190206527709961, + "learning_rate": 1.857457136130651e-05, + "loss": 0.7015, + "step": 1378 + }, + { + "epoch": 0.06574023311801301, + "grad_norm": 1.8071290254592896, + "learning_rate": 1.857254954025381e-05, + "loss": 0.6204, + "step": 1379 + }, + { + "epoch": 0.06578790551331251, + "grad_norm": 1.597461462020874, + "learning_rate": 1.857052639654499e-05, + "loss": 0.6745, + "step": 1380 + }, + { + "epoch": 0.06583557790861201, + "grad_norm": 1.2859656810760498, + "learning_rate": 1.8568501930492204e-05, + "loss": 0.4271, + "step": 1381 + }, + { + "epoch": 0.06588325030391153, + "grad_norm": 1.1040730476379395, + "learning_rate": 1.8566476142407814e-05, + "loss": 0.1298, + "step": 1382 + }, + { + "epoch": 0.06593092269921103, + "grad_norm": 2.955308675765991, + "learning_rate": 1.856444903260437e-05, + "loss": 0.6103, + "step": 1383 + }, + { + "epoch": 0.06597859509451053, + "grad_norm": 1.4735954999923706, + "learning_rate": 1.856242060139464e-05, + "loss": 1.0844, + "step": 1384 + }, + { + "epoch": 0.06602626748981003, + "grad_norm": 2.774493932723999, + "learning_rate": 1.8560390849091585e-05, + "loss": 1.103, + "step": 1385 + }, + { + "epoch": 0.06607393988510953, + "grad_norm": 
1.628644585609436, + "learning_rate": 1.8558359776008377e-05, + "loss": 0.7914, + "step": 1386 + }, + { + "epoch": 0.06612161228040903, + "grad_norm": 1.5361404418945312, + "learning_rate": 1.855632738245839e-05, + "loss": 0.5651, + "step": 1387 + }, + { + "epoch": 0.06616928467570853, + "grad_norm": 4.0101752281188965, + "learning_rate": 1.8554293668755203e-05, + "loss": 0.7962, + "step": 1388 + }, + { + "epoch": 0.06621695707100804, + "grad_norm": 1.46977698802948, + "learning_rate": 1.855225863521259e-05, + "loss": 0.8936, + "step": 1389 + }, + { + "epoch": 0.06626462946630754, + "grad_norm": 2.5550119876861572, + "learning_rate": 1.8550222282144544e-05, + "loss": 1.2993, + "step": 1390 + }, + { + "epoch": 0.06631230186160704, + "grad_norm": 1.6742734909057617, + "learning_rate": 1.854818460986525e-05, + "loss": 0.5964, + "step": 1391 + }, + { + "epoch": 0.06635997425690654, + "grad_norm": 1.6683356761932373, + "learning_rate": 1.85461456186891e-05, + "loss": 0.4198, + "step": 1392 + }, + { + "epoch": 0.06640764665220604, + "grad_norm": 1.9733245372772217, + "learning_rate": 1.8544105308930688e-05, + "loss": 0.6145, + "step": 1393 + }, + { + "epoch": 0.06645531904750554, + "grad_norm": 1.9773991107940674, + "learning_rate": 1.8542063680904818e-05, + "loss": 1.0799, + "step": 1394 + }, + { + "epoch": 0.06650299144280504, + "grad_norm": 2.01918888092041, + "learning_rate": 1.8540020734926483e-05, + "loss": 1.0645, + "step": 1395 + }, + { + "epoch": 0.06655066383810454, + "grad_norm": 1.5858941078186035, + "learning_rate": 1.85379764713109e-05, + "loss": 1.0481, + "step": 1396 + }, + { + "epoch": 0.06659833623340405, + "grad_norm": 2.7093989849090576, + "learning_rate": 1.8535930890373467e-05, + "loss": 1.0287, + "step": 1397 + }, + { + "epoch": 0.06664600862870355, + "grad_norm": 4.241600036621094, + "learning_rate": 1.85338839924298e-05, + "loss": 0.3816, + "step": 1398 + }, + { + "epoch": 0.06669368102400305, + "grad_norm": 1.649603247642517, + "learning_rate": 1.853183577779572e-05, + "loss": 0.81, + "step": 1399 + }, + { + "epoch": 0.06674135341930255, + "grad_norm": 1.113547682762146, + "learning_rate": 1.8529786246787235e-05, + "loss": 0.6889, + "step": 1400 + }, + { + "epoch": 0.06678902581460205, + "grad_norm": 1.7195663452148438, + "learning_rate": 1.8527735399720575e-05, + "loss": 0.6214, + "step": 1401 + }, + { + "epoch": 0.06683669820990155, + "grad_norm": 2.9614198207855225, + "learning_rate": 1.852568323691216e-05, + "loss": 0.5665, + "step": 1402 + }, + { + "epoch": 0.06688437060520105, + "grad_norm": 2.033257246017456, + "learning_rate": 1.8523629758678618e-05, + "loss": 0.4025, + "step": 1403 + }, + { + "epoch": 0.06693204300050055, + "grad_norm": 1.3746601343154907, + "learning_rate": 1.8521574965336783e-05, + "loss": 0.3748, + "step": 1404 + }, + { + "epoch": 0.06697971539580007, + "grad_norm": 2.830188274383545, + "learning_rate": 1.8519518857203686e-05, + "loss": 1.1046, + "step": 1405 + }, + { + "epoch": 0.06702738779109957, + "grad_norm": 1.7468923330307007, + "learning_rate": 1.8517461434596563e-05, + "loss": 0.825, + "step": 1406 + }, + { + "epoch": 0.06707506018639907, + "grad_norm": 2.2459235191345215, + "learning_rate": 1.851540269783285e-05, + "loss": 0.9809, + "step": 1407 + }, + { + "epoch": 0.06712273258169857, + "grad_norm": 1.8404539823532104, + "learning_rate": 1.8513342647230197e-05, + "loss": 0.7006, + "step": 1408 + }, + { + "epoch": 0.06717040497699807, + "grad_norm": 2.422340154647827, + "learning_rate": 1.8511281283106442e-05, + "loss": 0.7631, + 
"step": 1409 + }, + { + "epoch": 0.06721807737229757, + "grad_norm": 1.9104775190353394, + "learning_rate": 1.850921860577964e-05, + "loss": 0.8258, + "step": 1410 + }, + { + "epoch": 0.06726574976759707, + "grad_norm": 1.5409177541732788, + "learning_rate": 1.8507154615568027e-05, + "loss": 0.7888, + "step": 1411 + }, + { + "epoch": 0.06731342216289657, + "grad_norm": 1.7304933071136475, + "learning_rate": 1.8505089312790067e-05, + "loss": 0.5328, + "step": 1412 + }, + { + "epoch": 0.06736109455819608, + "grad_norm": 1.835681676864624, + "learning_rate": 1.850302269776441e-05, + "loss": 0.8584, + "step": 1413 + }, + { + "epoch": 0.06740876695349558, + "grad_norm": 2.1688413619995117, + "learning_rate": 1.8500954770809915e-05, + "loss": 0.8466, + "step": 1414 + }, + { + "epoch": 0.06745643934879508, + "grad_norm": 2.751159429550171, + "learning_rate": 1.8498885532245643e-05, + "loss": 0.6508, + "step": 1415 + }, + { + "epoch": 0.06750411174409458, + "grad_norm": 1.539573311805725, + "learning_rate": 1.8496814982390856e-05, + "loss": 0.5461, + "step": 1416 + }, + { + "epoch": 0.06755178413939408, + "grad_norm": 1.7922183275222778, + "learning_rate": 1.8494743121565015e-05, + "loss": 0.8011, + "step": 1417 + }, + { + "epoch": 0.06759945653469358, + "grad_norm": 1.9376097917556763, + "learning_rate": 1.8492669950087792e-05, + "loss": 0.8742, + "step": 1418 + }, + { + "epoch": 0.06764712892999308, + "grad_norm": 1.8946641683578491, + "learning_rate": 1.849059546827905e-05, + "loss": 0.8625, + "step": 1419 + }, + { + "epoch": 0.06769480132529258, + "grad_norm": 1.5216120481491089, + "learning_rate": 1.8488519676458868e-05, + "loss": 0.8587, + "step": 1420 + }, + { + "epoch": 0.0677424737205921, + "grad_norm": 1.8230687379837036, + "learning_rate": 1.848644257494751e-05, + "loss": 0.4559, + "step": 1421 + }, + { + "epoch": 0.0677901461158916, + "grad_norm": 1.2247145175933838, + "learning_rate": 1.8484364164065457e-05, + "loss": 0.3531, + "step": 1422 + }, + { + "epoch": 0.0678378185111911, + "grad_norm": 4.0727434158325195, + "learning_rate": 1.8482284444133388e-05, + "loss": 1.3372, + "step": 1423 + }, + { + "epoch": 0.0678854909064906, + "grad_norm": 2.97822642326355, + "learning_rate": 1.848020341547218e-05, + "loss": 1.1882, + "step": 1424 + }, + { + "epoch": 0.0679331633017901, + "grad_norm": 1.7735543251037598, + "learning_rate": 1.8478121078402914e-05, + "loss": 0.8361, + "step": 1425 + }, + { + "epoch": 0.0679808356970896, + "grad_norm": 1.4802738428115845, + "learning_rate": 1.847603743324687e-05, + "loss": 0.8286, + "step": 1426 + }, + { + "epoch": 0.0680285080923891, + "grad_norm": 1.2952511310577393, + "learning_rate": 1.847395248032554e-05, + "loss": 0.8702, + "step": 1427 + }, + { + "epoch": 0.0680761804876886, + "grad_norm": 1.3495980501174927, + "learning_rate": 1.8471866219960604e-05, + "loss": 0.8121, + "step": 1428 + }, + { + "epoch": 0.06812385288298811, + "grad_norm": 1.3693236112594604, + "learning_rate": 1.8469778652473955e-05, + "loss": 0.8524, + "step": 1429 + }, + { + "epoch": 0.06817152527828761, + "grad_norm": 1.3611301183700562, + "learning_rate": 1.8467689778187684e-05, + "loss": 0.4362, + "step": 1430 + }, + { + "epoch": 0.06821919767358711, + "grad_norm": 1.2123875617980957, + "learning_rate": 1.8465599597424076e-05, + "loss": 0.5462, + "step": 1431 + }, + { + "epoch": 0.06826687006888661, + "grad_norm": 5.222366809844971, + "learning_rate": 1.8463508110505635e-05, + "loss": 0.7139, + "step": 1432 + }, + { + "epoch": 0.06831454246418611, + "grad_norm": 
1.6212717294692993, + "learning_rate": 1.8461415317755046e-05, + "loss": 0.8212, + "step": 1433 + }, + { + "epoch": 0.06836221485948561, + "grad_norm": 1.7302656173706055, + "learning_rate": 1.8459321219495207e-05, + "loss": 0.7832, + "step": 1434 + }, + { + "epoch": 0.06840988725478511, + "grad_norm": 2.1390864849090576, + "learning_rate": 1.845722581604922e-05, + "loss": 0.6001, + "step": 1435 + }, + { + "epoch": 0.06845755965008461, + "grad_norm": 1.9199094772338867, + "learning_rate": 1.8455129107740383e-05, + "loss": 0.9711, + "step": 1436 + }, + { + "epoch": 0.06850523204538413, + "grad_norm": 1.5917038917541504, + "learning_rate": 1.8453031094892196e-05, + "loss": 0.7969, + "step": 1437 + }, + { + "epoch": 0.06855290444068363, + "grad_norm": 3.0862739086151123, + "learning_rate": 1.845093177782836e-05, + "loss": 1.115, + "step": 1438 + }, + { + "epoch": 0.06860057683598313, + "grad_norm": 2.6472458839416504, + "learning_rate": 1.844883115687278e-05, + "loss": 0.8385, + "step": 1439 + }, + { + "epoch": 0.06864824923128263, + "grad_norm": 1.9862874746322632, + "learning_rate": 1.8446729232349557e-05, + "loss": 0.6876, + "step": 1440 + }, + { + "epoch": 0.06869592162658213, + "grad_norm": 2.886171817779541, + "learning_rate": 1.8444626004582998e-05, + "loss": 0.7345, + "step": 1441 + }, + { + "epoch": 0.06874359402188163, + "grad_norm": 1.312821388244629, + "learning_rate": 1.8442521473897606e-05, + "loss": 0.7208, + "step": 1442 + }, + { + "epoch": 0.06879126641718113, + "grad_norm": 1.1149388551712036, + "learning_rate": 1.8440415640618097e-05, + "loss": 0.684, + "step": 1443 + }, + { + "epoch": 0.06883893881248063, + "grad_norm": 1.811888575553894, + "learning_rate": 1.843830850506937e-05, + "loss": 0.6486, + "step": 1444 + }, + { + "epoch": 0.06888661120778014, + "grad_norm": 7.313960552215576, + "learning_rate": 1.843620006757654e-05, + "loss": 0.5611, + "step": 1445 + }, + { + "epoch": 0.06893428360307964, + "grad_norm": 2.800572633743286, + "learning_rate": 1.8434090328464916e-05, + "loss": 1.0487, + "step": 1446 + }, + { + "epoch": 0.06898195599837914, + "grad_norm": 0.9820416569709778, + "learning_rate": 1.843197928806001e-05, + "loss": 0.4441, + "step": 1447 + }, + { + "epoch": 0.06902962839367864, + "grad_norm": 2.877941131591797, + "learning_rate": 1.842986694668753e-05, + "loss": 0.2468, + "step": 1448 + }, + { + "epoch": 0.06907730078897814, + "grad_norm": 1.351880431175232, + "learning_rate": 1.8427753304673395e-05, + "loss": 0.8793, + "step": 1449 + }, + { + "epoch": 0.06912497318427764, + "grad_norm": 2.89223313331604, + "learning_rate": 1.842563836234371e-05, + "loss": 0.3041, + "step": 1450 + }, + { + "epoch": 0.06917264557957714, + "grad_norm": 1.8016570806503296, + "learning_rate": 1.8423522120024793e-05, + "loss": 0.9527, + "step": 1451 + }, + { + "epoch": 0.06922031797487664, + "grad_norm": 2.361917495727539, + "learning_rate": 1.842140457804316e-05, + "loss": 0.9726, + "step": 1452 + }, + { + "epoch": 0.06926799037017616, + "grad_norm": 2.007028341293335, + "learning_rate": 1.8419285736725524e-05, + "loss": 1.1446, + "step": 1453 + }, + { + "epoch": 0.06931566276547566, + "grad_norm": 1.4965935945510864, + "learning_rate": 1.8417165596398803e-05, + "loss": 0.871, + "step": 1454 + }, + { + "epoch": 0.06936333516077516, + "grad_norm": 1.5803940296173096, + "learning_rate": 1.8415044157390105e-05, + "loss": 0.8411, + "step": 1455 + }, + { + "epoch": 0.06941100755607466, + "grad_norm": 2.941607713699341, + "learning_rate": 1.8412921420026757e-05, + "loss": 0.4878, + 
"step": 1456 + }, + { + "epoch": 0.06945867995137416, + "grad_norm": 1.9873912334442139, + "learning_rate": 1.8410797384636267e-05, + "loss": 0.7835, + "step": 1457 + }, + { + "epoch": 0.06950635234667366, + "grad_norm": 1.763690710067749, + "learning_rate": 1.8408672051546355e-05, + "loss": 0.7951, + "step": 1458 + }, + { + "epoch": 0.06955402474197316, + "grad_norm": 6.737948417663574, + "learning_rate": 1.840654542108494e-05, + "loss": 0.452, + "step": 1459 + }, + { + "epoch": 0.06960169713727266, + "grad_norm": 2.328399181365967, + "learning_rate": 1.8404417493580138e-05, + "loss": 0.8578, + "step": 1460 + }, + { + "epoch": 0.06964936953257217, + "grad_norm": 1.7654873132705688, + "learning_rate": 1.840228826936026e-05, + "loss": 0.6632, + "step": 1461 + }, + { + "epoch": 0.06969704192787167, + "grad_norm": 1.4532341957092285, + "learning_rate": 1.8400157748753835e-05, + "loss": 0.4032, + "step": 1462 + }, + { + "epoch": 0.06974471432317117, + "grad_norm": 11.273175239562988, + "learning_rate": 1.839802593208957e-05, + "loss": 0.8069, + "step": 1463 + }, + { + "epoch": 0.06979238671847067, + "grad_norm": 1.4662144184112549, + "learning_rate": 1.839589281969639e-05, + "loss": 0.3238, + "step": 1464 + }, + { + "epoch": 0.06984005911377017, + "grad_norm": 2.0241737365722656, + "learning_rate": 1.8393758411903406e-05, + "loss": 0.7023, + "step": 1465 + }, + { + "epoch": 0.06988773150906967, + "grad_norm": 1.6374173164367676, + "learning_rate": 1.839162270903994e-05, + "loss": 0.6782, + "step": 1466 + }, + { + "epoch": 0.06993540390436917, + "grad_norm": 1.3327624797821045, + "learning_rate": 1.8389485711435505e-05, + "loss": 0.5533, + "step": 1467 + }, + { + "epoch": 0.06998307629966867, + "grad_norm": 2.0282294750213623, + "learning_rate": 1.8387347419419824e-05, + "loss": 0.6456, + "step": 1468 + }, + { + "epoch": 0.07003074869496818, + "grad_norm": 1.6322304010391235, + "learning_rate": 1.8385207833322805e-05, + "loss": 0.5039, + "step": 1469 + }, + { + "epoch": 0.07007842109026768, + "grad_norm": 4.069650650024414, + "learning_rate": 1.838306695347457e-05, + "loss": 0.354, + "step": 1470 + }, + { + "epoch": 0.07012609348556718, + "grad_norm": 1.6614806652069092, + "learning_rate": 1.8380924780205434e-05, + "loss": 0.6034, + "step": 1471 + }, + { + "epoch": 0.07017376588086668, + "grad_norm": 1.1433995962142944, + "learning_rate": 1.837878131384591e-05, + "loss": 0.518, + "step": 1472 + }, + { + "epoch": 0.07022143827616618, + "grad_norm": 1.8702892065048218, + "learning_rate": 1.8376636554726713e-05, + "loss": 0.6899, + "step": 1473 + }, + { + "epoch": 0.07026911067146568, + "grad_norm": 1.6741303205490112, + "learning_rate": 1.8374490503178758e-05, + "loss": 0.683, + "step": 1474 + }, + { + "epoch": 0.07031678306676518, + "grad_norm": 1.2368090152740479, + "learning_rate": 1.837234315953316e-05, + "loss": 0.9024, + "step": 1475 + }, + { + "epoch": 0.0703644554620647, + "grad_norm": 2.6631646156311035, + "learning_rate": 1.8370194524121232e-05, + "loss": 0.9153, + "step": 1476 + }, + { + "epoch": 0.0704121278573642, + "grad_norm": 2.2205801010131836, + "learning_rate": 1.8368044597274483e-05, + "loss": 0.811, + "step": 1477 + }, + { + "epoch": 0.0704598002526637, + "grad_norm": 1.5205700397491455, + "learning_rate": 1.8365893379324628e-05, + "loss": 0.7697, + "step": 1478 + }, + { + "epoch": 0.0705074726479632, + "grad_norm": 1.1955779790878296, + "learning_rate": 1.8363740870603578e-05, + "loss": 0.6193, + "step": 1479 + }, + { + "epoch": 0.0705551450432627, + "grad_norm": 
1.6952061653137207, + "learning_rate": 1.836158707144344e-05, + "loss": 0.6845, + "step": 1480 + }, + { + "epoch": 0.0706028174385622, + "grad_norm": 1.358463168144226, + "learning_rate": 1.8359431982176526e-05, + "loss": 0.7407, + "step": 1481 + }, + { + "epoch": 0.0706504898338617, + "grad_norm": 1.8397055864334106, + "learning_rate": 1.835727560313534e-05, + "loss": 0.6134, + "step": 1482 + }, + { + "epoch": 0.0706981622291612, + "grad_norm": 0.8710074424743652, + "learning_rate": 1.8355117934652593e-05, + "loss": 0.4185, + "step": 1483 + }, + { + "epoch": 0.07074583462446071, + "grad_norm": 1.4832335710525513, + "learning_rate": 1.835295897706119e-05, + "loss": 0.7756, + "step": 1484 + }, + { + "epoch": 0.07079350701976021, + "grad_norm": 4.303059101104736, + "learning_rate": 1.8350798730694234e-05, + "loss": 1.1799, + "step": 1485 + }, + { + "epoch": 0.07084117941505971, + "grad_norm": 3.442786931991577, + "learning_rate": 1.8348637195885033e-05, + "loss": 0.7305, + "step": 1486 + }, + { + "epoch": 0.07088885181035921, + "grad_norm": 1.6812269687652588, + "learning_rate": 1.8346474372967086e-05, + "loss": 0.751, + "step": 1487 + }, + { + "epoch": 0.07093652420565871, + "grad_norm": 1.54777193069458, + "learning_rate": 1.8344310262274093e-05, + "loss": 0.8818, + "step": 1488 + }, + { + "epoch": 0.07098419660095821, + "grad_norm": 2.2691755294799805, + "learning_rate": 1.8342144864139962e-05, + "loss": 0.6995, + "step": 1489 + }, + { + "epoch": 0.07103186899625771, + "grad_norm": 2.8284544944763184, + "learning_rate": 1.833997817889878e-05, + "loss": 1.4897, + "step": 1490 + }, + { + "epoch": 0.07107954139155721, + "grad_norm": 1.894149661064148, + "learning_rate": 1.8337810206884853e-05, + "loss": 0.7277, + "step": 1491 + }, + { + "epoch": 0.07112721378685673, + "grad_norm": 3.558352470397949, + "learning_rate": 1.8335640948432675e-05, + "loss": 0.19, + "step": 1492 + }, + { + "epoch": 0.07117488618215623, + "grad_norm": 1.9296996593475342, + "learning_rate": 1.8333470403876935e-05, + "loss": 0.7115, + "step": 1493 + }, + { + "epoch": 0.07122255857745573, + "grad_norm": 2.4193904399871826, + "learning_rate": 1.8331298573552534e-05, + "loss": 0.9175, + "step": 1494 + }, + { + "epoch": 0.07127023097275523, + "grad_norm": 1.8003171682357788, + "learning_rate": 1.8329125457794557e-05, + "loss": 0.6669, + "step": 1495 + }, + { + "epoch": 0.07131790336805473, + "grad_norm": 1.8310233354568481, + "learning_rate": 1.8326951056938295e-05, + "loss": 0.6499, + "step": 1496 + }, + { + "epoch": 0.07136557576335423, + "grad_norm": 1.5464767217636108, + "learning_rate": 1.832477537131924e-05, + "loss": 0.8482, + "step": 1497 + }, + { + "epoch": 0.07141324815865373, + "grad_norm": 0.9716111421585083, + "learning_rate": 1.8322598401273067e-05, + "loss": 0.2961, + "step": 1498 + }, + { + "epoch": 0.07146092055395323, + "grad_norm": 1.4455845355987549, + "learning_rate": 1.8320420147135674e-05, + "loss": 0.2451, + "step": 1499 + }, + { + "epoch": 0.07150859294925274, + "grad_norm": 1.3346799612045288, + "learning_rate": 1.831824060924313e-05, + "loss": 0.7899, + "step": 1500 + }, + { + "epoch": 0.07155626534455224, + "grad_norm": 1.3627867698669434, + "learning_rate": 1.8316059787931725e-05, + "loss": 0.8287, + "step": 1501 + }, + { + "epoch": 0.07160393773985174, + "grad_norm": 1.9677326679229736, + "learning_rate": 1.831387768353793e-05, + "loss": 1.0129, + "step": 1502 + }, + { + "epoch": 0.07165161013515124, + "grad_norm": 1.306816577911377, + "learning_rate": 1.831169429639843e-05, + "loss": 0.4651, 
+ "step": 1503 + }, + { + "epoch": 0.07169928253045074, + "grad_norm": 1.2083654403686523, + "learning_rate": 1.830950962685009e-05, + "loss": 0.6701, + "step": 1504 + }, + { + "epoch": 0.07174695492575024, + "grad_norm": 1.4216758012771606, + "learning_rate": 1.8307323675229986e-05, + "loss": 0.702, + "step": 1505 + }, + { + "epoch": 0.07179462732104974, + "grad_norm": 1.6769384145736694, + "learning_rate": 1.8305136441875388e-05, + "loss": 0.8615, + "step": 1506 + }, + { + "epoch": 0.07184229971634924, + "grad_norm": 1.4091429710388184, + "learning_rate": 1.8302947927123767e-05, + "loss": 0.6523, + "step": 1507 + }, + { + "epoch": 0.07188997211164876, + "grad_norm": 0.9538880586624146, + "learning_rate": 1.8300758131312778e-05, + "loss": 0.5335, + "step": 1508 + }, + { + "epoch": 0.07193764450694826, + "grad_norm": 1.8690299987792969, + "learning_rate": 1.8298567054780295e-05, + "loss": 0.7042, + "step": 1509 + }, + { + "epoch": 0.07198531690224776, + "grad_norm": 1.7488499879837036, + "learning_rate": 1.8296374697864376e-05, + "loss": 0.5988, + "step": 1510 + }, + { + "epoch": 0.07203298929754726, + "grad_norm": 2.2740328311920166, + "learning_rate": 1.8294181060903275e-05, + "loss": 0.7583, + "step": 1511 + }, + { + "epoch": 0.07208066169284676, + "grad_norm": 2.075404405593872, + "learning_rate": 1.829198614423545e-05, + "loss": 0.7631, + "step": 1512 + }, + { + "epoch": 0.07212833408814626, + "grad_norm": 3.8949429988861084, + "learning_rate": 1.8289789948199553e-05, + "loss": 0.3551, + "step": 1513 + }, + { + "epoch": 0.07217600648344576, + "grad_norm": 2.0617401599884033, + "learning_rate": 1.8287592473134436e-05, + "loss": 0.6646, + "step": 1514 + }, + { + "epoch": 0.07222367887874526, + "grad_norm": 2.184965133666992, + "learning_rate": 1.8285393719379146e-05, + "loss": 0.901, + "step": 1515 + }, + { + "epoch": 0.07227135127404477, + "grad_norm": 1.4092891216278076, + "learning_rate": 1.8283193687272927e-05, + "loss": 0.8031, + "step": 1516 + }, + { + "epoch": 0.07231902366934427, + "grad_norm": 2.0028107166290283, + "learning_rate": 1.8280992377155224e-05, + "loss": 0.7283, + "step": 1517 + }, + { + "epoch": 0.07236669606464377, + "grad_norm": 2.107572555541992, + "learning_rate": 1.8278789789365675e-05, + "loss": 0.9222, + "step": 1518 + }, + { + "epoch": 0.07241436845994327, + "grad_norm": 1.6521728038787842, + "learning_rate": 1.8276585924244113e-05, + "loss": 0.5648, + "step": 1519 + }, + { + "epoch": 0.07246204085524277, + "grad_norm": 0.9772564768791199, + "learning_rate": 1.827438078213058e-05, + "loss": 0.6166, + "step": 1520 + }, + { + "epoch": 0.07250971325054227, + "grad_norm": 2.127978801727295, + "learning_rate": 1.82721743633653e-05, + "loss": 1.1091, + "step": 1521 + }, + { + "epoch": 0.07255738564584177, + "grad_norm": 4.721246242523193, + "learning_rate": 1.8269966668288704e-05, + "loss": 1.0732, + "step": 1522 + }, + { + "epoch": 0.07260505804114127, + "grad_norm": 2.551135778427124, + "learning_rate": 1.8267757697241415e-05, + "loss": 1.3089, + "step": 1523 + }, + { + "epoch": 0.07265273043644078, + "grad_norm": 2.385348320007324, + "learning_rate": 1.826554745056425e-05, + "loss": 0.7595, + "step": 1524 + }, + { + "epoch": 0.07270040283174029, + "grad_norm": 1.948810338973999, + "learning_rate": 1.8263335928598237e-05, + "loss": 1.1947, + "step": 1525 + }, + { + "epoch": 0.07274807522703979, + "grad_norm": 1.3272274732589722, + "learning_rate": 1.8261123131684587e-05, + "loss": 0.8178, + "step": 1526 + }, + { + "epoch": 0.07279574762233929, + "grad_norm": 
1.742832899093628, + "learning_rate": 1.8258909060164706e-05, + "loss": 0.7982, + "step": 1527 + }, + { + "epoch": 0.07284342001763879, + "grad_norm": 1.7113311290740967, + "learning_rate": 1.8256693714380214e-05, + "loss": 0.7727, + "step": 1528 + }, + { + "epoch": 0.07289109241293829, + "grad_norm": 2.825806140899658, + "learning_rate": 1.8254477094672903e-05, + "loss": 1.2915, + "step": 1529 + }, + { + "epoch": 0.07293876480823779, + "grad_norm": 1.0234042406082153, + "learning_rate": 1.8252259201384786e-05, + "loss": 0.4247, + "step": 1530 + }, + { + "epoch": 0.07298643720353729, + "grad_norm": 1.4936145544052124, + "learning_rate": 1.825004003485805e-05, + "loss": 0.7923, + "step": 1531 + }, + { + "epoch": 0.0730341095988368, + "grad_norm": 3.9392523765563965, + "learning_rate": 1.8247819595435102e-05, + "loss": 0.4282, + "step": 1532 + }, + { + "epoch": 0.0730817819941363, + "grad_norm": 1.4250996112823486, + "learning_rate": 1.8245597883458524e-05, + "loss": 0.7803, + "step": 1533 + }, + { + "epoch": 0.0731294543894358, + "grad_norm": 1.8672455549240112, + "learning_rate": 1.8243374899271103e-05, + "loss": 0.8509, + "step": 1534 + }, + { + "epoch": 0.0731771267847353, + "grad_norm": 1.7403994798660278, + "learning_rate": 1.8241150643215828e-05, + "loss": 0.4006, + "step": 1535 + }, + { + "epoch": 0.0732247991800348, + "grad_norm": 1.4508147239685059, + "learning_rate": 1.823892511563588e-05, + "loss": 0.7365, + "step": 1536 + }, + { + "epoch": 0.0732724715753343, + "grad_norm": 1.2154483795166016, + "learning_rate": 1.8236698316874625e-05, + "loss": 0.4724, + "step": 1537 + }, + { + "epoch": 0.0733201439706338, + "grad_norm": 1.597983717918396, + "learning_rate": 1.8234470247275644e-05, + "loss": 0.9239, + "step": 1538 + }, + { + "epoch": 0.0733678163659333, + "grad_norm": 1.3959144353866577, + "learning_rate": 1.8232240907182702e-05, + "loss": 0.8261, + "step": 1539 + }, + { + "epoch": 0.07341548876123281, + "grad_norm": 1.3057143688201904, + "learning_rate": 1.8230010296939764e-05, + "loss": 0.7321, + "step": 1540 + }, + { + "epoch": 0.07346316115653231, + "grad_norm": 1.770998239517212, + "learning_rate": 1.822777841689099e-05, + "loss": 0.7913, + "step": 1541 + }, + { + "epoch": 0.07351083355183181, + "grad_norm": 7.75728178024292, + "learning_rate": 1.8225545267380736e-05, + "loss": 0.8328, + "step": 1542 + }, + { + "epoch": 0.07355850594713131, + "grad_norm": 2.049203634262085, + "learning_rate": 1.8223310848753552e-05, + "loss": 0.7342, + "step": 1543 + }, + { + "epoch": 0.07360617834243081, + "grad_norm": 3.39388108253479, + "learning_rate": 1.822107516135419e-05, + "loss": 1.4048, + "step": 1544 + }, + { + "epoch": 0.07365385073773031, + "grad_norm": 2.0998194217681885, + "learning_rate": 1.821883820552759e-05, + "loss": 0.2866, + "step": 1545 + }, + { + "epoch": 0.07370152313302981, + "grad_norm": 3.1076385974884033, + "learning_rate": 1.8216599981618895e-05, + "loss": 0.9114, + "step": 1546 + }, + { + "epoch": 0.07374919552832931, + "grad_norm": 0.9885009527206421, + "learning_rate": 1.8214360489973435e-05, + "loss": 0.6137, + "step": 1547 + }, + { + "epoch": 0.07379686792362883, + "grad_norm": 1.954210877418518, + "learning_rate": 1.8212119730936745e-05, + "loss": 0.7286, + "step": 1548 + }, + { + "epoch": 0.07384454031892833, + "grad_norm": 3.475102186203003, + "learning_rate": 1.8209877704854547e-05, + "loss": 0.7378, + "step": 1549 + }, + { + "epoch": 0.07389221271422783, + "grad_norm": 1.5269593000411987, + "learning_rate": 1.8207634412072765e-05, + "loss": 0.8207, + 
"step": 1550 + }, + { + "epoch": 0.07393988510952733, + "grad_norm": 1.8889150619506836, + "learning_rate": 1.8205389852937516e-05, + "loss": 0.7764, + "step": 1551 + }, + { + "epoch": 0.07398755750482683, + "grad_norm": 1.664652705192566, + "learning_rate": 1.820314402779511e-05, + "loss": 0.3929, + "step": 1552 + }, + { + "epoch": 0.07403522990012633, + "grad_norm": 1.946104645729065, + "learning_rate": 1.820089693699206e-05, + "loss": 1.0455, + "step": 1553 + }, + { + "epoch": 0.07408290229542583, + "grad_norm": 1.2063770294189453, + "learning_rate": 1.8198648580875063e-05, + "loss": 0.2833, + "step": 1554 + }, + { + "epoch": 0.07413057469072533, + "grad_norm": 1.4298354387283325, + "learning_rate": 1.8196398959791022e-05, + "loss": 0.7905, + "step": 1555 + }, + { + "epoch": 0.07417824708602484, + "grad_norm": 1.6317298412322998, + "learning_rate": 1.8194148074087025e-05, + "loss": 0.6171, + "step": 1556 + }, + { + "epoch": 0.07422591948132434, + "grad_norm": 1.4596104621887207, + "learning_rate": 1.8191895924110364e-05, + "loss": 0.4242, + "step": 1557 + }, + { + "epoch": 0.07427359187662384, + "grad_norm": 2.159130334854126, + "learning_rate": 1.8189642510208525e-05, + "loss": 1.1579, + "step": 1558 + }, + { + "epoch": 0.07432126427192334, + "grad_norm": 1.8697502613067627, + "learning_rate": 1.818738783272918e-05, + "loss": 0.6715, + "step": 1559 + }, + { + "epoch": 0.07436893666722284, + "grad_norm": 1.6227350234985352, + "learning_rate": 1.818513189202021e-05, + "loss": 0.8521, + "step": 1560 + }, + { + "epoch": 0.07441660906252234, + "grad_norm": 2.830514907836914, + "learning_rate": 1.8182874688429674e-05, + "loss": 1.6047, + "step": 1561 + }, + { + "epoch": 0.07446428145782184, + "grad_norm": 1.4403462409973145, + "learning_rate": 1.8180616222305847e-05, + "loss": 0.753, + "step": 1562 + }, + { + "epoch": 0.07451195385312134, + "grad_norm": 1.779158353805542, + "learning_rate": 1.817835649399718e-05, + "loss": 0.6482, + "step": 1563 + }, + { + "epoch": 0.07455962624842086, + "grad_norm": 3.1841413974761963, + "learning_rate": 1.817609550385232e-05, + "loss": 0.6392, + "step": 1564 + }, + { + "epoch": 0.07460729864372036, + "grad_norm": 2.0441768169403076, + "learning_rate": 1.817383325222013e-05, + "loss": 0.5695, + "step": 1565 + }, + { + "epoch": 0.07465497103901986, + "grad_norm": 1.3322478532791138, + "learning_rate": 1.8171569739449642e-05, + "loss": 0.6165, + "step": 1566 + }, + { + "epoch": 0.07470264343431936, + "grad_norm": 28.239721298217773, + "learning_rate": 1.8169304965890088e-05, + "loss": 0.3622, + "step": 1567 + }, + { + "epoch": 0.07475031582961886, + "grad_norm": 1.3498544692993164, + "learning_rate": 1.816703893189091e-05, + "loss": 0.8487, + "step": 1568 + }, + { + "epoch": 0.07479798822491836, + "grad_norm": 2.1444287300109863, + "learning_rate": 1.816477163780173e-05, + "loss": 1.01, + "step": 1569 + }, + { + "epoch": 0.07484566062021786, + "grad_norm": 1.8706082105636597, + "learning_rate": 1.8162503083972365e-05, + "loss": 0.5758, + "step": 1570 + }, + { + "epoch": 0.07489333301551737, + "grad_norm": 1.309287190437317, + "learning_rate": 1.816023327075283e-05, + "loss": 0.3584, + "step": 1571 + }, + { + "epoch": 0.07494100541081687, + "grad_norm": 3.1240546703338623, + "learning_rate": 1.815796219849334e-05, + "loss": 1.1831, + "step": 1572 + }, + { + "epoch": 0.07498867780611637, + "grad_norm": 2.0304906368255615, + "learning_rate": 1.815568986754429e-05, + "loss": 0.8677, + "step": 1573 + }, + { + "epoch": 0.07503635020141587, + "grad_norm": 
3.12980318069458, + "learning_rate": 1.815341627825628e-05, + "loss": 1.0142, + "step": 1574 + }, + { + "epoch": 0.07508402259671537, + "grad_norm": 1.3019670248031616, + "learning_rate": 1.8151141430980106e-05, + "loss": 1.0115, + "step": 1575 + }, + { + "epoch": 0.07513169499201487, + "grad_norm": 3.11868953704834, + "learning_rate": 1.814886532606675e-05, + "loss": 0.958, + "step": 1576 + }, + { + "epoch": 0.07517936738731437, + "grad_norm": 1.458827018737793, + "learning_rate": 1.8146587963867388e-05, + "loss": 0.935, + "step": 1577 + }, + { + "epoch": 0.07522703978261387, + "grad_norm": 1.6715525388717651, + "learning_rate": 1.8144309344733397e-05, + "loss": 0.7555, + "step": 1578 + }, + { + "epoch": 0.07527471217791339, + "grad_norm": 4.716001987457275, + "learning_rate": 1.8142029469016345e-05, + "loss": 1.0024, + "step": 1579 + }, + { + "epoch": 0.07532238457321289, + "grad_norm": 3.0512728691101074, + "learning_rate": 1.8139748337067993e-05, + "loss": 0.5921, + "step": 1580 + }, + { + "epoch": 0.07537005696851239, + "grad_norm": 1.8836324214935303, + "learning_rate": 1.8137465949240294e-05, + "loss": 0.9274, + "step": 1581 + }, + { + "epoch": 0.07541772936381189, + "grad_norm": 1.914754867553711, + "learning_rate": 1.8135182305885403e-05, + "loss": 0.579, + "step": 1582 + }, + { + "epoch": 0.07546540175911139, + "grad_norm": 1.2325947284698486, + "learning_rate": 1.8132897407355657e-05, + "loss": 0.8546, + "step": 1583 + }, + { + "epoch": 0.07551307415441089, + "grad_norm": 1.4415030479431152, + "learning_rate": 1.813061125400359e-05, + "loss": 0.5904, + "step": 1584 + }, + { + "epoch": 0.07556074654971039, + "grad_norm": 1.9776607751846313, + "learning_rate": 1.812832384618194e-05, + "loss": 0.9808, + "step": 1585 + }, + { + "epoch": 0.07560841894500989, + "grad_norm": 1.2737736701965332, + "learning_rate": 1.8126035184243623e-05, + "loss": 0.7254, + "step": 1586 + }, + { + "epoch": 0.0756560913403094, + "grad_norm": 1.6993833780288696, + "learning_rate": 1.812374526854176e-05, + "loss": 0.4322, + "step": 1587 + }, + { + "epoch": 0.0757037637356089, + "grad_norm": 1.276158094406128, + "learning_rate": 1.812145409942966e-05, + "loss": 0.772, + "step": 1588 + }, + { + "epoch": 0.0757514361309084, + "grad_norm": 2.059755325317383, + "learning_rate": 1.8119161677260827e-05, + "loss": 1.1274, + "step": 1589 + }, + { + "epoch": 0.0757991085262079, + "grad_norm": 2.593461513519287, + "learning_rate": 1.811686800238896e-05, + "loss": 0.5567, + "step": 1590 + }, + { + "epoch": 0.0758467809215074, + "grad_norm": 2.1723110675811768, + "learning_rate": 1.8114573075167947e-05, + "loss": 0.8908, + "step": 1591 + }, + { + "epoch": 0.0758944533168069, + "grad_norm": 1.1874706745147705, + "learning_rate": 1.8112276895951872e-05, + "loss": 0.144, + "step": 1592 + }, + { + "epoch": 0.0759421257121064, + "grad_norm": 2.7023580074310303, + "learning_rate": 1.8109979465095014e-05, + "loss": 0.9457, + "step": 1593 + }, + { + "epoch": 0.0759897981074059, + "grad_norm": 2.2425551414489746, + "learning_rate": 1.810768078295184e-05, + "loss": 1.0159, + "step": 1594 + }, + { + "epoch": 0.07603747050270541, + "grad_norm": 2.7102904319763184, + "learning_rate": 1.8105380849877013e-05, + "loss": 1.297, + "step": 1595 + }, + { + "epoch": 0.07608514289800491, + "grad_norm": 1.3896920680999756, + "learning_rate": 1.810307966622539e-05, + "loss": 0.9087, + "step": 1596 + }, + { + "epoch": 0.07613281529330441, + "grad_norm": 1.352295160293579, + "learning_rate": 1.8100777232352022e-05, + "loss": 0.8202, + "step": 
1597 + }, + { + "epoch": 0.07618048768860392, + "grad_norm": 1.3081729412078857, + "learning_rate": 1.8098473548612146e-05, + "loss": 0.8788, + "step": 1598 + }, + { + "epoch": 0.07622816008390342, + "grad_norm": 1.5088319778442383, + "learning_rate": 1.8096168615361203e-05, + "loss": 0.6741, + "step": 1599 + }, + { + "epoch": 0.07627583247920292, + "grad_norm": 2.373295307159424, + "learning_rate": 1.8093862432954815e-05, + "loss": 0.6484, + "step": 1600 + }, + { + "epoch": 0.07632350487450242, + "grad_norm": 2.361781120300293, + "learning_rate": 1.809155500174881e-05, + "loss": 0.8694, + "step": 1601 + }, + { + "epoch": 0.07637117726980192, + "grad_norm": 2.5788233280181885, + "learning_rate": 1.8089246322099188e-05, + "loss": 0.8709, + "step": 1602 + }, + { + "epoch": 0.07641884966510143, + "grad_norm": 2.2018938064575195, + "learning_rate": 1.8086936394362165e-05, + "loss": 0.3707, + "step": 1603 + }, + { + "epoch": 0.07646652206040093, + "grad_norm": 2.9525387287139893, + "learning_rate": 1.808462521889413e-05, + "loss": 0.8743, + "step": 1604 + }, + { + "epoch": 0.07651419445570043, + "grad_norm": 9.3198823928833, + "learning_rate": 1.8082312796051685e-05, + "loss": 0.2377, + "step": 1605 + }, + { + "epoch": 0.07656186685099993, + "grad_norm": 2.2434709072113037, + "learning_rate": 1.807999912619161e-05, + "loss": 0.6437, + "step": 1606 + }, + { + "epoch": 0.07660953924629943, + "grad_norm": 2.3313891887664795, + "learning_rate": 1.807768420967087e-05, + "loss": 0.9975, + "step": 1607 + }, + { + "epoch": 0.07665721164159893, + "grad_norm": 1.0797885656356812, + "learning_rate": 1.8075368046846647e-05, + "loss": 0.3199, + "step": 1608 + }, + { + "epoch": 0.07670488403689843, + "grad_norm": 3.6646196842193604, + "learning_rate": 1.807305063807629e-05, + "loss": 0.9589, + "step": 1609 + }, + { + "epoch": 0.07675255643219793, + "grad_norm": 1.667553424835205, + "learning_rate": 1.8070731983717357e-05, + "loss": 0.957, + "step": 1610 + }, + { + "epoch": 0.07680022882749744, + "grad_norm": 1.9350658655166626, + "learning_rate": 1.8068412084127594e-05, + "loss": 0.9101, + "step": 1611 + }, + { + "epoch": 0.07684790122279694, + "grad_norm": 2.238814115524292, + "learning_rate": 1.8066090939664934e-05, + "loss": 0.7569, + "step": 1612 + }, + { + "epoch": 0.07689557361809644, + "grad_norm": 2.1624526977539062, + "learning_rate": 1.8063768550687504e-05, + "loss": 0.8103, + "step": 1613 + }, + { + "epoch": 0.07694324601339594, + "grad_norm": 1.4392058849334717, + "learning_rate": 1.806144491755363e-05, + "loss": 0.8295, + "step": 1614 + }, + { + "epoch": 0.07699091840869544, + "grad_norm": 1.195868968963623, + "learning_rate": 1.805912004062182e-05, + "loss": 0.6247, + "step": 1615 + }, + { + "epoch": 0.07703859080399494, + "grad_norm": 1.4350346326828003, + "learning_rate": 1.8056793920250784e-05, + "loss": 1.0748, + "step": 1616 + }, + { + "epoch": 0.07708626319929444, + "grad_norm": 2.500607967376709, + "learning_rate": 1.805446655679941e-05, + "loss": 0.4902, + "step": 1617 + }, + { + "epoch": 0.07713393559459394, + "grad_norm": 1.0993422269821167, + "learning_rate": 1.8052137950626795e-05, + "loss": 0.3553, + "step": 1618 + }, + { + "epoch": 0.07718160798989346, + "grad_norm": 2.4582574367523193, + "learning_rate": 1.8049808102092213e-05, + "loss": 0.8212, + "step": 1619 + }, + { + "epoch": 0.07722928038519296, + "grad_norm": 1.8061506748199463, + "learning_rate": 1.8047477011555142e-05, + "loss": 0.9618, + "step": 1620 + }, + { + "epoch": 0.07727695278049246, + "grad_norm": 
1.089192509651184, + "learning_rate": 1.804514467937524e-05, + "loss": 0.8592, + "step": 1621 + }, + { + "epoch": 0.07732462517579196, + "grad_norm": 2.998304605484009, + "learning_rate": 1.804281110591236e-05, + "loss": 0.725, + "step": 1622 + }, + { + "epoch": 0.07737229757109146, + "grad_norm": 2.592383623123169, + "learning_rate": 1.804047629152655e-05, + "loss": 0.6466, + "step": 1623 + }, + { + "epoch": 0.07741996996639096, + "grad_norm": 1.3715007305145264, + "learning_rate": 1.8038140236578053e-05, + "loss": 0.7357, + "step": 1624 + }, + { + "epoch": 0.07746764236169046, + "grad_norm": 1.8576723337173462, + "learning_rate": 1.803580294142729e-05, + "loss": 0.8056, + "step": 1625 + }, + { + "epoch": 0.07751531475698996, + "grad_norm": 2.1937034130096436, + "learning_rate": 1.803346440643489e-05, + "loss": 0.1831, + "step": 1626 + }, + { + "epoch": 0.07756298715228947, + "grad_norm": 1.4345577955245972, + "learning_rate": 1.803112463196166e-05, + "loss": 0.9221, + "step": 1627 + }, + { + "epoch": 0.07761065954758897, + "grad_norm": 1.2779603004455566, + "learning_rate": 1.8028783618368603e-05, + "loss": 0.7634, + "step": 1628 + }, + { + "epoch": 0.07765833194288847, + "grad_norm": 3.2183146476745605, + "learning_rate": 1.8026441366016915e-05, + "loss": 0.3845, + "step": 1629 + }, + { + "epoch": 0.07770600433818797, + "grad_norm": 1.1786555051803589, + "learning_rate": 1.8024097875267982e-05, + "loss": 0.6195, + "step": 1630 + }, + { + "epoch": 0.07775367673348747, + "grad_norm": 2.7002832889556885, + "learning_rate": 1.8021753146483373e-05, + "loss": 0.7568, + "step": 1631 + }, + { + "epoch": 0.07780134912878697, + "grad_norm": 2.2595055103302, + "learning_rate": 1.8019407180024867e-05, + "loss": 0.6159, + "step": 1632 + }, + { + "epoch": 0.07784902152408647, + "grad_norm": 1.8262847661972046, + "learning_rate": 1.8017059976254415e-05, + "loss": 0.9114, + "step": 1633 + }, + { + "epoch": 0.07789669391938597, + "grad_norm": 1.3823014497756958, + "learning_rate": 1.801471153553417e-05, + "loss": 0.8128, + "step": 1634 + }, + { + "epoch": 0.07794436631468549, + "grad_norm": 1.43936026096344, + "learning_rate": 1.801236185822647e-05, + "loss": 0.6423, + "step": 1635 + }, + { + "epoch": 0.07799203870998499, + "grad_norm": 2.8125345706939697, + "learning_rate": 1.8010010944693846e-05, + "loss": 0.919, + "step": 1636 + }, + { + "epoch": 0.07803971110528449, + "grad_norm": 1.6097040176391602, + "learning_rate": 1.8007658795299023e-05, + "loss": 0.845, + "step": 1637 + }, + { + "epoch": 0.07808738350058399, + "grad_norm": 2.403620719909668, + "learning_rate": 1.800530541040491e-05, + "loss": 1.0715, + "step": 1638 + }, + { + "epoch": 0.07813505589588349, + "grad_norm": 3.5078952312469482, + "learning_rate": 1.800295079037461e-05, + "loss": 1.1827, + "step": 1639 + }, + { + "epoch": 0.07818272829118299, + "grad_norm": 1.1118353605270386, + "learning_rate": 1.8000594935571416e-05, + "loss": 0.2507, + "step": 1640 + }, + { + "epoch": 0.07823040068648249, + "grad_norm": 1.5571948289871216, + "learning_rate": 1.7998237846358812e-05, + "loss": 0.7452, + "step": 1641 + }, + { + "epoch": 0.07827807308178199, + "grad_norm": 2.5637552738189697, + "learning_rate": 1.7995879523100478e-05, + "loss": 0.828, + "step": 1642 + }, + { + "epoch": 0.0783257454770815, + "grad_norm": 2.611086368560791, + "learning_rate": 1.7993519966160276e-05, + "loss": 0.7582, + "step": 1643 + }, + { + "epoch": 0.078373417872381, + "grad_norm": 1.3323707580566406, + "learning_rate": 1.7991159175902257e-05, + "loss": 0.369, + 
"step": 1644 + }, + { + "epoch": 0.0784210902676805, + "grad_norm": 2.1701362133026123, + "learning_rate": 1.798879715269067e-05, + "loss": 1.1436, + "step": 1645 + }, + { + "epoch": 0.07846876266298, + "grad_norm": 1.1985524892807007, + "learning_rate": 1.7986433896889955e-05, + "loss": 0.9122, + "step": 1646 + }, + { + "epoch": 0.0785164350582795, + "grad_norm": 1.9176534414291382, + "learning_rate": 1.7984069408864733e-05, + "loss": 1.1291, + "step": 1647 + }, + { + "epoch": 0.078564107453579, + "grad_norm": 2.425058364868164, + "learning_rate": 1.798170368897982e-05, + "loss": 0.4954, + "step": 1648 + }, + { + "epoch": 0.0786117798488785, + "grad_norm": 2.8729782104492188, + "learning_rate": 1.7979336737600225e-05, + "loss": 0.3185, + "step": 1649 + }, + { + "epoch": 0.078659452244178, + "grad_norm": 3.787822961807251, + "learning_rate": 1.797696855509114e-05, + "loss": 0.67, + "step": 1650 + }, + { + "epoch": 0.07870712463947752, + "grad_norm": 1.6017532348632812, + "learning_rate": 1.7974599141817953e-05, + "loss": 0.4133, + "step": 1651 + }, + { + "epoch": 0.07875479703477702, + "grad_norm": 1.2655279636383057, + "learning_rate": 1.7972228498146243e-05, + "loss": 0.6146, + "step": 1652 + }, + { + "epoch": 0.07880246943007652, + "grad_norm": 1.6031553745269775, + "learning_rate": 1.7969856624441778e-05, + "loss": 0.8803, + "step": 1653 + }, + { + "epoch": 0.07885014182537602, + "grad_norm": 1.5829217433929443, + "learning_rate": 1.7967483521070502e-05, + "loss": 0.9961, + "step": 1654 + }, + { + "epoch": 0.07889781422067552, + "grad_norm": 1.976746916770935, + "learning_rate": 1.7965109188398572e-05, + "loss": 0.8139, + "step": 1655 + }, + { + "epoch": 0.07894548661597502, + "grad_norm": 1.462019920349121, + "learning_rate": 1.796273362679232e-05, + "loss": 0.4961, + "step": 1656 + }, + { + "epoch": 0.07899315901127452, + "grad_norm": 1.5143170356750488, + "learning_rate": 1.7960356836618265e-05, + "loss": 0.8146, + "step": 1657 + }, + { + "epoch": 0.07904083140657402, + "grad_norm": 6.419618606567383, + "learning_rate": 1.795797881824313e-05, + "loss": 0.2693, + "step": 1658 + }, + { + "epoch": 0.07908850380187353, + "grad_norm": 3.069215774536133, + "learning_rate": 1.7955599572033816e-05, + "loss": 0.7735, + "step": 1659 + }, + { + "epoch": 0.07913617619717303, + "grad_norm": 1.1946492195129395, + "learning_rate": 1.795321909835741e-05, + "loss": 0.5344, + "step": 1660 + }, + { + "epoch": 0.07918384859247253, + "grad_norm": 3.634439706802368, + "learning_rate": 1.79508373975812e-05, + "loss": 1.1054, + "step": 1661 + }, + { + "epoch": 0.07923152098777203, + "grad_norm": 1.0564801692962646, + "learning_rate": 1.794845447007266e-05, + "loss": 0.6648, + "step": 1662 + }, + { + "epoch": 0.07927919338307153, + "grad_norm": 1.1457446813583374, + "learning_rate": 1.7946070316199448e-05, + "loss": 0.3502, + "step": 1663 + }, + { + "epoch": 0.07932686577837103, + "grad_norm": 3.404604911804199, + "learning_rate": 1.794368493632942e-05, + "loss": 0.3236, + "step": 1664 + }, + { + "epoch": 0.07937453817367053, + "grad_norm": 1.3143208026885986, + "learning_rate": 1.79412983308306e-05, + "loss": 0.2308, + "step": 1665 + }, + { + "epoch": 0.07942221056897004, + "grad_norm": 1.3976773023605347, + "learning_rate": 1.7938910500071233e-05, + "loss": 0.872, + "step": 1666 + }, + { + "epoch": 0.07946988296426954, + "grad_norm": 9.12065601348877, + "learning_rate": 1.793652144441973e-05, + "loss": 1.4998, + "step": 1667 + }, + { + "epoch": 0.07951755535956904, + "grad_norm": 2.228480577468872, + 
"learning_rate": 1.79341311642447e-05, + "loss": 0.7514, + "step": 1668 + }, + { + "epoch": 0.07956522775486854, + "grad_norm": 1.416662573814392, + "learning_rate": 1.7931739659914936e-05, + "loss": 0.6806, + "step": 1669 + }, + { + "epoch": 0.07961290015016804, + "grad_norm": 1.256575345993042, + "learning_rate": 1.792934693179942e-05, + "loss": 0.7686, + "step": 1670 + }, + { + "epoch": 0.07966057254546755, + "grad_norm": 2.7229995727539062, + "learning_rate": 1.7926952980267335e-05, + "loss": 1.1292, + "step": 1671 + }, + { + "epoch": 0.07970824494076705, + "grad_norm": 2.3350911140441895, + "learning_rate": 1.7924557805688033e-05, + "loss": 0.9161, + "step": 1672 + }, + { + "epoch": 0.07975591733606655, + "grad_norm": 1.7685816287994385, + "learning_rate": 1.792216140843107e-05, + "loss": 0.8183, + "step": 1673 + }, + { + "epoch": 0.07980358973136606, + "grad_norm": 2.5546646118164062, + "learning_rate": 1.791976378886618e-05, + "loss": 0.378, + "step": 1674 + }, + { + "epoch": 0.07985126212666556, + "grad_norm": 1.115749716758728, + "learning_rate": 1.79173649473633e-05, + "loss": 0.5229, + "step": 1675 + }, + { + "epoch": 0.07989893452196506, + "grad_norm": 1.6336308717727661, + "learning_rate": 1.7914964884292543e-05, + "loss": 0.7181, + "step": 1676 + }, + { + "epoch": 0.07994660691726456, + "grad_norm": 1.73137629032135, + "learning_rate": 1.7912563600024212e-05, + "loss": 0.7605, + "step": 1677 + }, + { + "epoch": 0.07999427931256406, + "grad_norm": 1.929141879081726, + "learning_rate": 1.79101610949288e-05, + "loss": 0.9823, + "step": 1678 + }, + { + "epoch": 0.08004195170786356, + "grad_norm": 1.803404688835144, + "learning_rate": 1.7907757369376984e-05, + "loss": 0.9783, + "step": 1679 + }, + { + "epoch": 0.08008962410316306, + "grad_norm": 1.7051862478256226, + "learning_rate": 1.7905352423739648e-05, + "loss": 0.8257, + "step": 1680 + }, + { + "epoch": 0.08013729649846256, + "grad_norm": 2.1527657508850098, + "learning_rate": 1.790294625838784e-05, + "loss": 0.8811, + "step": 1681 + }, + { + "epoch": 0.08018496889376207, + "grad_norm": 1.4188205003738403, + "learning_rate": 1.790053887369281e-05, + "loss": 0.6163, + "step": 1682 + }, + { + "epoch": 0.08023264128906157, + "grad_norm": 1.7763410806655884, + "learning_rate": 1.7898130270025992e-05, + "loss": 0.8388, + "step": 1683 + }, + { + "epoch": 0.08028031368436107, + "grad_norm": 2.461472511291504, + "learning_rate": 1.7895720447759007e-05, + "loss": 0.6545, + "step": 1684 + }, + { + "epoch": 0.08032798607966057, + "grad_norm": 1.487349510192871, + "learning_rate": 1.7893309407263665e-05, + "loss": 0.8915, + "step": 1685 + }, + { + "epoch": 0.08037565847496007, + "grad_norm": 1.9209725856781006, + "learning_rate": 1.789089714891197e-05, + "loss": 0.8508, + "step": 1686 + }, + { + "epoch": 0.08042333087025957, + "grad_norm": 1.7194499969482422, + "learning_rate": 1.7888483673076104e-05, + "loss": 0.6629, + "step": 1687 + }, + { + "epoch": 0.08047100326555907, + "grad_norm": 1.2119414806365967, + "learning_rate": 1.7886068980128444e-05, + "loss": 0.7833, + "step": 1688 + }, + { + "epoch": 0.08051867566085857, + "grad_norm": 2.0298550128936768, + "learning_rate": 1.7883653070441548e-05, + "loss": 1.0334, + "step": 1689 + }, + { + "epoch": 0.08056634805615809, + "grad_norm": 1.4614098072052002, + "learning_rate": 1.7881235944388173e-05, + "loss": 0.5798, + "step": 1690 + }, + { + "epoch": 0.08061402045145759, + "grad_norm": 4.900351524353027, + "learning_rate": 1.7878817602341252e-05, + "loss": 0.3148, + "step": 1691 + }, + 
{ + "epoch": 0.08066169284675709, + "grad_norm": 8.526938438415527, + "learning_rate": 1.7876398044673912e-05, + "loss": 0.6285, + "step": 1692 + }, + { + "epoch": 0.08070936524205659, + "grad_norm": 2.280665397644043, + "learning_rate": 1.787397727175946e-05, + "loss": 1.1673, + "step": 1693 + }, + { + "epoch": 0.08075703763735609, + "grad_norm": 1.4818731546401978, + "learning_rate": 1.7871555283971408e-05, + "loss": 0.7537, + "step": 1694 + }, + { + "epoch": 0.08080471003265559, + "grad_norm": 2.9122393131256104, + "learning_rate": 1.786913208168343e-05, + "loss": 1.2202, + "step": 1695 + }, + { + "epoch": 0.08085238242795509, + "grad_norm": 1.7488834857940674, + "learning_rate": 1.7866707665269413e-05, + "loss": 0.6305, + "step": 1696 + }, + { + "epoch": 0.08090005482325459, + "grad_norm": 2.081796646118164, + "learning_rate": 1.7864282035103415e-05, + "loss": 0.9135, + "step": 1697 + }, + { + "epoch": 0.0809477272185541, + "grad_norm": 1.6956685781478882, + "learning_rate": 1.7861855191559682e-05, + "loss": 0.7385, + "step": 1698 + }, + { + "epoch": 0.0809953996138536, + "grad_norm": 3.47800612449646, + "learning_rate": 1.785942713501266e-05, + "loss": 0.7017, + "step": 1699 + }, + { + "epoch": 0.0810430720091531, + "grad_norm": 1.4851592779159546, + "learning_rate": 1.785699786583696e-05, + "loss": 1.1012, + "step": 1700 + }, + { + "epoch": 0.0810907444044526, + "grad_norm": 2.6097171306610107, + "learning_rate": 1.7854567384407407e-05, + "loss": 1.1458, + "step": 1701 + }, + { + "epoch": 0.0811384167997521, + "grad_norm": 2.019005537033081, + "learning_rate": 1.785213569109899e-05, + "loss": 0.6346, + "step": 1702 + }, + { + "epoch": 0.0811860891950516, + "grad_norm": 1.9929944276809692, + "learning_rate": 1.7849702786286897e-05, + "loss": 0.6616, + "step": 1703 + }, + { + "epoch": 0.0812337615903511, + "grad_norm": 3.2783918380737305, + "learning_rate": 1.78472686703465e-05, + "loss": 1.0011, + "step": 1704 + }, + { + "epoch": 0.0812814339856506, + "grad_norm": 3.1229450702667236, + "learning_rate": 1.784483334365336e-05, + "loss": 0.7122, + "step": 1705 + }, + { + "epoch": 0.08132910638095012, + "grad_norm": 1.5935554504394531, + "learning_rate": 1.784239680658322e-05, + "loss": 0.7817, + "step": 1706 + }, + { + "epoch": 0.08137677877624962, + "grad_norm": 2.66387677192688, + "learning_rate": 1.7839959059512016e-05, + "loss": 0.5153, + "step": 1707 + }, + { + "epoch": 0.08142445117154912, + "grad_norm": 1.887986660003662, + "learning_rate": 1.7837520102815862e-05, + "loss": 1.0532, + "step": 1708 + }, + { + "epoch": 0.08147212356684862, + "grad_norm": 0.958836019039154, + "learning_rate": 1.7835079936871068e-05, + "loss": 0.4794, + "step": 1709 + }, + { + "epoch": 0.08151979596214812, + "grad_norm": 4.073960781097412, + "learning_rate": 1.7832638562054126e-05, + "loss": 0.8522, + "step": 1710 + }, + { + "epoch": 0.08156746835744762, + "grad_norm": 1.1035635471343994, + "learning_rate": 1.7830195978741716e-05, + "loss": 0.5152, + "step": 1711 + }, + { + "epoch": 0.08161514075274712, + "grad_norm": 1.3051600456237793, + "learning_rate": 1.7827752187310702e-05, + "loss": 1.0563, + "step": 1712 + }, + { + "epoch": 0.08166281314804662, + "grad_norm": 1.4117364883422852, + "learning_rate": 1.7825307188138133e-05, + "loss": 0.9645, + "step": 1713 + }, + { + "epoch": 0.08171048554334613, + "grad_norm": 1.8378266096115112, + "learning_rate": 1.782286098160125e-05, + "loss": 1.0543, + "step": 1714 + }, + { + "epoch": 0.08175815793864563, + "grad_norm": 2.191606283187866, + "learning_rate": 
1.7820413568077478e-05, + "loss": 1.0085, + "step": 1715 + }, + { + "epoch": 0.08180583033394513, + "grad_norm": 1.9256677627563477, + "learning_rate": 1.7817964947944427e-05, + "loss": 0.6892, + "step": 1716 + }, + { + "epoch": 0.08185350272924463, + "grad_norm": 1.6263107061386108, + "learning_rate": 1.7815515121579897e-05, + "loss": 0.9827, + "step": 1717 + }, + { + "epoch": 0.08190117512454413, + "grad_norm": 1.3421369791030884, + "learning_rate": 1.7813064089361866e-05, + "loss": 0.7484, + "step": 1718 + }, + { + "epoch": 0.08194884751984363, + "grad_norm": 1.5854623317718506, + "learning_rate": 1.7810611851668503e-05, + "loss": 0.8738, + "step": 1719 + }, + { + "epoch": 0.08199651991514313, + "grad_norm": 2.45235013961792, + "learning_rate": 1.7808158408878167e-05, + "loss": 1.0454, + "step": 1720 + }, + { + "epoch": 0.08204419231044263, + "grad_norm": 1.7934895753860474, + "learning_rate": 1.7805703761369398e-05, + "loss": 0.7913, + "step": 1721 + }, + { + "epoch": 0.08209186470574215, + "grad_norm": 1.1314424276351929, + "learning_rate": 1.780324790952092e-05, + "loss": 0.4605, + "step": 1722 + }, + { + "epoch": 0.08213953710104165, + "grad_norm": 1.9435144662857056, + "learning_rate": 1.7800790853711646e-05, + "loss": 0.4018, + "step": 1723 + }, + { + "epoch": 0.08218720949634115, + "grad_norm": 1.6724966764450073, + "learning_rate": 1.779833259432068e-05, + "loss": 0.6563, + "step": 1724 + }, + { + "epoch": 0.08223488189164065, + "grad_norm": 3.376499891281128, + "learning_rate": 1.77958731317273e-05, + "loss": 0.6798, + "step": 1725 + }, + { + "epoch": 0.08228255428694015, + "grad_norm": 2.2213170528411865, + "learning_rate": 1.7793412466310974e-05, + "loss": 0.7773, + "step": 1726 + }, + { + "epoch": 0.08233022668223965, + "grad_norm": 1.3578178882598877, + "learning_rate": 1.779095059845137e-05, + "loss": 0.7446, + "step": 1727 + }, + { + "epoch": 0.08237789907753915, + "grad_norm": 2.146618366241455, + "learning_rate": 1.7788487528528314e-05, + "loss": 0.358, + "step": 1728 + }, + { + "epoch": 0.08242557147283865, + "grad_norm": 2.18349552154541, + "learning_rate": 1.7786023256921835e-05, + "loss": 0.995, + "step": 1729 + }, + { + "epoch": 0.08247324386813816, + "grad_norm": 2.4205596446990967, + "learning_rate": 1.7783557784012154e-05, + "loss": 0.9726, + "step": 1730 + }, + { + "epoch": 0.08252091626343766, + "grad_norm": 1.362740397453308, + "learning_rate": 1.7781091110179657e-05, + "loss": 0.8428, + "step": 1731 + }, + { + "epoch": 0.08256858865873716, + "grad_norm": 2.353976011276245, + "learning_rate": 1.7778623235804935e-05, + "loss": 1.2552, + "step": 1732 + }, + { + "epoch": 0.08261626105403666, + "grad_norm": 3.3170006275177, + "learning_rate": 1.7776154161268753e-05, + "loss": 0.545, + "step": 1733 + }, + { + "epoch": 0.08266393344933616, + "grad_norm": 3.282442092895508, + "learning_rate": 1.777368388695206e-05, + "loss": 0.4365, + "step": 1734 + }, + { + "epoch": 0.08271160584463566, + "grad_norm": 1.1182255744934082, + "learning_rate": 1.7771212413235997e-05, + "loss": 0.673, + "step": 1735 + }, + { + "epoch": 0.08275927823993516, + "grad_norm": 1.4665732383728027, + "learning_rate": 1.776873974050189e-05, + "loss": 0.8061, + "step": 1736 + }, + { + "epoch": 0.08280695063523466, + "grad_norm": 1.7098922729492188, + "learning_rate": 1.776626586913124e-05, + "loss": 1.0896, + "step": 1737 + }, + { + "epoch": 0.08285462303053417, + "grad_norm": 1.328294277191162, + "learning_rate": 1.7763790799505746e-05, + "loss": 0.6194, + "step": 1738 + }, + { + "epoch": 
0.08290229542583367, + "grad_norm": 1.435356855392456, + "learning_rate": 1.776131453200728e-05, + "loss": 0.9297, + "step": 1739 + }, + { + "epoch": 0.08294996782113317, + "grad_norm": 1.5327321290969849, + "learning_rate": 1.775883706701791e-05, + "loss": 0.7011, + "step": 1740 + }, + { + "epoch": 0.08299764021643267, + "grad_norm": 2.847810745239258, + "learning_rate": 1.775635840491988e-05, + "loss": 0.5931, + "step": 1741 + }, + { + "epoch": 0.08304531261173217, + "grad_norm": 1.4638142585754395, + "learning_rate": 1.7753878546095625e-05, + "loss": 0.9, + "step": 1742 + }, + { + "epoch": 0.08309298500703168, + "grad_norm": 1.4644008874893188, + "learning_rate": 1.7751397490927756e-05, + "loss": 0.6017, + "step": 1743 + }, + { + "epoch": 0.08314065740233118, + "grad_norm": 1.6079331636428833, + "learning_rate": 1.7748915239799083e-05, + "loss": 0.6664, + "step": 1744 + }, + { + "epoch": 0.08318832979763068, + "grad_norm": 1.2465012073516846, + "learning_rate": 1.7746431793092583e-05, + "loss": 0.6426, + "step": 1745 + }, + { + "epoch": 0.08323600219293019, + "grad_norm": 1.6928510665893555, + "learning_rate": 1.774394715119143e-05, + "loss": 0.6715, + "step": 1746 + }, + { + "epoch": 0.08328367458822969, + "grad_norm": 1.229422926902771, + "learning_rate": 1.7741461314478986e-05, + "loss": 0.5694, + "step": 1747 + }, + { + "epoch": 0.08333134698352919, + "grad_norm": 2.1797072887420654, + "learning_rate": 1.773897428333878e-05, + "loss": 0.8181, + "step": 1748 + }, + { + "epoch": 0.08337901937882869, + "grad_norm": 1.3057845830917358, + "learning_rate": 1.773648605815453e-05, + "loss": 1.0441, + "step": 1749 + }, + { + "epoch": 0.08342669177412819, + "grad_norm": 2.776489496231079, + "learning_rate": 1.7733996639310157e-05, + "loss": 0.5581, + "step": 1750 + }, + { + "epoch": 0.08347436416942769, + "grad_norm": 1.3012232780456543, + "learning_rate": 1.773150602718975e-05, + "loss": 0.6365, + "step": 1751 + }, + { + "epoch": 0.08352203656472719, + "grad_norm": 1.5112508535385132, + "learning_rate": 1.772901422217758e-05, + "loss": 0.3585, + "step": 1752 + }, + { + "epoch": 0.0835697089600267, + "grad_norm": 1.2008789777755737, + "learning_rate": 1.7726521224658106e-05, + "loss": 0.4583, + "step": 1753 + }, + { + "epoch": 0.0836173813553262, + "grad_norm": 1.3291388750076294, + "learning_rate": 1.772402703501598e-05, + "loss": 0.7314, + "step": 1754 + }, + { + "epoch": 0.0836650537506257, + "grad_norm": 1.5987262725830078, + "learning_rate": 1.772153165363602e-05, + "loss": 0.6169, + "step": 1755 + }, + { + "epoch": 0.0837127261459252, + "grad_norm": 1.0725740194320679, + "learning_rate": 1.771903508090324e-05, + "loss": 0.3322, + "step": 1756 + }, + { + "epoch": 0.0837603985412247, + "grad_norm": 2.2698404788970947, + "learning_rate": 1.7716537317202848e-05, + "loss": 0.6046, + "step": 1757 + }, + { + "epoch": 0.0838080709365242, + "grad_norm": 1.8276516199111938, + "learning_rate": 1.7714038362920205e-05, + "loss": 0.6068, + "step": 1758 + }, + { + "epoch": 0.0838557433318237, + "grad_norm": 1.4723917245864868, + "learning_rate": 1.771153821844088e-05, + "loss": 0.8854, + "step": 1759 + }, + { + "epoch": 0.0839034157271232, + "grad_norm": 1.42435622215271, + "learning_rate": 1.7709036884150627e-05, + "loss": 0.7149, + "step": 1760 + }, + { + "epoch": 0.08395108812242272, + "grad_norm": 2.8415591716766357, + "learning_rate": 1.770653436043537e-05, + "loss": 1.4141, + "step": 1761 + }, + { + "epoch": 0.08399876051772222, + "grad_norm": 1.649634599685669, + "learning_rate": 
1.770403064768122e-05, + "loss": 0.7977, + "step": 1762 + }, + { + "epoch": 0.08404643291302172, + "grad_norm": 1.6270283460617065, + "learning_rate": 1.770152574627448e-05, + "loss": 0.715, + "step": 1763 + }, + { + "epoch": 0.08409410530832122, + "grad_norm": 1.3826130628585815, + "learning_rate": 1.7699019656601624e-05, + "loss": 0.5197, + "step": 1764 + }, + { + "epoch": 0.08414177770362072, + "grad_norm": 3.902764320373535, + "learning_rate": 1.7696512379049323e-05, + "loss": 0.5795, + "step": 1765 + }, + { + "epoch": 0.08418945009892022, + "grad_norm": 3.4180245399475098, + "learning_rate": 1.7694003914004422e-05, + "loss": 0.3168, + "step": 1766 + }, + { + "epoch": 0.08423712249421972, + "grad_norm": 2.092353582382202, + "learning_rate": 1.769149426185395e-05, + "loss": 0.6151, + "step": 1767 + }, + { + "epoch": 0.08428479488951922, + "grad_norm": 1.543492317199707, + "learning_rate": 1.7688983422985116e-05, + "loss": 0.7875, + "step": 1768 + }, + { + "epoch": 0.08433246728481873, + "grad_norm": 4.8802876472473145, + "learning_rate": 1.7686471397785322e-05, + "loss": 0.6484, + "step": 1769 + }, + { + "epoch": 0.08438013968011823, + "grad_norm": 4.532622337341309, + "learning_rate": 1.768395818664215e-05, + "loss": 0.5281, + "step": 1770 + }, + { + "epoch": 0.08442781207541773, + "grad_norm": 1.6347931623458862, + "learning_rate": 1.7681443789943354e-05, + "loss": 0.9002, + "step": 1771 + }, + { + "epoch": 0.08447548447071723, + "grad_norm": 3.5466668605804443, + "learning_rate": 1.767892820807689e-05, + "loss": 0.6705, + "step": 1772 + }, + { + "epoch": 0.08452315686601673, + "grad_norm": 3.5896871089935303, + "learning_rate": 1.7676411441430877e-05, + "loss": 0.4944, + "step": 1773 + }, + { + "epoch": 0.08457082926131623, + "grad_norm": 1.654937744140625, + "learning_rate": 1.7673893490393636e-05, + "loss": 0.7959, + "step": 1774 + }, + { + "epoch": 0.08461850165661573, + "grad_norm": 1.2499451637268066, + "learning_rate": 1.767137435535365e-05, + "loss": 0.2765, + "step": 1775 + }, + { + "epoch": 0.08466617405191523, + "grad_norm": 1.8417538404464722, + "learning_rate": 1.76688540366996e-05, + "loss": 0.8872, + "step": 1776 + }, + { + "epoch": 0.08471384644721475, + "grad_norm": 1.2179772853851318, + "learning_rate": 1.766633253482035e-05, + "loss": 0.6193, + "step": 1777 + }, + { + "epoch": 0.08476151884251425, + "grad_norm": 1.9007271528244019, + "learning_rate": 1.7663809850104936e-05, + "loss": 0.7423, + "step": 1778 + }, + { + "epoch": 0.08480919123781375, + "grad_norm": 2.493910074234009, + "learning_rate": 1.7661285982942588e-05, + "loss": 0.6961, + "step": 1779 + }, + { + "epoch": 0.08485686363311325, + "grad_norm": 1.5310133695602417, + "learning_rate": 1.7658760933722702e-05, + "loss": 0.3952, + "step": 1780 + }, + { + "epoch": 0.08490453602841275, + "grad_norm": 1.3763717412948608, + "learning_rate": 1.7656234702834877e-05, + "loss": 0.6785, + "step": 1781 + }, + { + "epoch": 0.08495220842371225, + "grad_norm": 1.724355697631836, + "learning_rate": 1.7653707290668882e-05, + "loss": 1.0242, + "step": 1782 + }, + { + "epoch": 0.08499988081901175, + "grad_norm": 1.8841114044189453, + "learning_rate": 1.765117869761467e-05, + "loss": 0.626, + "step": 1783 + }, + { + "epoch": 0.08504755321431125, + "grad_norm": 1.0602959394454956, + "learning_rate": 1.7648648924062378e-05, + "loss": 0.5121, + "step": 1784 + }, + { + "epoch": 0.08509522560961076, + "grad_norm": 3.4438045024871826, + "learning_rate": 1.764611797040232e-05, + "loss": 1.0139, + "step": 1785 + }, + { + "epoch": 
0.08514289800491026, + "grad_norm": 1.3330975770950317, + "learning_rate": 1.7643585837025e-05, + "loss": 0.6389, + "step": 1786 + }, + { + "epoch": 0.08519057040020976, + "grad_norm": 1.560445785522461, + "learning_rate": 1.76410525243211e-05, + "loss": 0.8744, + "step": 1787 + }, + { + "epoch": 0.08523824279550926, + "grad_norm": 19.96894645690918, + "learning_rate": 1.7638518032681482e-05, + "loss": 0.8698, + "step": 1788 + }, + { + "epoch": 0.08528591519080876, + "grad_norm": 1.475574016571045, + "learning_rate": 1.7635982362497195e-05, + "loss": 0.8132, + "step": 1789 + }, + { + "epoch": 0.08533358758610826, + "grad_norm": 1.1431615352630615, + "learning_rate": 1.763344551415946e-05, + "loss": 0.872, + "step": 1790 + }, + { + "epoch": 0.08538125998140776, + "grad_norm": 2.552554130554199, + "learning_rate": 1.76309074880597e-05, + "loss": 0.4446, + "step": 1791 + }, + { + "epoch": 0.08542893237670726, + "grad_norm": 3.6265108585357666, + "learning_rate": 1.762836828458949e-05, + "loss": 1.2209, + "step": 1792 + }, + { + "epoch": 0.08547660477200678, + "grad_norm": 1.0884145498275757, + "learning_rate": 1.762582790414061e-05, + "loss": 0.639, + "step": 1793 + }, + { + "epoch": 0.08552427716730628, + "grad_norm": 1.180105209350586, + "learning_rate": 1.762328634710502e-05, + "loss": 0.5889, + "step": 1794 + }, + { + "epoch": 0.08557194956260578, + "grad_norm": 1.4643186330795288, + "learning_rate": 1.762074361387485e-05, + "loss": 0.5076, + "step": 1795 + }, + { + "epoch": 0.08561962195790528, + "grad_norm": 2.3704681396484375, + "learning_rate": 1.761819970484242e-05, + "loss": 0.9002, + "step": 1796 + }, + { + "epoch": 0.08566729435320478, + "grad_norm": 1.3049284219741821, + "learning_rate": 1.7615654620400225e-05, + "loss": 0.5451, + "step": 1797 + }, + { + "epoch": 0.08571496674850428, + "grad_norm": 2.093545436859131, + "learning_rate": 1.761310836094095e-05, + "loss": 0.6278, + "step": 1798 + }, + { + "epoch": 0.08576263914380378, + "grad_norm": 1.7475173473358154, + "learning_rate": 1.7610560926857455e-05, + "loss": 0.7074, + "step": 1799 + }, + { + "epoch": 0.08581031153910328, + "grad_norm": 4.119304180145264, + "learning_rate": 1.760801231854278e-05, + "loss": 0.5421, + "step": 1800 + }, + { + "epoch": 0.08585798393440279, + "grad_norm": 4.078807353973389, + "learning_rate": 1.7605462536390155e-05, + "loss": 0.3838, + "step": 1801 + }, + { + "epoch": 0.08590565632970229, + "grad_norm": 1.399079442024231, + "learning_rate": 1.760291158079298e-05, + "loss": 0.8949, + "step": 1802 + }, + { + "epoch": 0.08595332872500179, + "grad_norm": 1.9099987745285034, + "learning_rate": 1.7600359452144845e-05, + "loss": 0.6147, + "step": 1803 + }, + { + "epoch": 0.08600100112030129, + "grad_norm": 2.8720459938049316, + "learning_rate": 1.759780615083951e-05, + "loss": 0.7017, + "step": 1804 + }, + { + "epoch": 0.08604867351560079, + "grad_norm": 1.4967843294143677, + "learning_rate": 1.7595251677270933e-05, + "loss": 0.5165, + "step": 1805 + }, + { + "epoch": 0.08609634591090029, + "grad_norm": 1.4108738899230957, + "learning_rate": 1.7592696031833237e-05, + "loss": 0.8112, + "step": 1806 + }, + { + "epoch": 0.08614401830619979, + "grad_norm": 1.3807586431503296, + "learning_rate": 1.7590139214920732e-05, + "loss": 0.7269, + "step": 1807 + }, + { + "epoch": 0.08619169070149929, + "grad_norm": 2.1828525066375732, + "learning_rate": 1.758758122692791e-05, + "loss": 0.7573, + "step": 1808 + }, + { + "epoch": 0.0862393630967988, + "grad_norm": 1.4696754217147827, + "learning_rate": 
1.758502206824944e-05, + "loss": 0.9103, + "step": 1809 + }, + { + "epoch": 0.0862870354920983, + "grad_norm": 11.551321983337402, + "learning_rate": 1.7582461739280178e-05, + "loss": 1.0546, + "step": 1810 + }, + { + "epoch": 0.0863347078873978, + "grad_norm": 2.826413631439209, + "learning_rate": 1.7579900240415155e-05, + "loss": 0.8977, + "step": 1811 + }, + { + "epoch": 0.0863823802826973, + "grad_norm": 2.0058205127716064, + "learning_rate": 1.757733757204958e-05, + "loss": 0.8373, + "step": 1812 + }, + { + "epoch": 0.0864300526779968, + "grad_norm": 1.0884069204330444, + "learning_rate": 1.757477373457885e-05, + "loss": 0.2408, + "step": 1813 + }, + { + "epoch": 0.0864777250732963, + "grad_norm": 1.4263980388641357, + "learning_rate": 1.757220872839854e-05, + "loss": 0.6121, + "step": 1814 + }, + { + "epoch": 0.0865253974685958, + "grad_norm": 1.186133861541748, + "learning_rate": 1.75696425539044e-05, + "loss": 0.781, + "step": 1815 + }, + { + "epoch": 0.0865730698638953, + "grad_norm": 1.87509024143219, + "learning_rate": 1.7567075211492365e-05, + "loss": 0.3953, + "step": 1816 + }, + { + "epoch": 0.08662074225919482, + "grad_norm": 2.214392900466919, + "learning_rate": 1.756450670155855e-05, + "loss": 0.7804, + "step": 1817 + }, + { + "epoch": 0.08666841465449432, + "grad_norm": 1.5667976140975952, + "learning_rate": 1.7561937024499252e-05, + "loss": 0.8603, + "step": 1818 + }, + { + "epoch": 0.08671608704979382, + "grad_norm": 1.3027293682098389, + "learning_rate": 1.7559366180710942e-05, + "loss": 0.8185, + "step": 1819 + }, + { + "epoch": 0.08676375944509332, + "grad_norm": 3.0646231174468994, + "learning_rate": 1.7556794170590282e-05, + "loss": 0.8354, + "step": 1820 + }, + { + "epoch": 0.08681143184039282, + "grad_norm": 1.5180717706680298, + "learning_rate": 1.7554220994534096e-05, + "loss": 0.8525, + "step": 1821 + }, + { + "epoch": 0.08685910423569232, + "grad_norm": 2.257699966430664, + "learning_rate": 1.7551646652939405e-05, + "loss": 0.9406, + "step": 1822 + }, + { + "epoch": 0.08690677663099182, + "grad_norm": 3.307598829269409, + "learning_rate": 1.7549071146203404e-05, + "loss": 1.1554, + "step": 1823 + }, + { + "epoch": 0.08695444902629132, + "grad_norm": 1.2125195264816284, + "learning_rate": 1.7546494474723467e-05, + "loss": 0.8746, + "step": 1824 + }, + { + "epoch": 0.08700212142159083, + "grad_norm": 1.5817060470581055, + "learning_rate": 1.7543916638897142e-05, + "loss": 0.7744, + "step": 1825 + }, + { + "epoch": 0.08704979381689033, + "grad_norm": 1.317553162574768, + "learning_rate": 1.754133763912217e-05, + "loss": 0.7595, + "step": 1826 + }, + { + "epoch": 0.08709746621218983, + "grad_norm": 2.101595163345337, + "learning_rate": 1.753875747579646e-05, + "loss": 1.0099, + "step": 1827 + }, + { + "epoch": 0.08714513860748933, + "grad_norm": 1.7978442907333374, + "learning_rate": 1.7536176149318106e-05, + "loss": 0.634, + "step": 1828 + }, + { + "epoch": 0.08719281100278883, + "grad_norm": 3.205493450164795, + "learning_rate": 1.7533593660085378e-05, + "loss": 1.4576, + "step": 1829 + }, + { + "epoch": 0.08724048339808833, + "grad_norm": 1.7754584550857544, + "learning_rate": 1.7531010008496733e-05, + "loss": 0.9474, + "step": 1830 + }, + { + "epoch": 0.08728815579338783, + "grad_norm": 2.764887809753418, + "learning_rate": 1.7528425194950794e-05, + "loss": 0.7215, + "step": 1831 + }, + { + "epoch": 0.08733582818868733, + "grad_norm": 1.807370662689209, + "learning_rate": 1.752583921984638e-05, + "loss": 0.7225, + "step": 1832 + }, + { + "epoch": 
0.08738350058398685, + "grad_norm": 2.3763256072998047, + "learning_rate": 1.752325208358247e-05, + "loss": 1.0307, + "step": 1833 + }, + { + "epoch": 0.08743117297928635, + "grad_norm": 8.486246109008789, + "learning_rate": 1.7520663786558243e-05, + "loss": 0.0538, + "step": 1834 + }, + { + "epoch": 0.08747884537458585, + "grad_norm": 3.6323912143707275, + "learning_rate": 1.751807432917304e-05, + "loss": 0.6765, + "step": 1835 + }, + { + "epoch": 0.08752651776988535, + "grad_norm": 2.1206586360931396, + "learning_rate": 1.7515483711826386e-05, + "loss": 1.0793, + "step": 1836 + }, + { + "epoch": 0.08757419016518485, + "grad_norm": 2.404599666595459, + "learning_rate": 1.7512891934917994e-05, + "loss": 0.7878, + "step": 1837 + }, + { + "epoch": 0.08762186256048435, + "grad_norm": 3.5932154655456543, + "learning_rate": 1.7510298998847742e-05, + "loss": 0.4684, + "step": 1838 + }, + { + "epoch": 0.08766953495578385, + "grad_norm": 1.0536655187606812, + "learning_rate": 1.7507704904015696e-05, + "loss": 0.5574, + "step": 1839 + }, + { + "epoch": 0.08771720735108335, + "grad_norm": 2.6022403240203857, + "learning_rate": 1.7505109650822096e-05, + "loss": 0.7589, + "step": 1840 + }, + { + "epoch": 0.08776487974638286, + "grad_norm": 1.5821363925933838, + "learning_rate": 1.7502513239667365e-05, + "loss": 0.9011, + "step": 1841 + }, + { + "epoch": 0.08781255214168236, + "grad_norm": 1.4837303161621094, + "learning_rate": 1.7499915670952107e-05, + "loss": 0.7292, + "step": 1842 + }, + { + "epoch": 0.08786022453698186, + "grad_norm": 1.6185940504074097, + "learning_rate": 1.749731694507709e-05, + "loss": 0.5632, + "step": 1843 + }, + { + "epoch": 0.08790789693228136, + "grad_norm": 1.1037304401397705, + "learning_rate": 1.749471706244328e-05, + "loss": 0.4907, + "step": 1844 + }, + { + "epoch": 0.08795556932758086, + "grad_norm": 1.746039867401123, + "learning_rate": 1.7492116023451803e-05, + "loss": 0.7608, + "step": 1845 + }, + { + "epoch": 0.08800324172288036, + "grad_norm": 2.322873592376709, + "learning_rate": 1.748951382850398e-05, + "loss": 0.5382, + "step": 1846 + }, + { + "epoch": 0.08805091411817986, + "grad_norm": 0.8395581841468811, + "learning_rate": 1.7486910478001303e-05, + "loss": 0.1646, + "step": 1847 + }, + { + "epoch": 0.08809858651347938, + "grad_norm": 6.367923259735107, + "learning_rate": 1.7484305972345436e-05, + "loss": 0.7985, + "step": 1848 + }, + { + "epoch": 0.08814625890877888, + "grad_norm": 2.696329116821289, + "learning_rate": 1.748170031193823e-05, + "loss": 0.3606, + "step": 1849 + }, + { + "epoch": 0.08819393130407838, + "grad_norm": 1.4912505149841309, + "learning_rate": 1.7479093497181714e-05, + "loss": 0.723, + "step": 1850 + }, + { + "epoch": 0.08824160369937788, + "grad_norm": 1.5808112621307373, + "learning_rate": 1.7476485528478093e-05, + "loss": 0.6187, + "step": 1851 + }, + { + "epoch": 0.08828927609467738, + "grad_norm": 1.9619344472885132, + "learning_rate": 1.7473876406229744e-05, + "loss": 0.7366, + "step": 1852 + }, + { + "epoch": 0.08833694848997688, + "grad_norm": 1.5421297550201416, + "learning_rate": 1.7471266130839235e-05, + "loss": 0.9173, + "step": 1853 + }, + { + "epoch": 0.08838462088527638, + "grad_norm": 1.360328197479248, + "learning_rate": 1.74686547027093e-05, + "loss": 0.9051, + "step": 1854 + }, + { + "epoch": 0.08843229328057588, + "grad_norm": 2.355334758758545, + "learning_rate": 1.7466042122242853e-05, + "loss": 0.7219, + "step": 1855 + }, + { + "epoch": 0.08847996567587539, + "grad_norm": 1.2616817951202393, + "learning_rate": 
1.7463428389842997e-05, + "loss": 0.6258, + "step": 1856 + }, + { + "epoch": 0.08852763807117489, + "grad_norm": 1.2362185716629028, + "learning_rate": 1.7460813505912996e-05, + "loss": 0.5785, + "step": 1857 + }, + { + "epoch": 0.08857531046647439, + "grad_norm": 5.145622253417969, + "learning_rate": 1.7458197470856305e-05, + "loss": 0.8503, + "step": 1858 + }, + { + "epoch": 0.08862298286177389, + "grad_norm": 3.985865354537964, + "learning_rate": 1.7455580285076546e-05, + "loss": 0.4183, + "step": 1859 + }, + { + "epoch": 0.08867065525707339, + "grad_norm": 2.136936664581299, + "learning_rate": 1.745296194897753e-05, + "loss": 1.044, + "step": 1860 + }, + { + "epoch": 0.08871832765237289, + "grad_norm": 2.610098123550415, + "learning_rate": 1.7450342462963235e-05, + "loss": 0.8033, + "step": 1861 + }, + { + "epoch": 0.08876600004767239, + "grad_norm": 1.7275424003601074, + "learning_rate": 1.744772182743782e-05, + "loss": 0.9421, + "step": 1862 + }, + { + "epoch": 0.08881367244297189, + "grad_norm": 3.1618235111236572, + "learning_rate": 1.7445100042805627e-05, + "loss": 1.017, + "step": 1863 + }, + { + "epoch": 0.0888613448382714, + "grad_norm": 2.4394078254699707, + "learning_rate": 1.744247710947116e-05, + "loss": 0.5957, + "step": 1864 + }, + { + "epoch": 0.0889090172335709, + "grad_norm": 1.6809029579162598, + "learning_rate": 1.7439853027839124e-05, + "loss": 0.7981, + "step": 1865 + }, + { + "epoch": 0.0889566896288704, + "grad_norm": 3.8705766201019287, + "learning_rate": 1.743722779831438e-05, + "loss": 0.7981, + "step": 1866 + }, + { + "epoch": 0.0890043620241699, + "grad_norm": 2.174025535583496, + "learning_rate": 1.7434601421301974e-05, + "loss": 0.6841, + "step": 1867 + }, + { + "epoch": 0.0890520344194694, + "grad_norm": 2.056065797805786, + "learning_rate": 1.743197389720713e-05, + "loss": 1.1116, + "step": 1868 + }, + { + "epoch": 0.0890997068147689, + "grad_norm": 3.395608425140381, + "learning_rate": 1.7429345226435253e-05, + "loss": 0.7421, + "step": 1869 + }, + { + "epoch": 0.0891473792100684, + "grad_norm": 1.981473445892334, + "learning_rate": 1.742671540939191e-05, + "loss": 0.7867, + "step": 1870 + }, + { + "epoch": 0.0891950516053679, + "grad_norm": 3.6642842292785645, + "learning_rate": 1.742408444648286e-05, + "loss": 0.7023, + "step": 1871 + }, + { + "epoch": 0.08924272400066742, + "grad_norm": 1.628873348236084, + "learning_rate": 1.7421452338114036e-05, + "loss": 0.8923, + "step": 1872 + }, + { + "epoch": 0.08929039639596692, + "grad_norm": 1.214039921760559, + "learning_rate": 1.741881908469154e-05, + "loss": 0.5864, + "step": 1873 + }, + { + "epoch": 0.08933806879126642, + "grad_norm": 1.7834521532058716, + "learning_rate": 1.741618468662166e-05, + "loss": 1.0572, + "step": 1874 + }, + { + "epoch": 0.08938574118656592, + "grad_norm": 15.063992500305176, + "learning_rate": 1.7413549144310856e-05, + "loss": 0.7541, + "step": 1875 + }, + { + "epoch": 0.08943341358186542, + "grad_norm": 1.4960886240005493, + "learning_rate": 1.741091245816576e-05, + "loss": 0.6211, + "step": 1876 + }, + { + "epoch": 0.08948108597716492, + "grad_norm": 3.792848825454712, + "learning_rate": 1.7408274628593192e-05, + "loss": 0.6678, + "step": 1877 + }, + { + "epoch": 0.08952875837246442, + "grad_norm": 1.362256646156311, + "learning_rate": 1.740563565600014e-05, + "loss": 0.8872, + "step": 1878 + }, + { + "epoch": 0.08957643076776392, + "grad_norm": 1.2339116334915161, + "learning_rate": 1.7402995540793764e-05, + "loss": 0.6797, + "step": 1879 + }, + { + "epoch": 
0.08962410316306343, + "grad_norm": 1.098677158355713, + "learning_rate": 1.7400354283381416e-05, + "loss": 0.6452, + "step": 1880 + }, + { + "epoch": 0.08967177555836293, + "grad_norm": 2.208592653274536, + "learning_rate": 1.7397711884170613e-05, + "loss": 1.2245, + "step": 1881 + }, + { + "epoch": 0.08971944795366243, + "grad_norm": 1.1734225749969482, + "learning_rate": 1.7395068343569047e-05, + "loss": 0.7495, + "step": 1882 + }, + { + "epoch": 0.08976712034896193, + "grad_norm": 1.860902190208435, + "learning_rate": 1.739242366198459e-05, + "loss": 0.6499, + "step": 1883 + }, + { + "epoch": 0.08981479274426143, + "grad_norm": 4.4588117599487305, + "learning_rate": 1.7389777839825284e-05, + "loss": 0.5647, + "step": 1884 + }, + { + "epoch": 0.08986246513956093, + "grad_norm": 2.0444424152374268, + "learning_rate": 1.7387130877499364e-05, + "loss": 1.0156, + "step": 1885 + }, + { + "epoch": 0.08991013753486043, + "grad_norm": 2.0839266777038574, + "learning_rate": 1.738448277541522e-05, + "loss": 1.0359, + "step": 1886 + }, + { + "epoch": 0.08995780993015993, + "grad_norm": 3.001190423965454, + "learning_rate": 1.738183353398143e-05, + "loss": 0.6405, + "step": 1887 + }, + { + "epoch": 0.09000548232545945, + "grad_norm": 1.8037917613983154, + "learning_rate": 1.7379183153606743e-05, + "loss": 0.6912, + "step": 1888 + }, + { + "epoch": 0.09005315472075895, + "grad_norm": 2.767890453338623, + "learning_rate": 1.7376531634700087e-05, + "loss": 0.5733, + "step": 1889 + }, + { + "epoch": 0.09010082711605845, + "grad_norm": 1.2392313480377197, + "learning_rate": 1.737387897767056e-05, + "loss": 0.6807, + "step": 1890 + }, + { + "epoch": 0.09014849951135795, + "grad_norm": 1.0875239372253418, + "learning_rate": 1.7371225182927447e-05, + "loss": 0.5452, + "step": 1891 + }, + { + "epoch": 0.09019617190665745, + "grad_norm": 1.3470181226730347, + "learning_rate": 1.7368570250880198e-05, + "loss": 0.6363, + "step": 1892 + }, + { + "epoch": 0.09024384430195695, + "grad_norm": 1.1659319400787354, + "learning_rate": 1.736591418193844e-05, + "loss": 0.6894, + "step": 1893 + }, + { + "epoch": 0.09029151669725645, + "grad_norm": 3.127140760421753, + "learning_rate": 1.7363256976511972e-05, + "loss": 0.6474, + "step": 1894 + }, + { + "epoch": 0.09033918909255595, + "grad_norm": 1.647687315940857, + "learning_rate": 1.7360598635010787e-05, + "loss": 0.4843, + "step": 1895 + }, + { + "epoch": 0.09038686148785546, + "grad_norm": 5.694189548492432, + "learning_rate": 1.735793915784503e-05, + "loss": 0.9134, + "step": 1896 + }, + { + "epoch": 0.09043453388315496, + "grad_norm": 1.61495840549469, + "learning_rate": 1.7355278545425033e-05, + "loss": 0.3499, + "step": 1897 + }, + { + "epoch": 0.09048220627845446, + "grad_norm": 1.518541932106018, + "learning_rate": 1.73526167981613e-05, + "loss": 0.837, + "step": 1898 + }, + { + "epoch": 0.09052987867375396, + "grad_norm": 1.9157156944274902, + "learning_rate": 1.7349953916464512e-05, + "loss": 0.7504, + "step": 1899 + }, + { + "epoch": 0.09057755106905346, + "grad_norm": 1.2907934188842773, + "learning_rate": 1.7347289900745525e-05, + "loss": 0.3012, + "step": 1900 + }, + { + "epoch": 0.09062522346435296, + "grad_norm": 1.853825569152832, + "learning_rate": 1.734462475141537e-05, + "loss": 0.7005, + "step": 1901 + }, + { + "epoch": 0.09067289585965246, + "grad_norm": 2.314300537109375, + "learning_rate": 1.734195846888525e-05, + "loss": 0.4777, + "step": 1902 + }, + { + "epoch": 0.09072056825495196, + "grad_norm": 1.8735827207565308, + "learning_rate": 
1.7339291053566544e-05, + "loss": 0.4288, + "step": 1903 + }, + { + "epoch": 0.09076824065025148, + "grad_norm": 2.166480541229248, + "learning_rate": 1.7336622505870813e-05, + "loss": 0.8705, + "step": 1904 + }, + { + "epoch": 0.09081591304555098, + "grad_norm": 1.713133692741394, + "learning_rate": 1.733395282620978e-05, + "loss": 0.516, + "step": 1905 + }, + { + "epoch": 0.09086358544085048, + "grad_norm": 4.2016472816467285, + "learning_rate": 1.7331282014995348e-05, + "loss": 0.8166, + "step": 1906 + }, + { + "epoch": 0.09091125783614998, + "grad_norm": 4.468966484069824, + "learning_rate": 1.7328610072639604e-05, + "loss": 0.0342, + "step": 1907 + }, + { + "epoch": 0.09095893023144948, + "grad_norm": 1.7308324575424194, + "learning_rate": 1.732593699955479e-05, + "loss": 0.6723, + "step": 1908 + }, + { + "epoch": 0.09100660262674898, + "grad_norm": 1.9870880842208862, + "learning_rate": 1.7323262796153342e-05, + "loss": 0.7298, + "step": 1909 + }, + { + "epoch": 0.09105427502204848, + "grad_norm": 3.1176748275756836, + "learning_rate": 1.7320587462847858e-05, + "loss": 0.8399, + "step": 1910 + }, + { + "epoch": 0.09110194741734798, + "grad_norm": 1.5841773748397827, + "learning_rate": 1.7317911000051123e-05, + "loss": 0.7456, + "step": 1911 + }, + { + "epoch": 0.09114961981264749, + "grad_norm": 2.189419984817505, + "learning_rate": 1.7315233408176073e-05, + "loss": 1.1764, + "step": 1912 + }, + { + "epoch": 0.09119729220794699, + "grad_norm": 4.199376583099365, + "learning_rate": 1.7312554687635843e-05, + "loss": 0.9191, + "step": 1913 + }, + { + "epoch": 0.09124496460324649, + "grad_norm": 2.043581485748291, + "learning_rate": 1.730987483884373e-05, + "loss": 0.6627, + "step": 1914 + }, + { + "epoch": 0.09129263699854599, + "grad_norm": 1.6215832233428955, + "learning_rate": 1.7307193862213204e-05, + "loss": 0.9885, + "step": 1915 + }, + { + "epoch": 0.09134030939384549, + "grad_norm": 1.460170030593872, + "learning_rate": 1.7304511758157917e-05, + "loss": 0.6846, + "step": 1916 + }, + { + "epoch": 0.09138798178914499, + "grad_norm": 2.418365955352783, + "learning_rate": 1.7301828527091687e-05, + "loss": 1.0374, + "step": 1917 + }, + { + "epoch": 0.09143565418444449, + "grad_norm": 1.3839024305343628, + "learning_rate": 1.7299144169428513e-05, + "loss": 0.6728, + "step": 1918 + }, + { + "epoch": 0.09148332657974399, + "grad_norm": 3.182678699493408, + "learning_rate": 1.7296458685582557e-05, + "loss": 0.7446, + "step": 1919 + }, + { + "epoch": 0.0915309989750435, + "grad_norm": 2.252993106842041, + "learning_rate": 1.7293772075968163e-05, + "loss": 0.972, + "step": 1920 + }, + { + "epoch": 0.091578671370343, + "grad_norm": 4.426293849945068, + "learning_rate": 1.729108434099985e-05, + "loss": 0.7634, + "step": 1921 + }, + { + "epoch": 0.0916263437656425, + "grad_norm": 1.2585158348083496, + "learning_rate": 1.7288395481092307e-05, + "loss": 1.0344, + "step": 1922 + }, + { + "epoch": 0.091674016160942, + "grad_norm": 2.2036473751068115, + "learning_rate": 1.7285705496660398e-05, + "loss": 0.9065, + "step": 1923 + }, + { + "epoch": 0.0917216885562415, + "grad_norm": 4.132787227630615, + "learning_rate": 1.728301438811916e-05, + "loss": 0.2619, + "step": 1924 + }, + { + "epoch": 0.091769360951541, + "grad_norm": 2.7758944034576416, + "learning_rate": 1.7280322155883805e-05, + "loss": 1.3749, + "step": 1925 + }, + { + "epoch": 0.09181703334684051, + "grad_norm": 2.305260419845581, + "learning_rate": 1.7277628800369708e-05, + "loss": 0.7091, + "step": 1926 + }, + { + "epoch": 
0.09186470574214001, + "grad_norm": 1.176971197128296, + "learning_rate": 1.7274934321992435e-05, + "loss": 0.4675, + "step": 1927 + }, + { + "epoch": 0.09191237813743952, + "grad_norm": 0.9848259687423706, + "learning_rate": 1.7272238721167715e-05, + "loss": 0.4155, + "step": 1928 + }, + { + "epoch": 0.09196005053273902, + "grad_norm": 1.3047305345535278, + "learning_rate": 1.7269541998311446e-05, + "loss": 0.7043, + "step": 1929 + }, + { + "epoch": 0.09200772292803852, + "grad_norm": 2.0758633613586426, + "learning_rate": 1.726684415383971e-05, + "loss": 0.692, + "step": 1930 + }, + { + "epoch": 0.09205539532333802, + "grad_norm": 1.6682102680206299, + "learning_rate": 1.7264145188168755e-05, + "loss": 0.6277, + "step": 1931 + }, + { + "epoch": 0.09210306771863752, + "grad_norm": 2.587801694869995, + "learning_rate": 1.7261445101715006e-05, + "loss": 0.9891, + "step": 1932 + }, + { + "epoch": 0.09215074011393702, + "grad_norm": 1.7019946575164795, + "learning_rate": 1.7258743894895054e-05, + "loss": 1.0188, + "step": 1933 + }, + { + "epoch": 0.09219841250923652, + "grad_norm": 1.3179090023040771, + "learning_rate": 1.7256041568125673e-05, + "loss": 0.6405, + "step": 1934 + }, + { + "epoch": 0.09224608490453602, + "grad_norm": 3.0413458347320557, + "learning_rate": 1.7253338121823796e-05, + "loss": 0.9162, + "step": 1935 + }, + { + "epoch": 0.09229375729983554, + "grad_norm": 1.7248799800872803, + "learning_rate": 1.7250633556406545e-05, + "loss": 1.0734, + "step": 1936 + }, + { + "epoch": 0.09234142969513504, + "grad_norm": 2.9053101539611816, + "learning_rate": 1.72479278722912e-05, + "loss": 0.6902, + "step": 1937 + }, + { + "epoch": 0.09238910209043454, + "grad_norm": 4.004610061645508, + "learning_rate": 1.7245221069895227e-05, + "loss": 1.6501, + "step": 1938 + }, + { + "epoch": 0.09243677448573404, + "grad_norm": 3.44722318649292, + "learning_rate": 1.7242513149636253e-05, + "loss": 0.6126, + "step": 1939 + }, + { + "epoch": 0.09248444688103354, + "grad_norm": 2.144404411315918, + "learning_rate": 1.7239804111932085e-05, + "loss": 0.5857, + "step": 1940 + }, + { + "epoch": 0.09253211927633304, + "grad_norm": 2.8406872749328613, + "learning_rate": 1.7237093957200694e-05, + "loss": 0.5798, + "step": 1941 + }, + { + "epoch": 0.09257979167163254, + "grad_norm": 4.07771110534668, + "learning_rate": 1.7234382685860236e-05, + "loss": 0.4616, + "step": 1942 + }, + { + "epoch": 0.09262746406693205, + "grad_norm": 2.1129682064056396, + "learning_rate": 1.723167029832903e-05, + "loss": 0.7992, + "step": 1943 + }, + { + "epoch": 0.09267513646223155, + "grad_norm": 1.5116686820983887, + "learning_rate": 1.7228956795025565e-05, + "loss": 0.5673, + "step": 1944 + }, + { + "epoch": 0.09272280885753105, + "grad_norm": 0.9510489106178284, + "learning_rate": 1.7226242176368515e-05, + "loss": 0.3807, + "step": 1945 + }, + { + "epoch": 0.09277048125283055, + "grad_norm": 1.7059881687164307, + "learning_rate": 1.7223526442776712e-05, + "loss": 0.7045, + "step": 1946 + }, + { + "epoch": 0.09281815364813005, + "grad_norm": 2.838369607925415, + "learning_rate": 1.7220809594669165e-05, + "loss": 0.8146, + "step": 1947 + }, + { + "epoch": 0.09286582604342955, + "grad_norm": 1.5864217281341553, + "learning_rate": 1.7218091632465057e-05, + "loss": 0.7502, + "step": 1948 + }, + { + "epoch": 0.09291349843872905, + "grad_norm": 1.5864360332489014, + "learning_rate": 1.7215372556583745e-05, + "loss": 0.5373, + "step": 1949 + }, + { + "epoch": 0.09296117083402855, + "grad_norm": 2.4728856086730957, + 
"learning_rate": 1.721265236744475e-05, + "loss": 0.4941, + "step": 1950 + }, + { + "epoch": 0.09300884322932806, + "grad_norm": 1.8207998275756836, + "learning_rate": 1.720993106546777e-05, + "loss": 0.7659, + "step": 1951 + }, + { + "epoch": 0.09305651562462756, + "grad_norm": 3.9849860668182373, + "learning_rate": 1.7207208651072677e-05, + "loss": 0.5498, + "step": 1952 + }, + { + "epoch": 0.09310418801992706, + "grad_norm": 1.8867381811141968, + "learning_rate": 1.7204485124679506e-05, + "loss": 0.8471, + "step": 1953 + }, + { + "epoch": 0.09315186041522656, + "grad_norm": 1.7520923614501953, + "learning_rate": 1.720176048670847e-05, + "loss": 0.6194, + "step": 1954 + }, + { + "epoch": 0.09319953281052606, + "grad_norm": 2.5389485359191895, + "learning_rate": 1.7199034737579962e-05, + "loss": 0.7681, + "step": 1955 + }, + { + "epoch": 0.09324720520582556, + "grad_norm": 3.7277798652648926, + "learning_rate": 1.7196307877714523e-05, + "loss": 0.8089, + "step": 1956 + }, + { + "epoch": 0.09329487760112506, + "grad_norm": 3.313062906265259, + "learning_rate": 1.719357990753289e-05, + "loss": 1.8292, + "step": 1957 + }, + { + "epoch": 0.09334254999642456, + "grad_norm": 1.7429782152175903, + "learning_rate": 1.7190850827455957e-05, + "loss": 0.7497, + "step": 1958 + }, + { + "epoch": 0.09339022239172408, + "grad_norm": 1.3355697393417358, + "learning_rate": 1.7188120637904792e-05, + "loss": 0.5364, + "step": 1959 + }, + { + "epoch": 0.09343789478702358, + "grad_norm": 2.6606807708740234, + "learning_rate": 1.7185389339300633e-05, + "loss": 0.4391, + "step": 1960 + }, + { + "epoch": 0.09348556718232308, + "grad_norm": 2.7550387382507324, + "learning_rate": 1.7182656932064894e-05, + "loss": 0.6177, + "step": 1961 + }, + { + "epoch": 0.09353323957762258, + "grad_norm": 1.9658668041229248, + "learning_rate": 1.7179923416619163e-05, + "loss": 0.3187, + "step": 1962 + }, + { + "epoch": 0.09358091197292208, + "grad_norm": 1.2750409841537476, + "learning_rate": 1.7177188793385183e-05, + "loss": 0.7569, + "step": 1963 + }, + { + "epoch": 0.09362858436822158, + "grad_norm": 1.623232364654541, + "learning_rate": 1.7174453062784885e-05, + "loss": 1.0568, + "step": 1964 + }, + { + "epoch": 0.09367625676352108, + "grad_norm": 1.665212869644165, + "learning_rate": 1.717171622524036e-05, + "loss": 0.6435, + "step": 1965 + }, + { + "epoch": 0.09372392915882058, + "grad_norm": 1.5377470254898071, + "learning_rate": 1.716897828117388e-05, + "loss": 0.902, + "step": 1966 + }, + { + "epoch": 0.0937716015541201, + "grad_norm": 1.6416821479797363, + "learning_rate": 1.7166239231007872e-05, + "loss": 0.6625, + "step": 1967 + }, + { + "epoch": 0.0938192739494196, + "grad_norm": 1.7977384328842163, + "learning_rate": 1.716349907516495e-05, + "loss": 0.8591, + "step": 1968 + }, + { + "epoch": 0.0938669463447191, + "grad_norm": 1.4452898502349854, + "learning_rate": 1.7160757814067895e-05, + "loss": 0.6399, + "step": 1969 + }, + { + "epoch": 0.0939146187400186, + "grad_norm": 2.179748296737671, + "learning_rate": 1.7158015448139645e-05, + "loss": 0.7528, + "step": 1970 + }, + { + "epoch": 0.0939622911353181, + "grad_norm": 3.3109803199768066, + "learning_rate": 1.715527197780333e-05, + "loss": 0.6812, + "step": 1971 + }, + { + "epoch": 0.0940099635306176, + "grad_norm": 2.4174749851226807, + "learning_rate": 1.715252740348223e-05, + "loss": 0.9528, + "step": 1972 + }, + { + "epoch": 0.0940576359259171, + "grad_norm": 1.1760822534561157, + "learning_rate": 1.714978172559981e-05, + "loss": 0.6298, + "step": 1973 + }, + 
{ + "epoch": 0.0941053083212166, + "grad_norm": 1.441654920578003, + "learning_rate": 1.7147034944579698e-05, + "loss": 0.7768, + "step": 1974 + }, + { + "epoch": 0.09415298071651611, + "grad_norm": 2.0077085494995117, + "learning_rate": 1.7144287060845696e-05, + "loss": 1.2364, + "step": 1975 + }, + { + "epoch": 0.09420065311181561, + "grad_norm": 1.926422119140625, + "learning_rate": 1.714153807482177e-05, + "loss": 1.2169, + "step": 1976 + }, + { + "epoch": 0.09424832550711511, + "grad_norm": 2.1796329021453857, + "learning_rate": 1.713878798693206e-05, + "loss": 0.7759, + "step": 1977 + }, + { + "epoch": 0.09429599790241461, + "grad_norm": 1.9557242393493652, + "learning_rate": 1.7136036797600882e-05, + "loss": 0.9748, + "step": 1978 + }, + { + "epoch": 0.09434367029771411, + "grad_norm": 1.706542730331421, + "learning_rate": 1.7133284507252715e-05, + "loss": 0.9708, + "step": 1979 + }, + { + "epoch": 0.09439134269301361, + "grad_norm": 4.758184909820557, + "learning_rate": 1.7130531116312202e-05, + "loss": 1.5215, + "step": 1980 + }, + { + "epoch": 0.09443901508831311, + "grad_norm": 2.4019150733947754, + "learning_rate": 1.7127776625204173e-05, + "loss": 0.838, + "step": 1981 + }, + { + "epoch": 0.09448668748361261, + "grad_norm": 2.318128824234009, + "learning_rate": 1.7125021034353614e-05, + "loss": 0.7269, + "step": 1982 + }, + { + "epoch": 0.09453435987891212, + "grad_norm": 1.6884076595306396, + "learning_rate": 1.7122264344185677e-05, + "loss": 1.1387, + "step": 1983 + }, + { + "epoch": 0.09458203227421162, + "grad_norm": 1.348828911781311, + "learning_rate": 1.71195065551257e-05, + "loss": 0.7455, + "step": 1984 + }, + { + "epoch": 0.09462970466951112, + "grad_norm": 2.05991530418396, + "learning_rate": 1.711674766759918e-05, + "loss": 0.7384, + "step": 1985 + }, + { + "epoch": 0.09467737706481062, + "grad_norm": 1.3874835968017578, + "learning_rate": 1.711398768203178e-05, + "loss": 0.7218, + "step": 1986 + }, + { + "epoch": 0.09472504946011012, + "grad_norm": 1.3682280778884888, + "learning_rate": 1.7111226598849344e-05, + "loss": 0.6839, + "step": 1987 + }, + { + "epoch": 0.09477272185540962, + "grad_norm": 1.6690754890441895, + "learning_rate": 1.710846441847787e-05, + "loss": 0.6134, + "step": 1988 + }, + { + "epoch": 0.09482039425070912, + "grad_norm": 1.2285358905792236, + "learning_rate": 1.710570114134354e-05, + "loss": 0.5035, + "step": 1989 + }, + { + "epoch": 0.09486806664600862, + "grad_norm": 1.678049087524414, + "learning_rate": 1.7102936767872704e-05, + "loss": 0.7206, + "step": 1990 + }, + { + "epoch": 0.09491573904130814, + "grad_norm": 2.6629180908203125, + "learning_rate": 1.7100171298491866e-05, + "loss": 0.7183, + "step": 1991 + }, + { + "epoch": 0.09496341143660764, + "grad_norm": 1.8690797090530396, + "learning_rate": 1.709740473362772e-05, + "loss": 0.5785, + "step": 1992 + }, + { + "epoch": 0.09501108383190714, + "grad_norm": 1.3942818641662598, + "learning_rate": 1.7094637073707105e-05, + "loss": 0.883, + "step": 1993 + }, + { + "epoch": 0.09505875622720664, + "grad_norm": 1.499047875404358, + "learning_rate": 1.7091868319157055e-05, + "loss": 0.6783, + "step": 1994 + }, + { + "epoch": 0.09510642862250614, + "grad_norm": 8.317218780517578, + "learning_rate": 1.7089098470404755e-05, + "loss": 0.5295, + "step": 1995 + }, + { + "epoch": 0.09515410101780564, + "grad_norm": 1.0080902576446533, + "learning_rate": 1.7086327527877563e-05, + "loss": 0.4816, + "step": 1996 + }, + { + "epoch": 0.09520177341310514, + "grad_norm": 1.6783298254013062, + 
"learning_rate": 1.708355549200301e-05, + "loss": 0.7104, + "step": 1997 + }, + { + "epoch": 0.09524944580840464, + "grad_norm": 2.276475667953491, + "learning_rate": 1.708078236320879e-05, + "loss": 0.884, + "step": 1998 + }, + { + "epoch": 0.09529711820370415, + "grad_norm": 2.21710467338562, + "learning_rate": 1.707800814192277e-05, + "loss": 0.6756, + "step": 1999 + }, + { + "epoch": 0.09534479059900365, + "grad_norm": 1.2017117738723755, + "learning_rate": 1.7075232828572982e-05, + "loss": 0.5945, + "step": 2000 + }, + { + "epoch": 0.09539246299430315, + "grad_norm": 1.1749649047851562, + "learning_rate": 1.707245642358763e-05, + "loss": 0.6184, + "step": 2001 + }, + { + "epoch": 0.09544013538960265, + "grad_norm": 2.2212271690368652, + "learning_rate": 1.7069678927395083e-05, + "loss": 0.7151, + "step": 2002 + }, + { + "epoch": 0.09548780778490215, + "grad_norm": 2.354304075241089, + "learning_rate": 1.706690034042388e-05, + "loss": 1.1843, + "step": 2003 + }, + { + "epoch": 0.09553548018020165, + "grad_norm": 3.686828374862671, + "learning_rate": 1.7064120663102737e-05, + "loss": 0.4615, + "step": 2004 + }, + { + "epoch": 0.09558315257550115, + "grad_norm": 0.9127634763717651, + "learning_rate": 1.7061339895860513e-05, + "loss": 0.2305, + "step": 2005 + }, + { + "epoch": 0.09563082497080065, + "grad_norm": 2.3917996883392334, + "learning_rate": 1.7058558039126266e-05, + "loss": 0.7754, + "step": 2006 + }, + { + "epoch": 0.09567849736610017, + "grad_norm": 1.8181190490722656, + "learning_rate": 1.7055775093329202e-05, + "loss": 0.5913, + "step": 2007 + }, + { + "epoch": 0.09572616976139967, + "grad_norm": 1.2763667106628418, + "learning_rate": 1.70529910588987e-05, + "loss": 0.7572, + "step": 2008 + }, + { + "epoch": 0.09577384215669917, + "grad_norm": 1.1833136081695557, + "learning_rate": 1.705020593626431e-05, + "loss": 0.5322, + "step": 2009 + }, + { + "epoch": 0.09582151455199867, + "grad_norm": 1.3872525691986084, + "learning_rate": 1.704741972585575e-05, + "loss": 0.8074, + "step": 2010 + }, + { + "epoch": 0.09586918694729817, + "grad_norm": 2.3611814975738525, + "learning_rate": 1.7044632428102896e-05, + "loss": 1.1546, + "step": 2011 + }, + { + "epoch": 0.09591685934259767, + "grad_norm": 1.9899375438690186, + "learning_rate": 1.7041844043435806e-05, + "loss": 1.1795, + "step": 2012 + }, + { + "epoch": 0.09596453173789717, + "grad_norm": 3.1207892894744873, + "learning_rate": 1.7039054572284697e-05, + "loss": 1.0391, + "step": 2013 + }, + { + "epoch": 0.09601220413319667, + "grad_norm": 1.7606477737426758, + "learning_rate": 1.7036264015079958e-05, + "loss": 0.2867, + "step": 2014 + }, + { + "epoch": 0.09605987652849618, + "grad_norm": 1.2277532815933228, + "learning_rate": 1.7033472372252138e-05, + "loss": 0.9605, + "step": 2015 + }, + { + "epoch": 0.09610754892379568, + "grad_norm": 2.3358285427093506, + "learning_rate": 1.703067964423196e-05, + "loss": 1.2601, + "step": 2016 + }, + { + "epoch": 0.09615522131909518, + "grad_norm": 2.904574155807495, + "learning_rate": 1.7027885831450318e-05, + "loss": 1.2073, + "step": 2017 + }, + { + "epoch": 0.09620289371439468, + "grad_norm": 1.747771978378296, + "learning_rate": 1.7025090934338266e-05, + "loss": 0.8577, + "step": 2018 + }, + { + "epoch": 0.09625056610969418, + "grad_norm": 2.4418985843658447, + "learning_rate": 1.7022294953327025e-05, + "loss": 0.7614, + "step": 2019 + }, + { + "epoch": 0.09629823850499368, + "grad_norm": 1.9450327157974243, + "learning_rate": 1.701949788884799e-05, + "loss": 0.8571, + "step": 2020 + 
}, + { + "epoch": 0.09634591090029318, + "grad_norm": 2.4468445777893066, + "learning_rate": 1.701669974133272e-05, + "loss": 0.813, + "step": 2021 + }, + { + "epoch": 0.09639358329559268, + "grad_norm": 2.8782572746276855, + "learning_rate": 1.7013900511212932e-05, + "loss": 1.02, + "step": 2022 + }, + { + "epoch": 0.0964412556908922, + "grad_norm": 1.7242225408554077, + "learning_rate": 1.7011100198920528e-05, + "loss": 0.8248, + "step": 2023 + }, + { + "epoch": 0.0964889280861917, + "grad_norm": 2.1396656036376953, + "learning_rate": 1.7008298804887565e-05, + "loss": 0.7572, + "step": 2024 + }, + { + "epoch": 0.0965366004814912, + "grad_norm": 1.185307502746582, + "learning_rate": 1.7005496329546263e-05, + "loss": 0.6201, + "step": 2025 + }, + { + "epoch": 0.0965842728767907, + "grad_norm": 1.039168357849121, + "learning_rate": 1.7002692773329026e-05, + "loss": 0.8732, + "step": 2026 + }, + { + "epoch": 0.0966319452720902, + "grad_norm": 13.774724006652832, + "learning_rate": 1.6999888136668404e-05, + "loss": 1.2008, + "step": 2027 + }, + { + "epoch": 0.0966796176673897, + "grad_norm": 1.1632906198501587, + "learning_rate": 1.6997082419997127e-05, + "loss": 0.8072, + "step": 2028 + }, + { + "epoch": 0.0967272900626892, + "grad_norm": 2.1988863945007324, + "learning_rate": 1.6994275623748092e-05, + "loss": 0.809, + "step": 2029 + }, + { + "epoch": 0.09677496245798871, + "grad_norm": 2.1799628734588623, + "learning_rate": 1.6991467748354352e-05, + "loss": 0.5426, + "step": 2030 + }, + { + "epoch": 0.09682263485328821, + "grad_norm": 1.75131356716156, + "learning_rate": 1.6988658794249134e-05, + "loss": 0.7247, + "step": 2031 + }, + { + "epoch": 0.09687030724858771, + "grad_norm": 2.1120526790618896, + "learning_rate": 1.6985848761865838e-05, + "loss": 0.6104, + "step": 2032 + }, + { + "epoch": 0.09691797964388721, + "grad_norm": 1.498909592628479, + "learning_rate": 1.698303765163802e-05, + "loss": 0.7601, + "step": 2033 + }, + { + "epoch": 0.09696565203918671, + "grad_norm": 2.717449903488159, + "learning_rate": 1.69802254639994e-05, + "loss": 0.7249, + "step": 2034 + }, + { + "epoch": 0.09701332443448621, + "grad_norm": 3.706339120864868, + "learning_rate": 1.6977412199383872e-05, + "loss": 0.9153, + "step": 2035 + }, + { + "epoch": 0.09706099682978571, + "grad_norm": 2.9623730182647705, + "learning_rate": 1.6974597858225502e-05, + "loss": 0.9144, + "step": 2036 + }, + { + "epoch": 0.09710866922508521, + "grad_norm": 1.8177005052566528, + "learning_rate": 1.69717824409585e-05, + "loss": 0.8277, + "step": 2037 + }, + { + "epoch": 0.09715634162038472, + "grad_norm": 2.097759485244751, + "learning_rate": 1.6968965948017266e-05, + "loss": 0.7698, + "step": 2038 + }, + { + "epoch": 0.09720401401568422, + "grad_norm": 1.4746730327606201, + "learning_rate": 1.696614837983635e-05, + "loss": 0.6789, + "step": 2039 + }, + { + "epoch": 0.09725168641098372, + "grad_norm": 1.5426316261291504, + "learning_rate": 1.696332973685048e-05, + "loss": 0.6031, + "step": 2040 + }, + { + "epoch": 0.09729935880628322, + "grad_norm": 2.9810078144073486, + "learning_rate": 1.696051001949454e-05, + "loss": 0.5428, + "step": 2041 + }, + { + "epoch": 0.09734703120158272, + "grad_norm": 2.5323169231414795, + "learning_rate": 1.6957689228203583e-05, + "loss": 0.7858, + "step": 2042 + }, + { + "epoch": 0.09739470359688222, + "grad_norm": 2.825571060180664, + "learning_rate": 1.6954867363412827e-05, + "loss": 1.1447, + "step": 2043 + }, + { + "epoch": 0.09744237599218172, + "grad_norm": 1.7198448181152344, + 
"learning_rate": 1.695204442555766e-05, + "loss": 0.7294, + "step": 2044 + }, + { + "epoch": 0.09749004838748122, + "grad_norm": 1.7498078346252441, + "learning_rate": 1.6949220415073627e-05, + "loss": 0.9041, + "step": 2045 + }, + { + "epoch": 0.09753772078278074, + "grad_norm": 1.821815848350525, + "learning_rate": 1.6946395332396447e-05, + "loss": 0.9126, + "step": 2046 + }, + { + "epoch": 0.09758539317808024, + "grad_norm": 1.6222103834152222, + "learning_rate": 1.6943569177962005e-05, + "loss": 0.9846, + "step": 2047 + }, + { + "epoch": 0.09763306557337974, + "grad_norm": 1.535952091217041, + "learning_rate": 1.6940741952206342e-05, + "loss": 0.6679, + "step": 2048 + }, + { + "epoch": 0.09768073796867924, + "grad_norm": 1.5526154041290283, + "learning_rate": 1.693791365556567e-05, + "loss": 0.7836, + "step": 2049 + }, + { + "epoch": 0.09772841036397874, + "grad_norm": 1.3706446886062622, + "learning_rate": 1.6935084288476365e-05, + "loss": 0.6875, + "step": 2050 + }, + { + "epoch": 0.09777608275927824, + "grad_norm": 1.430282473564148, + "learning_rate": 1.693225385137498e-05, + "loss": 0.7229, + "step": 2051 + }, + { + "epoch": 0.09782375515457774, + "grad_norm": 1.5612531900405884, + "learning_rate": 1.692942234469821e-05, + "loss": 0.7721, + "step": 2052 + }, + { + "epoch": 0.09787142754987724, + "grad_norm": 1.980994701385498, + "learning_rate": 1.692658976888293e-05, + "loss": 0.6863, + "step": 2053 + }, + { + "epoch": 0.09791909994517675, + "grad_norm": 1.9815247058868408, + "learning_rate": 1.6923756124366184e-05, + "loss": 0.9042, + "step": 2054 + }, + { + "epoch": 0.09796677234047625, + "grad_norm": 1.246296763420105, + "learning_rate": 1.6920921411585164e-05, + "loss": 0.6666, + "step": 2055 + }, + { + "epoch": 0.09801444473577575, + "grad_norm": 1.1949427127838135, + "learning_rate": 1.691808563097724e-05, + "loss": 0.672, + "step": 2056 + }, + { + "epoch": 0.09806211713107525, + "grad_norm": 1.914755940437317, + "learning_rate": 1.691524878297995e-05, + "loss": 0.9311, + "step": 2057 + }, + { + "epoch": 0.09810978952637475, + "grad_norm": 1.8586196899414062, + "learning_rate": 1.6912410868030987e-05, + "loss": 0.6847, + "step": 2058 + }, + { + "epoch": 0.09815746192167425, + "grad_norm": 1.937045931816101, + "learning_rate": 1.6909571886568206e-05, + "loss": 0.5415, + "step": 2059 + }, + { + "epoch": 0.09820513431697375, + "grad_norm": 1.8809586763381958, + "learning_rate": 1.690673183902964e-05, + "loss": 0.566, + "step": 2060 + }, + { + "epoch": 0.09825280671227325, + "grad_norm": 1.4162172079086304, + "learning_rate": 1.690389072585348e-05, + "loss": 0.7743, + "step": 2061 + }, + { + "epoch": 0.09830047910757277, + "grad_norm": 2.278230905532837, + "learning_rate": 1.6901048547478073e-05, + "loss": 1.1739, + "step": 2062 + }, + { + "epoch": 0.09834815150287227, + "grad_norm": 2.1912760734558105, + "learning_rate": 1.6898205304341947e-05, + "loss": 0.8375, + "step": 2063 + }, + { + "epoch": 0.09839582389817177, + "grad_norm": 1.8050661087036133, + "learning_rate": 1.6895360996883777e-05, + "loss": 0.7892, + "step": 2064 + }, + { + "epoch": 0.09844349629347127, + "grad_norm": 2.9720239639282227, + "learning_rate": 1.6892515625542413e-05, + "loss": 1.4879, + "step": 2065 + }, + { + "epoch": 0.09849116868877077, + "grad_norm": 1.624472975730896, + "learning_rate": 1.688966919075687e-05, + "loss": 0.7702, + "step": 2066 + }, + { + "epoch": 0.09853884108407027, + "grad_norm": 2.010019063949585, + "learning_rate": 1.6886821692966314e-05, + "loss": 0.9441, + "step": 2067 + }, + 
{ + "epoch": 0.09858651347936977, + "grad_norm": 1.6731029748916626, + "learning_rate": 1.68839731326101e-05, + "loss": 0.6575, + "step": 2068 + }, + { + "epoch": 0.09863418587466927, + "grad_norm": 1.3605769872665405, + "learning_rate": 1.6881123510127716e-05, + "loss": 0.9946, + "step": 2069 + }, + { + "epoch": 0.09868185826996878, + "grad_norm": 1.8632115125656128, + "learning_rate": 1.687827282595884e-05, + "loss": 0.846, + "step": 2070 + }, + { + "epoch": 0.09872953066526828, + "grad_norm": 1.5721104145050049, + "learning_rate": 1.68754210805433e-05, + "loss": 0.7917, + "step": 2071 + }, + { + "epoch": 0.09877720306056778, + "grad_norm": 1.5502465963363647, + "learning_rate": 1.6872568274321087e-05, + "loss": 0.7017, + "step": 2072 + }, + { + "epoch": 0.09882487545586728, + "grad_norm": 2.1058290004730225, + "learning_rate": 1.6869714407732364e-05, + "loss": 0.6976, + "step": 2073 + }, + { + "epoch": 0.09887254785116678, + "grad_norm": 1.461596131324768, + "learning_rate": 1.6866859481217453e-05, + "loss": 0.7409, + "step": 2074 + }, + { + "epoch": 0.09892022024646628, + "grad_norm": 2.0125601291656494, + "learning_rate": 1.686400349521684e-05, + "loss": 0.4736, + "step": 2075 + }, + { + "epoch": 0.09896789264176578, + "grad_norm": 1.7437502145767212, + "learning_rate": 1.6861146450171177e-05, + "loss": 0.8363, + "step": 2076 + }, + { + "epoch": 0.09901556503706528, + "grad_norm": 1.3860204219818115, + "learning_rate": 1.6858288346521265e-05, + "loss": 0.5161, + "step": 2077 + }, + { + "epoch": 0.0990632374323648, + "grad_norm": 1.4488458633422852, + "learning_rate": 1.685542918470809e-05, + "loss": 0.5047, + "step": 2078 + }, + { + "epoch": 0.0991109098276643, + "grad_norm": 1.8164137601852417, + "learning_rate": 1.6852568965172794e-05, + "loss": 0.936, + "step": 2079 + }, + { + "epoch": 0.0991585822229638, + "grad_norm": 1.4916291236877441, + "learning_rate": 1.684970768835667e-05, + "loss": 0.9029, + "step": 2080 + }, + { + "epoch": 0.0992062546182633, + "grad_norm": 1.8367319107055664, + "learning_rate": 1.684684535470119e-05, + "loss": 0.8575, + "step": 2081 + }, + { + "epoch": 0.0992539270135628, + "grad_norm": 3.2794089317321777, + "learning_rate": 1.6843981964647976e-05, + "loss": 0.7888, + "step": 2082 + }, + { + "epoch": 0.0993015994088623, + "grad_norm": 1.320497989654541, + "learning_rate": 1.684111751863883e-05, + "loss": 0.7774, + "step": 2083 + }, + { + "epoch": 0.0993492718041618, + "grad_norm": 3.0137689113616943, + "learning_rate": 1.68382520171157e-05, + "loss": 0.5132, + "step": 2084 + }, + { + "epoch": 0.0993969441994613, + "grad_norm": 2.3612728118896484, + "learning_rate": 1.68353854605207e-05, + "loss": 0.9144, + "step": 2085 + }, + { + "epoch": 0.09944461659476081, + "grad_norm": 2.156235694885254, + "learning_rate": 1.683251784929612e-05, + "loss": 0.6677, + "step": 2086 + }, + { + "epoch": 0.09949228899006031, + "grad_norm": 9.895931243896484, + "learning_rate": 1.6829649183884395e-05, + "loss": 0.766, + "step": 2087 + }, + { + "epoch": 0.09953996138535981, + "grad_norm": 1.4509943723678589, + "learning_rate": 1.6826779464728132e-05, + "loss": 0.7959, + "step": 2088 + }, + { + "epoch": 0.09958763378065931, + "grad_norm": 1.1089597940444946, + "learning_rate": 1.68239086922701e-05, + "loss": 0.4899, + "step": 2089 + }, + { + "epoch": 0.09963530617595881, + "grad_norm": 1.2637901306152344, + "learning_rate": 1.6821036866953226e-05, + "loss": 0.9288, + "step": 2090 + }, + { + "epoch": 0.09968297857125831, + "grad_norm": 1.168062448501587, + "learning_rate": 
1.681816398922061e-05, + "loss": 0.7841, + "step": 2091 + }, + { + "epoch": 0.09973065096655781, + "grad_norm": 5.275248050689697, + "learning_rate": 1.6815290059515504e-05, + "loss": 1.3262, + "step": 2092 + }, + { + "epoch": 0.09977832336185731, + "grad_norm": 1.2903294563293457, + "learning_rate": 1.6812415078281324e-05, + "loss": 0.5238, + "step": 2093 + }, + { + "epoch": 0.09982599575715682, + "grad_norm": 1.475982427597046, + "learning_rate": 1.6809539045961653e-05, + "loss": 0.6875, + "step": 2094 + }, + { + "epoch": 0.09987366815245632, + "grad_norm": 2.128023862838745, + "learning_rate": 1.6806661963000234e-05, + "loss": 1.2745, + "step": 2095 + }, + { + "epoch": 0.09992134054775582, + "grad_norm": 1.3259506225585938, + "learning_rate": 1.6803783829840967e-05, + "loss": 0.8099, + "step": 2096 + }, + { + "epoch": 0.09996901294305532, + "grad_norm": 1.4391194581985474, + "learning_rate": 1.6800904646927923e-05, + "loss": 0.721, + "step": 2097 + }, + { + "epoch": 0.10001668533835482, + "grad_norm": 1.266096591949463, + "learning_rate": 1.679802441470532e-05, + "loss": 0.7507, + "step": 2098 + }, + { + "epoch": 0.10006435773365432, + "grad_norm": 1.0737955570220947, + "learning_rate": 1.6795143133617562e-05, + "loss": 0.9599, + "step": 2099 + }, + { + "epoch": 0.10011203012895382, + "grad_norm": 1.1342597007751465, + "learning_rate": 1.6792260804109196e-05, + "loss": 0.7678, + "step": 2100 + }, + { + "epoch": 0.10015970252425332, + "grad_norm": 4.525983810424805, + "learning_rate": 1.6789377426624935e-05, + "loss": 0.7239, + "step": 2101 + }, + { + "epoch": 0.10020737491955284, + "grad_norm": 1.9952508211135864, + "learning_rate": 1.678649300160965e-05, + "loss": 0.5961, + "step": 2102 + }, + { + "epoch": 0.10025504731485234, + "grad_norm": 1.8149811029434204, + "learning_rate": 1.6783607529508382e-05, + "loss": 0.8781, + "step": 2103 + }, + { + "epoch": 0.10030271971015184, + "grad_norm": 2.002239465713501, + "learning_rate": 1.6780721010766335e-05, + "loss": 0.708, + "step": 2104 + }, + { + "epoch": 0.10035039210545134, + "grad_norm": 1.3707174062728882, + "learning_rate": 1.677783344582886e-05, + "loss": 0.3173, + "step": 2105 + }, + { + "epoch": 0.10039806450075084, + "grad_norm": 1.379638910293579, + "learning_rate": 1.6774944835141484e-05, + "loss": 0.7963, + "step": 2106 + }, + { + "epoch": 0.10044573689605034, + "grad_norm": 1.7050049304962158, + "learning_rate": 1.6772055179149886e-05, + "loss": 1.0089, + "step": 2107 + }, + { + "epoch": 0.10049340929134984, + "grad_norm": 1.4809306859970093, + "learning_rate": 1.676916447829992e-05, + "loss": 0.8716, + "step": 2108 + }, + { + "epoch": 0.10054108168664934, + "grad_norm": 1.2359148263931274, + "learning_rate": 1.6766272733037575e-05, + "loss": 0.6663, + "step": 2109 + }, + { + "epoch": 0.10058875408194885, + "grad_norm": 2.501166582107544, + "learning_rate": 1.676337994380903e-05, + "loss": 0.8092, + "step": 2110 + }, + { + "epoch": 0.10063642647724835, + "grad_norm": 2.328268051147461, + "learning_rate": 1.6760486111060607e-05, + "loss": 0.5132, + "step": 2111 + }, + { + "epoch": 0.10068409887254785, + "grad_norm": 2.1387557983398438, + "learning_rate": 1.67575912352388e-05, + "loss": 0.8506, + "step": 2112 + }, + { + "epoch": 0.10073177126784735, + "grad_norm": 5.4658355712890625, + "learning_rate": 1.6754695316790255e-05, + "loss": 0.7039, + "step": 2113 + }, + { + "epoch": 0.10077944366314685, + "grad_norm": 2.01517915725708, + "learning_rate": 1.675179835616178e-05, + "loss": 0.5598, + "step": 2114 + }, + { + "epoch": 
0.10082711605844635, + "grad_norm": 0.8727918267250061, + "learning_rate": 1.674890035380035e-05, + "loss": 0.236, + "step": 2115 + }, + { + "epoch": 0.10087478845374585, + "grad_norm": 3.1234583854675293, + "learning_rate": 1.6746001310153095e-05, + "loss": 0.4793, + "step": 2116 + }, + { + "epoch": 0.10092246084904535, + "grad_norm": 2.680319309234619, + "learning_rate": 1.674310122566731e-05, + "loss": 0.5716, + "step": 2117 + }, + { + "epoch": 0.10097013324434487, + "grad_norm": 2.324587821960449, + "learning_rate": 1.6740200100790445e-05, + "loss": 0.5267, + "step": 2118 + }, + { + "epoch": 0.10101780563964437, + "grad_norm": 2.781690835952759, + "learning_rate": 1.673729793597011e-05, + "loss": 0.5512, + "step": 2119 + }, + { + "epoch": 0.10106547803494387, + "grad_norm": 1.82707679271698, + "learning_rate": 1.6734394731654094e-05, + "loss": 0.7171, + "step": 2120 + }, + { + "epoch": 0.10111315043024337, + "grad_norm": 3.9949862957000732, + "learning_rate": 1.6731490488290316e-05, + "loss": 0.7294, + "step": 2121 + }, + { + "epoch": 0.10116082282554287, + "grad_norm": 1.492210865020752, + "learning_rate": 1.672858520632688e-05, + "loss": 0.6102, + "step": 2122 + }, + { + "epoch": 0.10120849522084237, + "grad_norm": 1.8430193662643433, + "learning_rate": 1.6725678886212034e-05, + "loss": 0.706, + "step": 2123 + }, + { + "epoch": 0.10125616761614187, + "grad_norm": 1.5846210718154907, + "learning_rate": 1.67227715283942e-05, + "loss": 0.5812, + "step": 2124 + }, + { + "epoch": 0.10130384001144138, + "grad_norm": 1.7634451389312744, + "learning_rate": 1.6719863133321947e-05, + "loss": 0.5163, + "step": 2125 + }, + { + "epoch": 0.10135151240674088, + "grad_norm": 1.6493277549743652, + "learning_rate": 1.6716953701444014e-05, + "loss": 0.1924, + "step": 2126 + }, + { + "epoch": 0.10139918480204038, + "grad_norm": 3.5931055545806885, + "learning_rate": 1.6714043233209296e-05, + "loss": 1.0592, + "step": 2127 + }, + { + "epoch": 0.10144685719733988, + "grad_norm": 2.2461369037628174, + "learning_rate": 1.6711131729066853e-05, + "loss": 0.5719, + "step": 2128 + }, + { + "epoch": 0.10149452959263938, + "grad_norm": 6.407316207885742, + "learning_rate": 1.6708219189465894e-05, + "loss": 0.9888, + "step": 2129 + }, + { + "epoch": 0.10154220198793888, + "grad_norm": 1.3646303415298462, + "learning_rate": 1.670530561485579e-05, + "loss": 0.362, + "step": 2130 + }, + { + "epoch": 0.10158987438323838, + "grad_norm": 2.5397655963897705, + "learning_rate": 1.6702391005686088e-05, + "loss": 0.6678, + "step": 2131 + }, + { + "epoch": 0.10163754677853788, + "grad_norm": 1.4877567291259766, + "learning_rate": 1.669947536240647e-05, + "loss": 0.6829, + "step": 2132 + }, + { + "epoch": 0.1016852191738374, + "grad_norm": 1.0454562902450562, + "learning_rate": 1.6696558685466793e-05, + "loss": 0.65, + "step": 2133 + }, + { + "epoch": 0.1017328915691369, + "grad_norm": 2.704338550567627, + "learning_rate": 1.6693640975317078e-05, + "loss": 0.8758, + "step": 2134 + }, + { + "epoch": 0.1017805639644364, + "grad_norm": 3.273530960083008, + "learning_rate": 1.669072223240749e-05, + "loss": 0.916, + "step": 2135 + }, + { + "epoch": 0.1018282363597359, + "grad_norm": 1.9205434322357178, + "learning_rate": 1.668780245718836e-05, + "loss": 1.0245, + "step": 2136 + }, + { + "epoch": 0.1018759087550354, + "grad_norm": 1.2210420370101929, + "learning_rate": 1.6684881650110186e-05, + "loss": 0.7859, + "step": 2137 + }, + { + "epoch": 0.1019235811503349, + "grad_norm": 1.1936390399932861, + "learning_rate": 
1.668195981162361e-05, + "loss": 0.8466, + "step": 2138 + }, + { + "epoch": 0.1019712535456344, + "grad_norm": 1.6571040153503418, + "learning_rate": 1.667903694217945e-05, + "loss": 0.7082, + "step": 2139 + }, + { + "epoch": 0.1020189259409339, + "grad_norm": 2.0592868328094482, + "learning_rate": 1.667611304222867e-05, + "loss": 0.9833, + "step": 2140 + }, + { + "epoch": 0.10206659833623341, + "grad_norm": 3.235091209411621, + "learning_rate": 1.6673188112222394e-05, + "loss": 0.3433, + "step": 2141 + }, + { + "epoch": 0.10211427073153291, + "grad_norm": 2.5346899032592773, + "learning_rate": 1.6670262152611916e-05, + "loss": 0.6287, + "step": 2142 + }, + { + "epoch": 0.10216194312683241, + "grad_norm": 1.5297119617462158, + "learning_rate": 1.6667335163848682e-05, + "loss": 0.8937, + "step": 2143 + }, + { + "epoch": 0.10220961552213191, + "grad_norm": 3.354051351547241, + "learning_rate": 1.6664407146384287e-05, + "loss": 1.0068, + "step": 2144 + }, + { + "epoch": 0.10225728791743141, + "grad_norm": 1.9681328535079956, + "learning_rate": 1.6661478100670502e-05, + "loss": 0.7261, + "step": 2145 + }, + { + "epoch": 0.10230496031273091, + "grad_norm": 1.2717812061309814, + "learning_rate": 1.6658548027159245e-05, + "loss": 0.7001, + "step": 2146 + }, + { + "epoch": 0.10235263270803041, + "grad_norm": 14.975186347961426, + "learning_rate": 1.6655616926302594e-05, + "loss": 1.1774, + "step": 2147 + }, + { + "epoch": 0.10240030510332991, + "grad_norm": 3.2782044410705566, + "learning_rate": 1.6652684798552793e-05, + "loss": 0.7452, + "step": 2148 + }, + { + "epoch": 0.10244797749862943, + "grad_norm": 2.828608274459839, + "learning_rate": 1.664975164436224e-05, + "loss": 0.9119, + "step": 2149 + }, + { + "epoch": 0.10249564989392893, + "grad_norm": 1.7152819633483887, + "learning_rate": 1.6646817464183485e-05, + "loss": 0.7362, + "step": 2150 + }, + { + "epoch": 0.10254332228922843, + "grad_norm": 2.272935390472412, + "learning_rate": 1.6643882258469247e-05, + "loss": 0.6275, + "step": 2151 + }, + { + "epoch": 0.10259099468452793, + "grad_norm": 3.6414568424224854, + "learning_rate": 1.6640946027672395e-05, + "loss": 0.288, + "step": 2152 + }, + { + "epoch": 0.10263866707982743, + "grad_norm": 1.2086769342422485, + "learning_rate": 1.6638008772245956e-05, + "loss": 0.7764, + "step": 2153 + }, + { + "epoch": 0.10268633947512693, + "grad_norm": 1.8948204517364502, + "learning_rate": 1.663507049264312e-05, + "loss": 0.7969, + "step": 2154 + }, + { + "epoch": 0.10273401187042643, + "grad_norm": 1.3868238925933838, + "learning_rate": 1.663213118931724e-05, + "loss": 0.4417, + "step": 2155 + }, + { + "epoch": 0.10278168426572593, + "grad_norm": 2.0186548233032227, + "learning_rate": 1.6629190862721813e-05, + "loss": 0.9057, + "step": 2156 + }, + { + "epoch": 0.10282935666102544, + "grad_norm": 1.5707331895828247, + "learning_rate": 1.6626249513310505e-05, + "loss": 0.6445, + "step": 2157 + }, + { + "epoch": 0.10287702905632494, + "grad_norm": 2.2689332962036133, + "learning_rate": 1.662330714153713e-05, + "loss": 0.4731, + "step": 2158 + }, + { + "epoch": 0.10292470145162444, + "grad_norm": 4.399372100830078, + "learning_rate": 1.6620363747855675e-05, + "loss": 1.1345, + "step": 2159 + }, + { + "epoch": 0.10297237384692394, + "grad_norm": 2.119353771209717, + "learning_rate": 1.6617419332720267e-05, + "loss": 0.4877, + "step": 2160 + }, + { + "epoch": 0.10302004624222344, + "grad_norm": 1.2800583839416504, + "learning_rate": 1.6614473896585206e-05, + "loss": 0.7413, + "step": 2161 + }, + { + 
"epoch": 0.10306771863752294, + "grad_norm": 1.5062272548675537, + "learning_rate": 1.6611527439904934e-05, + "loss": 0.6188, + "step": 2162 + }, + { + "epoch": 0.10311539103282244, + "grad_norm": 1.4648534059524536, + "learning_rate": 1.6608579963134067e-05, + "loss": 0.6466, + "step": 2163 + }, + { + "epoch": 0.10316306342812194, + "grad_norm": 4.780557632446289, + "learning_rate": 1.6605631466727365e-05, + "loss": 0.9409, + "step": 2164 + }, + { + "epoch": 0.10321073582342145, + "grad_norm": 1.4820151329040527, + "learning_rate": 1.6602681951139752e-05, + "loss": 0.5981, + "step": 2165 + }, + { + "epoch": 0.10325840821872095, + "grad_norm": 1.5938807725906372, + "learning_rate": 1.659973141682631e-05, + "loss": 0.6836, + "step": 2166 + }, + { + "epoch": 0.10330608061402045, + "grad_norm": 1.9833928346633911, + "learning_rate": 1.6596779864242274e-05, + "loss": 0.6456, + "step": 2167 + }, + { + "epoch": 0.10335375300931995, + "grad_norm": 5.635646820068359, + "learning_rate": 1.659382729384304e-05, + "loss": 1.2212, + "step": 2168 + }, + { + "epoch": 0.10340142540461945, + "grad_norm": 1.8882306814193726, + "learning_rate": 1.6590873706084158e-05, + "loss": 1.2086, + "step": 2169 + }, + { + "epoch": 0.10344909779991895, + "grad_norm": 8.547324180603027, + "learning_rate": 1.6587919101421333e-05, + "loss": 0.6105, + "step": 2170 + }, + { + "epoch": 0.10349677019521845, + "grad_norm": 1.314596176147461, + "learning_rate": 1.6584963480310433e-05, + "loss": 0.6726, + "step": 2171 + }, + { + "epoch": 0.10354444259051795, + "grad_norm": 1.857081651687622, + "learning_rate": 1.658200684320748e-05, + "loss": 0.6906, + "step": 2172 + }, + { + "epoch": 0.10359211498581747, + "grad_norm": 1.2799304723739624, + "learning_rate": 1.6579049190568656e-05, + "loss": 0.7751, + "step": 2173 + }, + { + "epoch": 0.10363978738111697, + "grad_norm": 1.6420232057571411, + "learning_rate": 1.6576090522850292e-05, + "loss": 0.9214, + "step": 2174 + }, + { + "epoch": 0.10368745977641647, + "grad_norm": 0.7685630917549133, + "learning_rate": 1.657313084050888e-05, + "loss": 0.2469, + "step": 2175 + }, + { + "epoch": 0.10373513217171597, + "grad_norm": 1.9455807209014893, + "learning_rate": 1.6570170144001067e-05, + "loss": 0.7618, + "step": 2176 + }, + { + "epoch": 0.10378280456701547, + "grad_norm": 4.829035758972168, + "learning_rate": 1.6567208433783666e-05, + "loss": 1.111, + "step": 2177 + }, + { + "epoch": 0.10383047696231497, + "grad_norm": 1.4978998899459839, + "learning_rate": 1.656424571031363e-05, + "loss": 0.4346, + "step": 2178 + }, + { + "epoch": 0.10387814935761447, + "grad_norm": 4.826923847198486, + "learning_rate": 1.656128197404808e-05, + "loss": 0.4021, + "step": 2179 + }, + { + "epoch": 0.10392582175291397, + "grad_norm": 2.096256971359253, + "learning_rate": 1.655831722544429e-05, + "loss": 0.3349, + "step": 2180 + }, + { + "epoch": 0.10397349414821348, + "grad_norm": 1.3158085346221924, + "learning_rate": 1.655535146495969e-05, + "loss": 0.5099, + "step": 2181 + }, + { + "epoch": 0.10402116654351298, + "grad_norm": 3.5498571395874023, + "learning_rate": 1.655238469305186e-05, + "loss": 0.8809, + "step": 2182 + }, + { + "epoch": 0.10406883893881248, + "grad_norm": 1.8730432987213135, + "learning_rate": 1.6549416910178554e-05, + "loss": 0.8471, + "step": 2183 + }, + { + "epoch": 0.10411651133411198, + "grad_norm": 1.8743762969970703, + "learning_rate": 1.6546448116797664e-05, + "loss": 1.0615, + "step": 2184 + }, + { + "epoch": 0.10416418372941148, + "grad_norm": 2.668163299560547, + 
"learning_rate": 1.6543478313367244e-05, + "loss": 1.2889, + "step": 2185 + }, + { + "epoch": 0.10421185612471098, + "grad_norm": 2.1281440258026123, + "learning_rate": 1.6540507500345507e-05, + "loss": 0.8726, + "step": 2186 + }, + { + "epoch": 0.10425952852001048, + "grad_norm": 1.0205135345458984, + "learning_rate": 1.6537535678190815e-05, + "loss": 0.4469, + "step": 2187 + }, + { + "epoch": 0.10430720091530998, + "grad_norm": 1.752551794052124, + "learning_rate": 1.6534562847361693e-05, + "loss": 0.8005, + "step": 2188 + }, + { + "epoch": 0.1043548733106095, + "grad_norm": 1.29226553440094, + "learning_rate": 1.6531589008316816e-05, + "loss": 0.9255, + "step": 2189 + }, + { + "epoch": 0.104402545705909, + "grad_norm": 1.2720071077346802, + "learning_rate": 1.6528614161515015e-05, + "loss": 0.7026, + "step": 2190 + }, + { + "epoch": 0.1044502181012085, + "grad_norm": 9.173748016357422, + "learning_rate": 1.6525638307415284e-05, + "loss": 1.222, + "step": 2191 + }, + { + "epoch": 0.104497890496508, + "grad_norm": 1.315583348274231, + "learning_rate": 1.6522661446476762e-05, + "loss": 0.7529, + "step": 2192 + }, + { + "epoch": 0.1045455628918075, + "grad_norm": 1.7935317754745483, + "learning_rate": 1.651968357915875e-05, + "loss": 1.0363, + "step": 2193 + }, + { + "epoch": 0.104593235287107, + "grad_norm": 1.6630464792251587, + "learning_rate": 1.6516704705920702e-05, + "loss": 0.8377, + "step": 2194 + }, + { + "epoch": 0.1046409076824065, + "grad_norm": 1.3903051614761353, + "learning_rate": 1.6513724827222225e-05, + "loss": 0.598, + "step": 2195 + }, + { + "epoch": 0.104688580077706, + "grad_norm": 1.5116666555404663, + "learning_rate": 1.6510743943523084e-05, + "loss": 0.864, + "step": 2196 + }, + { + "epoch": 0.10473625247300551, + "grad_norm": 1.4434409141540527, + "learning_rate": 1.6507762055283202e-05, + "loss": 0.8152, + "step": 2197 + }, + { + "epoch": 0.10478392486830501, + "grad_norm": 1.8115547895431519, + "learning_rate": 1.6504779162962655e-05, + "loss": 0.6913, + "step": 2198 + }, + { + "epoch": 0.10483159726360451, + "grad_norm": 1.300264835357666, + "learning_rate": 1.6501795267021666e-05, + "loss": 0.8378, + "step": 2199 + }, + { + "epoch": 0.10487926965890401, + "grad_norm": 1.5797349214553833, + "learning_rate": 1.6498810367920622e-05, + "loss": 0.7589, + "step": 2200 + }, + { + "epoch": 0.10492694205420351, + "grad_norm": 2.0074970722198486, + "learning_rate": 1.6495824466120067e-05, + "loss": 0.8529, + "step": 2201 + }, + { + "epoch": 0.10497461444950301, + "grad_norm": 2.0940513610839844, + "learning_rate": 1.649283756208069e-05, + "loss": 0.5206, + "step": 2202 + }, + { + "epoch": 0.10502228684480251, + "grad_norm": 2.029885768890381, + "learning_rate": 1.6489849656263336e-05, + "loss": 1.0338, + "step": 2203 + }, + { + "epoch": 0.10506995924010201, + "grad_norm": 2.2008490562438965, + "learning_rate": 1.6486860749129014e-05, + "loss": 0.7554, + "step": 2204 + }, + { + "epoch": 0.10511763163540153, + "grad_norm": 2.8390586376190186, + "learning_rate": 1.6483870841138883e-05, + "loss": 1.3196, + "step": 2205 + }, + { + "epoch": 0.10516530403070103, + "grad_norm": 1.9962180852890015, + "learning_rate": 1.648087993275425e-05, + "loss": 0.8282, + "step": 2206 + }, + { + "epoch": 0.10521297642600053, + "grad_norm": 1.681196928024292, + "learning_rate": 1.6477888024436586e-05, + "loss": 0.5429, + "step": 2207 + }, + { + "epoch": 0.10526064882130003, + "grad_norm": 2.6438839435577393, + "learning_rate": 1.6474895116647506e-05, + "loss": 0.635, + "step": 2208 + }, + { + 
"epoch": 0.10530832121659953, + "grad_norm": 1.4192218780517578, + "learning_rate": 1.647190120984879e-05, + "loss": 0.5541, + "step": 2209 + }, + { + "epoch": 0.10535599361189903, + "grad_norm": 2.140742778778076, + "learning_rate": 1.6468906304502365e-05, + "loss": 1.0936, + "step": 2210 + }, + { + "epoch": 0.10540366600719853, + "grad_norm": 2.0596237182617188, + "learning_rate": 1.6465910401070312e-05, + "loss": 0.8405, + "step": 2211 + }, + { + "epoch": 0.10545133840249803, + "grad_norm": 2.1606948375701904, + "learning_rate": 1.6462913500014872e-05, + "loss": 0.5889, + "step": 2212 + }, + { + "epoch": 0.10549901079779754, + "grad_norm": 1.2513320446014404, + "learning_rate": 1.6459915601798436e-05, + "loss": 0.5982, + "step": 2213 + }, + { + "epoch": 0.10554668319309704, + "grad_norm": 4.00132942199707, + "learning_rate": 1.6456916706883542e-05, + "loss": 1.0461, + "step": 2214 + }, + { + "epoch": 0.10559435558839654, + "grad_norm": 1.846450686454773, + "learning_rate": 1.64539168157329e-05, + "loss": 0.6549, + "step": 2215 + }, + { + "epoch": 0.10564202798369604, + "grad_norm": 1.7543492317199707, + "learning_rate": 1.645091592880935e-05, + "loss": 0.879, + "step": 2216 + }, + { + "epoch": 0.10568970037899554, + "grad_norm": 1.6810600757598877, + "learning_rate": 1.6447914046575906e-05, + "loss": 0.901, + "step": 2217 + }, + { + "epoch": 0.10573737277429504, + "grad_norm": 1.2927902936935425, + "learning_rate": 1.6444911169495727e-05, + "loss": 0.713, + "step": 2218 + }, + { + "epoch": 0.10578504516959454, + "grad_norm": 1.2742785215377808, + "learning_rate": 1.644190729803212e-05, + "loss": 0.7169, + "step": 2219 + }, + { + "epoch": 0.10583271756489405, + "grad_norm": 0.8367409706115723, + "learning_rate": 1.6438902432648558e-05, + "loss": 0.3573, + "step": 2220 + }, + { + "epoch": 0.10588038996019355, + "grad_norm": 1.2617907524108887, + "learning_rate": 1.643589657380866e-05, + "loss": 0.6326, + "step": 2221 + }, + { + "epoch": 0.10592806235549306, + "grad_norm": 1.3132867813110352, + "learning_rate": 1.6432889721976196e-05, + "loss": 0.7305, + "step": 2222 + }, + { + "epoch": 0.10597573475079256, + "grad_norm": 1.303444266319275, + "learning_rate": 1.6429881877615094e-05, + "loss": 0.5563, + "step": 2223 + }, + { + "epoch": 0.10602340714609206, + "grad_norm": 2.7119174003601074, + "learning_rate": 1.642687304118943e-05, + "loss": 0.8568, + "step": 2224 + }, + { + "epoch": 0.10607107954139156, + "grad_norm": 1.5195167064666748, + "learning_rate": 1.6423863213163443e-05, + "loss": 0.9413, + "step": 2225 + }, + { + "epoch": 0.10611875193669106, + "grad_norm": 1.1507636308670044, + "learning_rate": 1.642085239400152e-05, + "loss": 0.5005, + "step": 2226 + }, + { + "epoch": 0.10616642433199056, + "grad_norm": 1.988328218460083, + "learning_rate": 1.6417840584168185e-05, + "loss": 0.4961, + "step": 2227 + }, + { + "epoch": 0.10621409672729007, + "grad_norm": 1.5127321481704712, + "learning_rate": 1.6414827784128145e-05, + "loss": 0.6924, + "step": 2228 + }, + { + "epoch": 0.10626176912258957, + "grad_norm": 1.266319751739502, + "learning_rate": 1.6411813994346237e-05, + "loss": 0.6391, + "step": 2229 + }, + { + "epoch": 0.10630944151788907, + "grad_norm": 1.2617981433868408, + "learning_rate": 1.640879921528746e-05, + "loss": 0.8278, + "step": 2230 + }, + { + "epoch": 0.10635711391318857, + "grad_norm": 1.318671464920044, + "learning_rate": 1.640578344741696e-05, + "loss": 0.5378, + "step": 2231 + }, + { + "epoch": 0.10640478630848807, + "grad_norm": 3.89214825630188, + 
"learning_rate": 1.640276669120004e-05, + "loss": 0.8062, + "step": 2232 + }, + { + "epoch": 0.10645245870378757, + "grad_norm": 1.4413197040557861, + "learning_rate": 1.6399748947102154e-05, + "loss": 0.7213, + "step": 2233 + }, + { + "epoch": 0.10650013109908707, + "grad_norm": 2.349151849746704, + "learning_rate": 1.6396730215588913e-05, + "loss": 0.8822, + "step": 2234 + }, + { + "epoch": 0.10654780349438657, + "grad_norm": 1.390069603919983, + "learning_rate": 1.6393710497126075e-05, + "loss": 0.9154, + "step": 2235 + }, + { + "epoch": 0.10659547588968608, + "grad_norm": 1.2687277793884277, + "learning_rate": 1.6390689792179546e-05, + "loss": 0.7076, + "step": 2236 + }, + { + "epoch": 0.10664314828498558, + "grad_norm": 2.731628894805908, + "learning_rate": 1.6387668101215397e-05, + "loss": 0.9346, + "step": 2237 + }, + { + "epoch": 0.10669082068028508, + "grad_norm": 1.3221529722213745, + "learning_rate": 1.6384645424699835e-05, + "loss": 0.8724, + "step": 2238 + }, + { + "epoch": 0.10673849307558458, + "grad_norm": 1.3455795049667358, + "learning_rate": 1.638162176309924e-05, + "loss": 0.4084, + "step": 2239 + }, + { + "epoch": 0.10678616547088408, + "grad_norm": 1.3419333696365356, + "learning_rate": 1.637859711688012e-05, + "loss": 0.8905, + "step": 2240 + }, + { + "epoch": 0.10683383786618358, + "grad_norm": 2.7646846771240234, + "learning_rate": 1.637557148650915e-05, + "loss": 0.8457, + "step": 2241 + }, + { + "epoch": 0.10688151026148308, + "grad_norm": 6.046072006225586, + "learning_rate": 1.637254487245316e-05, + "loss": 0.5759, + "step": 2242 + }, + { + "epoch": 0.10692918265678258, + "grad_norm": 1.567334532737732, + "learning_rate": 1.636951727517912e-05, + "loss": 0.9092, + "step": 2243 + }, + { + "epoch": 0.1069768550520821, + "grad_norm": 2.4486351013183594, + "learning_rate": 1.6366488695154153e-05, + "loss": 1.2778, + "step": 2244 + }, + { + "epoch": 0.1070245274473816, + "grad_norm": 1.1488258838653564, + "learning_rate": 1.636345913284555e-05, + "loss": 0.6501, + "step": 2245 + }, + { + "epoch": 0.1070721998426811, + "grad_norm": 1.7302095890045166, + "learning_rate": 1.636042858872073e-05, + "loss": 0.942, + "step": 2246 + }, + { + "epoch": 0.1071198722379806, + "grad_norm": 2.4103667736053467, + "learning_rate": 1.6357397063247278e-05, + "loss": 0.8577, + "step": 2247 + }, + { + "epoch": 0.1071675446332801, + "grad_norm": 1.3807567358016968, + "learning_rate": 1.6354364556892926e-05, + "loss": 0.7379, + "step": 2248 + }, + { + "epoch": 0.1072152170285796, + "grad_norm": 1.0593301057815552, + "learning_rate": 1.6351331070125565e-05, + "loss": 0.2429, + "step": 2249 + }, + { + "epoch": 0.1072628894238791, + "grad_norm": 1.4784756898880005, + "learning_rate": 1.634829660341322e-05, + "loss": 1.0051, + "step": 2250 + }, + { + "epoch": 0.1073105618191786, + "grad_norm": 1.805490493774414, + "learning_rate": 1.6345261157224088e-05, + "loss": 0.9268, + "step": 2251 + }, + { + "epoch": 0.10735823421447811, + "grad_norm": 1.7306004762649536, + "learning_rate": 1.6342224732026503e-05, + "loss": 0.4825, + "step": 2252 + }, + { + "epoch": 0.10740590660977761, + "grad_norm": 1.1426695585250854, + "learning_rate": 1.6339187328288953e-05, + "loss": 0.5525, + "step": 2253 + }, + { + "epoch": 0.10745357900507711, + "grad_norm": 1.9162170886993408, + "learning_rate": 1.633614894648008e-05, + "loss": 0.584, + "step": 2254 + }, + { + "epoch": 0.10750125140037661, + "grad_norm": 3.6709189414978027, + "learning_rate": 1.6333109587068675e-05, + "loss": 0.808, + "step": 2255 + }, + { + 
"epoch": 0.10754892379567611, + "grad_norm": 1.954192876815796, + "learning_rate": 1.6330069250523675e-05, + "loss": 0.9295, + "step": 2256 + }, + { + "epoch": 0.10759659619097561, + "grad_norm": 1.5715235471725464, + "learning_rate": 1.6327027937314183e-05, + "loss": 0.7345, + "step": 2257 + }, + { + "epoch": 0.10764426858627511, + "grad_norm": 1.6055649518966675, + "learning_rate": 1.632398564790943e-05, + "loss": 0.9674, + "step": 2258 + }, + { + "epoch": 0.10769194098157461, + "grad_norm": 4.366003036499023, + "learning_rate": 1.632094238277882e-05, + "loss": 0.3948, + "step": 2259 + }, + { + "epoch": 0.10773961337687413, + "grad_norm": 1.1283854246139526, + "learning_rate": 1.631789814239189e-05, + "loss": 0.5601, + "step": 2260 + }, + { + "epoch": 0.10778728577217363, + "grad_norm": 1.6446889638900757, + "learning_rate": 1.631485292721834e-05, + "loss": 0.9231, + "step": 2261 + }, + { + "epoch": 0.10783495816747313, + "grad_norm": 1.4033557176589966, + "learning_rate": 1.6311806737728016e-05, + "loss": 0.6566, + "step": 2262 + }, + { + "epoch": 0.10788263056277263, + "grad_norm": 2.5270907878875732, + "learning_rate": 1.630875957439091e-05, + "loss": 0.5646, + "step": 2263 + }, + { + "epoch": 0.10793030295807213, + "grad_norm": 2.8287105560302734, + "learning_rate": 1.6305711437677166e-05, + "loss": 0.5786, + "step": 2264 + }, + { + "epoch": 0.10797797535337163, + "grad_norm": 2.170895576477051, + "learning_rate": 1.630266232805709e-05, + "loss": 0.9254, + "step": 2265 + }, + { + "epoch": 0.10802564774867113, + "grad_norm": 1.2984474897384644, + "learning_rate": 1.6299612246001118e-05, + "loss": 0.6007, + "step": 2266 + }, + { + "epoch": 0.10807332014397063, + "grad_norm": 2.290714740753174, + "learning_rate": 1.6296561191979847e-05, + "loss": 1.2458, + "step": 2267 + }, + { + "epoch": 0.10812099253927014, + "grad_norm": 1.086398959159851, + "learning_rate": 1.629350916646403e-05, + "loss": 0.5814, + "step": 2268 + }, + { + "epoch": 0.10816866493456964, + "grad_norm": 1.7387107610702515, + "learning_rate": 1.629045616992456e-05, + "loss": 0.5262, + "step": 2269 + }, + { + "epoch": 0.10821633732986914, + "grad_norm": 1.5796606540679932, + "learning_rate": 1.628740220283248e-05, + "loss": 0.8908, + "step": 2270 + }, + { + "epoch": 0.10826400972516864, + "grad_norm": 1.919259786605835, + "learning_rate": 1.6284347265658986e-05, + "loss": 0.6915, + "step": 2271 + }, + { + "epoch": 0.10831168212046814, + "grad_norm": 1.1323901414871216, + "learning_rate": 1.6281291358875427e-05, + "loss": 0.5345, + "step": 2272 + }, + { + "epoch": 0.10835935451576764, + "grad_norm": 2.3234915733337402, + "learning_rate": 1.6278234482953296e-05, + "loss": 1.365, + "step": 2273 + }, + { + "epoch": 0.10840702691106714, + "grad_norm": 3.0318877696990967, + "learning_rate": 1.627517663836424e-05, + "loss": 0.5771, + "step": 2274 + }, + { + "epoch": 0.10845469930636664, + "grad_norm": 2.742203712463379, + "learning_rate": 1.627211782558005e-05, + "loss": 0.3859, + "step": 2275 + }, + { + "epoch": 0.10850237170166616, + "grad_norm": 2.0642952919006348, + "learning_rate": 1.6269058045072664e-05, + "loss": 0.6774, + "step": 2276 + }, + { + "epoch": 0.10855004409696566, + "grad_norm": 2.5247888565063477, + "learning_rate": 1.626599729731419e-05, + "loss": 0.6729, + "step": 2277 + }, + { + "epoch": 0.10859771649226516, + "grad_norm": 1.4916763305664062, + "learning_rate": 1.626293558277685e-05, + "loss": 0.7336, + "step": 2278 + }, + { + "epoch": 0.10864538888756466, + "grad_norm": 1.771559715270996, + 
"learning_rate": 1.6259872901933052e-05, + "loss": 0.9819, + "step": 2279 + }, + { + "epoch": 0.10869306128286416, + "grad_norm": 1.8209635019302368, + "learning_rate": 1.6256809255255328e-05, + "loss": 0.6779, + "step": 2280 + }, + { + "epoch": 0.10874073367816366, + "grad_norm": 1.6749701499938965, + "learning_rate": 1.625374464321637e-05, + "loss": 0.8824, + "step": 2281 + }, + { + "epoch": 0.10878840607346316, + "grad_norm": 1.253259301185608, + "learning_rate": 1.6250679066289015e-05, + "loss": 0.4382, + "step": 2282 + }, + { + "epoch": 0.10883607846876266, + "grad_norm": 1.902420163154602, + "learning_rate": 1.624761252494625e-05, + "loss": 0.6701, + "step": 2283 + }, + { + "epoch": 0.10888375086406217, + "grad_norm": 1.1953613758087158, + "learning_rate": 1.6244545019661203e-05, + "loss": 0.6139, + "step": 2284 + }, + { + "epoch": 0.10893142325936167, + "grad_norm": 1.5875980854034424, + "learning_rate": 1.624147655090717e-05, + "loss": 0.6946, + "step": 2285 + }, + { + "epoch": 0.10897909565466117, + "grad_norm": 1.731711983680725, + "learning_rate": 1.6238407119157586e-05, + "loss": 0.5091, + "step": 2286 + }, + { + "epoch": 0.10902676804996067, + "grad_norm": 1.5690521001815796, + "learning_rate": 1.623533672488602e-05, + "loss": 0.8739, + "step": 2287 + }, + { + "epoch": 0.10907444044526017, + "grad_norm": 1.5090197324752808, + "learning_rate": 1.623226536856621e-05, + "loss": 0.7878, + "step": 2288 + }, + { + "epoch": 0.10912211284055967, + "grad_norm": 1.5838605165481567, + "learning_rate": 1.6229193050672036e-05, + "loss": 0.7314, + "step": 2289 + }, + { + "epoch": 0.10916978523585917, + "grad_norm": 1.5384209156036377, + "learning_rate": 1.6226119771677517e-05, + "loss": 0.8772, + "step": 2290 + }, + { + "epoch": 0.10921745763115867, + "grad_norm": 1.7850509881973267, + "learning_rate": 1.6223045532056838e-05, + "loss": 0.9649, + "step": 2291 + }, + { + "epoch": 0.10926513002645818, + "grad_norm": 1.637871503829956, + "learning_rate": 1.6219970332284322e-05, + "loss": 1.1274, + "step": 2292 + }, + { + "epoch": 0.10931280242175768, + "grad_norm": 3.082731246948242, + "learning_rate": 1.621689417283443e-05, + "loss": 1.1972, + "step": 2293 + }, + { + "epoch": 0.10936047481705718, + "grad_norm": 1.8796583414077759, + "learning_rate": 1.621381705418179e-05, + "loss": 0.8037, + "step": 2294 + }, + { + "epoch": 0.10940814721235669, + "grad_norm": 1.5310776233673096, + "learning_rate": 1.6210738976801174e-05, + "loss": 0.8149, + "step": 2295 + }, + { + "epoch": 0.10945581960765619, + "grad_norm": 1.363210916519165, + "learning_rate": 1.6207659941167485e-05, + "loss": 0.5445, + "step": 2296 + }, + { + "epoch": 0.10950349200295569, + "grad_norm": 1.5697338581085205, + "learning_rate": 1.62045799477558e-05, + "loss": 0.9179, + "step": 2297 + }, + { + "epoch": 0.10955116439825519, + "grad_norm": 2.003814697265625, + "learning_rate": 1.620149899704132e-05, + "loss": 0.6131, + "step": 2298 + }, + { + "epoch": 0.10959883679355469, + "grad_norm": 1.8872935771942139, + "learning_rate": 1.619841708949941e-05, + "loss": 0.5562, + "step": 2299 + }, + { + "epoch": 0.1096465091888542, + "grad_norm": 2.1485393047332764, + "learning_rate": 1.619533422560557e-05, + "loss": 0.6668, + "step": 2300 + }, + { + "epoch": 0.1096941815841537, + "grad_norm": 2.2373406887054443, + "learning_rate": 1.619225040583546e-05, + "loss": 0.7819, + "step": 2301 + }, + { + "epoch": 0.1097418539794532, + "grad_norm": 2.7262094020843506, + "learning_rate": 1.618916563066488e-05, + "loss": 0.857, + "step": 2302 + }, + { 
+ "epoch": 0.1097895263747527, + "grad_norm": 1.5591273307800293, + "learning_rate": 1.6186079900569787e-05, + "loss": 0.8508, + "step": 2303 + }, + { + "epoch": 0.1098371987700522, + "grad_norm": 3.53981614112854, + "learning_rate": 1.618299321602626e-05, + "loss": 0.6526, + "step": 2304 + }, + { + "epoch": 0.1098848711653517, + "grad_norm": 1.2734298706054688, + "learning_rate": 1.617990557751056e-05, + "loss": 0.8925, + "step": 2305 + }, + { + "epoch": 0.1099325435606512, + "grad_norm": 2.79061222076416, + "learning_rate": 1.6176816985499068e-05, + "loss": 1.2402, + "step": 2306 + }, + { + "epoch": 0.10998021595595071, + "grad_norm": 3.840050458908081, + "learning_rate": 1.6173727440468318e-05, + "loss": 1.0742, + "step": 2307 + }, + { + "epoch": 0.11002788835125021, + "grad_norm": 1.223572015762329, + "learning_rate": 1.6170636942895006e-05, + "loss": 0.6545, + "step": 2308 + }, + { + "epoch": 0.11007556074654971, + "grad_norm": 1.376792550086975, + "learning_rate": 1.616754549325596e-05, + "loss": 0.7667, + "step": 2309 + }, + { + "epoch": 0.11012323314184921, + "grad_norm": 1.145013689994812, + "learning_rate": 1.6164453092028157e-05, + "loss": 0.5677, + "step": 2310 + }, + { + "epoch": 0.11017090553714871, + "grad_norm": 1.3333736658096313, + "learning_rate": 1.616135973968872e-05, + "loss": 0.8565, + "step": 2311 + }, + { + "epoch": 0.11021857793244821, + "grad_norm": 2.5295960903167725, + "learning_rate": 1.615826543671493e-05, + "loss": 0.9761, + "step": 2312 + }, + { + "epoch": 0.11026625032774771, + "grad_norm": 3.3088274002075195, + "learning_rate": 1.6155170183584195e-05, + "loss": 1.1821, + "step": 2313 + }, + { + "epoch": 0.11031392272304721, + "grad_norm": 2.6220815181732178, + "learning_rate": 1.6152073980774093e-05, + "loss": 0.7295, + "step": 2314 + }, + { + "epoch": 0.11036159511834673, + "grad_norm": 5.106561183929443, + "learning_rate": 1.6148976828762326e-05, + "loss": 1.3027, + "step": 2315 + }, + { + "epoch": 0.11040926751364623, + "grad_norm": 0.8434693217277527, + "learning_rate": 1.6145878728026757e-05, + "loss": 0.2973, + "step": 2316 + }, + { + "epoch": 0.11045693990894573, + "grad_norm": 1.7661564350128174, + "learning_rate": 1.6142779679045392e-05, + "loss": 0.8119, + "step": 2317 + }, + { + "epoch": 0.11050461230424523, + "grad_norm": 1.7103074789047241, + "learning_rate": 1.613967968229638e-05, + "loss": 0.5876, + "step": 2318 + }, + { + "epoch": 0.11055228469954473, + "grad_norm": 1.7080051898956299, + "learning_rate": 1.613657873825802e-05, + "loss": 0.6482, + "step": 2319 + }, + { + "epoch": 0.11059995709484423, + "grad_norm": 1.3071163892745972, + "learning_rate": 1.6133476847408754e-05, + "loss": 0.9239, + "step": 2320 + }, + { + "epoch": 0.11064762949014373, + "grad_norm": 1.736224889755249, + "learning_rate": 1.6130374010227174e-05, + "loss": 1.2638, + "step": 2321 + }, + { + "epoch": 0.11069530188544323, + "grad_norm": 1.757023811340332, + "learning_rate": 1.6127270227192012e-05, + "loss": 0.6787, + "step": 2322 + }, + { + "epoch": 0.11074297428074274, + "grad_norm": 3.932849168777466, + "learning_rate": 1.6124165498782156e-05, + "loss": 0.0472, + "step": 2323 + }, + { + "epoch": 0.11079064667604224, + "grad_norm": 4.284938812255859, + "learning_rate": 1.612105982547663e-05, + "loss": 0.1567, + "step": 2324 + }, + { + "epoch": 0.11083831907134174, + "grad_norm": 1.4168829917907715, + "learning_rate": 1.6117953207754605e-05, + "loss": 0.7796, + "step": 2325 + }, + { + "epoch": 0.11088599146664124, + "grad_norm": 2.0152807235717773, + 
"learning_rate": 1.611484564609541e-05, + "loss": 0.6154, + "step": 2326 + }, + { + "epoch": 0.11093366386194074, + "grad_norm": 1.5113641023635864, + "learning_rate": 1.6111737140978495e-05, + "loss": 0.5765, + "step": 2327 + }, + { + "epoch": 0.11098133625724024, + "grad_norm": 1.8552695512771606, + "learning_rate": 1.610862769288348e-05, + "loss": 0.6431, + "step": 2328 + }, + { + "epoch": 0.11102900865253974, + "grad_norm": 2.2148361206054688, + "learning_rate": 1.6105517302290118e-05, + "loss": 0.7583, + "step": 2329 + }, + { + "epoch": 0.11107668104783924, + "grad_norm": 1.4242758750915527, + "learning_rate": 1.6102405969678314e-05, + "loss": 0.5657, + "step": 2330 + }, + { + "epoch": 0.11112435344313876, + "grad_norm": 2.315258264541626, + "learning_rate": 1.609929369552811e-05, + "loss": 0.7418, + "step": 2331 + }, + { + "epoch": 0.11117202583843826, + "grad_norm": 1.1028189659118652, + "learning_rate": 1.6096180480319698e-05, + "loss": 0.3196, + "step": 2332 + }, + { + "epoch": 0.11121969823373776, + "grad_norm": 1.6975681781768799, + "learning_rate": 1.6093066324533413e-05, + "loss": 0.7525, + "step": 2333 + }, + { + "epoch": 0.11126737062903726, + "grad_norm": 1.4082938432693481, + "learning_rate": 1.608995122864975e-05, + "loss": 0.3855, + "step": 2334 + }, + { + "epoch": 0.11131504302433676, + "grad_norm": 3.16806697845459, + "learning_rate": 1.6086835193149318e-05, + "loss": 0.9509, + "step": 2335 + }, + { + "epoch": 0.11136271541963626, + "grad_norm": 2.600700616836548, + "learning_rate": 1.6083718218512904e-05, + "loss": 0.9702, + "step": 2336 + }, + { + "epoch": 0.11141038781493576, + "grad_norm": 1.546226978302002, + "learning_rate": 1.6080600305221417e-05, + "loss": 0.8758, + "step": 2337 + }, + { + "epoch": 0.11145806021023526, + "grad_norm": 1.522552490234375, + "learning_rate": 1.607748145375592e-05, + "loss": 0.4528, + "step": 2338 + }, + { + "epoch": 0.11150573260553477, + "grad_norm": 1.6641627550125122, + "learning_rate": 1.607436166459762e-05, + "loss": 0.6523, + "step": 2339 + }, + { + "epoch": 0.11155340500083427, + "grad_norm": 1.7517926692962646, + "learning_rate": 1.607124093822787e-05, + "loss": 0.8662, + "step": 2340 + }, + { + "epoch": 0.11160107739613377, + "grad_norm": 2.037975311279297, + "learning_rate": 1.6068119275128165e-05, + "loss": 0.8957, + "step": 2341 + }, + { + "epoch": 0.11164874979143327, + "grad_norm": 1.7879384756088257, + "learning_rate": 1.6064996675780146e-05, + "loss": 0.9598, + "step": 2342 + }, + { + "epoch": 0.11169642218673277, + "grad_norm": 1.971345067024231, + "learning_rate": 1.60618731406656e-05, + "loss": 0.8951, + "step": 2343 + }, + { + "epoch": 0.11174409458203227, + "grad_norm": 1.9787291288375854, + "learning_rate": 1.6058748670266445e-05, + "loss": 0.8179, + "step": 2344 + }, + { + "epoch": 0.11179176697733177, + "grad_norm": 1.6472965478897095, + "learning_rate": 1.605562326506477e-05, + "loss": 0.9642, + "step": 2345 + }, + { + "epoch": 0.11183943937263127, + "grad_norm": 2.081470012664795, + "learning_rate": 1.6052496925542786e-05, + "loss": 1.1336, + "step": 2346 + }, + { + "epoch": 0.11188711176793079, + "grad_norm": 2.2466204166412354, + "learning_rate": 1.6049369652182855e-05, + "loss": 1.0396, + "step": 2347 + }, + { + "epoch": 0.11193478416323029, + "grad_norm": 2.0953478813171387, + "learning_rate": 1.604624144546748e-05, + "loss": 0.8454, + "step": 2348 + }, + { + "epoch": 0.11198245655852979, + "grad_norm": 1.3336654901504517, + "learning_rate": 1.6043112305879317e-05, + "loss": 0.6171, + "step": 2349 + }, 
+ { + "epoch": 0.11203012895382929, + "grad_norm": 2.746633291244507, + "learning_rate": 1.6039982233901155e-05, + "loss": 0.9006, + "step": 2350 + }, + { + "epoch": 0.11207780134912879, + "grad_norm": 1.2693136930465698, + "learning_rate": 1.6036851230015935e-05, + "loss": 0.6205, + "step": 2351 + }, + { + "epoch": 0.11212547374442829, + "grad_norm": 1.3461228609085083, + "learning_rate": 1.603371929470674e-05, + "loss": 0.7251, + "step": 2352 + }, + { + "epoch": 0.11217314613972779, + "grad_norm": 1.9509644508361816, + "learning_rate": 1.603058642845679e-05, + "loss": 1.2036, + "step": 2353 + }, + { + "epoch": 0.11222081853502729, + "grad_norm": 1.3992619514465332, + "learning_rate": 1.6027452631749458e-05, + "loss": 0.4146, + "step": 2354 + }, + { + "epoch": 0.1122684909303268, + "grad_norm": 1.40008544921875, + "learning_rate": 1.6024317905068255e-05, + "loss": 0.8259, + "step": 2355 + }, + { + "epoch": 0.1123161633256263, + "grad_norm": 2.0970044136047363, + "learning_rate": 1.602118224889684e-05, + "loss": 0.8051, + "step": 2356 + }, + { + "epoch": 0.1123638357209258, + "grad_norm": 1.2155324220657349, + "learning_rate": 1.601804566371901e-05, + "loss": 0.6436, + "step": 2357 + }, + { + "epoch": 0.1124115081162253, + "grad_norm": 2.929593801498413, + "learning_rate": 1.6014908150018703e-05, + "loss": 1.1493, + "step": 2358 + }, + { + "epoch": 0.1124591805115248, + "grad_norm": 1.783107876777649, + "learning_rate": 1.601176970828002e-05, + "loss": 0.6593, + "step": 2359 + }, + { + "epoch": 0.1125068529068243, + "grad_norm": 1.6193472146987915, + "learning_rate": 1.6008630338987173e-05, + "loss": 0.5756, + "step": 2360 + }, + { + "epoch": 0.1125545253021238, + "grad_norm": 2.6750545501708984, + "learning_rate": 1.600549004262454e-05, + "loss": 0.6709, + "step": 2361 + }, + { + "epoch": 0.1126021976974233, + "grad_norm": 1.6343865394592285, + "learning_rate": 1.600234881967664e-05, + "loss": 0.9063, + "step": 2362 + }, + { + "epoch": 0.11264987009272281, + "grad_norm": 1.5538705587387085, + "learning_rate": 1.599920667062813e-05, + "loss": 1.038, + "step": 2363 + }, + { + "epoch": 0.11269754248802231, + "grad_norm": 1.8014729022979736, + "learning_rate": 1.5996063595963813e-05, + "loss": 0.7755, + "step": 2364 + }, + { + "epoch": 0.11274521488332181, + "grad_norm": 2.1251120567321777, + "learning_rate": 1.599291959616863e-05, + "loss": 0.7735, + "step": 2365 + }, + { + "epoch": 0.11279288727862131, + "grad_norm": 1.4446736574172974, + "learning_rate": 1.5989774671727664e-05, + "loss": 0.9707, + "step": 2366 + }, + { + "epoch": 0.11284055967392081, + "grad_norm": 1.6082050800323486, + "learning_rate": 1.598662882312615e-05, + "loss": 0.5947, + "step": 2367 + }, + { + "epoch": 0.11288823206922032, + "grad_norm": 1.1529566049575806, + "learning_rate": 1.5983482050849462e-05, + "loss": 0.5871, + "step": 2368 + }, + { + "epoch": 0.11293590446451982, + "grad_norm": 1.4360655546188354, + "learning_rate": 1.598033435538311e-05, + "loss": 0.5626, + "step": 2369 + }, + { + "epoch": 0.11298357685981932, + "grad_norm": 2.323112726211548, + "learning_rate": 1.5977185737212756e-05, + "loss": 0.5323, + "step": 2370 + }, + { + "epoch": 0.11303124925511883, + "grad_norm": 2.0246341228485107, + "learning_rate": 1.597403619682419e-05, + "loss": 0.8636, + "step": 2371 + }, + { + "epoch": 0.11307892165041833, + "grad_norm": 1.5008585453033447, + "learning_rate": 1.5970885734703363e-05, + "loss": 0.7506, + "step": 2372 + }, + { + "epoch": 0.11312659404571783, + "grad_norm": 1.4700300693511963, + 
"learning_rate": 1.5967734351336354e-05, + "loss": 0.8429, + "step": 2373 + }, + { + "epoch": 0.11317426644101733, + "grad_norm": 1.5748525857925415, + "learning_rate": 1.5964582047209392e-05, + "loss": 0.2053, + "step": 2374 + }, + { + "epoch": 0.11322193883631683, + "grad_norm": 1.506453514099121, + "learning_rate": 1.596142882280884e-05, + "loss": 0.8866, + "step": 2375 + }, + { + "epoch": 0.11326961123161633, + "grad_norm": 1.489759087562561, + "learning_rate": 1.5958274678621217e-05, + "loss": 0.6125, + "step": 2376 + }, + { + "epoch": 0.11331728362691583, + "grad_norm": 1.5987987518310547, + "learning_rate": 1.5955119615133163e-05, + "loss": 0.6383, + "step": 2377 + }, + { + "epoch": 0.11336495602221533, + "grad_norm": 1.2040554285049438, + "learning_rate": 1.5951963632831482e-05, + "loss": 0.6015, + "step": 2378 + }, + { + "epoch": 0.11341262841751484, + "grad_norm": 1.6333727836608887, + "learning_rate": 1.5948806732203105e-05, + "loss": 0.855, + "step": 2379 + }, + { + "epoch": 0.11346030081281434, + "grad_norm": 1.3919745683670044, + "learning_rate": 1.594564891373511e-05, + "loss": 0.6924, + "step": 2380 + }, + { + "epoch": 0.11350797320811384, + "grad_norm": 2.072422504425049, + "learning_rate": 1.5942490177914715e-05, + "loss": 0.5915, + "step": 2381 + }, + { + "epoch": 0.11355564560341334, + "grad_norm": 2.358393669128418, + "learning_rate": 1.5939330525229285e-05, + "loss": 1.1379, + "step": 2382 + }, + { + "epoch": 0.11360331799871284, + "grad_norm": 1.6451317071914673, + "learning_rate": 1.5936169956166316e-05, + "loss": 0.9774, + "step": 2383 + }, + { + "epoch": 0.11365099039401234, + "grad_norm": 1.566930890083313, + "learning_rate": 1.593300847121345e-05, + "loss": 0.7863, + "step": 2384 + }, + { + "epoch": 0.11369866278931184, + "grad_norm": 2.387988567352295, + "learning_rate": 1.592984607085848e-05, + "loss": 0.8635, + "step": 2385 + }, + { + "epoch": 0.11374633518461134, + "grad_norm": 1.7774720191955566, + "learning_rate": 1.5926682755589325e-05, + "loss": 0.4223, + "step": 2386 + }, + { + "epoch": 0.11379400757991086, + "grad_norm": 1.0116852521896362, + "learning_rate": 1.5923518525894053e-05, + "loss": 0.2378, + "step": 2387 + }, + { + "epoch": 0.11384167997521036, + "grad_norm": 1.6674070358276367, + "learning_rate": 1.5920353382260876e-05, + "loss": 0.6554, + "step": 2388 + }, + { + "epoch": 0.11388935237050986, + "grad_norm": 1.2811578512191772, + "learning_rate": 1.591718732517814e-05, + "loss": 0.4141, + "step": 2389 + }, + { + "epoch": 0.11393702476580936, + "grad_norm": 1.4084479808807373, + "learning_rate": 1.5914020355134333e-05, + "loss": 0.7754, + "step": 2390 + }, + { + "epoch": 0.11398469716110886, + "grad_norm": 1.144248127937317, + "learning_rate": 1.5910852472618085e-05, + "loss": 0.4276, + "step": 2391 + }, + { + "epoch": 0.11403236955640836, + "grad_norm": 1.5653167963027954, + "learning_rate": 1.5907683678118173e-05, + "loss": 0.4606, + "step": 2392 + }, + { + "epoch": 0.11408004195170786, + "grad_norm": 3.081538677215576, + "learning_rate": 1.5904513972123507e-05, + "loss": 0.6316, + "step": 2393 + }, + { + "epoch": 0.11412771434700736, + "grad_norm": 2.0674731731414795, + "learning_rate": 1.590134335512314e-05, + "loss": 0.7435, + "step": 2394 + }, + { + "epoch": 0.11417538674230687, + "grad_norm": 1.1712582111358643, + "learning_rate": 1.5898171827606264e-05, + "loss": 0.3855, + "step": 2395 + }, + { + "epoch": 0.11422305913760637, + "grad_norm": 1.101927638053894, + "learning_rate": 1.5894999390062216e-05, + "loss": 0.5076, + "step": 2396 
+ }, + { + "epoch": 0.11427073153290587, + "grad_norm": 2.455082416534424, + "learning_rate": 1.5891826042980468e-05, + "loss": 0.731, + "step": 2397 + }, + { + "epoch": 0.11431840392820537, + "grad_norm": 2.9067742824554443, + "learning_rate": 1.5888651786850638e-05, + "loss": 0.8672, + "step": 2398 + }, + { + "epoch": 0.11436607632350487, + "grad_norm": 1.4049131870269775, + "learning_rate": 1.5885476622162478e-05, + "loss": 0.7721, + "step": 2399 + }, + { + "epoch": 0.11441374871880437, + "grad_norm": 1.8493523597717285, + "learning_rate": 1.588230054940588e-05, + "loss": 0.7796, + "step": 2400 + }, + { + "epoch": 0.11446142111410387, + "grad_norm": 0.7470552921295166, + "learning_rate": 1.5879123569070888e-05, + "loss": 0.3024, + "step": 2401 + }, + { + "epoch": 0.11450909350940339, + "grad_norm": 1.6520780324935913, + "learning_rate": 1.5875945681647672e-05, + "loss": 0.5836, + "step": 2402 + }, + { + "epoch": 0.11455676590470289, + "grad_norm": 1.1575690507888794, + "learning_rate": 1.5872766887626546e-05, + "loss": 0.6283, + "step": 2403 + }, + { + "epoch": 0.11460443830000239, + "grad_norm": 1.4639043807983398, + "learning_rate": 1.5869587187497965e-05, + "loss": 0.659, + "step": 2404 + }, + { + "epoch": 0.11465211069530189, + "grad_norm": 1.45045804977417, + "learning_rate": 1.586640658175253e-05, + "loss": 0.6051, + "step": 2405 + }, + { + "epoch": 0.11469978309060139, + "grad_norm": 1.6251894235610962, + "learning_rate": 1.586322507088097e-05, + "loss": 0.6632, + "step": 2406 + }, + { + "epoch": 0.11474745548590089, + "grad_norm": 1.2037824392318726, + "learning_rate": 1.586004265537416e-05, + "loss": 0.7546, + "step": 2407 + }, + { + "epoch": 0.11479512788120039, + "grad_norm": 2.1143577098846436, + "learning_rate": 1.585685933572312e-05, + "loss": 0.6622, + "step": 2408 + }, + { + "epoch": 0.11484280027649989, + "grad_norm": 1.87221097946167, + "learning_rate": 1.5853675112418994e-05, + "loss": 0.9798, + "step": 2409 + }, + { + "epoch": 0.1148904726717994, + "grad_norm": 2.089597702026367, + "learning_rate": 1.5850489985953076e-05, + "loss": 0.9001, + "step": 2410 + }, + { + "epoch": 0.1149381450670989, + "grad_norm": 2.2795889377593994, + "learning_rate": 1.5847303956816808e-05, + "loss": 0.8602, + "step": 2411 + }, + { + "epoch": 0.1149858174623984, + "grad_norm": 2.171499013900757, + "learning_rate": 1.5844117025501753e-05, + "loss": 0.6345, + "step": 2412 + }, + { + "epoch": 0.1150334898576979, + "grad_norm": 1.6256190538406372, + "learning_rate": 1.584092919249962e-05, + "loss": 0.8459, + "step": 2413 + }, + { + "epoch": 0.1150811622529974, + "grad_norm": 1.9238924980163574, + "learning_rate": 1.583774045830227e-05, + "loss": 0.8546, + "step": 2414 + }, + { + "epoch": 0.1151288346482969, + "grad_norm": 1.3114979267120361, + "learning_rate": 1.583455082340168e-05, + "loss": 0.4359, + "step": 2415 + }, + { + "epoch": 0.1151765070435964, + "grad_norm": 1.086281180381775, + "learning_rate": 1.583136028828998e-05, + "loss": 0.4448, + "step": 2416 + }, + { + "epoch": 0.1152241794388959, + "grad_norm": 1.503161072731018, + "learning_rate": 1.5828168853459445e-05, + "loss": 0.6896, + "step": 2417 + }, + { + "epoch": 0.11527185183419542, + "grad_norm": 1.8496677875518799, + "learning_rate": 1.582497651940247e-05, + "loss": 0.5931, + "step": 2418 + }, + { + "epoch": 0.11531952422949492, + "grad_norm": 1.3385798931121826, + "learning_rate": 1.5821783286611604e-05, + "loss": 0.8231, + "step": 2419 + }, + { + "epoch": 0.11536719662479442, + "grad_norm": 1.5753676891326904, + 
"learning_rate": 1.581858915557953e-05, + "loss": 0.7206, + "step": 2420 + }, + { + "epoch": 0.11541486902009392, + "grad_norm": 3.8933351039886475, + "learning_rate": 1.581539412679907e-05, + "loss": 0.6449, + "step": 2421 + }, + { + "epoch": 0.11546254141539342, + "grad_norm": 12.029458045959473, + "learning_rate": 1.581219820076318e-05, + "loss": 0.9718, + "step": 2422 + }, + { + "epoch": 0.11551021381069292, + "grad_norm": 2.0000948905944824, + "learning_rate": 1.5809001377964966e-05, + "loss": 0.9009, + "step": 2423 + }, + { + "epoch": 0.11555788620599242, + "grad_norm": 1.906844139099121, + "learning_rate": 1.580580365889766e-05, + "loss": 0.7576, + "step": 2424 + }, + { + "epoch": 0.11560555860129192, + "grad_norm": 2.046762704849243, + "learning_rate": 1.5802605044054638e-05, + "loss": 0.9825, + "step": 2425 + }, + { + "epoch": 0.11565323099659143, + "grad_norm": 1.1829183101654053, + "learning_rate": 1.579940553392941e-05, + "loss": 0.8468, + "step": 2426 + }, + { + "epoch": 0.11570090339189093, + "grad_norm": 1.6301301717758179, + "learning_rate": 1.579620512901563e-05, + "loss": 0.7414, + "step": 2427 + }, + { + "epoch": 0.11574857578719043, + "grad_norm": 3.3583359718322754, + "learning_rate": 1.579300382980709e-05, + "loss": 0.8273, + "step": 2428 + }, + { + "epoch": 0.11579624818248993, + "grad_norm": 2.3643252849578857, + "learning_rate": 1.5789801636797718e-05, + "loss": 1.4061, + "step": 2429 + }, + { + "epoch": 0.11584392057778943, + "grad_norm": 1.3170373439788818, + "learning_rate": 1.5786598550481573e-05, + "loss": 0.7628, + "step": 2430 + }, + { + "epoch": 0.11589159297308893, + "grad_norm": 1.9027100801467896, + "learning_rate": 1.5783394571352863e-05, + "loss": 0.8552, + "step": 2431 + }, + { + "epoch": 0.11593926536838843, + "grad_norm": 1.4820061922073364, + "learning_rate": 1.5780189699905928e-05, + "loss": 0.6201, + "step": 2432 + }, + { + "epoch": 0.11598693776368793, + "grad_norm": 1.4014887809753418, + "learning_rate": 1.577698393663525e-05, + "loss": 0.4637, + "step": 2433 + }, + { + "epoch": 0.11603461015898744, + "grad_norm": 1.0882154703140259, + "learning_rate": 1.5773777282035437e-05, + "loss": 0.8005, + "step": 2434 + }, + { + "epoch": 0.11608228255428694, + "grad_norm": 1.066767930984497, + "learning_rate": 1.577056973660125e-05, + "loss": 0.5197, + "step": 2435 + }, + { + "epoch": 0.11612995494958644, + "grad_norm": 1.0935394763946533, + "learning_rate": 1.5767361300827577e-05, + "loss": 0.69, + "step": 2436 + }, + { + "epoch": 0.11617762734488594, + "grad_norm": 1.9463183879852295, + "learning_rate": 1.576415197520945e-05, + "loss": 1.0335, + "step": 2437 + }, + { + "epoch": 0.11622529974018544, + "grad_norm": 1.5867832899093628, + "learning_rate": 1.576094176024203e-05, + "loss": 0.1858, + "step": 2438 + }, + { + "epoch": 0.11627297213548494, + "grad_norm": 2.1434361934661865, + "learning_rate": 1.5757730656420626e-05, + "loss": 0.5396, + "step": 2439 + }, + { + "epoch": 0.11632064453078444, + "grad_norm": 2.3051917552948, + "learning_rate": 1.575451866424067e-05, + "loss": 0.7064, + "step": 2440 + }, + { + "epoch": 0.11636831692608395, + "grad_norm": 1.6615920066833496, + "learning_rate": 1.5751305784197746e-05, + "loss": 0.9084, + "step": 2441 + }, + { + "epoch": 0.11641598932138346, + "grad_norm": 2.000019073486328, + "learning_rate": 1.5748092016787567e-05, + "loss": 0.719, + "step": 2442 + }, + { + "epoch": 0.11646366171668296, + "grad_norm": 3.775541305541992, + "learning_rate": 1.5744877362505987e-05, + "loss": 0.7955, + "step": 2443 + }, + 
{ + "epoch": 0.11651133411198246, + "grad_norm": 1.6770176887512207, + "learning_rate": 1.5741661821848983e-05, + "loss": 1.0948, + "step": 2444 + }, + { + "epoch": 0.11655900650728196, + "grad_norm": 1.441290020942688, + "learning_rate": 1.5738445395312694e-05, + "loss": 0.6975, + "step": 2445 + }, + { + "epoch": 0.11660667890258146, + "grad_norm": 1.6863912343978882, + "learning_rate": 1.5735228083393373e-05, + "loss": 0.682, + "step": 2446 + }, + { + "epoch": 0.11665435129788096, + "grad_norm": 1.180029034614563, + "learning_rate": 1.573200988658742e-05, + "loss": 0.7952, + "step": 2447 + }, + { + "epoch": 0.11670202369318046, + "grad_norm": 1.6943645477294922, + "learning_rate": 1.572879080539137e-05, + "loss": 0.5094, + "step": 2448 + }, + { + "epoch": 0.11674969608847996, + "grad_norm": 1.7495173215866089, + "learning_rate": 1.5725570840301897e-05, + "loss": 0.7735, + "step": 2449 + }, + { + "epoch": 0.11679736848377947, + "grad_norm": 1.1879079341888428, + "learning_rate": 1.5722349991815802e-05, + "loss": 0.3279, + "step": 2450 + }, + { + "epoch": 0.11684504087907897, + "grad_norm": 3.4411327838897705, + "learning_rate": 1.571912826043003e-05, + "loss": 1.0245, + "step": 2451 + }, + { + "epoch": 0.11689271327437847, + "grad_norm": 3.4305639266967773, + "learning_rate": 1.5715905646641666e-05, + "loss": 0.5089, + "step": 2452 + }, + { + "epoch": 0.11694038566967797, + "grad_norm": 1.467454195022583, + "learning_rate": 1.5712682150947926e-05, + "loss": 0.4662, + "step": 2453 + }, + { + "epoch": 0.11698805806497747, + "grad_norm": 2.7015254497528076, + "learning_rate": 1.5709457773846155e-05, + "loss": 0.9322, + "step": 2454 + }, + { + "epoch": 0.11703573046027697, + "grad_norm": 1.893823266029358, + "learning_rate": 1.5706232515833842e-05, + "loss": 0.7901, + "step": 2455 + }, + { + "epoch": 0.11708340285557647, + "grad_norm": 1.3589909076690674, + "learning_rate": 1.5703006377408623e-05, + "loss": 0.8506, + "step": 2456 + }, + { + "epoch": 0.11713107525087597, + "grad_norm": 3.398231267929077, + "learning_rate": 1.5699779359068248e-05, + "loss": 0.7117, + "step": 2457 + }, + { + "epoch": 0.11717874764617549, + "grad_norm": 1.307833194732666, + "learning_rate": 1.569655146131061e-05, + "loss": 0.9439, + "step": 2458 + }, + { + "epoch": 0.11722642004147499, + "grad_norm": 1.4641598463058472, + "learning_rate": 1.5693322684633747e-05, + "loss": 0.5693, + "step": 2459 + }, + { + "epoch": 0.11727409243677449, + "grad_norm": 1.35435950756073, + "learning_rate": 1.5690093029535824e-05, + "loss": 0.6255, + "step": 2460 + }, + { + "epoch": 0.11732176483207399, + "grad_norm": 1.9741376638412476, + "learning_rate": 1.5686862496515142e-05, + "loss": 0.4204, + "step": 2461 + }, + { + "epoch": 0.11736943722737349, + "grad_norm": 5.268206596374512, + "learning_rate": 1.568363108607014e-05, + "loss": 0.2695, + "step": 2462 + }, + { + "epoch": 0.11741710962267299, + "grad_norm": 1.644921898841858, + "learning_rate": 1.5680398798699395e-05, + "loss": 0.4717, + "step": 2463 + }, + { + "epoch": 0.11746478201797249, + "grad_norm": 6.985912322998047, + "learning_rate": 1.5677165634901607e-05, + "loss": 1.2432, + "step": 2464 + }, + { + "epoch": 0.11751245441327199, + "grad_norm": 1.51718008518219, + "learning_rate": 1.567393159517563e-05, + "loss": 0.7786, + "step": 2465 + }, + { + "epoch": 0.1175601268085715, + "grad_norm": 1.3289611339569092, + "learning_rate": 1.5670696680020433e-05, + "loss": 0.5209, + "step": 2466 + }, + { + "epoch": 0.117607799203871, + "grad_norm": 1.9150704145431519, + 
"learning_rate": 1.5667460889935138e-05, + "loss": 0.9846, + "step": 2467 + }, + { + "epoch": 0.1176554715991705, + "grad_norm": 2.3162319660186768, + "learning_rate": 1.566422422541899e-05, + "loss": 0.8315, + "step": 2468 + }, + { + "epoch": 0.11770314399447, + "grad_norm": 1.7796815633773804, + "learning_rate": 1.5660986686971377e-05, + "loss": 0.7327, + "step": 2469 + }, + { + "epoch": 0.1177508163897695, + "grad_norm": 1.4785791635513306, + "learning_rate": 1.565774827509181e-05, + "loss": 0.4438, + "step": 2470 + }, + { + "epoch": 0.117798488785069, + "grad_norm": 3.441373825073242, + "learning_rate": 1.565450899027995e-05, + "loss": 0.9713, + "step": 2471 + }, + { + "epoch": 0.1178461611803685, + "grad_norm": 1.719574213027954, + "learning_rate": 1.5651268833035585e-05, + "loss": 0.8168, + "step": 2472 + }, + { + "epoch": 0.117893833575668, + "grad_norm": 3.02345871925354, + "learning_rate": 1.5648027803858635e-05, + "loss": 0.7226, + "step": 2473 + }, + { + "epoch": 0.11794150597096752, + "grad_norm": 2.9506583213806152, + "learning_rate": 1.564478590324916e-05, + "loss": 0.7869, + "step": 2474 + }, + { + "epoch": 0.11798917836626702, + "grad_norm": 1.3805612325668335, + "learning_rate": 1.5641543131707345e-05, + "loss": 0.6029, + "step": 2475 + }, + { + "epoch": 0.11803685076156652, + "grad_norm": 2.700631618499756, + "learning_rate": 1.5638299489733525e-05, + "loss": 0.596, + "step": 2476 + }, + { + "epoch": 0.11808452315686602, + "grad_norm": 1.7731691598892212, + "learning_rate": 1.5635054977828156e-05, + "loss": 0.7278, + "step": 2477 + }, + { + "epoch": 0.11813219555216552, + "grad_norm": 2.4622464179992676, + "learning_rate": 1.5631809596491833e-05, + "loss": 1.2707, + "step": 2478 + }, + { + "epoch": 0.11817986794746502, + "grad_norm": 1.9097596406936646, + "learning_rate": 1.562856334622529e-05, + "loss": 0.7455, + "step": 2479 + }, + { + "epoch": 0.11822754034276452, + "grad_norm": 1.5807631015777588, + "learning_rate": 1.5625316227529382e-05, + "loss": 0.7761, + "step": 2480 + }, + { + "epoch": 0.11827521273806402, + "grad_norm": 1.8797825574874878, + "learning_rate": 1.562206824090511e-05, + "loss": 0.7468, + "step": 2481 + }, + { + "epoch": 0.11832288513336353, + "grad_norm": 2.1505680084228516, + "learning_rate": 1.5618819386853607e-05, + "loss": 0.7845, + "step": 2482 + }, + { + "epoch": 0.11837055752866303, + "grad_norm": 1.4331556558609009, + "learning_rate": 1.5615569665876132e-05, + "loss": 0.5999, + "step": 2483 + }, + { + "epoch": 0.11841822992396253, + "grad_norm": 1.5098989009857178, + "learning_rate": 1.5612319078474087e-05, + "loss": 0.7021, + "step": 2484 + }, + { + "epoch": 0.11846590231926203, + "grad_norm": 2.388648509979248, + "learning_rate": 1.5609067625149007e-05, + "loss": 1.2, + "step": 2485 + }, + { + "epoch": 0.11851357471456153, + "grad_norm": 1.6561309099197388, + "learning_rate": 1.560581530640255e-05, + "loss": 0.684, + "step": 2486 + }, + { + "epoch": 0.11856124710986103, + "grad_norm": 1.3266593217849731, + "learning_rate": 1.5602562122736526e-05, + "loss": 0.9018, + "step": 2487 + }, + { + "epoch": 0.11860891950516053, + "grad_norm": 1.415453314781189, + "learning_rate": 1.5599308074652856e-05, + "loss": 0.6789, + "step": 2488 + }, + { + "epoch": 0.11865659190046003, + "grad_norm": 1.4097321033477783, + "learning_rate": 1.5596053162653612e-05, + "loss": 0.6088, + "step": 2489 + }, + { + "epoch": 0.11870426429575955, + "grad_norm": 1.3818477392196655, + "learning_rate": 1.5592797387240996e-05, + "loss": 0.615, + "step": 2490 + }, + { + 
"epoch": 0.11875193669105905, + "grad_norm": 1.5520590543746948, + "learning_rate": 1.5589540748917336e-05, + "loss": 0.5646, + "step": 2491 + }, + { + "epoch": 0.11879960908635855, + "grad_norm": 2.6001250743865967, + "learning_rate": 1.5586283248185102e-05, + "loss": 0.7905, + "step": 2492 + }, + { + "epoch": 0.11884728148165805, + "grad_norm": 2.243886947631836, + "learning_rate": 1.5583024885546887e-05, + "loss": 0.5465, + "step": 2493 + }, + { + "epoch": 0.11889495387695755, + "grad_norm": 1.5221590995788574, + "learning_rate": 1.557976566150543e-05, + "loss": 0.4085, + "step": 2494 + }, + { + "epoch": 0.11894262627225705, + "grad_norm": 1.786395788192749, + "learning_rate": 1.5576505576563587e-05, + "loss": 0.7562, + "step": 2495 + }, + { + "epoch": 0.11899029866755655, + "grad_norm": 1.981576919555664, + "learning_rate": 1.5573244631224364e-05, + "loss": 0.6257, + "step": 2496 + }, + { + "epoch": 0.11903797106285606, + "grad_norm": 2.3802356719970703, + "learning_rate": 1.556998282599089e-05, + "loss": 0.8141, + "step": 2497 + }, + { + "epoch": 0.11908564345815556, + "grad_norm": 1.1556748151779175, + "learning_rate": 1.5566720161366423e-05, + "loss": 0.7201, + "step": 2498 + }, + { + "epoch": 0.11913331585345506, + "grad_norm": 1.6112884283065796, + "learning_rate": 1.556345663785436e-05, + "loss": 0.624, + "step": 2499 + }, + { + "epoch": 0.11918098824875456, + "grad_norm": 1.3968398571014404, + "learning_rate": 1.556019225595823e-05, + "loss": 0.6489, + "step": 2500 + }, + { + "epoch": 0.11922866064405406, + "grad_norm": 1.6038421392440796, + "learning_rate": 1.5556927016181694e-05, + "loss": 0.943, + "step": 2501 + }, + { + "epoch": 0.11927633303935356, + "grad_norm": 1.491059422492981, + "learning_rate": 1.555366091902855e-05, + "loss": 1.0027, + "step": 2502 + }, + { + "epoch": 0.11932400543465306, + "grad_norm": 1.804369330406189, + "learning_rate": 1.5550393965002712e-05, + "loss": 1.1136, + "step": 2503 + }, + { + "epoch": 0.11937167782995256, + "grad_norm": 2.3129498958587646, + "learning_rate": 1.5547126154608246e-05, + "loss": 0.5736, + "step": 2504 + }, + { + "epoch": 0.11941935022525207, + "grad_norm": 1.2741669416427612, + "learning_rate": 1.5543857488349335e-05, + "loss": 0.7488, + "step": 2505 + }, + { + "epoch": 0.11946702262055157, + "grad_norm": 1.6305344104766846, + "learning_rate": 1.5540587966730306e-05, + "loss": 1.0591, + "step": 2506 + }, + { + "epoch": 0.11951469501585107, + "grad_norm": 3.3316562175750732, + "learning_rate": 1.553731759025561e-05, + "loss": 1.1484, + "step": 2507 + }, + { + "epoch": 0.11956236741115057, + "grad_norm": 1.5285416841506958, + "learning_rate": 1.553404635942984e-05, + "loss": 0.7253, + "step": 2508 + }, + { + "epoch": 0.11961003980645007, + "grad_norm": 2.707442283630371, + "learning_rate": 1.5530774274757697e-05, + "loss": 1.0659, + "step": 2509 + }, + { + "epoch": 0.11965771220174957, + "grad_norm": 1.932203769683838, + "learning_rate": 1.5527501336744046e-05, + "loss": 0.7343, + "step": 2510 + }, + { + "epoch": 0.11970538459704907, + "grad_norm": 1.8537970781326294, + "learning_rate": 1.5524227545893856e-05, + "loss": 0.8658, + "step": 2511 + }, + { + "epoch": 0.11975305699234857, + "grad_norm": 1.5665045976638794, + "learning_rate": 1.5520952902712246e-05, + "loss": 0.8468, + "step": 2512 + }, + { + "epoch": 0.11980072938764809, + "grad_norm": 1.405781865119934, + "learning_rate": 1.551767740770446e-05, + "loss": 0.7479, + "step": 2513 + }, + { + "epoch": 0.11984840178294759, + "grad_norm": 1.7779722213745117, + 
"learning_rate": 1.5514401061375873e-05, + "loss": 0.539, + "step": 2514 + }, + { + "epoch": 0.11989607417824709, + "grad_norm": 1.0430759191513062, + "learning_rate": 1.5511123864231983e-05, + "loss": 0.4346, + "step": 2515 + }, + { + "epoch": 0.11994374657354659, + "grad_norm": 3.0844762325286865, + "learning_rate": 1.550784581677844e-05, + "loss": 0.4899, + "step": 2516 + }, + { + "epoch": 0.11999141896884609, + "grad_norm": 2.4925875663757324, + "learning_rate": 1.5504566919521e-05, + "loss": 1.1701, + "step": 2517 + }, + { + "epoch": 0.12003909136414559, + "grad_norm": 1.3920817375183105, + "learning_rate": 1.550128717296558e-05, + "loss": 0.9856, + "step": 2518 + }, + { + "epoch": 0.12008676375944509, + "grad_norm": 1.6713967323303223, + "learning_rate": 1.5498006577618194e-05, + "loss": 0.4637, + "step": 2519 + }, + { + "epoch": 0.12013443615474459, + "grad_norm": 2.2760868072509766, + "learning_rate": 1.5494725133985014e-05, + "loss": 0.9592, + "step": 2520 + }, + { + "epoch": 0.1201821085500441, + "grad_norm": 2.14618182182312, + "learning_rate": 1.549144284257233e-05, + "loss": 0.7225, + "step": 2521 + }, + { + "epoch": 0.1202297809453436, + "grad_norm": 1.916582703590393, + "learning_rate": 1.548815970388657e-05, + "loss": 0.8079, + "step": 2522 + }, + { + "epoch": 0.1202774533406431, + "grad_norm": 1.9218018054962158, + "learning_rate": 1.5484875718434284e-05, + "loss": 0.7001, + "step": 2523 + }, + { + "epoch": 0.1203251257359426, + "grad_norm": 1.4419703483581543, + "learning_rate": 1.5481590886722154e-05, + "loss": 0.5294, + "step": 2524 + }, + { + "epoch": 0.1203727981312421, + "grad_norm": 2.566704511642456, + "learning_rate": 1.5478305209257004e-05, + "loss": 0.6617, + "step": 2525 + }, + { + "epoch": 0.1204204705265416, + "grad_norm": 3.018068790435791, + "learning_rate": 1.547501868654577e-05, + "loss": 0.556, + "step": 2526 + }, + { + "epoch": 0.1204681429218411, + "grad_norm": 7.7364115715026855, + "learning_rate": 1.5471731319095537e-05, + "loss": 0.5175, + "step": 2527 + }, + { + "epoch": 0.1205158153171406, + "grad_norm": 1.0991004705429077, + "learning_rate": 1.5468443107413512e-05, + "loss": 0.5897, + "step": 2528 + }, + { + "epoch": 0.12056348771244012, + "grad_norm": 3.0219228267669678, + "learning_rate": 1.5465154052007027e-05, + "loss": 0.2365, + "step": 2529 + }, + { + "epoch": 0.12061116010773962, + "grad_norm": 1.6062054634094238, + "learning_rate": 1.5461864153383555e-05, + "loss": 0.1371, + "step": 2530 + }, + { + "epoch": 0.12065883250303912, + "grad_norm": 1.0779755115509033, + "learning_rate": 1.5458573412050688e-05, + "loss": 0.7673, + "step": 2531 + }, + { + "epoch": 0.12070650489833862, + "grad_norm": 1.4383097887039185, + "learning_rate": 1.5455281828516152e-05, + "loss": 0.9756, + "step": 2532 + }, + { + "epoch": 0.12075417729363812, + "grad_norm": 1.650024175643921, + "learning_rate": 1.5451989403287816e-05, + "loss": 0.6756, + "step": 2533 + }, + { + "epoch": 0.12080184968893762, + "grad_norm": 1.4167653322219849, + "learning_rate": 1.544869613687366e-05, + "loss": 0.2193, + "step": 2534 + }, + { + "epoch": 0.12084952208423712, + "grad_norm": 0.9266788363456726, + "learning_rate": 1.5445402029781792e-05, + "loss": 0.4611, + "step": 2535 + }, + { + "epoch": 0.12089719447953662, + "grad_norm": 2.4128856658935547, + "learning_rate": 1.5442107082520475e-05, + "loss": 1.0507, + "step": 2536 + }, + { + "epoch": 0.12094486687483613, + "grad_norm": 2.535980463027954, + "learning_rate": 1.5438811295598075e-05, + "loss": 0.7295, + "step": 2537 + }, + { + 
"epoch": 0.12099253927013563, + "grad_norm": 4.242743015289307, + "learning_rate": 1.5435514669523102e-05, + "loss": 0.8476, + "step": 2538 + }, + { + "epoch": 0.12104021166543513, + "grad_norm": 1.952236294746399, + "learning_rate": 1.543221720480419e-05, + "loss": 1.0177, + "step": 2539 + }, + { + "epoch": 0.12108788406073463, + "grad_norm": 5.952898979187012, + "learning_rate": 1.5428918901950105e-05, + "loss": 0.8368, + "step": 2540 + }, + { + "epoch": 0.12113555645603413, + "grad_norm": 4.943411827087402, + "learning_rate": 1.542561976146974e-05, + "loss": 0.7786, + "step": 2541 + }, + { + "epoch": 0.12118322885133363, + "grad_norm": 1.798194169998169, + "learning_rate": 1.5422319783872118e-05, + "loss": 1.1188, + "step": 2542 + }, + { + "epoch": 0.12123090124663313, + "grad_norm": 1.5484708547592163, + "learning_rate": 1.5419018969666396e-05, + "loss": 0.9368, + "step": 2543 + }, + { + "epoch": 0.12127857364193263, + "grad_norm": 1.478027105331421, + "learning_rate": 1.541571731936185e-05, + "loss": 0.7332, + "step": 2544 + }, + { + "epoch": 0.12132624603723215, + "grad_norm": 1.9182305335998535, + "learning_rate": 1.5412414833467887e-05, + "loss": 0.7148, + "step": 2545 + }, + { + "epoch": 0.12137391843253165, + "grad_norm": 1.8540362119674683, + "learning_rate": 1.540911151249406e-05, + "loss": 0.815, + "step": 2546 + }, + { + "epoch": 0.12142159082783115, + "grad_norm": 1.6913847923278809, + "learning_rate": 1.5405807356950028e-05, + "loss": 1.1134, + "step": 2547 + }, + { + "epoch": 0.12146926322313065, + "grad_norm": 1.5570502281188965, + "learning_rate": 1.5402502367345588e-05, + "loss": 0.9728, + "step": 2548 + }, + { + "epoch": 0.12151693561843015, + "grad_norm": 1.8802764415740967, + "learning_rate": 1.5399196544190668e-05, + "loss": 0.7477, + "step": 2549 + }, + { + "epoch": 0.12156460801372965, + "grad_norm": 1.8015141487121582, + "learning_rate": 1.5395889887995324e-05, + "loss": 0.8685, + "step": 2550 + }, + { + "epoch": 0.12161228040902915, + "grad_norm": 3.0751209259033203, + "learning_rate": 1.5392582399269735e-05, + "loss": 0.7288, + "step": 2551 + }, + { + "epoch": 0.12165995280432865, + "grad_norm": 2.1915316581726074, + "learning_rate": 1.5389274078524217e-05, + "loss": 0.8262, + "step": 2552 + }, + { + "epoch": 0.12170762519962816, + "grad_norm": 1.7919362783432007, + "learning_rate": 1.5385964926269206e-05, + "loss": 0.9529, + "step": 2553 + }, + { + "epoch": 0.12175529759492766, + "grad_norm": 1.8436754941940308, + "learning_rate": 1.5382654943015274e-05, + "loss": 0.7763, + "step": 2554 + }, + { + "epoch": 0.12180296999022716, + "grad_norm": 1.9005080461502075, + "learning_rate": 1.5379344129273112e-05, + "loss": 0.8681, + "step": 2555 + }, + { + "epoch": 0.12185064238552666, + "grad_norm": 1.5322390794754028, + "learning_rate": 1.5376032485553543e-05, + "loss": 0.7952, + "step": 2556 + }, + { + "epoch": 0.12189831478082616, + "grad_norm": 1.448270559310913, + "learning_rate": 1.5372720012367532e-05, + "loss": 0.9949, + "step": 2557 + }, + { + "epoch": 0.12194598717612566, + "grad_norm": 1.3214863538742065, + "learning_rate": 1.5369406710226147e-05, + "loss": 0.8112, + "step": 2558 + }, + { + "epoch": 0.12199365957142516, + "grad_norm": 1.4696680307388306, + "learning_rate": 1.5366092579640604e-05, + "loss": 0.5208, + "step": 2559 + }, + { + "epoch": 0.12204133196672466, + "grad_norm": 1.8419678211212158, + "learning_rate": 1.5362777621122235e-05, + "loss": 0.6654, + "step": 2560 + }, + { + "epoch": 0.12208900436202418, + "grad_norm": 1.5617502927780151, + 
"learning_rate": 1.5359461835182507e-05, + "loss": 0.6595, + "step": 2561 + }, + { + "epoch": 0.12213667675732368, + "grad_norm": 2.430222272872925, + "learning_rate": 1.5356145222333006e-05, + "loss": 0.4169, + "step": 2562 + }, + { + "epoch": 0.12218434915262318, + "grad_norm": 3.5594234466552734, + "learning_rate": 1.5352827783085453e-05, + "loss": 0.4826, + "step": 2563 + }, + { + "epoch": 0.12223202154792268, + "grad_norm": 1.7489187717437744, + "learning_rate": 1.53495095179517e-05, + "loss": 0.7814, + "step": 2564 + }, + { + "epoch": 0.12227969394322218, + "grad_norm": 1.509758710861206, + "learning_rate": 1.5346190427443716e-05, + "loss": 0.8884, + "step": 2565 + }, + { + "epoch": 0.12232736633852168, + "grad_norm": 1.1786763668060303, + "learning_rate": 1.5342870512073605e-05, + "loss": 0.7114, + "step": 2566 + }, + { + "epoch": 0.12237503873382118, + "grad_norm": 3.198537588119507, + "learning_rate": 1.5339549772353595e-05, + "loss": 1.4127, + "step": 2567 + }, + { + "epoch": 0.12242271112912068, + "grad_norm": 1.3353036642074585, + "learning_rate": 1.533622820879604e-05, + "loss": 0.702, + "step": 2568 + }, + { + "epoch": 0.12247038352442019, + "grad_norm": 1.7240386009216309, + "learning_rate": 1.533290582191343e-05, + "loss": 0.5939, + "step": 2569 + }, + { + "epoch": 0.12251805591971969, + "grad_norm": 4.358938694000244, + "learning_rate": 1.5329582612218366e-05, + "loss": 0.565, + "step": 2570 + }, + { + "epoch": 0.12256572831501919, + "grad_norm": 1.288405179977417, + "learning_rate": 1.532625858022359e-05, + "loss": 0.6065, + "step": 2571 + }, + { + "epoch": 0.12261340071031869, + "grad_norm": 1.627985954284668, + "learning_rate": 1.5322933726441963e-05, + "loss": 0.7879, + "step": 2572 + }, + { + "epoch": 0.12266107310561819, + "grad_norm": 1.7537492513656616, + "learning_rate": 1.531960805138648e-05, + "loss": 0.5376, + "step": 2573 + }, + { + "epoch": 0.12270874550091769, + "grad_norm": 1.8633304834365845, + "learning_rate": 1.5316281555570258e-05, + "loss": 0.3876, + "step": 2574 + }, + { + "epoch": 0.12275641789621719, + "grad_norm": 2.610959768295288, + "learning_rate": 1.5312954239506536e-05, + "loss": 0.6771, + "step": 2575 + }, + { + "epoch": 0.12280409029151669, + "grad_norm": 1.6779143810272217, + "learning_rate": 1.530962610370869e-05, + "loss": 0.5416, + "step": 2576 + }, + { + "epoch": 0.1228517626868162, + "grad_norm": 3.0123038291931152, + "learning_rate": 1.530629714869021e-05, + "loss": 0.2452, + "step": 2577 + }, + { + "epoch": 0.1228994350821157, + "grad_norm": 1.634108304977417, + "learning_rate": 1.5302967374964727e-05, + "loss": 0.4345, + "step": 2578 + }, + { + "epoch": 0.1229471074774152, + "grad_norm": 1.60711669921875, + "learning_rate": 1.5299636783045988e-05, + "loss": 0.8213, + "step": 2579 + }, + { + "epoch": 0.1229947798727147, + "grad_norm": 1.858851671218872, + "learning_rate": 1.529630537344787e-05, + "loss": 0.8619, + "step": 2580 + }, + { + "epoch": 0.1230424522680142, + "grad_norm": 1.5693343877792358, + "learning_rate": 1.5292973146684372e-05, + "loss": 0.5578, + "step": 2581 + }, + { + "epoch": 0.1230901246633137, + "grad_norm": 2.4136385917663574, + "learning_rate": 1.5289640103269626e-05, + "loss": 0.4917, + "step": 2582 + }, + { + "epoch": 0.1231377970586132, + "grad_norm": 1.3694087266921997, + "learning_rate": 1.5286306243717884e-05, + "loss": 0.7852, + "step": 2583 + }, + { + "epoch": 0.12318546945391272, + "grad_norm": 1.901773452758789, + "learning_rate": 1.528297156854353e-05, + "loss": 0.5887, + "step": 2584 + }, + { + 
"epoch": 0.12323314184921222, + "grad_norm": 1.2055267095565796, + "learning_rate": 1.5279636078261064e-05, + "loss": 0.5099, + "step": 2585 + }, + { + "epoch": 0.12328081424451172, + "grad_norm": 1.8217684030532837, + "learning_rate": 1.5276299773385122e-05, + "loss": 0.9042, + "step": 2586 + }, + { + "epoch": 0.12332848663981122, + "grad_norm": 1.5840007066726685, + "learning_rate": 1.527296265443046e-05, + "loss": 0.629, + "step": 2587 + }, + { + "epoch": 0.12337615903511072, + "grad_norm": 3.431870222091675, + "learning_rate": 1.5269624721911964e-05, + "loss": 0.5871, + "step": 2588 + }, + { + "epoch": 0.12342383143041022, + "grad_norm": 1.165732741355896, + "learning_rate": 1.5266285976344635e-05, + "loss": 0.2416, + "step": 2589 + }, + { + "epoch": 0.12347150382570972, + "grad_norm": 1.3594448566436768, + "learning_rate": 1.5262946418243617e-05, + "loss": 1.0888, + "step": 2590 + }, + { + "epoch": 0.12351917622100922, + "grad_norm": 6.483646869659424, + "learning_rate": 1.5259606048124162e-05, + "loss": 0.8025, + "step": 2591 + }, + { + "epoch": 0.12356684861630873, + "grad_norm": 1.7944142818450928, + "learning_rate": 1.5256264866501655e-05, + "loss": 0.739, + "step": 2592 + }, + { + "epoch": 0.12361452101160823, + "grad_norm": 1.2228024005889893, + "learning_rate": 1.5252922873891611e-05, + "loss": 0.5483, + "step": 2593 + }, + { + "epoch": 0.12366219340690773, + "grad_norm": 2.679835081100464, + "learning_rate": 1.5249580070809661e-05, + "loss": 0.6809, + "step": 2594 + }, + { + "epoch": 0.12370986580220723, + "grad_norm": 1.1273689270019531, + "learning_rate": 1.5246236457771568e-05, + "loss": 0.5378, + "step": 2595 + }, + { + "epoch": 0.12375753819750673, + "grad_norm": 2.1775577068328857, + "learning_rate": 1.5242892035293216e-05, + "loss": 1.2623, + "step": 2596 + }, + { + "epoch": 0.12380521059280623, + "grad_norm": 2.6037042140960693, + "learning_rate": 1.523954680389061e-05, + "loss": 1.0526, + "step": 2597 + }, + { + "epoch": 0.12385288298810573, + "grad_norm": 1.6117844581604004, + "learning_rate": 1.5236200764079894e-05, + "loss": 0.8293, + "step": 2598 + }, + { + "epoch": 0.12390055538340523, + "grad_norm": 1.9459812641143799, + "learning_rate": 1.5232853916377321e-05, + "loss": 0.7792, + "step": 2599 + }, + { + "epoch": 0.12394822777870475, + "grad_norm": 1.566213607788086, + "learning_rate": 1.5229506261299276e-05, + "loss": 0.7649, + "step": 2600 + }, + { + "epoch": 0.12399590017400425, + "grad_norm": 3.2547764778137207, + "learning_rate": 1.5226157799362267e-05, + "loss": 0.3185, + "step": 2601 + }, + { + "epoch": 0.12404357256930375, + "grad_norm": 1.8356401920318604, + "learning_rate": 1.5222808531082929e-05, + "loss": 0.9227, + "step": 2602 + }, + { + "epoch": 0.12409124496460325, + "grad_norm": 2.2335805892944336, + "learning_rate": 1.521945845697802e-05, + "loss": 0.7411, + "step": 2603 + }, + { + "epoch": 0.12413891735990275, + "grad_norm": 1.2031301259994507, + "learning_rate": 1.521610757756442e-05, + "loss": 0.5848, + "step": 2604 + }, + { + "epoch": 0.12418658975520225, + "grad_norm": 1.1740297079086304, + "learning_rate": 1.521275589335914e-05, + "loss": 0.6071, + "step": 2605 + }, + { + "epoch": 0.12423426215050175, + "grad_norm": 1.8852561712265015, + "learning_rate": 1.5209403404879305e-05, + "loss": 0.6525, + "step": 2606 + }, + { + "epoch": 0.12428193454580125, + "grad_norm": 1.8858394622802734, + "learning_rate": 1.520605011264217e-05, + "loss": 0.9351, + "step": 2607 + }, + { + "epoch": 0.12432960694110076, + "grad_norm": 1.4784988164901733, + 
"learning_rate": 1.5202696017165114e-05, + "loss": 0.5307, + "step": 2608 + }, + { + "epoch": 0.12437727933640026, + "grad_norm": 1.0846854448318481, + "learning_rate": 1.5199341118965641e-05, + "loss": 0.6695, + "step": 2609 + }, + { + "epoch": 0.12442495173169976, + "grad_norm": 1.6776355504989624, + "learning_rate": 1.5195985418561377e-05, + "loss": 1.054, + "step": 2610 + }, + { + "epoch": 0.12447262412699926, + "grad_norm": 1.3577890396118164, + "learning_rate": 1.519262891647007e-05, + "loss": 0.6678, + "step": 2611 + }, + { + "epoch": 0.12452029652229876, + "grad_norm": 1.2777031660079956, + "learning_rate": 1.5189271613209595e-05, + "loss": 0.72, + "step": 2612 + }, + { + "epoch": 0.12456796891759826, + "grad_norm": 1.4406533241271973, + "learning_rate": 1.518591350929795e-05, + "loss": 0.7993, + "step": 2613 + }, + { + "epoch": 0.12461564131289776, + "grad_norm": 1.1992273330688477, + "learning_rate": 1.5182554605253254e-05, + "loss": 0.5169, + "step": 2614 + }, + { + "epoch": 0.12466331370819726, + "grad_norm": 1.0916426181793213, + "learning_rate": 1.5179194901593752e-05, + "loss": 0.4097, + "step": 2615 + }, + { + "epoch": 0.12471098610349678, + "grad_norm": 1.5397124290466309, + "learning_rate": 1.5175834398837814e-05, + "loss": 0.5146, + "step": 2616 + }, + { + "epoch": 0.12475865849879628, + "grad_norm": 1.4506062269210815, + "learning_rate": 1.5172473097503928e-05, + "loss": 0.7577, + "step": 2617 + }, + { + "epoch": 0.12480633089409578, + "grad_norm": 1.6795587539672852, + "learning_rate": 1.516911099811071e-05, + "loss": 0.7902, + "step": 2618 + }, + { + "epoch": 0.12485400328939528, + "grad_norm": 2.53257417678833, + "learning_rate": 1.5165748101176894e-05, + "loss": 0.7611, + "step": 2619 + }, + { + "epoch": 0.12490167568469478, + "grad_norm": 1.3414573669433594, + "learning_rate": 1.5162384407221344e-05, + "loss": 0.6261, + "step": 2620 + }, + { + "epoch": 0.12494934807999428, + "grad_norm": 1.2702453136444092, + "learning_rate": 1.5159019916763044e-05, + "loss": 0.3997, + "step": 2621 + }, + { + "epoch": 0.12499702047529378, + "grad_norm": 1.7113113403320312, + "learning_rate": 1.51556546303211e-05, + "loss": 0.4045, + "step": 2622 + }, + { + "epoch": 0.12504469287059328, + "grad_norm": 2.8945841789245605, + "learning_rate": 1.5152288548414734e-05, + "loss": 1.104, + "step": 2623 + }, + { + "epoch": 0.1250923652658928, + "grad_norm": 1.5277979373931885, + "learning_rate": 1.5148921671563309e-05, + "loss": 1.1081, + "step": 2624 + }, + { + "epoch": 0.12514003766119228, + "grad_norm": 1.7321184873580933, + "learning_rate": 1.514555400028629e-05, + "loss": 0.8964, + "step": 2625 + }, + { + "epoch": 0.1251877100564918, + "grad_norm": 1.2266030311584473, + "learning_rate": 1.5142185535103276e-05, + "loss": 0.6426, + "step": 2626 + }, + { + "epoch": 0.12523538245179128, + "grad_norm": 1.8250890970230103, + "learning_rate": 1.5138816276533994e-05, + "loss": 0.324, + "step": 2627 + }, + { + "epoch": 0.1252830548470908, + "grad_norm": 2.3702685832977295, + "learning_rate": 1.5135446225098279e-05, + "loss": 0.7214, + "step": 2628 + }, + { + "epoch": 0.1253307272423903, + "grad_norm": 1.1414276361465454, + "learning_rate": 1.5132075381316091e-05, + "loss": 0.6886, + "step": 2629 + }, + { + "epoch": 0.1253783996376898, + "grad_norm": 2.723069429397583, + "learning_rate": 1.5128703745707527e-05, + "loss": 1.02, + "step": 2630 + }, + { + "epoch": 0.1254260720329893, + "grad_norm": 2.712341785430908, + "learning_rate": 1.5125331318792787e-05, + "loss": 0.5714, + "step": 2631 + }, + 
{ + "epoch": 0.1254737444282888, + "grad_norm": 2.0006113052368164, + "learning_rate": 1.5121958101092205e-05, + "loss": 0.3528, + "step": 2632 + }, + { + "epoch": 0.1255214168235883, + "grad_norm": 3.0483641624450684, + "learning_rate": 1.5118584093126237e-05, + "loss": 1.1299, + "step": 2633 + }, + { + "epoch": 0.1255690892188878, + "grad_norm": 1.6615673303604126, + "learning_rate": 1.5115209295415454e-05, + "loss": 0.6836, + "step": 2634 + }, + { + "epoch": 0.1256167616141873, + "grad_norm": 1.579200029373169, + "learning_rate": 1.5111833708480555e-05, + "loss": 0.7894, + "step": 2635 + }, + { + "epoch": 0.12566443400948682, + "grad_norm": 2.0592663288116455, + "learning_rate": 1.5108457332842352e-05, + "loss": 0.4932, + "step": 2636 + }, + { + "epoch": 0.1257121064047863, + "grad_norm": 1.6344188451766968, + "learning_rate": 1.5105080169021792e-05, + "loss": 0.7383, + "step": 2637 + }, + { + "epoch": 0.12575977880008582, + "grad_norm": 1.7413655519485474, + "learning_rate": 1.5101702217539933e-05, + "loss": 0.537, + "step": 2638 + }, + { + "epoch": 0.1258074511953853, + "grad_norm": 1.9291362762451172, + "learning_rate": 1.509832347891796e-05, + "loss": 0.9059, + "step": 2639 + }, + { + "epoch": 0.12585512359068482, + "grad_norm": 1.3139088153839111, + "learning_rate": 1.5094943953677175e-05, + "loss": 0.5787, + "step": 2640 + }, + { + "epoch": 0.1259027959859843, + "grad_norm": 1.6078028678894043, + "learning_rate": 1.509156364233901e-05, + "loss": 0.7048, + "step": 2641 + }, + { + "epoch": 0.12595046838128382, + "grad_norm": 1.2953481674194336, + "learning_rate": 1.5088182545425003e-05, + "loss": 0.6951, + "step": 2642 + }, + { + "epoch": 0.1259981407765833, + "grad_norm": 2.0099778175354004, + "learning_rate": 1.5084800663456828e-05, + "loss": 0.7213, + "step": 2643 + }, + { + "epoch": 0.12604581317188282, + "grad_norm": 1.9901890754699707, + "learning_rate": 1.5081417996956277e-05, + "loss": 0.7414, + "step": 2644 + }, + { + "epoch": 0.12609348556718233, + "grad_norm": 1.2115107774734497, + "learning_rate": 1.5078034546445257e-05, + "loss": 0.6107, + "step": 2645 + }, + { + "epoch": 0.12614115796248182, + "grad_norm": 4.351001739501953, + "learning_rate": 1.5074650312445797e-05, + "loss": 0.9591, + "step": 2646 + }, + { + "epoch": 0.12618883035778133, + "grad_norm": 1.9015579223632812, + "learning_rate": 1.5071265295480058e-05, + "loss": 0.6576, + "step": 2647 + }, + { + "epoch": 0.12623650275308082, + "grad_norm": 1.5669375658035278, + "learning_rate": 1.5067879496070305e-05, + "loss": 0.8454, + "step": 2648 + }, + { + "epoch": 0.12628417514838033, + "grad_norm": 2.2157368659973145, + "learning_rate": 1.5064492914738934e-05, + "loss": 0.7875, + "step": 2649 + }, + { + "epoch": 0.12633184754367982, + "grad_norm": 1.331052303314209, + "learning_rate": 1.5061105552008462e-05, + "loss": 0.6967, + "step": 2650 + }, + { + "epoch": 0.12637951993897933, + "grad_norm": 1.203689694404602, + "learning_rate": 1.5057717408401523e-05, + "loss": 0.4825, + "step": 2651 + }, + { + "epoch": 0.12642719233427885, + "grad_norm": 3.1987273693084717, + "learning_rate": 1.5054328484440868e-05, + "loss": 1.0853, + "step": 2652 + }, + { + "epoch": 0.12647486472957833, + "grad_norm": 1.1295535564422607, + "learning_rate": 1.5050938780649382e-05, + "loss": 0.4593, + "step": 2653 + }, + { + "epoch": 0.12652253712487785, + "grad_norm": 2.7517635822296143, + "learning_rate": 1.5047548297550054e-05, + "loss": 0.3619, + "step": 2654 + }, + { + "epoch": 0.12657020952017733, + "grad_norm": 2.8597254753112793, + 
"learning_rate": 1.5044157035666003e-05, + "loss": 0.978, + "step": 2655 + }, + { + "epoch": 0.12661788191547685, + "grad_norm": 2.004417896270752, + "learning_rate": 1.5040764995520469e-05, + "loss": 0.4542, + "step": 2656 + }, + { + "epoch": 0.12666555431077633, + "grad_norm": 2.3083701133728027, + "learning_rate": 1.5037372177636805e-05, + "loss": 0.9238, + "step": 2657 + }, + { + "epoch": 0.12671322670607585, + "grad_norm": 1.2528281211853027, + "learning_rate": 1.5033978582538487e-05, + "loss": 0.8701, + "step": 2658 + }, + { + "epoch": 0.12676089910137534, + "grad_norm": 1.7552685737609863, + "learning_rate": 1.5030584210749117e-05, + "loss": 0.5164, + "step": 2659 + }, + { + "epoch": 0.12680857149667485, + "grad_norm": 1.3574830293655396, + "learning_rate": 1.5027189062792405e-05, + "loss": 0.674, + "step": 2660 + }, + { + "epoch": 0.12685624389197436, + "grad_norm": 1.3860228061676025, + "learning_rate": 1.5023793139192192e-05, + "loss": 0.7454, + "step": 2661 + }, + { + "epoch": 0.12690391628727385, + "grad_norm": 2.419560194015503, + "learning_rate": 1.5020396440472433e-05, + "loss": 1.5646, + "step": 2662 + }, + { + "epoch": 0.12695158868257336, + "grad_norm": 1.2999653816223145, + "learning_rate": 1.5016998967157201e-05, + "loss": 0.8124, + "step": 2663 + }, + { + "epoch": 0.12699926107787285, + "grad_norm": 1.268409252166748, + "learning_rate": 1.5013600719770699e-05, + "loss": 0.6687, + "step": 2664 + }, + { + "epoch": 0.12704693347317236, + "grad_norm": 1.0066592693328857, + "learning_rate": 1.5010201698837232e-05, + "loss": 0.452, + "step": 2665 + }, + { + "epoch": 0.12709460586847185, + "grad_norm": 1.6505389213562012, + "learning_rate": 1.5006801904881236e-05, + "loss": 0.833, + "step": 2666 + }, + { + "epoch": 0.12714227826377136, + "grad_norm": 1.9982913732528687, + "learning_rate": 1.5003401338427271e-05, + "loss": 0.7697, + "step": 2667 + }, + { + "epoch": 0.12718995065907088, + "grad_norm": 1.6034916639328003, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.8527, + "step": 2668 + }, + { + "epoch": 0.12723762305437036, + "grad_norm": 1.9523409605026245, + "learning_rate": 1.4996597890124222e-05, + "loss": 0.8139, + "step": 2669 + }, + { + "epoch": 0.12728529544966988, + "grad_norm": 1.541053056716919, + "learning_rate": 1.4993195009324844e-05, + "loss": 0.7768, + "step": 2670 + }, + { + "epoch": 0.12733296784496936, + "grad_norm": 2.031825065612793, + "learning_rate": 1.4989791358126898e-05, + "loss": 0.761, + "step": 2671 + }, + { + "epoch": 0.12738064024026888, + "grad_norm": 1.5603649616241455, + "learning_rate": 1.4986386937055529e-05, + "loss": 0.9026, + "step": 2672 + }, + { + "epoch": 0.12742831263556836, + "grad_norm": 0.939656138420105, + "learning_rate": 1.4982981746636002e-05, + "loss": 0.2302, + "step": 2673 + }, + { + "epoch": 0.12747598503086788, + "grad_norm": 1.3674044609069824, + "learning_rate": 1.4979575787393713e-05, + "loss": 0.3921, + "step": 2674 + }, + { + "epoch": 0.1275236574261674, + "grad_norm": 1.1105155944824219, + "learning_rate": 1.4976169059854151e-05, + "loss": 0.6595, + "step": 2675 + }, + { + "epoch": 0.12757132982146688, + "grad_norm": 1.2926220893859863, + "learning_rate": 1.4972761564542953e-05, + "loss": 0.7689, + "step": 2676 + }, + { + "epoch": 0.1276190022167664, + "grad_norm": 1.38959538936615, + "learning_rate": 1.4969353301985856e-05, + "loss": 0.8177, + "step": 2677 + }, + { + "epoch": 0.12766667461206588, + "grad_norm": 1.8384571075439453, + "learning_rate": 1.4965944272708717e-05, + "loss": 0.8512, + "step": 2678 
+ }, + { + "epoch": 0.1277143470073654, + "grad_norm": 6.1158623695373535, + "learning_rate": 1.4962534477237516e-05, + "loss": 1.4324, + "step": 2679 + }, + { + "epoch": 0.12776201940266488, + "grad_norm": 1.3320478200912476, + "learning_rate": 1.495912391609835e-05, + "loss": 0.7636, + "step": 2680 + }, + { + "epoch": 0.1278096917979644, + "grad_norm": 1.7159260511398315, + "learning_rate": 1.4955712589817433e-05, + "loss": 0.6336, + "step": 2681 + }, + { + "epoch": 0.12785736419326388, + "grad_norm": 1.2993638515472412, + "learning_rate": 1.4952300498921097e-05, + "loss": 0.553, + "step": 2682 + }, + { + "epoch": 0.1279050365885634, + "grad_norm": 1.6075280904769897, + "learning_rate": 1.4948887643935793e-05, + "loss": 0.863, + "step": 2683 + }, + { + "epoch": 0.1279527089838629, + "grad_norm": 1.3749072551727295, + "learning_rate": 1.494547402538809e-05, + "loss": 0.5569, + "step": 2684 + }, + { + "epoch": 0.1280003813791624, + "grad_norm": 1.7231132984161377, + "learning_rate": 1.4942059643804671e-05, + "loss": 0.8663, + "step": 2685 + }, + { + "epoch": 0.1280480537744619, + "grad_norm": 2.2960851192474365, + "learning_rate": 1.4938644499712342e-05, + "loss": 0.894, + "step": 2686 + }, + { + "epoch": 0.1280957261697614, + "grad_norm": 1.9986116886138916, + "learning_rate": 1.4935228593638029e-05, + "loss": 0.4589, + "step": 2687 + }, + { + "epoch": 0.1281433985650609, + "grad_norm": 1.8226187229156494, + "learning_rate": 1.4931811926108765e-05, + "loss": 0.8236, + "step": 2688 + }, + { + "epoch": 0.1281910709603604, + "grad_norm": 1.6410741806030273, + "learning_rate": 1.4928394497651709e-05, + "loss": 0.8845, + "step": 2689 + }, + { + "epoch": 0.1282387433556599, + "grad_norm": 3.4875731468200684, + "learning_rate": 1.4924976308794134e-05, + "loss": 1.2421, + "step": 2690 + }, + { + "epoch": 0.12828641575095942, + "grad_norm": 2.545445680618286, + "learning_rate": 1.4921557360063432e-05, + "loss": 0.1611, + "step": 2691 + }, + { + "epoch": 0.1283340881462589, + "grad_norm": 4.536085605621338, + "learning_rate": 1.4918137651987111e-05, + "loss": 0.7937, + "step": 2692 + }, + { + "epoch": 0.12838176054155842, + "grad_norm": 1.0321882963180542, + "learning_rate": 1.4914717185092797e-05, + "loss": 0.7083, + "step": 2693 + }, + { + "epoch": 0.1284294329368579, + "grad_norm": 2.099994659423828, + "learning_rate": 1.4911295959908235e-05, + "loss": 0.7027, + "step": 2694 + }, + { + "epoch": 0.12847710533215742, + "grad_norm": 3.259446382522583, + "learning_rate": 1.4907873976961282e-05, + "loss": 0.4003, + "step": 2695 + }, + { + "epoch": 0.1285247777274569, + "grad_norm": 1.6879503726959229, + "learning_rate": 1.4904451236779917e-05, + "loss": 0.95, + "step": 2696 + }, + { + "epoch": 0.12857245012275642, + "grad_norm": 1.7220258712768555, + "learning_rate": 1.4901027739892228e-05, + "loss": 0.8303, + "step": 2697 + }, + { + "epoch": 0.1286201225180559, + "grad_norm": 2.255549907684326, + "learning_rate": 1.4897603486826433e-05, + "loss": 1.409, + "step": 2698 + }, + { + "epoch": 0.12866779491335542, + "grad_norm": 1.5026378631591797, + "learning_rate": 1.4894178478110856e-05, + "loss": 0.7705, + "step": 2699 + }, + { + "epoch": 0.12871546730865494, + "grad_norm": 2.3631350994110107, + "learning_rate": 1.4890752714273936e-05, + "loss": 0.5187, + "step": 2700 + }, + { + "epoch": 0.12876313970395442, + "grad_norm": 1.7667535543441772, + "learning_rate": 1.4887326195844243e-05, + "loss": 0.6735, + "step": 2701 + }, + { + "epoch": 0.12881081209925394, + "grad_norm": 1.1645985841751099, + 
"learning_rate": 1.4883898923350446e-05, + "loss": 0.5765, + "step": 2702 + }, + { + "epoch": 0.12885848449455342, + "grad_norm": 1.3896843194961548, + "learning_rate": 1.488047089732134e-05, + "loss": 0.8479, + "step": 2703 + }, + { + "epoch": 0.12890615688985294, + "grad_norm": 1.2915118932724, + "learning_rate": 1.4877042118285832e-05, + "loss": 0.6082, + "step": 2704 + }, + { + "epoch": 0.12895382928515242, + "grad_norm": 1.1442525386810303, + "learning_rate": 1.487361258677295e-05, + "loss": 0.7111, + "step": 2705 + }, + { + "epoch": 0.12900150168045194, + "grad_norm": 1.5689988136291504, + "learning_rate": 1.487018230331183e-05, + "loss": 0.7491, + "step": 2706 + }, + { + "epoch": 0.12904917407575145, + "grad_norm": 3.11156964302063, + "learning_rate": 1.4866751268431738e-05, + "loss": 0.6102, + "step": 2707 + }, + { + "epoch": 0.12909684647105094, + "grad_norm": 1.5355772972106934, + "learning_rate": 1.4863319482662044e-05, + "loss": 0.6756, + "step": 2708 + }, + { + "epoch": 0.12914451886635045, + "grad_norm": 1.6491599082946777, + "learning_rate": 1.4859886946532235e-05, + "loss": 0.6627, + "step": 2709 + }, + { + "epoch": 0.12919219126164994, + "grad_norm": 1.981755256652832, + "learning_rate": 1.485645366057192e-05, + "loss": 0.6677, + "step": 2710 + }, + { + "epoch": 0.12923986365694945, + "grad_norm": 2.169858932495117, + "learning_rate": 1.4853019625310813e-05, + "loss": 1.0718, + "step": 2711 + }, + { + "epoch": 0.12928753605224894, + "grad_norm": 1.7253448963165283, + "learning_rate": 1.4849584841278755e-05, + "loss": 0.2739, + "step": 2712 + }, + { + "epoch": 0.12933520844754845, + "grad_norm": 1.3531230688095093, + "learning_rate": 1.4846149309005697e-05, + "loss": 0.525, + "step": 2713 + }, + { + "epoch": 0.12938288084284794, + "grad_norm": 1.7004480361938477, + "learning_rate": 1.4842713029021707e-05, + "loss": 0.9376, + "step": 2714 + }, + { + "epoch": 0.12943055323814745, + "grad_norm": 1.5262759923934937, + "learning_rate": 1.4839276001856965e-05, + "loss": 0.6307, + "step": 2715 + }, + { + "epoch": 0.12947822563344696, + "grad_norm": 2.7734172344207764, + "learning_rate": 1.4835838228041773e-05, + "loss": 0.6477, + "step": 2716 + }, + { + "epoch": 0.12952589802874645, + "grad_norm": 1.2846275568008423, + "learning_rate": 1.4832399708106541e-05, + "loss": 0.5238, + "step": 2717 + }, + { + "epoch": 0.12957357042404596, + "grad_norm": 1.1377571821212769, + "learning_rate": 1.4828960442581802e-05, + "loss": 0.6442, + "step": 2718 + }, + { + "epoch": 0.12962124281934545, + "grad_norm": 1.4920954704284668, + "learning_rate": 1.4825520431998191e-05, + "loss": 0.8488, + "step": 2719 + }, + { + "epoch": 0.12966891521464496, + "grad_norm": 1.5849255323410034, + "learning_rate": 1.4822079676886469e-05, + "loss": 0.5422, + "step": 2720 + }, + { + "epoch": 0.12971658760994445, + "grad_norm": 1.5268288850784302, + "learning_rate": 1.4818638177777514e-05, + "loss": 0.6045, + "step": 2721 + }, + { + "epoch": 0.12976426000524396, + "grad_norm": 1.9282076358795166, + "learning_rate": 1.481519593520231e-05, + "loss": 0.8956, + "step": 2722 + }, + { + "epoch": 0.12981193240054348, + "grad_norm": 2.7352709770202637, + "learning_rate": 1.4811752949691958e-05, + "loss": 0.4204, + "step": 2723 + }, + { + "epoch": 0.12985960479584296, + "grad_norm": 1.0934041738510132, + "learning_rate": 1.4808309221777681e-05, + "loss": 0.7061, + "step": 2724 + }, + { + "epoch": 0.12990727719114248, + "grad_norm": 1.905676245689392, + "learning_rate": 1.4804864751990807e-05, + "loss": 0.7281, + "step": 
2725 + }, + { + "epoch": 0.12995494958644196, + "grad_norm": 1.4097931385040283, + "learning_rate": 1.4801419540862779e-05, + "loss": 0.8454, + "step": 2726 + }, + { + "epoch": 0.13000262198174148, + "grad_norm": 1.8227475881576538, + "learning_rate": 1.4797973588925163e-05, + "loss": 0.6977, + "step": 2727 + }, + { + "epoch": 0.13005029437704096, + "grad_norm": 0.8902652263641357, + "learning_rate": 1.479452689670963e-05, + "loss": 0.353, + "step": 2728 + }, + { + "epoch": 0.13009796677234048, + "grad_norm": 4.896397590637207, + "learning_rate": 1.4791079464747973e-05, + "loss": 0.3192, + "step": 2729 + }, + { + "epoch": 0.13014563916763996, + "grad_norm": 1.6081026792526245, + "learning_rate": 1.4787631293572094e-05, + "loss": 0.6647, + "step": 2730 + }, + { + "epoch": 0.13019331156293948, + "grad_norm": 1.456747055053711, + "learning_rate": 1.4784182383714005e-05, + "loss": 0.8112, + "step": 2731 + }, + { + "epoch": 0.130240983958239, + "grad_norm": 1.8662186861038208, + "learning_rate": 1.4780732735705847e-05, + "loss": 0.9066, + "step": 2732 + }, + { + "epoch": 0.13028865635353848, + "grad_norm": 1.8691082000732422, + "learning_rate": 1.4777282350079858e-05, + "loss": 0.3761, + "step": 2733 + }, + { + "epoch": 0.130336328748838, + "grad_norm": 1.9497867822647095, + "learning_rate": 1.4773831227368399e-05, + "loss": 0.7607, + "step": 2734 + }, + { + "epoch": 0.13038400114413748, + "grad_norm": 1.5139490365982056, + "learning_rate": 1.477037936810394e-05, + "loss": 0.8341, + "step": 2735 + }, + { + "epoch": 0.130431673539437, + "grad_norm": 1.5750176906585693, + "learning_rate": 1.4766926772819072e-05, + "loss": 0.7762, + "step": 2736 + }, + { + "epoch": 0.13047934593473648, + "grad_norm": 1.0719943046569824, + "learning_rate": 1.476347344204649e-05, + "loss": 0.6573, + "step": 2737 + }, + { + "epoch": 0.130527018330036, + "grad_norm": 2.591128349304199, + "learning_rate": 1.4760019376319015e-05, + "loss": 1.0311, + "step": 2738 + }, + { + "epoch": 0.1305746907253355, + "grad_norm": 2.266017198562622, + "learning_rate": 1.4756564576169568e-05, + "loss": 0.506, + "step": 2739 + }, + { + "epoch": 0.130622363120635, + "grad_norm": 9.711892127990723, + "learning_rate": 1.4753109042131189e-05, + "loss": 1.0578, + "step": 2740 + }, + { + "epoch": 0.1306700355159345, + "grad_norm": 4.1030988693237305, + "learning_rate": 1.4749652774737031e-05, + "loss": 1.4217, + "step": 2741 + }, + { + "epoch": 0.130717707911234, + "grad_norm": 1.9199711084365845, + "learning_rate": 1.4746195774520365e-05, + "loss": 1.0869, + "step": 2742 + }, + { + "epoch": 0.1307653803065335, + "grad_norm": 1.354860782623291, + "learning_rate": 1.4742738042014563e-05, + "loss": 0.6842, + "step": 2743 + }, + { + "epoch": 0.130813052701833, + "grad_norm": 1.3137390613555908, + "learning_rate": 1.4739279577753122e-05, + "loss": 0.3451, + "step": 2744 + }, + { + "epoch": 0.1308607250971325, + "grad_norm": 4.379244327545166, + "learning_rate": 1.4735820382269652e-05, + "loss": 0.9839, + "step": 2745 + }, + { + "epoch": 0.130908397492432, + "grad_norm": 4.320148468017578, + "learning_rate": 1.4732360456097862e-05, + "loss": 0.2448, + "step": 2746 + }, + { + "epoch": 0.1309560698877315, + "grad_norm": 1.6411312818527222, + "learning_rate": 1.4728899799771591e-05, + "loss": 0.6835, + "step": 2747 + }, + { + "epoch": 0.13100374228303102, + "grad_norm": 2.049267292022705, + "learning_rate": 1.472543841382478e-05, + "loss": 0.5907, + "step": 2748 + }, + { + "epoch": 0.1310514146783305, + "grad_norm": 1.070568561553955, + 
"learning_rate": 1.472197629879148e-05, + "loss": 0.6369, + "step": 2749 + }, + { + "epoch": 0.13109908707363002, + "grad_norm": 0.8059070110321045, + "learning_rate": 1.4718513455205867e-05, + "loss": 0.3309, + "step": 2750 + }, + { + "epoch": 0.1311467594689295, + "grad_norm": 1.245627760887146, + "learning_rate": 1.4715049883602217e-05, + "loss": 0.7835, + "step": 2751 + }, + { + "epoch": 0.13119443186422902, + "grad_norm": 1.0324382781982422, + "learning_rate": 1.4711585584514927e-05, + "loss": 0.5744, + "step": 2752 + }, + { + "epoch": 0.1312421042595285, + "grad_norm": 2.1930136680603027, + "learning_rate": 1.4708120558478501e-05, + "loss": 0.9225, + "step": 2753 + }, + { + "epoch": 0.13128977665482802, + "grad_norm": 3.742619276046753, + "learning_rate": 1.4704654806027558e-05, + "loss": 1.0052, + "step": 2754 + }, + { + "epoch": 0.13133744905012754, + "grad_norm": 2.5243539810180664, + "learning_rate": 1.4701188327696825e-05, + "loss": 0.5848, + "step": 2755 + }, + { + "epoch": 0.13138512144542702, + "grad_norm": 2.040257692337036, + "learning_rate": 1.4697721124021149e-05, + "loss": 0.7903, + "step": 2756 + }, + { + "epoch": 0.13143279384072654, + "grad_norm": 3.4746055603027344, + "learning_rate": 1.4694253195535478e-05, + "loss": 0.9888, + "step": 2757 + }, + { + "epoch": 0.13148046623602602, + "grad_norm": 1.957668423652649, + "learning_rate": 1.469078454277488e-05, + "loss": 1.0133, + "step": 2758 + }, + { + "epoch": 0.13152813863132554, + "grad_norm": 1.347690463066101, + "learning_rate": 1.4687315166274535e-05, + "loss": 0.692, + "step": 2759 + }, + { + "epoch": 0.13157581102662502, + "grad_norm": 2.0428285598754883, + "learning_rate": 1.4683845066569727e-05, + "loss": 0.4666, + "step": 2760 + }, + { + "epoch": 0.13162348342192454, + "grad_norm": 2.2979228496551514, + "learning_rate": 1.4680374244195861e-05, + "loss": 0.8622, + "step": 2761 + }, + { + "epoch": 0.13167115581722402, + "grad_norm": 2.308122158050537, + "learning_rate": 1.467690269968845e-05, + "loss": 1.1501, + "step": 2762 + }, + { + "epoch": 0.13171882821252354, + "grad_norm": 2.7721638679504395, + "learning_rate": 1.4673430433583114e-05, + "loss": 0.4583, + "step": 2763 + }, + { + "epoch": 0.13176650060782305, + "grad_norm": 0.9930713176727295, + "learning_rate": 1.4669957446415588e-05, + "loss": 0.3261, + "step": 2764 + }, + { + "epoch": 0.13181417300312254, + "grad_norm": 1.2571367025375366, + "learning_rate": 1.4666483738721719e-05, + "loss": 0.6704, + "step": 2765 + }, + { + "epoch": 0.13186184539842205, + "grad_norm": 1.1763139963150024, + "learning_rate": 1.4663009311037464e-05, + "loss": 0.7018, + "step": 2766 + }, + { + "epoch": 0.13190951779372154, + "grad_norm": 3.2295870780944824, + "learning_rate": 1.4659534163898894e-05, + "loss": 1.3125, + "step": 2767 + }, + { + "epoch": 0.13195719018902105, + "grad_norm": 2.7077207565307617, + "learning_rate": 1.4656058297842185e-05, + "loss": 0.8727, + "step": 2768 + }, + { + "epoch": 0.13200486258432054, + "grad_norm": 1.6741299629211426, + "learning_rate": 1.465258171340363e-05, + "loss": 0.7755, + "step": 2769 + }, + { + "epoch": 0.13205253497962005, + "grad_norm": 1.9056814908981323, + "learning_rate": 1.464910441111963e-05, + "loss": 0.3511, + "step": 2770 + }, + { + "epoch": 0.13210020737491956, + "grad_norm": 2.130284070968628, + "learning_rate": 1.4645626391526694e-05, + "loss": 0.7884, + "step": 2771 + }, + { + "epoch": 0.13214787977021905, + "grad_norm": 1.098597526550293, + "learning_rate": 1.4642147655161445e-05, + "loss": 0.5392, + "step": 2772 
+ }, + { + "epoch": 0.13219555216551857, + "grad_norm": 1.7876940965652466, + "learning_rate": 1.463866820256062e-05, + "loss": 0.9207, + "step": 2773 + }, + { + "epoch": 0.13224322456081805, + "grad_norm": 3.6280429363250732, + "learning_rate": 1.4635188034261059e-05, + "loss": 0.5778, + "step": 2774 + }, + { + "epoch": 0.13229089695611757, + "grad_norm": 1.6366428136825562, + "learning_rate": 1.4631707150799718e-05, + "loss": 0.65, + "step": 2775 + }, + { + "epoch": 0.13233856935141705, + "grad_norm": 1.342584490776062, + "learning_rate": 1.4628225552713662e-05, + "loss": 0.7516, + "step": 2776 + }, + { + "epoch": 0.13238624174671657, + "grad_norm": 1.5772377252578735, + "learning_rate": 1.4624743240540064e-05, + "loss": 0.6755, + "step": 2777 + }, + { + "epoch": 0.13243391414201608, + "grad_norm": 1.8864952325820923, + "learning_rate": 1.4621260214816211e-05, + "loss": 0.5291, + "step": 2778 + }, + { + "epoch": 0.13248158653731557, + "grad_norm": 1.7388826608657837, + "learning_rate": 1.4617776476079495e-05, + "loss": 0.6162, + "step": 2779 + }, + { + "epoch": 0.13252925893261508, + "grad_norm": 3.1094038486480713, + "learning_rate": 1.461429202486742e-05, + "loss": 0.9037, + "step": 2780 + }, + { + "epoch": 0.13257693132791457, + "grad_norm": 1.1975785493850708, + "learning_rate": 1.4610806861717607e-05, + "loss": 0.4569, + "step": 2781 + }, + { + "epoch": 0.13262460372321408, + "grad_norm": 1.2240186929702759, + "learning_rate": 1.4607320987167778e-05, + "loss": 0.5962, + "step": 2782 + }, + { + "epoch": 0.13267227611851357, + "grad_norm": 1.3248203992843628, + "learning_rate": 1.4603834401755766e-05, + "loss": 0.8201, + "step": 2783 + }, + { + "epoch": 0.13271994851381308, + "grad_norm": 2.2847790718078613, + "learning_rate": 1.4600347106019514e-05, + "loss": 0.3645, + "step": 2784 + }, + { + "epoch": 0.13276762090911257, + "grad_norm": 2.044792413711548, + "learning_rate": 1.4596859100497083e-05, + "loss": 0.8309, + "step": 2785 + }, + { + "epoch": 0.13281529330441208, + "grad_norm": 1.340786099433899, + "learning_rate": 1.4593370385726627e-05, + "loss": 0.6562, + "step": 2786 + }, + { + "epoch": 0.1328629656997116, + "grad_norm": 1.168285846710205, + "learning_rate": 1.4589880962246424e-05, + "loss": 0.638, + "step": 2787 + }, + { + "epoch": 0.13291063809501108, + "grad_norm": 1.4792840480804443, + "learning_rate": 1.4586390830594856e-05, + "loss": 0.581, + "step": 2788 + }, + { + "epoch": 0.1329583104903106, + "grad_norm": 3.6525156497955322, + "learning_rate": 1.4582899991310412e-05, + "loss": 0.3736, + "step": 2789 + }, + { + "epoch": 0.13300598288561008, + "grad_norm": 1.2078745365142822, + "learning_rate": 1.4579408444931696e-05, + "loss": 0.5303, + "step": 2790 + }, + { + "epoch": 0.1330536552809096, + "grad_norm": 2.6930601596832275, + "learning_rate": 1.4575916191997415e-05, + "loss": 1.0866, + "step": 2791 + }, + { + "epoch": 0.13310132767620908, + "grad_norm": 1.617804765701294, + "learning_rate": 1.4572423233046386e-05, + "loss": 0.6667, + "step": 2792 + }, + { + "epoch": 0.1331490000715086, + "grad_norm": 1.266524076461792, + "learning_rate": 1.4568929568617542e-05, + "loss": 0.7276, + "step": 2793 + }, + { + "epoch": 0.1331966724668081, + "grad_norm": 1.9528487920761108, + "learning_rate": 1.4565435199249915e-05, + "loss": 0.7923, + "step": 2794 + }, + { + "epoch": 0.1332443448621076, + "grad_norm": 1.0529392957687378, + "learning_rate": 1.4561940125482652e-05, + "loss": 0.4727, + "step": 2795 + }, + { + "epoch": 0.1332920172574071, + "grad_norm": 1.7384309768676758, + 
"learning_rate": 1.4558444347855008e-05, + "loss": 0.7612, + "step": 2796 + }, + { + "epoch": 0.1333396896527066, + "grad_norm": 1.2260239124298096, + "learning_rate": 1.455494786690634e-05, + "loss": 0.4105, + "step": 2797 + }, + { + "epoch": 0.1333873620480061, + "grad_norm": 1.6818293333053589, + "learning_rate": 1.4551450683176127e-05, + "loss": 0.3869, + "step": 2798 + }, + { + "epoch": 0.1334350344433056, + "grad_norm": 3.6569578647613525, + "learning_rate": 1.4547952797203944e-05, + "loss": 0.473, + "step": 2799 + }, + { + "epoch": 0.1334827068386051, + "grad_norm": 1.3549047708511353, + "learning_rate": 1.454445420952948e-05, + "loss": 0.8176, + "step": 2800 + }, + { + "epoch": 0.1335303792339046, + "grad_norm": 2.233328104019165, + "learning_rate": 1.4540954920692528e-05, + "loss": 0.6295, + "step": 2801 + }, + { + "epoch": 0.1335780516292041, + "grad_norm": 1.372320294380188, + "learning_rate": 1.4537454931232994e-05, + "loss": 0.6335, + "step": 2802 + }, + { + "epoch": 0.13362572402450362, + "grad_norm": 1.1725353002548218, + "learning_rate": 1.4533954241690891e-05, + "loss": 0.7121, + "step": 2803 + }, + { + "epoch": 0.1336733964198031, + "grad_norm": 1.5747082233428955, + "learning_rate": 1.453045285260634e-05, + "loss": 1.0418, + "step": 2804 + }, + { + "epoch": 0.13372106881510262, + "grad_norm": 1.311205267906189, + "learning_rate": 1.452695076451957e-05, + "loss": 0.9077, + "step": 2805 + }, + { + "epoch": 0.1337687412104021, + "grad_norm": 1.7655737400054932, + "learning_rate": 1.4523447977970913e-05, + "loss": 0.9344, + "step": 2806 + }, + { + "epoch": 0.13381641360570162, + "grad_norm": 1.429305076599121, + "learning_rate": 1.451994449350082e-05, + "loss": 1.05, + "step": 2807 + }, + { + "epoch": 0.1338640860010011, + "grad_norm": 1.9649168252944946, + "learning_rate": 1.4516440311649835e-05, + "loss": 0.6841, + "step": 2808 + }, + { + "epoch": 0.13391175839630062, + "grad_norm": 1.6590079069137573, + "learning_rate": 1.451293543295862e-05, + "loss": 0.7886, + "step": 2809 + }, + { + "epoch": 0.13395943079160014, + "grad_norm": 1.488486409187317, + "learning_rate": 1.450942985796794e-05, + "loss": 0.6772, + "step": 2810 + }, + { + "epoch": 0.13400710318689962, + "grad_norm": 4.184868335723877, + "learning_rate": 1.4505923587218673e-05, + "loss": 0.2362, + "step": 2811 + }, + { + "epoch": 0.13405477558219914, + "grad_norm": 1.7442996501922607, + "learning_rate": 1.4502416621251798e-05, + "loss": 0.7231, + "step": 2812 + }, + { + "epoch": 0.13410244797749862, + "grad_norm": 4.76901912689209, + "learning_rate": 1.4498908960608407e-05, + "loss": 0.3126, + "step": 2813 + }, + { + "epoch": 0.13415012037279814, + "grad_norm": 1.2491589784622192, + "learning_rate": 1.449540060582969e-05, + "loss": 0.4685, + "step": 2814 + }, + { + "epoch": 0.13419779276809762, + "grad_norm": 1.6096590757369995, + "learning_rate": 1.4491891557456956e-05, + "loss": 0.9791, + "step": 2815 + }, + { + "epoch": 0.13424546516339714, + "grad_norm": 4.615392684936523, + "learning_rate": 1.448838181603161e-05, + "loss": 0.549, + "step": 2816 + }, + { + "epoch": 0.13429313755869662, + "grad_norm": 2.280949354171753, + "learning_rate": 1.4484871382095172e-05, + "loss": 0.9488, + "step": 2817 + }, + { + "epoch": 0.13434080995399614, + "grad_norm": 1.3054804801940918, + "learning_rate": 1.4481360256189266e-05, + "loss": 0.5984, + "step": 2818 + }, + { + "epoch": 0.13438848234929565, + "grad_norm": 1.9830986261367798, + "learning_rate": 1.4477848438855619e-05, + "loss": 0.5297, + "step": 2819 + }, + { + 
"epoch": 0.13443615474459514, + "grad_norm": 3.2875237464904785, + "learning_rate": 1.447433593063607e-05, + "loss": 1.2456, + "step": 2820 + }, + { + "epoch": 0.13448382713989465, + "grad_norm": 2.07106614112854, + "learning_rate": 1.4470822732072567e-05, + "loss": 0.749, + "step": 2821 + }, + { + "epoch": 0.13453149953519414, + "grad_norm": 3.811577320098877, + "learning_rate": 1.4467308843707155e-05, + "loss": 0.4224, + "step": 2822 + }, + { + "epoch": 0.13457917193049365, + "grad_norm": 1.4075751304626465, + "learning_rate": 1.4463794266081994e-05, + "loss": 0.6388, + "step": 2823 + }, + { + "epoch": 0.13462684432579314, + "grad_norm": 1.4577556848526, + "learning_rate": 1.4460278999739346e-05, + "loss": 0.6949, + "step": 2824 + }, + { + "epoch": 0.13467451672109265, + "grad_norm": 2.3330719470977783, + "learning_rate": 1.445676304522158e-05, + "loss": 0.9201, + "step": 2825 + }, + { + "epoch": 0.13472218911639217, + "grad_norm": 4.043055534362793, + "learning_rate": 1.445324640307117e-05, + "loss": 0.6145, + "step": 2826 + }, + { + "epoch": 0.13476986151169165, + "grad_norm": 1.8382353782653809, + "learning_rate": 1.4449729073830703e-05, + "loss": 0.8629, + "step": 2827 + }, + { + "epoch": 0.13481753390699117, + "grad_norm": 0.980821967124939, + "learning_rate": 1.444621105804286e-05, + "loss": 0.8098, + "step": 2828 + }, + { + "epoch": 0.13486520630229065, + "grad_norm": 1.6699111461639404, + "learning_rate": 1.4442692356250443e-05, + "loss": 0.6263, + "step": 2829 + }, + { + "epoch": 0.13491287869759017, + "grad_norm": 2.7426393032073975, + "learning_rate": 1.4439172968996343e-05, + "loss": 0.8625, + "step": 2830 + }, + { + "epoch": 0.13496055109288965, + "grad_norm": 1.8165969848632812, + "learning_rate": 1.4435652896823565e-05, + "loss": 0.6557, + "step": 2831 + }, + { + "epoch": 0.13500822348818917, + "grad_norm": 1.8837940692901611, + "learning_rate": 1.4432132140275229e-05, + "loss": 0.9995, + "step": 2832 + }, + { + "epoch": 0.13505589588348865, + "grad_norm": 1.8482041358947754, + "learning_rate": 1.4428610699894542e-05, + "loss": 0.7043, + "step": 2833 + }, + { + "epoch": 0.13510356827878817, + "grad_norm": 1.199134111404419, + "learning_rate": 1.442508857622483e-05, + "loss": 0.7605, + "step": 2834 + }, + { + "epoch": 0.13515124067408768, + "grad_norm": 3.1443140506744385, + "learning_rate": 1.4421565769809523e-05, + "loss": 0.7874, + "step": 2835 + }, + { + "epoch": 0.13519891306938717, + "grad_norm": 1.8745898008346558, + "learning_rate": 1.4418042281192151e-05, + "loss": 0.8903, + "step": 2836 + }, + { + "epoch": 0.13524658546468668, + "grad_norm": 1.7884939908981323, + "learning_rate": 1.4414518110916352e-05, + "loss": 0.4199, + "step": 2837 + }, + { + "epoch": 0.13529425785998617, + "grad_norm": 0.8712854981422424, + "learning_rate": 1.4410993259525868e-05, + "loss": 0.1338, + "step": 2838 + }, + { + "epoch": 0.13534193025528568, + "grad_norm": 0.9771924018859863, + "learning_rate": 1.4407467727564548e-05, + "loss": 0.5966, + "step": 2839 + }, + { + "epoch": 0.13538960265058517, + "grad_norm": 1.6014736890792847, + "learning_rate": 1.4403941515576344e-05, + "loss": 0.7721, + "step": 2840 + }, + { + "epoch": 0.13543727504588468, + "grad_norm": 3.413545608520508, + "learning_rate": 1.4400414624105319e-05, + "loss": 1.0529, + "step": 2841 + }, + { + "epoch": 0.1354849474411842, + "grad_norm": 1.4031412601470947, + "learning_rate": 1.4396887053695631e-05, + "loss": 0.804, + "step": 2842 + }, + { + "epoch": 0.13553261983648368, + "grad_norm": 1.7395211458206177, + 
"learning_rate": 1.439335880489155e-05, + "loss": 0.8361, + "step": 2843 + }, + { + "epoch": 0.1355802922317832, + "grad_norm": 1.5222361087799072, + "learning_rate": 1.4389829878237451e-05, + "loss": 0.5949, + "step": 2844 + }, + { + "epoch": 0.13562796462708268, + "grad_norm": 1.2143298387527466, + "learning_rate": 1.438630027427781e-05, + "loss": 0.6202, + "step": 2845 + }, + { + "epoch": 0.1356756370223822, + "grad_norm": 2.712756395339966, + "learning_rate": 1.4382769993557202e-05, + "loss": 1.0577, + "step": 2846 + }, + { + "epoch": 0.13572330941768168, + "grad_norm": 2.0115554332733154, + "learning_rate": 1.4379239036620319e-05, + "loss": 0.6218, + "step": 2847 + }, + { + "epoch": 0.1357709818129812, + "grad_norm": 1.4737643003463745, + "learning_rate": 1.4375707404011949e-05, + "loss": 0.7575, + "step": 2848 + }, + { + "epoch": 0.13581865420828068, + "grad_norm": 1.5066097974777222, + "learning_rate": 1.4372175096276988e-05, + "loss": 0.8696, + "step": 2849 + }, + { + "epoch": 0.1358663266035802, + "grad_norm": 1.6347986459732056, + "learning_rate": 1.4368642113960436e-05, + "loss": 0.7425, + "step": 2850 + }, + { + "epoch": 0.1359139989988797, + "grad_norm": 2.0242161750793457, + "learning_rate": 1.4365108457607396e-05, + "loss": 0.8388, + "step": 2851 + }, + { + "epoch": 0.1359616713941792, + "grad_norm": 1.824858546257019, + "learning_rate": 1.4361574127763069e-05, + "loss": 0.5064, + "step": 2852 + }, + { + "epoch": 0.1360093437894787, + "grad_norm": 2.1713826656341553, + "learning_rate": 1.4358039124972771e-05, + "loss": 0.6825, + "step": 2853 + }, + { + "epoch": 0.1360570161847782, + "grad_norm": 1.1008491516113281, + "learning_rate": 1.4354503449781914e-05, + "loss": 0.1381, + "step": 2854 + }, + { + "epoch": 0.1361046885800777, + "grad_norm": 3.2490859031677246, + "learning_rate": 1.435096710273602e-05, + "loss": 0.3514, + "step": 2855 + }, + { + "epoch": 0.1361523609753772, + "grad_norm": 1.2097240686416626, + "learning_rate": 1.4347430084380705e-05, + "loss": 0.6101, + "step": 2856 + }, + { + "epoch": 0.1362000333706767, + "grad_norm": 1.6209847927093506, + "learning_rate": 1.4343892395261699e-05, + "loss": 0.6709, + "step": 2857 + }, + { + "epoch": 0.13624770576597622, + "grad_norm": 1.2333697080612183, + "learning_rate": 1.434035403592483e-05, + "loss": 0.7103, + "step": 2858 + }, + { + "epoch": 0.1362953781612757, + "grad_norm": 4.612729549407959, + "learning_rate": 1.4336815006916032e-05, + "loss": 0.4566, + "step": 2859 + }, + { + "epoch": 0.13634305055657522, + "grad_norm": 0.8605471849441528, + "learning_rate": 1.4333275308781338e-05, + "loss": 0.5919, + "step": 2860 + }, + { + "epoch": 0.1363907229518747, + "grad_norm": 1.9653245210647583, + "learning_rate": 1.4329734942066889e-05, + "loss": 0.6842, + "step": 2861 + }, + { + "epoch": 0.13643839534717422, + "grad_norm": 3.233825922012329, + "learning_rate": 1.4326193907318924e-05, + "loss": 0.6059, + "step": 2862 + }, + { + "epoch": 0.1364860677424737, + "grad_norm": 3.7866153717041016, + "learning_rate": 1.432265220508379e-05, + "loss": 0.4458, + "step": 2863 + }, + { + "epoch": 0.13653374013777322, + "grad_norm": 1.7378908395767212, + "learning_rate": 1.4319109835907936e-05, + "loss": 0.5611, + "step": 2864 + }, + { + "epoch": 0.13658141253307274, + "grad_norm": 1.5968841314315796, + "learning_rate": 1.4315566800337914e-05, + "loss": 0.9656, + "step": 2865 + }, + { + "epoch": 0.13662908492837222, + "grad_norm": 2.901979923248291, + "learning_rate": 1.4312023098920374e-05, + "loss": 0.6457, + "step": 2866 + }, + { 
+ "epoch": 0.13667675732367174, + "grad_norm": 2.657661199569702, + "learning_rate": 1.430847873220208e-05, + "loss": 0.5618, + "step": 2867 + }, + { + "epoch": 0.13672442971897122, + "grad_norm": 2.0658679008483887, + "learning_rate": 1.4304933700729882e-05, + "loss": 0.6751, + "step": 2868 + }, + { + "epoch": 0.13677210211427074, + "grad_norm": 1.0008745193481445, + "learning_rate": 1.4301388005050746e-05, + "loss": 0.2648, + "step": 2869 + }, + { + "epoch": 0.13681977450957022, + "grad_norm": 1.9280773401260376, + "learning_rate": 1.4297841645711738e-05, + "loss": 0.2906, + "step": 2870 + }, + { + "epoch": 0.13686744690486974, + "grad_norm": 1.3655898571014404, + "learning_rate": 1.4294294623260024e-05, + "loss": 0.6692, + "step": 2871 + }, + { + "epoch": 0.13691511930016922, + "grad_norm": 4.982389450073242, + "learning_rate": 1.429074693824287e-05, + "loss": 0.8454, + "step": 2872 + }, + { + "epoch": 0.13696279169546874, + "grad_norm": 2.2366626262664795, + "learning_rate": 1.428719859120765e-05, + "loss": 0.303, + "step": 2873 + }, + { + "epoch": 0.13701046409076825, + "grad_norm": 3.9326047897338867, + "learning_rate": 1.428364958270184e-05, + "loss": 0.3676, + "step": 2874 + }, + { + "epoch": 0.13705813648606774, + "grad_norm": 1.3298144340515137, + "learning_rate": 1.428009991327301e-05, + "loss": 0.3698, + "step": 2875 + }, + { + "epoch": 0.13710580888136725, + "grad_norm": 2.202796220779419, + "learning_rate": 1.4276549583468842e-05, + "loss": 0.5087, + "step": 2876 + }, + { + "epoch": 0.13715348127666674, + "grad_norm": 6.823620796203613, + "learning_rate": 1.4272998593837108e-05, + "loss": 0.5663, + "step": 2877 + }, + { + "epoch": 0.13720115367196625, + "grad_norm": 1.474297046661377, + "learning_rate": 1.42694469449257e-05, + "loss": 0.8557, + "step": 2878 + }, + { + "epoch": 0.13724882606726574, + "grad_norm": 2.2157840728759766, + "learning_rate": 1.4265894637282594e-05, + "loss": 1.0354, + "step": 2879 + }, + { + "epoch": 0.13729649846256525, + "grad_norm": 1.6916190385818481, + "learning_rate": 1.4262341671455873e-05, + "loss": 0.8873, + "step": 2880 + }, + { + "epoch": 0.13734417085786477, + "grad_norm": 1.2153040170669556, + "learning_rate": 1.4258788047993726e-05, + "loss": 0.822, + "step": 2881 + }, + { + "epoch": 0.13739184325316425, + "grad_norm": 1.2191141843795776, + "learning_rate": 1.4255233767444443e-05, + "loss": 0.7823, + "step": 2882 + }, + { + "epoch": 0.13743951564846377, + "grad_norm": 2.412100315093994, + "learning_rate": 1.4251678830356408e-05, + "loss": 0.8249, + "step": 2883 + }, + { + "epoch": 0.13748718804376325, + "grad_norm": 1.3940682411193848, + "learning_rate": 1.4248123237278116e-05, + "loss": 0.777, + "step": 2884 + }, + { + "epoch": 0.13753486043906277, + "grad_norm": 0.8108394742012024, + "learning_rate": 1.4244566988758152e-05, + "loss": 0.3257, + "step": 2885 + }, + { + "epoch": 0.13758253283436225, + "grad_norm": 1.5383516550064087, + "learning_rate": 1.4241010085345216e-05, + "loss": 0.806, + "step": 2886 + }, + { + "epoch": 0.13763020522966177, + "grad_norm": 1.469811201095581, + "learning_rate": 1.4237452527588094e-05, + "loss": 0.8608, + "step": 2887 + }, + { + "epoch": 0.13767787762496125, + "grad_norm": 2.5900049209594727, + "learning_rate": 1.4233894316035683e-05, + "loss": 0.9492, + "step": 2888 + }, + { + "epoch": 0.13772555002026077, + "grad_norm": 2.3263185024261475, + "learning_rate": 1.4230335451236988e-05, + "loss": 0.4258, + "step": 2889 + }, + { + "epoch": 0.13777322241556028, + "grad_norm": 3.216356039047241, + 
"learning_rate": 1.422677593374109e-05, + "loss": 0.3797, + "step": 2890 + }, + { + "epoch": 0.13782089481085977, + "grad_norm": 1.6025211811065674, + "learning_rate": 1.4223215764097194e-05, + "loss": 0.6416, + "step": 2891 + }, + { + "epoch": 0.13786856720615928, + "grad_norm": 1.0096967220306396, + "learning_rate": 1.4219654942854598e-05, + "loss": 0.4615, + "step": 2892 + }, + { + "epoch": 0.13791623960145877, + "grad_norm": 2.9632620811462402, + "learning_rate": 1.4216093470562698e-05, + "loss": 0.3381, + "step": 2893 + }, + { + "epoch": 0.13796391199675828, + "grad_norm": 1.9774034023284912, + "learning_rate": 1.4212531347770987e-05, + "loss": 0.7165, + "step": 2894 + }, + { + "epoch": 0.13801158439205777, + "grad_norm": 1.7228517532348633, + "learning_rate": 1.4208968575029077e-05, + "loss": 0.6596, + "step": 2895 + }, + { + "epoch": 0.13805925678735728, + "grad_norm": 2.000681161880493, + "learning_rate": 1.4205405152886658e-05, + "loss": 0.7726, + "step": 2896 + }, + { + "epoch": 0.1381069291826568, + "grad_norm": 1.7331486940383911, + "learning_rate": 1.4201841081893531e-05, + "loss": 0.5399, + "step": 2897 + }, + { + "epoch": 0.13815460157795628, + "grad_norm": 4.172076225280762, + "learning_rate": 1.4198276362599597e-05, + "loss": 1.0465, + "step": 2898 + }, + { + "epoch": 0.1382022739732558, + "grad_norm": 1.299048900604248, + "learning_rate": 1.4194710995554852e-05, + "loss": 0.8047, + "step": 2899 + }, + { + "epoch": 0.13824994636855528, + "grad_norm": 1.7845855951309204, + "learning_rate": 1.4191144981309397e-05, + "loss": 0.7289, + "step": 2900 + }, + { + "epoch": 0.1382976187638548, + "grad_norm": 1.5964035987854004, + "learning_rate": 1.4187578320413434e-05, + "loss": 0.7735, + "step": 2901 + }, + { + "epoch": 0.13834529115915428, + "grad_norm": 2.142331123352051, + "learning_rate": 1.4184011013417258e-05, + "loss": 0.5967, + "step": 2902 + }, + { + "epoch": 0.1383929635544538, + "grad_norm": 2.6684093475341797, + "learning_rate": 1.4180443060871269e-05, + "loss": 0.7027, + "step": 2903 + }, + { + "epoch": 0.13844063594975328, + "grad_norm": 1.0961874723434448, + "learning_rate": 1.4176874463325967e-05, + "loss": 0.664, + "step": 2904 + }, + { + "epoch": 0.1384883083450528, + "grad_norm": 2.229717969894409, + "learning_rate": 1.4173305221331953e-05, + "loss": 0.7163, + "step": 2905 + }, + { + "epoch": 0.1385359807403523, + "grad_norm": 1.9411121606826782, + "learning_rate": 1.4169735335439914e-05, + "loss": 0.7307, + "step": 2906 + }, + { + "epoch": 0.1385836531356518, + "grad_norm": 1.4852358102798462, + "learning_rate": 1.4166164806200655e-05, + "loss": 0.7537, + "step": 2907 + }, + { + "epoch": 0.1386313255309513, + "grad_norm": 5.4812092781066895, + "learning_rate": 1.416259363416507e-05, + "loss": 0.5604, + "step": 2908 + }, + { + "epoch": 0.1386789979262508, + "grad_norm": 1.5105454921722412, + "learning_rate": 1.415902181988415e-05, + "loss": 0.876, + "step": 2909 + }, + { + "epoch": 0.1387266703215503, + "grad_norm": 2.848024845123291, + "learning_rate": 1.4155449363908997e-05, + "loss": 1.2413, + "step": 2910 + }, + { + "epoch": 0.1387743427168498, + "grad_norm": 1.7367385625839233, + "learning_rate": 1.4151876266790801e-05, + "loss": 0.5624, + "step": 2911 + }, + { + "epoch": 0.1388220151121493, + "grad_norm": 1.502943992614746, + "learning_rate": 1.414830252908085e-05, + "loss": 0.6341, + "step": 2912 + }, + { + "epoch": 0.13886968750744882, + "grad_norm": 1.7613787651062012, + "learning_rate": 1.414472815133054e-05, + "loss": 0.8833, + "step": 2913 + }, + { + 
"epoch": 0.1389173599027483, + "grad_norm": 1.4656928777694702, + "learning_rate": 1.4141153134091357e-05, + "loss": 0.7196, + "step": 2914 + }, + { + "epoch": 0.13896503229804782, + "grad_norm": 2.8903956413269043, + "learning_rate": 1.4137577477914892e-05, + "loss": 1.1933, + "step": 2915 + }, + { + "epoch": 0.1390127046933473, + "grad_norm": 2.127920627593994, + "learning_rate": 1.4134001183352833e-05, + "loss": 0.6954, + "step": 2916 + }, + { + "epoch": 0.13906037708864682, + "grad_norm": 3.422004222869873, + "learning_rate": 1.4130424250956958e-05, + "loss": 0.7146, + "step": 2917 + }, + { + "epoch": 0.1391080494839463, + "grad_norm": 1.8332690000534058, + "learning_rate": 1.4126846681279161e-05, + "loss": 0.9339, + "step": 2918 + }, + { + "epoch": 0.13915572187924583, + "grad_norm": 2.653010368347168, + "learning_rate": 1.4123268474871417e-05, + "loss": 1.1642, + "step": 2919 + }, + { + "epoch": 0.1392033942745453, + "grad_norm": 2.307483196258545, + "learning_rate": 1.4119689632285812e-05, + "loss": 0.8394, + "step": 2920 + }, + { + "epoch": 0.13925106666984483, + "grad_norm": 1.6198796033859253, + "learning_rate": 1.4116110154074518e-05, + "loss": 0.862, + "step": 2921 + }, + { + "epoch": 0.13929873906514434, + "grad_norm": 4.71099853515625, + "learning_rate": 1.4112530040789816e-05, + "loss": 0.688, + "step": 2922 + }, + { + "epoch": 0.13934641146044383, + "grad_norm": 3.7185537815093994, + "learning_rate": 1.4108949292984077e-05, + "loss": 0.9517, + "step": 2923 + }, + { + "epoch": 0.13939408385574334, + "grad_norm": 2.543074369430542, + "learning_rate": 1.410536791120978e-05, + "loss": 1.0582, + "step": 2924 + }, + { + "epoch": 0.13944175625104283, + "grad_norm": 1.5880959033966064, + "learning_rate": 1.410178589601949e-05, + "loss": 0.9458, + "step": 2925 + }, + { + "epoch": 0.13948942864634234, + "grad_norm": 1.6787793636322021, + "learning_rate": 1.4098203247965876e-05, + "loss": 0.7943, + "step": 2926 + }, + { + "epoch": 0.13953710104164183, + "grad_norm": 1.3269317150115967, + "learning_rate": 1.4094619967601707e-05, + "loss": 0.711, + "step": 2927 + }, + { + "epoch": 0.13958477343694134, + "grad_norm": 1.2400355339050293, + "learning_rate": 1.409103605547984e-05, + "loss": 0.6322, + "step": 2928 + }, + { + "epoch": 0.13963244583224085, + "grad_norm": 1.814907431602478, + "learning_rate": 1.4087451512153241e-05, + "loss": 0.6028, + "step": 2929 + }, + { + "epoch": 0.13968011822754034, + "grad_norm": 2.8301570415496826, + "learning_rate": 1.4083866338174964e-05, + "loss": 1.2206, + "step": 2930 + }, + { + "epoch": 0.13972779062283985, + "grad_norm": 5.362226963043213, + "learning_rate": 1.4080280534098168e-05, + "loss": 0.9202, + "step": 2931 + }, + { + "epoch": 0.13977546301813934, + "grad_norm": 1.8723117113113403, + "learning_rate": 1.4076694100476104e-05, + "loss": 0.7996, + "step": 2932 + }, + { + "epoch": 0.13982313541343885, + "grad_norm": 0.9941101670265198, + "learning_rate": 1.4073107037862124e-05, + "loss": 0.4938, + "step": 2933 + }, + { + "epoch": 0.13987080780873834, + "grad_norm": 1.369470477104187, + "learning_rate": 1.4069519346809673e-05, + "loss": 0.7486, + "step": 2934 + }, + { + "epoch": 0.13991848020403785, + "grad_norm": 1.4776984453201294, + "learning_rate": 1.4065931027872293e-05, + "loss": 0.7235, + "step": 2935 + }, + { + "epoch": 0.13996615259933734, + "grad_norm": 3.2828266620635986, + "learning_rate": 1.4062342081603626e-05, + "loss": 0.5507, + "step": 2936 + }, + { + "epoch": 0.14001382499463685, + "grad_norm": 3.0249550342559814, + 
"learning_rate": 1.405875250855741e-05, + "loss": 1.2229, + "step": 2937 + }, + { + "epoch": 0.14006149738993637, + "grad_norm": 3.70796275138855, + "learning_rate": 1.4055162309287477e-05, + "loss": 0.463, + "step": 2938 + }, + { + "epoch": 0.14010916978523585, + "grad_norm": 1.8504685163497925, + "learning_rate": 1.4051571484347766e-05, + "loss": 0.7662, + "step": 2939 + }, + { + "epoch": 0.14015684218053537, + "grad_norm": 1.9240682125091553, + "learning_rate": 1.4047980034292292e-05, + "loss": 0.963, + "step": 2940 + }, + { + "epoch": 0.14020451457583485, + "grad_norm": 1.343772292137146, + "learning_rate": 1.4044387959675187e-05, + "loss": 0.8986, + "step": 2941 + }, + { + "epoch": 0.14025218697113437, + "grad_norm": 1.0899327993392944, + "learning_rate": 1.4040795261050671e-05, + "loss": 0.5131, + "step": 2942 + }, + { + "epoch": 0.14029985936643385, + "grad_norm": 3.6432085037231445, + "learning_rate": 1.4037201938973057e-05, + "loss": 0.4439, + "step": 2943 + }, + { + "epoch": 0.14034753176173337, + "grad_norm": 2.634822368621826, + "learning_rate": 1.4033607993996758e-05, + "loss": 0.8628, + "step": 2944 + }, + { + "epoch": 0.14039520415703288, + "grad_norm": 2.0911800861358643, + "learning_rate": 1.4030013426676283e-05, + "loss": 0.5768, + "step": 2945 + }, + { + "epoch": 0.14044287655233237, + "grad_norm": 1.9920333623886108, + "learning_rate": 1.4026418237566239e-05, + "loss": 0.9166, + "step": 2946 + }, + { + "epoch": 0.14049054894763188, + "grad_norm": 1.877659559249878, + "learning_rate": 1.4022822427221325e-05, + "loss": 0.8295, + "step": 2947 + }, + { + "epoch": 0.14053822134293137, + "grad_norm": 2.5049636363983154, + "learning_rate": 1.4019225996196335e-05, + "loss": 0.4287, + "step": 2948 + }, + { + "epoch": 0.14058589373823088, + "grad_norm": 2.9660418033599854, + "learning_rate": 1.4015628945046169e-05, + "loss": 0.5802, + "step": 2949 + }, + { + "epoch": 0.14063356613353037, + "grad_norm": 2.9606802463531494, + "learning_rate": 1.4012031274325808e-05, + "loss": 0.5012, + "step": 2950 + }, + { + "epoch": 0.14068123852882988, + "grad_norm": 3.091475248336792, + "learning_rate": 1.4008432984590333e-05, + "loss": 1.3912, + "step": 2951 + }, + { + "epoch": 0.1407289109241294, + "grad_norm": 0.995457649230957, + "learning_rate": 1.4004834076394931e-05, + "loss": 0.3238, + "step": 2952 + }, + { + "epoch": 0.14077658331942888, + "grad_norm": 2.928942918777466, + "learning_rate": 1.4001234550294873e-05, + "loss": 0.1468, + "step": 2953 + }, + { + "epoch": 0.1408242557147284, + "grad_norm": 2.151156425476074, + "learning_rate": 1.3997634406845526e-05, + "loss": 0.6447, + "step": 2954 + }, + { + "epoch": 0.14087192811002788, + "grad_norm": 1.677695631980896, + "learning_rate": 1.3994033646602359e-05, + "loss": 0.8535, + "step": 2955 + }, + { + "epoch": 0.1409196005053274, + "grad_norm": 1.9230855703353882, + "learning_rate": 1.3990432270120933e-05, + "loss": 0.8298, + "step": 2956 + }, + { + "epoch": 0.14096727290062688, + "grad_norm": 1.4534255266189575, + "learning_rate": 1.3986830277956899e-05, + "loss": 0.8519, + "step": 2957 + }, + { + "epoch": 0.1410149452959264, + "grad_norm": 1.0360122919082642, + "learning_rate": 1.3983227670666011e-05, + "loss": 0.722, + "step": 2958 + }, + { + "epoch": 0.14106261769122588, + "grad_norm": 1.6863003969192505, + "learning_rate": 1.3979624448804112e-05, + "loss": 0.7972, + "step": 2959 + }, + { + "epoch": 0.1411102900865254, + "grad_norm": 1.8406460285186768, + "learning_rate": 1.3976020612927141e-05, + "loss": 0.6768, + "step": 2960 + 
}, + { + "epoch": 0.1411579624818249, + "grad_norm": 2.9295125007629395, + "learning_rate": 1.3972416163591138e-05, + "loss": 0.3901, + "step": 2961 + }, + { + "epoch": 0.1412056348771244, + "grad_norm": 1.7305203676223755, + "learning_rate": 1.3968811101352226e-05, + "loss": 0.5098, + "step": 2962 + }, + { + "epoch": 0.1412533072724239, + "grad_norm": 1.1533738374710083, + "learning_rate": 1.3965205426766632e-05, + "loss": 0.5401, + "step": 2963 + }, + { + "epoch": 0.1413009796677234, + "grad_norm": 1.4826109409332275, + "learning_rate": 1.3961599140390675e-05, + "loss": 0.6049, + "step": 2964 + }, + { + "epoch": 0.1413486520630229, + "grad_norm": 1.4037529230117798, + "learning_rate": 1.3957992242780768e-05, + "loss": 0.6412, + "step": 2965 + }, + { + "epoch": 0.1413963244583224, + "grad_norm": 1.8146154880523682, + "learning_rate": 1.3954384734493418e-05, + "loss": 0.864, + "step": 2966 + }, + { + "epoch": 0.1414439968536219, + "grad_norm": 2.8060736656188965, + "learning_rate": 1.3950776616085224e-05, + "loss": 0.927, + "step": 2967 + }, + { + "epoch": 0.14149166924892143, + "grad_norm": 1.497754454612732, + "learning_rate": 1.3947167888112882e-05, + "loss": 0.9745, + "step": 2968 + }, + { + "epoch": 0.1415393416442209, + "grad_norm": 1.9746789932250977, + "learning_rate": 1.3943558551133186e-05, + "loss": 0.927, + "step": 2969 + }, + { + "epoch": 0.14158701403952043, + "grad_norm": 1.7346903085708618, + "learning_rate": 1.3939948605703015e-05, + "loss": 0.7102, + "step": 2970 + }, + { + "epoch": 0.1416346864348199, + "grad_norm": 1.7983261346817017, + "learning_rate": 1.393633805237935e-05, + "loss": 0.1876, + "step": 2971 + }, + { + "epoch": 0.14168235883011943, + "grad_norm": 2.2199759483337402, + "learning_rate": 1.3932726891719259e-05, + "loss": 0.7934, + "step": 2972 + }, + { + "epoch": 0.1417300312254189, + "grad_norm": 2.119215726852417, + "learning_rate": 1.3929115124279906e-05, + "loss": 0.9677, + "step": 2973 + }, + { + "epoch": 0.14177770362071843, + "grad_norm": 1.9493294954299927, + "learning_rate": 1.392550275061855e-05, + "loss": 0.9217, + "step": 2974 + }, + { + "epoch": 0.1418253760160179, + "grad_norm": 2.7072625160217285, + "learning_rate": 1.3921889771292546e-05, + "loss": 0.8318, + "step": 2975 + }, + { + "epoch": 0.14187304841131743, + "grad_norm": 1.0546387434005737, + "learning_rate": 1.391827618685934e-05, + "loss": 0.5448, + "step": 2976 + }, + { + "epoch": 0.14192072080661694, + "grad_norm": 1.4570761919021606, + "learning_rate": 1.3914661997876467e-05, + "loss": 0.8481, + "step": 2977 + }, + { + "epoch": 0.14196839320191643, + "grad_norm": 1.6180341243743896, + "learning_rate": 1.391104720490156e-05, + "loss": 0.8538, + "step": 2978 + }, + { + "epoch": 0.14201606559721594, + "grad_norm": 2.4710769653320312, + "learning_rate": 1.3907431808492348e-05, + "loss": 1.0139, + "step": 2979 + }, + { + "epoch": 0.14206373799251543, + "grad_norm": 1.6567139625549316, + "learning_rate": 1.3903815809206646e-05, + "loss": 0.7472, + "step": 2980 + }, + { + "epoch": 0.14211141038781494, + "grad_norm": 5.065958023071289, + "learning_rate": 1.3900199207602365e-05, + "loss": 0.4898, + "step": 2981 + }, + { + "epoch": 0.14215908278311443, + "grad_norm": 1.936753511428833, + "learning_rate": 1.3896582004237514e-05, + "loss": 0.8907, + "step": 2982 + }, + { + "epoch": 0.14220675517841394, + "grad_norm": 1.5708733797073364, + "learning_rate": 1.3892964199670181e-05, + "loss": 0.8792, + "step": 2983 + }, + { + "epoch": 0.14225442757371345, + "grad_norm": 1.586714267730713, + 
"learning_rate": 1.3889345794458563e-05, + "loss": 0.5538, + "step": 2984 + }, + { + "epoch": 0.14230209996901294, + "grad_norm": 0.865074872970581, + "learning_rate": 1.3885726789160943e-05, + "loss": 0.5777, + "step": 2985 + }, + { + "epoch": 0.14234977236431245, + "grad_norm": 1.160605549812317, + "learning_rate": 1.3882107184335696e-05, + "loss": 0.7043, + "step": 2986 + }, + { + "epoch": 0.14239744475961194, + "grad_norm": 1.7783540487289429, + "learning_rate": 1.3878486980541289e-05, + "loss": 0.6285, + "step": 2987 + }, + { + "epoch": 0.14244511715491145, + "grad_norm": 1.6727008819580078, + "learning_rate": 1.3874866178336277e-05, + "loss": 0.6554, + "step": 2988 + }, + { + "epoch": 0.14249278955021094, + "grad_norm": 1.6376591920852661, + "learning_rate": 1.387124477827932e-05, + "loss": 0.4045, + "step": 2989 + }, + { + "epoch": 0.14254046194551045, + "grad_norm": 4.546367168426514, + "learning_rate": 1.386762278092916e-05, + "loss": 1.4425, + "step": 2990 + }, + { + "epoch": 0.14258813434080994, + "grad_norm": 1.674723744392395, + "learning_rate": 1.3864000186844631e-05, + "loss": 0.9774, + "step": 2991 + }, + { + "epoch": 0.14263580673610946, + "grad_norm": 1.2198519706726074, + "learning_rate": 1.3860376996584667e-05, + "loss": 0.8267, + "step": 2992 + }, + { + "epoch": 0.14268347913140897, + "grad_norm": 1.1878458261489868, + "learning_rate": 1.3856753210708288e-05, + "loss": 0.7851, + "step": 2993 + }, + { + "epoch": 0.14273115152670846, + "grad_norm": 1.803183674812317, + "learning_rate": 1.3853128829774605e-05, + "loss": 0.7295, + "step": 2994 + }, + { + "epoch": 0.14277882392200797, + "grad_norm": 1.6760066747665405, + "learning_rate": 1.3849503854342823e-05, + "loss": 0.6454, + "step": 2995 + }, + { + "epoch": 0.14282649631730746, + "grad_norm": 6.469170093536377, + "learning_rate": 1.3845878284972237e-05, + "loss": 0.2724, + "step": 2996 + }, + { + "epoch": 0.14287416871260697, + "grad_norm": 1.1705045700073242, + "learning_rate": 1.3842252122222235e-05, + "loss": 0.6299, + "step": 2997 + }, + { + "epoch": 0.14292184110790646, + "grad_norm": 1.2141879796981812, + "learning_rate": 1.38386253666523e-05, + "loss": 0.7289, + "step": 2998 + }, + { + "epoch": 0.14296951350320597, + "grad_norm": 2.3483808040618896, + "learning_rate": 1.3834998018822004e-05, + "loss": 0.4504, + "step": 2999 + }, + { + "epoch": 0.14301718589850548, + "grad_norm": 1.2169690132141113, + "learning_rate": 1.3831370079291002e-05, + "loss": 0.7487, + "step": 3000 + }, + { + "epoch": 0.14306485829380497, + "grad_norm": 2.0098938941955566, + "learning_rate": 1.3827741548619054e-05, + "loss": 0.6189, + "step": 3001 + }, + { + "epoch": 0.14311253068910448, + "grad_norm": 1.406715989112854, + "learning_rate": 1.3824112427366003e-05, + "loss": 0.5196, + "step": 3002 + }, + { + "epoch": 0.14316020308440397, + "grad_norm": 2.799132823944092, + "learning_rate": 1.3820482716091786e-05, + "loss": 0.687, + "step": 3003 + }, + { + "epoch": 0.14320787547970348, + "grad_norm": 1.0308328866958618, + "learning_rate": 1.381685241535643e-05, + "loss": 0.6581, + "step": 3004 + }, + { + "epoch": 0.14325554787500297, + "grad_norm": 1.8086446523666382, + "learning_rate": 1.381322152572005e-05, + "loss": 0.7909, + "step": 3005 + }, + { + "epoch": 0.14330322027030248, + "grad_norm": 1.650424838066101, + "learning_rate": 1.3809590047742858e-05, + "loss": 0.8621, + "step": 3006 + }, + { + "epoch": 0.14335089266560197, + "grad_norm": 2.0094616413116455, + "learning_rate": 1.3805957981985154e-05, + "loss": 0.5668, + "step": 3007 
+ }, + { + "epoch": 0.14339856506090148, + "grad_norm": 1.4018728733062744, + "learning_rate": 1.3802325329007324e-05, + "loss": 0.9517, + "step": 3008 + }, + { + "epoch": 0.143446237456201, + "grad_norm": 1.5284672975540161, + "learning_rate": 1.3798692089369855e-05, + "loss": 0.6185, + "step": 3009 + }, + { + "epoch": 0.14349390985150048, + "grad_norm": 2.180819034576416, + "learning_rate": 1.3795058263633316e-05, + "loss": 0.9084, + "step": 3010 + }, + { + "epoch": 0.1435415822468, + "grad_norm": 1.4202477931976318, + "learning_rate": 1.3791423852358365e-05, + "loss": 0.299, + "step": 3011 + }, + { + "epoch": 0.14358925464209948, + "grad_norm": 1.2657815217971802, + "learning_rate": 1.3787788856105762e-05, + "loss": 0.5537, + "step": 3012 + }, + { + "epoch": 0.143636927037399, + "grad_norm": 1.7528935670852661, + "learning_rate": 1.3784153275436345e-05, + "loss": 0.5828, + "step": 3013 + }, + { + "epoch": 0.14368459943269848, + "grad_norm": 2.6704468727111816, + "learning_rate": 1.3780517110911042e-05, + "loss": 1.1112, + "step": 3014 + }, + { + "epoch": 0.143732271827998, + "grad_norm": 1.6846861839294434, + "learning_rate": 1.3776880363090883e-05, + "loss": 0.8549, + "step": 3015 + }, + { + "epoch": 0.1437799442232975, + "grad_norm": 1.3927019834518433, + "learning_rate": 1.377324303253698e-05, + "loss": 0.5709, + "step": 3016 + }, + { + "epoch": 0.143827616618597, + "grad_norm": 2.5773043632507324, + "learning_rate": 1.3769605119810533e-05, + "loss": 0.9208, + "step": 3017 + }, + { + "epoch": 0.1438752890138965, + "grad_norm": 1.5710643529891968, + "learning_rate": 1.3765966625472837e-05, + "loss": 0.5101, + "step": 3018 + }, + { + "epoch": 0.143922961409196, + "grad_norm": 1.1312370300292969, + "learning_rate": 1.376232755008527e-05, + "loss": 0.4123, + "step": 3019 + }, + { + "epoch": 0.1439706338044955, + "grad_norm": 2.383615493774414, + "learning_rate": 1.3758687894209307e-05, + "loss": 0.9446, + "step": 3020 + }, + { + "epoch": 0.144018306199795, + "grad_norm": 1.9989885091781616, + "learning_rate": 1.375504765840651e-05, + "loss": 0.9089, + "step": 3021 + }, + { + "epoch": 0.1440659785950945, + "grad_norm": 1.4815373420715332, + "learning_rate": 1.3751406843238526e-05, + "loss": 0.1729, + "step": 3022 + }, + { + "epoch": 0.144113650990394, + "grad_norm": 1.3775211572647095, + "learning_rate": 1.37477654492671e-05, + "loss": 0.9971, + "step": 3023 + }, + { + "epoch": 0.1441613233856935, + "grad_norm": 1.7749160528182983, + "learning_rate": 1.374412347705406e-05, + "loss": 0.7421, + "step": 3024 + }, + { + "epoch": 0.14420899578099303, + "grad_norm": 2.8174102306365967, + "learning_rate": 1.3740480927161326e-05, + "loss": 1.0027, + "step": 3025 + }, + { + "epoch": 0.1442566681762925, + "grad_norm": 1.6649245023727417, + "learning_rate": 1.3736837800150903e-05, + "loss": 0.9248, + "step": 3026 + }, + { + "epoch": 0.14430434057159203, + "grad_norm": 1.4131121635437012, + "learning_rate": 1.373319409658489e-05, + "loss": 0.8903, + "step": 3027 + }, + { + "epoch": 0.1443520129668915, + "grad_norm": 1.7323601245880127, + "learning_rate": 1.3729549817025472e-05, + "loss": 0.2271, + "step": 3028 + }, + { + "epoch": 0.14439968536219103, + "grad_norm": 3.0306766033172607, + "learning_rate": 1.3725904962034923e-05, + "loss": 0.4708, + "step": 3029 + }, + { + "epoch": 0.1444473577574905, + "grad_norm": 1.7027243375778198, + "learning_rate": 1.372225953217561e-05, + "loss": 1.089, + "step": 3030 + }, + { + "epoch": 0.14449503015279003, + "grad_norm": 0.9926291108131409, + "learning_rate": 
1.3718613528009982e-05, + "loss": 0.6185, + "step": 3031 + }, + { + "epoch": 0.14454270254808954, + "grad_norm": 1.298276662826538, + "learning_rate": 1.371496695010058e-05, + "loss": 0.6894, + "step": 3032 + }, + { + "epoch": 0.14459037494338903, + "grad_norm": 4.534313678741455, + "learning_rate": 1.3711319799010037e-05, + "loss": 0.4234, + "step": 3033 + }, + { + "epoch": 0.14463804733868854, + "grad_norm": 0.8996132612228394, + "learning_rate": 1.3707672075301064e-05, + "loss": 0.3286, + "step": 3034 + }, + { + "epoch": 0.14468571973398803, + "grad_norm": 2.3603134155273438, + "learning_rate": 1.3704023779536475e-05, + "loss": 0.3578, + "step": 3035 + }, + { + "epoch": 0.14473339212928754, + "grad_norm": 1.9113951921463013, + "learning_rate": 1.3700374912279159e-05, + "loss": 0.9303, + "step": 3036 + }, + { + "epoch": 0.14478106452458703, + "grad_norm": 2.1963863372802734, + "learning_rate": 1.3696725474092098e-05, + "loss": 1.4539, + "step": 3037 + }, + { + "epoch": 0.14482873691988654, + "grad_norm": 1.2416272163391113, + "learning_rate": 1.369307546553837e-05, + "loss": 0.7416, + "step": 3038 + }, + { + "epoch": 0.14487640931518603, + "grad_norm": 3.0142147541046143, + "learning_rate": 1.3689424887181129e-05, + "loss": 1.5193, + "step": 3039 + }, + { + "epoch": 0.14492408171048554, + "grad_norm": 1.4682621955871582, + "learning_rate": 1.368577373958362e-05, + "loss": 0.8641, + "step": 3040 + }, + { + "epoch": 0.14497175410578506, + "grad_norm": 0.973953127861023, + "learning_rate": 1.3682122023309179e-05, + "loss": 0.4439, + "step": 3041 + }, + { + "epoch": 0.14501942650108454, + "grad_norm": 1.5924018621444702, + "learning_rate": 1.3678469738921228e-05, + "loss": 0.8522, + "step": 3042 + }, + { + "epoch": 0.14506709889638406, + "grad_norm": 1.5191891193389893, + "learning_rate": 1.3674816886983275e-05, + "loss": 0.703, + "step": 3043 + }, + { + "epoch": 0.14511477129168354, + "grad_norm": 2.1543540954589844, + "learning_rate": 1.3671163468058924e-05, + "loss": 0.8137, + "step": 3044 + }, + { + "epoch": 0.14516244368698306, + "grad_norm": 1.0507248640060425, + "learning_rate": 1.3667509482711851e-05, + "loss": 0.6013, + "step": 3045 + }, + { + "epoch": 0.14521011608228254, + "grad_norm": 2.3577895164489746, + "learning_rate": 1.3663854931505838e-05, + "loss": 1.0949, + "step": 3046 + }, + { + "epoch": 0.14525778847758206, + "grad_norm": 1.5307725667953491, + "learning_rate": 1.366019981500474e-05, + "loss": 0.539, + "step": 3047 + }, + { + "epoch": 0.14530546087288157, + "grad_norm": 2.1397852897644043, + "learning_rate": 1.3656544133772499e-05, + "loss": 0.7505, + "step": 3048 + }, + { + "epoch": 0.14535313326818106, + "grad_norm": 1.8362572193145752, + "learning_rate": 1.3652887888373155e-05, + "loss": 0.9348, + "step": 3049 + }, + { + "epoch": 0.14540080566348057, + "grad_norm": 1.742902159690857, + "learning_rate": 1.3649231079370825e-05, + "loss": 0.6217, + "step": 3050 + }, + { + "epoch": 0.14544847805878006, + "grad_norm": 1.7095146179199219, + "learning_rate": 1.364557370732972e-05, + "loss": 1.163, + "step": 3051 + }, + { + "epoch": 0.14549615045407957, + "grad_norm": 1.185315489768982, + "learning_rate": 1.3641915772814137e-05, + "loss": 0.6675, + "step": 3052 + }, + { + "epoch": 0.14554382284937906, + "grad_norm": 2.307662010192871, + "learning_rate": 1.3638257276388454e-05, + "loss": 0.584, + "step": 3053 + }, + { + "epoch": 0.14559149524467857, + "grad_norm": 3.863408327102661, + "learning_rate": 1.3634598218617138e-05, + "loss": 0.6662, + "step": 3054 + }, + { + 
"epoch": 0.14563916763997808, + "grad_norm": 1.374801516532898, + "learning_rate": 1.3630938600064748e-05, + "loss": 0.7323, + "step": 3055 + }, + { + "epoch": 0.14568684003527757, + "grad_norm": 1.4017853736877441, + "learning_rate": 1.3627278421295925e-05, + "loss": 0.4267, + "step": 3056 + }, + { + "epoch": 0.14573451243057708, + "grad_norm": 1.437345027923584, + "learning_rate": 1.362361768287539e-05, + "loss": 0.7345, + "step": 3057 + }, + { + "epoch": 0.14578218482587657, + "grad_norm": 1.4214564561843872, + "learning_rate": 1.3619956385367964e-05, + "loss": 0.5995, + "step": 3058 + }, + { + "epoch": 0.14582985722117608, + "grad_norm": 1.473494529724121, + "learning_rate": 1.3616294529338547e-05, + "loss": 0.6405, + "step": 3059 + }, + { + "epoch": 0.14587752961647557, + "grad_norm": 2.276869535446167, + "learning_rate": 1.3612632115352126e-05, + "loss": 0.7383, + "step": 3060 + }, + { + "epoch": 0.14592520201177508, + "grad_norm": 6.51352071762085, + "learning_rate": 1.3608969143973771e-05, + "loss": 0.21, + "step": 3061 + }, + { + "epoch": 0.14597287440707457, + "grad_norm": 1.3052343130111694, + "learning_rate": 1.3605305615768645e-05, + "loss": 0.8507, + "step": 3062 + }, + { + "epoch": 0.14602054680237408, + "grad_norm": 2.879122257232666, + "learning_rate": 1.3601641531301988e-05, + "loss": 1.0844, + "step": 3063 + }, + { + "epoch": 0.1460682191976736, + "grad_norm": 1.743299961090088, + "learning_rate": 1.3597976891139132e-05, + "loss": 0.8749, + "step": 3064 + }, + { + "epoch": 0.14611589159297309, + "grad_norm": 1.2879037857055664, + "learning_rate": 1.3594311695845494e-05, + "loss": 0.3992, + "step": 3065 + }, + { + "epoch": 0.1461635639882726, + "grad_norm": 1.800226092338562, + "learning_rate": 1.3590645945986577e-05, + "loss": 0.6878, + "step": 3066 + }, + { + "epoch": 0.14621123638357209, + "grad_norm": 4.948254585266113, + "learning_rate": 1.3586979642127964e-05, + "loss": 1.4434, + "step": 3067 + }, + { + "epoch": 0.1462589087788716, + "grad_norm": 1.3549526929855347, + "learning_rate": 1.3583312784835332e-05, + "loss": 0.4869, + "step": 3068 + }, + { + "epoch": 0.14630658117417109, + "grad_norm": 1.0785163640975952, + "learning_rate": 1.3579645374674442e-05, + "loss": 0.5656, + "step": 3069 + }, + { + "epoch": 0.1463542535694706, + "grad_norm": 1.4176609516143799, + "learning_rate": 1.3575977412211132e-05, + "loss": 0.7809, + "step": 3070 + }, + { + "epoch": 0.1464019259647701, + "grad_norm": 1.3744181394577026, + "learning_rate": 1.3572308898011328e-05, + "loss": 0.8812, + "step": 3071 + }, + { + "epoch": 0.1464495983600696, + "grad_norm": 2.3029258251190186, + "learning_rate": 1.3568639832641055e-05, + "loss": 0.8421, + "step": 3072 + }, + { + "epoch": 0.1464972707553691, + "grad_norm": 1.3138843774795532, + "learning_rate": 1.3564970216666402e-05, + "loss": 0.5224, + "step": 3073 + }, + { + "epoch": 0.1465449431506686, + "grad_norm": 3.187471389770508, + "learning_rate": 1.3561300050653556e-05, + "loss": 1.4745, + "step": 3074 + }, + { + "epoch": 0.1465926155459681, + "grad_norm": 1.805351734161377, + "learning_rate": 1.3557629335168789e-05, + "loss": 0.5679, + "step": 3075 + }, + { + "epoch": 0.1466402879412676, + "grad_norm": 1.5123603343963623, + "learning_rate": 1.3553958070778452e-05, + "loss": 0.83, + "step": 3076 + }, + { + "epoch": 0.1466879603365671, + "grad_norm": 2.233900785446167, + "learning_rate": 1.3550286258048984e-05, + "loss": 0.4014, + "step": 3077 + }, + { + "epoch": 0.1467356327318666, + "grad_norm": 1.0178353786468506, + "learning_rate": 
1.3546613897546905e-05, + "loss": 0.6774, + "step": 3078 + }, + { + "epoch": 0.1467833051271661, + "grad_norm": 0.8925381898880005, + "learning_rate": 1.3542940989838824e-05, + "loss": 0.4033, + "step": 3079 + }, + { + "epoch": 0.14683097752246563, + "grad_norm": 1.7636933326721191, + "learning_rate": 1.3539267535491436e-05, + "loss": 1.0362, + "step": 3080 + }, + { + "epoch": 0.14687864991776511, + "grad_norm": 2.7844808101654053, + "learning_rate": 1.3535593535071515e-05, + "loss": 0.8824, + "step": 3081 + }, + { + "epoch": 0.14692632231306463, + "grad_norm": 1.1747113466262817, + "learning_rate": 1.3531918989145919e-05, + "loss": 0.7971, + "step": 3082 + }, + { + "epoch": 0.14697399470836411, + "grad_norm": 1.5873674154281616, + "learning_rate": 1.3528243898281595e-05, + "loss": 0.591, + "step": 3083 + }, + { + "epoch": 0.14702166710366363, + "grad_norm": 2.503666877746582, + "learning_rate": 1.3524568263045572e-05, + "loss": 1.0975, + "step": 3084 + }, + { + "epoch": 0.14706933949896311, + "grad_norm": 1.4579899311065674, + "learning_rate": 1.3520892084004961e-05, + "loss": 0.746, + "step": 3085 + }, + { + "epoch": 0.14711701189426263, + "grad_norm": 2.0907604694366455, + "learning_rate": 1.3517215361726963e-05, + "loss": 0.5951, + "step": 3086 + }, + { + "epoch": 0.14716468428956214, + "grad_norm": 2.5679707527160645, + "learning_rate": 1.3513538096778853e-05, + "loss": 0.2363, + "step": 3087 + }, + { + "epoch": 0.14721235668486163, + "grad_norm": 1.765938401222229, + "learning_rate": 1.3509860289727994e-05, + "loss": 0.6744, + "step": 3088 + }, + { + "epoch": 0.14726002908016114, + "grad_norm": 1.4003573656082153, + "learning_rate": 1.350618194114184e-05, + "loss": 0.8043, + "step": 3089 + }, + { + "epoch": 0.14730770147546063, + "grad_norm": 1.256507396697998, + "learning_rate": 1.3502503051587921e-05, + "loss": 0.4145, + "step": 3090 + }, + { + "epoch": 0.14735537387076014, + "grad_norm": 1.7833360433578491, + "learning_rate": 1.3498823621633848e-05, + "loss": 0.6234, + "step": 3091 + }, + { + "epoch": 0.14740304626605963, + "grad_norm": 6.889230251312256, + "learning_rate": 1.349514365184732e-05, + "loss": 0.4359, + "step": 3092 + }, + { + "epoch": 0.14745071866135914, + "grad_norm": 1.4543510675430298, + "learning_rate": 1.3491463142796121e-05, + "loss": 0.866, + "step": 3093 + }, + { + "epoch": 0.14749839105665863, + "grad_norm": 1.1712321043014526, + "learning_rate": 1.3487782095048112e-05, + "loss": 0.7195, + "step": 3094 + }, + { + "epoch": 0.14754606345195814, + "grad_norm": 1.9908615350723267, + "learning_rate": 1.3484100509171246e-05, + "loss": 0.5969, + "step": 3095 + }, + { + "epoch": 0.14759373584725766, + "grad_norm": 1.7359424829483032, + "learning_rate": 1.3480418385733549e-05, + "loss": 0.6248, + "step": 3096 + }, + { + "epoch": 0.14764140824255714, + "grad_norm": 1.045398235321045, + "learning_rate": 1.3476735725303134e-05, + "loss": 0.437, + "step": 3097 + }, + { + "epoch": 0.14768908063785666, + "grad_norm": 2.1756820678710938, + "learning_rate": 1.3473052528448203e-05, + "loss": 0.6527, + "step": 3098 + }, + { + "epoch": 0.14773675303315614, + "grad_norm": 2.6632003784179688, + "learning_rate": 1.3469368795737033e-05, + "loss": 1.2887, + "step": 3099 + }, + { + "epoch": 0.14778442542845566, + "grad_norm": 1.1401606798171997, + "learning_rate": 1.3465684527737986e-05, + "loss": 0.6396, + "step": 3100 + }, + { + "epoch": 0.14783209782375514, + "grad_norm": 1.0559873580932617, + "learning_rate": 1.3461999725019506e-05, + "loss": 0.8043, + "step": 3101 + }, + { + 
"epoch": 0.14787977021905466, + "grad_norm": 1.2660080194473267, + "learning_rate": 1.3458314388150115e-05, + "loss": 0.645, + "step": 3102 + }, + { + "epoch": 0.14792744261435417, + "grad_norm": 8.582117080688477, + "learning_rate": 1.3454628517698431e-05, + "loss": 0.4915, + "step": 3103 + }, + { + "epoch": 0.14797511500965366, + "grad_norm": 1.8201041221618652, + "learning_rate": 1.3450942114233145e-05, + "loss": 0.8148, + "step": 3104 + }, + { + "epoch": 0.14802278740495317, + "grad_norm": 1.4995503425598145, + "learning_rate": 1.3447255178323025e-05, + "loss": 0.3414, + "step": 3105 + }, + { + "epoch": 0.14807045980025266, + "grad_norm": 1.8104441165924072, + "learning_rate": 1.3443567710536931e-05, + "loss": 0.4537, + "step": 3106 + }, + { + "epoch": 0.14811813219555217, + "grad_norm": 1.6128402948379517, + "learning_rate": 1.3439879711443807e-05, + "loss": 0.5302, + "step": 3107 + }, + { + "epoch": 0.14816580459085166, + "grad_norm": 1.4941962957382202, + "learning_rate": 1.3436191181612662e-05, + "loss": 0.7592, + "step": 3108 + }, + { + "epoch": 0.14821347698615117, + "grad_norm": 1.7309041023254395, + "learning_rate": 1.3432502121612602e-05, + "loss": 0.8093, + "step": 3109 + }, + { + "epoch": 0.14826114938145066, + "grad_norm": 2.0706653594970703, + "learning_rate": 1.3428812532012816e-05, + "loss": 0.8846, + "step": 3110 + }, + { + "epoch": 0.14830882177675017, + "grad_norm": 2.691295862197876, + "learning_rate": 1.3425122413382563e-05, + "loss": 1.3084, + "step": 3111 + }, + { + "epoch": 0.14835649417204969, + "grad_norm": 1.1625548601150513, + "learning_rate": 1.3421431766291198e-05, + "loss": 0.6071, + "step": 3112 + }, + { + "epoch": 0.14840416656734917, + "grad_norm": 1.321666955947876, + "learning_rate": 1.3417740591308142e-05, + "loss": 0.5195, + "step": 3113 + }, + { + "epoch": 0.14845183896264869, + "grad_norm": 2.304530620574951, + "learning_rate": 1.341404888900291e-05, + "loss": 0.6536, + "step": 3114 + }, + { + "epoch": 0.14849951135794817, + "grad_norm": 1.2831493616104126, + "learning_rate": 1.3410356659945095e-05, + "loss": 0.6212, + "step": 3115 + }, + { + "epoch": 0.14854718375324769, + "grad_norm": 1.8855429887771606, + "learning_rate": 1.3406663904704362e-05, + "loss": 0.6279, + "step": 3116 + }, + { + "epoch": 0.14859485614854717, + "grad_norm": 2.2084779739379883, + "learning_rate": 1.3402970623850474e-05, + "loss": 0.335, + "step": 3117 + }, + { + "epoch": 0.14864252854384669, + "grad_norm": 2.807931423187256, + "learning_rate": 1.339927681795326e-05, + "loss": 1.179, + "step": 3118 + }, + { + "epoch": 0.1486902009391462, + "grad_norm": 1.6841225624084473, + "learning_rate": 1.3395582487582639e-05, + "loss": 0.4703, + "step": 3119 + }, + { + "epoch": 0.14873787333444569, + "grad_norm": 1.979477047920227, + "learning_rate": 1.3391887633308609e-05, + "loss": 0.8423, + "step": 3120 + }, + { + "epoch": 0.1487855457297452, + "grad_norm": 3.011033535003662, + "learning_rate": 1.3388192255701249e-05, + "loss": 1.3704, + "step": 3121 + }, + { + "epoch": 0.1488332181250447, + "grad_norm": 5.905727386474609, + "learning_rate": 1.3384496355330714e-05, + "loss": 0.381, + "step": 3122 + }, + { + "epoch": 0.1488808905203442, + "grad_norm": 4.945913314819336, + "learning_rate": 1.3380799932767243e-05, + "loss": 1.3545, + "step": 3123 + }, + { + "epoch": 0.1489285629156437, + "grad_norm": 1.2848289012908936, + "learning_rate": 1.3377102988581162e-05, + "loss": 0.6009, + "step": 3124 + }, + { + "epoch": 0.1489762353109432, + "grad_norm": 1.3738328218460083, + 
"learning_rate": 1.3373405523342862e-05, + "loss": 0.6994, + "step": 3125 + }, + { + "epoch": 0.1490239077062427, + "grad_norm": 1.4116965532302856, + "learning_rate": 1.336970753762283e-05, + "loss": 0.7592, + "step": 3126 + }, + { + "epoch": 0.1490715801015422, + "grad_norm": 1.0781294107437134, + "learning_rate": 1.336600903199163e-05, + "loss": 0.2557, + "step": 3127 + }, + { + "epoch": 0.14911925249684171, + "grad_norm": 1.3575353622436523, + "learning_rate": 1.3362310007019897e-05, + "loss": 0.4058, + "step": 3128 + }, + { + "epoch": 0.1491669248921412, + "grad_norm": 1.770020842552185, + "learning_rate": 1.3358610463278357e-05, + "loss": 0.1847, + "step": 3129 + }, + { + "epoch": 0.14921459728744071, + "grad_norm": 1.2070060968399048, + "learning_rate": 1.335491040133781e-05, + "loss": 0.7112, + "step": 3130 + }, + { + "epoch": 0.1492622696827402, + "grad_norm": 1.2136505842208862, + "learning_rate": 1.335120982176913e-05, + "loss": 0.8173, + "step": 3131 + }, + { + "epoch": 0.14930994207803971, + "grad_norm": 1.3908705711364746, + "learning_rate": 1.3347508725143292e-05, + "loss": 0.781, + "step": 3132 + }, + { + "epoch": 0.1493576144733392, + "grad_norm": 1.0264910459518433, + "learning_rate": 1.3343807112031329e-05, + "loss": 0.7093, + "step": 3133 + }, + { + "epoch": 0.14940528686863871, + "grad_norm": 1.4023420810699463, + "learning_rate": 1.3340104983004363e-05, + "loss": 0.6146, + "step": 3134 + }, + { + "epoch": 0.14945295926393823, + "grad_norm": 1.5430750846862793, + "learning_rate": 1.3336402338633593e-05, + "loss": 1.0919, + "step": 3135 + }, + { + "epoch": 0.14950063165923771, + "grad_norm": 2.278820753097534, + "learning_rate": 1.3332699179490302e-05, + "loss": 0.8825, + "step": 3136 + }, + { + "epoch": 0.14954830405453723, + "grad_norm": 2.868929624557495, + "learning_rate": 1.3328995506145849e-05, + "loss": 0.9877, + "step": 3137 + }, + { + "epoch": 0.14959597644983672, + "grad_norm": 1.2752020359039307, + "learning_rate": 1.3325291319171669e-05, + "loss": 0.6129, + "step": 3138 + }, + { + "epoch": 0.14964364884513623, + "grad_norm": 4.130411148071289, + "learning_rate": 1.3321586619139285e-05, + "loss": 0.4079, + "step": 3139 + }, + { + "epoch": 0.14969132124043572, + "grad_norm": 2.5877163410186768, + "learning_rate": 1.3317881406620287e-05, + "loss": 1.01, + "step": 3140 + }, + { + "epoch": 0.14973899363573523, + "grad_norm": 1.3374658823013306, + "learning_rate": 1.3314175682186358e-05, + "loss": 0.6646, + "step": 3141 + }, + { + "epoch": 0.14978666603103474, + "grad_norm": 1.1594607830047607, + "learning_rate": 1.3310469446409251e-05, + "loss": 0.4802, + "step": 3142 + }, + { + "epoch": 0.14983433842633423, + "grad_norm": 0.9576970934867859, + "learning_rate": 1.33067626998608e-05, + "loss": 0.4485, + "step": 3143 + }, + { + "epoch": 0.14988201082163374, + "grad_norm": 1.985435962677002, + "learning_rate": 1.3303055443112918e-05, + "loss": 0.2718, + "step": 3144 + }, + { + "epoch": 0.14992968321693323, + "grad_norm": 1.419874906539917, + "learning_rate": 1.3299347676737595e-05, + "loss": 0.7879, + "step": 3145 + }, + { + "epoch": 0.14997735561223274, + "grad_norm": 2.0171825885772705, + "learning_rate": 1.32956394013069e-05, + "loss": 0.9661, + "step": 3146 + }, + { + "epoch": 0.15002502800753223, + "grad_norm": 1.5258233547210693, + "learning_rate": 1.329193061739299e-05, + "loss": 0.786, + "step": 3147 + }, + { + "epoch": 0.15007270040283174, + "grad_norm": 1.2240711450576782, + "learning_rate": 1.328822132556808e-05, + "loss": 0.7653, + "step": 3148 + }, + { 
+ "epoch": 0.15012037279813123, + "grad_norm": 1.9582189321517944, + "learning_rate": 1.3284511526404485e-05, + "loss": 0.2899, + "step": 3149 + }, + { + "epoch": 0.15016804519343074, + "grad_norm": 2.007537364959717, + "learning_rate": 1.3280801220474585e-05, + "loss": 0.9079, + "step": 3150 + }, + { + "epoch": 0.15021571758873026, + "grad_norm": 1.0368845462799072, + "learning_rate": 1.3277090408350841e-05, + "loss": 0.6406, + "step": 3151 + }, + { + "epoch": 0.15026338998402974, + "grad_norm": 1.4237940311431885, + "learning_rate": 1.3273379090605796e-05, + "loss": 0.5644, + "step": 3152 + }, + { + "epoch": 0.15031106237932926, + "grad_norm": 1.3777469396591187, + "learning_rate": 1.3269667267812066e-05, + "loss": 0.6178, + "step": 3153 + }, + { + "epoch": 0.15035873477462874, + "grad_norm": 3.5124101638793945, + "learning_rate": 1.3265954940542344e-05, + "loss": 0.5717, + "step": 3154 + }, + { + "epoch": 0.15040640716992826, + "grad_norm": 1.1081418991088867, + "learning_rate": 1.3262242109369412e-05, + "loss": 0.4262, + "step": 3155 + }, + { + "epoch": 0.15045407956522774, + "grad_norm": 1.704636812210083, + "learning_rate": 1.3258528774866115e-05, + "loss": 1.1554, + "step": 3156 + }, + { + "epoch": 0.15050175196052726, + "grad_norm": 1.464885950088501, + "learning_rate": 1.3254814937605385e-05, + "loss": 0.621, + "step": 3157 + }, + { + "epoch": 0.15054942435582677, + "grad_norm": 2.538759708404541, + "learning_rate": 1.325110059816023e-05, + "loss": 1.2667, + "step": 3158 + }, + { + "epoch": 0.15059709675112626, + "grad_norm": 4.7555131912231445, + "learning_rate": 1.324738575710373e-05, + "loss": 0.4244, + "step": 3159 + }, + { + "epoch": 0.15064476914642577, + "grad_norm": 1.7561062574386597, + "learning_rate": 1.324367041500905e-05, + "loss": 0.982, + "step": 3160 + }, + { + "epoch": 0.15069244154172526, + "grad_norm": 1.8753656148910522, + "learning_rate": 1.323995457244943e-05, + "loss": 0.5967, + "step": 3161 + }, + { + "epoch": 0.15074011393702477, + "grad_norm": 1.176792025566101, + "learning_rate": 1.3236238229998181e-05, + "loss": 0.333, + "step": 3162 + }, + { + "epoch": 0.15078778633232426, + "grad_norm": 23.347854614257812, + "learning_rate": 1.3232521388228703e-05, + "loss": 0.5478, + "step": 3163 + }, + { + "epoch": 0.15083545872762377, + "grad_norm": 2.1824772357940674, + "learning_rate": 1.3228804047714462e-05, + "loss": 0.6945, + "step": 3164 + }, + { + "epoch": 0.15088313112292326, + "grad_norm": 1.1818346977233887, + "learning_rate": 1.3225086209029008e-05, + "loss": 0.5874, + "step": 3165 + }, + { + "epoch": 0.15093080351822277, + "grad_norm": 2.823831796646118, + "learning_rate": 1.3221367872745962e-05, + "loss": 1.05, + "step": 3166 + }, + { + "epoch": 0.1509784759135223, + "grad_norm": 1.5577776432037354, + "learning_rate": 1.321764903943903e-05, + "loss": 0.9005, + "step": 3167 + }, + { + "epoch": 0.15102614830882177, + "grad_norm": 2.868621587753296, + "learning_rate": 1.3213929709681986e-05, + "loss": 0.7395, + "step": 3168 + }, + { + "epoch": 0.1510738207041213, + "grad_norm": 3.710958242416382, + "learning_rate": 1.321020988404868e-05, + "loss": 1.5319, + "step": 3169 + }, + { + "epoch": 0.15112149309942077, + "grad_norm": 1.1518117189407349, + "learning_rate": 1.3206489563113054e-05, + "loss": 0.1914, + "step": 3170 + }, + { + "epoch": 0.1511691654947203, + "grad_norm": 2.3093178272247314, + "learning_rate": 1.3202768747449104e-05, + "loss": 0.776, + "step": 3171 + }, + { + "epoch": 0.15121683789001977, + "grad_norm": 2.6352174282073975, + 
"learning_rate": 1.3199047437630921e-05, + "loss": 0.9145, + "step": 3172 + }, + { + "epoch": 0.1512645102853193, + "grad_norm": 1.613187551498413, + "learning_rate": 1.3195325634232662e-05, + "loss": 0.5873, + "step": 3173 + }, + { + "epoch": 0.1513121826806188, + "grad_norm": 1.3262912034988403, + "learning_rate": 1.3191603337828563e-05, + "loss": 0.7749, + "step": 3174 + }, + { + "epoch": 0.1513598550759183, + "grad_norm": 1.0240951776504517, + "learning_rate": 1.3187880548992937e-05, + "loss": 0.586, + "step": 3175 + }, + { + "epoch": 0.1514075274712178, + "grad_norm": 27.738183975219727, + "learning_rate": 1.3184157268300168e-05, + "loss": 0.6833, + "step": 3176 + }, + { + "epoch": 0.1514551998665173, + "grad_norm": 2.284761905670166, + "learning_rate": 1.3180433496324724e-05, + "loss": 0.497, + "step": 3177 + }, + { + "epoch": 0.1515028722618168, + "grad_norm": 2.069669008255005, + "learning_rate": 1.3176709233641147e-05, + "loss": 0.596, + "step": 3178 + }, + { + "epoch": 0.1515505446571163, + "grad_norm": 1.770541787147522, + "learning_rate": 1.3172984480824045e-05, + "loss": 1.0247, + "step": 3179 + }, + { + "epoch": 0.1515982170524158, + "grad_norm": 1.1753355264663696, + "learning_rate": 1.3169259238448115e-05, + "loss": 0.7585, + "step": 3180 + }, + { + "epoch": 0.1516458894477153, + "grad_norm": 1.3008493185043335, + "learning_rate": 1.3165533507088122e-05, + "loss": 0.7156, + "step": 3181 + }, + { + "epoch": 0.1516935618430148, + "grad_norm": 3.1839733123779297, + "learning_rate": 1.3161807287318906e-05, + "loss": 0.5957, + "step": 3182 + }, + { + "epoch": 0.15174123423831432, + "grad_norm": 1.2422832250595093, + "learning_rate": 1.3158080579715389e-05, + "loss": 0.8522, + "step": 3183 + }, + { + "epoch": 0.1517889066336138, + "grad_norm": 1.483921766281128, + "learning_rate": 1.3154353384852559e-05, + "loss": 0.64, + "step": 3184 + }, + { + "epoch": 0.15183657902891332, + "grad_norm": 1.994581937789917, + "learning_rate": 1.315062570330548e-05, + "loss": 0.6769, + "step": 3185 + }, + { + "epoch": 0.1518842514242128, + "grad_norm": 2.5788888931274414, + "learning_rate": 1.3146897535649305e-05, + "loss": 1.1391, + "step": 3186 + }, + { + "epoch": 0.15193192381951232, + "grad_norm": 1.5630767345428467, + "learning_rate": 1.3143168882459247e-05, + "loss": 0.708, + "step": 3187 + }, + { + "epoch": 0.1519795962148118, + "grad_norm": 2.3303139209747314, + "learning_rate": 1.3139439744310599e-05, + "loss": 1.1927, + "step": 3188 + }, + { + "epoch": 0.15202726861011132, + "grad_norm": 2.9897749423980713, + "learning_rate": 1.3135710121778729e-05, + "loss": 1.737, + "step": 3189 + }, + { + "epoch": 0.15207494100541083, + "grad_norm": 1.2306307554244995, + "learning_rate": 1.3131980015439079e-05, + "loss": 1.0152, + "step": 3190 + }, + { + "epoch": 0.15212261340071032, + "grad_norm": 2.98582124710083, + "learning_rate": 1.3128249425867161e-05, + "loss": 1.3723, + "step": 3191 + }, + { + "epoch": 0.15217028579600983, + "grad_norm": 1.4394237995147705, + "learning_rate": 1.3124518353638575e-05, + "loss": 0.8075, + "step": 3192 + }, + { + "epoch": 0.15221795819130932, + "grad_norm": 1.3103983402252197, + "learning_rate": 1.3120786799328982e-05, + "loss": 0.7079, + "step": 3193 + }, + { + "epoch": 0.15226563058660883, + "grad_norm": 1.8622562885284424, + "learning_rate": 1.3117054763514126e-05, + "loss": 1.0342, + "step": 3194 + }, + { + "epoch": 0.15231330298190832, + "grad_norm": 3.9945762157440186, + "learning_rate": 1.3113322246769817e-05, + "loss": 0.9274, + "step": 3195 + }, + { + 
"epoch": 0.15236097537720783, + "grad_norm": 2.3655645847320557, + "learning_rate": 1.3109589249671947e-05, + "loss": 0.7874, + "step": 3196 + }, + { + "epoch": 0.15240864777250732, + "grad_norm": 4.2475128173828125, + "learning_rate": 1.3105855772796482e-05, + "loss": 1.1561, + "step": 3197 + }, + { + "epoch": 0.15245632016780683, + "grad_norm": 1.5746185779571533, + "learning_rate": 1.3102121816719453e-05, + "loss": 0.9835, + "step": 3198 + }, + { + "epoch": 0.15250399256310634, + "grad_norm": 1.5996003150939941, + "learning_rate": 1.3098387382016971e-05, + "loss": 0.2285, + "step": 3199 + }, + { + "epoch": 0.15255166495840583, + "grad_norm": 1.7315154075622559, + "learning_rate": 1.3094652469265225e-05, + "loss": 0.6511, + "step": 3200 + }, + { + "epoch": 0.15259933735370534, + "grad_norm": 3.788540840148926, + "learning_rate": 1.309091707904047e-05, + "loss": 0.9884, + "step": 3201 + }, + { + "epoch": 0.15264700974900483, + "grad_norm": 1.3468878269195557, + "learning_rate": 1.3087181211919043e-05, + "loss": 0.7516, + "step": 3202 + }, + { + "epoch": 0.15269468214430434, + "grad_norm": 1.5442324876785278, + "learning_rate": 1.3083444868477344e-05, + "loss": 0.6242, + "step": 3203 + }, + { + "epoch": 0.15274235453960383, + "grad_norm": 2.502856731414795, + "learning_rate": 1.3079708049291857e-05, + "loss": 0.4055, + "step": 3204 + }, + { + "epoch": 0.15279002693490334, + "grad_norm": 5.897099018096924, + "learning_rate": 1.3075970754939134e-05, + "loss": 0.6836, + "step": 3205 + }, + { + "epoch": 0.15283769933020286, + "grad_norm": 1.1400001049041748, + "learning_rate": 1.3072232985995798e-05, + "loss": 0.6967, + "step": 3206 + }, + { + "epoch": 0.15288537172550234, + "grad_norm": 1.0584015846252441, + "learning_rate": 1.306849474303855e-05, + "loss": 0.6489, + "step": 3207 + }, + { + "epoch": 0.15293304412080186, + "grad_norm": 4.950252056121826, + "learning_rate": 1.306475602664416e-05, + "loss": 1.0647, + "step": 3208 + }, + { + "epoch": 0.15298071651610134, + "grad_norm": 1.6873407363891602, + "learning_rate": 1.3061016837389482e-05, + "loss": 0.5319, + "step": 3209 + }, + { + "epoch": 0.15302838891140086, + "grad_norm": 1.234108805656433, + "learning_rate": 1.3057277175851426e-05, + "loss": 0.5588, + "step": 3210 + }, + { + "epoch": 0.15307606130670035, + "grad_norm": 3.5063700675964355, + "learning_rate": 1.3053537042606985e-05, + "loss": 1.1597, + "step": 3211 + }, + { + "epoch": 0.15312373370199986, + "grad_norm": 1.8462144136428833, + "learning_rate": 1.3049796438233225e-05, + "loss": 0.6949, + "step": 3212 + }, + { + "epoch": 0.15317140609729935, + "grad_norm": 1.3435649871826172, + "learning_rate": 1.3046055363307277e-05, + "loss": 0.737, + "step": 3213 + }, + { + "epoch": 0.15321907849259886, + "grad_norm": 0.9850499033927917, + "learning_rate": 1.3042313818406359e-05, + "loss": 0.6589, + "step": 3214 + }, + { + "epoch": 0.15326675088789837, + "grad_norm": 2.4665281772613525, + "learning_rate": 1.3038571804107747e-05, + "loss": 0.5102, + "step": 3215 + }, + { + "epoch": 0.15331442328319786, + "grad_norm": 2.578758716583252, + "learning_rate": 1.3034829320988796e-05, + "loss": 0.5918, + "step": 3216 + }, + { + "epoch": 0.15336209567849737, + "grad_norm": 2.080080986022949, + "learning_rate": 1.3031086369626934e-05, + "loss": 0.7601, + "step": 3217 + }, + { + "epoch": 0.15340976807379686, + "grad_norm": 1.7336410284042358, + "learning_rate": 1.302734295059966e-05, + "loss": 0.4842, + "step": 3218 + }, + { + "epoch": 0.15345744046909637, + "grad_norm": 2.311845302581787, + 
"learning_rate": 1.3023599064484546e-05, + "loss": 0.8222, + "step": 3219 + }, + { + "epoch": 0.15350511286439586, + "grad_norm": 6.7187299728393555, + "learning_rate": 1.3019854711859233e-05, + "loss": 0.4773, + "step": 3220 + }, + { + "epoch": 0.15355278525969537, + "grad_norm": 3.3655238151550293, + "learning_rate": 1.3016109893301434e-05, + "loss": 0.5521, + "step": 3221 + }, + { + "epoch": 0.1536004576549949, + "grad_norm": 3.1471986770629883, + "learning_rate": 1.3012364609388939e-05, + "loss": 0.9256, + "step": 3222 + }, + { + "epoch": 0.15364813005029437, + "grad_norm": 2.384690999984741, + "learning_rate": 1.3008618860699607e-05, + "loss": 0.5345, + "step": 3223 + }, + { + "epoch": 0.1536958024455939, + "grad_norm": 1.6492693424224854, + "learning_rate": 1.3004872647811365e-05, + "loss": 0.7447, + "step": 3224 + }, + { + "epoch": 0.15374347484089337, + "grad_norm": 1.4576164484024048, + "learning_rate": 1.300112597130222e-05, + "loss": 0.7731, + "step": 3225 + }, + { + "epoch": 0.1537911472361929, + "grad_norm": 2.039154052734375, + "learning_rate": 1.2997378831750242e-05, + "loss": 0.7086, + "step": 3226 + }, + { + "epoch": 0.15383881963149237, + "grad_norm": 2.779320001602173, + "learning_rate": 1.2993631229733584e-05, + "loss": 1.0048, + "step": 3227 + }, + { + "epoch": 0.1538864920267919, + "grad_norm": 3.1010067462921143, + "learning_rate": 1.2989883165830448e-05, + "loss": 0.9285, + "step": 3228 + }, + { + "epoch": 0.1539341644220914, + "grad_norm": 3.0224592685699463, + "learning_rate": 1.298613464061913e-05, + "loss": 0.8421, + "step": 3229 + }, + { + "epoch": 0.1539818368173909, + "grad_norm": 1.478475570678711, + "learning_rate": 1.2982385654677989e-05, + "loss": 0.5191, + "step": 3230 + }, + { + "epoch": 0.1540295092126904, + "grad_norm": 2.0774331092834473, + "learning_rate": 1.2978636208585456e-05, + "loss": 0.8923, + "step": 3231 + }, + { + "epoch": 0.1540771816079899, + "grad_norm": 1.5826398134231567, + "learning_rate": 1.2974886302920029e-05, + "loss": 0.7858, + "step": 3232 + }, + { + "epoch": 0.1541248540032894, + "grad_norm": 2.285444498062134, + "learning_rate": 1.297113593826028e-05, + "loss": 0.7862, + "step": 3233 + }, + { + "epoch": 0.1541725263985889, + "grad_norm": 1.3336167335510254, + "learning_rate": 1.2967385115184854e-05, + "loss": 0.5737, + "step": 3234 + }, + { + "epoch": 0.1542201987938884, + "grad_norm": 1.879441738128662, + "learning_rate": 1.2963633834272463e-05, + "loss": 0.6908, + "step": 3235 + }, + { + "epoch": 0.1542678711891879, + "grad_norm": 1.8980261087417603, + "learning_rate": 1.2959882096101888e-05, + "loss": 0.7223, + "step": 3236 + }, + { + "epoch": 0.1543155435844874, + "grad_norm": 1.6889532804489136, + "learning_rate": 1.2956129901251988e-05, + "loss": 0.7277, + "step": 3237 + }, + { + "epoch": 0.15436321597978692, + "grad_norm": 2.410881996154785, + "learning_rate": 1.2952377250301689e-05, + "loss": 1.2017, + "step": 3238 + }, + { + "epoch": 0.1544108883750864, + "grad_norm": 4.254683971405029, + "learning_rate": 1.294862414382998e-05, + "loss": 0.4348, + "step": 3239 + }, + { + "epoch": 0.15445856077038592, + "grad_norm": 1.7135064601898193, + "learning_rate": 1.2944870582415931e-05, + "loss": 0.6869, + "step": 3240 + }, + { + "epoch": 0.1545062331656854, + "grad_norm": 2.102322578430176, + "learning_rate": 1.2941116566638681e-05, + "loss": 0.8207, + "step": 3241 + }, + { + "epoch": 0.15455390556098492, + "grad_norm": 1.275256872177124, + "learning_rate": 1.293736209707743e-05, + "loss": 0.6431, + "step": 3242 + }, + { + 
"epoch": 0.1546015779562844, + "grad_norm": 4.420902729034424, + "learning_rate": 1.2933607174311458e-05, + "loss": 1.0482, + "step": 3243 + }, + { + "epoch": 0.15464925035158392, + "grad_norm": 1.0623613595962524, + "learning_rate": 1.2929851798920108e-05, + "loss": 0.7346, + "step": 3244 + }, + { + "epoch": 0.15469692274688343, + "grad_norm": 6.680132865905762, + "learning_rate": 1.2926095971482795e-05, + "loss": 1.9411, + "step": 3245 + }, + { + "epoch": 0.15474459514218292, + "grad_norm": 1.8034237623214722, + "learning_rate": 1.2922339692579008e-05, + "loss": 0.8158, + "step": 3246 + }, + { + "epoch": 0.15479226753748243, + "grad_norm": 1.094099521636963, + "learning_rate": 1.2918582962788301e-05, + "loss": 0.722, + "step": 3247 + }, + { + "epoch": 0.15483993993278192, + "grad_norm": 2.1104376316070557, + "learning_rate": 1.2914825782690299e-05, + "loss": 1.0414, + "step": 3248 + }, + { + "epoch": 0.15488761232808143, + "grad_norm": 1.5353261232376099, + "learning_rate": 1.2911068152864697e-05, + "loss": 0.3202, + "step": 3249 + }, + { + "epoch": 0.15493528472338092, + "grad_norm": 1.695068359375, + "learning_rate": 1.2907310073891255e-05, + "loss": 0.8314, + "step": 3250 + }, + { + "epoch": 0.15498295711868043, + "grad_norm": 2.4813592433929443, + "learning_rate": 1.2903551546349809e-05, + "loss": 0.8277, + "step": 3251 + }, + { + "epoch": 0.15503062951397992, + "grad_norm": 2.1663858890533447, + "learning_rate": 1.289979257082026e-05, + "loss": 0.585, + "step": 3252 + }, + { + "epoch": 0.15507830190927943, + "grad_norm": 2.4127964973449707, + "learning_rate": 1.2896033147882576e-05, + "loss": 0.863, + "step": 3253 + }, + { + "epoch": 0.15512597430457895, + "grad_norm": 2.1186890602111816, + "learning_rate": 1.2892273278116805e-05, + "loss": 0.2626, + "step": 3254 + }, + { + "epoch": 0.15517364669987843, + "grad_norm": 1.9335317611694336, + "learning_rate": 1.288851296210305e-05, + "loss": 0.8851, + "step": 3255 + }, + { + "epoch": 0.15522131909517795, + "grad_norm": 0.9816529750823975, + "learning_rate": 1.2884752200421493e-05, + "loss": 0.5099, + "step": 3256 + }, + { + "epoch": 0.15526899149047743, + "grad_norm": 3.36718487739563, + "learning_rate": 1.2880990993652379e-05, + "loss": 1.0508, + "step": 3257 + }, + { + "epoch": 0.15531666388577695, + "grad_norm": 1.9627264738082886, + "learning_rate": 1.287722934237602e-05, + "loss": 1.0297, + "step": 3258 + }, + { + "epoch": 0.15536433628107643, + "grad_norm": 2.541930675506592, + "learning_rate": 1.2873467247172804e-05, + "loss": 0.9226, + "step": 3259 + }, + { + "epoch": 0.15541200867637595, + "grad_norm": 4.7642645835876465, + "learning_rate": 1.2869704708623184e-05, + "loss": 0.366, + "step": 3260 + }, + { + "epoch": 0.15545968107167546, + "grad_norm": 2.4338111877441406, + "learning_rate": 1.286594172730768e-05, + "loss": 1.0849, + "step": 3261 + }, + { + "epoch": 0.15550735346697495, + "grad_norm": 1.6886451244354248, + "learning_rate": 1.2862178303806878e-05, + "loss": 0.5081, + "step": 3262 + }, + { + "epoch": 0.15555502586227446, + "grad_norm": 1.4763084650039673, + "learning_rate": 1.285841443870144e-05, + "loss": 0.9544, + "step": 3263 + }, + { + "epoch": 0.15560269825757395, + "grad_norm": 3.021918773651123, + "learning_rate": 1.285465013257209e-05, + "loss": 0.8274, + "step": 3264 + }, + { + "epoch": 0.15565037065287346, + "grad_norm": 1.5202257633209229, + "learning_rate": 1.2850885385999626e-05, + "loss": 0.6118, + "step": 3265 + }, + { + "epoch": 0.15569804304817295, + "grad_norm": 1.6691006422042847, + 
"learning_rate": 1.28471201995649e-05, + "loss": 1.005, + "step": 3266 + }, + { + "epoch": 0.15574571544347246, + "grad_norm": 2.1328299045562744, + "learning_rate": 1.2843354573848849e-05, + "loss": 0.799, + "step": 3267 + }, + { + "epoch": 0.15579338783877195, + "grad_norm": 1.5241819620132446, + "learning_rate": 1.2839588509432466e-05, + "loss": 0.6374, + "step": 3268 + }, + { + "epoch": 0.15584106023407146, + "grad_norm": 1.8858363628387451, + "learning_rate": 1.283582200689682e-05, + "loss": 0.6463, + "step": 3269 + }, + { + "epoch": 0.15588873262937097, + "grad_norm": 2.1750049591064453, + "learning_rate": 1.283205506682304e-05, + "loss": 0.719, + "step": 3270 + }, + { + "epoch": 0.15593640502467046, + "grad_norm": 1.5909249782562256, + "learning_rate": 1.2828287689792331e-05, + "loss": 0.8247, + "step": 3271 + }, + { + "epoch": 0.15598407741996997, + "grad_norm": 1.7785180807113647, + "learning_rate": 1.2824519876385957e-05, + "loss": 0.8106, + "step": 3272 + }, + { + "epoch": 0.15603174981526946, + "grad_norm": 5.155399799346924, + "learning_rate": 1.2820751627185248e-05, + "loss": 0.6847, + "step": 3273 + }, + { + "epoch": 0.15607942221056897, + "grad_norm": 2.3755834102630615, + "learning_rate": 1.2816982942771616e-05, + "loss": 1.1485, + "step": 3274 + }, + { + "epoch": 0.15612709460586846, + "grad_norm": 1.1761051416397095, + "learning_rate": 1.2813213823726524e-05, + "loss": 0.5474, + "step": 3275 + }, + { + "epoch": 0.15617476700116797, + "grad_norm": 1.2053571939468384, + "learning_rate": 1.2809444270631508e-05, + "loss": 0.7944, + "step": 3276 + }, + { + "epoch": 0.1562224393964675, + "grad_norm": 1.4138482809066772, + "learning_rate": 1.2805674284068175e-05, + "loss": 0.9313, + "step": 3277 + }, + { + "epoch": 0.15627011179176697, + "grad_norm": 1.2893582582473755, + "learning_rate": 1.2801903864618193e-05, + "loss": 0.7347, + "step": 3278 + }, + { + "epoch": 0.1563177841870665, + "grad_norm": 1.8208036422729492, + "learning_rate": 1.2798133012863297e-05, + "loss": 0.8083, + "step": 3279 + }, + { + "epoch": 0.15636545658236597, + "grad_norm": 1.302425742149353, + "learning_rate": 1.2794361729385291e-05, + "loss": 0.6992, + "step": 3280 + }, + { + "epoch": 0.1564131289776655, + "grad_norm": 2.7278079986572266, + "learning_rate": 1.279059001476605e-05, + "loss": 0.6065, + "step": 3281 + }, + { + "epoch": 0.15646080137296497, + "grad_norm": 1.6937270164489746, + "learning_rate": 1.2786817869587504e-05, + "loss": 0.6062, + "step": 3282 + }, + { + "epoch": 0.1565084737682645, + "grad_norm": 1.5980437994003296, + "learning_rate": 1.2783045294431662e-05, + "loss": 0.6007, + "step": 3283 + }, + { + "epoch": 0.15655614616356398, + "grad_norm": 1.3968660831451416, + "learning_rate": 1.2779272289880589e-05, + "loss": 0.8586, + "step": 3284 + }, + { + "epoch": 0.1566038185588635, + "grad_norm": 1.3071876764297485, + "learning_rate": 1.2775498856516422e-05, + "loss": 0.7345, + "step": 3285 + }, + { + "epoch": 0.156651490954163, + "grad_norm": 1.080914855003357, + "learning_rate": 1.2771724994921367e-05, + "loss": 0.6449, + "step": 3286 + }, + { + "epoch": 0.1566991633494625, + "grad_norm": 1.6707903146743774, + "learning_rate": 1.2767950705677685e-05, + "loss": 0.4934, + "step": 3287 + }, + { + "epoch": 0.156746835744762, + "grad_norm": 2.411984920501709, + "learning_rate": 1.2764175989367717e-05, + "loss": 0.4383, + "step": 3288 + }, + { + "epoch": 0.1567945081400615, + "grad_norm": 1.9843446016311646, + "learning_rate": 1.2760400846573858e-05, + "loss": 0.6825, + "step": 3289 + }, + 
{ + "epoch": 0.156842180535361, + "grad_norm": 1.7812992334365845, + "learning_rate": 1.2756625277878571e-05, + "loss": 0.8901, + "step": 3290 + }, + { + "epoch": 0.1568898529306605, + "grad_norm": 1.4307302236557007, + "learning_rate": 1.2752849283864395e-05, + "loss": 0.9962, + "step": 3291 + }, + { + "epoch": 0.15693752532596, + "grad_norm": 2.1438846588134766, + "learning_rate": 1.2749072865113926e-05, + "loss": 0.7809, + "step": 3292 + }, + { + "epoch": 0.15698519772125952, + "grad_norm": 1.1189720630645752, + "learning_rate": 1.274529602220982e-05, + "loss": 0.7703, + "step": 3293 + }, + { + "epoch": 0.157032870116559, + "grad_norm": 1.2004164457321167, + "learning_rate": 1.2741518755734809e-05, + "loss": 0.8478, + "step": 3294 + }, + { + "epoch": 0.15708054251185852, + "grad_norm": 1.7471860647201538, + "learning_rate": 1.2737741066271689e-05, + "loss": 0.4228, + "step": 3295 + }, + { + "epoch": 0.157128214907158, + "grad_norm": 3.3640732765197754, + "learning_rate": 1.2733962954403311e-05, + "loss": 0.7958, + "step": 3296 + }, + { + "epoch": 0.15717588730245752, + "grad_norm": 1.421522617340088, + "learning_rate": 1.2730184420712605e-05, + "loss": 0.5254, + "step": 3297 + }, + { + "epoch": 0.157223559697757, + "grad_norm": 2.8206255435943604, + "learning_rate": 1.2726405465782562e-05, + "loss": 1.316, + "step": 3298 + }, + { + "epoch": 0.15727123209305652, + "grad_norm": 7.158819198608398, + "learning_rate": 1.2722626090196229e-05, + "loss": 0.7735, + "step": 3299 + }, + { + "epoch": 0.157318904488356, + "grad_norm": 5.5446391105651855, + "learning_rate": 1.2718846294536729e-05, + "loss": 0.7064, + "step": 3300 + }, + { + "epoch": 0.15736657688365552, + "grad_norm": 1.9584927558898926, + "learning_rate": 1.2715066079387243e-05, + "loss": 0.4701, + "step": 3301 + }, + { + "epoch": 0.15741424927895503, + "grad_norm": 2.7662882804870605, + "learning_rate": 1.2711285445331023e-05, + "loss": 0.882, + "step": 3302 + }, + { + "epoch": 0.15746192167425452, + "grad_norm": 1.7443525791168213, + "learning_rate": 1.270750439295138e-05, + "loss": 0.855, + "step": 3303 + }, + { + "epoch": 0.15750959406955403, + "grad_norm": 1.2526910305023193, + "learning_rate": 1.270372292283169e-05, + "loss": 0.6725, + "step": 3304 + }, + { + "epoch": 0.15755726646485352, + "grad_norm": 1.508876085281372, + "learning_rate": 1.2699941035555394e-05, + "loss": 1.0443, + "step": 3305 + }, + { + "epoch": 0.15760493886015303, + "grad_norm": 1.8699249029159546, + "learning_rate": 1.2696158731706e-05, + "loss": 0.6183, + "step": 3306 + }, + { + "epoch": 0.15765261125545252, + "grad_norm": 2.2554659843444824, + "learning_rate": 1.269237601186708e-05, + "loss": 0.513, + "step": 3307 + }, + { + "epoch": 0.15770028365075203, + "grad_norm": 1.14159095287323, + "learning_rate": 1.2688592876622268e-05, + "loss": 0.7366, + "step": 3308 + }, + { + "epoch": 0.15774795604605155, + "grad_norm": 1.5288643836975098, + "learning_rate": 1.2684809326555266e-05, + "loss": 0.8124, + "step": 3309 + }, + { + "epoch": 0.15779562844135103, + "grad_norm": 1.5562394857406616, + "learning_rate": 1.2681025362249826e-05, + "loss": 0.7951, + "step": 3310 + }, + { + "epoch": 0.15784330083665055, + "grad_norm": 1.2490415573120117, + "learning_rate": 1.2677240984289787e-05, + "loss": 0.5636, + "step": 3311 + }, + { + "epoch": 0.15789097323195003, + "grad_norm": 1.3667465448379517, + "learning_rate": 1.2673456193259033e-05, + "loss": 0.7148, + "step": 3312 + }, + { + "epoch": 0.15793864562724955, + "grad_norm": 3.3306851387023926, + "learning_rate": 
1.2669670989741519e-05, + "loss": 0.7152, + "step": 3313 + }, + { + "epoch": 0.15798631802254903, + "grad_norm": 2.215132713317871, + "learning_rate": 1.2665885374321263e-05, + "loss": 0.6839, + "step": 3314 + }, + { + "epoch": 0.15803399041784855, + "grad_norm": 2.5594356060028076, + "learning_rate": 1.2662099347582348e-05, + "loss": 0.7114, + "step": 3315 + }, + { + "epoch": 0.15808166281314803, + "grad_norm": 1.0005269050598145, + "learning_rate": 1.2658312910108919e-05, + "loss": 0.4162, + "step": 3316 + }, + { + "epoch": 0.15812933520844755, + "grad_norm": 3.5879533290863037, + "learning_rate": 1.2654526062485182e-05, + "loss": 1.1503, + "step": 3317 + }, + { + "epoch": 0.15817700760374706, + "grad_norm": 3.5539066791534424, + "learning_rate": 1.265073880529541e-05, + "loss": 0.4695, + "step": 3318 + }, + { + "epoch": 0.15822467999904655, + "grad_norm": 2.7896761894226074, + "learning_rate": 1.2646951139123935e-05, + "loss": 0.7786, + "step": 3319 + }, + { + "epoch": 0.15827235239434606, + "grad_norm": 1.2362169027328491, + "learning_rate": 1.2643163064555163e-05, + "loss": 0.6081, + "step": 3320 + }, + { + "epoch": 0.15832002478964555, + "grad_norm": 1.14006769657135, + "learning_rate": 1.2639374582173548e-05, + "loss": 0.4521, + "step": 3321 + }, + { + "epoch": 0.15836769718494506, + "grad_norm": 1.6921024322509766, + "learning_rate": 1.263558569256361e-05, + "loss": 0.9964, + "step": 3322 + }, + { + "epoch": 0.15841536958024455, + "grad_norm": 4.475882053375244, + "learning_rate": 1.2631796396309945e-05, + "loss": 0.9196, + "step": 3323 + }, + { + "epoch": 0.15846304197554406, + "grad_norm": 1.8194913864135742, + "learning_rate": 1.2628006693997199e-05, + "loss": 0.4348, + "step": 3324 + }, + { + "epoch": 0.15851071437084358, + "grad_norm": 1.9415637254714966, + "learning_rate": 1.2624216586210084e-05, + "loss": 0.6005, + "step": 3325 + }, + { + "epoch": 0.15855838676614306, + "grad_norm": 1.2305355072021484, + "learning_rate": 1.2620426073533371e-05, + "loss": 0.7007, + "step": 3326 + }, + { + "epoch": 0.15860605916144258, + "grad_norm": 1.2120805978775024, + "learning_rate": 1.2616635156551902e-05, + "loss": 0.3924, + "step": 3327 + }, + { + "epoch": 0.15865373155674206, + "grad_norm": 2.912013530731201, + "learning_rate": 1.2612843835850574e-05, + "loss": 0.3591, + "step": 3328 + }, + { + "epoch": 0.15870140395204158, + "grad_norm": 1.610073447227478, + "learning_rate": 1.2609052112014349e-05, + "loss": 1.2035, + "step": 3329 + }, + { + "epoch": 0.15874907634734106, + "grad_norm": 1.8236873149871826, + "learning_rate": 1.2605259985628248e-05, + "loss": 0.7862, + "step": 3330 + }, + { + "epoch": 0.15879674874264058, + "grad_norm": 3.075911045074463, + "learning_rate": 1.2601467457277368e-05, + "loss": 0.362, + "step": 3331 + }, + { + "epoch": 0.1588444211379401, + "grad_norm": 2.6457345485687256, + "learning_rate": 1.2597674527546846e-05, + "loss": 1.1366, + "step": 3332 + }, + { + "epoch": 0.15889209353323958, + "grad_norm": 1.542930006980896, + "learning_rate": 1.259388119702189e-05, + "loss": 0.6585, + "step": 3333 + }, + { + "epoch": 0.1589397659285391, + "grad_norm": 1.234430193901062, + "learning_rate": 1.2590087466287783e-05, + "loss": 0.3408, + "step": 3334 + }, + { + "epoch": 0.15898743832383858, + "grad_norm": 2.957338333129883, + "learning_rate": 1.2586293335929851e-05, + "loss": 1.2782, + "step": 3335 + }, + { + "epoch": 0.1590351107191381, + "grad_norm": 1.1967005729675293, + "learning_rate": 1.258249880653349e-05, + "loss": 0.5389, + "step": 3336 + }, + { + "epoch": 
0.15908278311443758, + "grad_norm": 3.108670711517334, + "learning_rate": 1.2578703878684158e-05, + "loss": 1.1146, + "step": 3337 + }, + { + "epoch": 0.1591304555097371, + "grad_norm": 0.7346532344818115, + "learning_rate": 1.2574908552967374e-05, + "loss": 0.3113, + "step": 3338 + }, + { + "epoch": 0.15917812790503658, + "grad_norm": 1.4196573495864868, + "learning_rate": 1.2571112829968716e-05, + "loss": 0.4386, + "step": 3339 + }, + { + "epoch": 0.1592258003003361, + "grad_norm": 1.0020333528518677, + "learning_rate": 1.256731671027383e-05, + "loss": 0.7071, + "step": 3340 + }, + { + "epoch": 0.1592734726956356, + "grad_norm": 1.1035711765289307, + "learning_rate": 1.2563520194468408e-05, + "loss": 0.7352, + "step": 3341 + }, + { + "epoch": 0.1593211450909351, + "grad_norm": 1.7467808723449707, + "learning_rate": 1.2559723283138219e-05, + "loss": 1.1882, + "step": 3342 + }, + { + "epoch": 0.1593688174862346, + "grad_norm": 1.1355706453323364, + "learning_rate": 1.255592597686909e-05, + "loss": 0.3259, + "step": 3343 + }, + { + "epoch": 0.1594164898815341, + "grad_norm": 2.670177936553955, + "learning_rate": 1.2552128276246905e-05, + "loss": 0.8934, + "step": 3344 + }, + { + "epoch": 0.1594641622768336, + "grad_norm": 1.452782154083252, + "learning_rate": 1.2548330181857605e-05, + "loss": 0.4914, + "step": 3345 + }, + { + "epoch": 0.1595118346721331, + "grad_norm": 1.8647856712341309, + "learning_rate": 1.2544531694287203e-05, + "loss": 0.6514, + "step": 3346 + }, + { + "epoch": 0.1595595070674326, + "grad_norm": 1.6509310007095337, + "learning_rate": 1.2540732814121763e-05, + "loss": 0.1358, + "step": 3347 + }, + { + "epoch": 0.15960717946273212, + "grad_norm": 1.2296963930130005, + "learning_rate": 1.2536933541947416e-05, + "loss": 0.5515, + "step": 3348 + }, + { + "epoch": 0.1596548518580316, + "grad_norm": 2.1495020389556885, + "learning_rate": 1.2533133878350348e-05, + "loss": 1.0511, + "step": 3349 + }, + { + "epoch": 0.15970252425333112, + "grad_norm": 1.151329517364502, + "learning_rate": 1.2529333823916807e-05, + "loss": 0.3669, + "step": 3350 + }, + { + "epoch": 0.1597501966486306, + "grad_norm": 1.3974355459213257, + "learning_rate": 1.2525533379233108e-05, + "loss": 0.9286, + "step": 3351 + }, + { + "epoch": 0.15979786904393012, + "grad_norm": 1.4394891262054443, + "learning_rate": 1.2521732544885614e-05, + "loss": 0.5116, + "step": 3352 + }, + { + "epoch": 0.1598455414392296, + "grad_norm": 1.2312219142913818, + "learning_rate": 1.2517931321460756e-05, + "loss": 0.9085, + "step": 3353 + }, + { + "epoch": 0.15989321383452912, + "grad_norm": 2.1159145832061768, + "learning_rate": 1.251412970954503e-05, + "loss": 0.7617, + "step": 3354 + }, + { + "epoch": 0.1599408862298286, + "grad_norm": 2.5563313961029053, + "learning_rate": 1.2510327709724976e-05, + "loss": 1.084, + "step": 3355 + }, + { + "epoch": 0.15998855862512812, + "grad_norm": 1.485469102859497, + "learning_rate": 1.2506525322587207e-05, + "loss": 0.8285, + "step": 3356 + }, + { + "epoch": 0.16003623102042763, + "grad_norm": 2.3007662296295166, + "learning_rate": 1.2502722548718396e-05, + "loss": 0.6163, + "step": 3357 + }, + { + "epoch": 0.16008390341572712, + "grad_norm": 2.549318790435791, + "learning_rate": 1.2498919388705266e-05, + "loss": 0.8862, + "step": 3358 + }, + { + "epoch": 0.16013157581102663, + "grad_norm": 1.3746862411499023, + "learning_rate": 1.2495115843134608e-05, + "loss": 0.7938, + "step": 3359 + }, + { + "epoch": 0.16017924820632612, + "grad_norm": 1.6238036155700684, + "learning_rate": 
1.249131191259327e-05, + "loss": 0.4767, + "step": 3360 + }, + { + "epoch": 0.16022692060162563, + "grad_norm": 1.5184221267700195, + "learning_rate": 1.2487507597668163e-05, + "loss": 0.9831, + "step": 3361 + }, + { + "epoch": 0.16027459299692512, + "grad_norm": 1.7125096321105957, + "learning_rate": 1.2483702898946249e-05, + "loss": 0.5875, + "step": 3362 + }, + { + "epoch": 0.16032226539222463, + "grad_norm": 1.3688068389892578, + "learning_rate": 1.2479897817014553e-05, + "loss": 0.6461, + "step": 3363 + }, + { + "epoch": 0.16036993778752415, + "grad_norm": 1.172893762588501, + "learning_rate": 1.2476092352460161e-05, + "loss": 0.6278, + "step": 3364 + }, + { + "epoch": 0.16041761018282363, + "grad_norm": 2.0960302352905273, + "learning_rate": 1.2472286505870222e-05, + "loss": 0.4483, + "step": 3365 + }, + { + "epoch": 0.16046528257812315, + "grad_norm": 1.4265042543411255, + "learning_rate": 1.246848027783193e-05, + "loss": 0.9282, + "step": 3366 + }, + { + "epoch": 0.16051295497342263, + "grad_norm": 1.6536892652511597, + "learning_rate": 1.2464673668932555e-05, + "loss": 0.8627, + "step": 3367 + }, + { + "epoch": 0.16056062736872215, + "grad_norm": 2.113489866256714, + "learning_rate": 1.2460866679759412e-05, + "loss": 0.694, + "step": 3368 + }, + { + "epoch": 0.16060829976402163, + "grad_norm": 1.298811435699463, + "learning_rate": 1.2457059310899887e-05, + "loss": 0.8632, + "step": 3369 + }, + { + "epoch": 0.16065597215932115, + "grad_norm": 2.6942338943481445, + "learning_rate": 1.2453251562941406e-05, + "loss": 0.2747, + "step": 3370 + }, + { + "epoch": 0.16070364455462063, + "grad_norm": 1.8676705360412598, + "learning_rate": 1.2449443436471476e-05, + "loss": 0.9307, + "step": 3371 + }, + { + "epoch": 0.16075131694992015, + "grad_norm": 1.7404451370239258, + "learning_rate": 1.2445634932077648e-05, + "loss": 0.6081, + "step": 3372 + }, + { + "epoch": 0.16079898934521966, + "grad_norm": 1.310207724571228, + "learning_rate": 1.2441826050347535e-05, + "loss": 0.7807, + "step": 3373 + }, + { + "epoch": 0.16084666174051915, + "grad_norm": 1.459877610206604, + "learning_rate": 1.243801679186881e-05, + "loss": 0.6398, + "step": 3374 + }, + { + "epoch": 0.16089433413581866, + "grad_norm": 1.8162803649902344, + "learning_rate": 1.24342071572292e-05, + "loss": 0.5995, + "step": 3375 + }, + { + "epoch": 0.16094200653111815, + "grad_norm": 0.9086304306983948, + "learning_rate": 1.243039714701649e-05, + "loss": 0.3061, + "step": 3376 + }, + { + "epoch": 0.16098967892641766, + "grad_norm": 5.885626316070557, + "learning_rate": 1.2426586761818533e-05, + "loss": 0.7183, + "step": 3377 + }, + { + "epoch": 0.16103735132171715, + "grad_norm": 1.9913275241851807, + "learning_rate": 1.2422776002223226e-05, + "loss": 0.8075, + "step": 3378 + }, + { + "epoch": 0.16108502371701666, + "grad_norm": 2.7252678871154785, + "learning_rate": 1.2418964868818529e-05, + "loss": 1.0008, + "step": 3379 + }, + { + "epoch": 0.16113269611231618, + "grad_norm": 1.4492295980453491, + "learning_rate": 1.2415153362192466e-05, + "loss": 0.72, + "step": 3380 + }, + { + "epoch": 0.16118036850761566, + "grad_norm": 3.650787353515625, + "learning_rate": 1.241134148293311e-05, + "loss": 1.0848, + "step": 3381 + }, + { + "epoch": 0.16122804090291518, + "grad_norm": 1.4103679656982422, + "learning_rate": 1.2407529231628595e-05, + "loss": 0.5846, + "step": 3382 + }, + { + "epoch": 0.16127571329821466, + "grad_norm": 2.3698039054870605, + "learning_rate": 1.2403716608867111e-05, + "loss": 1.0237, + "step": 3383 + }, + { + 
"epoch": 0.16132338569351418, + "grad_norm": 1.0095607042312622, + "learning_rate": 1.239990361523691e-05, + "loss": 0.3932, + "step": 3384 + }, + { + "epoch": 0.16137105808881366, + "grad_norm": 1.4036271572113037, + "learning_rate": 1.2396090251326296e-05, + "loss": 1.0634, + "step": 3385 + }, + { + "epoch": 0.16141873048411318, + "grad_norm": 1.1031994819641113, + "learning_rate": 1.239227651772363e-05, + "loss": 0.5795, + "step": 3386 + }, + { + "epoch": 0.16146640287941266, + "grad_norm": 2.0513222217559814, + "learning_rate": 1.2388462415017331e-05, + "loss": 0.7466, + "step": 3387 + }, + { + "epoch": 0.16151407527471218, + "grad_norm": 1.854997992515564, + "learning_rate": 1.238464794379588e-05, + "loss": 0.7148, + "step": 3388 + }, + { + "epoch": 0.1615617476700117, + "grad_norm": 2.1990208625793457, + "learning_rate": 1.2380833104647807e-05, + "loss": 0.8308, + "step": 3389 + }, + { + "epoch": 0.16160942006531118, + "grad_norm": 1.3848634958267212, + "learning_rate": 1.2377017898161703e-05, + "loss": 0.7257, + "step": 3390 + }, + { + "epoch": 0.1616570924606107, + "grad_norm": 1.0949569940567017, + "learning_rate": 1.2373202324926222e-05, + "loss": 0.4295, + "step": 3391 + }, + { + "epoch": 0.16170476485591018, + "grad_norm": 1.570083737373352, + "learning_rate": 1.2369386385530055e-05, + "loss": 0.8027, + "step": 3392 + }, + { + "epoch": 0.1617524372512097, + "grad_norm": 3.2252635955810547, + "learning_rate": 1.2365570080561971e-05, + "loss": 1.1789, + "step": 3393 + }, + { + "epoch": 0.16180010964650918, + "grad_norm": 1.8009815216064453, + "learning_rate": 1.2361753410610784e-05, + "loss": 0.6498, + "step": 3394 + }, + { + "epoch": 0.1618477820418087, + "grad_norm": 2.185048818588257, + "learning_rate": 1.2357936376265367e-05, + "loss": 0.8915, + "step": 3395 + }, + { + "epoch": 0.1618954544371082, + "grad_norm": 1.036126732826233, + "learning_rate": 1.2354118978114648e-05, + "loss": 0.3432, + "step": 3396 + }, + { + "epoch": 0.1619431268324077, + "grad_norm": 1.647308349609375, + "learning_rate": 1.2350301216747615e-05, + "loss": 0.6703, + "step": 3397 + }, + { + "epoch": 0.1619907992277072, + "grad_norm": 1.5020028352737427, + "learning_rate": 1.2346483092753307e-05, + "loss": 0.7109, + "step": 3398 + }, + { + "epoch": 0.1620384716230067, + "grad_norm": 1.7878875732421875, + "learning_rate": 1.2342664606720823e-05, + "loss": 0.8738, + "step": 3399 + }, + { + "epoch": 0.1620861440183062, + "grad_norm": 1.8247052431106567, + "learning_rate": 1.2338845759239315e-05, + "loss": 0.6921, + "step": 3400 + }, + { + "epoch": 0.1621338164136057, + "grad_norm": 3.973893642425537, + "learning_rate": 1.233502655089799e-05, + "loss": 0.8575, + "step": 3401 + }, + { + "epoch": 0.1621814888089052, + "grad_norm": 1.4395326375961304, + "learning_rate": 1.2331206982286114e-05, + "loss": 0.4054, + "step": 3402 + }, + { + "epoch": 0.1622291612042047, + "grad_norm": 1.9963631629943848, + "learning_rate": 1.232738705399301e-05, + "loss": 0.6682, + "step": 3403 + }, + { + "epoch": 0.1622768335995042, + "grad_norm": 3.30143404006958, + "learning_rate": 1.2323566766608049e-05, + "loss": 0.7009, + "step": 3404 + }, + { + "epoch": 0.16232450599480372, + "grad_norm": 3.1036858558654785, + "learning_rate": 1.2319746120720665e-05, + "loss": 0.52, + "step": 3405 + }, + { + "epoch": 0.1623721783901032, + "grad_norm": 1.8379104137420654, + "learning_rate": 1.2315925116920342e-05, + "loss": 0.6955, + "step": 3406 + }, + { + "epoch": 0.16241985078540272, + "grad_norm": 1.5141234397888184, + "learning_rate": 
1.2312103755796625e-05, + "loss": 0.8178, + "step": 3407 + }, + { + "epoch": 0.1624675231807022, + "grad_norm": 1.6963640451431274, + "learning_rate": 1.2308282037939108e-05, + "loss": 0.7978, + "step": 3408 + }, + { + "epoch": 0.16251519557600172, + "grad_norm": 1.6503376960754395, + "learning_rate": 1.2304459963937443e-05, + "loss": 0.6586, + "step": 3409 + }, + { + "epoch": 0.1625628679713012, + "grad_norm": 1.685644268989563, + "learning_rate": 1.2300637534381336e-05, + "loss": 1.053, + "step": 3410 + }, + { + "epoch": 0.16261054036660072, + "grad_norm": 3.060255289077759, + "learning_rate": 1.229681474986055e-05, + "loss": 1.0819, + "step": 3411 + }, + { + "epoch": 0.16265821276190023, + "grad_norm": 1.2974894046783447, + "learning_rate": 1.2292991610964902e-05, + "loss": 0.9139, + "step": 3412 + }, + { + "epoch": 0.16270588515719972, + "grad_norm": 2.8959174156188965, + "learning_rate": 1.228916811828426e-05, + "loss": 0.5987, + "step": 3413 + }, + { + "epoch": 0.16275355755249923, + "grad_norm": 1.131511926651001, + "learning_rate": 1.2285344272408553e-05, + "loss": 0.7501, + "step": 3414 + }, + { + "epoch": 0.16280122994779872, + "grad_norm": 1.3929219245910645, + "learning_rate": 1.2281520073927757e-05, + "loss": 0.4989, + "step": 3415 + }, + { + "epoch": 0.16284890234309823, + "grad_norm": 1.792142391204834, + "learning_rate": 1.227769552343191e-05, + "loss": 0.4692, + "step": 3416 + }, + { + "epoch": 0.16289657473839772, + "grad_norm": 1.2551616430282593, + "learning_rate": 1.2273870621511098e-05, + "loss": 0.8854, + "step": 3417 + }, + { + "epoch": 0.16294424713369723, + "grad_norm": 1.6115854978561401, + "learning_rate": 1.2270045368755467e-05, + "loss": 0.8475, + "step": 3418 + }, + { + "epoch": 0.16299191952899675, + "grad_norm": 1.9165464639663696, + "learning_rate": 1.2266219765755211e-05, + "loss": 0.7767, + "step": 3419 + }, + { + "epoch": 0.16303959192429623, + "grad_norm": 1.447646975517273, + "learning_rate": 1.2262393813100584e-05, + "loss": 0.7413, + "step": 3420 + }, + { + "epoch": 0.16308726431959575, + "grad_norm": 1.78177011013031, + "learning_rate": 1.2258567511381891e-05, + "loss": 0.978, + "step": 3421 + }, + { + "epoch": 0.16313493671489523, + "grad_norm": 3.3577868938446045, + "learning_rate": 1.225474086118949e-05, + "loss": 0.6909, + "step": 3422 + }, + { + "epoch": 0.16318260911019475, + "grad_norm": 1.737442135810852, + "learning_rate": 1.2250913863113792e-05, + "loss": 1.1041, + "step": 3423 + }, + { + "epoch": 0.16323028150549423, + "grad_norm": 1.2454472780227661, + "learning_rate": 1.2247086517745262e-05, + "loss": 0.9252, + "step": 3424 + }, + { + "epoch": 0.16327795390079375, + "grad_norm": 1.4345704317092896, + "learning_rate": 1.2243258825674424e-05, + "loss": 1.1007, + "step": 3425 + }, + { + "epoch": 0.16332562629609323, + "grad_norm": 1.4201923608779907, + "learning_rate": 1.2239430787491853e-05, + "loss": 0.7471, + "step": 3426 + }, + { + "epoch": 0.16337329869139275, + "grad_norm": 10.622687339782715, + "learning_rate": 1.2235602403788172e-05, + "loss": 1.0874, + "step": 3427 + }, + { + "epoch": 0.16342097108669226, + "grad_norm": 1.4215333461761475, + "learning_rate": 1.2231773675154062e-05, + "loss": 0.7744, + "step": 3428 + }, + { + "epoch": 0.16346864348199175, + "grad_norm": 1.4177908897399902, + "learning_rate": 1.222794460218026e-05, + "loss": 0.9184, + "step": 3429 + }, + { + "epoch": 0.16351631587729126, + "grad_norm": 1.4957600831985474, + "learning_rate": 1.2224115185457543e-05, + "loss": 0.8884, + "step": 3430 + }, + { + 
"epoch": 0.16356398827259075, + "grad_norm": 1.203901767730713, + "learning_rate": 1.222028542557676e-05, + "loss": 0.5323, + "step": 3431 + }, + { + "epoch": 0.16361166066789026, + "grad_norm": 3.368590831756592, + "learning_rate": 1.2216455323128801e-05, + "loss": 1.0471, + "step": 3432 + }, + { + "epoch": 0.16365933306318975, + "grad_norm": 1.8145172595977783, + "learning_rate": 1.2212624878704612e-05, + "loss": 0.8097, + "step": 3433 + }, + { + "epoch": 0.16370700545848926, + "grad_norm": 1.7572021484375, + "learning_rate": 1.2208794092895187e-05, + "loss": 1.0413, + "step": 3434 + }, + { + "epoch": 0.16375467785378878, + "grad_norm": 1.6635613441467285, + "learning_rate": 1.220496296629158e-05, + "loss": 0.3629, + "step": 3435 + }, + { + "epoch": 0.16380235024908826, + "grad_norm": 1.844543695449829, + "learning_rate": 1.2201131499484896e-05, + "loss": 0.5276, + "step": 3436 + }, + { + "epoch": 0.16385002264438778, + "grad_norm": 1.5143234729766846, + "learning_rate": 1.219729969306629e-05, + "loss": 0.8425, + "step": 3437 + }, + { + "epoch": 0.16389769503968726, + "grad_norm": 2.3293912410736084, + "learning_rate": 1.2193467547626966e-05, + "loss": 0.8095, + "step": 3438 + }, + { + "epoch": 0.16394536743498678, + "grad_norm": 1.9489166736602783, + "learning_rate": 1.2189635063758188e-05, + "loss": 0.8012, + "step": 3439 + }, + { + "epoch": 0.16399303983028626, + "grad_norm": 1.4042048454284668, + "learning_rate": 1.2185802242051267e-05, + "loss": 0.6002, + "step": 3440 + }, + { + "epoch": 0.16404071222558578, + "grad_norm": 1.6882002353668213, + "learning_rate": 1.218196908309757e-05, + "loss": 0.7757, + "step": 3441 + }, + { + "epoch": 0.16408838462088526, + "grad_norm": 1.1849122047424316, + "learning_rate": 1.2178135587488515e-05, + "loss": 0.819, + "step": 3442 + }, + { + "epoch": 0.16413605701618478, + "grad_norm": 3.708073616027832, + "learning_rate": 1.2174301755815572e-05, + "loss": 0.9429, + "step": 3443 + }, + { + "epoch": 0.1641837294114843, + "grad_norm": 1.468268871307373, + "learning_rate": 1.2170467588670256e-05, + "loss": 0.6409, + "step": 3444 + }, + { + "epoch": 0.16423140180678378, + "grad_norm": 2.0025761127471924, + "learning_rate": 1.2166633086644142e-05, + "loss": 1.0012, + "step": 3445 + }, + { + "epoch": 0.1642790742020833, + "grad_norm": 1.5201904773712158, + "learning_rate": 1.2162798250328857e-05, + "loss": 0.7626, + "step": 3446 + }, + { + "epoch": 0.16432674659738278, + "grad_norm": 1.4745078086853027, + "learning_rate": 1.2158963080316071e-05, + "loss": 0.6486, + "step": 3447 + }, + { + "epoch": 0.1643744189926823, + "grad_norm": 2.671544313430786, + "learning_rate": 1.2155127577197519e-05, + "loss": 1.401, + "step": 3448 + }, + { + "epoch": 0.16442209138798178, + "grad_norm": 12.405790328979492, + "learning_rate": 1.2151291741564972e-05, + "loss": 0.0574, + "step": 3449 + }, + { + "epoch": 0.1644697637832813, + "grad_norm": 1.7385684251785278, + "learning_rate": 1.2147455574010263e-05, + "loss": 0.7392, + "step": 3450 + }, + { + "epoch": 0.1645174361785808, + "grad_norm": 2.2188470363616943, + "learning_rate": 1.2143619075125277e-05, + "loss": 0.8268, + "step": 3451 + }, + { + "epoch": 0.1645651085738803, + "grad_norm": 2.021573066711426, + "learning_rate": 1.2139782245501942e-05, + "loss": 0.5961, + "step": 3452 + }, + { + "epoch": 0.1646127809691798, + "grad_norm": 1.4897230863571167, + "learning_rate": 1.213594508573224e-05, + "loss": 0.616, + "step": 3453 + }, + { + "epoch": 0.1646604533644793, + "grad_norm": 1.7904022932052612, + "learning_rate": 
1.2132107596408207e-05, + "loss": 0.7339, + "step": 3454 + }, + { + "epoch": 0.1647081257597788, + "grad_norm": 2.0066168308258057, + "learning_rate": 1.212826977812193e-05, + "loss": 0.6886, + "step": 3455 + }, + { + "epoch": 0.1647557981550783, + "grad_norm": 2.741666793823242, + "learning_rate": 1.212443163146554e-05, + "loss": 0.6876, + "step": 3456 + }, + { + "epoch": 0.1648034705503778, + "grad_norm": 3.0157713890075684, + "learning_rate": 1.2120593157031231e-05, + "loss": 0.6456, + "step": 3457 + }, + { + "epoch": 0.1648511429456773, + "grad_norm": 2.3613779544830322, + "learning_rate": 1.2116754355411233e-05, + "loss": 1.0501, + "step": 3458 + }, + { + "epoch": 0.1648988153409768, + "grad_norm": 1.2164865732192993, + "learning_rate": 1.2112915227197836e-05, + "loss": 0.6544, + "step": 3459 + }, + { + "epoch": 0.16494648773627632, + "grad_norm": 1.42069673538208, + "learning_rate": 1.2109075772983383e-05, + "loss": 0.7576, + "step": 3460 + }, + { + "epoch": 0.1649941601315758, + "grad_norm": 1.328682780265808, + "learning_rate": 1.2105235993360252e-05, + "loss": 0.8058, + "step": 3461 + }, + { + "epoch": 0.16504183252687532, + "grad_norm": 1.2135480642318726, + "learning_rate": 1.2101395888920888e-05, + "loss": 1.1734, + "step": 3462 + }, + { + "epoch": 0.1650895049221748, + "grad_norm": 1.574333906173706, + "learning_rate": 1.2097555460257779e-05, + "loss": 0.8645, + "step": 3463 + }, + { + "epoch": 0.16513717731747432, + "grad_norm": 1.5083953142166138, + "learning_rate": 1.2093714707963464e-05, + "loss": 0.5608, + "step": 3464 + }, + { + "epoch": 0.1651848497127738, + "grad_norm": 1.5522809028625488, + "learning_rate": 1.2089873632630531e-05, + "loss": 0.5864, + "step": 3465 + }, + { + "epoch": 0.16523252210807332, + "grad_norm": 2.6948087215423584, + "learning_rate": 1.2086032234851616e-05, + "loss": 0.8208, + "step": 3466 + }, + { + "epoch": 0.16528019450337283, + "grad_norm": 3.029338836669922, + "learning_rate": 1.2082190515219412e-05, + "loss": 0.6971, + "step": 3467 + }, + { + "epoch": 0.16532786689867232, + "grad_norm": 1.52670419216156, + "learning_rate": 1.2078348474326652e-05, + "loss": 0.671, + "step": 3468 + }, + { + "epoch": 0.16537553929397183, + "grad_norm": 2.5811350345611572, + "learning_rate": 1.2074506112766127e-05, + "loss": 1.4676, + "step": 3469 + }, + { + "epoch": 0.16542321168927132, + "grad_norm": 2.4120121002197266, + "learning_rate": 1.2070663431130666e-05, + "loss": 0.7292, + "step": 3470 + }, + { + "epoch": 0.16547088408457084, + "grad_norm": 1.183908462524414, + "learning_rate": 1.2066820430013168e-05, + "loss": 0.325, + "step": 3471 + }, + { + "epoch": 0.16551855647987032, + "grad_norm": 1.1767194271087646, + "learning_rate": 1.2062977110006559e-05, + "loss": 0.4957, + "step": 3472 + }, + { + "epoch": 0.16556622887516984, + "grad_norm": 2.2183735370635986, + "learning_rate": 1.205913347170383e-05, + "loss": 0.6561, + "step": 3473 + }, + { + "epoch": 0.16561390127046932, + "grad_norm": 1.6118630170822144, + "learning_rate": 1.2055289515698008e-05, + "loss": 0.8571, + "step": 3474 + }, + { + "epoch": 0.16566157366576884, + "grad_norm": 1.7566035985946655, + "learning_rate": 1.205144524258218e-05, + "loss": 0.6015, + "step": 3475 + }, + { + "epoch": 0.16570924606106835, + "grad_norm": 1.6769514083862305, + "learning_rate": 1.2047600652949476e-05, + "loss": 0.7376, + "step": 3476 + }, + { + "epoch": 0.16575691845636784, + "grad_norm": 1.9794421195983887, + "learning_rate": 1.2043755747393077e-05, + "loss": 0.432, + "step": 3477 + }, + { + "epoch": 
0.16580459085166735, + "grad_norm": 2.211209774017334, + "learning_rate": 1.203991052650621e-05, + "loss": 0.7207, + "step": 3478 + }, + { + "epoch": 0.16585226324696684, + "grad_norm": 1.421827793121338, + "learning_rate": 1.2036064990882162e-05, + "loss": 0.5774, + "step": 3479 + }, + { + "epoch": 0.16589993564226635, + "grad_norm": 1.7831792831420898, + "learning_rate": 1.2032219141114253e-05, + "loss": 0.7059, + "step": 3480 + }, + { + "epoch": 0.16594760803756584, + "grad_norm": 1.3848438262939453, + "learning_rate": 1.2028372977795854e-05, + "loss": 0.7844, + "step": 3481 + }, + { + "epoch": 0.16599528043286535, + "grad_norm": 1.4584486484527588, + "learning_rate": 1.2024526501520398e-05, + "loss": 0.4372, + "step": 3482 + }, + { + "epoch": 0.16604295282816486, + "grad_norm": 1.3964842557907104, + "learning_rate": 1.2020679712881347e-05, + "loss": 0.727, + "step": 3483 + }, + { + "epoch": 0.16609062522346435, + "grad_norm": 1.682080626487732, + "learning_rate": 1.2016832612472225e-05, + "loss": 0.3451, + "step": 3484 + }, + { + "epoch": 0.16613829761876386, + "grad_norm": 31.16162109375, + "learning_rate": 1.2012985200886602e-05, + "loss": 0.3133, + "step": 3485 + }, + { + "epoch": 0.16618597001406335, + "grad_norm": 1.2970011234283447, + "learning_rate": 1.2009137478718093e-05, + "loss": 0.6506, + "step": 3486 + }, + { + "epoch": 0.16623364240936286, + "grad_norm": 4.243335247039795, + "learning_rate": 1.2005289446560357e-05, + "loss": 0.3267, + "step": 3487 + }, + { + "epoch": 0.16628131480466235, + "grad_norm": 1.4397087097167969, + "learning_rate": 1.2001441105007114e-05, + "loss": 0.8387, + "step": 3488 + }, + { + "epoch": 0.16632898719996186, + "grad_norm": 3.6693356037139893, + "learning_rate": 1.199759245465212e-05, + "loss": 0.9004, + "step": 3489 + }, + { + "epoch": 0.16637665959526135, + "grad_norm": 1.1918872594833374, + "learning_rate": 1.199374349608918e-05, + "loss": 0.7844, + "step": 3490 + }, + { + "epoch": 0.16642433199056086, + "grad_norm": 1.837996244430542, + "learning_rate": 1.198989422991215e-05, + "loss": 0.9801, + "step": 3491 + }, + { + "epoch": 0.16647200438586038, + "grad_norm": 1.2639696598052979, + "learning_rate": 1.1986044656714933e-05, + "loss": 0.7873, + "step": 3492 + }, + { + "epoch": 0.16651967678115986, + "grad_norm": 2.8440091609954834, + "learning_rate": 1.1982194777091476e-05, + "loss": 0.8588, + "step": 3493 + }, + { + "epoch": 0.16656734917645938, + "grad_norm": 3.2376017570495605, + "learning_rate": 1.1978344591635779e-05, + "loss": 0.4817, + "step": 3494 + }, + { + "epoch": 0.16661502157175886, + "grad_norm": 1.3740285634994507, + "learning_rate": 1.1974494100941884e-05, + "loss": 0.8609, + "step": 3495 + }, + { + "epoch": 0.16666269396705838, + "grad_norm": 2.3309359550476074, + "learning_rate": 1.1970643305603885e-05, + "loss": 0.0632, + "step": 3496 + }, + { + "epoch": 0.16671036636235786, + "grad_norm": 2.289484739303589, + "learning_rate": 1.1966792206215914e-05, + "loss": 0.3885, + "step": 3497 + }, + { + "epoch": 0.16675803875765738, + "grad_norm": 1.8412286043167114, + "learning_rate": 1.1962940803372158e-05, + "loss": 0.636, + "step": 3498 + }, + { + "epoch": 0.1668057111529569, + "grad_norm": 2.3273236751556396, + "learning_rate": 1.1959089097666853e-05, + "loss": 0.7848, + "step": 3499 + }, + { + "epoch": 0.16685338354825638, + "grad_norm": 1.3379433155059814, + "learning_rate": 1.1955237089694279e-05, + "loss": 0.578, + "step": 3500 + }, + { + "epoch": 0.1669010559435559, + "grad_norm": 2.2512872219085693, + "learning_rate": 
1.1951384780048752e-05, + "loss": 0.9641, + "step": 3501 + }, + { + "epoch": 0.16694872833885538, + "grad_norm": 1.1500160694122314, + "learning_rate": 1.1947532169324649e-05, + "loss": 0.5756, + "step": 3502 + }, + { + "epoch": 0.1669964007341549, + "grad_norm": 1.8214316368103027, + "learning_rate": 1.194367925811639e-05, + "loss": 0.657, + "step": 3503 + }, + { + "epoch": 0.16704407312945438, + "grad_norm": 1.3470733165740967, + "learning_rate": 1.1939826047018436e-05, + "loss": 0.4145, + "step": 3504 + }, + { + "epoch": 0.1670917455247539, + "grad_norm": 1.4464870691299438, + "learning_rate": 1.1935972536625302e-05, + "loss": 0.5483, + "step": 3505 + }, + { + "epoch": 0.1671394179200534, + "grad_norm": 1.6713048219680786, + "learning_rate": 1.1932118727531541e-05, + "loss": 0.9356, + "step": 3506 + }, + { + "epoch": 0.1671870903153529, + "grad_norm": 1.612980604171753, + "learning_rate": 1.1928264620331755e-05, + "loss": 0.7221, + "step": 3507 + }, + { + "epoch": 0.1672347627106524, + "grad_norm": 1.2654221057891846, + "learning_rate": 1.1924410215620596e-05, + "loss": 0.5428, + "step": 3508 + }, + { + "epoch": 0.1672824351059519, + "grad_norm": 2.2483534812927246, + "learning_rate": 1.192055551399276e-05, + "loss": 0.8149, + "step": 3509 + }, + { + "epoch": 0.1673301075012514, + "grad_norm": 3.906214714050293, + "learning_rate": 1.1916700516042986e-05, + "loss": 0.7825, + "step": 3510 + }, + { + "epoch": 0.1673777798965509, + "grad_norm": 2.5887796878814697, + "learning_rate": 1.1912845222366061e-05, + "loss": 0.9846, + "step": 3511 + }, + { + "epoch": 0.1674254522918504, + "grad_norm": 2.0140204429626465, + "learning_rate": 1.1908989633556816e-05, + "loss": 0.6754, + "step": 3512 + }, + { + "epoch": 0.1674731246871499, + "grad_norm": 1.9534940719604492, + "learning_rate": 1.1905133750210126e-05, + "loss": 0.6392, + "step": 3513 + }, + { + "epoch": 0.1675207970824494, + "grad_norm": 1.862790822982788, + "learning_rate": 1.1901277572920922e-05, + "loss": 0.6461, + "step": 3514 + }, + { + "epoch": 0.16756846947774892, + "grad_norm": 1.8946439027786255, + "learning_rate": 1.1897421102284166e-05, + "loss": 0.8956, + "step": 3515 + }, + { + "epoch": 0.1676161418730484, + "grad_norm": 1.6796231269836426, + "learning_rate": 1.1893564338894872e-05, + "loss": 0.7929, + "step": 3516 + }, + { + "epoch": 0.16766381426834792, + "grad_norm": 1.0231565237045288, + "learning_rate": 1.1889707283348104e-05, + "loss": 0.5957, + "step": 3517 + }, + { + "epoch": 0.1677114866636474, + "grad_norm": 2.6896629333496094, + "learning_rate": 1.188584993623896e-05, + "loss": 0.3625, + "step": 3518 + }, + { + "epoch": 0.16775915905894692, + "grad_norm": 1.7258400917053223, + "learning_rate": 1.1881992298162593e-05, + "loss": 0.7968, + "step": 3519 + }, + { + "epoch": 0.1678068314542464, + "grad_norm": 1.5912007093429565, + "learning_rate": 1.1878134369714193e-05, + "loss": 0.8339, + "step": 3520 + }, + { + "epoch": 0.16785450384954592, + "grad_norm": 3.77643084526062, + "learning_rate": 1.1874276151489002e-05, + "loss": 1.0579, + "step": 3521 + }, + { + "epoch": 0.16790217624484544, + "grad_norm": 2.2483389377593994, + "learning_rate": 1.1870417644082304e-05, + "loss": 1.4027, + "step": 3522 + }, + { + "epoch": 0.16794984864014492, + "grad_norm": 3.3636205196380615, + "learning_rate": 1.1866558848089422e-05, + "loss": 1.4633, + "step": 3523 + }, + { + "epoch": 0.16799752103544444, + "grad_norm": 2.785327196121216, + "learning_rate": 1.1862699764105731e-05, + "loss": 1.1522, + "step": 3524 + }, + { + "epoch": 
0.16804519343074392, + "grad_norm": 1.596582293510437, + "learning_rate": 1.1858840392726652e-05, + "loss": 0.6806, + "step": 3525 + }, + { + "epoch": 0.16809286582604344, + "grad_norm": 1.497848391532898, + "learning_rate": 1.185498073454764e-05, + "loss": 0.8466, + "step": 3526 + }, + { + "epoch": 0.16814053822134292, + "grad_norm": 1.394997000694275, + "learning_rate": 1.1851120790164206e-05, + "loss": 0.5984, + "step": 3527 + }, + { + "epoch": 0.16818821061664244, + "grad_norm": 2.0831704139709473, + "learning_rate": 1.1847260560171895e-05, + "loss": 0.7524, + "step": 3528 + }, + { + "epoch": 0.16823588301194192, + "grad_norm": 1.3866229057312012, + "learning_rate": 1.1843400045166305e-05, + "loss": 0.6879, + "step": 3529 + }, + { + "epoch": 0.16828355540724144, + "grad_norm": 1.5415539741516113, + "learning_rate": 1.1839539245743066e-05, + "loss": 0.738, + "step": 3530 + }, + { + "epoch": 0.16833122780254095, + "grad_norm": 1.7128515243530273, + "learning_rate": 1.183567816249787e-05, + "loss": 0.8143, + "step": 3531 + }, + { + "epoch": 0.16837890019784044, + "grad_norm": 1.454851508140564, + "learning_rate": 1.1831816796026434e-05, + "loss": 0.7832, + "step": 3532 + }, + { + "epoch": 0.16842657259313995, + "grad_norm": 2.3555922508239746, + "learning_rate": 1.1827955146924532e-05, + "loss": 0.742, + "step": 3533 + }, + { + "epoch": 0.16847424498843944, + "grad_norm": 1.4841701984405518, + "learning_rate": 1.1824093215787977e-05, + "loss": 0.6504, + "step": 3534 + }, + { + "epoch": 0.16852191738373895, + "grad_norm": 1.503922462463379, + "learning_rate": 1.182023100321262e-05, + "loss": 0.7964, + "step": 3535 + }, + { + "epoch": 0.16856958977903844, + "grad_norm": 1.419915795326233, + "learning_rate": 1.1816368509794365e-05, + "loss": 0.4357, + "step": 3536 + }, + { + "epoch": 0.16861726217433795, + "grad_norm": 1.297147512435913, + "learning_rate": 1.1812505736129156e-05, + "loss": 0.8691, + "step": 3537 + }, + { + "epoch": 0.16866493456963746, + "grad_norm": 2.0768415927886963, + "learning_rate": 1.1808642682812973e-05, + "loss": 0.6545, + "step": 3538 + }, + { + "epoch": 0.16871260696493695, + "grad_norm": 1.605297327041626, + "learning_rate": 1.1804779350441852e-05, + "loss": 0.6993, + "step": 3539 + }, + { + "epoch": 0.16876027936023646, + "grad_norm": 1.3861682415008545, + "learning_rate": 1.1800915739611865e-05, + "loss": 0.9421, + "step": 3540 + }, + { + "epoch": 0.16880795175553595, + "grad_norm": 1.7097270488739014, + "learning_rate": 1.1797051850919123e-05, + "loss": 0.5509, + "step": 3541 + }, + { + "epoch": 0.16885562415083546, + "grad_norm": 1.540054440498352, + "learning_rate": 1.1793187684959786e-05, + "loss": 0.5836, + "step": 3542 + }, + { + "epoch": 0.16890329654613495, + "grad_norm": 1.5158042907714844, + "learning_rate": 1.1789323242330057e-05, + "loss": 0.8053, + "step": 3543 + }, + { + "epoch": 0.16895096894143447, + "grad_norm": 1.1195318698883057, + "learning_rate": 1.1785458523626177e-05, + "loss": 0.7415, + "step": 3544 + }, + { + "epoch": 0.16899864133673395, + "grad_norm": 1.5286706686019897, + "learning_rate": 1.1781593529444432e-05, + "loss": 0.7496, + "step": 3545 + }, + { + "epoch": 0.16904631373203347, + "grad_norm": 1.5500754117965698, + "learning_rate": 1.1777728260381154e-05, + "loss": 0.5173, + "step": 3546 + }, + { + "epoch": 0.16909398612733298, + "grad_norm": 2.987522840499878, + "learning_rate": 1.1773862717032711e-05, + "loss": 0.8847, + "step": 3547 + }, + { + "epoch": 0.16914165852263247, + "grad_norm": 1.3472909927368164, + 
"learning_rate": 1.176999689999552e-05, + "loss": 0.4554, + "step": 3548 + }, + { + "epoch": 0.16918933091793198, + "grad_norm": 3.5971341133117676, + "learning_rate": 1.1766130809866037e-05, + "loss": 0.9229, + "step": 3549 + }, + { + "epoch": 0.16923700331323147, + "grad_norm": 3.8235254287719727, + "learning_rate": 1.1762264447240753e-05, + "loss": 0.6302, + "step": 3550 + }, + { + "epoch": 0.16928467570853098, + "grad_norm": 1.795430064201355, + "learning_rate": 1.1758397812716216e-05, + "loss": 0.6269, + "step": 3551 + }, + { + "epoch": 0.16933234810383047, + "grad_norm": 1.0740364789962769, + "learning_rate": 1.1754530906889e-05, + "loss": 0.4432, + "step": 3552 + }, + { + "epoch": 0.16938002049912998, + "grad_norm": 1.1840626001358032, + "learning_rate": 1.1750663730355737e-05, + "loss": 0.6556, + "step": 3553 + }, + { + "epoch": 0.1694276928944295, + "grad_norm": 1.3491199016571045, + "learning_rate": 1.174679628371309e-05, + "loss": 0.6026, + "step": 3554 + }, + { + "epoch": 0.16947536528972898, + "grad_norm": 2.1667327880859375, + "learning_rate": 1.174292856755776e-05, + "loss": 1.043, + "step": 3555 + }, + { + "epoch": 0.1695230376850285, + "grad_norm": 1.4364588260650635, + "learning_rate": 1.1739060582486506e-05, + "loss": 0.63, + "step": 3556 + }, + { + "epoch": 0.16957071008032798, + "grad_norm": 1.307167887687683, + "learning_rate": 1.173519232909611e-05, + "loss": 0.7668, + "step": 3557 + }, + { + "epoch": 0.1696183824756275, + "grad_norm": 3.336822271347046, + "learning_rate": 1.1731323807983406e-05, + "loss": 1.0308, + "step": 3558 + }, + { + "epoch": 0.16966605487092698, + "grad_norm": 2.1688733100891113, + "learning_rate": 1.1727455019745269e-05, + "loss": 0.9717, + "step": 3559 + }, + { + "epoch": 0.1697137272662265, + "grad_norm": 2.113585948944092, + "learning_rate": 1.1723585964978612e-05, + "loss": 0.6677, + "step": 3560 + }, + { + "epoch": 0.16976139966152598, + "grad_norm": 2.2191150188446045, + "learning_rate": 1.1719716644280388e-05, + "loss": 0.6711, + "step": 3561 + }, + { + "epoch": 0.1698090720568255, + "grad_norm": 3.1125471591949463, + "learning_rate": 1.1715847058247599e-05, + "loss": 0.4932, + "step": 3562 + }, + { + "epoch": 0.169856744452125, + "grad_norm": 1.4719353914260864, + "learning_rate": 1.1711977207477276e-05, + "loss": 0.5139, + "step": 3563 + }, + { + "epoch": 0.1699044168474245, + "grad_norm": 1.181674838066101, + "learning_rate": 1.1708107092566501e-05, + "loss": 0.4889, + "step": 3564 + }, + { + "epoch": 0.169952089242724, + "grad_norm": 3.9713852405548096, + "learning_rate": 1.170423671411239e-05, + "loss": 0.959, + "step": 3565 + }, + { + "epoch": 0.1699997616380235, + "grad_norm": 1.5800055265426636, + "learning_rate": 1.1700366072712108e-05, + "loss": 0.3025, + "step": 3566 + }, + { + "epoch": 0.170047434033323, + "grad_norm": 3.0604093074798584, + "learning_rate": 1.1696495168962848e-05, + "loss": 0.8043, + "step": 3567 + }, + { + "epoch": 0.1700951064286225, + "grad_norm": 1.731905460357666, + "learning_rate": 1.1692624003461854e-05, + "loss": 0.6767, + "step": 3568 + }, + { + "epoch": 0.170142778823922, + "grad_norm": 1.6313363313674927, + "learning_rate": 1.168875257680641e-05, + "loss": 0.844, + "step": 3569 + }, + { + "epoch": 0.17019045121922152, + "grad_norm": 1.1193355321884155, + "learning_rate": 1.168488088959383e-05, + "loss": 0.6755, + "step": 3570 + }, + { + "epoch": 0.170238123614521, + "grad_norm": 1.2992329597473145, + "learning_rate": 1.1681008942421484e-05, + "loss": 0.7413, + "step": 3571 + }, + { + "epoch": 
0.17028579600982052, + "grad_norm": 2.0489566326141357, + "learning_rate": 1.1677136735886767e-05, + "loss": 0.6082, + "step": 3572 + }, + { + "epoch": 0.17033346840512, + "grad_norm": 1.8728572130203247, + "learning_rate": 1.1673264270587122e-05, + "loss": 0.6301, + "step": 3573 + }, + { + "epoch": 0.17038114080041952, + "grad_norm": 2.26562762260437, + "learning_rate": 1.1669391547120032e-05, + "loss": 1.0341, + "step": 3574 + }, + { + "epoch": 0.170428813195719, + "grad_norm": 1.8856061697006226, + "learning_rate": 1.1665518566083016e-05, + "loss": 0.9334, + "step": 3575 + }, + { + "epoch": 0.17047648559101852, + "grad_norm": 2.3725688457489014, + "learning_rate": 1.1661645328073641e-05, + "loss": 1.042, + "step": 3576 + }, + { + "epoch": 0.170524157986318, + "grad_norm": 3.2011992931365967, + "learning_rate": 1.16577718336895e-05, + "loss": 0.39, + "step": 3577 + }, + { + "epoch": 0.17057183038161752, + "grad_norm": 1.6124567985534668, + "learning_rate": 1.165389808352824e-05, + "loss": 0.6931, + "step": 3578 + }, + { + "epoch": 0.17061950277691704, + "grad_norm": 1.1953445672988892, + "learning_rate": 1.1650024078187534e-05, + "loss": 0.5153, + "step": 3579 + }, + { + "epoch": 0.17066717517221652, + "grad_norm": 1.2455637454986572, + "learning_rate": 1.1646149818265107e-05, + "loss": 0.7659, + "step": 3580 + }, + { + "epoch": 0.17071484756751604, + "grad_norm": 3.581510066986084, + "learning_rate": 1.1642275304358713e-05, + "loss": 0.3635, + "step": 3581 + }, + { + "epoch": 0.17076251996281552, + "grad_norm": 3.138016939163208, + "learning_rate": 1.1638400537066152e-05, + "loss": 0.2424, + "step": 3582 + }, + { + "epoch": 0.17081019235811504, + "grad_norm": 2.5070574283599854, + "learning_rate": 1.1634525516985264e-05, + "loss": 0.8633, + "step": 3583 + }, + { + "epoch": 0.17085786475341452, + "grad_norm": 3.120242118835449, + "learning_rate": 1.1630650244713917e-05, + "loss": 0.3051, + "step": 3584 + }, + { + "epoch": 0.17090553714871404, + "grad_norm": 1.7223788499832153, + "learning_rate": 1.1626774720850031e-05, + "loss": 0.6868, + "step": 3585 + }, + { + "epoch": 0.17095320954401355, + "grad_norm": 7.772683143615723, + "learning_rate": 1.1622898945991559e-05, + "loss": 0.5232, + "step": 3586 + }, + { + "epoch": 0.17100088193931304, + "grad_norm": 2.0280861854553223, + "learning_rate": 1.1619022920736491e-05, + "loss": 0.3724, + "step": 3587 + }, + { + "epoch": 0.17104855433461255, + "grad_norm": 0.9415162801742554, + "learning_rate": 1.161514664568286e-05, + "loss": 0.4759, + "step": 3588 + }, + { + "epoch": 0.17109622672991204, + "grad_norm": 1.3747600317001343, + "learning_rate": 1.1611270121428736e-05, + "loss": 0.6423, + "step": 3589 + }, + { + "epoch": 0.17114389912521155, + "grad_norm": 1.7101765871047974, + "learning_rate": 1.160739334857222e-05, + "loss": 0.7427, + "step": 3590 + }, + { + "epoch": 0.17119157152051104, + "grad_norm": 2.4855165481567383, + "learning_rate": 1.1603516327711466e-05, + "loss": 0.9986, + "step": 3591 + }, + { + "epoch": 0.17123924391581055, + "grad_norm": 1.3298126459121704, + "learning_rate": 1.1599639059444657e-05, + "loss": 0.7456, + "step": 3592 + }, + { + "epoch": 0.17128691631111004, + "grad_norm": 4.453553676605225, + "learning_rate": 1.1595761544370015e-05, + "loss": 0.6444, + "step": 3593 + }, + { + "epoch": 0.17133458870640955, + "grad_norm": 2.1800248622894287, + "learning_rate": 1.1591883783085799e-05, + "loss": 1.1236, + "step": 3594 + }, + { + "epoch": 0.17138226110170907, + "grad_norm": 6.065691947937012, + "learning_rate": 
1.1588005776190305e-05, + "loss": 0.571, + "step": 3595 + }, + { + "epoch": 0.17142993349700855, + "grad_norm": 1.7908965349197388, + "learning_rate": 1.1584127524281877e-05, + "loss": 1.0544, + "step": 3596 + }, + { + "epoch": 0.17147760589230807, + "grad_norm": 1.4564570188522339, + "learning_rate": 1.1580249027958883e-05, + "loss": 0.883, + "step": 3597 + }, + { + "epoch": 0.17152527828760755, + "grad_norm": 2.0037107467651367, + "learning_rate": 1.1576370287819737e-05, + "loss": 0.8547, + "step": 3598 + }, + { + "epoch": 0.17157295068290707, + "grad_norm": 1.92339026927948, + "learning_rate": 1.1572491304462891e-05, + "loss": 0.341, + "step": 3599 + }, + { + "epoch": 0.17162062307820655, + "grad_norm": 1.6973270177841187, + "learning_rate": 1.156861207848683e-05, + "loss": 0.5529, + "step": 3600 + }, + { + "epoch": 0.17166829547350607, + "grad_norm": 1.7145075798034668, + "learning_rate": 1.156473261049008e-05, + "loss": 0.6902, + "step": 3601 + }, + { + "epoch": 0.17171596786880558, + "grad_norm": 2.240851879119873, + "learning_rate": 1.15608529010712e-05, + "loss": 0.8563, + "step": 3602 + }, + { + "epoch": 0.17176364026410507, + "grad_norm": 2.0044214725494385, + "learning_rate": 1.1556972950828791e-05, + "loss": 0.7618, + "step": 3603 + }, + { + "epoch": 0.17181131265940458, + "grad_norm": 2.3290843963623047, + "learning_rate": 1.1553092760361488e-05, + "loss": 1.208, + "step": 3604 + }, + { + "epoch": 0.17185898505470407, + "grad_norm": 1.4555487632751465, + "learning_rate": 1.1549212330267969e-05, + "loss": 0.2519, + "step": 3605 + }, + { + "epoch": 0.17190665745000358, + "grad_norm": 1.7847747802734375, + "learning_rate": 1.1545331661146941e-05, + "loss": 0.6814, + "step": 3606 + }, + { + "epoch": 0.17195432984530307, + "grad_norm": 2.0160207748413086, + "learning_rate": 1.1541450753597147e-05, + "loss": 0.7952, + "step": 3607 + }, + { + "epoch": 0.17200200224060258, + "grad_norm": 1.2327224016189575, + "learning_rate": 1.1537569608217381e-05, + "loss": 0.3049, + "step": 3608 + }, + { + "epoch": 0.1720496746359021, + "grad_norm": 1.5503227710723877, + "learning_rate": 1.1533688225606458e-05, + "loss": 0.6507, + "step": 3609 + }, + { + "epoch": 0.17209734703120158, + "grad_norm": 1.5489264726638794, + "learning_rate": 1.1529806606363234e-05, + "loss": 1.0682, + "step": 3610 + }, + { + "epoch": 0.1721450194265011, + "grad_norm": 1.8247203826904297, + "learning_rate": 1.1525924751086603e-05, + "loss": 0.5006, + "step": 3611 + }, + { + "epoch": 0.17219269182180058, + "grad_norm": 1.4162834882736206, + "learning_rate": 1.15220426603755e-05, + "loss": 0.78, + "step": 3612 + }, + { + "epoch": 0.1722403642171001, + "grad_norm": 1.4422036409378052, + "learning_rate": 1.1518160334828885e-05, + "loss": 0.7021, + "step": 3613 + }, + { + "epoch": 0.17228803661239958, + "grad_norm": 1.94001305103302, + "learning_rate": 1.1514277775045768e-05, + "loss": 0.893, + "step": 3614 + }, + { + "epoch": 0.1723357090076991, + "grad_norm": 1.0234631299972534, + "learning_rate": 1.1510394981625184e-05, + "loss": 0.5148, + "step": 3615 + }, + { + "epoch": 0.17238338140299858, + "grad_norm": 3.0914082527160645, + "learning_rate": 1.1506511955166206e-05, + "loss": 1.4842, + "step": 3616 + }, + { + "epoch": 0.1724310537982981, + "grad_norm": 2.994213104248047, + "learning_rate": 1.150262869626795e-05, + "loss": 1.0629, + "step": 3617 + }, + { + "epoch": 0.1724787261935976, + "grad_norm": 2.799489974975586, + "learning_rate": 1.1498745205529558e-05, + "loss": 0.4918, + "step": 3618 + }, + { + "epoch": 
0.1725263985888971, + "grad_norm": 1.6536370515823364, + "learning_rate": 1.1494861483550216e-05, + "loss": 1.0997, + "step": 3619 + }, + { + "epoch": 0.1725740709841966, + "grad_norm": 2.3614556789398193, + "learning_rate": 1.1490977530929141e-05, + "loss": 0.9053, + "step": 3620 + }, + { + "epoch": 0.1726217433794961, + "grad_norm": 1.4384511709213257, + "learning_rate": 1.1487093348265585e-05, + "loss": 0.8032, + "step": 3621 + }, + { + "epoch": 0.1726694157747956, + "grad_norm": 1.4297435283660889, + "learning_rate": 1.1483208936158842e-05, + "loss": 0.6918, + "step": 3622 + }, + { + "epoch": 0.1727170881700951, + "grad_norm": 2.1808807849884033, + "learning_rate": 1.1479324295208234e-05, + "loss": 1.302, + "step": 3623 + }, + { + "epoch": 0.1727647605653946, + "grad_norm": 2.944615602493286, + "learning_rate": 1.1475439426013122e-05, + "loss": 1.0663, + "step": 3624 + }, + { + "epoch": 0.17281243296069412, + "grad_norm": 0.9485723972320557, + "learning_rate": 1.14715543291729e-05, + "loss": 0.2848, + "step": 3625 + }, + { + "epoch": 0.1728601053559936, + "grad_norm": 1.3904765844345093, + "learning_rate": 1.1467669005286999e-05, + "loss": 0.6437, + "step": 3626 + }, + { + "epoch": 0.17290777775129312, + "grad_norm": 0.9204992055892944, + "learning_rate": 1.1463783454954883e-05, + "loss": 0.1347, + "step": 3627 + }, + { + "epoch": 0.1729554501465926, + "grad_norm": 2.3575785160064697, + "learning_rate": 1.1459897678776055e-05, + "loss": 1.3396, + "step": 3628 + }, + { + "epoch": 0.17300312254189212, + "grad_norm": 1.2435863018035889, + "learning_rate": 1.1456011677350052e-05, + "loss": 0.6202, + "step": 3629 + }, + { + "epoch": 0.1730507949371916, + "grad_norm": 1.9176143407821655, + "learning_rate": 1.1452125451276435e-05, + "loss": 0.8125, + "step": 3630 + }, + { + "epoch": 0.17309846733249112, + "grad_norm": 1.6232742071151733, + "learning_rate": 1.1448239001154821e-05, + "loss": 0.7718, + "step": 3631 + }, + { + "epoch": 0.1731461397277906, + "grad_norm": 2.277369260787964, + "learning_rate": 1.144435232758484e-05, + "loss": 1.2965, + "step": 3632 + }, + { + "epoch": 0.17319381212309012, + "grad_norm": 1.7735130786895752, + "learning_rate": 1.144046543116617e-05, + "loss": 0.6857, + "step": 3633 + }, + { + "epoch": 0.17324148451838964, + "grad_norm": 1.8041505813598633, + "learning_rate": 1.1436578312498518e-05, + "loss": 0.7382, + "step": 3634 + }, + { + "epoch": 0.17328915691368912, + "grad_norm": 1.228304147720337, + "learning_rate": 1.1432690972181624e-05, + "loss": 0.7391, + "step": 3635 + }, + { + "epoch": 0.17333682930898864, + "grad_norm": 1.7657897472381592, + "learning_rate": 1.1428803410815268e-05, + "loss": 0.9357, + "step": 3636 + }, + { + "epoch": 0.17338450170428812, + "grad_norm": 1.4979184865951538, + "learning_rate": 1.1424915628999261e-05, + "loss": 0.6252, + "step": 3637 + }, + { + "epoch": 0.17343217409958764, + "grad_norm": 1.5883418321609497, + "learning_rate": 1.1421027627333445e-05, + "loss": 0.4806, + "step": 3638 + }, + { + "epoch": 0.17347984649488712, + "grad_norm": 1.2073731422424316, + "learning_rate": 1.14171394064177e-05, + "loss": 0.7807, + "step": 3639 + }, + { + "epoch": 0.17352751889018664, + "grad_norm": 5.799956321716309, + "learning_rate": 1.141325096685194e-05, + "loss": 0.8361, + "step": 3640 + }, + { + "epoch": 0.17357519128548615, + "grad_norm": 1.3248827457427979, + "learning_rate": 1.1409362309236107e-05, + "loss": 0.5907, + "step": 3641 + }, + { + "epoch": 0.17362286368078564, + "grad_norm": 1.3242886066436768, + "learning_rate": 
1.1405473434170185e-05, + "loss": 0.605, + "step": 3642 + }, + { + "epoch": 0.17367053607608515, + "grad_norm": 1.3338148593902588, + "learning_rate": 1.1401584342254183e-05, + "loss": 0.689, + "step": 3643 + }, + { + "epoch": 0.17371820847138464, + "grad_norm": 1.0636639595031738, + "learning_rate": 1.1397695034088152e-05, + "loss": 0.5682, + "step": 3644 + }, + { + "epoch": 0.17376588086668415, + "grad_norm": 1.3289719820022583, + "learning_rate": 1.1393805510272171e-05, + "loss": 0.8583, + "step": 3645 + }, + { + "epoch": 0.17381355326198364, + "grad_norm": 1.484549880027771, + "learning_rate": 1.1389915771406354e-05, + "loss": 0.8687, + "step": 3646 + }, + { + "epoch": 0.17386122565728315, + "grad_norm": 1.180335283279419, + "learning_rate": 1.1386025818090847e-05, + "loss": 0.6134, + "step": 3647 + }, + { + "epoch": 0.17390889805258264, + "grad_norm": 2.064446449279785, + "learning_rate": 1.138213565092583e-05, + "loss": 0.8299, + "step": 3648 + }, + { + "epoch": 0.17395657044788215, + "grad_norm": 1.5924581289291382, + "learning_rate": 1.1378245270511512e-05, + "loss": 0.4035, + "step": 3649 + }, + { + "epoch": 0.17400424284318167, + "grad_norm": 4.692134380340576, + "learning_rate": 1.1374354677448145e-05, + "loss": 0.8619, + "step": 3650 + }, + { + "epoch": 0.17405191523848115, + "grad_norm": 1.5653624534606934, + "learning_rate": 1.1370463872336004e-05, + "loss": 0.9286, + "step": 3651 + }, + { + "epoch": 0.17409958763378067, + "grad_norm": 0.9868603944778442, + "learning_rate": 1.1366572855775397e-05, + "loss": 0.4138, + "step": 3652 + }, + { + "epoch": 0.17414726002908015, + "grad_norm": 1.799821138381958, + "learning_rate": 1.1362681628366676e-05, + "loss": 0.6377, + "step": 3653 + }, + { + "epoch": 0.17419493242437967, + "grad_norm": 2.0849995613098145, + "learning_rate": 1.1358790190710213e-05, + "loss": 0.856, + "step": 3654 + }, + { + "epoch": 0.17424260481967915, + "grad_norm": 2.911855459213257, + "learning_rate": 1.1354898543406411e-05, + "loss": 0.4562, + "step": 3655 + }, + { + "epoch": 0.17429027721497867, + "grad_norm": 1.458886742591858, + "learning_rate": 1.1351006687055722e-05, + "loss": 0.8462, + "step": 3656 + }, + { + "epoch": 0.17433794961027818, + "grad_norm": 2.784414052963257, + "learning_rate": 1.1347114622258613e-05, + "loss": 0.9097, + "step": 3657 + }, + { + "epoch": 0.17438562200557767, + "grad_norm": 1.0362164974212646, + "learning_rate": 1.1343222349615585e-05, + "loss": 0.6398, + "step": 3658 + }, + { + "epoch": 0.17443329440087718, + "grad_norm": 2.4401583671569824, + "learning_rate": 1.1339329869727187e-05, + "loss": 1.0989, + "step": 3659 + }, + { + "epoch": 0.17448096679617667, + "grad_norm": 1.9631332159042358, + "learning_rate": 1.133543718319398e-05, + "loss": 0.3957, + "step": 3660 + }, + { + "epoch": 0.17452863919147618, + "grad_norm": 1.39589524269104, + "learning_rate": 1.1331544290616569e-05, + "loss": 0.7613, + "step": 3661 + }, + { + "epoch": 0.17457631158677567, + "grad_norm": 2.2678937911987305, + "learning_rate": 1.1327651192595588e-05, + "loss": 0.8191, + "step": 3662 + }, + { + "epoch": 0.17462398398207518, + "grad_norm": 1.3737080097198486, + "learning_rate": 1.1323757889731697e-05, + "loss": 0.8491, + "step": 3663 + }, + { + "epoch": 0.17467165637737467, + "grad_norm": 2.4052364826202393, + "learning_rate": 1.1319864382625595e-05, + "loss": 1.0834, + "step": 3664 + }, + { + "epoch": 0.17471932877267418, + "grad_norm": 3.4208598136901855, + "learning_rate": 1.1315970671878014e-05, + "loss": 0.6112, + "step": 3665 + }, + { + 
"epoch": 0.1747670011679737, + "grad_norm": 1.5483438968658447, + "learning_rate": 1.1312076758089708e-05, + "loss": 0.9585, + "step": 3666 + }, + { + "epoch": 0.17481467356327318, + "grad_norm": 1.1554005146026611, + "learning_rate": 1.130818264186147e-05, + "loss": 0.7252, + "step": 3667 + }, + { + "epoch": 0.1748623459585727, + "grad_norm": 1.7683225870132446, + "learning_rate": 1.1304288323794121e-05, + "loss": 0.91, + "step": 3668 + }, + { + "epoch": 0.17491001835387218, + "grad_norm": 1.5654011964797974, + "learning_rate": 1.1300393804488519e-05, + "loss": 0.64, + "step": 3669 + }, + { + "epoch": 0.1749576907491717, + "grad_norm": 1.5680348873138428, + "learning_rate": 1.1296499084545543e-05, + "loss": 1.2859, + "step": 3670 + }, + { + "epoch": 0.17500536314447118, + "grad_norm": 2.0240871906280518, + "learning_rate": 1.1292604164566108e-05, + "loss": 1.0842, + "step": 3671 + }, + { + "epoch": 0.1750530355397707, + "grad_norm": 2.9063687324523926, + "learning_rate": 1.1288709045151161e-05, + "loss": 1.3241, + "step": 3672 + }, + { + "epoch": 0.1751007079350702, + "grad_norm": 2.7663965225219727, + "learning_rate": 1.128481372690168e-05, + "loss": 1.2435, + "step": 3673 + }, + { + "epoch": 0.1751483803303697, + "grad_norm": 1.1768468618392944, + "learning_rate": 1.1280918210418674e-05, + "loss": 0.7549, + "step": 3674 + }, + { + "epoch": 0.1751960527256692, + "grad_norm": 1.5275644063949585, + "learning_rate": 1.1277022496303178e-05, + "loss": 0.5681, + "step": 3675 + }, + { + "epoch": 0.1752437251209687, + "grad_norm": 1.4623888731002808, + "learning_rate": 1.1273126585156262e-05, + "loss": 0.7765, + "step": 3676 + }, + { + "epoch": 0.1752913975162682, + "grad_norm": 25.827421188354492, + "learning_rate": 1.1269230477579025e-05, + "loss": 1.0914, + "step": 3677 + }, + { + "epoch": 0.1753390699115677, + "grad_norm": 1.271023154258728, + "learning_rate": 1.1265334174172593e-05, + "loss": 0.6713, + "step": 3678 + }, + { + "epoch": 0.1753867423068672, + "grad_norm": 1.9592735767364502, + "learning_rate": 1.1261437675538132e-05, + "loss": 0.3998, + "step": 3679 + }, + { + "epoch": 0.1754344147021667, + "grad_norm": 1.777327299118042, + "learning_rate": 1.1257540982276827e-05, + "loss": 0.764, + "step": 3680 + }, + { + "epoch": 0.1754820870974662, + "grad_norm": 1.659556269645691, + "learning_rate": 1.1253644094989895e-05, + "loss": 0.4603, + "step": 3681 + }, + { + "epoch": 0.17552975949276572, + "grad_norm": 3.6654441356658936, + "learning_rate": 1.1249747014278594e-05, + "loss": 0.6015, + "step": 3682 + }, + { + "epoch": 0.1755774318880652, + "grad_norm": 1.181654453277588, + "learning_rate": 1.1245849740744198e-05, + "loss": 0.4984, + "step": 3683 + }, + { + "epoch": 0.17562510428336472, + "grad_norm": 1.0265041589736938, + "learning_rate": 1.1241952274988015e-05, + "loss": 0.709, + "step": 3684 + }, + { + "epoch": 0.1756727766786642, + "grad_norm": 1.5340919494628906, + "learning_rate": 1.1238054617611384e-05, + "loss": 0.8452, + "step": 3685 + }, + { + "epoch": 0.17572044907396372, + "grad_norm": 1.113054871559143, + "learning_rate": 1.1234156769215678e-05, + "loss": 0.5516, + "step": 3686 + }, + { + "epoch": 0.1757681214692632, + "grad_norm": 2.633444309234619, + "learning_rate": 1.123025873040229e-05, + "loss": 0.9207, + "step": 3687 + }, + { + "epoch": 0.17581579386456272, + "grad_norm": 1.3964942693710327, + "learning_rate": 1.122636050177265e-05, + "loss": 0.7796, + "step": 3688 + }, + { + "epoch": 0.17586346625986224, + "grad_norm": 1.6941229104995728, + "learning_rate": 
1.1222462083928215e-05, + "loss": 1.0525, + "step": 3689 + }, + { + "epoch": 0.17591113865516173, + "grad_norm": 1.7043280601501465, + "learning_rate": 1.1218563477470465e-05, + "loss": 0.7201, + "step": 3690 + }, + { + "epoch": 0.17595881105046124, + "grad_norm": 1.5716497898101807, + "learning_rate": 1.1214664683000927e-05, + "loss": 0.5047, + "step": 3691 + }, + { + "epoch": 0.17600648344576073, + "grad_norm": 1.1442688703536987, + "learning_rate": 1.121076570112113e-05, + "loss": 0.5966, + "step": 3692 + }, + { + "epoch": 0.17605415584106024, + "grad_norm": 2.763701915740967, + "learning_rate": 1.1206866532432657e-05, + "loss": 0.6655, + "step": 3693 + }, + { + "epoch": 0.17610182823635973, + "grad_norm": 2.9142439365386963, + "learning_rate": 1.1202967177537105e-05, + "loss": 0.5674, + "step": 3694 + }, + { + "epoch": 0.17614950063165924, + "grad_norm": 7.891298294067383, + "learning_rate": 1.1199067637036106e-05, + "loss": 0.8924, + "step": 3695 + }, + { + "epoch": 0.17619717302695875, + "grad_norm": 1.8380059003829956, + "learning_rate": 1.1195167911531317e-05, + "loss": 0.8322, + "step": 3696 + }, + { + "epoch": 0.17624484542225824, + "grad_norm": 1.6006748676300049, + "learning_rate": 1.1191268001624431e-05, + "loss": 0.7625, + "step": 3697 + }, + { + "epoch": 0.17629251781755775, + "grad_norm": 1.419047474861145, + "learning_rate": 1.1187367907917158e-05, + "loss": 0.4179, + "step": 3698 + }, + { + "epoch": 0.17634019021285724, + "grad_norm": 1.0092003345489502, + "learning_rate": 1.1183467631011245e-05, + "loss": 0.356, + "step": 3699 + }, + { + "epoch": 0.17638786260815675, + "grad_norm": 1.2002215385437012, + "learning_rate": 1.1179567171508463e-05, + "loss": 0.6919, + "step": 3700 + }, + { + "epoch": 0.17643553500345624, + "grad_norm": 2.0352838039398193, + "learning_rate": 1.1175666530010612e-05, + "loss": 0.8195, + "step": 3701 + }, + { + "epoch": 0.17648320739875575, + "grad_norm": 5.819756507873535, + "learning_rate": 1.1171765707119525e-05, + "loss": 0.5558, + "step": 3702 + }, + { + "epoch": 0.17653087979405524, + "grad_norm": 1.529212474822998, + "learning_rate": 1.1167864703437054e-05, + "loss": 0.8523, + "step": 3703 + }, + { + "epoch": 0.17657855218935475, + "grad_norm": 4.227266311645508, + "learning_rate": 1.1163963519565086e-05, + "loss": 0.6324, + "step": 3704 + }, + { + "epoch": 0.17662622458465427, + "grad_norm": 1.5722299814224243, + "learning_rate": 1.1160062156105536e-05, + "loss": 0.7751, + "step": 3705 + }, + { + "epoch": 0.17667389697995375, + "grad_norm": 1.7496352195739746, + "learning_rate": 1.1156160613660341e-05, + "loss": 0.7878, + "step": 3706 + }, + { + "epoch": 0.17672156937525327, + "grad_norm": 3.3151917457580566, + "learning_rate": 1.1152258892831468e-05, + "loss": 1.2712, + "step": 3707 + }, + { + "epoch": 0.17676924177055275, + "grad_norm": 2.4798314571380615, + "learning_rate": 1.1148356994220917e-05, + "loss": 0.7813, + "step": 3708 + }, + { + "epoch": 0.17681691416585227, + "grad_norm": 3.006855010986328, + "learning_rate": 1.1144454918430703e-05, + "loss": 0.6662, + "step": 3709 + }, + { + "epoch": 0.17686458656115175, + "grad_norm": 1.4573267698287964, + "learning_rate": 1.1140552666062883e-05, + "loss": 0.8147, + "step": 3710 + }, + { + "epoch": 0.17691225895645127, + "grad_norm": 2.5887489318847656, + "learning_rate": 1.1136650237719534e-05, + "loss": 0.8309, + "step": 3711 + }, + { + "epoch": 0.17695993135175078, + "grad_norm": 2.1036903858184814, + "learning_rate": 1.1132747634002754e-05, + "loss": 0.81, + "step": 3712 + }, + { + 
"epoch": 0.17700760374705027, + "grad_norm": 1.2894703149795532, + "learning_rate": 1.1128844855514684e-05, + "loss": 0.5711, + "step": 3713 + }, + { + "epoch": 0.17705527614234978, + "grad_norm": 2.5630784034729004, + "learning_rate": 1.1124941902857475e-05, + "loss": 0.2478, + "step": 3714 + }, + { + "epoch": 0.17710294853764927, + "grad_norm": 4.644442558288574, + "learning_rate": 1.1121038776633315e-05, + "loss": 0.3775, + "step": 3715 + }, + { + "epoch": 0.17715062093294878, + "grad_norm": 1.6223654747009277, + "learning_rate": 1.1117135477444417e-05, + "loss": 0.7915, + "step": 3716 + }, + { + "epoch": 0.17719829332824827, + "grad_norm": 11.437932014465332, + "learning_rate": 1.111323200589302e-05, + "loss": 0.3528, + "step": 3717 + }, + { + "epoch": 0.17724596572354778, + "grad_norm": 1.5484358072280884, + "learning_rate": 1.1109328362581385e-05, + "loss": 1.1167, + "step": 3718 + }, + { + "epoch": 0.17729363811884727, + "grad_norm": 2.085814952850342, + "learning_rate": 1.110542454811181e-05, + "loss": 0.752, + "step": 3719 + }, + { + "epoch": 0.17734131051414678, + "grad_norm": 2.661550283432007, + "learning_rate": 1.1101520563086612e-05, + "loss": 0.748, + "step": 3720 + }, + { + "epoch": 0.1773889829094463, + "grad_norm": 1.7953928709030151, + "learning_rate": 1.1097616408108134e-05, + "loss": 0.8813, + "step": 3721 + }, + { + "epoch": 0.17743665530474578, + "grad_norm": 1.3717807531356812, + "learning_rate": 1.1093712083778748e-05, + "loss": 0.6016, + "step": 3722 + }, + { + "epoch": 0.1774843277000453, + "grad_norm": 1.6871592998504639, + "learning_rate": 1.1089807590700848e-05, + "loss": 0.8489, + "step": 3723 + }, + { + "epoch": 0.17753200009534478, + "grad_norm": 1.757038950920105, + "learning_rate": 1.108590292947686e-05, + "loss": 0.7763, + "step": 3724 + }, + { + "epoch": 0.1775796724906443, + "grad_norm": 1.2920114994049072, + "learning_rate": 1.1081998100709232e-05, + "loss": 0.6205, + "step": 3725 + }, + { + "epoch": 0.17762734488594378, + "grad_norm": 1.526204228401184, + "learning_rate": 1.1078093105000441e-05, + "loss": 0.3761, + "step": 3726 + }, + { + "epoch": 0.1776750172812433, + "grad_norm": 1.0621832609176636, + "learning_rate": 1.1074187942952985e-05, + "loss": 0.6272, + "step": 3727 + }, + { + "epoch": 0.1777226896765428, + "grad_norm": 1.669875144958496, + "learning_rate": 1.1070282615169395e-05, + "loss": 0.6544, + "step": 3728 + }, + { + "epoch": 0.1777703620718423, + "grad_norm": 3.1241908073425293, + "learning_rate": 1.1066377122252216e-05, + "loss": 0.9816, + "step": 3729 + }, + { + "epoch": 0.1778180344671418, + "grad_norm": 1.6103582382202148, + "learning_rate": 1.106247146480403e-05, + "loss": 0.6744, + "step": 3730 + }, + { + "epoch": 0.1778657068624413, + "grad_norm": 1.1220877170562744, + "learning_rate": 1.1058565643427439e-05, + "loss": 0.716, + "step": 3731 + }, + { + "epoch": 0.1779133792577408, + "grad_norm": 2.1707518100738525, + "learning_rate": 1.1054659658725067e-05, + "loss": 0.6998, + "step": 3732 + }, + { + "epoch": 0.1779610516530403, + "grad_norm": 1.7029175758361816, + "learning_rate": 1.1050753511299572e-05, + "loss": 0.7615, + "step": 3733 + }, + { + "epoch": 0.1780087240483398, + "grad_norm": 2.493659496307373, + "learning_rate": 1.1046847201753632e-05, + "loss": 0.6215, + "step": 3734 + }, + { + "epoch": 0.1780563964436393, + "grad_norm": 2.4561851024627686, + "learning_rate": 1.104294073068995e-05, + "loss": 0.9062, + "step": 3735 + }, + { + "epoch": 0.1781040688389388, + "grad_norm": 1.631298303604126, + "learning_rate": 
1.1039034098711251e-05, + "loss": 0.367, + "step": 3736 + }, + { + "epoch": 0.17815174123423833, + "grad_norm": 1.754578709602356, + "learning_rate": 1.1035127306420295e-05, + "loss": 0.842, + "step": 3737 + }, + { + "epoch": 0.1781994136295378, + "grad_norm": 1.5287071466445923, + "learning_rate": 1.1031220354419849e-05, + "loss": 0.5886, + "step": 3738 + }, + { + "epoch": 0.17824708602483733, + "grad_norm": 2.307190179824829, + "learning_rate": 1.1027313243312726e-05, + "loss": 0.5273, + "step": 3739 + }, + { + "epoch": 0.1782947584201368, + "grad_norm": 1.314386248588562, + "learning_rate": 1.1023405973701746e-05, + "loss": 0.4677, + "step": 3740 + }, + { + "epoch": 0.17834243081543633, + "grad_norm": 1.8360697031021118, + "learning_rate": 1.1019498546189765e-05, + "loss": 0.533, + "step": 3741 + }, + { + "epoch": 0.1783901032107358, + "grad_norm": 2.1292521953582764, + "learning_rate": 1.1015590961379657e-05, + "loss": 0.8392, + "step": 3742 + }, + { + "epoch": 0.17843777560603533, + "grad_norm": 3.5823183059692383, + "learning_rate": 1.1011683219874324e-05, + "loss": 1.535, + "step": 3743 + }, + { + "epoch": 0.17848544800133484, + "grad_norm": 2.161022424697876, + "learning_rate": 1.1007775322276687e-05, + "loss": 0.8815, + "step": 3744 + }, + { + "epoch": 0.17853312039663433, + "grad_norm": 1.5481677055358887, + "learning_rate": 1.1003867269189696e-05, + "loss": 0.5935, + "step": 3745 + }, + { + "epoch": 0.17858079279193384, + "grad_norm": 1.7577028274536133, + "learning_rate": 1.099995906121632e-05, + "loss": 0.4881, + "step": 3746 + }, + { + "epoch": 0.17862846518723333, + "grad_norm": 1.5946173667907715, + "learning_rate": 1.0996050698959561e-05, + "loss": 0.5955, + "step": 3747 + }, + { + "epoch": 0.17867613758253284, + "grad_norm": 2.6982274055480957, + "learning_rate": 1.0992142183022438e-05, + "loss": 0.867, + "step": 3748 + }, + { + "epoch": 0.17872380997783233, + "grad_norm": 2.225858449935913, + "learning_rate": 1.0988233514007991e-05, + "loss": 0.9166, + "step": 3749 + }, + { + "epoch": 0.17877148237313184, + "grad_norm": 1.9222170114517212, + "learning_rate": 1.0984324692519292e-05, + "loss": 0.811, + "step": 3750 + }, + { + "epoch": 0.17881915476843133, + "grad_norm": 1.1785584688186646, + "learning_rate": 1.098041571915943e-05, + "loss": 0.709, + "step": 3751 + }, + { + "epoch": 0.17886682716373084, + "grad_norm": 1.0912861824035645, + "learning_rate": 1.0976506594531515e-05, + "loss": 0.4086, + "step": 3752 + }, + { + "epoch": 0.17891449955903035, + "grad_norm": 1.475943922996521, + "learning_rate": 1.0972597319238692e-05, + "loss": 0.5044, + "step": 3753 + }, + { + "epoch": 0.17896217195432984, + "grad_norm": 1.7712129354476929, + "learning_rate": 1.0968687893884118e-05, + "loss": 0.7004, + "step": 3754 + }, + { + "epoch": 0.17900984434962935, + "grad_norm": 1.078445553779602, + "learning_rate": 1.0964778319070974e-05, + "loss": 0.1896, + "step": 3755 + }, + { + "epoch": 0.17905751674492884, + "grad_norm": 1.7492010593414307, + "learning_rate": 1.0960868595402474e-05, + "loss": 0.8861, + "step": 3756 + }, + { + "epoch": 0.17910518914022835, + "grad_norm": 6.3724517822265625, + "learning_rate": 1.0956958723481845e-05, + "loss": 0.74, + "step": 3757 + }, + { + "epoch": 0.17915286153552784, + "grad_norm": 3.0286405086517334, + "learning_rate": 1.095304870391234e-05, + "loss": 0.7326, + "step": 3758 + }, + { + "epoch": 0.17920053393082735, + "grad_norm": 1.0813490152359009, + "learning_rate": 1.0949138537297233e-05, + "loss": 0.5341, + "step": 3759 + }, + { + "epoch": 
0.17924820632612687, + "grad_norm": 2.031029462814331, + "learning_rate": 1.0945228224239823e-05, + "loss": 0.8732, + "step": 3760 + }, + { + "epoch": 0.17929587872142635, + "grad_norm": 1.719015121459961, + "learning_rate": 1.0941317765343433e-05, + "loss": 0.9455, + "step": 3761 + }, + { + "epoch": 0.17934355111672587, + "grad_norm": 1.0207024812698364, + "learning_rate": 1.0937407161211406e-05, + "loss": 0.5429, + "step": 3762 + }, + { + "epoch": 0.17939122351202536, + "grad_norm": 1.4251781702041626, + "learning_rate": 1.0933496412447105e-05, + "loss": 0.6332, + "step": 3763 + }, + { + "epoch": 0.17943889590732487, + "grad_norm": 1.6352126598358154, + "learning_rate": 1.0929585519653924e-05, + "loss": 0.8364, + "step": 3764 + }, + { + "epoch": 0.17948656830262436, + "grad_norm": 1.2944365739822388, + "learning_rate": 1.092567448343527e-05, + "loss": 0.5629, + "step": 3765 + }, + { + "epoch": 0.17953424069792387, + "grad_norm": 1.97201669216156, + "learning_rate": 1.0921763304394574e-05, + "loss": 0.8375, + "step": 3766 + }, + { + "epoch": 0.17958191309322336, + "grad_norm": 2.4572038650512695, + "learning_rate": 1.0917851983135294e-05, + "loss": 0.8321, + "step": 3767 + }, + { + "epoch": 0.17962958548852287, + "grad_norm": 1.7717958688735962, + "learning_rate": 1.0913940520260906e-05, + "loss": 0.4658, + "step": 3768 + }, + { + "epoch": 0.17967725788382238, + "grad_norm": 31.82832145690918, + "learning_rate": 1.0910028916374904e-05, + "loss": 0.8127, + "step": 3769 + }, + { + "epoch": 0.17972493027912187, + "grad_norm": 2.1751232147216797, + "learning_rate": 1.0906117172080812e-05, + "loss": 0.7162, + "step": 3770 + }, + { + "epoch": 0.17977260267442138, + "grad_norm": 1.5871031284332275, + "learning_rate": 1.0902205287982175e-05, + "loss": 0.6128, + "step": 3771 + }, + { + "epoch": 0.17982027506972087, + "grad_norm": 1.3700811862945557, + "learning_rate": 1.0898293264682549e-05, + "loss": 0.823, + "step": 3772 + }, + { + "epoch": 0.17986794746502038, + "grad_norm": 1.4328159093856812, + "learning_rate": 1.0894381102785527e-05, + "loss": 0.5723, + "step": 3773 + }, + { + "epoch": 0.17991561986031987, + "grad_norm": 1.3425663709640503, + "learning_rate": 1.0890468802894712e-05, + "loss": 0.7518, + "step": 3774 + }, + { + "epoch": 0.17996329225561938, + "grad_norm": 1.1192046403884888, + "learning_rate": 1.0886556365613725e-05, + "loss": 0.7811, + "step": 3775 + }, + { + "epoch": 0.1800109646509189, + "grad_norm": 1.260384202003479, + "learning_rate": 1.0882643791546224e-05, + "loss": 0.6662, + "step": 3776 + }, + { + "epoch": 0.18005863704621838, + "grad_norm": 1.7328011989593506, + "learning_rate": 1.0878731081295874e-05, + "loss": 0.9567, + "step": 3777 + }, + { + "epoch": 0.1801063094415179, + "grad_norm": 1.9984331130981445, + "learning_rate": 1.0874818235466366e-05, + "loss": 0.7515, + "step": 3778 + }, + { + "epoch": 0.18015398183681738, + "grad_norm": 1.9605036973953247, + "learning_rate": 1.0870905254661418e-05, + "loss": 0.4301, + "step": 3779 + }, + { + "epoch": 0.1802016542321169, + "grad_norm": 1.9046077728271484, + "learning_rate": 1.0866992139484755e-05, + "loss": 0.9394, + "step": 3780 + }, + { + "epoch": 0.18024932662741638, + "grad_norm": 2.5434470176696777, + "learning_rate": 1.0863078890540133e-05, + "loss": 0.8089, + "step": 3781 + }, + { + "epoch": 0.1802969990227159, + "grad_norm": 1.6785893440246582, + "learning_rate": 1.0859165508431329e-05, + "loss": 1.0743, + "step": 3782 + }, + { + "epoch": 0.1803446714180154, + "grad_norm": 1.5858412981033325, + 
"learning_rate": 1.085525199376213e-05, + "loss": 0.6425, + "step": 3783 + }, + { + "epoch": 0.1803923438133149, + "grad_norm": 1.2266967296600342, + "learning_rate": 1.0851338347136358e-05, + "loss": 0.7408, + "step": 3784 + }, + { + "epoch": 0.1804400162086144, + "grad_norm": 3.621140241622925, + "learning_rate": 1.0847424569157847e-05, + "loss": 0.4324, + "step": 3785 + }, + { + "epoch": 0.1804876886039139, + "grad_norm": 3.730668544769287, + "learning_rate": 1.0843510660430447e-05, + "loss": 1.2772, + "step": 3786 + }, + { + "epoch": 0.1805353609992134, + "grad_norm": 3.8443429470062256, + "learning_rate": 1.0839596621558045e-05, + "loss": 0.6629, + "step": 3787 + }, + { + "epoch": 0.1805830333945129, + "grad_norm": 3.6548566818237305, + "learning_rate": 1.0835682453144527e-05, + "loss": 0.8204, + "step": 3788 + }, + { + "epoch": 0.1806307057898124, + "grad_norm": 2.259329080581665, + "learning_rate": 1.0831768155793814e-05, + "loss": 1.1254, + "step": 3789 + }, + { + "epoch": 0.1806783781851119, + "grad_norm": 1.7217439413070679, + "learning_rate": 1.082785373010984e-05, + "loss": 0.7948, + "step": 3790 + }, + { + "epoch": 0.1807260505804114, + "grad_norm": 1.624069094657898, + "learning_rate": 1.0823939176696561e-05, + "loss": 0.6251, + "step": 3791 + }, + { + "epoch": 0.18077372297571093, + "grad_norm": 3.0085394382476807, + "learning_rate": 1.082002449615795e-05, + "loss": 0.7022, + "step": 3792 + }, + { + "epoch": 0.1808213953710104, + "grad_norm": 1.255738377571106, + "learning_rate": 1.0816109689098004e-05, + "loss": 0.8176, + "step": 3793 + }, + { + "epoch": 0.18086906776630993, + "grad_norm": 2.689150810241699, + "learning_rate": 1.081219475612074e-05, + "loss": 0.546, + "step": 3794 + }, + { + "epoch": 0.1809167401616094, + "grad_norm": 1.3958780765533447, + "learning_rate": 1.0808279697830188e-05, + "loss": 0.8049, + "step": 3795 + }, + { + "epoch": 0.18096441255690893, + "grad_norm": 1.2954460382461548, + "learning_rate": 1.08043645148304e-05, + "loss": 0.7955, + "step": 3796 + }, + { + "epoch": 0.1810120849522084, + "grad_norm": 2.3237876892089844, + "learning_rate": 1.0800449207725453e-05, + "loss": 1.0901, + "step": 3797 + }, + { + "epoch": 0.18105975734750793, + "grad_norm": 2.630173683166504, + "learning_rate": 1.0796533777119435e-05, + "loss": 0.8656, + "step": 3798 + }, + { + "epoch": 0.18110742974280744, + "grad_norm": 4.283743381500244, + "learning_rate": 1.079261822361646e-05, + "loss": 1.0043, + "step": 3799 + }, + { + "epoch": 0.18115510213810693, + "grad_norm": 1.5384756326675415, + "learning_rate": 1.0788702547820654e-05, + "loss": 0.8652, + "step": 3800 + }, + { + "epoch": 0.18120277453340644, + "grad_norm": 2.6268727779388428, + "learning_rate": 1.0784786750336165e-05, + "loss": 0.6213, + "step": 3801 + }, + { + "epoch": 0.18125044692870593, + "grad_norm": 2.5609583854675293, + "learning_rate": 1.0780870831767166e-05, + "loss": 0.8244, + "step": 3802 + }, + { + "epoch": 0.18129811932400544, + "grad_norm": 1.5510481595993042, + "learning_rate": 1.0776954792717835e-05, + "loss": 1.0069, + "step": 3803 + }, + { + "epoch": 0.18134579171930493, + "grad_norm": 1.1264429092407227, + "learning_rate": 1.0773038633792385e-05, + "loss": 0.4657, + "step": 3804 + }, + { + "epoch": 0.18139346411460444, + "grad_norm": 2.5910754203796387, + "learning_rate": 1.0769122355595031e-05, + "loss": 0.8916, + "step": 3805 + }, + { + "epoch": 0.18144113650990393, + "grad_norm": 0.9827473163604736, + "learning_rate": 1.0765205958730018e-05, + "loss": 0.5123, + "step": 3806 + }, + { + 
"epoch": 0.18148880890520344, + "grad_norm": 2.5792088508605957, + "learning_rate": 1.0761289443801608e-05, + "loss": 1.0205, + "step": 3807 + }, + { + "epoch": 0.18153648130050296, + "grad_norm": 1.2586830854415894, + "learning_rate": 1.0757372811414075e-05, + "loss": 0.7238, + "step": 3808 + }, + { + "epoch": 0.18158415369580244, + "grad_norm": 2.0719401836395264, + "learning_rate": 1.0753456062171716e-05, + "loss": 0.6748, + "step": 3809 + }, + { + "epoch": 0.18163182609110196, + "grad_norm": 2.038454532623291, + "learning_rate": 1.0749539196678849e-05, + "loss": 0.7693, + "step": 3810 + }, + { + "epoch": 0.18167949848640144, + "grad_norm": 1.2887102365493774, + "learning_rate": 1.0745622215539801e-05, + "loss": 0.4609, + "step": 3811 + }, + { + "epoch": 0.18172717088170096, + "grad_norm": 1.0278254747390747, + "learning_rate": 1.0741705119358922e-05, + "loss": 0.4919, + "step": 3812 + }, + { + "epoch": 0.18177484327700044, + "grad_norm": 1.805465579032898, + "learning_rate": 1.0737787908740582e-05, + "loss": 0.9819, + "step": 3813 + }, + { + "epoch": 0.18182251567229996, + "grad_norm": 1.6648166179656982, + "learning_rate": 1.0733870584289168e-05, + "loss": 0.5149, + "step": 3814 + }, + { + "epoch": 0.18187018806759947, + "grad_norm": 1.674871802330017, + "learning_rate": 1.0729953146609076e-05, + "loss": 0.4941, + "step": 3815 + }, + { + "epoch": 0.18191786046289896, + "grad_norm": 1.9727791547775269, + "learning_rate": 1.0726035596304733e-05, + "loss": 0.8186, + "step": 3816 + }, + { + "epoch": 0.18196553285819847, + "grad_norm": 2.5273120403289795, + "learning_rate": 1.0722117933980574e-05, + "loss": 0.5947, + "step": 3817 + }, + { + "epoch": 0.18201320525349796, + "grad_norm": 1.1986948251724243, + "learning_rate": 1.0718200160241054e-05, + "loss": 0.6234, + "step": 3818 + }, + { + "epoch": 0.18206087764879747, + "grad_norm": 1.3057129383087158, + "learning_rate": 1.0714282275690646e-05, + "loss": 0.9209, + "step": 3819 + }, + { + "epoch": 0.18210855004409696, + "grad_norm": 1.3036905527114868, + "learning_rate": 1.0710364280933839e-05, + "loss": 0.8521, + "step": 3820 + }, + { + "epoch": 0.18215622243939647, + "grad_norm": 1.7449041604995728, + "learning_rate": 1.0706446176575137e-05, + "loss": 0.8996, + "step": 3821 + }, + { + "epoch": 0.18220389483469596, + "grad_norm": 1.8671306371688843, + "learning_rate": 1.0702527963219064e-05, + "loss": 0.6253, + "step": 3822 + }, + { + "epoch": 0.18225156722999547, + "grad_norm": 1.781281590461731, + "learning_rate": 1.0698609641470161e-05, + "loss": 1.0288, + "step": 3823 + }, + { + "epoch": 0.18229923962529498, + "grad_norm": 1.8120882511138916, + "learning_rate": 1.0694691211932986e-05, + "loss": 0.9012, + "step": 3824 + }, + { + "epoch": 0.18234691202059447, + "grad_norm": 2.3989880084991455, + "learning_rate": 1.0690772675212112e-05, + "loss": 0.888, + "step": 3825 + }, + { + "epoch": 0.18239458441589398, + "grad_norm": 2.982454776763916, + "learning_rate": 1.0686854031912126e-05, + "loss": 0.8754, + "step": 3826 + }, + { + "epoch": 0.18244225681119347, + "grad_norm": 2.698946475982666, + "learning_rate": 1.0682935282637638e-05, + "loss": 0.9195, + "step": 3827 + }, + { + "epoch": 0.18248992920649298, + "grad_norm": 1.4840878248214722, + "learning_rate": 1.0679016427993267e-05, + "loss": 0.53, + "step": 3828 + }, + { + "epoch": 0.18253760160179247, + "grad_norm": 2.0324044227600098, + "learning_rate": 1.0675097468583653e-05, + "loss": 0.8444, + "step": 3829 + }, + { + "epoch": 0.18258527399709198, + "grad_norm": 1.8995931148529053, + 
"learning_rate": 1.0671178405013454e-05, + "loss": 0.5282, + "step": 3830 + }, + { + "epoch": 0.1826329463923915, + "grad_norm": 1.3772923946380615, + "learning_rate": 1.066725923788734e-05, + "loss": 0.6219, + "step": 3831 + }, + { + "epoch": 0.18268061878769098, + "grad_norm": 1.3472472429275513, + "learning_rate": 1.0663339967809991e-05, + "loss": 0.6163, + "step": 3832 + }, + { + "epoch": 0.1827282911829905, + "grad_norm": 2.7855474948883057, + "learning_rate": 1.0659420595386123e-05, + "loss": 1.1621, + "step": 3833 + }, + { + "epoch": 0.18277596357828998, + "grad_norm": 1.4489245414733887, + "learning_rate": 1.0655501121220446e-05, + "loss": 0.6857, + "step": 3834 + }, + { + "epoch": 0.1828236359735895, + "grad_norm": 2.5283961296081543, + "learning_rate": 1.0651581545917693e-05, + "loss": 0.2947, + "step": 3835 + }, + { + "epoch": 0.18287130836888899, + "grad_norm": 1.3052552938461304, + "learning_rate": 1.064766187008262e-05, + "loss": 0.7609, + "step": 3836 + }, + { + "epoch": 0.1829189807641885, + "grad_norm": 4.39094877243042, + "learning_rate": 1.0643742094319991e-05, + "loss": 0.6984, + "step": 3837 + }, + { + "epoch": 0.18296665315948799, + "grad_norm": 4.12162446975708, + "learning_rate": 1.0639822219234583e-05, + "loss": 0.1054, + "step": 3838 + }, + { + "epoch": 0.1830143255547875, + "grad_norm": 1.4367767572402954, + "learning_rate": 1.0635902245431198e-05, + "loss": 0.4412, + "step": 3839 + }, + { + "epoch": 0.183061997950087, + "grad_norm": 1.297636866569519, + "learning_rate": 1.0631982173514645e-05, + "loss": 0.5826, + "step": 3840 + }, + { + "epoch": 0.1831096703453865, + "grad_norm": 1.4141820669174194, + "learning_rate": 1.062806200408975e-05, + "loss": 0.3926, + "step": 3841 + }, + { + "epoch": 0.183157342740686, + "grad_norm": 1.2304821014404297, + "learning_rate": 1.0624141737761356e-05, + "loss": 0.7535, + "step": 3842 + }, + { + "epoch": 0.1832050151359855, + "grad_norm": 1.5290307998657227, + "learning_rate": 1.0620221375134319e-05, + "loss": 0.9564, + "step": 3843 + }, + { + "epoch": 0.183252687531285, + "grad_norm": 5.774345397949219, + "learning_rate": 1.0616300916813509e-05, + "loss": 0.332, + "step": 3844 + }, + { + "epoch": 0.1833003599265845, + "grad_norm": 1.33428156375885, + "learning_rate": 1.0612380363403818e-05, + "loss": 0.6447, + "step": 3845 + }, + { + "epoch": 0.183348032321884, + "grad_norm": 1.1961020231246948, + "learning_rate": 1.060845971551014e-05, + "loss": 0.9196, + "step": 3846 + }, + { + "epoch": 0.18339570471718353, + "grad_norm": 1.279846429824829, + "learning_rate": 1.0604538973737394e-05, + "loss": 0.5155, + "step": 3847 + }, + { + "epoch": 0.183443377112483, + "grad_norm": 1.2324726581573486, + "learning_rate": 1.0600618138690514e-05, + "loss": 0.726, + "step": 3848 + }, + { + "epoch": 0.18349104950778253, + "grad_norm": 2.030451774597168, + "learning_rate": 1.0596697210974436e-05, + "loss": 0.7574, + "step": 3849 + }, + { + "epoch": 0.183538721903082, + "grad_norm": 1.6552375555038452, + "learning_rate": 1.0592776191194126e-05, + "loss": 0.6017, + "step": 3850 + }, + { + "epoch": 0.18358639429838153, + "grad_norm": 1.3873900175094604, + "learning_rate": 1.0588855079954552e-05, + "loss": 0.5501, + "step": 3851 + }, + { + "epoch": 0.18363406669368101, + "grad_norm": 1.1930038928985596, + "learning_rate": 1.05849338778607e-05, + "loss": 0.7093, + "step": 3852 + }, + { + "epoch": 0.18368173908898053, + "grad_norm": 1.1213637590408325, + "learning_rate": 1.058101258551758e-05, + "loss": 0.7103, + "step": 3853 + }, + { + "epoch": 
0.18372941148428001, + "grad_norm": 3.5747599601745605, + "learning_rate": 1.05770912035302e-05, + "loss": 0.0847, + "step": 3854 + }, + { + "epoch": 0.18377708387957953, + "grad_norm": 1.7481714487075806, + "learning_rate": 1.0573169732503592e-05, + "loss": 0.8045, + "step": 3855 + }, + { + "epoch": 0.18382475627487904, + "grad_norm": 1.6177853345870972, + "learning_rate": 1.0569248173042793e-05, + "loss": 0.7111, + "step": 3856 + }, + { + "epoch": 0.18387242867017853, + "grad_norm": 1.1562814712524414, + "learning_rate": 1.0565326525752866e-05, + "loss": 0.7206, + "step": 3857 + }, + { + "epoch": 0.18392010106547804, + "grad_norm": 1.4583311080932617, + "learning_rate": 1.0561404791238875e-05, + "loss": 0.8123, + "step": 3858 + }, + { + "epoch": 0.18396777346077753, + "grad_norm": 1.5818381309509277, + "learning_rate": 1.0557482970105907e-05, + "loss": 1.2063, + "step": 3859 + }, + { + "epoch": 0.18401544585607704, + "grad_norm": 2.0157089233398438, + "learning_rate": 1.0553561062959056e-05, + "loss": 0.6947, + "step": 3860 + }, + { + "epoch": 0.18406311825137653, + "grad_norm": 1.8058491945266724, + "learning_rate": 1.0549639070403437e-05, + "loss": 1.1887, + "step": 3861 + }, + { + "epoch": 0.18411079064667604, + "grad_norm": 0.9012666940689087, + "learning_rate": 1.0545716993044168e-05, + "loss": 0.4464, + "step": 3862 + }, + { + "epoch": 0.18415846304197556, + "grad_norm": 1.0894057750701904, + "learning_rate": 1.0541794831486388e-05, + "loss": 0.4666, + "step": 3863 + }, + { + "epoch": 0.18420613543727504, + "grad_norm": 2.1592535972595215, + "learning_rate": 1.0537872586335245e-05, + "loss": 0.8279, + "step": 3864 + }, + { + "epoch": 0.18425380783257456, + "grad_norm": 1.3710857629776, + "learning_rate": 1.05339502581959e-05, + "loss": 0.7208, + "step": 3865 + }, + { + "epoch": 0.18430148022787404, + "grad_norm": 2.9551916122436523, + "learning_rate": 1.0530027847673526e-05, + "loss": 0.6664, + "step": 3866 + }, + { + "epoch": 0.18434915262317356, + "grad_norm": 1.4025201797485352, + "learning_rate": 1.0526105355373318e-05, + "loss": 0.7266, + "step": 3867 + }, + { + "epoch": 0.18439682501847304, + "grad_norm": 1.3041059970855713, + "learning_rate": 1.0522182781900467e-05, + "loss": 1.0898, + "step": 3868 + }, + { + "epoch": 0.18444449741377256, + "grad_norm": 2.556823492050171, + "learning_rate": 1.0518260127860192e-05, + "loss": 1.1303, + "step": 3869 + }, + { + "epoch": 0.18449216980907204, + "grad_norm": 1.3298100233078003, + "learning_rate": 1.0514337393857718e-05, + "loss": 0.7412, + "step": 3870 + }, + { + "epoch": 0.18453984220437156, + "grad_norm": 1.5906391143798828, + "learning_rate": 1.0510414580498283e-05, + "loss": 0.8786, + "step": 3871 + }, + { + "epoch": 0.18458751459967107, + "grad_norm": 3.597646474838257, + "learning_rate": 1.0506491688387128e-05, + "loss": 0.8626, + "step": 3872 + }, + { + "epoch": 0.18463518699497056, + "grad_norm": 2.4983737468719482, + "learning_rate": 1.0502568718129526e-05, + "loss": 0.4904, + "step": 3873 + }, + { + "epoch": 0.18468285939027007, + "grad_norm": 1.5360634326934814, + "learning_rate": 1.0498645670330746e-05, + "loss": 0.4983, + "step": 3874 + }, + { + "epoch": 0.18473053178556956, + "grad_norm": 1.3316617012023926, + "learning_rate": 1.049472254559607e-05, + "loss": 0.6926, + "step": 3875 + }, + { + "epoch": 0.18477820418086907, + "grad_norm": 2.1072819232940674, + "learning_rate": 1.0490799344530804e-05, + "loss": 1.1203, + "step": 3876 + }, + { + "epoch": 0.18482587657616856, + "grad_norm": 1.4880831241607666, + 
"learning_rate": 1.0486876067740253e-05, + "loss": 0.6833, + "step": 3877 + }, + { + "epoch": 0.18487354897146807, + "grad_norm": 3.8898813724517822, + "learning_rate": 1.0482952715829737e-05, + "loss": 0.4716, + "step": 3878 + }, + { + "epoch": 0.18492122136676759, + "grad_norm": 1.723595142364502, + "learning_rate": 1.0479029289404592e-05, + "loss": 0.9794, + "step": 3879 + }, + { + "epoch": 0.18496889376206707, + "grad_norm": 3.458982229232788, + "learning_rate": 1.0475105789070157e-05, + "loss": 0.5757, + "step": 3880 + }, + { + "epoch": 0.18501656615736659, + "grad_norm": 1.9958633184432983, + "learning_rate": 1.0471182215431796e-05, + "loss": 0.3679, + "step": 3881 + }, + { + "epoch": 0.18506423855266607, + "grad_norm": 2.694352149963379, + "learning_rate": 1.046725856909487e-05, + "loss": 0.8458, + "step": 3882 + }, + { + "epoch": 0.18511191094796559, + "grad_norm": 1.2007461786270142, + "learning_rate": 1.0463334850664757e-05, + "loss": 0.9068, + "step": 3883 + }, + { + "epoch": 0.18515958334326507, + "grad_norm": 1.968867301940918, + "learning_rate": 1.0459411060746848e-05, + "loss": 0.9438, + "step": 3884 + }, + { + "epoch": 0.18520725573856459, + "grad_norm": 4.607203006744385, + "learning_rate": 1.0455487199946547e-05, + "loss": 0.6974, + "step": 3885 + }, + { + "epoch": 0.1852549281338641, + "grad_norm": 1.1567150354385376, + "learning_rate": 1.0451563268869258e-05, + "loss": 0.7309, + "step": 3886 + }, + { + "epoch": 0.18530260052916359, + "grad_norm": 1.4846583604812622, + "learning_rate": 1.0447639268120409e-05, + "loss": 0.5524, + "step": 3887 + }, + { + "epoch": 0.1853502729244631, + "grad_norm": 1.5285459756851196, + "learning_rate": 1.0443715198305432e-05, + "loss": 0.2858, + "step": 3888 + }, + { + "epoch": 0.18539794531976259, + "grad_norm": 4.7506208419799805, + "learning_rate": 1.0439791060029765e-05, + "loss": 0.3033, + "step": 3889 + }, + { + "epoch": 0.1854456177150621, + "grad_norm": 2.634995937347412, + "learning_rate": 1.0435866853898869e-05, + "loss": 0.5784, + "step": 3890 + }, + { + "epoch": 0.18549329011036159, + "grad_norm": 1.5000189542770386, + "learning_rate": 1.0431942580518207e-05, + "loss": 0.788, + "step": 3891 + }, + { + "epoch": 0.1855409625056611, + "grad_norm": 1.276197910308838, + "learning_rate": 1.0428018240493247e-05, + "loss": 0.7052, + "step": 3892 + }, + { + "epoch": 0.1855886349009606, + "grad_norm": 1.6995823383331299, + "learning_rate": 1.0424093834429487e-05, + "loss": 0.7176, + "step": 3893 + }, + { + "epoch": 0.1856363072962601, + "grad_norm": 1.4082237482070923, + "learning_rate": 1.0420169362932416e-05, + "loss": 0.7163, + "step": 3894 + }, + { + "epoch": 0.18568397969155961, + "grad_norm": 6.1425251960754395, + "learning_rate": 1.0416244826607533e-05, + "loss": 0.4488, + "step": 3895 + }, + { + "epoch": 0.1857316520868591, + "grad_norm": 2.4104936122894287, + "learning_rate": 1.0412320226060364e-05, + "loss": 1.0537, + "step": 3896 + }, + { + "epoch": 0.18577932448215861, + "grad_norm": 1.3676190376281738, + "learning_rate": 1.0408395561896429e-05, + "loss": 0.7454, + "step": 3897 + }, + { + "epoch": 0.1858269968774581, + "grad_norm": 1.383682131767273, + "learning_rate": 1.0404470834721265e-05, + "loss": 0.7932, + "step": 3898 + }, + { + "epoch": 0.18587466927275761, + "grad_norm": 1.5696673393249512, + "learning_rate": 1.0400546045140416e-05, + "loss": 0.6699, + "step": 3899 + }, + { + "epoch": 0.1859223416680571, + "grad_norm": 3.1046829223632812, + "learning_rate": 1.039662119375944e-05, + "loss": 1.0755, + "step": 3900 + }, 
+ { + "epoch": 0.18597001406335661, + "grad_norm": 1.342029094696045, + "learning_rate": 1.0392696281183893e-05, + "loss": 0.6093, + "step": 3901 + }, + { + "epoch": 0.18601768645865613, + "grad_norm": 1.4734547138214111, + "learning_rate": 1.0388771308019359e-05, + "loss": 0.7575, + "step": 3902 + }, + { + "epoch": 0.18606535885395561, + "grad_norm": 3.215017318725586, + "learning_rate": 1.0384846274871412e-05, + "loss": 0.6839, + "step": 3903 + }, + { + "epoch": 0.18611303124925513, + "grad_norm": 2.5554721355438232, + "learning_rate": 1.038092118234565e-05, + "loss": 1.1114, + "step": 3904 + }, + { + "epoch": 0.18616070364455461, + "grad_norm": 1.4321552515029907, + "learning_rate": 1.037699603104767e-05, + "loss": 0.7825, + "step": 3905 + }, + { + "epoch": 0.18620837603985413, + "grad_norm": 1.71304452419281, + "learning_rate": 1.0373070821583084e-05, + "loss": 0.7759, + "step": 3906 + }, + { + "epoch": 0.18625604843515361, + "grad_norm": 4.079225063323975, + "learning_rate": 1.0369145554557516e-05, + "loss": 0.2851, + "step": 3907 + }, + { + "epoch": 0.18630372083045313, + "grad_norm": 1.5605511665344238, + "learning_rate": 1.0365220230576592e-05, + "loss": 0.7748, + "step": 3908 + }, + { + "epoch": 0.18635139322575262, + "grad_norm": 30.554336547851562, + "learning_rate": 1.0361294850245942e-05, + "loss": 0.868, + "step": 3909 + }, + { + "epoch": 0.18639906562105213, + "grad_norm": 1.908445119857788, + "learning_rate": 1.0357369414171219e-05, + "loss": 0.8372, + "step": 3910 + }, + { + "epoch": 0.18644673801635164, + "grad_norm": 1.4731189012527466, + "learning_rate": 1.0353443922958078e-05, + "loss": 0.929, + "step": 3911 + }, + { + "epoch": 0.18649441041165113, + "grad_norm": 1.6199272871017456, + "learning_rate": 1.0349518377212175e-05, + "loss": 1.0831, + "step": 3912 + }, + { + "epoch": 0.18654208280695064, + "grad_norm": 1.8650221824645996, + "learning_rate": 1.0345592777539189e-05, + "loss": 0.5358, + "step": 3913 + }, + { + "epoch": 0.18658975520225013, + "grad_norm": 1.8524514436721802, + "learning_rate": 1.0341667124544797e-05, + "loss": 0.5273, + "step": 3914 + }, + { + "epoch": 0.18663742759754964, + "grad_norm": 8.39456558227539, + "learning_rate": 1.0337741418834683e-05, + "loss": 0.8568, + "step": 3915 + }, + { + "epoch": 0.18668509999284913, + "grad_norm": 1.458449363708496, + "learning_rate": 1.033381566101455e-05, + "loss": 0.5621, + "step": 3916 + }, + { + "epoch": 0.18673277238814864, + "grad_norm": 1.5252214670181274, + "learning_rate": 1.0329889851690094e-05, + "loss": 0.6014, + "step": 3917 + }, + { + "epoch": 0.18678044478344816, + "grad_norm": 3.2252800464630127, + "learning_rate": 1.0325963991467031e-05, + "loss": 0.6638, + "step": 3918 + }, + { + "epoch": 0.18682811717874764, + "grad_norm": 1.4949884414672852, + "learning_rate": 1.0322038080951084e-05, + "loss": 0.8773, + "step": 3919 + }, + { + "epoch": 0.18687578957404716, + "grad_norm": 1.5134341716766357, + "learning_rate": 1.0318112120747977e-05, + "loss": 0.4869, + "step": 3920 + }, + { + "epoch": 0.18692346196934664, + "grad_norm": 2.3841423988342285, + "learning_rate": 1.0314186111463444e-05, + "loss": 0.2674, + "step": 3921 + }, + { + "epoch": 0.18697113436464616, + "grad_norm": 1.7857775688171387, + "learning_rate": 1.0310260053703231e-05, + "loss": 0.7092, + "step": 3922 + }, + { + "epoch": 0.18701880675994564, + "grad_norm": 1.8670042753219604, + "learning_rate": 1.0306333948073089e-05, + "loss": 0.6532, + "step": 3923 + }, + { + "epoch": 0.18706647915524516, + "grad_norm": 2.2968075275421143, + 
"learning_rate": 1.030240779517877e-05, + "loss": 1.337, + "step": 3924 + }, + { + "epoch": 0.18711415155054464, + "grad_norm": 3.2001328468322754, + "learning_rate": 1.0298481595626045e-05, + "loss": 0.5752, + "step": 3925 + }, + { + "epoch": 0.18716182394584416, + "grad_norm": 1.341929316520691, + "learning_rate": 1.0294555350020678e-05, + "loss": 0.9104, + "step": 3926 + }, + { + "epoch": 0.18720949634114367, + "grad_norm": 1.3757683038711548, + "learning_rate": 1.0290629058968457e-05, + "loss": 0.6054, + "step": 3927 + }, + { + "epoch": 0.18725716873644316, + "grad_norm": 1.4756373167037964, + "learning_rate": 1.0286702723075167e-05, + "loss": 0.8232, + "step": 3928 + }, + { + "epoch": 0.18730484113174267, + "grad_norm": 4.100464344024658, + "learning_rate": 1.0282776342946597e-05, + "loss": 0.5475, + "step": 3929 + }, + { + "epoch": 0.18735251352704216, + "grad_norm": 2.667470693588257, + "learning_rate": 1.0278849919188551e-05, + "loss": 0.0291, + "step": 3930 + }, + { + "epoch": 0.18740018592234167, + "grad_norm": 1.8539137840270996, + "learning_rate": 1.0274923452406835e-05, + "loss": 0.6703, + "step": 3931 + }, + { + "epoch": 0.18744785831764116, + "grad_norm": 2.61194109916687, + "learning_rate": 1.0270996943207258e-05, + "loss": 0.2901, + "step": 3932 + }, + { + "epoch": 0.18749553071294067, + "grad_norm": 1.727474570274353, + "learning_rate": 1.0267070392195646e-05, + "loss": 0.5151, + "step": 3933 + }, + { + "epoch": 0.1875432031082402, + "grad_norm": 2.6027114391326904, + "learning_rate": 1.0263143799977824e-05, + "loss": 0.6154, + "step": 3934 + }, + { + "epoch": 0.18759087550353967, + "grad_norm": 1.6948940753936768, + "learning_rate": 1.025921716715962e-05, + "loss": 0.897, + "step": 3935 + }, + { + "epoch": 0.1876385478988392, + "grad_norm": 1.958555817604065, + "learning_rate": 1.0255290494346877e-05, + "loss": 0.5878, + "step": 3936 + }, + { + "epoch": 0.18768622029413867, + "grad_norm": 1.528274655342102, + "learning_rate": 1.0251363782145443e-05, + "loss": 0.4614, + "step": 3937 + }, + { + "epoch": 0.1877338926894382, + "grad_norm": 1.605790376663208, + "learning_rate": 1.0247437031161162e-05, + "loss": 1.0658, + "step": 3938 + }, + { + "epoch": 0.18778156508473767, + "grad_norm": 2.38838529586792, + "learning_rate": 1.0243510241999898e-05, + "loss": 0.4949, + "step": 3939 + }, + { + "epoch": 0.1878292374800372, + "grad_norm": 15.202858924865723, + "learning_rate": 1.0239583415267509e-05, + "loss": 1.0663, + "step": 3940 + }, + { + "epoch": 0.18787690987533667, + "grad_norm": 1.548436164855957, + "learning_rate": 1.0235656551569868e-05, + "loss": 0.5715, + "step": 3941 + }, + { + "epoch": 0.1879245822706362, + "grad_norm": 1.743993878364563, + "learning_rate": 1.0231729651512847e-05, + "loss": 0.814, + "step": 3942 + }, + { + "epoch": 0.1879722546659357, + "grad_norm": 1.2997312545776367, + "learning_rate": 1.0227802715702326e-05, + "loss": 0.7499, + "step": 3943 + }, + { + "epoch": 0.1880199270612352, + "grad_norm": 1.7470053434371948, + "learning_rate": 1.0223875744744194e-05, + "loss": 0.8215, + "step": 3944 + }, + { + "epoch": 0.1880675994565347, + "grad_norm": 1.3334025144577026, + "learning_rate": 1.021994873924434e-05, + "loss": 0.8206, + "step": 3945 + }, + { + "epoch": 0.1881152718518342, + "grad_norm": 1.8813772201538086, + "learning_rate": 1.021602169980866e-05, + "loss": 0.5871, + "step": 3946 + }, + { + "epoch": 0.1881629442471337, + "grad_norm": 1.4710915088653564, + "learning_rate": 1.0212094627043056e-05, + "loss": 0.8113, + "step": 3947 + }, + { + 
"epoch": 0.1882106166424332, + "grad_norm": 1.6951239109039307, + "learning_rate": 1.0208167521553439e-05, + "loss": 1.0451, + "step": 3948 + }, + { + "epoch": 0.1882582890377327, + "grad_norm": 3.9029242992401123, + "learning_rate": 1.0204240383945709e-05, + "loss": 0.9608, + "step": 3949 + }, + { + "epoch": 0.18830596143303222, + "grad_norm": 2.543529748916626, + "learning_rate": 1.0200313214825797e-05, + "loss": 0.7797, + "step": 3950 + }, + { + "epoch": 0.1883536338283317, + "grad_norm": 1.1120450496673584, + "learning_rate": 1.0196386014799617e-05, + "loss": 0.4686, + "step": 3951 + }, + { + "epoch": 0.18840130622363122, + "grad_norm": 2.798509359359741, + "learning_rate": 1.0192458784473099e-05, + "loss": 1.1725, + "step": 3952 + }, + { + "epoch": 0.1884489786189307, + "grad_norm": 1.2831599712371826, + "learning_rate": 1.0188531524452173e-05, + "loss": 0.7103, + "step": 3953 + }, + { + "epoch": 0.18849665101423022, + "grad_norm": 1.679078221321106, + "learning_rate": 1.018460423534277e-05, + "loss": 0.6849, + "step": 3954 + }, + { + "epoch": 0.1885443234095297, + "grad_norm": 1.7133469581604004, + "learning_rate": 1.0180676917750839e-05, + "loss": 0.5772, + "step": 3955 + }, + { + "epoch": 0.18859199580482922, + "grad_norm": 1.840065360069275, + "learning_rate": 1.0176749572282318e-05, + "loss": 0.8097, + "step": 3956 + }, + { + "epoch": 0.1886396682001287, + "grad_norm": 2.323977470397949, + "learning_rate": 1.0172822199543155e-05, + "loss": 1.2464, + "step": 3957 + }, + { + "epoch": 0.18868734059542822, + "grad_norm": 4.106752872467041, + "learning_rate": 1.0168894800139311e-05, + "loss": 0.4467, + "step": 3958 + }, + { + "epoch": 0.18873501299072773, + "grad_norm": 1.7676013708114624, + "learning_rate": 1.0164967374676737e-05, + "loss": 0.9212, + "step": 3959 + }, + { + "epoch": 0.18878268538602722, + "grad_norm": 1.6974899768829346, + "learning_rate": 1.0161039923761398e-05, + "loss": 0.7235, + "step": 3960 + }, + { + "epoch": 0.18883035778132673, + "grad_norm": 1.3777499198913574, + "learning_rate": 1.0157112447999255e-05, + "loss": 0.5702, + "step": 3961 + }, + { + "epoch": 0.18887803017662622, + "grad_norm": 2.2918481826782227, + "learning_rate": 1.0153184947996282e-05, + "loss": 1.1125, + "step": 3962 + }, + { + "epoch": 0.18892570257192573, + "grad_norm": 1.5073621273040771, + "learning_rate": 1.0149257424358445e-05, + "loss": 0.7786, + "step": 3963 + }, + { + "epoch": 0.18897337496722522, + "grad_norm": 2.86057186126709, + "learning_rate": 1.0145329877691725e-05, + "loss": 1.3107, + "step": 3964 + }, + { + "epoch": 0.18902104736252473, + "grad_norm": 1.8101078271865845, + "learning_rate": 1.0141402308602104e-05, + "loss": 0.6794, + "step": 3965 + }, + { + "epoch": 0.18906871975782424, + "grad_norm": 1.1594735383987427, + "learning_rate": 1.0137474717695561e-05, + "loss": 0.3229, + "step": 3966 + }, + { + "epoch": 0.18911639215312373, + "grad_norm": 1.6829912662506104, + "learning_rate": 1.0133547105578085e-05, + "loss": 0.8185, + "step": 3967 + }, + { + "epoch": 0.18916406454842324, + "grad_norm": 5.886511325836182, + "learning_rate": 1.012961947285567e-05, + "loss": 1.3177, + "step": 3968 + }, + { + "epoch": 0.18921173694372273, + "grad_norm": 2.4712436199188232, + "learning_rate": 1.0125691820134299e-05, + "loss": 0.9348, + "step": 3969 + }, + { + "epoch": 0.18925940933902224, + "grad_norm": 1.824073314666748, + "learning_rate": 1.0121764148019977e-05, + "loss": 0.8565, + "step": 3970 + }, + { + "epoch": 0.18930708173432173, + "grad_norm": 1.8286038637161255, + 
"learning_rate": 1.0117836457118701e-05, + "loss": 0.7106, + "step": 3971 + }, + { + "epoch": 0.18935475412962124, + "grad_norm": 1.9167611598968506, + "learning_rate": 1.0113908748036471e-05, + "loss": 0.9992, + "step": 3972 + }, + { + "epoch": 0.18940242652492076, + "grad_norm": 1.2062486410140991, + "learning_rate": 1.0109981021379297e-05, + "loss": 0.3719, + "step": 3973 + }, + { + "epoch": 0.18945009892022024, + "grad_norm": 2.559468984603882, + "learning_rate": 1.0106053277753182e-05, + "loss": 0.7084, + "step": 3974 + }, + { + "epoch": 0.18949777131551976, + "grad_norm": 1.1755224466323853, + "learning_rate": 1.0102125517764144e-05, + "loss": 0.5891, + "step": 3975 + }, + { + "epoch": 0.18954544371081924, + "grad_norm": 2.8774361610412598, + "learning_rate": 1.0098197742018185e-05, + "loss": 0.5111, + "step": 3976 + }, + { + "epoch": 0.18959311610611876, + "grad_norm": 1.055750846862793, + "learning_rate": 1.0094269951121326e-05, + "loss": 0.517, + "step": 3977 + }, + { + "epoch": 0.18964078850141824, + "grad_norm": 11.34118366241455, + "learning_rate": 1.0090342145679584e-05, + "loss": 0.486, + "step": 3978 + }, + { + "epoch": 0.18968846089671776, + "grad_norm": 1.7106672525405884, + "learning_rate": 1.008641432629898e-05, + "loss": 0.7893, + "step": 3979 + }, + { + "epoch": 0.18973613329201724, + "grad_norm": 1.1219589710235596, + "learning_rate": 1.0082486493585535e-05, + "loss": 0.1533, + "step": 3980 + }, + { + "epoch": 0.18978380568731676, + "grad_norm": 0.9951594471931458, + "learning_rate": 1.0078558648145273e-05, + "loss": 0.2722, + "step": 3981 + }, + { + "epoch": 0.18983147808261627, + "grad_norm": 1.7601436376571655, + "learning_rate": 1.0074630790584223e-05, + "loss": 0.6116, + "step": 3982 + }, + { + "epoch": 0.18987915047791576, + "grad_norm": 1.6753166913986206, + "learning_rate": 1.0070702921508408e-05, + "loss": 0.642, + "step": 3983 + }, + { + "epoch": 0.18992682287321527, + "grad_norm": 1.6184039115905762, + "learning_rate": 1.0066775041523864e-05, + "loss": 0.706, + "step": 3984 + }, + { + "epoch": 0.18997449526851476, + "grad_norm": 1.3840320110321045, + "learning_rate": 1.0062847151236616e-05, + "loss": 0.744, + "step": 3985 + }, + { + "epoch": 0.19002216766381427, + "grad_norm": 4.343174457550049, + "learning_rate": 1.00589192512527e-05, + "loss": 0.6515, + "step": 3986 + }, + { + "epoch": 0.19006984005911376, + "grad_norm": 2.5381290912628174, + "learning_rate": 1.005499134217815e-05, + "loss": 0.8103, + "step": 3987 + }, + { + "epoch": 0.19011751245441327, + "grad_norm": 1.3251681327819824, + "learning_rate": 1.0051063424619e-05, + "loss": 0.5401, + "step": 3988 + }, + { + "epoch": 0.1901651848497128, + "grad_norm": 1.403380036354065, + "learning_rate": 1.0047135499181293e-05, + "loss": 0.8215, + "step": 3989 + }, + { + "epoch": 0.19021285724501227, + "grad_norm": 1.2663720846176147, + "learning_rate": 1.0043207566471064e-05, + "loss": 0.7769, + "step": 3990 + }, + { + "epoch": 0.1902605296403118, + "grad_norm": 1.170798420906067, + "learning_rate": 1.0039279627094352e-05, + "loss": 0.4485, + "step": 3991 + }, + { + "epoch": 0.19030820203561127, + "grad_norm": 2.168261766433716, + "learning_rate": 1.0035351681657194e-05, + "loss": 0.8187, + "step": 3992 + }, + { + "epoch": 0.1903558744309108, + "grad_norm": 2.069822072982788, + "learning_rate": 1.0031423730765642e-05, + "loss": 0.8345, + "step": 3993 + }, + { + "epoch": 0.19040354682621027, + "grad_norm": 1.8551433086395264, + "learning_rate": 1.0027495775025726e-05, + "loss": 0.373, + "step": 3994 + }, + { 
+ "epoch": 0.1904512192215098, + "grad_norm": 1.0069206953048706, + "learning_rate": 1.0023567815043498e-05, + "loss": 0.4186, + "step": 3995 + }, + { + "epoch": 0.19049889161680927, + "grad_norm": 4.528952598571777, + "learning_rate": 1.0019639851424998e-05, + "loss": 0.6763, + "step": 3996 + }, + { + "epoch": 0.1905465640121088, + "grad_norm": 1.0347591638565063, + "learning_rate": 1.0015711884776274e-05, + "loss": 0.4398, + "step": 3997 + }, + { + "epoch": 0.1905942364074083, + "grad_norm": 3.2478513717651367, + "learning_rate": 1.0011783915703367e-05, + "loss": 0.7674, + "step": 3998 + }, + { + "epoch": 0.1906419088027078, + "grad_norm": 1.9077250957489014, + "learning_rate": 1.0007855944812321e-05, + "loss": 1.1326, + "step": 3999 + }, + { + "epoch": 0.1906895811980073, + "grad_norm": 2.356350898742676, + "learning_rate": 1.0003927972709182e-05, + "loss": 0.8314, + "step": 4000 + }, + { + "epoch": 0.1907372535933068, + "grad_norm": 1.7501803636550903, + "learning_rate": 1e-05, + "loss": 0.7022, + "step": 4001 + }, + { + "epoch": 0.1907849259886063, + "grad_norm": 1.702653408050537, + "learning_rate": 9.996072027290818e-06, + "loss": 1.1079, + "step": 4002 + }, + { + "epoch": 0.1908325983839058, + "grad_norm": 1.1961506605148315, + "learning_rate": 9.992144055187684e-06, + "loss": 0.5683, + "step": 4003 + }, + { + "epoch": 0.1908802707792053, + "grad_norm": 1.8550877571105957, + "learning_rate": 9.988216084296637e-06, + "loss": 0.7196, + "step": 4004 + }, + { + "epoch": 0.19092794317450482, + "grad_norm": 1.1281499862670898, + "learning_rate": 9.984288115223729e-06, + "loss": 0.4459, + "step": 4005 + }, + { + "epoch": 0.1909756155698043, + "grad_norm": 1.5966535806655884, + "learning_rate": 9.980360148575006e-06, + "loss": 0.9497, + "step": 4006 + }, + { + "epoch": 0.19102328796510382, + "grad_norm": 1.5581865310668945, + "learning_rate": 9.976432184956504e-06, + "loss": 0.9498, + "step": 4007 + }, + { + "epoch": 0.1910709603604033, + "grad_norm": 1.785199761390686, + "learning_rate": 9.972504224974274e-06, + "loss": 0.9696, + "step": 4008 + }, + { + "epoch": 0.19111863275570282, + "grad_norm": 2.1413733959198, + "learning_rate": 9.968576269234365e-06, + "loss": 0.6122, + "step": 4009 + }, + { + "epoch": 0.1911663051510023, + "grad_norm": 2.5225419998168945, + "learning_rate": 9.964648318342807e-06, + "loss": 0.9041, + "step": 4010 + }, + { + "epoch": 0.19121397754630182, + "grad_norm": 1.5725998878479004, + "learning_rate": 9.960720372905651e-06, + "loss": 0.7385, + "step": 4011 + }, + { + "epoch": 0.1912616499416013, + "grad_norm": 5.730638027191162, + "learning_rate": 9.95679243352894e-06, + "loss": 0.3228, + "step": 4012 + }, + { + "epoch": 0.19130932233690082, + "grad_norm": 1.994070053100586, + "learning_rate": 9.95286450081871e-06, + "loss": 1.1382, + "step": 4013 + }, + { + "epoch": 0.19135699473220033, + "grad_norm": 1.4327560663223267, + "learning_rate": 9.948936575381001e-06, + "loss": 0.5335, + "step": 4014 + }, + { + "epoch": 0.19140466712749982, + "grad_norm": 1.881850004196167, + "learning_rate": 9.945008657821856e-06, + "loss": 0.4408, + "step": 4015 + }, + { + "epoch": 0.19145233952279933, + "grad_norm": 2.0671184062957764, + "learning_rate": 9.941080748747305e-06, + "loss": 0.7581, + "step": 4016 + }, + { + "epoch": 0.19150001191809882, + "grad_norm": 1.9491459131240845, + "learning_rate": 9.937152848763387e-06, + "loss": 0.6457, + "step": 4017 + }, + { + "epoch": 0.19154768431339833, + "grad_norm": 2.5914411544799805, + "learning_rate": 9.933224958476143e-06, + 
"loss": 0.5489, + "step": 4018 + }, + { + "epoch": 0.19159535670869782, + "grad_norm": 2.014749526977539, + "learning_rate": 9.929297078491594e-06, + "loss": 0.8239, + "step": 4019 + }, + { + "epoch": 0.19164302910399733, + "grad_norm": 1.5102659463882446, + "learning_rate": 9.92536920941578e-06, + "loss": 0.4571, + "step": 4020 + }, + { + "epoch": 0.19169070149929684, + "grad_norm": 5.525020122528076, + "learning_rate": 9.921441351854727e-06, + "loss": 1.7096, + "step": 4021 + }, + { + "epoch": 0.19173837389459633, + "grad_norm": 2.299715995788574, + "learning_rate": 9.917513506414468e-06, + "loss": 0.6074, + "step": 4022 + }, + { + "epoch": 0.19178604628989585, + "grad_norm": 1.9847575426101685, + "learning_rate": 9.913585673701023e-06, + "loss": 0.7237, + "step": 4023 + }, + { + "epoch": 0.19183371868519533, + "grad_norm": 1.1055468320846558, + "learning_rate": 9.909657854320417e-06, + "loss": 0.3065, + "step": 4024 + }, + { + "epoch": 0.19188139108049485, + "grad_norm": 1.3308771848678589, + "learning_rate": 9.905730048878678e-06, + "loss": 0.7146, + "step": 4025 + }, + { + "epoch": 0.19192906347579433, + "grad_norm": 2.196897506713867, + "learning_rate": 9.901802257981819e-06, + "loss": 1.2178, + "step": 4026 + }, + { + "epoch": 0.19197673587109385, + "grad_norm": 2.5359132289886475, + "learning_rate": 9.897874482235862e-06, + "loss": 0.366, + "step": 4027 + }, + { + "epoch": 0.19202440826639333, + "grad_norm": 2.311821937561035, + "learning_rate": 9.893946722246821e-06, + "loss": 0.7793, + "step": 4028 + }, + { + "epoch": 0.19207208066169285, + "grad_norm": 1.1079131364822388, + "learning_rate": 9.890018978620706e-06, + "loss": 0.641, + "step": 4029 + }, + { + "epoch": 0.19211975305699236, + "grad_norm": 2.7376606464385986, + "learning_rate": 9.886091251963529e-06, + "loss": 0.1924, + "step": 4030 + }, + { + "epoch": 0.19216742545229185, + "grad_norm": 1.7969475984573364, + "learning_rate": 9.882163542881304e-06, + "loss": 0.7858, + "step": 4031 + }, + { + "epoch": 0.19221509784759136, + "grad_norm": 3.4324734210968018, + "learning_rate": 9.878235851980027e-06, + "loss": 0.5442, + "step": 4032 + }, + { + "epoch": 0.19226277024289085, + "grad_norm": 1.6245849132537842, + "learning_rate": 9.874308179865701e-06, + "loss": 0.848, + "step": 4033 + }, + { + "epoch": 0.19231044263819036, + "grad_norm": 1.5750185251235962, + "learning_rate": 9.870380527144336e-06, + "loss": 0.9363, + "step": 4034 + }, + { + "epoch": 0.19235811503348985, + "grad_norm": 1.8717321157455444, + "learning_rate": 9.866452894421918e-06, + "loss": 0.566, + "step": 4035 + }, + { + "epoch": 0.19240578742878936, + "grad_norm": 1.7825829982757568, + "learning_rate": 9.86252528230444e-06, + "loss": 0.7021, + "step": 4036 + }, + { + "epoch": 0.19245345982408887, + "grad_norm": 3.949215888977051, + "learning_rate": 9.858597691397901e-06, + "loss": 0.986, + "step": 4037 + }, + { + "epoch": 0.19250113221938836, + "grad_norm": 6.687702655792236, + "learning_rate": 9.854670122308276e-06, + "loss": 0.3524, + "step": 4038 + }, + { + "epoch": 0.19254880461468787, + "grad_norm": 1.5576212406158447, + "learning_rate": 9.850742575641557e-06, + "loss": 0.9, + "step": 4039 + }, + { + "epoch": 0.19259647700998736, + "grad_norm": 1.438395619392395, + "learning_rate": 9.846815052003723e-06, + "loss": 0.6943, + "step": 4040 + }, + { + "epoch": 0.19264414940528687, + "grad_norm": 2.263612985610962, + "learning_rate": 9.842887552000746e-06, + "loss": 0.6922, + "step": 4041 + }, + { + "epoch": 0.19269182180058636, + "grad_norm": 
1.5974233150482178, + "learning_rate": 9.838960076238604e-06, + "loss": 0.7394, + "step": 4042 + }, + { + "epoch": 0.19273949419588587, + "grad_norm": 1.368720531463623, + "learning_rate": 9.835032625323265e-06, + "loss": 0.788, + "step": 4043 + }, + { + "epoch": 0.19278716659118536, + "grad_norm": 1.6274160146713257, + "learning_rate": 9.83110519986069e-06, + "loss": 0.6309, + "step": 4044 + }, + { + "epoch": 0.19283483898648487, + "grad_norm": 1.827691912651062, + "learning_rate": 9.827177800456843e-06, + "loss": 1.0083, + "step": 4045 + }, + { + "epoch": 0.1928825113817844, + "grad_norm": 1.566077470779419, + "learning_rate": 9.823250427717687e-06, + "loss": 0.7521, + "step": 4046 + }, + { + "epoch": 0.19293018377708387, + "grad_norm": 1.1719876527786255, + "learning_rate": 9.819323082249165e-06, + "loss": 0.6426, + "step": 4047 + }, + { + "epoch": 0.1929778561723834, + "grad_norm": 0.6636733412742615, + "learning_rate": 9.81539576465723e-06, + "loss": 0.4049, + "step": 4048 + }, + { + "epoch": 0.19302552856768287, + "grad_norm": 2.6347463130950928, + "learning_rate": 9.811468475547832e-06, + "loss": 0.7113, + "step": 4049 + }, + { + "epoch": 0.1930732009629824, + "grad_norm": 2.6905677318573, + "learning_rate": 9.807541215526906e-06, + "loss": 0.9128, + "step": 4050 + }, + { + "epoch": 0.19312087335828187, + "grad_norm": 1.991546869277954, + "learning_rate": 9.803613985200385e-06, + "loss": 0.6767, + "step": 4051 + }, + { + "epoch": 0.1931685457535814, + "grad_norm": 1.0742383003234863, + "learning_rate": 9.799686785174208e-06, + "loss": 0.6039, + "step": 4052 + }, + { + "epoch": 0.1932162181488809, + "grad_norm": 1.293224573135376, + "learning_rate": 9.795759616054293e-06, + "loss": 0.4643, + "step": 4053 + }, + { + "epoch": 0.1932638905441804, + "grad_norm": 1.7305622100830078, + "learning_rate": 9.791832478446566e-06, + "loss": 0.5691, + "step": 4054 + }, + { + "epoch": 0.1933115629394799, + "grad_norm": 2.7719335556030273, + "learning_rate": 9.787905372956947e-06, + "loss": 0.9588, + "step": 4055 + }, + { + "epoch": 0.1933592353347794, + "grad_norm": 1.6359543800354004, + "learning_rate": 9.783978300191343e-06, + "loss": 0.8012, + "step": 4056 + }, + { + "epoch": 0.1934069077300789, + "grad_norm": 1.640873670578003, + "learning_rate": 9.780051260755663e-06, + "loss": 1.0023, + "step": 4057 + }, + { + "epoch": 0.1934545801253784, + "grad_norm": 1.903261661529541, + "learning_rate": 9.776124255255808e-06, + "loss": 0.7936, + "step": 4058 + }, + { + "epoch": 0.1935022525206779, + "grad_norm": 1.6489269733428955, + "learning_rate": 9.772197284297677e-06, + "loss": 0.8606, + "step": 4059 + }, + { + "epoch": 0.19354992491597742, + "grad_norm": 2.577099323272705, + "learning_rate": 9.768270348487156e-06, + "loss": 1.0964, + "step": 4060 + }, + { + "epoch": 0.1935975973112769, + "grad_norm": 2.1362030506134033, + "learning_rate": 9.764343448430132e-06, + "loss": 0.2655, + "step": 4061 + }, + { + "epoch": 0.19364526970657642, + "grad_norm": 1.6332634687423706, + "learning_rate": 9.760416584732494e-06, + "loss": 0.5944, + "step": 4062 + }, + { + "epoch": 0.1936929421018759, + "grad_norm": 2.6982839107513428, + "learning_rate": 9.756489758000105e-06, + "loss": 0.9662, + "step": 4063 + }, + { + "epoch": 0.19374061449717542, + "grad_norm": 2.517205238342285, + "learning_rate": 9.75256296883884e-06, + "loss": 0.7213, + "step": 4064 + }, + { + "epoch": 0.1937882868924749, + "grad_norm": 2.0091564655303955, + "learning_rate": 9.748636217854562e-06, + "loss": 0.7532, + "step": 4065 + }, + { + 
"epoch": 0.19383595928777442, + "grad_norm": 2.6380233764648438, + "learning_rate": 9.744709505653126e-06, + "loss": 0.8532, + "step": 4066 + }, + { + "epoch": 0.1938836316830739, + "grad_norm": 2.9615659713745117, + "learning_rate": 9.740782832840382e-06, + "loss": 1.0955, + "step": 4067 + }, + { + "epoch": 0.19393130407837342, + "grad_norm": 1.9009617567062378, + "learning_rate": 9.736856200022182e-06, + "loss": 0.9071, + "step": 4068 + }, + { + "epoch": 0.19397897647367293, + "grad_norm": 2.4427125453948975, + "learning_rate": 9.732929607804357e-06, + "loss": 1.1511, + "step": 4069 + }, + { + "epoch": 0.19402664886897242, + "grad_norm": 1.3123414516448975, + "learning_rate": 9.729003056792742e-06, + "loss": 0.7402, + "step": 4070 + }, + { + "epoch": 0.19407432126427193, + "grad_norm": 1.9339609146118164, + "learning_rate": 9.72507654759317e-06, + "loss": 0.6079, + "step": 4071 + }, + { + "epoch": 0.19412199365957142, + "grad_norm": 1.584667682647705, + "learning_rate": 9.721150080811452e-06, + "loss": 0.9517, + "step": 4072 + }, + { + "epoch": 0.19416966605487093, + "grad_norm": 1.3803532123565674, + "learning_rate": 9.717223657053403e-06, + "loss": 0.8105, + "step": 4073 + }, + { + "epoch": 0.19421733845017042, + "grad_norm": 1.4207947254180908, + "learning_rate": 9.713297276924838e-06, + "loss": 0.8498, + "step": 4074 + }, + { + "epoch": 0.19426501084546993, + "grad_norm": 1.5667606592178345, + "learning_rate": 9.709370941031544e-06, + "loss": 0.5529, + "step": 4075 + }, + { + "epoch": 0.19431268324076945, + "grad_norm": 2.121001720428467, + "learning_rate": 9.705444649979322e-06, + "loss": 0.6464, + "step": 4076 + }, + { + "epoch": 0.19436035563606893, + "grad_norm": 3.191556215286255, + "learning_rate": 9.701518404373962e-06, + "loss": 0.8502, + "step": 4077 + }, + { + "epoch": 0.19440802803136845, + "grad_norm": 2.538217067718506, + "learning_rate": 9.697592204821233e-06, + "loss": 1.1788, + "step": 4078 + }, + { + "epoch": 0.19445570042666793, + "grad_norm": 3.672064781188965, + "learning_rate": 9.693666051926915e-06, + "loss": 1.0213, + "step": 4079 + }, + { + "epoch": 0.19450337282196745, + "grad_norm": 5.480026721954346, + "learning_rate": 9.689739946296772e-06, + "loss": 0.5324, + "step": 4080 + }, + { + "epoch": 0.19455104521726693, + "grad_norm": 2.3922767639160156, + "learning_rate": 9.685813888536559e-06, + "loss": 0.607, + "step": 4081 + }, + { + "epoch": 0.19459871761256645, + "grad_norm": 1.8322621583938599, + "learning_rate": 9.681887879252025e-06, + "loss": 0.9569, + "step": 4082 + }, + { + "epoch": 0.19464639000786593, + "grad_norm": 3.0608551502227783, + "learning_rate": 9.67796191904892e-06, + "loss": 0.7135, + "step": 4083 + }, + { + "epoch": 0.19469406240316545, + "grad_norm": 1.4460397958755493, + "learning_rate": 9.67403600853297e-06, + "loss": 0.7031, + "step": 4084 + }, + { + "epoch": 0.19474173479846496, + "grad_norm": 3.0253090858459473, + "learning_rate": 9.670110148309907e-06, + "loss": 1.0597, + "step": 4085 + }, + { + "epoch": 0.19478940719376445, + "grad_norm": 1.6821322441101074, + "learning_rate": 9.666184338985456e-06, + "loss": 0.4781, + "step": 4086 + }, + { + "epoch": 0.19483707958906396, + "grad_norm": 2.3137426376342773, + "learning_rate": 9.66225858116532e-06, + "loss": 0.7536, + "step": 4087 + }, + { + "epoch": 0.19488475198436345, + "grad_norm": 1.8267980813980103, + "learning_rate": 9.658332875455207e-06, + "loss": 0.5195, + "step": 4088 + }, + { + "epoch": 0.19493242437966296, + "grad_norm": 1.5063681602478027, + "learning_rate": 
9.654407222460816e-06, + "loss": 0.8574, + "step": 4089 + }, + { + "epoch": 0.19498009677496245, + "grad_norm": 3.0024847984313965, + "learning_rate": 9.650481622787829e-06, + "loss": 1.3545, + "step": 4090 + }, + { + "epoch": 0.19502776917026196, + "grad_norm": 1.8751912117004395, + "learning_rate": 9.646556077041925e-06, + "loss": 0.689, + "step": 4091 + }, + { + "epoch": 0.19507544156556147, + "grad_norm": 1.4176849126815796, + "learning_rate": 9.642630585828785e-06, + "loss": 0.659, + "step": 4092 + }, + { + "epoch": 0.19512311396086096, + "grad_norm": 1.907494068145752, + "learning_rate": 9.638705149754061e-06, + "loss": 0.6609, + "step": 4093 + }, + { + "epoch": 0.19517078635616047, + "grad_norm": 1.0732585191726685, + "learning_rate": 9.634779769423412e-06, + "loss": 0.5324, + "step": 4094 + }, + { + "epoch": 0.19521845875145996, + "grad_norm": 1.7960597276687622, + "learning_rate": 9.630854445442486e-06, + "loss": 0.5091, + "step": 4095 + }, + { + "epoch": 0.19526613114675948, + "grad_norm": 2.283451795578003, + "learning_rate": 9.626929178416918e-06, + "loss": 0.4262, + "step": 4096 + }, + { + "epoch": 0.19531380354205896, + "grad_norm": 1.4824867248535156, + "learning_rate": 9.623003968952331e-06, + "loss": 0.7516, + "step": 4097 + }, + { + "epoch": 0.19536147593735848, + "grad_norm": 1.1608043909072876, + "learning_rate": 9.619078817654352e-06, + "loss": 0.4951, + "step": 4098 + }, + { + "epoch": 0.19540914833265796, + "grad_norm": 1.246159315109253, + "learning_rate": 9.615153725128593e-06, + "loss": 0.6873, + "step": 4099 + }, + { + "epoch": 0.19545682072795748, + "grad_norm": 2.472771406173706, + "learning_rate": 9.611228691980644e-06, + "loss": 0.9136, + "step": 4100 + }, + { + "epoch": 0.195504493123257, + "grad_norm": 2.861677646636963, + "learning_rate": 9.607303718816108e-06, + "loss": 1.241, + "step": 4101 + }, + { + "epoch": 0.19555216551855648, + "grad_norm": 3.6714723110198975, + "learning_rate": 9.603378806240564e-06, + "loss": 0.8348, + "step": 4102 + }, + { + "epoch": 0.195599837913856, + "grad_norm": 1.849320888519287, + "learning_rate": 9.599453954859586e-06, + "loss": 0.6496, + "step": 4103 + }, + { + "epoch": 0.19564751030915548, + "grad_norm": 0.9038025736808777, + "learning_rate": 9.595529165278736e-06, + "loss": 0.3049, + "step": 4104 + }, + { + "epoch": 0.195695182704455, + "grad_norm": 1.3686846494674683, + "learning_rate": 9.591604438103574e-06, + "loss": 0.7059, + "step": 4105 + }, + { + "epoch": 0.19574285509975448, + "grad_norm": 1.4731411933898926, + "learning_rate": 9.587679773939637e-06, + "loss": 0.629, + "step": 4106 + }, + { + "epoch": 0.195790527495054, + "grad_norm": 2.0155394077301025, + "learning_rate": 9.583755173392467e-06, + "loss": 0.9051, + "step": 4107 + }, + { + "epoch": 0.1958381998903535, + "grad_norm": 1.195552945137024, + "learning_rate": 9.57983063706759e-06, + "loss": 0.7321, + "step": 4108 + }, + { + "epoch": 0.195885872285653, + "grad_norm": 2.286600112915039, + "learning_rate": 9.575906165570515e-06, + "loss": 0.5508, + "step": 4109 + }, + { + "epoch": 0.1959335446809525, + "grad_norm": 2.5784952640533447, + "learning_rate": 9.571981759506753e-06, + "loss": 0.4551, + "step": 4110 + }, + { + "epoch": 0.195981217076252, + "grad_norm": 1.0925605297088623, + "learning_rate": 9.5680574194818e-06, + "loss": 0.6314, + "step": 4111 + }, + { + "epoch": 0.1960288894715515, + "grad_norm": 1.4741684198379517, + "learning_rate": 9.564133146101134e-06, + "loss": 0.7571, + "step": 4112 + }, + { + "epoch": 0.196076561866851, + "grad_norm": 
2.2106049060821533, + "learning_rate": 9.560208939970236e-06, + "loss": 0.5526, + "step": 4113 + }, + { + "epoch": 0.1961242342621505, + "grad_norm": 1.1798970699310303, + "learning_rate": 9.556284801694573e-06, + "loss": 0.5691, + "step": 4114 + }, + { + "epoch": 0.19617190665745, + "grad_norm": 1.6515107154846191, + "learning_rate": 9.552360731879593e-06, + "loss": 0.5732, + "step": 4115 + }, + { + "epoch": 0.1962195790527495, + "grad_norm": 2.0230700969696045, + "learning_rate": 9.54843673113074e-06, + "loss": 0.9739, + "step": 4116 + }, + { + "epoch": 0.19626725144804902, + "grad_norm": 1.4770721197128296, + "learning_rate": 9.544512800053457e-06, + "loss": 0.7404, + "step": 4117 + }, + { + "epoch": 0.1963149238433485, + "grad_norm": 1.1223185062408447, + "learning_rate": 9.540588939253153e-06, + "loss": 0.4837, + "step": 4118 + }, + { + "epoch": 0.19636259623864802, + "grad_norm": 3.519036293029785, + "learning_rate": 9.536665149335245e-06, + "loss": 0.4448, + "step": 4119 + }, + { + "epoch": 0.1964102686339475, + "grad_norm": 1.6042251586914062, + "learning_rate": 9.532741430905135e-06, + "loss": 0.5796, + "step": 4120 + }, + { + "epoch": 0.19645794102924702, + "grad_norm": 1.1171796321868896, + "learning_rate": 9.528817784568207e-06, + "loss": 0.5406, + "step": 4121 + }, + { + "epoch": 0.1965056134245465, + "grad_norm": 1.8708196878433228, + "learning_rate": 9.524894210929843e-06, + "loss": 0.4985, + "step": 4122 + }, + { + "epoch": 0.19655328581984602, + "grad_norm": 1.24036705493927, + "learning_rate": 9.520970710595413e-06, + "loss": 0.8731, + "step": 4123 + }, + { + "epoch": 0.19660095821514553, + "grad_norm": 1.369493842124939, + "learning_rate": 9.517047284170266e-06, + "loss": 0.8139, + "step": 4124 + }, + { + "epoch": 0.19664863061044502, + "grad_norm": 1.6081372499465942, + "learning_rate": 9.51312393225975e-06, + "loss": 0.9162, + "step": 4125 + }, + { + "epoch": 0.19669630300574453, + "grad_norm": 1.2288099527359009, + "learning_rate": 9.509200655469201e-06, + "loss": 0.6058, + "step": 4126 + }, + { + "epoch": 0.19674397540104402, + "grad_norm": 2.0774357318878174, + "learning_rate": 9.505277454403932e-06, + "loss": 1.003, + "step": 4127 + }, + { + "epoch": 0.19679164779634353, + "grad_norm": 1.7042399644851685, + "learning_rate": 9.501354329669258e-06, + "loss": 0.8103, + "step": 4128 + }, + { + "epoch": 0.19683932019164302, + "grad_norm": 1.418620228767395, + "learning_rate": 9.497431281870479e-06, + "loss": 0.5618, + "step": 4129 + }, + { + "epoch": 0.19688699258694253, + "grad_norm": 2.2100532054901123, + "learning_rate": 9.493508311612874e-06, + "loss": 1.011, + "step": 4130 + }, + { + "epoch": 0.19693466498224202, + "grad_norm": 1.827114224433899, + "learning_rate": 9.48958541950172e-06, + "loss": 0.732, + "step": 4131 + }, + { + "epoch": 0.19698233737754153, + "grad_norm": 2.7010011672973633, + "learning_rate": 9.485662606142285e-06, + "loss": 0.4864, + "step": 4132 + }, + { + "epoch": 0.19703000977284105, + "grad_norm": 1.5265750885009766, + "learning_rate": 9.48173987213981e-06, + "loss": 0.6267, + "step": 4133 + }, + { + "epoch": 0.19707768216814053, + "grad_norm": 1.00454580783844, + "learning_rate": 9.477817218099535e-06, + "loss": 0.6996, + "step": 4134 + }, + { + "epoch": 0.19712535456344005, + "grad_norm": 1.9903172254562378, + "learning_rate": 9.473894644626684e-06, + "loss": 0.8525, + "step": 4135 + }, + { + "epoch": 0.19717302695873953, + "grad_norm": 1.7862969636917114, + "learning_rate": 9.469972152326476e-06, + "loss": 0.4841, + "step": 4136 + }, + { 
+ "epoch": 0.19722069935403905, + "grad_norm": 1.4141737222671509, + "learning_rate": 9.466049741804104e-06, + "loss": 0.4438, + "step": 4137 + }, + { + "epoch": 0.19726837174933853, + "grad_norm": 2.1943376064300537, + "learning_rate": 9.462127413664756e-06, + "loss": 1.4865, + "step": 4138 + }, + { + "epoch": 0.19731604414463805, + "grad_norm": 1.1153146028518677, + "learning_rate": 9.458205168513616e-06, + "loss": 0.2822, + "step": 4139 + }, + { + "epoch": 0.19736371653993756, + "grad_norm": 1.5740097761154175, + "learning_rate": 9.454283006955835e-06, + "loss": 0.5132, + "step": 4140 + }, + { + "epoch": 0.19741138893523705, + "grad_norm": 1.9186357259750366, + "learning_rate": 9.450360929596565e-06, + "loss": 1.0364, + "step": 4141 + }, + { + "epoch": 0.19745906133053656, + "grad_norm": 0.9910565614700317, + "learning_rate": 9.446438937040947e-06, + "loss": 0.6058, + "step": 4142 + }, + { + "epoch": 0.19750673372583605, + "grad_norm": 1.4343262910842896, + "learning_rate": 9.442517029894096e-06, + "loss": 0.3146, + "step": 4143 + }, + { + "epoch": 0.19755440612113556, + "grad_norm": 1.9917984008789062, + "learning_rate": 9.438595208761127e-06, + "loss": 0.2106, + "step": 4144 + }, + { + "epoch": 0.19760207851643505, + "grad_norm": 1.7350119352340698, + "learning_rate": 9.43467347424714e-06, + "loss": 0.6625, + "step": 4145 + }, + { + "epoch": 0.19764975091173456, + "grad_norm": 1.7454272508621216, + "learning_rate": 9.43075182695721e-06, + "loss": 0.7095, + "step": 4146 + }, + { + "epoch": 0.19769742330703405, + "grad_norm": 1.7070093154907227, + "learning_rate": 9.426830267496411e-06, + "loss": 0.6668, + "step": 4147 + }, + { + "epoch": 0.19774509570233356, + "grad_norm": 1.5751521587371826, + "learning_rate": 9.422908796469804e-06, + "loss": 0.493, + "step": 4148 + }, + { + "epoch": 0.19779276809763308, + "grad_norm": 57.8436393737793, + "learning_rate": 9.418987414482422e-06, + "loss": 0.624, + "step": 4149 + }, + { + "epoch": 0.19784044049293256, + "grad_norm": 1.2226852178573608, + "learning_rate": 9.415066122139298e-06, + "loss": 0.851, + "step": 4150 + }, + { + "epoch": 0.19788811288823208, + "grad_norm": 1.3573896884918213, + "learning_rate": 9.411144920045453e-06, + "loss": 0.4827, + "step": 4151 + }, + { + "epoch": 0.19793578528353156, + "grad_norm": 1.973682165145874, + "learning_rate": 9.407223808805878e-06, + "loss": 0.6924, + "step": 4152 + }, + { + "epoch": 0.19798345767883108, + "grad_norm": 3.305941104888916, + "learning_rate": 9.403302789025565e-06, + "loss": 0.7083, + "step": 4153 + }, + { + "epoch": 0.19803113007413056, + "grad_norm": 2.532428026199341, + "learning_rate": 9.399381861309491e-06, + "loss": 0.8382, + "step": 4154 + }, + { + "epoch": 0.19807880246943008, + "grad_norm": 1.5662442445755005, + "learning_rate": 9.395461026262607e-06, + "loss": 0.8271, + "step": 4155 + }, + { + "epoch": 0.1981264748647296, + "grad_norm": 3.439723491668701, + "learning_rate": 9.391540284489862e-06, + "loss": 0.3782, + "step": 4156 + }, + { + "epoch": 0.19817414726002908, + "grad_norm": 3.2685232162475586, + "learning_rate": 9.387619636596189e-06, + "loss": 0.9667, + "step": 4157 + }, + { + "epoch": 0.1982218196553286, + "grad_norm": 1.0390326976776123, + "learning_rate": 9.383699083186493e-06, + "loss": 0.571, + "step": 4158 + }, + { + "epoch": 0.19826949205062808, + "grad_norm": 1.4027167558670044, + "learning_rate": 9.379778624865683e-06, + "loss": 0.895, + "step": 4159 + }, + { + "epoch": 0.1983171644459276, + "grad_norm": 2.9695451259613037, + "learning_rate": 
9.375858262238649e-06, + "loss": 0.491, + "step": 4160 + }, + { + "epoch": 0.19836483684122708, + "grad_norm": 1.326710820198059, + "learning_rate": 9.371937995910254e-06, + "loss": 0.7099, + "step": 4161 + }, + { + "epoch": 0.1984125092365266, + "grad_norm": 2.1169679164886475, + "learning_rate": 9.368017826485358e-06, + "loss": 1.0591, + "step": 4162 + }, + { + "epoch": 0.1984601816318261, + "grad_norm": 1.3265042304992676, + "learning_rate": 9.364097754568805e-06, + "loss": 0.6713, + "step": 4163 + }, + { + "epoch": 0.1985078540271256, + "grad_norm": 1.2513728141784668, + "learning_rate": 9.36017778076542e-06, + "loss": 0.4646, + "step": 4164 + }, + { + "epoch": 0.1985555264224251, + "grad_norm": 1.305857539176941, + "learning_rate": 9.356257905680012e-06, + "loss": 0.8509, + "step": 4165 + }, + { + "epoch": 0.1986031988177246, + "grad_norm": 1.4715200662612915, + "learning_rate": 9.352338129917384e-06, + "loss": 0.5577, + "step": 4166 + }, + { + "epoch": 0.1986508712130241, + "grad_norm": 3.029110908508301, + "learning_rate": 9.348418454082309e-06, + "loss": 0.5355, + "step": 4167 + }, + { + "epoch": 0.1986985436083236, + "grad_norm": 1.5193862915039062, + "learning_rate": 9.344498878779557e-06, + "loss": 0.3891, + "step": 4168 + }, + { + "epoch": 0.1987462160036231, + "grad_norm": 8.111202239990234, + "learning_rate": 9.34057940461388e-06, + "loss": 1.685, + "step": 4169 + }, + { + "epoch": 0.1987938883989226, + "grad_norm": 2.2644665241241455, + "learning_rate": 9.336660032190012e-06, + "loss": 0.4843, + "step": 4170 + }, + { + "epoch": 0.1988415607942221, + "grad_norm": 5.7227983474731445, + "learning_rate": 9.332740762112664e-06, + "loss": 1.6123, + "step": 4171 + }, + { + "epoch": 0.19888923318952162, + "grad_norm": 1.6429567337036133, + "learning_rate": 9.32882159498655e-06, + "loss": 0.2885, + "step": 4172 + }, + { + "epoch": 0.1989369055848211, + "grad_norm": 2.43394136428833, + "learning_rate": 9.324902531416348e-06, + "loss": 1.0933, + "step": 4173 + }, + { + "epoch": 0.19898457798012062, + "grad_norm": 1.2635501623153687, + "learning_rate": 9.320983572006734e-06, + "loss": 0.6361, + "step": 4174 + }, + { + "epoch": 0.1990322503754201, + "grad_norm": 5.41809606552124, + "learning_rate": 9.317064717362363e-06, + "loss": 0.5659, + "step": 4175 + }, + { + "epoch": 0.19907992277071962, + "grad_norm": 2.5245330333709717, + "learning_rate": 9.313145968087876e-06, + "loss": 0.7528, + "step": 4176 + }, + { + "epoch": 0.1991275951660191, + "grad_norm": 1.6297564506530762, + "learning_rate": 9.309227324787892e-06, + "loss": 0.7049, + "step": 4177 + }, + { + "epoch": 0.19917526756131862, + "grad_norm": 1.8300691843032837, + "learning_rate": 9.305308788067015e-06, + "loss": 0.6746, + "step": 4178 + }, + { + "epoch": 0.19922293995661813, + "grad_norm": 2.0935540199279785, + "learning_rate": 9.301390358529842e-06, + "loss": 0.6619, + "step": 4179 + }, + { + "epoch": 0.19927061235191762, + "grad_norm": 1.3441247940063477, + "learning_rate": 9.297472036780939e-06, + "loss": 0.6582, + "step": 4180 + }, + { + "epoch": 0.19931828474721713, + "grad_norm": 5.757411479949951, + "learning_rate": 9.293553823424865e-06, + "loss": 0.8623, + "step": 4181 + }, + { + "epoch": 0.19936595714251662, + "grad_norm": 1.7407352924346924, + "learning_rate": 9.289635719066166e-06, + "loss": 0.756, + "step": 4182 + }, + { + "epoch": 0.19941362953781613, + "grad_norm": 2.226282835006714, + "learning_rate": 9.285717724309357e-06, + "loss": 0.751, + "step": 4183 + }, + { + "epoch": 0.19946130193311562, + "grad_norm": 
1.9448678493499756, + "learning_rate": 9.281799839758949e-06, + "loss": 1.0263, + "step": 4184 + }, + { + "epoch": 0.19950897432841513, + "grad_norm": 2.1662967205047607, + "learning_rate": 9.277882066019429e-06, + "loss": 0.78, + "step": 4185 + }, + { + "epoch": 0.19955664672371462, + "grad_norm": 1.1644023656845093, + "learning_rate": 9.27396440369527e-06, + "loss": 0.7828, + "step": 4186 + }, + { + "epoch": 0.19960431911901413, + "grad_norm": 0.9203680753707886, + "learning_rate": 9.270046853390924e-06, + "loss": 0.1869, + "step": 4187 + }, + { + "epoch": 0.19965199151431365, + "grad_norm": 1.6379756927490234, + "learning_rate": 9.266129415710837e-06, + "loss": 0.8028, + "step": 4188 + }, + { + "epoch": 0.19969966390961313, + "grad_norm": 1.378438949584961, + "learning_rate": 9.26221209125942e-06, + "loss": 1.0276, + "step": 4189 + }, + { + "epoch": 0.19974733630491265, + "grad_norm": 2.292485475540161, + "learning_rate": 9.258294880641078e-06, + "loss": 0.9434, + "step": 4190 + }, + { + "epoch": 0.19979500870021213, + "grad_norm": 1.6815050840377808, + "learning_rate": 9.254377784460202e-06, + "loss": 1.0238, + "step": 4191 + }, + { + "epoch": 0.19984268109551165, + "grad_norm": 1.4740095138549805, + "learning_rate": 9.250460803321156e-06, + "loss": 0.5136, + "step": 4192 + }, + { + "epoch": 0.19989035349081113, + "grad_norm": 2.781682014465332, + "learning_rate": 9.246543937828284e-06, + "loss": 1.2299, + "step": 4193 + }, + { + "epoch": 0.19993802588611065, + "grad_norm": 1.1177964210510254, + "learning_rate": 9.242627188585928e-06, + "loss": 0.6477, + "step": 4194 + }, + { + "epoch": 0.19998569828141016, + "grad_norm": 3.1113359928131104, + "learning_rate": 9.238710556198395e-06, + "loss": 1.2746, + "step": 4195 + }, + { + "epoch": 0.20003337067670965, + "grad_norm": 2.0119636058807373, + "learning_rate": 9.234794041269982e-06, + "loss": 0.841, + "step": 4196 + }, + { + "epoch": 0.20008104307200916, + "grad_norm": 3.1193697452545166, + "learning_rate": 9.230877644404974e-06, + "loss": 1.4355, + "step": 4197 + }, + { + "epoch": 0.20012871546730865, + "grad_norm": 1.9395185708999634, + "learning_rate": 9.226961366207619e-06, + "loss": 0.7507, + "step": 4198 + }, + { + "epoch": 0.20017638786260816, + "grad_norm": 5.962135314941406, + "learning_rate": 9.223045207282167e-06, + "loss": 0.5572, + "step": 4199 + }, + { + "epoch": 0.20022406025790765, + "grad_norm": 1.622602105140686, + "learning_rate": 9.21912916823284e-06, + "loss": 1.0705, + "step": 4200 + }, + { + "epoch": 0.20027173265320716, + "grad_norm": 2.316446542739868, + "learning_rate": 9.215213249663839e-06, + "loss": 0.8494, + "step": 4201 + }, + { + "epoch": 0.20031940504850665, + "grad_norm": 1.837378978729248, + "learning_rate": 9.211297452179348e-06, + "loss": 0.849, + "step": 4202 + }, + { + "epoch": 0.20036707744380616, + "grad_norm": 1.3284859657287598, + "learning_rate": 9.207381776383546e-06, + "loss": 0.7386, + "step": 4203 + }, + { + "epoch": 0.20041474983910568, + "grad_norm": 2.2446868419647217, + "learning_rate": 9.203466222880567e-06, + "loss": 0.6882, + "step": 4204 + }, + { + "epoch": 0.20046242223440516, + "grad_norm": 1.2276482582092285, + "learning_rate": 9.199550792274548e-06, + "loss": 0.3691, + "step": 4205 + }, + { + "epoch": 0.20051009462970468, + "grad_norm": 0.7257246375083923, + "learning_rate": 9.195635485169604e-06, + "loss": 0.222, + "step": 4206 + }, + { + "epoch": 0.20055776702500416, + "grad_norm": 3.1366653442382812, + "learning_rate": 9.191720302169815e-06, + "loss": 0.5044, + "step": 4207 + 
}, + { + "epoch": 0.20060543942030368, + "grad_norm": 2.124419927597046, + "learning_rate": 9.187805243879263e-06, + "loss": 0.8913, + "step": 4208 + }, + { + "epoch": 0.20065311181560316, + "grad_norm": 1.7044965028762817, + "learning_rate": 9.183890310902001e-06, + "loss": 0.8052, + "step": 4209 + }, + { + "epoch": 0.20070078421090268, + "grad_norm": 1.2684754133224487, + "learning_rate": 9.179975503842053e-06, + "loss": 0.8218, + "step": 4210 + }, + { + "epoch": 0.2007484566062022, + "grad_norm": 1.7904325723648071, + "learning_rate": 9.176060823303442e-06, + "loss": 0.7492, + "step": 4211 + }, + { + "epoch": 0.20079612900150168, + "grad_norm": 1.4300603866577148, + "learning_rate": 9.17214626989016e-06, + "loss": 0.7673, + "step": 4212 + }, + { + "epoch": 0.2008438013968012, + "grad_norm": 1.4715248346328735, + "learning_rate": 9.168231844206188e-06, + "loss": 0.6354, + "step": 4213 + }, + { + "epoch": 0.20089147379210068, + "grad_norm": 1.2048358917236328, + "learning_rate": 9.164317546855475e-06, + "loss": 0.4515, + "step": 4214 + }, + { + "epoch": 0.2009391461874002, + "grad_norm": 1.5780590772628784, + "learning_rate": 9.160403378441957e-06, + "loss": 0.7401, + "step": 4215 + }, + { + "epoch": 0.20098681858269968, + "grad_norm": 1.7281519174575806, + "learning_rate": 9.156489339569555e-06, + "loss": 0.5638, + "step": 4216 + }, + { + "epoch": 0.2010344909779992, + "grad_norm": 1.282120943069458, + "learning_rate": 9.152575430842156e-06, + "loss": 0.8457, + "step": 4217 + }, + { + "epoch": 0.20108216337329868, + "grad_norm": 1.8585940599441528, + "learning_rate": 9.148661652863644e-06, + "loss": 0.6463, + "step": 4218 + }, + { + "epoch": 0.2011298357685982, + "grad_norm": 1.7240244150161743, + "learning_rate": 9.144748006237873e-06, + "loss": 0.6968, + "step": 4219 + }, + { + "epoch": 0.2011775081638977, + "grad_norm": 2.5056512355804443, + "learning_rate": 9.140834491568675e-06, + "loss": 1.1388, + "step": 4220 + }, + { + "epoch": 0.2012251805591972, + "grad_norm": 1.2405264377593994, + "learning_rate": 9.136921109459869e-06, + "loss": 0.7214, + "step": 4221 + }, + { + "epoch": 0.2012728529544967, + "grad_norm": 2.4351043701171875, + "learning_rate": 9.133007860515248e-06, + "loss": 0.9342, + "step": 4222 + }, + { + "epoch": 0.2013205253497962, + "grad_norm": 7.069313049316406, + "learning_rate": 9.129094745338586e-06, + "loss": 0.4693, + "step": 4223 + }, + { + "epoch": 0.2013681977450957, + "grad_norm": 1.1319421529769897, + "learning_rate": 9.125181764533632e-06, + "loss": 0.8064, + "step": 4224 + }, + { + "epoch": 0.2014158701403952, + "grad_norm": 1.2453668117523193, + "learning_rate": 9.12126891870413e-06, + "loss": 0.5035, + "step": 4225 + }, + { + "epoch": 0.2014635425356947, + "grad_norm": 1.1166303157806396, + "learning_rate": 9.11735620845378e-06, + "loss": 0.4722, + "step": 4226 + }, + { + "epoch": 0.20151121493099422, + "grad_norm": 1.8650314807891846, + "learning_rate": 9.113443634386277e-06, + "loss": 0.9398, + "step": 4227 + }, + { + "epoch": 0.2015588873262937, + "grad_norm": 1.099266529083252, + "learning_rate": 9.109531197105295e-06, + "loss": 0.7001, + "step": 4228 + }, + { + "epoch": 0.20160655972159322, + "grad_norm": 1.1981068849563599, + "learning_rate": 9.105618897214475e-06, + "loss": 0.7549, + "step": 4229 + }, + { + "epoch": 0.2016542321168927, + "grad_norm": 2.5754318237304688, + "learning_rate": 9.101706735317451e-06, + "loss": 0.6851, + "step": 4230 + }, + { + "epoch": 0.20170190451219222, + "grad_norm": 1.9271581172943115, + "learning_rate": 
9.09779471201783e-06, + "loss": 0.6861, + "step": 4231 + }, + { + "epoch": 0.2017495769074917, + "grad_norm": 1.414303183555603, + "learning_rate": 9.09388282791919e-06, + "loss": 0.6399, + "step": 4232 + }, + { + "epoch": 0.20179724930279122, + "grad_norm": 1.3563759326934814, + "learning_rate": 9.089971083625098e-06, + "loss": 0.7508, + "step": 4233 + }, + { + "epoch": 0.2018449216980907, + "grad_norm": 1.411031723022461, + "learning_rate": 9.086059479739099e-06, + "loss": 0.9758, + "step": 4234 + }, + { + "epoch": 0.20189259409339022, + "grad_norm": 1.3385288715362549, + "learning_rate": 9.08214801686471e-06, + "loss": 0.6474, + "step": 4235 + }, + { + "epoch": 0.20194026648868973, + "grad_norm": 1.3260201215744019, + "learning_rate": 9.078236695605426e-06, + "loss": 0.7197, + "step": 4236 + }, + { + "epoch": 0.20198793888398922, + "grad_norm": 1.3642252683639526, + "learning_rate": 9.074325516564734e-06, + "loss": 0.5181, + "step": 4237 + }, + { + "epoch": 0.20203561127928873, + "grad_norm": 7.07196044921875, + "learning_rate": 9.07041448034608e-06, + "loss": 0.4042, + "step": 4238 + }, + { + "epoch": 0.20208328367458822, + "grad_norm": 0.9440353512763977, + "learning_rate": 9.066503587552895e-06, + "loss": 0.264, + "step": 4239 + }, + { + "epoch": 0.20213095606988774, + "grad_norm": 2.08874773979187, + "learning_rate": 9.0625928387886e-06, + "loss": 0.9922, + "step": 4240 + }, + { + "epoch": 0.20217862846518722, + "grad_norm": 1.2602145671844482, + "learning_rate": 9.05868223465657e-06, + "loss": 0.5823, + "step": 4241 + }, + { + "epoch": 0.20222630086048674, + "grad_norm": 1.8287818431854248, + "learning_rate": 9.054771775760179e-06, + "loss": 0.6635, + "step": 4242 + }, + { + "epoch": 0.20227397325578625, + "grad_norm": 4.951713562011719, + "learning_rate": 9.050861462702772e-06, + "loss": 1.2398, + "step": 4243 + }, + { + "epoch": 0.20232164565108574, + "grad_norm": 1.2496687173843384, + "learning_rate": 9.046951296087664e-06, + "loss": 0.6909, + "step": 4244 + }, + { + "epoch": 0.20236931804638525, + "grad_norm": 1.6422070264816284, + "learning_rate": 9.043041276518158e-06, + "loss": 0.9024, + "step": 4245 + }, + { + "epoch": 0.20241699044168474, + "grad_norm": 2.6905791759490967, + "learning_rate": 9.039131404597531e-06, + "loss": 0.5229, + "step": 4246 + }, + { + "epoch": 0.20246466283698425, + "grad_norm": 1.3785916566848755, + "learning_rate": 9.035221680929028e-06, + "loss": 0.7383, + "step": 4247 + }, + { + "epoch": 0.20251233523228374, + "grad_norm": 4.605517864227295, + "learning_rate": 9.031312106115887e-06, + "loss": 0.7188, + "step": 4248 + }, + { + "epoch": 0.20256000762758325, + "grad_norm": 1.3591018915176392, + "learning_rate": 9.02740268076131e-06, + "loss": 0.6144, + "step": 4249 + }, + { + "epoch": 0.20260768002288276, + "grad_norm": 2.0683789253234863, + "learning_rate": 9.023493405468487e-06, + "loss": 0.7007, + "step": 4250 + }, + { + "epoch": 0.20265535241818225, + "grad_norm": 1.45108163356781, + "learning_rate": 9.019584280840572e-06, + "loss": 0.7456, + "step": 4251 + }, + { + "epoch": 0.20270302481348176, + "grad_norm": 1.5825248956680298, + "learning_rate": 9.01567530748071e-06, + "loss": 0.8712, + "step": 4252 + }, + { + "epoch": 0.20275069720878125, + "grad_norm": 2.3202617168426514, + "learning_rate": 9.011766485992012e-06, + "loss": 0.3998, + "step": 4253 + }, + { + "epoch": 0.20279836960408076, + "grad_norm": 3.4144463539123535, + "learning_rate": 9.007857816977565e-06, + "loss": 1.3134, + "step": 4254 + }, + { + "epoch": 0.20284604199938025, + 
"grad_norm": 2.3063769340515137, + "learning_rate": 9.003949301040439e-06, + "loss": 0.5287, + "step": 4255 + }, + { + "epoch": 0.20289371439467976, + "grad_norm": 1.8982830047607422, + "learning_rate": 9.000040938783681e-06, + "loss": 0.442, + "step": 4256 + }, + { + "epoch": 0.20294138678997925, + "grad_norm": 1.5901358127593994, + "learning_rate": 8.996132730810307e-06, + "loss": 0.8607, + "step": 4257 + }, + { + "epoch": 0.20298905918527876, + "grad_norm": 1.1874439716339111, + "learning_rate": 8.992224677723315e-06, + "loss": 0.5543, + "step": 4258 + }, + { + "epoch": 0.20303673158057828, + "grad_norm": 1.2114481925964355, + "learning_rate": 8.98831678012568e-06, + "loss": 0.4451, + "step": 4259 + }, + { + "epoch": 0.20308440397587776, + "grad_norm": 1.993187665939331, + "learning_rate": 8.984409038620345e-06, + "loss": 0.9458, + "step": 4260 + }, + { + "epoch": 0.20313207637117728, + "grad_norm": 5.413943290710449, + "learning_rate": 8.980501453810237e-06, + "loss": 0.2271, + "step": 4261 + }, + { + "epoch": 0.20317974876647676, + "grad_norm": 1.1722928285598755, + "learning_rate": 8.976594026298257e-06, + "loss": 0.4938, + "step": 4262 + }, + { + "epoch": 0.20322742116177628, + "grad_norm": 1.9649120569229126, + "learning_rate": 8.972686756687278e-06, + "loss": 0.6771, + "step": 4263 + }, + { + "epoch": 0.20327509355707576, + "grad_norm": 1.772072196006775, + "learning_rate": 8.968779645580153e-06, + "loss": 0.7081, + "step": 4264 + }, + { + "epoch": 0.20332276595237528, + "grad_norm": 2.7843313217163086, + "learning_rate": 8.964872693579711e-06, + "loss": 0.7345, + "step": 4265 + }, + { + "epoch": 0.2033704383476748, + "grad_norm": 1.3795090913772583, + "learning_rate": 8.96096590128875e-06, + "loss": 0.9303, + "step": 4266 + }, + { + "epoch": 0.20341811074297428, + "grad_norm": 1.534347653388977, + "learning_rate": 8.957059269310054e-06, + "loss": 0.8567, + "step": 4267 + }, + { + "epoch": 0.2034657831382738, + "grad_norm": 1.4403856992721558, + "learning_rate": 8.953152798246373e-06, + "loss": 0.5977, + "step": 4268 + }, + { + "epoch": 0.20351345553357328, + "grad_norm": 1.167072057723999, + "learning_rate": 8.949246488700431e-06, + "loss": 0.7333, + "step": 4269 + }, + { + "epoch": 0.2035611279288728, + "grad_norm": 1.1575961112976074, + "learning_rate": 8.945340341274934e-06, + "loss": 0.5692, + "step": 4270 + }, + { + "epoch": 0.20360880032417228, + "grad_norm": 3.0814316272735596, + "learning_rate": 8.941434356572566e-06, + "loss": 0.4905, + "step": 4271 + }, + { + "epoch": 0.2036564727194718, + "grad_norm": 1.531286597251892, + "learning_rate": 8.937528535195972e-06, + "loss": 0.664, + "step": 4272 + }, + { + "epoch": 0.20370414511477128, + "grad_norm": 2.40924072265625, + "learning_rate": 8.933622877747784e-06, + "loss": 0.3397, + "step": 4273 + }, + { + "epoch": 0.2037518175100708, + "grad_norm": 0.9527204036712646, + "learning_rate": 8.929717384830609e-06, + "loss": 0.52, + "step": 4274 + }, + { + "epoch": 0.2037994899053703, + "grad_norm": 1.711545467376709, + "learning_rate": 8.925812057047016e-06, + "loss": 0.8525, + "step": 4275 + }, + { + "epoch": 0.2038471623006698, + "grad_norm": 1.5503089427947998, + "learning_rate": 8.92190689499956e-06, + "loss": 0.293, + "step": 4276 + }, + { + "epoch": 0.2038948346959693, + "grad_norm": 1.0609380006790161, + "learning_rate": 8.918001899290771e-06, + "loss": 0.6366, + "step": 4277 + }, + { + "epoch": 0.2039425070912688, + "grad_norm": 5.815759658813477, + "learning_rate": 8.914097070523143e-06, + "loss": 0.3597, + "step": 4278 
+ }, + { + "epoch": 0.2039901794865683, + "grad_norm": 1.903731346130371, + "learning_rate": 8.910192409299154e-06, + "loss": 0.5367, + "step": 4279 + }, + { + "epoch": 0.2040378518818678, + "grad_norm": 1.4493054151535034, + "learning_rate": 8.906287916221259e-06, + "loss": 0.6947, + "step": 4280 + }, + { + "epoch": 0.2040855242771673, + "grad_norm": 3.5800254344940186, + "learning_rate": 8.90238359189187e-06, + "loss": 1.1227, + "step": 4281 + }, + { + "epoch": 0.20413319667246682, + "grad_norm": 1.032273292541504, + "learning_rate": 8.898479436913391e-06, + "loss": 0.6177, + "step": 4282 + }, + { + "epoch": 0.2041808690677663, + "grad_norm": 1.360286831855774, + "learning_rate": 8.894575451888194e-06, + "loss": 0.7827, + "step": 4283 + }, + { + "epoch": 0.20422854146306582, + "grad_norm": 2.164604425430298, + "learning_rate": 8.890671637418619e-06, + "loss": 0.8343, + "step": 4284 + }, + { + "epoch": 0.2042762138583653, + "grad_norm": 1.9245446920394897, + "learning_rate": 8.886767994106984e-06, + "loss": 1.1572, + "step": 4285 + }, + { + "epoch": 0.20432388625366482, + "grad_norm": 1.5803426504135132, + "learning_rate": 8.882864522555588e-06, + "loss": 0.9503, + "step": 4286 + }, + { + "epoch": 0.2043715586489643, + "grad_norm": 1.400686264038086, + "learning_rate": 8.878961223366687e-06, + "loss": 0.8529, + "step": 4287 + }, + { + "epoch": 0.20441923104426382, + "grad_norm": 4.508146286010742, + "learning_rate": 8.875058097142527e-06, + "loss": 0.5259, + "step": 4288 + }, + { + "epoch": 0.2044669034395633, + "grad_norm": 1.9589221477508545, + "learning_rate": 8.87115514448532e-06, + "loss": 0.8199, + "step": 4289 + }, + { + "epoch": 0.20451457583486282, + "grad_norm": 2.8784475326538086, + "learning_rate": 8.867252365997249e-06, + "loss": 0.5785, + "step": 4290 + }, + { + "epoch": 0.20456224823016234, + "grad_norm": 2.0287749767303467, + "learning_rate": 8.86334976228047e-06, + "loss": 0.2931, + "step": 4291 + }, + { + "epoch": 0.20460992062546182, + "grad_norm": 3.9576022624969482, + "learning_rate": 8.859447333937117e-06, + "loss": 2.1746, + "step": 4292 + }, + { + "epoch": 0.20465759302076134, + "grad_norm": 3.2679898738861084, + "learning_rate": 8.8555450815693e-06, + "loss": 0.5383, + "step": 4293 + }, + { + "epoch": 0.20470526541606082, + "grad_norm": 1.2970616817474365, + "learning_rate": 8.851643005779087e-06, + "loss": 0.8373, + "step": 4294 + }, + { + "epoch": 0.20475293781136034, + "grad_norm": 1.9055311679840088, + "learning_rate": 8.847741107168532e-06, + "loss": 0.9646, + "step": 4295 + }, + { + "epoch": 0.20480061020665982, + "grad_norm": 1.735101342201233, + "learning_rate": 8.843839386339662e-06, + "loss": 1.0163, + "step": 4296 + }, + { + "epoch": 0.20484828260195934, + "grad_norm": 1.5794428586959839, + "learning_rate": 8.839937843894466e-06, + "loss": 0.7539, + "step": 4297 + }, + { + "epoch": 0.20489595499725885, + "grad_norm": 1.8066282272338867, + "learning_rate": 8.836036480434914e-06, + "loss": 0.7838, + "step": 4298 + }, + { + "epoch": 0.20494362739255834, + "grad_norm": 1.0301337242126465, + "learning_rate": 8.832135296562949e-06, + "loss": 0.5171, + "step": 4299 + }, + { + "epoch": 0.20499129978785785, + "grad_norm": 1.7273674011230469, + "learning_rate": 8.828234292880479e-06, + "loss": 0.7805, + "step": 4300 + }, + { + "epoch": 0.20503897218315734, + "grad_norm": 1.8719673156738281, + "learning_rate": 8.824333469989388e-06, + "loss": 0.4028, + "step": 4301 + }, + { + "epoch": 0.20508664457845685, + "grad_norm": 1.3849194049835205, + "learning_rate": 
8.820432828491542e-06, + "loss": 0.7234, + "step": 4302 + }, + { + "epoch": 0.20513431697375634, + "grad_norm": 1.609876036643982, + "learning_rate": 8.816532368988758e-06, + "loss": 0.52, + "step": 4303 + }, + { + "epoch": 0.20518198936905585, + "grad_norm": 1.4437907934188843, + "learning_rate": 8.812632092082846e-06, + "loss": 0.8405, + "step": 4304 + }, + { + "epoch": 0.20522966176435534, + "grad_norm": 1.6973559856414795, + "learning_rate": 8.808731998375572e-06, + "loss": 0.6245, + "step": 4305 + }, + { + "epoch": 0.20527733415965485, + "grad_norm": 1.6523698568344116, + "learning_rate": 8.804832088468685e-06, + "loss": 0.7299, + "step": 4306 + }, + { + "epoch": 0.20532500655495436, + "grad_norm": 1.1694504022598267, + "learning_rate": 8.800932362963896e-06, + "loss": 0.7143, + "step": 4307 + }, + { + "epoch": 0.20537267895025385, + "grad_norm": 1.2345824241638184, + "learning_rate": 8.7970328224629e-06, + "loss": 0.6415, + "step": 4308 + }, + { + "epoch": 0.20542035134555336, + "grad_norm": 1.5928858518600464, + "learning_rate": 8.793133467567346e-06, + "loss": 0.978, + "step": 4309 + }, + { + "epoch": 0.20546802374085285, + "grad_norm": 2.2975971698760986, + "learning_rate": 8.78923429887887e-06, + "loss": 0.9865, + "step": 4310 + }, + { + "epoch": 0.20551569613615236, + "grad_norm": 0.9889629483222961, + "learning_rate": 8.785335316999078e-06, + "loss": 0.6768, + "step": 4311 + }, + { + "epoch": 0.20556336853145185, + "grad_norm": 1.3772175312042236, + "learning_rate": 8.781436522529537e-06, + "loss": 0.6762, + "step": 4312 + }, + { + "epoch": 0.20561104092675137, + "grad_norm": 2.596837282180786, + "learning_rate": 8.777537916071787e-06, + "loss": 0.4926, + "step": 4313 + }, + { + "epoch": 0.20565871332205088, + "grad_norm": 1.4268628358840942, + "learning_rate": 8.773639498227355e-06, + "loss": 0.9057, + "step": 4314 + }, + { + "epoch": 0.20570638571735037, + "grad_norm": 1.710360050201416, + "learning_rate": 8.769741269597713e-06, + "loss": 0.6695, + "step": 4315 + }, + { + "epoch": 0.20575405811264988, + "grad_norm": 1.301283359527588, + "learning_rate": 8.765843230784324e-06, + "loss": 0.6356, + "step": 4316 + }, + { + "epoch": 0.20580173050794937, + "grad_norm": 1.2675840854644775, + "learning_rate": 8.761945382388619e-06, + "loss": 0.5589, + "step": 4317 + }, + { + "epoch": 0.20584940290324888, + "grad_norm": 2.300196647644043, + "learning_rate": 8.758047725011988e-06, + "loss": 0.7804, + "step": 4318 + }, + { + "epoch": 0.20589707529854837, + "grad_norm": 1.272340178489685, + "learning_rate": 8.754150259255807e-06, + "loss": 0.6072, + "step": 4319 + }, + { + "epoch": 0.20594474769384788, + "grad_norm": 1.5963306427001953, + "learning_rate": 8.75025298572141e-06, + "loss": 0.7109, + "step": 4320 + }, + { + "epoch": 0.20599242008914737, + "grad_norm": 0.9043671488761902, + "learning_rate": 8.746355905010108e-06, + "loss": 0.4623, + "step": 4321 + }, + { + "epoch": 0.20604009248444688, + "grad_norm": 1.5757149457931519, + "learning_rate": 8.742459017723176e-06, + "loss": 0.6174, + "step": 4322 + }, + { + "epoch": 0.2060877648797464, + "grad_norm": 1.5294100046157837, + "learning_rate": 8.738562324461873e-06, + "loss": 0.921, + "step": 4323 + }, + { + "epoch": 0.20613543727504588, + "grad_norm": 1.3959301710128784, + "learning_rate": 8.734665825827408e-06, + "loss": 0.5261, + "step": 4324 + }, + { + "epoch": 0.2061831096703454, + "grad_norm": 1.380132794380188, + "learning_rate": 8.730769522420978e-06, + "loss": 0.7874, + "step": 4325 + }, + { + "epoch": 0.20623078206564488, + 
"grad_norm": 2.447565793991089, + "learning_rate": 8.72687341484374e-06, + "loss": 1.0617, + "step": 4326 + }, + { + "epoch": 0.2062784544609444, + "grad_norm": 1.2747080326080322, + "learning_rate": 8.722977503696824e-06, + "loss": 0.7465, + "step": 4327 + }, + { + "epoch": 0.20632612685624388, + "grad_norm": 1.7254345417022705, + "learning_rate": 8.719081789581329e-06, + "loss": 0.8336, + "step": 4328 + }, + { + "epoch": 0.2063737992515434, + "grad_norm": 2.696746587753296, + "learning_rate": 8.715186273098319e-06, + "loss": 0.8615, + "step": 4329 + }, + { + "epoch": 0.2064214716468429, + "grad_norm": 2.128573417663574, + "learning_rate": 8.711290954848842e-06, + "loss": 1.1476, + "step": 4330 + }, + { + "epoch": 0.2064691440421424, + "grad_norm": 2.850130319595337, + "learning_rate": 8.707395835433895e-06, + "loss": 0.6364, + "step": 4331 + }, + { + "epoch": 0.2065168164374419, + "grad_norm": 1.7038079500198364, + "learning_rate": 8.703500915454458e-06, + "loss": 0.485, + "step": 4332 + }, + { + "epoch": 0.2065644888327414, + "grad_norm": 2.242863655090332, + "learning_rate": 8.699606195511484e-06, + "loss": 0.7257, + "step": 4333 + }, + { + "epoch": 0.2066121612280409, + "grad_norm": 0.9096806049346924, + "learning_rate": 8.69571167620588e-06, + "loss": 0.3038, + "step": 4334 + }, + { + "epoch": 0.2066598336233404, + "grad_norm": 0.9689982533454895, + "learning_rate": 8.691817358138532e-06, + "loss": 0.507, + "step": 4335 + }, + { + "epoch": 0.2067075060186399, + "grad_norm": 1.2572424411773682, + "learning_rate": 8.687923241910297e-06, + "loss": 0.5053, + "step": 4336 + }, + { + "epoch": 0.20675517841393942, + "grad_norm": 1.796380877494812, + "learning_rate": 8.68402932812199e-06, + "loss": 0.7343, + "step": 4337 + }, + { + "epoch": 0.2068028508092389, + "grad_norm": 2.0053296089172363, + "learning_rate": 8.680135617374406e-06, + "loss": 1.2507, + "step": 4338 + }, + { + "epoch": 0.20685052320453842, + "grad_norm": 2.7692112922668457, + "learning_rate": 8.676242110268308e-06, + "loss": 0.8011, + "step": 4339 + }, + { + "epoch": 0.2068981955998379, + "grad_norm": 1.0360571146011353, + "learning_rate": 8.672348807404416e-06, + "loss": 0.3467, + "step": 4340 + }, + { + "epoch": 0.20694586799513742, + "grad_norm": 2.7257163524627686, + "learning_rate": 8.668455709383433e-06, + "loss": 0.4019, + "step": 4341 + }, + { + "epoch": 0.2069935403904369, + "grad_norm": 2.3275458812713623, + "learning_rate": 8.664562816806022e-06, + "loss": 0.7583, + "step": 4342 + }, + { + "epoch": 0.20704121278573642, + "grad_norm": 3.416110038757324, + "learning_rate": 8.660670130272816e-06, + "loss": 0.8467, + "step": 4343 + }, + { + "epoch": 0.2070888851810359, + "grad_norm": 8.440009117126465, + "learning_rate": 8.656777650384415e-06, + "loss": 1.1757, + "step": 4344 + }, + { + "epoch": 0.20713655757633542, + "grad_norm": 1.699858546257019, + "learning_rate": 8.652885377741394e-06, + "loss": 0.8668, + "step": 4345 + }, + { + "epoch": 0.20718422997163494, + "grad_norm": 2.423576593399048, + "learning_rate": 8.648993312944282e-06, + "loss": 0.592, + "step": 4346 + }, + { + "epoch": 0.20723190236693442, + "grad_norm": 1.418212652206421, + "learning_rate": 8.645101456593589e-06, + "loss": 0.8543, + "step": 4347 + }, + { + "epoch": 0.20727957476223394, + "grad_norm": 3.9124245643615723, + "learning_rate": 8.641209809289792e-06, + "loss": 0.7545, + "step": 4348 + }, + { + "epoch": 0.20732724715753342, + "grad_norm": 1.1630877256393433, + "learning_rate": 8.637318371633326e-06, + "loss": 0.7485, + "step": 4349 + 
}, + { + "epoch": 0.20737491955283294, + "grad_norm": 1.4917844533920288, + "learning_rate": 8.633427144224603e-06, + "loss": 0.7835, + "step": 4350 + }, + { + "epoch": 0.20742259194813242, + "grad_norm": 2.170482873916626, + "learning_rate": 8.629536127664002e-06, + "loss": 0.9353, + "step": 4351 + }, + { + "epoch": 0.20747026434343194, + "grad_norm": 1.3693220615386963, + "learning_rate": 8.625645322551858e-06, + "loss": 0.8203, + "step": 4352 + }, + { + "epoch": 0.20751793673873145, + "grad_norm": 1.2693750858306885, + "learning_rate": 8.621754729488488e-06, + "loss": 0.4118, + "step": 4353 + }, + { + "epoch": 0.20756560913403094, + "grad_norm": 2.084134817123413, + "learning_rate": 8.617864349074176e-06, + "loss": 0.9773, + "step": 4354 + }, + { + "epoch": 0.20761328152933045, + "grad_norm": 1.4149044752120972, + "learning_rate": 8.613974181909155e-06, + "loss": 0.6085, + "step": 4355 + }, + { + "epoch": 0.20766095392462994, + "grad_norm": 2.060384750366211, + "learning_rate": 8.610084228593649e-06, + "loss": 0.6682, + "step": 4356 + }, + { + "epoch": 0.20770862631992945, + "grad_norm": 1.4337724447250366, + "learning_rate": 8.60619448972783e-06, + "loss": 0.9382, + "step": 4357 + }, + { + "epoch": 0.20775629871522894, + "grad_norm": 1.6676647663116455, + "learning_rate": 8.602304965911851e-06, + "loss": 0.7652, + "step": 4358 + }, + { + "epoch": 0.20780397111052845, + "grad_norm": 1.8317842483520508, + "learning_rate": 8.598415657745819e-06, + "loss": 0.6913, + "step": 4359 + }, + { + "epoch": 0.20785164350582794, + "grad_norm": 1.6914176940917969, + "learning_rate": 8.59452656582982e-06, + "loss": 0.5978, + "step": 4360 + }, + { + "epoch": 0.20789931590112745, + "grad_norm": 2.3064661026000977, + "learning_rate": 8.590637690763896e-06, + "loss": 0.6462, + "step": 4361 + }, + { + "epoch": 0.20794698829642697, + "grad_norm": 1.849098801612854, + "learning_rate": 8.586749033148063e-06, + "loss": 0.9279, + "step": 4362 + }, + { + "epoch": 0.20799466069172645, + "grad_norm": 1.5790804624557495, + "learning_rate": 8.582860593582301e-06, + "loss": 0.5849, + "step": 4363 + }, + { + "epoch": 0.20804233308702597, + "grad_norm": 1.6418288946151733, + "learning_rate": 8.578972372666557e-06, + "loss": 1.0811, + "step": 4364 + }, + { + "epoch": 0.20809000548232545, + "grad_norm": 1.935500144958496, + "learning_rate": 8.57508437100074e-06, + "loss": 0.822, + "step": 4365 + }, + { + "epoch": 0.20813767787762497, + "grad_norm": 1.7451543807983398, + "learning_rate": 8.571196589184732e-06, + "loss": 0.8533, + "step": 4366 + }, + { + "epoch": 0.20818535027292445, + "grad_norm": 1.4592797756195068, + "learning_rate": 8.56730902781838e-06, + "loss": 0.8098, + "step": 4367 + }, + { + "epoch": 0.20823302266822397, + "grad_norm": 2.1974895000457764, + "learning_rate": 8.563421687501485e-06, + "loss": 1.0553, + "step": 4368 + }, + { + "epoch": 0.20828069506352348, + "grad_norm": 1.570441722869873, + "learning_rate": 8.559534568833832e-06, + "loss": 0.2305, + "step": 4369 + }, + { + "epoch": 0.20832836745882297, + "grad_norm": 1.4212414026260376, + "learning_rate": 8.555647672415162e-06, + "loss": 0.6988, + "step": 4370 + }, + { + "epoch": 0.20837603985412248, + "grad_norm": 1.3336912393569946, + "learning_rate": 8.55176099884518e-06, + "loss": 0.6157, + "step": 4371 + }, + { + "epoch": 0.20842371224942197, + "grad_norm": 1.3625378608703613, + "learning_rate": 8.547874548723565e-06, + "loss": 0.7407, + "step": 4372 + }, + { + "epoch": 0.20847138464472148, + "grad_norm": 1.923500657081604, + "learning_rate": 
8.543988322649954e-06, + "loss": 0.4212, + "step": 4373 + }, + { + "epoch": 0.20851905704002097, + "grad_norm": 1.3513028621673584, + "learning_rate": 8.540102321223947e-06, + "loss": 0.6865, + "step": 4374 + }, + { + "epoch": 0.20856672943532048, + "grad_norm": 3.3731484413146973, + "learning_rate": 8.536216545045117e-06, + "loss": 1.0769, + "step": 4375 + }, + { + "epoch": 0.20861440183061997, + "grad_norm": 2.838672161102295, + "learning_rate": 8.532330994713006e-06, + "loss": 0.7988, + "step": 4376 + }, + { + "epoch": 0.20866207422591948, + "grad_norm": 1.5584261417388916, + "learning_rate": 8.528445670827103e-06, + "loss": 0.5976, + "step": 4377 + }, + { + "epoch": 0.208709746621219, + "grad_norm": 1.4673742055892944, + "learning_rate": 8.52456057398688e-06, + "loss": 0.886, + "step": 4378 + }, + { + "epoch": 0.20875741901651848, + "grad_norm": 1.901602864265442, + "learning_rate": 8.52067570479177e-06, + "loss": 1.0483, + "step": 4379 + }, + { + "epoch": 0.208805091411818, + "grad_norm": 1.9782612323760986, + "learning_rate": 8.516791063841161e-06, + "loss": 0.7423, + "step": 4380 + }, + { + "epoch": 0.20885276380711748, + "grad_norm": 1.1063727140426636, + "learning_rate": 8.512906651734416e-06, + "loss": 0.6636, + "step": 4381 + }, + { + "epoch": 0.208900436202417, + "grad_norm": 3.5044310092926025, + "learning_rate": 8.509022469070864e-06, + "loss": 1.1454, + "step": 4382 + }, + { + "epoch": 0.20894810859771648, + "grad_norm": 1.1676899194717407, + "learning_rate": 8.505138516449786e-06, + "loss": 0.7145, + "step": 4383 + }, + { + "epoch": 0.208995780993016, + "grad_norm": 3.7188303470611572, + "learning_rate": 8.501254794470443e-06, + "loss": 1.1832, + "step": 4384 + }, + { + "epoch": 0.2090434533883155, + "grad_norm": 4.093562602996826, + "learning_rate": 8.497371303732054e-06, + "loss": 0.9244, + "step": 4385 + }, + { + "epoch": 0.209091125783615, + "grad_norm": 1.4679384231567383, + "learning_rate": 8.493488044833796e-06, + "loss": 0.7792, + "step": 4386 + }, + { + "epoch": 0.2091387981789145, + "grad_norm": 1.2527292966842651, + "learning_rate": 8.48960501837482e-06, + "loss": 0.7554, + "step": 4387 + }, + { + "epoch": 0.209186470574214, + "grad_norm": 1.5578149557113647, + "learning_rate": 8.485722224954237e-06, + "loss": 0.7614, + "step": 4388 + }, + { + "epoch": 0.2092341429695135, + "grad_norm": 4.238508701324463, + "learning_rate": 8.481839665171117e-06, + "loss": 1.0255, + "step": 4389 + }, + { + "epoch": 0.209281815364813, + "grad_norm": 3.228020429611206, + "learning_rate": 8.477957339624502e-06, + "loss": 0.4588, + "step": 4390 + }, + { + "epoch": 0.2093294877601125, + "grad_norm": 4.965100288391113, + "learning_rate": 8.4740752489134e-06, + "loss": 0.8342, + "step": 4391 + }, + { + "epoch": 0.209377160155412, + "grad_norm": 1.7317931652069092, + "learning_rate": 8.47019339363677e-06, + "loss": 0.7513, + "step": 4392 + }, + { + "epoch": 0.2094248325507115, + "grad_norm": 3.7609570026397705, + "learning_rate": 8.466311774393544e-06, + "loss": 1.0602, + "step": 4393 + }, + { + "epoch": 0.20947250494601102, + "grad_norm": 1.1125895977020264, + "learning_rate": 8.462430391782622e-06, + "loss": 0.6997, + "step": 4394 + }, + { + "epoch": 0.2095201773413105, + "grad_norm": 1.281566858291626, + "learning_rate": 8.458549246402854e-06, + "loss": 0.6801, + "step": 4395 + }, + { + "epoch": 0.20956784973661002, + "grad_norm": 4.123861312866211, + "learning_rate": 8.454668338853062e-06, + "loss": 1.2021, + "step": 4396 + }, + { + "epoch": 0.2096155221319095, + "grad_norm": 
1.59635591506958, + "learning_rate": 8.450787669732036e-06, + "loss": 1.0536, + "step": 4397 + }, + { + "epoch": 0.20966319452720902, + "grad_norm": 2.3589484691619873, + "learning_rate": 8.446907239638514e-06, + "loss": 0.4969, + "step": 4398 + }, + { + "epoch": 0.2097108669225085, + "grad_norm": 2.146109104156494, + "learning_rate": 8.44302704917121e-06, + "loss": 1.0645, + "step": 4399 + }, + { + "epoch": 0.20975853931780802, + "grad_norm": 1.9817562103271484, + "learning_rate": 8.439147098928805e-06, + "loss": 0.7119, + "step": 4400 + }, + { + "epoch": 0.20980621171310754, + "grad_norm": 1.4278912544250488, + "learning_rate": 8.435267389509924e-06, + "loss": 0.3774, + "step": 4401 + }, + { + "epoch": 0.20985388410840702, + "grad_norm": 1.1749287843704224, + "learning_rate": 8.431387921513172e-06, + "loss": 0.7329, + "step": 4402 + }, + { + "epoch": 0.20990155650370654, + "grad_norm": 3.4919614791870117, + "learning_rate": 8.42750869553711e-06, + "loss": 0.3703, + "step": 4403 + }, + { + "epoch": 0.20994922889900602, + "grad_norm": 3.8118767738342285, + "learning_rate": 8.423629712180265e-06, + "loss": 0.6749, + "step": 4404 + }, + { + "epoch": 0.20999690129430554, + "grad_norm": 1.9759467840194702, + "learning_rate": 8.419750972041119e-06, + "loss": 0.6974, + "step": 4405 + }, + { + "epoch": 0.21004457368960502, + "grad_norm": 2.3534319400787354, + "learning_rate": 8.415872475718125e-06, + "loss": 0.6538, + "step": 4406 + }, + { + "epoch": 0.21009224608490454, + "grad_norm": 1.4680769443511963, + "learning_rate": 8.411994223809698e-06, + "loss": 0.9282, + "step": 4407 + }, + { + "epoch": 0.21013991848020402, + "grad_norm": 1.6939637660980225, + "learning_rate": 8.408116216914205e-06, + "loss": 0.8115, + "step": 4408 + }, + { + "epoch": 0.21018759087550354, + "grad_norm": 2.782266139984131, + "learning_rate": 8.404238455629989e-06, + "loss": 0.4806, + "step": 4409 + }, + { + "epoch": 0.21023526327080305, + "grad_norm": 1.3923627138137817, + "learning_rate": 8.400360940555348e-06, + "loss": 0.5887, + "step": 4410 + }, + { + "epoch": 0.21028293566610254, + "grad_norm": 1.1818264722824097, + "learning_rate": 8.396483672288536e-06, + "loss": 0.8378, + "step": 4411 + }, + { + "epoch": 0.21033060806140205, + "grad_norm": 1.5832149982452393, + "learning_rate": 8.392606651427781e-06, + "loss": 0.6567, + "step": 4412 + }, + { + "epoch": 0.21037828045670154, + "grad_norm": 2.349377393722534, + "learning_rate": 8.38872987857127e-06, + "loss": 0.7362, + "step": 4413 + }, + { + "epoch": 0.21042595285200105, + "grad_norm": 1.9588326215744019, + "learning_rate": 8.384853354317141e-06, + "loss": 0.9806, + "step": 4414 + }, + { + "epoch": 0.21047362524730054, + "grad_norm": 1.6584535837173462, + "learning_rate": 8.380977079263509e-06, + "loss": 0.341, + "step": 4415 + }, + { + "epoch": 0.21052129764260005, + "grad_norm": 1.1295039653778076, + "learning_rate": 8.377101054008445e-06, + "loss": 0.5538, + "step": 4416 + }, + { + "epoch": 0.21056897003789957, + "grad_norm": 1.7521729469299316, + "learning_rate": 8.373225279149972e-06, + "loss": 0.9521, + "step": 4417 + }, + { + "epoch": 0.21061664243319905, + "grad_norm": 1.7571831941604614, + "learning_rate": 8.369349755286084e-06, + "loss": 0.9567, + "step": 4418 + }, + { + "epoch": 0.21066431482849857, + "grad_norm": 1.487601399421692, + "learning_rate": 8.365474483014741e-06, + "loss": 0.737, + "step": 4419 + }, + { + "epoch": 0.21071198722379805, + "grad_norm": 1.9085054397583008, + "learning_rate": 8.36159946293385e-06, + "loss": 0.8935, + "step": 4420 
+ }, + { + "epoch": 0.21075965961909757, + "grad_norm": 1.3143061399459839, + "learning_rate": 8.357724695641287e-06, + "loss": 0.5924, + "step": 4421 + }, + { + "epoch": 0.21080733201439705, + "grad_norm": 1.5849010944366455, + "learning_rate": 8.353850181734898e-06, + "loss": 0.6918, + "step": 4422 + }, + { + "epoch": 0.21085500440969657, + "grad_norm": 1.8508244752883911, + "learning_rate": 8.349975921812468e-06, + "loss": 0.931, + "step": 4423 + }, + { + "epoch": 0.21090267680499605, + "grad_norm": 2.0183169841766357, + "learning_rate": 8.346101916471764e-06, + "loss": 0.6971, + "step": 4424 + }, + { + "epoch": 0.21095034920029557, + "grad_norm": 1.5702900886535645, + "learning_rate": 8.342228166310502e-06, + "loss": 0.8997, + "step": 4425 + }, + { + "epoch": 0.21099802159559508, + "grad_norm": 1.6522756814956665, + "learning_rate": 8.338354671926364e-06, + "loss": 0.9203, + "step": 4426 + }, + { + "epoch": 0.21104569399089457, + "grad_norm": 1.641060709953308, + "learning_rate": 8.334481433916984e-06, + "loss": 0.6685, + "step": 4427 + }, + { + "epoch": 0.21109336638619408, + "grad_norm": 1.5447933673858643, + "learning_rate": 8.330608452879972e-06, + "loss": 0.6448, + "step": 4428 + }, + { + "epoch": 0.21114103878149357, + "grad_norm": 1.6789227724075317, + "learning_rate": 8.32673572941288e-06, + "loss": 0.9129, + "step": 4429 + }, + { + "epoch": 0.21118871117679308, + "grad_norm": 1.832602620124817, + "learning_rate": 8.322863264113235e-06, + "loss": 0.7104, + "step": 4430 + }, + { + "epoch": 0.21123638357209257, + "grad_norm": 1.8039897680282593, + "learning_rate": 8.31899105757852e-06, + "loss": 0.6585, + "step": 4431 + }, + { + "epoch": 0.21128405596739208, + "grad_norm": 1.4945287704467773, + "learning_rate": 8.315119110406172e-06, + "loss": 0.3517, + "step": 4432 + }, + { + "epoch": 0.2113317283626916, + "grad_norm": 2.980860471725464, + "learning_rate": 8.311247423193594e-06, + "loss": 1.153, + "step": 4433 + }, + { + "epoch": 0.21137940075799108, + "grad_norm": 1.6675480604171753, + "learning_rate": 8.30737599653815e-06, + "loss": 0.6492, + "step": 4434 + }, + { + "epoch": 0.2114270731532906, + "grad_norm": 2.0930182933807373, + "learning_rate": 8.303504831037154e-06, + "loss": 0.7994, + "step": 4435 + }, + { + "epoch": 0.21147474554859008, + "grad_norm": 1.751406192779541, + "learning_rate": 8.299633927287894e-06, + "loss": 0.593, + "step": 4436 + }, + { + "epoch": 0.2115224179438896, + "grad_norm": 1.8691309690475464, + "learning_rate": 8.295763285887613e-06, + "loss": 1.0948, + "step": 4437 + }, + { + "epoch": 0.21157009033918908, + "grad_norm": 0.8946505188941956, + "learning_rate": 8.2918929074335e-06, + "loss": 0.4341, + "step": 4438 + }, + { + "epoch": 0.2116177627344886, + "grad_norm": 2.1912481784820557, + "learning_rate": 8.288022792522726e-06, + "loss": 0.49, + "step": 4439 + }, + { + "epoch": 0.2116654351297881, + "grad_norm": 1.384350061416626, + "learning_rate": 8.284152941752403e-06, + "loss": 0.7269, + "step": 4440 + }, + { + "epoch": 0.2117131075250876, + "grad_norm": 1.8056871891021729, + "learning_rate": 8.280283355719614e-06, + "loss": 0.859, + "step": 4441 + }, + { + "epoch": 0.2117607799203871, + "grad_norm": 2.656151533126831, + "learning_rate": 8.276414035021391e-06, + "loss": 1.1618, + "step": 4442 + }, + { + "epoch": 0.2118084523156866, + "grad_norm": 4.167341232299805, + "learning_rate": 8.272544980254731e-06, + "loss": 0.3233, + "step": 4443 + }, + { + "epoch": 0.2118561247109861, + "grad_norm": 2.2939465045928955, + "learning_rate": 
8.268676192016598e-06, + "loss": 0.902, + "step": 4444 + }, + { + "epoch": 0.2119037971062856, + "grad_norm": 1.4443720579147339, + "learning_rate": 8.264807670903891e-06, + "loss": 0.9178, + "step": 4445 + }, + { + "epoch": 0.2119514695015851, + "grad_norm": 1.7518606185913086, + "learning_rate": 8.260939417513498e-06, + "loss": 0.8333, + "step": 4446 + }, + { + "epoch": 0.2119991418968846, + "grad_norm": 1.8861931562423706, + "learning_rate": 8.25707143244224e-06, + "loss": 0.8673, + "step": 4447 + }, + { + "epoch": 0.2120468142921841, + "grad_norm": 1.9442663192749023, + "learning_rate": 8.253203716286914e-06, + "loss": 0.7208, + "step": 4448 + }, + { + "epoch": 0.21209448668748362, + "grad_norm": 1.7955470085144043, + "learning_rate": 8.249336269644264e-06, + "loss": 0.61, + "step": 4449 + }, + { + "epoch": 0.2121421590827831, + "grad_norm": 1.4134103059768677, + "learning_rate": 8.245469093111002e-06, + "loss": 0.9365, + "step": 4450 + }, + { + "epoch": 0.21218983147808262, + "grad_norm": 3.5805282592773438, + "learning_rate": 8.241602187283789e-06, + "loss": 0.5981, + "step": 4451 + }, + { + "epoch": 0.2122375038733821, + "grad_norm": 1.5789103507995605, + "learning_rate": 8.237735552759247e-06, + "loss": 0.594, + "step": 4452 + }, + { + "epoch": 0.21228517626868162, + "grad_norm": 2.8894712924957275, + "learning_rate": 8.233869190133968e-06, + "loss": 0.5236, + "step": 4453 + }, + { + "epoch": 0.2123328486639811, + "grad_norm": 58.963985443115234, + "learning_rate": 8.230003100004481e-06, + "loss": 0.9719, + "step": 4454 + }, + { + "epoch": 0.21238052105928062, + "grad_norm": 1.3286560773849487, + "learning_rate": 8.226137282967289e-06, + "loss": 0.9582, + "step": 4455 + }, + { + "epoch": 0.21242819345458014, + "grad_norm": 4.558205604553223, + "learning_rate": 8.222271739618851e-06, + "loss": 0.4667, + "step": 4456 + }, + { + "epoch": 0.21247586584987962, + "grad_norm": 2.747762441635132, + "learning_rate": 8.218406470555571e-06, + "loss": 0.8036, + "step": 4457 + }, + { + "epoch": 0.21252353824517914, + "grad_norm": 1.3894038200378418, + "learning_rate": 8.214541476373824e-06, + "loss": 0.5518, + "step": 4458 + }, + { + "epoch": 0.21257121064047863, + "grad_norm": 0.9409295916557312, + "learning_rate": 8.210676757669948e-06, + "loss": 0.499, + "step": 4459 + }, + { + "epoch": 0.21261888303577814, + "grad_norm": 1.3855196237564087, + "learning_rate": 8.206812315040215e-06, + "loss": 0.5873, + "step": 4460 + }, + { + "epoch": 0.21266655543107763, + "grad_norm": 2.102444648742676, + "learning_rate": 8.20294814908088e-06, + "loss": 0.8072, + "step": 4461 + }, + { + "epoch": 0.21271422782637714, + "grad_norm": 1.9173320531845093, + "learning_rate": 8.199084260388139e-06, + "loss": 0.7175, + "step": 4462 + }, + { + "epoch": 0.21276190022167663, + "grad_norm": 1.828513264656067, + "learning_rate": 8.19522064955815e-06, + "loss": 0.8789, + "step": 4463 + }, + { + "epoch": 0.21280957261697614, + "grad_norm": 2.1269195079803467, + "learning_rate": 8.191357317187028e-06, + "loss": 0.2446, + "step": 4464 + }, + { + "epoch": 0.21285724501227565, + "grad_norm": 2.814713954925537, + "learning_rate": 8.18749426387085e-06, + "loss": 1.0531, + "step": 4465 + }, + { + "epoch": 0.21290491740757514, + "grad_norm": 2.0500357151031494, + "learning_rate": 8.183631490205636e-06, + "loss": 1.0665, + "step": 4466 + }, + { + "epoch": 0.21295258980287465, + "grad_norm": 1.9695978164672852, + "learning_rate": 8.179768996787381e-06, + "loss": 1.0367, + "step": 4467 + }, + { + "epoch": 0.21300026219817414, + 
"grad_norm": 1.298176884651184, + "learning_rate": 8.175906784212028e-06, + "loss": 0.759, + "step": 4468 + }, + { + "epoch": 0.21304793459347365, + "grad_norm": 1.1539537906646729, + "learning_rate": 8.17204485307547e-06, + "loss": 0.313, + "step": 4469 + }, + { + "epoch": 0.21309560698877314, + "grad_norm": 1.644991397857666, + "learning_rate": 8.168183203973568e-06, + "loss": 1.0725, + "step": 4470 + }, + { + "epoch": 0.21314327938407265, + "grad_norm": 1.9063230752944946, + "learning_rate": 8.164321837502136e-06, + "loss": 0.8613, + "step": 4471 + }, + { + "epoch": 0.21319095177937217, + "grad_norm": 2.056304693222046, + "learning_rate": 8.160460754256937e-06, + "loss": 0.7527, + "step": 4472 + }, + { + "epoch": 0.21323862417467165, + "grad_norm": 2.9836580753326416, + "learning_rate": 8.156599954833699e-06, + "loss": 0.5532, + "step": 4473 + }, + { + "epoch": 0.21328629656997117, + "grad_norm": 1.5842543840408325, + "learning_rate": 8.15273943982811e-06, + "loss": 1.1751, + "step": 4474 + }, + { + "epoch": 0.21333396896527065, + "grad_norm": 1.684693694114685, + "learning_rate": 8.148879209835797e-06, + "loss": 0.9538, + "step": 4475 + }, + { + "epoch": 0.21338164136057017, + "grad_norm": 1.8569011688232422, + "learning_rate": 8.145019265452361e-06, + "loss": 0.7172, + "step": 4476 + }, + { + "epoch": 0.21342931375586965, + "grad_norm": 3.4422695636749268, + "learning_rate": 8.141159607273352e-06, + "loss": 1.0018, + "step": 4477 + }, + { + "epoch": 0.21347698615116917, + "grad_norm": 1.4015235900878906, + "learning_rate": 8.13730023589427e-06, + "loss": 0.6793, + "step": 4478 + }, + { + "epoch": 0.21352465854646865, + "grad_norm": 1.6025656461715698, + "learning_rate": 8.13344115191058e-06, + "loss": 0.5903, + "step": 4479 + }, + { + "epoch": 0.21357233094176817, + "grad_norm": 2.8859734535217285, + "learning_rate": 8.129582355917698e-06, + "loss": 0.9701, + "step": 4480 + }, + { + "epoch": 0.21362000333706768, + "grad_norm": 1.8074482679367065, + "learning_rate": 8.125723848511e-06, + "loss": 0.8863, + "step": 4481 + }, + { + "epoch": 0.21366767573236717, + "grad_norm": 2.0635104179382324, + "learning_rate": 8.121865630285809e-06, + "loss": 1.0725, + "step": 4482 + }, + { + "epoch": 0.21371534812766668, + "grad_norm": 1.8617684841156006, + "learning_rate": 8.118007701837409e-06, + "loss": 0.9979, + "step": 4483 + }, + { + "epoch": 0.21376302052296617, + "grad_norm": 1.0794591903686523, + "learning_rate": 8.114150063761041e-06, + "loss": 0.5675, + "step": 4484 + }, + { + "epoch": 0.21381069291826568, + "grad_norm": 6.139678001403809, + "learning_rate": 8.110292716651899e-06, + "loss": 1.7222, + "step": 4485 + }, + { + "epoch": 0.21385836531356517, + "grad_norm": 1.015209436416626, + "learning_rate": 8.106435661105127e-06, + "loss": 0.5727, + "step": 4486 + }, + { + "epoch": 0.21390603770886468, + "grad_norm": 1.6977912187576294, + "learning_rate": 8.102578897715839e-06, + "loss": 0.5604, + "step": 4487 + }, + { + "epoch": 0.2139537101041642, + "grad_norm": 1.292377233505249, + "learning_rate": 8.098722427079082e-06, + "loss": 0.4758, + "step": 4488 + }, + { + "epoch": 0.21400138249946368, + "grad_norm": 1.2050788402557373, + "learning_rate": 8.094866249789874e-06, + "loss": 0.8011, + "step": 4489 + }, + { + "epoch": 0.2140490548947632, + "grad_norm": 1.5704624652862549, + "learning_rate": 8.091010366443189e-06, + "loss": 0.3975, + "step": 4490 + }, + { + "epoch": 0.21409672729006268, + "grad_norm": 1.0296008586883545, + "learning_rate": 8.087154777633942e-06, + "loss": 0.577, + 
"step": 4491 + }, + { + "epoch": 0.2141443996853622, + "grad_norm": 2.6979355812072754, + "learning_rate": 8.083299483957016e-06, + "loss": 0.6808, + "step": 4492 + }, + { + "epoch": 0.21419207208066168, + "grad_norm": 1.462937831878662, + "learning_rate": 8.079444486007244e-06, + "loss": 0.7345, + "step": 4493 + }, + { + "epoch": 0.2142397444759612, + "grad_norm": 1.5809084177017212, + "learning_rate": 8.075589784379407e-06, + "loss": 0.8325, + "step": 4494 + }, + { + "epoch": 0.21428741687126068, + "grad_norm": 1.2995939254760742, + "learning_rate": 8.071735379668246e-06, + "loss": 0.7495, + "step": 4495 + }, + { + "epoch": 0.2143350892665602, + "grad_norm": 1.2686469554901123, + "learning_rate": 8.067881272468465e-06, + "loss": 0.9076, + "step": 4496 + }, + { + "epoch": 0.2143827616618597, + "grad_norm": 1.4767464399337769, + "learning_rate": 8.064027463374702e-06, + "loss": 0.6394, + "step": 4497 + }, + { + "epoch": 0.2144304340571592, + "grad_norm": 2.442845582962036, + "learning_rate": 8.060173952981565e-06, + "loss": 0.8026, + "step": 4498 + }, + { + "epoch": 0.2144781064524587, + "grad_norm": 1.4615638256072998, + "learning_rate": 8.056320741883613e-06, + "loss": 0.3246, + "step": 4499 + }, + { + "epoch": 0.2145257788477582, + "grad_norm": 2.036346197128296, + "learning_rate": 8.052467830675353e-06, + "loss": 0.4039, + "step": 4500 + }, + { + "epoch": 0.2145734512430577, + "grad_norm": 2.6884419918060303, + "learning_rate": 8.04861521995125e-06, + "loss": 0.7426, + "step": 4501 + }, + { + "epoch": 0.2146211236383572, + "grad_norm": 2.032949924468994, + "learning_rate": 8.044762910305726e-06, + "loss": 0.4398, + "step": 4502 + }, + { + "epoch": 0.2146687960336567, + "grad_norm": 1.3604116439819336, + "learning_rate": 8.040910902333149e-06, + "loss": 0.3812, + "step": 4503 + }, + { + "epoch": 0.21471646842895623, + "grad_norm": 2.1751201152801514, + "learning_rate": 8.03705919662784e-06, + "loss": 0.5535, + "step": 4504 + }, + { + "epoch": 0.2147641408242557, + "grad_norm": 3.663022518157959, + "learning_rate": 8.033207793784091e-06, + "loss": 1.1629, + "step": 4505 + }, + { + "epoch": 0.21481181321955523, + "grad_norm": 1.3970552682876587, + "learning_rate": 8.02935669439612e-06, + "loss": 0.2853, + "step": 4506 + }, + { + "epoch": 0.2148594856148547, + "grad_norm": 5.811208724975586, + "learning_rate": 8.025505899058119e-06, + "loss": 0.8217, + "step": 4507 + }, + { + "epoch": 0.21490715801015423, + "grad_norm": 1.6250298023223877, + "learning_rate": 8.021655408364227e-06, + "loss": 0.8706, + "step": 4508 + }, + { + "epoch": 0.2149548304054537, + "grad_norm": 1.4636025428771973, + "learning_rate": 8.017805222908528e-06, + "loss": 0.7862, + "step": 4509 + }, + { + "epoch": 0.21500250280075323, + "grad_norm": 1.4693152904510498, + "learning_rate": 8.01395534328507e-06, + "loss": 0.8315, + "step": 4510 + }, + { + "epoch": 0.2150501751960527, + "grad_norm": 1.6903177499771118, + "learning_rate": 8.010105770087854e-06, + "loss": 1.0756, + "step": 4511 + }, + { + "epoch": 0.21509784759135223, + "grad_norm": 2.1016578674316406, + "learning_rate": 8.006256503910823e-06, + "loss": 0.9515, + "step": 4512 + }, + { + "epoch": 0.21514551998665174, + "grad_norm": 1.1254427433013916, + "learning_rate": 8.002407545347881e-06, + "loss": 0.553, + "step": 4513 + }, + { + "epoch": 0.21519319238195123, + "grad_norm": 2.1703710556030273, + "learning_rate": 7.998558894992888e-06, + "loss": 0.827, + "step": 4514 + }, + { + "epoch": 0.21524086477725074, + "grad_norm": 1.854142427444458, + "learning_rate": 
7.994710553439646e-06, + "loss": 0.6809, + "step": 4515 + }, + { + "epoch": 0.21528853717255023, + "grad_norm": 1.552635908126831, + "learning_rate": 7.99086252128191e-06, + "loss": 1.0352, + "step": 4516 + }, + { + "epoch": 0.21533620956784974, + "grad_norm": 1.546877145767212, + "learning_rate": 7.987014799113398e-06, + "loss": 0.7329, + "step": 4517 + }, + { + "epoch": 0.21538388196314923, + "grad_norm": 1.8693690299987793, + "learning_rate": 7.983167387527778e-06, + "loss": 0.8758, + "step": 4518 + }, + { + "epoch": 0.21543155435844874, + "grad_norm": 1.925012230873108, + "learning_rate": 7.979320287118656e-06, + "loss": 0.7871, + "step": 4519 + }, + { + "epoch": 0.21547922675374825, + "grad_norm": 2.1033551692962646, + "learning_rate": 7.975473498479607e-06, + "loss": 0.5098, + "step": 4520 + }, + { + "epoch": 0.21552689914904774, + "grad_norm": 1.3760926723480225, + "learning_rate": 7.971627022204148e-06, + "loss": 0.8647, + "step": 4521 + }, + { + "epoch": 0.21557457154434725, + "grad_norm": 2.3648693561553955, + "learning_rate": 7.967780858885753e-06, + "loss": 1.1086, + "step": 4522 + }, + { + "epoch": 0.21562224393964674, + "grad_norm": 1.8046520948410034, + "learning_rate": 7.963935009117838e-06, + "loss": 0.5923, + "step": 4523 + }, + { + "epoch": 0.21566991633494625, + "grad_norm": 1.2262309789657593, + "learning_rate": 7.960089473493791e-06, + "loss": 0.7717, + "step": 4524 + }, + { + "epoch": 0.21571758873024574, + "grad_norm": 1.6779276132583618, + "learning_rate": 7.956244252606926e-06, + "loss": 0.7866, + "step": 4525 + }, + { + "epoch": 0.21576526112554525, + "grad_norm": 1.3872177600860596, + "learning_rate": 7.952399347050526e-06, + "loss": 0.6285, + "step": 4526 + }, + { + "epoch": 0.21581293352084477, + "grad_norm": 1.0930930376052856, + "learning_rate": 7.948554757417825e-06, + "loss": 0.6883, + "step": 4527 + }, + { + "epoch": 0.21586060591614425, + "grad_norm": 1.4655370712280273, + "learning_rate": 7.944710484301995e-06, + "loss": 0.6577, + "step": 4528 + }, + { + "epoch": 0.21590827831144377, + "grad_norm": 1.4883787631988525, + "learning_rate": 7.940866528296175e-06, + "loss": 0.2995, + "step": 4529 + }, + { + "epoch": 0.21595595070674325, + "grad_norm": 3.252790927886963, + "learning_rate": 7.937022889993444e-06, + "loss": 0.5275, + "step": 4530 + }, + { + "epoch": 0.21600362310204277, + "grad_norm": 1.7478657960891724, + "learning_rate": 7.933179569986834e-06, + "loss": 0.6609, + "step": 4531 + }, + { + "epoch": 0.21605129549734226, + "grad_norm": 2.2230353355407715, + "learning_rate": 7.929336568869332e-06, + "loss": 0.7786, + "step": 4532 + }, + { + "epoch": 0.21609896789264177, + "grad_norm": 1.3655163049697876, + "learning_rate": 7.92549388723388e-06, + "loss": 0.7889, + "step": 4533 + }, + { + "epoch": 0.21614664028794126, + "grad_norm": 1.4297012090682983, + "learning_rate": 7.92165152567335e-06, + "loss": 0.9813, + "step": 4534 + }, + { + "epoch": 0.21619431268324077, + "grad_norm": 2.0077500343322754, + "learning_rate": 7.91780948478059e-06, + "loss": 1.1264, + "step": 4535 + }, + { + "epoch": 0.21624198507854028, + "grad_norm": 1.3427884578704834, + "learning_rate": 7.913967765148386e-06, + "loss": 0.5978, + "step": 4536 + }, + { + "epoch": 0.21628965747383977, + "grad_norm": 2.992999315261841, + "learning_rate": 7.910126367369474e-06, + "loss": 0.6912, + "step": 4537 + }, + { + "epoch": 0.21633732986913928, + "grad_norm": 2.4530725479125977, + "learning_rate": 7.906285292036538e-06, + "loss": 0.8698, + "step": 4538 + }, + { + "epoch": 
0.21638500226443877, + "grad_norm": 1.6870721578598022, + "learning_rate": 7.902444539742224e-06, + "loss": 0.9277, + "step": 4539 + }, + { + "epoch": 0.21643267465973828, + "grad_norm": 1.311440348625183, + "learning_rate": 7.898604111079115e-06, + "loss": 0.5312, + "step": 4540 + }, + { + "epoch": 0.21648034705503777, + "grad_norm": 3.3823249340057373, + "learning_rate": 7.89476400663975e-06, + "loss": 0.9218, + "step": 4541 + }, + { + "epoch": 0.21652801945033728, + "grad_norm": 1.7939707040786743, + "learning_rate": 7.890924227016624e-06, + "loss": 0.8184, + "step": 4542 + }, + { + "epoch": 0.2165756918456368, + "grad_norm": 2.0498099327087402, + "learning_rate": 7.887084772802165e-06, + "loss": 1.307, + "step": 4543 + }, + { + "epoch": 0.21662336424093628, + "grad_norm": 1.3181850910186768, + "learning_rate": 7.88324564458877e-06, + "loss": 0.7324, + "step": 4544 + }, + { + "epoch": 0.2166710366362358, + "grad_norm": 1.151717185974121, + "learning_rate": 7.879406842968772e-06, + "loss": 0.8333, + "step": 4545 + }, + { + "epoch": 0.21671870903153528, + "grad_norm": 1.9521054029464722, + "learning_rate": 7.875568368534463e-06, + "loss": 0.789, + "step": 4546 + }, + { + "epoch": 0.2167663814268348, + "grad_norm": 1.7480714321136475, + "learning_rate": 7.871730221878073e-06, + "loss": 0.8029, + "step": 4547 + }, + { + "epoch": 0.21681405382213428, + "grad_norm": 2.3383572101593018, + "learning_rate": 7.867892403591798e-06, + "loss": 1.1834, + "step": 4548 + }, + { + "epoch": 0.2168617262174338, + "grad_norm": 2.626546621322632, + "learning_rate": 7.864054914267765e-06, + "loss": 1.1061, + "step": 4549 + }, + { + "epoch": 0.21690939861273328, + "grad_norm": 1.5257930755615234, + "learning_rate": 7.86021775449806e-06, + "loss": 0.7571, + "step": 4550 + }, + { + "epoch": 0.2169570710080328, + "grad_norm": 2.134678363800049, + "learning_rate": 7.856380924874726e-06, + "loss": 0.764, + "step": 4551 + }, + { + "epoch": 0.2170047434033323, + "grad_norm": 1.9182783365249634, + "learning_rate": 7.85254442598974e-06, + "loss": 0.6203, + "step": 4552 + }, + { + "epoch": 0.2170524157986318, + "grad_norm": 2.3225247859954834, + "learning_rate": 7.848708258435031e-06, + "loss": 1.0288, + "step": 4553 + }, + { + "epoch": 0.2171000881939313, + "grad_norm": 1.486769199371338, + "learning_rate": 7.844872422802483e-06, + "loss": 0.8181, + "step": 4554 + }, + { + "epoch": 0.2171477605892308, + "grad_norm": 1.531981348991394, + "learning_rate": 7.841036919683932e-06, + "loss": 0.6438, + "step": 4555 + }, + { + "epoch": 0.2171954329845303, + "grad_norm": 3.082106351852417, + "learning_rate": 7.837201749671146e-06, + "loss": 0.8642, + "step": 4556 + }, + { + "epoch": 0.2172431053798298, + "grad_norm": 1.664265751838684, + "learning_rate": 7.833366913355858e-06, + "loss": 0.6025, + "step": 4557 + }, + { + "epoch": 0.2172907777751293, + "grad_norm": 0.9437436461448669, + "learning_rate": 7.829532411329747e-06, + "loss": 0.3189, + "step": 4558 + }, + { + "epoch": 0.21733845017042883, + "grad_norm": 4.005910396575928, + "learning_rate": 7.825698244184432e-06, + "loss": 0.6571, + "step": 4559 + }, + { + "epoch": 0.2173861225657283, + "grad_norm": 2.060579776763916, + "learning_rate": 7.821864412511485e-06, + "loss": 0.7423, + "step": 4560 + }, + { + "epoch": 0.21743379496102783, + "grad_norm": 1.7637516260147095, + "learning_rate": 7.818030916902433e-06, + "loss": 0.807, + "step": 4561 + }, + { + "epoch": 0.2174814673563273, + "grad_norm": 1.4165635108947754, + "learning_rate": 7.814197757948734e-06, + "loss": 
0.7102, + "step": 4562 + }, + { + "epoch": 0.21752913975162683, + "grad_norm": 1.5000356435775757, + "learning_rate": 7.810364936241814e-06, + "loss": 0.902, + "step": 4563 + }, + { + "epoch": 0.2175768121469263, + "grad_norm": 2.554123878479004, + "learning_rate": 7.80653245237304e-06, + "loss": 0.7217, + "step": 4564 + }, + { + "epoch": 0.21762448454222583, + "grad_norm": 1.1652750968933105, + "learning_rate": 7.802700306933716e-06, + "loss": 0.5649, + "step": 4565 + }, + { + "epoch": 0.2176721569375253, + "grad_norm": 2.113891839981079, + "learning_rate": 7.798868500515106e-06, + "loss": 0.41, + "step": 4566 + }, + { + "epoch": 0.21771982933282483, + "grad_norm": 2.3827731609344482, + "learning_rate": 7.795037033708422e-06, + "loss": 1.0033, + "step": 4567 + }, + { + "epoch": 0.21776750172812434, + "grad_norm": 1.8273991346359253, + "learning_rate": 7.791205907104816e-06, + "loss": 0.906, + "step": 4568 + }, + { + "epoch": 0.21781517412342383, + "grad_norm": 1.6398130655288696, + "learning_rate": 7.78737512129539e-06, + "loss": 0.8026, + "step": 4569 + }, + { + "epoch": 0.21786284651872334, + "grad_norm": 1.1766680479049683, + "learning_rate": 7.783544676871202e-06, + "loss": 0.6919, + "step": 4570 + }, + { + "epoch": 0.21791051891402283, + "grad_norm": 3.8314054012298584, + "learning_rate": 7.779714574423241e-06, + "loss": 0.5949, + "step": 4571 + }, + { + "epoch": 0.21795819130932234, + "grad_norm": 2.741455078125, + "learning_rate": 7.775884814542457e-06, + "loss": 0.5654, + "step": 4572 + }, + { + "epoch": 0.21800586370462183, + "grad_norm": 1.0760807991027832, + "learning_rate": 7.772055397819745e-06, + "loss": 0.4576, + "step": 4573 + }, + { + "epoch": 0.21805353609992134, + "grad_norm": 4.20237922668457, + "learning_rate": 7.768226324845942e-06, + "loss": 0.1334, + "step": 4574 + }, + { + "epoch": 0.21810120849522086, + "grad_norm": 1.9570916891098022, + "learning_rate": 7.76439759621183e-06, + "loss": 0.7007, + "step": 4575 + }, + { + "epoch": 0.21814888089052034, + "grad_norm": 1.4350074529647827, + "learning_rate": 7.76056921250815e-06, + "loss": 0.6344, + "step": 4576 + }, + { + "epoch": 0.21819655328581986, + "grad_norm": 4.5398969650268555, + "learning_rate": 7.756741174325578e-06, + "loss": 0.9005, + "step": 4577 + }, + { + "epoch": 0.21824422568111934, + "grad_norm": 1.3192641735076904, + "learning_rate": 7.75291348225474e-06, + "loss": 0.5944, + "step": 4578 + }, + { + "epoch": 0.21829189807641886, + "grad_norm": 1.124534249305725, + "learning_rate": 7.749086136886215e-06, + "loss": 0.5157, + "step": 4579 + }, + { + "epoch": 0.21833957047171834, + "grad_norm": 1.8843111991882324, + "learning_rate": 7.745259138810514e-06, + "loss": 0.9855, + "step": 4580 + }, + { + "epoch": 0.21838724286701786, + "grad_norm": 1.7668653726577759, + "learning_rate": 7.741432488618112e-06, + "loss": 0.8871, + "step": 4581 + }, + { + "epoch": 0.21843491526231734, + "grad_norm": 2.983565092086792, + "learning_rate": 7.737606186899417e-06, + "loss": 0.6264, + "step": 4582 + }, + { + "epoch": 0.21848258765761686, + "grad_norm": 3.1347360610961914, + "learning_rate": 7.733780234244792e-06, + "loss": 0.722, + "step": 4583 + }, + { + "epoch": 0.21853026005291637, + "grad_norm": 1.5097100734710693, + "learning_rate": 7.729954631244536e-06, + "loss": 0.8209, + "step": 4584 + }, + { + "epoch": 0.21857793244821586, + "grad_norm": 1.6965235471725464, + "learning_rate": 7.726129378488907e-06, + "loss": 0.7019, + "step": 4585 + }, + { + "epoch": 0.21862560484351537, + "grad_norm": 4.577078819274902, + 
"learning_rate": 7.722304476568095e-06, + "loss": 0.3699, + "step": 4586 + }, + { + "epoch": 0.21867327723881486, + "grad_norm": 2.4130613803863525, + "learning_rate": 7.718479926072244e-06, + "loss": 1.2491, + "step": 4587 + }, + { + "epoch": 0.21872094963411437, + "grad_norm": 2.268475294113159, + "learning_rate": 7.714655727591452e-06, + "loss": 0.9074, + "step": 4588 + }, + { + "epoch": 0.21876862202941386, + "grad_norm": 1.489576816558838, + "learning_rate": 7.710831881715742e-06, + "loss": 0.6937, + "step": 4589 + }, + { + "epoch": 0.21881629442471337, + "grad_norm": 1.1661494970321655, + "learning_rate": 7.707008389035102e-06, + "loss": 0.5234, + "step": 4590 + }, + { + "epoch": 0.21886396682001288, + "grad_norm": 1.961639404296875, + "learning_rate": 7.703185250139455e-06, + "loss": 0.6295, + "step": 4591 + }, + { + "epoch": 0.21891163921531237, + "grad_norm": 2.7637739181518555, + "learning_rate": 7.699362465618667e-06, + "loss": 0.8126, + "step": 4592 + }, + { + "epoch": 0.21895931161061188, + "grad_norm": 2.8779118061065674, + "learning_rate": 7.695540036062559e-06, + "loss": 0.5957, + "step": 4593 + }, + { + "epoch": 0.21900698400591137, + "grad_norm": 1.3385227918624878, + "learning_rate": 7.691717962060892e-06, + "loss": 0.7985, + "step": 4594 + }, + { + "epoch": 0.21905465640121088, + "grad_norm": 1.152004599571228, + "learning_rate": 7.687896244203377e-06, + "loss": 0.7074, + "step": 4595 + }, + { + "epoch": 0.21910232879651037, + "grad_norm": 1.147142767906189, + "learning_rate": 7.68407488307966e-06, + "loss": 0.7639, + "step": 4596 + }, + { + "epoch": 0.21915000119180988, + "grad_norm": 1.3513474464416504, + "learning_rate": 7.680253879279335e-06, + "loss": 0.7616, + "step": 4597 + }, + { + "epoch": 0.21919767358710937, + "grad_norm": 1.3045693635940552, + "learning_rate": 7.676433233391955e-06, + "loss": 0.5044, + "step": 4598 + }, + { + "epoch": 0.21924534598240888, + "grad_norm": 2.069242477416992, + "learning_rate": 7.672612946006992e-06, + "loss": 0.7143, + "step": 4599 + }, + { + "epoch": 0.2192930183777084, + "grad_norm": 1.8670201301574707, + "learning_rate": 7.668793017713886e-06, + "loss": 0.3993, + "step": 4600 + }, + { + "epoch": 0.21934069077300788, + "grad_norm": 1.47934091091156, + "learning_rate": 7.664973449102013e-06, + "loss": 0.8515, + "step": 4601 + }, + { + "epoch": 0.2193883631683074, + "grad_norm": 1.5718576908111572, + "learning_rate": 7.661154240760687e-06, + "loss": 0.5841, + "step": 4602 + }, + { + "epoch": 0.21943603556360688, + "grad_norm": 2.0619256496429443, + "learning_rate": 7.657335393279179e-06, + "loss": 0.7204, + "step": 4603 + }, + { + "epoch": 0.2194837079589064, + "grad_norm": 1.712097406387329, + "learning_rate": 7.653516907246696e-06, + "loss": 0.8824, + "step": 4604 + }, + { + "epoch": 0.21953138035420589, + "grad_norm": 1.91115140914917, + "learning_rate": 7.649698783252388e-06, + "loss": 0.7008, + "step": 4605 + }, + { + "epoch": 0.2195790527495054, + "grad_norm": 1.3914875984191895, + "learning_rate": 7.645881021885353e-06, + "loss": 0.8336, + "step": 4606 + }, + { + "epoch": 0.2196267251448049, + "grad_norm": 1.1526566743850708, + "learning_rate": 7.642063623734638e-06, + "loss": 0.8268, + "step": 4607 + }, + { + "epoch": 0.2196743975401044, + "grad_norm": 2.9590299129486084, + "learning_rate": 7.63824658938922e-06, + "loss": 0.2128, + "step": 4608 + }, + { + "epoch": 0.2197220699354039, + "grad_norm": 1.3247638940811157, + "learning_rate": 7.63442991943803e-06, + "loss": 0.691, + "step": 4609 + }, + { + "epoch": 
0.2197697423307034, + "grad_norm": 1.7840033769607544, + "learning_rate": 7.630613614469948e-06, + "loss": 1.1023, + "step": 4610 + }, + { + "epoch": 0.2198174147260029, + "grad_norm": 1.563821792602539, + "learning_rate": 7.626797675073783e-06, + "loss": 0.4949, + "step": 4611 + }, + { + "epoch": 0.2198650871213024, + "grad_norm": 1.431036353111267, + "learning_rate": 7.6229821018382965e-06, + "loss": 0.4389, + "step": 4612 + }, + { + "epoch": 0.2199127595166019, + "grad_norm": 1.946836233139038, + "learning_rate": 7.619166895352197e-06, + "loss": 0.6832, + "step": 4613 + }, + { + "epoch": 0.21996043191190143, + "grad_norm": 1.4851797819137573, + "learning_rate": 7.615352056204124e-06, + "loss": 0.8744, + "step": 4614 + }, + { + "epoch": 0.2200081043072009, + "grad_norm": 1.1410810947418213, + "learning_rate": 7.61153758498267e-06, + "loss": 0.5084, + "step": 4615 + }, + { + "epoch": 0.22005577670250043, + "grad_norm": 1.4116249084472656, + "learning_rate": 7.607723482276375e-06, + "loss": 0.8347, + "step": 4616 + }, + { + "epoch": 0.2201034490977999, + "grad_norm": 1.4600305557250977, + "learning_rate": 7.6039097486737075e-06, + "loss": 0.7696, + "step": 4617 + }, + { + "epoch": 0.22015112149309943, + "grad_norm": 1.9674410820007324, + "learning_rate": 7.600096384763093e-06, + "loss": 0.6783, + "step": 4618 + }, + { + "epoch": 0.2201987938883989, + "grad_norm": 2.6363725662231445, + "learning_rate": 7.596283391132892e-06, + "loss": 0.5615, + "step": 4619 + }, + { + "epoch": 0.22024646628369843, + "grad_norm": 4.3245625495910645, + "learning_rate": 7.592470768371409e-06, + "loss": 0.6746, + "step": 4620 + }, + { + "epoch": 0.22029413867899791, + "grad_norm": 7.218118190765381, + "learning_rate": 7.588658517066893e-06, + "loss": 1.7143, + "step": 4621 + }, + { + "epoch": 0.22034181107429743, + "grad_norm": 1.9416069984436035, + "learning_rate": 7.5848466378075395e-06, + "loss": 0.7704, + "step": 4622 + }, + { + "epoch": 0.22038948346959694, + "grad_norm": 2.2492318153381348, + "learning_rate": 7.581035131181473e-06, + "loss": 0.6563, + "step": 4623 + }, + { + "epoch": 0.22043715586489643, + "grad_norm": 1.7324265241622925, + "learning_rate": 7.577223997776777e-06, + "loss": 0.5469, + "step": 4624 + }, + { + "epoch": 0.22048482826019594, + "grad_norm": 1.3318227529525757, + "learning_rate": 7.573413238181473e-06, + "loss": 0.7346, + "step": 4625 + }, + { + "epoch": 0.22053250065549543, + "grad_norm": 1.5449528694152832, + "learning_rate": 7.569602852983511e-06, + "loss": 0.54, + "step": 4626 + }, + { + "epoch": 0.22058017305079494, + "grad_norm": 1.5341260433197021, + "learning_rate": 7.565792842770805e-06, + "loss": 0.9212, + "step": 4627 + }, + { + "epoch": 0.22062784544609443, + "grad_norm": 0.9264002442359924, + "learning_rate": 7.561983208131196e-06, + "loss": 0.457, + "step": 4628 + }, + { + "epoch": 0.22067551784139394, + "grad_norm": 1.8454240560531616, + "learning_rate": 7.558173949652468e-06, + "loss": 1.1494, + "step": 4629 + }, + { + "epoch": 0.22072319023669346, + "grad_norm": 1.0589171648025513, + "learning_rate": 7.554365067922353e-06, + "loss": 0.5758, + "step": 4630 + }, + { + "epoch": 0.22077086263199294, + "grad_norm": 1.4372797012329102, + "learning_rate": 7.550556563528524e-06, + "loss": 0.9499, + "step": 4631 + }, + { + "epoch": 0.22081853502729246, + "grad_norm": 1.6198995113372803, + "learning_rate": 7.546748437058596e-06, + "loss": 0.9306, + "step": 4632 + }, + { + "epoch": 0.22086620742259194, + "grad_norm": 1.3839938640594482, + "learning_rate": 
7.542940689100117e-06, + "loss": 0.5114, + "step": 4633 + }, + { + "epoch": 0.22091387981789146, + "grad_norm": 1.7403932809829712, + "learning_rate": 7.539133320240589e-06, + "loss": 0.6705, + "step": 4634 + }, + { + "epoch": 0.22096155221319094, + "grad_norm": 1.2243397235870361, + "learning_rate": 7.53532633106745e-06, + "loss": 0.5372, + "step": 4635 + }, + { + "epoch": 0.22100922460849046, + "grad_norm": 1.1937931776046753, + "learning_rate": 7.531519722168072e-06, + "loss": 0.554, + "step": 4636 + }, + { + "epoch": 0.22105689700378994, + "grad_norm": 4.210997581481934, + "learning_rate": 7.527713494129781e-06, + "loss": 0.6425, + "step": 4637 + }, + { + "epoch": 0.22110456939908946, + "grad_norm": 1.3754652738571167, + "learning_rate": 7.523907647539841e-06, + "loss": 0.6326, + "step": 4638 + }, + { + "epoch": 0.22115224179438897, + "grad_norm": 2.4554967880249023, + "learning_rate": 7.520102182985449e-06, + "loss": 0.8312, + "step": 4639 + }, + { + "epoch": 0.22119991418968846, + "grad_norm": 1.6669466495513916, + "learning_rate": 7.516297101053754e-06, + "loss": 0.9231, + "step": 4640 + }, + { + "epoch": 0.22124758658498797, + "grad_norm": 1.7525601387023926, + "learning_rate": 7.51249240233184e-06, + "loss": 0.7735, + "step": 4641 + }, + { + "epoch": 0.22129525898028746, + "grad_norm": 1.7552474737167358, + "learning_rate": 7.508688087406731e-06, + "loss": 0.5516, + "step": 4642 + }, + { + "epoch": 0.22134293137558697, + "grad_norm": 1.4849187135696411, + "learning_rate": 7.504884156865393e-06, + "loss": 0.8757, + "step": 4643 + }, + { + "epoch": 0.22139060377088646, + "grad_norm": 1.1317781209945679, + "learning_rate": 7.501080611294739e-06, + "loss": 0.6283, + "step": 4644 + }, + { + "epoch": 0.22143827616618597, + "grad_norm": 1.029887080192566, + "learning_rate": 7.497277451281609e-06, + "loss": 0.6628, + "step": 4645 + }, + { + "epoch": 0.22148594856148549, + "grad_norm": 1.9708335399627686, + "learning_rate": 7.493474677412795e-06, + "loss": 0.7249, + "step": 4646 + }, + { + "epoch": 0.22153362095678497, + "grad_norm": 3.476801633834839, + "learning_rate": 7.48967229027503e-06, + "loss": 1.4375, + "step": 4647 + }, + { + "epoch": 0.22158129335208449, + "grad_norm": 1.4689197540283203, + "learning_rate": 7.485870290454974e-06, + "loss": 0.6284, + "step": 4648 + }, + { + "epoch": 0.22162896574738397, + "grad_norm": 2.1239750385284424, + "learning_rate": 7.482068678539245e-06, + "loss": 0.7225, + "step": 4649 + }, + { + "epoch": 0.22167663814268349, + "grad_norm": 1.2528247833251953, + "learning_rate": 7.478267455114391e-06, + "loss": 0.5359, + "step": 4650 + }, + { + "epoch": 0.22172431053798297, + "grad_norm": 6.784343242645264, + "learning_rate": 7.474466620766896e-06, + "loss": 1.3655, + "step": 4651 + }, + { + "epoch": 0.22177198293328249, + "grad_norm": 1.987514853477478, + "learning_rate": 7.470666176083193e-06, + "loss": 0.9451, + "step": 4652 + }, + { + "epoch": 0.22181965532858197, + "grad_norm": 1.7384040355682373, + "learning_rate": 7.466866121649656e-06, + "loss": 1.1158, + "step": 4653 + }, + { + "epoch": 0.22186732772388149, + "grad_norm": 1.190354824066162, + "learning_rate": 7.463066458052586e-06, + "loss": 0.7798, + "step": 4654 + }, + { + "epoch": 0.221915000119181, + "grad_norm": 1.2690644264221191, + "learning_rate": 7.4592671858782365e-06, + "loss": 0.6525, + "step": 4655 + }, + { + "epoch": 0.22196267251448049, + "grad_norm": 0.7563767433166504, + "learning_rate": 7.455468305712801e-06, + "loss": 0.1129, + "step": 4656 + }, + { + "epoch": 
0.22201034490978, + "grad_norm": 1.5915089845657349, + "learning_rate": 7.451669818142398e-06, + "loss": 0.505, + "step": 4657 + }, + { + "epoch": 0.22205801730507949, + "grad_norm": 1.7025084495544434, + "learning_rate": 7.447871723753098e-06, + "loss": 1.2199, + "step": 4658 + }, + { + "epoch": 0.222105689700379, + "grad_norm": 1.5906728506088257, + "learning_rate": 7.444074023130914e-06, + "loss": 0.639, + "step": 4659 + }, + { + "epoch": 0.22215336209567849, + "grad_norm": 1.7473750114440918, + "learning_rate": 7.440276716861783e-06, + "loss": 0.6684, + "step": 4660 + }, + { + "epoch": 0.222201034490978, + "grad_norm": 1.0748573541641235, + "learning_rate": 7.436479805531595e-06, + "loss": 0.2557, + "step": 4661 + }, + { + "epoch": 0.22224870688627751, + "grad_norm": 1.3791710138320923, + "learning_rate": 7.432683289726177e-06, + "loss": 0.7866, + "step": 4662 + }, + { + "epoch": 0.222296379281577, + "grad_norm": 2.493863821029663, + "learning_rate": 7.428887170031285e-06, + "loss": 1.1457, + "step": 4663 + }, + { + "epoch": 0.22234405167687651, + "grad_norm": 2.577347993850708, + "learning_rate": 7.425091447032629e-06, + "loss": 0.6169, + "step": 4664 + }, + { + "epoch": 0.222391724072176, + "grad_norm": 2.4438793659210205, + "learning_rate": 7.421296121315844e-06, + "loss": 1.3597, + "step": 4665 + }, + { + "epoch": 0.22243939646747551, + "grad_norm": 1.0990134477615356, + "learning_rate": 7.417501193466513e-06, + "loss": 0.8178, + "step": 4666 + }, + { + "epoch": 0.222487068862775, + "grad_norm": 1.9476672410964966, + "learning_rate": 7.413706664070151e-06, + "loss": 0.8329, + "step": 4667 + }, + { + "epoch": 0.22253474125807451, + "grad_norm": 2.050792694091797, + "learning_rate": 7.409912533712218e-06, + "loss": 1.0559, + "step": 4668 + }, + { + "epoch": 0.222582413653374, + "grad_norm": 2.358335494995117, + "learning_rate": 7.406118802978111e-06, + "loss": 0.497, + "step": 4669 + }, + { + "epoch": 0.22263008604867351, + "grad_norm": 2.9558937549591064, + "learning_rate": 7.402325472453158e-06, + "loss": 0.3128, + "step": 4670 + }, + { + "epoch": 0.22267775844397303, + "grad_norm": 1.6250839233398438, + "learning_rate": 7.398532542722635e-06, + "loss": 0.6273, + "step": 4671 + }, + { + "epoch": 0.22272543083927251, + "grad_norm": 1.8294248580932617, + "learning_rate": 7.394740014371753e-06, + "loss": 0.3294, + "step": 4672 + }, + { + "epoch": 0.22277310323457203, + "grad_norm": 1.7346724271774292, + "learning_rate": 7.390947887985654e-06, + "loss": 0.8005, + "step": 4673 + }, + { + "epoch": 0.22282077562987151, + "grad_norm": 3.926581621170044, + "learning_rate": 7.387156164149427e-06, + "loss": 0.8396, + "step": 4674 + }, + { + "epoch": 0.22286844802517103, + "grad_norm": 1.3853797912597656, + "learning_rate": 7.383364843448102e-06, + "loss": 0.7356, + "step": 4675 + }, + { + "epoch": 0.22291612042047051, + "grad_norm": 1.0769068002700806, + "learning_rate": 7.379573926466631e-06, + "loss": 0.7911, + "step": 4676 + }, + { + "epoch": 0.22296379281577003, + "grad_norm": 1.4001976251602173, + "learning_rate": 7.375783413789918e-06, + "loss": 0.9862, + "step": 4677 + }, + { + "epoch": 0.22301146521106954, + "grad_norm": 2.245253801345825, + "learning_rate": 7.371993306002804e-06, + "loss": 0.3392, + "step": 4678 + }, + { + "epoch": 0.22305913760636903, + "grad_norm": 1.1093387603759766, + "learning_rate": 7.368203603690057e-06, + "loss": 0.3007, + "step": 4679 + }, + { + "epoch": 0.22310681000166854, + "grad_norm": 1.3087025880813599, + "learning_rate": 7.36441430743639e-06, + 
"loss": 0.5277, + "step": 4680 + }, + { + "epoch": 0.22315448239696803, + "grad_norm": 1.7940627336502075, + "learning_rate": 7.360625417826459e-06, + "loss": 0.5785, + "step": 4681 + }, + { + "epoch": 0.22320215479226754, + "grad_norm": 1.332318902015686, + "learning_rate": 7.356836935444841e-06, + "loss": 0.5831, + "step": 4682 + }, + { + "epoch": 0.22324982718756703, + "grad_norm": 1.940137505531311, + "learning_rate": 7.3530488608760645e-06, + "loss": 0.6727, + "step": 4683 + }, + { + "epoch": 0.22329749958286654, + "grad_norm": 1.1690236330032349, + "learning_rate": 7.349261194704596e-06, + "loss": 0.82, + "step": 4684 + }, + { + "epoch": 0.22334517197816603, + "grad_norm": 1.2394042015075684, + "learning_rate": 7.345473937514822e-06, + "loss": 0.7523, + "step": 4685 + }, + { + "epoch": 0.22339284437346554, + "grad_norm": 2.288142442703247, + "learning_rate": 7.341687089891085e-06, + "loss": 1.1392, + "step": 4686 + }, + { + "epoch": 0.22344051676876506, + "grad_norm": 3.9231040477752686, + "learning_rate": 7.337900652417656e-06, + "loss": 0.4054, + "step": 4687 + }, + { + "epoch": 0.22348818916406454, + "grad_norm": 2.15238356590271, + "learning_rate": 7.334114625678741e-06, + "loss": 0.6911, + "step": 4688 + }, + { + "epoch": 0.22353586155936406, + "grad_norm": 2.52760910987854, + "learning_rate": 7.330329010258483e-06, + "loss": 1.0212, + "step": 4689 + }, + { + "epoch": 0.22358353395466354, + "grad_norm": 3.931504249572754, + "learning_rate": 7.3265438067409725e-06, + "loss": 0.8517, + "step": 4690 + }, + { + "epoch": 0.22363120634996306, + "grad_norm": 1.1890994310379028, + "learning_rate": 7.3227590157102165e-06, + "loss": 0.5002, + "step": 4691 + }, + { + "epoch": 0.22367887874526254, + "grad_norm": 1.3411054611206055, + "learning_rate": 7.318974637750174e-06, + "loss": 0.765, + "step": 4692 + }, + { + "epoch": 0.22372655114056206, + "grad_norm": 2.177300214767456, + "learning_rate": 7.31519067344474e-06, + "loss": 0.1258, + "step": 4693 + }, + { + "epoch": 0.22377422353586157, + "grad_norm": 2.613393545150757, + "learning_rate": 7.311407123377734e-06, + "loss": 0.738, + "step": 4694 + }, + { + "epoch": 0.22382189593116106, + "grad_norm": 5.053560733795166, + "learning_rate": 7.307623988132921e-06, + "loss": 1.2962, + "step": 4695 + }, + { + "epoch": 0.22386956832646057, + "grad_norm": 2.4341542720794678, + "learning_rate": 7.303841268294004e-06, + "loss": 0.4851, + "step": 4696 + }, + { + "epoch": 0.22391724072176006, + "grad_norm": 2.830789089202881, + "learning_rate": 7.30005896444461e-06, + "loss": 0.7733, + "step": 4697 + }, + { + "epoch": 0.22396491311705957, + "grad_norm": 2.6923956871032715, + "learning_rate": 7.2962770771683144e-06, + "loss": 0.5332, + "step": 4698 + }, + { + "epoch": 0.22401258551235906, + "grad_norm": 2.32131028175354, + "learning_rate": 7.292495607048626e-06, + "loss": 0.8116, + "step": 4699 + }, + { + "epoch": 0.22406025790765857, + "grad_norm": 2.7807183265686035, + "learning_rate": 7.28871455466898e-06, + "loss": 0.7625, + "step": 4700 + }, + { + "epoch": 0.22410793030295806, + "grad_norm": 1.4085941314697266, + "learning_rate": 7.284933920612759e-06, + "loss": 0.73, + "step": 4701 + }, + { + "epoch": 0.22415560269825757, + "grad_norm": 2.79194974899292, + "learning_rate": 7.281153705463275e-06, + "loss": 0.7639, + "step": 4702 + }, + { + "epoch": 0.2242032750935571, + "grad_norm": 1.8848598003387451, + "learning_rate": 7.277373909803774e-06, + "loss": 0.7303, + "step": 4703 + }, + { + "epoch": 0.22425094748885657, + "grad_norm": 
1.2991321086883545, + "learning_rate": 7.273594534217441e-06, + "loss": 0.6501, + "step": 4704 + }, + { + "epoch": 0.2242986198841561, + "grad_norm": 2.308899164199829, + "learning_rate": 7.269815579287398e-06, + "loss": 0.7138, + "step": 4705 + }, + { + "epoch": 0.22434629227945557, + "grad_norm": 1.8376412391662598, + "learning_rate": 7.266037045596692e-06, + "loss": 0.6759, + "step": 4706 + }, + { + "epoch": 0.2243939646747551, + "grad_norm": 2.57595157623291, + "learning_rate": 7.262258933728314e-06, + "loss": 1.0017, + "step": 4707 + }, + { + "epoch": 0.22444163707005457, + "grad_norm": 1.2827507257461548, + "learning_rate": 7.258481244265193e-06, + "loss": 0.6921, + "step": 4708 + }, + { + "epoch": 0.2244893094653541, + "grad_norm": 1.3698151111602783, + "learning_rate": 7.254703977790183e-06, + "loss": 0.8405, + "step": 4709 + }, + { + "epoch": 0.2245369818606536, + "grad_norm": 0.9687865376472473, + "learning_rate": 7.2509271348860785e-06, + "loss": 0.5002, + "step": 4710 + }, + { + "epoch": 0.2245846542559531, + "grad_norm": 0.9799315333366394, + "learning_rate": 7.247150716135605e-06, + "loss": 0.542, + "step": 4711 + }, + { + "epoch": 0.2246323266512526, + "grad_norm": 1.283604383468628, + "learning_rate": 7.243374722121431e-06, + "loss": 0.6338, + "step": 4712 + }, + { + "epoch": 0.2246799990465521, + "grad_norm": 1.9069286584854126, + "learning_rate": 7.2395991534261456e-06, + "loss": 0.89, + "step": 4713 + }, + { + "epoch": 0.2247276714418516, + "grad_norm": 2.6499481201171875, + "learning_rate": 7.235824010632284e-06, + "loss": 0.755, + "step": 4714 + }, + { + "epoch": 0.2247753438371511, + "grad_norm": 1.6619880199432373, + "learning_rate": 7.232049294322316e-06, + "loss": 0.7238, + "step": 4715 + }, + { + "epoch": 0.2248230162324506, + "grad_norm": 2.6634881496429443, + "learning_rate": 7.2282750050786374e-06, + "loss": 0.8087, + "step": 4716 + }, + { + "epoch": 0.22487068862775011, + "grad_norm": 1.4497495889663696, + "learning_rate": 7.2245011434835775e-06, + "loss": 0.7627, + "step": 4717 + }, + { + "epoch": 0.2249183610230496, + "grad_norm": 1.2503986358642578, + "learning_rate": 7.220727710119415e-06, + "loss": 0.6057, + "step": 4718 + }, + { + "epoch": 0.22496603341834912, + "grad_norm": 1.8429253101348877, + "learning_rate": 7.216954705568342e-06, + "loss": 0.5762, + "step": 4719 + }, + { + "epoch": 0.2250137058136486, + "grad_norm": 1.2374719381332397, + "learning_rate": 7.2131821304124974e-06, + "loss": 0.5254, + "step": 4720 + }, + { + "epoch": 0.22506137820894812, + "grad_norm": 6.295538902282715, + "learning_rate": 7.209409985233955e-06, + "loss": 0.9477, + "step": 4721 + }, + { + "epoch": 0.2251090506042476, + "grad_norm": 2.023261785507202, + "learning_rate": 7.20563827061471e-06, + "loss": 0.7076, + "step": 4722 + }, + { + "epoch": 0.22515672299954712, + "grad_norm": 2.349370241165161, + "learning_rate": 7.201866987136706e-06, + "loss": 0.6833, + "step": 4723 + }, + { + "epoch": 0.2252043953948466, + "grad_norm": 2.554931163787842, + "learning_rate": 7.198096135381811e-06, + "loss": 0.5366, + "step": 4724 + }, + { + "epoch": 0.22525206779014612, + "grad_norm": 1.7870793342590332, + "learning_rate": 7.1943257159318295e-06, + "loss": 0.7195, + "step": 4725 + }, + { + "epoch": 0.22529974018544563, + "grad_norm": 1.7255315780639648, + "learning_rate": 7.190555729368492e-06, + "loss": 0.8099, + "step": 4726 + }, + { + "epoch": 0.22534741258074512, + "grad_norm": 1.6862258911132812, + "learning_rate": 7.18678617627348e-06, + "loss": 0.6703, + "step": 4727 + }, + { 
+ "epoch": 0.22539508497604463, + "grad_norm": 2.1234540939331055, + "learning_rate": 7.183017057228386e-06, + "loss": 0.6835, + "step": 4728 + }, + { + "epoch": 0.22544275737134412, + "grad_norm": 2.145744562149048, + "learning_rate": 7.179248372814751e-06, + "loss": 0.9435, + "step": 4729 + }, + { + "epoch": 0.22549042976664363, + "grad_norm": 2.782910108566284, + "learning_rate": 7.175480123614048e-06, + "loss": 0.089, + "step": 4730 + }, + { + "epoch": 0.22553810216194312, + "grad_norm": 1.0296188592910767, + "learning_rate": 7.17171231020767e-06, + "loss": 0.5136, + "step": 4731 + }, + { + "epoch": 0.22558577455724263, + "grad_norm": 2.7136995792388916, + "learning_rate": 7.16794493317696e-06, + "loss": 0.4068, + "step": 4732 + }, + { + "epoch": 0.22563344695254214, + "grad_norm": 1.6317429542541504, + "learning_rate": 7.164177993103185e-06, + "loss": 0.6417, + "step": 4733 + }, + { + "epoch": 0.22568111934784163, + "grad_norm": 6.330113887786865, + "learning_rate": 7.160411490567536e-06, + "loss": 1.1275, + "step": 4734 + }, + { + "epoch": 0.22572879174314114, + "grad_norm": 3.2850711345672607, + "learning_rate": 7.156645426151154e-06, + "loss": 0.2795, + "step": 4735 + }, + { + "epoch": 0.22577646413844063, + "grad_norm": 1.601914644241333, + "learning_rate": 7.152879800435104e-06, + "loss": 0.5566, + "step": 4736 + }, + { + "epoch": 0.22582413653374014, + "grad_norm": 2.2140674591064453, + "learning_rate": 7.149114614000378e-06, + "loss": 1.1239, + "step": 4737 + }, + { + "epoch": 0.22587180892903963, + "grad_norm": 1.208021640777588, + "learning_rate": 7.145349867427911e-06, + "loss": 0.935, + "step": 4738 + }, + { + "epoch": 0.22591948132433914, + "grad_norm": 3.035499334335327, + "learning_rate": 7.141585561298563e-06, + "loss": 0.7962, + "step": 4739 + }, + { + "epoch": 0.22596715371963863, + "grad_norm": 2.8254246711730957, + "learning_rate": 7.137821696193126e-06, + "loss": 1.1237, + "step": 4740 + }, + { + "epoch": 0.22601482611493814, + "grad_norm": 1.474790334701538, + "learning_rate": 7.1340582726923235e-06, + "loss": 0.778, + "step": 4741 + }, + { + "epoch": 0.22606249851023766, + "grad_norm": 1.1334351301193237, + "learning_rate": 7.1302952913768205e-06, + "loss": 0.6643, + "step": 4742 + }, + { + "epoch": 0.22611017090553714, + "grad_norm": 1.326080560684204, + "learning_rate": 7.1265327528272e-06, + "loss": 0.7613, + "step": 4743 + }, + { + "epoch": 0.22615784330083666, + "grad_norm": 0.923225462436676, + "learning_rate": 7.122770657623982e-06, + "loss": 0.3463, + "step": 4744 + }, + { + "epoch": 0.22620551569613614, + "grad_norm": 1.6519964933395386, + "learning_rate": 7.119009006347625e-06, + "loss": 0.6567, + "step": 4745 + }, + { + "epoch": 0.22625318809143566, + "grad_norm": 1.6232659816741943, + "learning_rate": 7.1152477995785095e-06, + "loss": 0.9483, + "step": 4746 + }, + { + "epoch": 0.22630086048673514, + "grad_norm": 1.9227045774459839, + "learning_rate": 7.111487037896951e-06, + "loss": 0.6603, + "step": 4747 + }, + { + "epoch": 0.22634853288203466, + "grad_norm": 2.5088133811950684, + "learning_rate": 7.107726721883196e-06, + "loss": 0.9401, + "step": 4748 + }, + { + "epoch": 0.22639620527733417, + "grad_norm": 8.9476957321167, + "learning_rate": 7.1039668521174256e-06, + "loss": 0.7122, + "step": 4749 + }, + { + "epoch": 0.22644387767263366, + "grad_norm": 1.6333867311477661, + "learning_rate": 7.100207429179744e-06, + "loss": 0.7738, + "step": 4750 + }, + { + "epoch": 0.22649155006793317, + "grad_norm": 1.4185580015182495, + "learning_rate": 
7.096448453650193e-06, + "loss": 0.6116, + "step": 4751 + }, + { + "epoch": 0.22653922246323266, + "grad_norm": 2.734687328338623, + "learning_rate": 7.092689926108749e-06, + "loss": 0.7373, + "step": 4752 + }, + { + "epoch": 0.22658689485853217, + "grad_norm": 2.821073293685913, + "learning_rate": 7.088931847135305e-06, + "loss": 0.3472, + "step": 4753 + }, + { + "epoch": 0.22663456725383166, + "grad_norm": 2.1299304962158203, + "learning_rate": 7.085174217309703e-06, + "loss": 0.7028, + "step": 4754 + }, + { + "epoch": 0.22668223964913117, + "grad_norm": 1.0289126634597778, + "learning_rate": 7.081417037211702e-06, + "loss": 0.5725, + "step": 4755 + }, + { + "epoch": 0.22672991204443066, + "grad_norm": 1.468214988708496, + "learning_rate": 7.077660307420995e-06, + "loss": 0.7818, + "step": 4756 + }, + { + "epoch": 0.22677758443973017, + "grad_norm": 1.4464809894561768, + "learning_rate": 7.073904028517207e-06, + "loss": 0.6624, + "step": 4757 + }, + { + "epoch": 0.2268252568350297, + "grad_norm": 2.84568452835083, + "learning_rate": 7.070148201079898e-06, + "loss": 1.1413, + "step": 4758 + }, + { + "epoch": 0.22687292923032917, + "grad_norm": 2.1240897178649902, + "learning_rate": 7.066392825688546e-06, + "loss": 1.0717, + "step": 4759 + }, + { + "epoch": 0.2269206016256287, + "grad_norm": 1.1136767864227295, + "learning_rate": 7.0626379029225735e-06, + "loss": 0.714, + "step": 4760 + }, + { + "epoch": 0.22696827402092817, + "grad_norm": 1.9035533666610718, + "learning_rate": 7.058883433361323e-06, + "loss": 0.6541, + "step": 4761 + }, + { + "epoch": 0.2270159464162277, + "grad_norm": 2.2511048316955566, + "learning_rate": 7.05512941758407e-06, + "loss": 1.4494, + "step": 4762 + }, + { + "epoch": 0.22706361881152717, + "grad_norm": 2.730553150177002, + "learning_rate": 7.051375856170022e-06, + "loss": 0.9317, + "step": 4763 + }, + { + "epoch": 0.2271112912068267, + "grad_norm": 3.4477884769439697, + "learning_rate": 7.047622749698317e-06, + "loss": 1.2089, + "step": 4764 + }, + { + "epoch": 0.2271589636021262, + "grad_norm": 1.2691444158554077, + "learning_rate": 7.043870098748013e-06, + "loss": 0.5804, + "step": 4765 + }, + { + "epoch": 0.2272066359974257, + "grad_norm": 1.2987852096557617, + "learning_rate": 7.040117903898112e-06, + "loss": 0.5485, + "step": 4766 + }, + { + "epoch": 0.2272543083927252, + "grad_norm": 3.465034246444702, + "learning_rate": 7.036366165727542e-06, + "loss": 0.7205, + "step": 4767 + }, + { + "epoch": 0.2273019807880247, + "grad_norm": 2.7303783893585205, + "learning_rate": 7.0326148848151485e-06, + "loss": 0.7658, + "step": 4768 + }, + { + "epoch": 0.2273496531833242, + "grad_norm": 2.511943817138672, + "learning_rate": 7.028864061739722e-06, + "loss": 0.7951, + "step": 4769 + }, + { + "epoch": 0.2273973255786237, + "grad_norm": 1.3755884170532227, + "learning_rate": 7.025113697079977e-06, + "loss": 0.7083, + "step": 4770 + }, + { + "epoch": 0.2274449979739232, + "grad_norm": 1.078394889831543, + "learning_rate": 7.021363791414548e-06, + "loss": 0.3954, + "step": 4771 + }, + { + "epoch": 0.2274926703692227, + "grad_norm": 2.0816476345062256, + "learning_rate": 7.017614345322012e-06, + "loss": 0.6315, + "step": 4772 + }, + { + "epoch": 0.2275403427645222, + "grad_norm": 2.6804473400115967, + "learning_rate": 7.0138653593808736e-06, + "loss": 0.4759, + "step": 4773 + }, + { + "epoch": 0.22758801515982172, + "grad_norm": 1.5941988229751587, + "learning_rate": 7.0101168341695556e-06, + "loss": 0.8696, + "step": 4774 + }, + { + "epoch": 0.2276356875551212, + 
"grad_norm": 1.244204044342041, + "learning_rate": 7.006368770266421e-06, + "loss": 0.6185, + "step": 4775 + }, + { + "epoch": 0.22768335995042072, + "grad_norm": 2.2992420196533203, + "learning_rate": 7.002621168249759e-06, + "loss": 0.4614, + "step": 4776 + }, + { + "epoch": 0.2277310323457202, + "grad_norm": 1.6140018701553345, + "learning_rate": 6.998874028697782e-06, + "loss": 0.3395, + "step": 4777 + }, + { + "epoch": 0.22777870474101972, + "grad_norm": 3.8199462890625, + "learning_rate": 6.995127352188635e-06, + "loss": 0.8374, + "step": 4778 + }, + { + "epoch": 0.2278263771363192, + "grad_norm": 2.1203508377075195, + "learning_rate": 6.9913811393003985e-06, + "loss": 1.0732, + "step": 4779 + }, + { + "epoch": 0.22787404953161872, + "grad_norm": 1.535758137702942, + "learning_rate": 6.987635390611065e-06, + "loss": 0.7637, + "step": 4780 + }, + { + "epoch": 0.22792172192691823, + "grad_norm": 1.904703140258789, + "learning_rate": 6.983890106698567e-06, + "loss": 1.0583, + "step": 4781 + }, + { + "epoch": 0.22796939432221772, + "grad_norm": 1.5071921348571777, + "learning_rate": 6.980145288140772e-06, + "loss": 0.635, + "step": 4782 + }, + { + "epoch": 0.22801706671751723, + "grad_norm": 1.9088568687438965, + "learning_rate": 6.976400935515457e-06, + "loss": 0.6964, + "step": 4783 + }, + { + "epoch": 0.22806473911281672, + "grad_norm": 1.2707308530807495, + "learning_rate": 6.972657049400342e-06, + "loss": 0.8825, + "step": 4784 + }, + { + "epoch": 0.22811241150811623, + "grad_norm": 1.8479267358779907, + "learning_rate": 6.968913630373066e-06, + "loss": 0.7153, + "step": 4785 + }, + { + "epoch": 0.22816008390341572, + "grad_norm": 2.0549099445343018, + "learning_rate": 6.965170679011207e-06, + "loss": 0.9486, + "step": 4786 + }, + { + "epoch": 0.22820775629871523, + "grad_norm": 1.1440856456756592, + "learning_rate": 6.961428195892256e-06, + "loss": 0.6683, + "step": 4787 + }, + { + "epoch": 0.22825542869401472, + "grad_norm": 2.016913414001465, + "learning_rate": 6.957686181593642e-06, + "loss": 0.36, + "step": 4788 + }, + { + "epoch": 0.22830310108931423, + "grad_norm": 3.395559549331665, + "learning_rate": 6.953944636692727e-06, + "loss": 1.0265, + "step": 4789 + }, + { + "epoch": 0.22835077348461374, + "grad_norm": 1.9130994081497192, + "learning_rate": 6.95020356176678e-06, + "loss": 0.7371, + "step": 4790 + }, + { + "epoch": 0.22839844587991323, + "grad_norm": 5.80994176864624, + "learning_rate": 6.946462957393019e-06, + "loss": 0.8319, + "step": 4791 + }, + { + "epoch": 0.22844611827521275, + "grad_norm": 2.279386043548584, + "learning_rate": 6.94272282414858e-06, + "loss": 0.7302, + "step": 4792 + }, + { + "epoch": 0.22849379067051223, + "grad_norm": 1.6140658855438232, + "learning_rate": 6.938983162610522e-06, + "loss": 0.6806, + "step": 4793 + }, + { + "epoch": 0.22854146306581175, + "grad_norm": 1.4206973314285278, + "learning_rate": 6.935243973355839e-06, + "loss": 0.7579, + "step": 4794 + }, + { + "epoch": 0.22858913546111123, + "grad_norm": 1.15697181224823, + "learning_rate": 6.931505256961454e-06, + "loss": 0.3138, + "step": 4795 + }, + { + "epoch": 0.22863680785641075, + "grad_norm": 1.5886199474334717, + "learning_rate": 6.9277670140042055e-06, + "loss": 0.5657, + "step": 4796 + }, + { + "epoch": 0.22868448025171026, + "grad_norm": 1.677321195602417, + "learning_rate": 6.924029245060868e-06, + "loss": 0.7028, + "step": 4797 + }, + { + "epoch": 0.22873215264700975, + "grad_norm": 1.933491826057434, + "learning_rate": 6.920291950708144e-06, + "loss": 0.9893, + 
"step": 4798 + }, + { + "epoch": 0.22877982504230926, + "grad_norm": 1.786095142364502, + "learning_rate": 6.916555131522657e-06, + "loss": 0.5504, + "step": 4799 + }, + { + "epoch": 0.22882749743760875, + "grad_norm": 0.9630317687988281, + "learning_rate": 6.9128187880809595e-06, + "loss": 0.3825, + "step": 4800 + }, + { + "epoch": 0.22887516983290826, + "grad_norm": 1.4966472387313843, + "learning_rate": 6.909082920959534e-06, + "loss": 0.9508, + "step": 4801 + }, + { + "epoch": 0.22892284222820775, + "grad_norm": 1.7875064611434937, + "learning_rate": 6.905347530734778e-06, + "loss": 0.6317, + "step": 4802 + }, + { + "epoch": 0.22897051462350726, + "grad_norm": 1.6654691696166992, + "learning_rate": 6.90161261798303e-06, + "loss": 0.9723, + "step": 4803 + }, + { + "epoch": 0.22901818701880677, + "grad_norm": 1.6923191547393799, + "learning_rate": 6.897878183280553e-06, + "loss": 0.4879, + "step": 4804 + }, + { + "epoch": 0.22906585941410626, + "grad_norm": 2.3821890354156494, + "learning_rate": 6.894144227203521e-06, + "loss": 0.2866, + "step": 4805 + }, + { + "epoch": 0.22911353180940577, + "grad_norm": 1.1901254653930664, + "learning_rate": 6.890410750328054e-06, + "loss": 0.7887, + "step": 4806 + }, + { + "epoch": 0.22916120420470526, + "grad_norm": 4.020439624786377, + "learning_rate": 6.886677753230184e-06, + "loss": 0.3801, + "step": 4807 + }, + { + "epoch": 0.22920887660000477, + "grad_norm": 1.9434716701507568, + "learning_rate": 6.8829452364858776e-06, + "loss": 0.5932, + "step": 4808 + }, + { + "epoch": 0.22925654899530426, + "grad_norm": 1.2128325700759888, + "learning_rate": 6.8792132006710175e-06, + "loss": 0.784, + "step": 4809 + }, + { + "epoch": 0.22930422139060377, + "grad_norm": 2.2217557430267334, + "learning_rate": 6.875481646361428e-06, + "loss": 0.6519, + "step": 4810 + }, + { + "epoch": 0.22935189378590326, + "grad_norm": 0.9565367102622986, + "learning_rate": 6.871750574132841e-06, + "loss": 0.5196, + "step": 4811 + }, + { + "epoch": 0.22939956618120277, + "grad_norm": 1.2641178369522095, + "learning_rate": 6.868019984560925e-06, + "loss": 0.7952, + "step": 4812 + }, + { + "epoch": 0.2294472385765023, + "grad_norm": 2.4499106407165527, + "learning_rate": 6.864289878221275e-06, + "loss": 0.655, + "step": 4813 + }, + { + "epoch": 0.22949491097180177, + "grad_norm": 0.9127827882766724, + "learning_rate": 6.8605602556894056e-06, + "loss": 0.3018, + "step": 4814 + }, + { + "epoch": 0.2295425833671013, + "grad_norm": 1.6209328174591064, + "learning_rate": 6.8568311175407546e-06, + "loss": 0.7411, + "step": 4815 + }, + { + "epoch": 0.22959025576240077, + "grad_norm": 1.4945902824401855, + "learning_rate": 6.853102464350698e-06, + "loss": 0.7779, + "step": 4816 + }, + { + "epoch": 0.2296379281577003, + "grad_norm": 2.6084814071655273, + "learning_rate": 6.849374296694522e-06, + "loss": 0.6965, + "step": 4817 + }, + { + "epoch": 0.22968560055299977, + "grad_norm": 1.3233808279037476, + "learning_rate": 6.845646615147445e-06, + "loss": 0.563, + "step": 4818 + }, + { + "epoch": 0.2297332729482993, + "grad_norm": 1.3300272226333618, + "learning_rate": 6.841919420284618e-06, + "loss": 0.2962, + "step": 4819 + }, + { + "epoch": 0.2297809453435988, + "grad_norm": 1.5891857147216797, + "learning_rate": 6.8381927126810965e-06, + "loss": 0.709, + "step": 4820 + }, + { + "epoch": 0.2298286177388983, + "grad_norm": 1.7181260585784912, + "learning_rate": 6.834466492911882e-06, + "loss": 0.5133, + "step": 4821 + }, + { + "epoch": 0.2298762901341978, + "grad_norm": 1.556504726409912, + 
"learning_rate": 6.8307407615518865e-06, + "loss": 0.8262, + "step": 4822 + }, + { + "epoch": 0.2299239625294973, + "grad_norm": 1.3471596240997314, + "learning_rate": 6.827015519175958e-06, + "loss": 0.7419, + "step": 4823 + }, + { + "epoch": 0.2299716349247968, + "grad_norm": 2.29951548576355, + "learning_rate": 6.823290766358857e-06, + "loss": 0.4747, + "step": 4824 + }, + { + "epoch": 0.2300193073200963, + "grad_norm": 1.8909499645233154, + "learning_rate": 6.819566503675274e-06, + "loss": 0.37, + "step": 4825 + }, + { + "epoch": 0.2300669797153958, + "grad_norm": 1.6186764240264893, + "learning_rate": 6.815842731699834e-06, + "loss": 0.2767, + "step": 4826 + }, + { + "epoch": 0.2301146521106953, + "grad_norm": 2.155027151107788, + "learning_rate": 6.812119451007067e-06, + "loss": 0.6642, + "step": 4827 + }, + { + "epoch": 0.2301623245059948, + "grad_norm": 2.7167179584503174, + "learning_rate": 6.808396662171439e-06, + "loss": 0.7051, + "step": 4828 + }, + { + "epoch": 0.23020999690129432, + "grad_norm": 3.2606234550476074, + "learning_rate": 6.804674365767341e-06, + "loss": 0.7588, + "step": 4829 + }, + { + "epoch": 0.2302576692965938, + "grad_norm": 1.379020094871521, + "learning_rate": 6.8009525623690805e-06, + "loss": 0.8912, + "step": 4830 + }, + { + "epoch": 0.23030534169189332, + "grad_norm": 1.795343041419983, + "learning_rate": 6.797231252550895e-06, + "loss": 0.6966, + "step": 4831 + }, + { + "epoch": 0.2303530140871928, + "grad_norm": 1.3029494285583496, + "learning_rate": 6.793510436886951e-06, + "loss": 0.804, + "step": 4832 + }, + { + "epoch": 0.23040068648249232, + "grad_norm": 1.163956880569458, + "learning_rate": 6.78979011595132e-06, + "loss": 0.9016, + "step": 4833 + }, + { + "epoch": 0.2304483588777918, + "grad_norm": 1.42453134059906, + "learning_rate": 6.7860702903180165e-06, + "loss": 0.6412, + "step": 4834 + }, + { + "epoch": 0.23049603127309132, + "grad_norm": 1.3722329139709473, + "learning_rate": 6.782350960560973e-06, + "loss": 0.7591, + "step": 4835 + }, + { + "epoch": 0.23054370366839083, + "grad_norm": 1.4869694709777832, + "learning_rate": 6.778632127254039e-06, + "loss": 0.6528, + "step": 4836 + }, + { + "epoch": 0.23059137606369032, + "grad_norm": 1.0891196727752686, + "learning_rate": 6.774913790970994e-06, + "loss": 0.5512, + "step": 4837 + }, + { + "epoch": 0.23063904845898983, + "grad_norm": 1.421276569366455, + "learning_rate": 6.771195952285541e-06, + "loss": 0.2438, + "step": 4838 + }, + { + "epoch": 0.23068672085428932, + "grad_norm": 1.6381943225860596, + "learning_rate": 6.7674786117712985e-06, + "loss": 1.0994, + "step": 4839 + }, + { + "epoch": 0.23073439324958883, + "grad_norm": 1.414202094078064, + "learning_rate": 6.763761770001817e-06, + "loss": 0.6413, + "step": 4840 + }, + { + "epoch": 0.23078206564488832, + "grad_norm": 1.2982137203216553, + "learning_rate": 6.760045427550574e-06, + "loss": 0.6958, + "step": 4841 + }, + { + "epoch": 0.23082973804018783, + "grad_norm": 3.3846471309661865, + "learning_rate": 6.75632958499095e-06, + "loss": 1.2324, + "step": 4842 + }, + { + "epoch": 0.23087741043548732, + "grad_norm": 1.6973671913146973, + "learning_rate": 6.752614242896271e-06, + "loss": 0.7026, + "step": 4843 + }, + { + "epoch": 0.23092508283078683, + "grad_norm": 0.733379602432251, + "learning_rate": 6.748899401839774e-06, + "loss": 0.2149, + "step": 4844 + }, + { + "epoch": 0.23097275522608635, + "grad_norm": 2.3345930576324463, + "learning_rate": 6.745185062394617e-06, + "loss": 0.6395, + "step": 4845 + }, + { + "epoch": 
0.23102042762138583, + "grad_norm": 2.2964675426483154, + "learning_rate": 6.741471225133886e-06, + "loss": 1.4405, + "step": 4846 + }, + { + "epoch": 0.23106810001668535, + "grad_norm": 1.3115711212158203, + "learning_rate": 6.737757890630593e-06, + "loss": 0.9501, + "step": 4847 + }, + { + "epoch": 0.23111577241198483, + "grad_norm": 1.46547269821167, + "learning_rate": 6.734045059457658e-06, + "loss": 0.8141, + "step": 4848 + }, + { + "epoch": 0.23116344480728435, + "grad_norm": 1.3806743621826172, + "learning_rate": 6.7303327321879375e-06, + "loss": 0.6733, + "step": 4849 + }, + { + "epoch": 0.23121111720258383, + "grad_norm": 1.0296831130981445, + "learning_rate": 6.7266209093942104e-06, + "loss": 0.6701, + "step": 4850 + }, + { + "epoch": 0.23125878959788335, + "grad_norm": 2.1191413402557373, + "learning_rate": 6.722909591649163e-06, + "loss": 0.6614, + "step": 4851 + }, + { + "epoch": 0.23130646199318286, + "grad_norm": 1.48680579662323, + "learning_rate": 6.7191987795254195e-06, + "loss": 0.765, + "step": 4852 + }, + { + "epoch": 0.23135413438848235, + "grad_norm": 1.0230731964111328, + "learning_rate": 6.715488473595522e-06, + "loss": 0.6681, + "step": 4853 + }, + { + "epoch": 0.23140180678378186, + "grad_norm": 1.825308918952942, + "learning_rate": 6.7117786744319235e-06, + "loss": 0.7449, + "step": 4854 + }, + { + "epoch": 0.23144947917908135, + "grad_norm": 1.675789475440979, + "learning_rate": 6.708069382607015e-06, + "loss": 0.7855, + "step": 4855 + }, + { + "epoch": 0.23149715157438086, + "grad_norm": 1.8206390142440796, + "learning_rate": 6.704360598693103e-06, + "loss": 0.541, + "step": 4856 + }, + { + "epoch": 0.23154482396968035, + "grad_norm": 2.795661449432373, + "learning_rate": 6.700652323262409e-06, + "loss": 1.0867, + "step": 4857 + }, + { + "epoch": 0.23159249636497986, + "grad_norm": 1.0484724044799805, + "learning_rate": 6.696944556887086e-06, + "loss": 0.41, + "step": 4858 + }, + { + "epoch": 0.23164016876027935, + "grad_norm": 2.431704044342041, + "learning_rate": 6.693237300139201e-06, + "loss": 0.6234, + "step": 4859 + }, + { + "epoch": 0.23168784115557886, + "grad_norm": 1.1742595434188843, + "learning_rate": 6.6895305535907515e-06, + "loss": 0.6184, + "step": 4860 + }, + { + "epoch": 0.23173551355087837, + "grad_norm": 1.6406340599060059, + "learning_rate": 6.6858243178136425e-06, + "loss": 0.5902, + "step": 4861 + }, + { + "epoch": 0.23178318594617786, + "grad_norm": 4.000977039337158, + "learning_rate": 6.682118593379713e-06, + "loss": 1.3003, + "step": 4862 + }, + { + "epoch": 0.23183085834147737, + "grad_norm": 2.7752816677093506, + "learning_rate": 6.67841338086072e-06, + "loss": 0.9206, + "step": 4863 + }, + { + "epoch": 0.23187853073677686, + "grad_norm": 1.4836485385894775, + "learning_rate": 6.674708680828332e-06, + "loss": 0.5691, + "step": 4864 + }, + { + "epoch": 0.23192620313207638, + "grad_norm": 1.0134133100509644, + "learning_rate": 6.671004493854154e-06, + "loss": 0.624, + "step": 4865 + }, + { + "epoch": 0.23197387552737586, + "grad_norm": 1.5512118339538574, + "learning_rate": 6.6673008205097e-06, + "loss": 0.5365, + "step": 4866 + }, + { + "epoch": 0.23202154792267538, + "grad_norm": 1.4838773012161255, + "learning_rate": 6.66359766136641e-06, + "loss": 0.4869, + "step": 4867 + }, + { + "epoch": 0.2320692203179749, + "grad_norm": 2.7635140419006348, + "learning_rate": 6.659895016995639e-06, + "loss": 0.4451, + "step": 4868 + }, + { + "epoch": 0.23211689271327438, + "grad_norm": 2.415874719619751, + "learning_rate": 
6.656192887968675e-06, + "loss": 1.0728, + "step": 4869 + }, + { + "epoch": 0.2321645651085739, + "grad_norm": 1.112857699394226, + "learning_rate": 6.652491274856711e-06, + "loss": 0.5404, + "step": 4870 + }, + { + "epoch": 0.23221223750387338, + "grad_norm": 2.001843214035034, + "learning_rate": 6.6487901782308685e-06, + "loss": 0.6603, + "step": 4871 + }, + { + "epoch": 0.2322599098991729, + "grad_norm": 1.4128066301345825, + "learning_rate": 6.645089598662197e-06, + "loss": 0.8395, + "step": 4872 + }, + { + "epoch": 0.23230758229447238, + "grad_norm": 1.0326107740402222, + "learning_rate": 6.641389536721646e-06, + "loss": 0.393, + "step": 4873 + }, + { + "epoch": 0.2323552546897719, + "grad_norm": 1.463822364807129, + "learning_rate": 6.637689992980105e-06, + "loss": 0.6671, + "step": 4874 + }, + { + "epoch": 0.23240292708507138, + "grad_norm": 1.4672478437423706, + "learning_rate": 6.633990968008374e-06, + "loss": 0.775, + "step": 4875 + }, + { + "epoch": 0.2324505994803709, + "grad_norm": 1.0843429565429688, + "learning_rate": 6.630292462377172e-06, + "loss": 0.6534, + "step": 4876 + }, + { + "epoch": 0.2324982718756704, + "grad_norm": 1.7352875471115112, + "learning_rate": 6.62659447665714e-06, + "loss": 0.6898, + "step": 4877 + }, + { + "epoch": 0.2325459442709699, + "grad_norm": 1.6229850053787231, + "learning_rate": 6.622897011418845e-06, + "loss": 1.1481, + "step": 4878 + }, + { + "epoch": 0.2325936166662694, + "grad_norm": 1.8289846181869507, + "learning_rate": 6.619200067232758e-06, + "loss": 0.3906, + "step": 4879 + }, + { + "epoch": 0.2326412890615689, + "grad_norm": 1.3244634866714478, + "learning_rate": 6.6155036446692895e-06, + "loss": 0.812, + "step": 4880 + }, + { + "epoch": 0.2326889614568684, + "grad_norm": 1.527397871017456, + "learning_rate": 6.6118077442987545e-06, + "loss": 0.8692, + "step": 4881 + }, + { + "epoch": 0.2327366338521679, + "grad_norm": 2.2919743061065674, + "learning_rate": 6.608112366691393e-06, + "loss": 0.5707, + "step": 4882 + }, + { + "epoch": 0.2327843062474674, + "grad_norm": 3.870281934738159, + "learning_rate": 6.604417512417362e-06, + "loss": 0.2523, + "step": 4883 + }, + { + "epoch": 0.23283197864276692, + "grad_norm": 1.2963998317718506, + "learning_rate": 6.600723182046744e-06, + "loss": 0.6588, + "step": 4884 + }, + { + "epoch": 0.2328796510380664, + "grad_norm": 1.6692534685134888, + "learning_rate": 6.5970293761495305e-06, + "loss": 0.7753, + "step": 4885 + }, + { + "epoch": 0.23292732343336592, + "grad_norm": 1.7019098997116089, + "learning_rate": 6.593336095295639e-06, + "loss": 0.6004, + "step": 4886 + }, + { + "epoch": 0.2329749958286654, + "grad_norm": 1.8671505451202393, + "learning_rate": 6.589643340054911e-06, + "loss": 0.9987, + "step": 4887 + }, + { + "epoch": 0.23302266822396492, + "grad_norm": 1.5101096630096436, + "learning_rate": 6.585951110997092e-06, + "loss": 0.8756, + "step": 4888 + }, + { + "epoch": 0.2330703406192644, + "grad_norm": 1.9507817029953003, + "learning_rate": 6.58225940869186e-06, + "loss": 1.2067, + "step": 4889 + }, + { + "epoch": 0.23311801301456392, + "grad_norm": 3.053358554840088, + "learning_rate": 6.5785682337088085e-06, + "loss": 0.6946, + "step": 4890 + }, + { + "epoch": 0.23316568540986343, + "grad_norm": 1.3668612241744995, + "learning_rate": 6.574877586617439e-06, + "loss": 0.5651, + "step": 4891 + }, + { + "epoch": 0.23321335780516292, + "grad_norm": 1.4881539344787598, + "learning_rate": 6.571187467987187e-06, + "loss": 0.656, + "step": 4892 + }, + { + "epoch": 0.23326103020046243, + 
"grad_norm": 1.0348079204559326, + "learning_rate": 6.567497878387402e-06, + "loss": 0.4377, + "step": 4893 + }, + { + "epoch": 0.23330870259576192, + "grad_norm": 1.7877459526062012, + "learning_rate": 6.563808818387342e-06, + "loss": 0.793, + "step": 4894 + }, + { + "epoch": 0.23335637499106143, + "grad_norm": 2.9496564865112305, + "learning_rate": 6.560120288556197e-06, + "loss": 0.5007, + "step": 4895 + }, + { + "epoch": 0.23340404738636092, + "grad_norm": 1.8644105195999146, + "learning_rate": 6.5564322894630705e-06, + "loss": 0.7221, + "step": 4896 + }, + { + "epoch": 0.23345171978166043, + "grad_norm": 2.8216748237609863, + "learning_rate": 6.552744821676978e-06, + "loss": 0.8326, + "step": 4897 + }, + { + "epoch": 0.23349939217695992, + "grad_norm": 1.0786311626434326, + "learning_rate": 6.549057885766859e-06, + "loss": 0.6261, + "step": 4898 + }, + { + "epoch": 0.23354706457225943, + "grad_norm": 1.6184810400009155, + "learning_rate": 6.545371482301568e-06, + "loss": 0.6217, + "step": 4899 + }, + { + "epoch": 0.23359473696755895, + "grad_norm": 1.9159892797470093, + "learning_rate": 6.5416856118498874e-06, + "loss": 0.7225, + "step": 4900 + }, + { + "epoch": 0.23364240936285843, + "grad_norm": 2.7812931537628174, + "learning_rate": 6.538000274980498e-06, + "loss": 0.7393, + "step": 4901 + }, + { + "epoch": 0.23369008175815795, + "grad_norm": 2.1245625019073486, + "learning_rate": 6.5343154722620174e-06, + "loss": 0.7782, + "step": 4902 + }, + { + "epoch": 0.23373775415345743, + "grad_norm": 2.2408251762390137, + "learning_rate": 6.53063120426297e-06, + "loss": 1.0819, + "step": 4903 + }, + { + "epoch": 0.23378542654875695, + "grad_norm": 1.921726107597351, + "learning_rate": 6.526947471551799e-06, + "loss": 0.6288, + "step": 4904 + }, + { + "epoch": 0.23383309894405643, + "grad_norm": 2.049384593963623, + "learning_rate": 6.5232642746968655e-06, + "loss": 0.6967, + "step": 4905 + }, + { + "epoch": 0.23388077133935595, + "grad_norm": 1.054907202720642, + "learning_rate": 6.519581614266456e-06, + "loss": 0.8852, + "step": 4906 + }, + { + "epoch": 0.23392844373465546, + "grad_norm": 1.905211091041565, + "learning_rate": 6.515899490828758e-06, + "loss": 0.5267, + "step": 4907 + }, + { + "epoch": 0.23397611612995495, + "grad_norm": 2.7187602519989014, + "learning_rate": 6.512217904951889e-06, + "loss": 1.2232, + "step": 4908 + }, + { + "epoch": 0.23402378852525446, + "grad_norm": 1.1157643795013428, + "learning_rate": 6.508536857203884e-06, + "loss": 0.4358, + "step": 4909 + }, + { + "epoch": 0.23407146092055395, + "grad_norm": 1.4176524877548218, + "learning_rate": 6.504856348152682e-06, + "loss": 0.6752, + "step": 4910 + }, + { + "epoch": 0.23411913331585346, + "grad_norm": 1.8605366945266724, + "learning_rate": 6.5011763783661564e-06, + "loss": 0.5545, + "step": 4911 + }, + { + "epoch": 0.23416680571115295, + "grad_norm": 7.0309834480285645, + "learning_rate": 6.497496948412085e-06, + "loss": 0.5398, + "step": 4912 + }, + { + "epoch": 0.23421447810645246, + "grad_norm": 1.5447821617126465, + "learning_rate": 6.493818058858161e-06, + "loss": 0.4848, + "step": 4913 + }, + { + "epoch": 0.23426215050175195, + "grad_norm": 0.7650623321533203, + "learning_rate": 6.490139710272005e-06, + "loss": 0.2541, + "step": 4914 + }, + { + "epoch": 0.23430982289705146, + "grad_norm": 3.0292134284973145, + "learning_rate": 6.486461903221153e-06, + "loss": 0.8596, + "step": 4915 + }, + { + "epoch": 0.23435749529235098, + "grad_norm": 1.4401601552963257, + "learning_rate": 6.482784638273041e-06, + 
"loss": 0.3891, + "step": 4916 + }, + { + "epoch": 0.23440516768765046, + "grad_norm": 1.506598711013794, + "learning_rate": 6.479107915995038e-06, + "loss": 0.7357, + "step": 4917 + }, + { + "epoch": 0.23445284008294998, + "grad_norm": 1.2489334344863892, + "learning_rate": 6.475431736954431e-06, + "loss": 0.7784, + "step": 4918 + }, + { + "epoch": 0.23450051247824946, + "grad_norm": 2.1433074474334717, + "learning_rate": 6.471756101718408e-06, + "loss": 0.6334, + "step": 4919 + }, + { + "epoch": 0.23454818487354898, + "grad_norm": 2.1819376945495605, + "learning_rate": 6.468081010854084e-06, + "loss": 0.4933, + "step": 4920 + }, + { + "epoch": 0.23459585726884846, + "grad_norm": 1.8046916723251343, + "learning_rate": 6.46440646492849e-06, + "loss": 0.4785, + "step": 4921 + }, + { + "epoch": 0.23464352966414798, + "grad_norm": 1.495320439338684, + "learning_rate": 6.460732464508567e-06, + "loss": 0.9471, + "step": 4922 + }, + { + "epoch": 0.2346912020594475, + "grad_norm": 1.4423713684082031, + "learning_rate": 6.4570590101611765e-06, + "loss": 0.7328, + "step": 4923 + }, + { + "epoch": 0.23473887445474698, + "grad_norm": 2.900038719177246, + "learning_rate": 6.453386102453099e-06, + "loss": 0.8906, + "step": 4924 + }, + { + "epoch": 0.2347865468500465, + "grad_norm": 1.3181345462799072, + "learning_rate": 6.449713741951021e-06, + "loss": 0.731, + "step": 4925 + }, + { + "epoch": 0.23483421924534598, + "grad_norm": 1.8139119148254395, + "learning_rate": 6.446041929221551e-06, + "loss": 0.7712, + "step": 4926 + }, + { + "epoch": 0.2348818916406455, + "grad_norm": 3.980682611465454, + "learning_rate": 6.442370664831214e-06, + "loss": 0.7389, + "step": 4927 + }, + { + "epoch": 0.23492956403594498, + "grad_norm": 7.2360968589782715, + "learning_rate": 6.438699949346446e-06, + "loss": 1.4024, + "step": 4928 + }, + { + "epoch": 0.2349772364312445, + "grad_norm": 5.406827449798584, + "learning_rate": 6.435029783333599e-06, + "loss": 1.0036, + "step": 4929 + }, + { + "epoch": 0.23502490882654398, + "grad_norm": 6.298873424530029, + "learning_rate": 6.431360167358951e-06, + "loss": 0.6144, + "step": 4930 + }, + { + "epoch": 0.2350725812218435, + "grad_norm": 2.6825435161590576, + "learning_rate": 6.427691101988673e-06, + "loss": 0.598, + "step": 4931 + }, + { + "epoch": 0.235120253617143, + "grad_norm": 1.6035325527191162, + "learning_rate": 6.424022587788872e-06, + "loss": 0.6953, + "step": 4932 + }, + { + "epoch": 0.2351679260124425, + "grad_norm": 2.0679221153259277, + "learning_rate": 6.4203546253255635e-06, + "loss": 1.0477, + "step": 4933 + }, + { + "epoch": 0.235215598407742, + "grad_norm": 1.2404496669769287, + "learning_rate": 6.416687215164671e-06, + "loss": 0.565, + "step": 4934 + }, + { + "epoch": 0.2352632708030415, + "grad_norm": 1.0773966312408447, + "learning_rate": 6.413020357872038e-06, + "loss": 0.3685, + "step": 4935 + }, + { + "epoch": 0.235310943198341, + "grad_norm": 2.640958070755005, + "learning_rate": 6.409354054013425e-06, + "loss": 0.9201, + "step": 4936 + }, + { + "epoch": 0.2353586155936405, + "grad_norm": 1.9813711643218994, + "learning_rate": 6.405688304154509e-06, + "loss": 0.9941, + "step": 4937 + }, + { + "epoch": 0.23540628798894, + "grad_norm": 1.335095763206482, + "learning_rate": 6.4020231088608695e-06, + "loss": 0.7447, + "step": 4938 + }, + { + "epoch": 0.23545396038423952, + "grad_norm": 2.280273914337158, + "learning_rate": 6.398358468698013e-06, + "loss": 0.4129, + "step": 4939 + }, + { + "epoch": 0.235501632779539, + "grad_norm": 2.7066197395324707, + 
"learning_rate": 6.394694384231358e-06, + "loss": 0.4934, + "step": 4940 + }, + { + "epoch": 0.23554930517483852, + "grad_norm": 1.5054798126220703, + "learning_rate": 6.3910308560262305e-06, + "loss": 0.5794, + "step": 4941 + }, + { + "epoch": 0.235596977570138, + "grad_norm": 1.2112187147140503, + "learning_rate": 6.387367884647875e-06, + "loss": 0.606, + "step": 4942 + }, + { + "epoch": 0.23564464996543752, + "grad_norm": 1.5958924293518066, + "learning_rate": 6.383705470661456e-06, + "loss": 0.7443, + "step": 4943 + }, + { + "epoch": 0.235692322360737, + "grad_norm": 2.5258569717407227, + "learning_rate": 6.380043614632037e-06, + "loss": 0.8325, + "step": 4944 + }, + { + "epoch": 0.23573999475603652, + "grad_norm": 1.2582625150680542, + "learning_rate": 6.376382317124612e-06, + "loss": 0.6065, + "step": 4945 + }, + { + "epoch": 0.235787667151336, + "grad_norm": 1.5497052669525146, + "learning_rate": 6.372721578704082e-06, + "loss": 0.527, + "step": 4946 + }, + { + "epoch": 0.23583533954663552, + "grad_norm": 2.164127826690674, + "learning_rate": 6.369061399935255e-06, + "loss": 0.8307, + "step": 4947 + }, + { + "epoch": 0.23588301194193503, + "grad_norm": 2.7174575328826904, + "learning_rate": 6.365401781382865e-06, + "loss": 1.0457, + "step": 4948 + }, + { + "epoch": 0.23593068433723452, + "grad_norm": 1.983340859413147, + "learning_rate": 6.361742723611551e-06, + "loss": 1.0276, + "step": 4949 + }, + { + "epoch": 0.23597835673253403, + "grad_norm": 1.5497627258300781, + "learning_rate": 6.358084227185866e-06, + "loss": 0.6036, + "step": 4950 + }, + { + "epoch": 0.23602602912783352, + "grad_norm": 1.4730205535888672, + "learning_rate": 6.354426292670279e-06, + "loss": 0.7759, + "step": 4951 + }, + { + "epoch": 0.23607370152313303, + "grad_norm": 2.397778272628784, + "learning_rate": 6.350768920629179e-06, + "loss": 0.7611, + "step": 4952 + }, + { + "epoch": 0.23612137391843252, + "grad_norm": 1.3542039394378662, + "learning_rate": 6.3471121116268494e-06, + "loss": 0.9697, + "step": 4953 + }, + { + "epoch": 0.23616904631373203, + "grad_norm": 1.4054028987884521, + "learning_rate": 6.343455866227504e-06, + "loss": 0.8482, + "step": 4954 + }, + { + "epoch": 0.23621671870903155, + "grad_norm": 0.9641540050506592, + "learning_rate": 6.339800184995266e-06, + "loss": 0.5462, + "step": 4955 + }, + { + "epoch": 0.23626439110433103, + "grad_norm": 1.470791220664978, + "learning_rate": 6.3361450684941664e-06, + "loss": 0.5811, + "step": 4956 + }, + { + "epoch": 0.23631206349963055, + "grad_norm": 2.747241973876953, + "learning_rate": 6.332490517288148e-06, + "loss": 0.6081, + "step": 4957 + }, + { + "epoch": 0.23635973589493003, + "grad_norm": 1.2244296073913574, + "learning_rate": 6.328836531941081e-06, + "loss": 0.6703, + "step": 4958 + }, + { + "epoch": 0.23640740829022955, + "grad_norm": 1.3257007598876953, + "learning_rate": 6.3251831130167264e-06, + "loss": 0.8026, + "step": 4959 + }, + { + "epoch": 0.23645508068552903, + "grad_norm": 2.4073638916015625, + "learning_rate": 6.321530261078774e-06, + "loss": 0.9719, + "step": 4960 + }, + { + "epoch": 0.23650275308082855, + "grad_norm": 2.8806135654449463, + "learning_rate": 6.317877976690826e-06, + "loss": 1.3026, + "step": 4961 + }, + { + "epoch": 0.23655042547612803, + "grad_norm": 2.24585223197937, + "learning_rate": 6.314226260416383e-06, + "loss": 0.9502, + "step": 4962 + }, + { + "epoch": 0.23659809787142755, + "grad_norm": 1.8365782499313354, + "learning_rate": 6.3105751128188756e-06, + "loss": 1.238, + "step": 4963 + }, + { + "epoch": 
0.23664577026672706, + "grad_norm": 2.0853514671325684, + "learning_rate": 6.306924534461633e-06, + "loss": 0.7174, + "step": 4964 + }, + { + "epoch": 0.23669344266202655, + "grad_norm": 2.7662477493286133, + "learning_rate": 6.303274525907903e-06, + "loss": 0.6062, + "step": 4965 + }, + { + "epoch": 0.23674111505732606, + "grad_norm": 1.472198724746704, + "learning_rate": 6.299625087720844e-06, + "loss": 0.9402, + "step": 4966 + }, + { + "epoch": 0.23678878745262555, + "grad_norm": 1.3685739040374756, + "learning_rate": 6.295976220463531e-06, + "loss": 0.7949, + "step": 4967 + }, + { + "epoch": 0.23683645984792506, + "grad_norm": 1.2950966358184814, + "learning_rate": 6.2923279246989385e-06, + "loss": 0.5695, + "step": 4968 + }, + { + "epoch": 0.23688413224322455, + "grad_norm": 3.6497159004211426, + "learning_rate": 6.288680200989967e-06, + "loss": 1.0519, + "step": 4969 + }, + { + "epoch": 0.23693180463852406, + "grad_norm": 1.731481909751892, + "learning_rate": 6.2850330498994235e-06, + "loss": 0.9411, + "step": 4970 + }, + { + "epoch": 0.23697947703382358, + "grad_norm": 1.3376823663711548, + "learning_rate": 6.281386471990021e-06, + "loss": 0.7521, + "step": 4971 + }, + { + "epoch": 0.23702714942912306, + "grad_norm": 1.3269504308700562, + "learning_rate": 6.277740467824394e-06, + "loss": 0.6383, + "step": 4972 + }, + { + "epoch": 0.23707482182442258, + "grad_norm": 1.881067156791687, + "learning_rate": 6.2740950379650775e-06, + "loss": 0.6847, + "step": 4973 + }, + { + "epoch": 0.23712249421972206, + "grad_norm": 1.7035162448883057, + "learning_rate": 6.270450182974532e-06, + "loss": 0.9515, + "step": 4974 + }, + { + "epoch": 0.23717016661502158, + "grad_norm": 3.9568660259246826, + "learning_rate": 6.266805903415112e-06, + "loss": 1.1311, + "step": 4975 + }, + { + "epoch": 0.23721783901032106, + "grad_norm": 1.3612909317016602, + "learning_rate": 6.2631621998490965e-06, + "loss": 0.8954, + "step": 4976 + }, + { + "epoch": 0.23726551140562058, + "grad_norm": 1.2040951251983643, + "learning_rate": 6.259519072838676e-06, + "loss": 0.6272, + "step": 4977 + }, + { + "epoch": 0.23731318380092006, + "grad_norm": 2.760911226272583, + "learning_rate": 6.255876522945941e-06, + "loss": 0.3456, + "step": 4978 + }, + { + "epoch": 0.23736085619621958, + "grad_norm": 2.025914192199707, + "learning_rate": 6.2522345507329e-06, + "loss": 0.8398, + "step": 4979 + }, + { + "epoch": 0.2374085285915191, + "grad_norm": 1.8682208061218262, + "learning_rate": 6.248593156761477e-06, + "loss": 0.8555, + "step": 4980 + }, + { + "epoch": 0.23745620098681858, + "grad_norm": 1.2527092695236206, + "learning_rate": 6.244952341593493e-06, + "loss": 0.4326, + "step": 4981 + }, + { + "epoch": 0.2375038733821181, + "grad_norm": 2.424462080001831, + "learning_rate": 6.2413121057906934e-06, + "loss": 0.8241, + "step": 4982 + }, + { + "epoch": 0.23755154577741758, + "grad_norm": 1.5429975986480713, + "learning_rate": 6.237672449914734e-06, + "loss": 0.6146, + "step": 4983 + }, + { + "epoch": 0.2375992181727171, + "grad_norm": 2.2488510608673096, + "learning_rate": 6.234033374527166e-06, + "loss": 0.8928, + "step": 4984 + }, + { + "epoch": 0.23764689056801658, + "grad_norm": 1.3556400537490845, + "learning_rate": 6.230394880189468e-06, + "loss": 0.8557, + "step": 4985 + }, + { + "epoch": 0.2376945629633161, + "grad_norm": 2.1810364723205566, + "learning_rate": 6.226756967463023e-06, + "loss": 0.7507, + "step": 4986 + }, + { + "epoch": 0.2377422353586156, + "grad_norm": 2.3733267784118652, + "learning_rate": 
6.223119636909118e-06, + "loss": 0.7715, + "step": 4987 + }, + { + "epoch": 0.2377899077539151, + "grad_norm": 1.7758448123931885, + "learning_rate": 6.219482889088959e-06, + "loss": 0.7934, + "step": 4988 + }, + { + "epoch": 0.2378375801492146, + "grad_norm": 2.1177680492401123, + "learning_rate": 6.215846724563661e-06, + "loss": 0.6231, + "step": 4989 + }, + { + "epoch": 0.2378852525445141, + "grad_norm": 2.7689030170440674, + "learning_rate": 6.21221114389424e-06, + "loss": 0.7026, + "step": 4990 + }, + { + "epoch": 0.2379329249398136, + "grad_norm": 1.3081248998641968, + "learning_rate": 6.208576147641634e-06, + "loss": 0.6489, + "step": 4991 + }, + { + "epoch": 0.2379805973351131, + "grad_norm": 0.6183000206947327, + "learning_rate": 6.204941736366688e-06, + "loss": 0.2264, + "step": 4992 + }, + { + "epoch": 0.2380282697304126, + "grad_norm": 1.402753472328186, + "learning_rate": 6.2013079106301454e-06, + "loss": 0.6537, + "step": 4993 + }, + { + "epoch": 0.23807594212571212, + "grad_norm": 2.1279776096343994, + "learning_rate": 6.1976746709926775e-06, + "loss": 0.8205, + "step": 4994 + }, + { + "epoch": 0.2381236145210116, + "grad_norm": 2.560328245162964, + "learning_rate": 6.194042018014852e-06, + "loss": 0.436, + "step": 4995 + }, + { + "epoch": 0.23817128691631112, + "grad_norm": 1.2721821069717407, + "learning_rate": 6.1904099522571445e-06, + "loss": 0.6816, + "step": 4996 + }, + { + "epoch": 0.2382189593116106, + "grad_norm": 2.703101396560669, + "learning_rate": 6.186778474279951e-06, + "loss": 0.5278, + "step": 4997 + }, + { + "epoch": 0.23826663170691012, + "grad_norm": 1.4805877208709717, + "learning_rate": 6.183147584643575e-06, + "loss": 0.8678, + "step": 4998 + }, + { + "epoch": 0.2383143041022096, + "grad_norm": 1.6941044330596924, + "learning_rate": 6.179517283908217e-06, + "loss": 0.7747, + "step": 4999 + }, + { + "epoch": 0.23836197649750912, + "grad_norm": 1.2779852151870728, + "learning_rate": 6.175887572633998e-06, + "loss": 0.6392, + "step": 5000 + }, + { + "epoch": 0.2384096488928086, + "grad_norm": 1.781387209892273, + "learning_rate": 6.172258451380949e-06, + "loss": 0.3088, + "step": 5001 + }, + { + "epoch": 0.23845732128810812, + "grad_norm": 2.359119415283203, + "learning_rate": 6.168629920709002e-06, + "loss": 0.8953, + "step": 5002 + }, + { + "epoch": 0.23850499368340763, + "grad_norm": 2.595407724380493, + "learning_rate": 6.165001981178e-06, + "loss": 0.9863, + "step": 5003 + }, + { + "epoch": 0.23855266607870712, + "grad_norm": 2.4940426349639893, + "learning_rate": 6.161374633347703e-06, + "loss": 1.3752, + "step": 5004 + }, + { + "epoch": 0.23860033847400663, + "grad_norm": 2.8280820846557617, + "learning_rate": 6.157747877777766e-06, + "loss": 1.2055, + "step": 5005 + }, + { + "epoch": 0.23864801086930612, + "grad_norm": 1.4954427480697632, + "learning_rate": 6.154121715027765e-06, + "loss": 0.6753, + "step": 5006 + }, + { + "epoch": 0.23869568326460563, + "grad_norm": 2.7824718952178955, + "learning_rate": 6.150496145657183e-06, + "loss": 0.5893, + "step": 5007 + }, + { + "epoch": 0.23874335565990512, + "grad_norm": 1.3478643894195557, + "learning_rate": 6.146871170225398e-06, + "loss": 0.5745, + "step": 5008 + }, + { + "epoch": 0.23879102805520463, + "grad_norm": 2.128234386444092, + "learning_rate": 6.143246789291715e-06, + "loss": 0.6522, + "step": 5009 + }, + { + "epoch": 0.23883870045050415, + "grad_norm": 1.1305973529815674, + "learning_rate": 6.139623003415336e-06, + "loss": 0.781, + "step": 5010 + }, + { + "epoch": 0.23888637284580364, + 
"grad_norm": 1.1940646171569824, + "learning_rate": 6.135999813155371e-06, + "loss": 0.6379, + "step": 5011 + }, + { + "epoch": 0.23893404524110315, + "grad_norm": 1.6347004175186157, + "learning_rate": 6.132377219070842e-06, + "loss": 0.4579, + "step": 5012 + }, + { + "epoch": 0.23898171763640264, + "grad_norm": 1.928120732307434, + "learning_rate": 6.128755221720682e-06, + "loss": 0.617, + "step": 5013 + }, + { + "epoch": 0.23902939003170215, + "grad_norm": 1.2113068103790283, + "learning_rate": 6.1251338216637255e-06, + "loss": 0.5078, + "step": 5014 + }, + { + "epoch": 0.23907706242700164, + "grad_norm": 1.2191598415374756, + "learning_rate": 6.121513019458715e-06, + "loss": 0.6691, + "step": 5015 + }, + { + "epoch": 0.23912473482230115, + "grad_norm": 2.199414014816284, + "learning_rate": 6.117892815664306e-06, + "loss": 0.5423, + "step": 5016 + }, + { + "epoch": 0.23917240721760064, + "grad_norm": 4.416726589202881, + "learning_rate": 6.11427321083906e-06, + "loss": 1.2491, + "step": 5017 + }, + { + "epoch": 0.23922007961290015, + "grad_norm": 3.6274728775024414, + "learning_rate": 6.110654205541438e-06, + "loss": 0.538, + "step": 5018 + }, + { + "epoch": 0.23926775200819966, + "grad_norm": 3.209824800491333, + "learning_rate": 6.1070358003298215e-06, + "loss": 1.2816, + "step": 5019 + }, + { + "epoch": 0.23931542440349915, + "grad_norm": 1.406859040260315, + "learning_rate": 6.103417995762493e-06, + "loss": 0.7179, + "step": 5020 + }, + { + "epoch": 0.23936309679879866, + "grad_norm": 1.953831672668457, + "learning_rate": 6.099800792397636e-06, + "loss": 0.2952, + "step": 5021 + }, + { + "epoch": 0.23941076919409815, + "grad_norm": 2.9792706966400146, + "learning_rate": 6.096184190793357e-06, + "loss": 0.239, + "step": 5022 + }, + { + "epoch": 0.23945844158939766, + "grad_norm": 1.1545838117599487, + "learning_rate": 6.092568191507655e-06, + "loss": 0.5336, + "step": 5023 + }, + { + "epoch": 0.23950611398469715, + "grad_norm": 1.4073158502578735, + "learning_rate": 6.088952795098442e-06, + "loss": 0.8207, + "step": 5024 + }, + { + "epoch": 0.23955378637999666, + "grad_norm": 2.970121145248413, + "learning_rate": 6.085338002123534e-06, + "loss": 0.6325, + "step": 5025 + }, + { + "epoch": 0.23960145877529618, + "grad_norm": 1.9289273023605347, + "learning_rate": 6.081723813140664e-06, + "loss": 0.6894, + "step": 5026 + }, + { + "epoch": 0.23964913117059566, + "grad_norm": 1.7830263376235962, + "learning_rate": 6.078110228707454e-06, + "loss": 0.8976, + "step": 5027 + }, + { + "epoch": 0.23969680356589518, + "grad_norm": 1.3702656030654907, + "learning_rate": 6.07449724938145e-06, + "loss": 0.7642, + "step": 5028 + }, + { + "epoch": 0.23974447596119466, + "grad_norm": 1.7167824506759644, + "learning_rate": 6.0708848757200975e-06, + "loss": 0.7334, + "step": 5029 + }, + { + "epoch": 0.23979214835649418, + "grad_norm": 1.097110629081726, + "learning_rate": 6.067273108280745e-06, + "loss": 0.7213, + "step": 5030 + }, + { + "epoch": 0.23983982075179366, + "grad_norm": 1.2784687280654907, + "learning_rate": 6.0636619476206534e-06, + "loss": 0.6287, + "step": 5031 + }, + { + "epoch": 0.23988749314709318, + "grad_norm": 17.311893463134766, + "learning_rate": 6.060051394296989e-06, + "loss": 0.8498, + "step": 5032 + }, + { + "epoch": 0.23993516554239266, + "grad_norm": 2.501488208770752, + "learning_rate": 6.056441448866817e-06, + "loss": 0.9281, + "step": 5033 + }, + { + "epoch": 0.23998283793769218, + "grad_norm": 1.5192574262619019, + "learning_rate": 6.052832111887117e-06, + "loss": 
0.7549, + "step": 5034 + }, + { + "epoch": 0.2400305103329917, + "grad_norm": 1.5082474946975708, + "learning_rate": 6.04922338391478e-06, + "loss": 0.7535, + "step": 5035 + }, + { + "epoch": 0.24007818272829118, + "grad_norm": 6.296762466430664, + "learning_rate": 6.045615265506585e-06, + "loss": 0.0771, + "step": 5036 + }, + { + "epoch": 0.2401258551235907, + "grad_norm": 2.4316208362579346, + "learning_rate": 6.0420077572192325e-06, + "loss": 0.4872, + "step": 5037 + }, + { + "epoch": 0.24017352751889018, + "grad_norm": 1.7005482912063599, + "learning_rate": 6.038400859609327e-06, + "loss": 0.668, + "step": 5038 + }, + { + "epoch": 0.2402211999141897, + "grad_norm": 1.3777533769607544, + "learning_rate": 6.034794573233371e-06, + "loss": 0.2928, + "step": 5039 + }, + { + "epoch": 0.24026887230948918, + "grad_norm": 2.814288854598999, + "learning_rate": 6.031188898647776e-06, + "loss": 0.5485, + "step": 5040 + }, + { + "epoch": 0.2403165447047887, + "grad_norm": 6.852921485900879, + "learning_rate": 6.027583836408868e-06, + "loss": 0.6997, + "step": 5041 + }, + { + "epoch": 0.2403642171000882, + "grad_norm": 1.5040916204452515, + "learning_rate": 6.023979387072861e-06, + "loss": 0.5527, + "step": 5042 + }, + { + "epoch": 0.2404118894953877, + "grad_norm": 1.410093069076538, + "learning_rate": 6.020375551195891e-06, + "loss": 0.8848, + "step": 5043 + }, + { + "epoch": 0.2404595618906872, + "grad_norm": 1.325937032699585, + "learning_rate": 6.016772329333993e-06, + "loss": 0.5595, + "step": 5044 + }, + { + "epoch": 0.2405072342859867, + "grad_norm": 1.5293211936950684, + "learning_rate": 6.013169722043104e-06, + "loss": 0.6897, + "step": 5045 + }, + { + "epoch": 0.2405549066812862, + "grad_norm": 1.9939907789230347, + "learning_rate": 6.009567729879071e-06, + "loss": 0.4316, + "step": 5046 + }, + { + "epoch": 0.2406025790765857, + "grad_norm": 1.5865501165390015, + "learning_rate": 6.005966353397643e-06, + "loss": 0.9059, + "step": 5047 + }, + { + "epoch": 0.2406502514718852, + "grad_norm": 1.3185622692108154, + "learning_rate": 6.002365593154478e-06, + "loss": 0.819, + "step": 5048 + }, + { + "epoch": 0.2406979238671847, + "grad_norm": 1.858639121055603, + "learning_rate": 5.998765449705131e-06, + "loss": 0.5307, + "step": 5049 + }, + { + "epoch": 0.2407455962624842, + "grad_norm": 1.3617935180664062, + "learning_rate": 5.9951659236050695e-06, + "loss": 0.8302, + "step": 5050 + }, + { + "epoch": 0.24079326865778372, + "grad_norm": 1.3620930910110474, + "learning_rate": 5.99156701540967e-06, + "loss": 0.6854, + "step": 5051 + }, + { + "epoch": 0.2408409410530832, + "grad_norm": 1.474213719367981, + "learning_rate": 5.987968725674196e-06, + "loss": 0.6022, + "step": 5052 + }, + { + "epoch": 0.24088861344838272, + "grad_norm": 1.3608521223068237, + "learning_rate": 5.9843710549538346e-06, + "loss": 0.3048, + "step": 5053 + }, + { + "epoch": 0.2409362858436822, + "grad_norm": 1.830669641494751, + "learning_rate": 5.980774003803668e-06, + "loss": 0.9192, + "step": 5054 + }, + { + "epoch": 0.24098395823898172, + "grad_norm": 1.5334892272949219, + "learning_rate": 5.977177572778679e-06, + "loss": 0.5967, + "step": 5055 + }, + { + "epoch": 0.2410316306342812, + "grad_norm": 3.262408971786499, + "learning_rate": 5.973581762433763e-06, + "loss": 0.5825, + "step": 5056 + }, + { + "epoch": 0.24107930302958072, + "grad_norm": 1.8143086433410645, + "learning_rate": 5.969986573323721e-06, + "loss": 0.8515, + "step": 5057 + }, + { + "epoch": 0.24112697542488024, + "grad_norm": 1.383774757385254, + 
"learning_rate": 5.966392006003245e-06, + "loss": 1.1035, + "step": 5058 + }, + { + "epoch": 0.24117464782017972, + "grad_norm": 1.8638062477111816, + "learning_rate": 5.9627980610269445e-06, + "loss": 0.8606, + "step": 5059 + }, + { + "epoch": 0.24122232021547924, + "grad_norm": 1.5183956623077393, + "learning_rate": 5.959204738949334e-06, + "loss": 0.7039, + "step": 5060 + }, + { + "epoch": 0.24126999261077872, + "grad_norm": 2.2757411003112793, + "learning_rate": 5.955612040324815e-06, + "loss": 0.697, + "step": 5061 + }, + { + "epoch": 0.24131766500607824, + "grad_norm": 1.1037594079971313, + "learning_rate": 5.952019965707709e-06, + "loss": 0.516, + "step": 5062 + }, + { + "epoch": 0.24136533740137772, + "grad_norm": 2.3021318912506104, + "learning_rate": 5.948428515652241e-06, + "loss": 0.5897, + "step": 5063 + }, + { + "epoch": 0.24141300979667724, + "grad_norm": 1.4584852457046509, + "learning_rate": 5.944837690712524e-06, + "loss": 0.8717, + "step": 5064 + }, + { + "epoch": 0.24146068219197672, + "grad_norm": 1.1223253011703491, + "learning_rate": 5.941247491442592e-06, + "loss": 0.7506, + "step": 5065 + }, + { + "epoch": 0.24150835458727624, + "grad_norm": 1.5056185722351074, + "learning_rate": 5.9376579183963775e-06, + "loss": 0.6489, + "step": 5066 + }, + { + "epoch": 0.24155602698257575, + "grad_norm": 2.4089179039001465, + "learning_rate": 5.9340689721277116e-06, + "loss": 0.9086, + "step": 5067 + }, + { + "epoch": 0.24160369937787524, + "grad_norm": 2.3049986362457275, + "learning_rate": 5.930480653190331e-06, + "loss": 0.4041, + "step": 5068 + }, + { + "epoch": 0.24165137177317475, + "grad_norm": 1.4334743022918701, + "learning_rate": 5.9268929621378805e-06, + "loss": 0.6992, + "step": 5069 + }, + { + "epoch": 0.24169904416847424, + "grad_norm": 1.4559268951416016, + "learning_rate": 5.923305899523899e-06, + "loss": 1.0898, + "step": 5070 + }, + { + "epoch": 0.24174671656377375, + "grad_norm": 1.4106165170669556, + "learning_rate": 5.919719465901834e-06, + "loss": 0.5322, + "step": 5071 + }, + { + "epoch": 0.24179438895907324, + "grad_norm": 1.2145572900772095, + "learning_rate": 5.916133661825041e-06, + "loss": 0.5322, + "step": 5072 + }, + { + "epoch": 0.24184206135437275, + "grad_norm": 1.256256103515625, + "learning_rate": 5.9125484878467635e-06, + "loss": 0.5958, + "step": 5073 + }, + { + "epoch": 0.24188973374967226, + "grad_norm": 3.8242619037628174, + "learning_rate": 5.908963944520162e-06, + "loss": 1.1024, + "step": 5074 + }, + { + "epoch": 0.24193740614497175, + "grad_norm": 1.2726141214370728, + "learning_rate": 5.9053800323982976e-06, + "loss": 0.8752, + "step": 5075 + }, + { + "epoch": 0.24198507854027126, + "grad_norm": 3.162296772003174, + "learning_rate": 5.901796752034128e-06, + "loss": 1.2186, + "step": 5076 + }, + { + "epoch": 0.24203275093557075, + "grad_norm": 2.238673448562622, + "learning_rate": 5.8982141039805115e-06, + "loss": 0.7286, + "step": 5077 + }, + { + "epoch": 0.24208042333087026, + "grad_norm": 1.8592641353607178, + "learning_rate": 5.894632088790224e-06, + "loss": 1.0557, + "step": 5078 + }, + { + "epoch": 0.24212809572616975, + "grad_norm": 1.319214940071106, + "learning_rate": 5.891050707015924e-06, + "loss": 0.6095, + "step": 5079 + }, + { + "epoch": 0.24217576812146926, + "grad_norm": 2.3480584621429443, + "learning_rate": 5.887469959210186e-06, + "loss": 0.4254, + "step": 5080 + }, + { + "epoch": 0.24222344051676878, + "grad_norm": 1.9646015167236328, + "learning_rate": 5.883889845925487e-06, + "loss": 0.4465, + "step": 5081 + }, + 
{ + "epoch": 0.24227111291206826, + "grad_norm": 2.3036208152770996, + "learning_rate": 5.880310367714192e-06, + "loss": 0.7986, + "step": 5082 + }, + { + "epoch": 0.24231878530736778, + "grad_norm": 2.5882246494293213, + "learning_rate": 5.8767315251285854e-06, + "loss": 0.7113, + "step": 5083 + }, + { + "epoch": 0.24236645770266727, + "grad_norm": 2.2714028358459473, + "learning_rate": 5.873153318720842e-06, + "loss": 1.0477, + "step": 5084 + }, + { + "epoch": 0.24241413009796678, + "grad_norm": 1.8316211700439453, + "learning_rate": 5.869575749043044e-06, + "loss": 0.793, + "step": 5085 + }, + { + "epoch": 0.24246180249326627, + "grad_norm": 1.843741536140442, + "learning_rate": 5.8659988166471715e-06, + "loss": 1.0857, + "step": 5086 + }, + { + "epoch": 0.24250947488856578, + "grad_norm": 3.6580004692077637, + "learning_rate": 5.862422522085108e-06, + "loss": 0.9047, + "step": 5087 + }, + { + "epoch": 0.24255714728386527, + "grad_norm": 4.874394416809082, + "learning_rate": 5.858846865908645e-06, + "loss": 0.3122, + "step": 5088 + }, + { + "epoch": 0.24260481967916478, + "grad_norm": 1.1394366025924683, + "learning_rate": 5.855271848669462e-06, + "loss": 0.6588, + "step": 5089 + }, + { + "epoch": 0.2426524920744643, + "grad_norm": 1.784208059310913, + "learning_rate": 5.851697470919151e-06, + "loss": 0.4574, + "step": 5090 + }, + { + "epoch": 0.24270016446976378, + "grad_norm": 2.3072550296783447, + "learning_rate": 5.8481237332092014e-06, + "loss": 0.5939, + "step": 5091 + }, + { + "epoch": 0.2427478368650633, + "grad_norm": 1.3365013599395752, + "learning_rate": 5.844550636091004e-06, + "loss": 0.5374, + "step": 5092 + }, + { + "epoch": 0.24279550926036278, + "grad_norm": 6.065027713775635, + "learning_rate": 5.840978180115848e-06, + "loss": 0.4395, + "step": 5093 + }, + { + "epoch": 0.2428431816556623, + "grad_norm": 2.194348096847534, + "learning_rate": 5.837406365834934e-06, + "loss": 0.9389, + "step": 5094 + }, + { + "epoch": 0.24289085405096178, + "grad_norm": 2.385526418685913, + "learning_rate": 5.8338351937993476e-06, + "loss": 0.7008, + "step": 5095 + }, + { + "epoch": 0.2429385264462613, + "grad_norm": 1.4699710607528687, + "learning_rate": 5.830264664560087e-06, + "loss": 0.6011, + "step": 5096 + }, + { + "epoch": 0.2429861988415608, + "grad_norm": 1.800408124923706, + "learning_rate": 5.826694778668053e-06, + "loss": 0.705, + "step": 5097 + }, + { + "epoch": 0.2430338712368603, + "grad_norm": 1.5617139339447021, + "learning_rate": 5.823125536674032e-06, + "loss": 0.5671, + "step": 5098 + }, + { + "epoch": 0.2430815436321598, + "grad_norm": 1.5853725671768188, + "learning_rate": 5.81955693912873e-06, + "loss": 0.973, + "step": 5099 + }, + { + "epoch": 0.2431292160274593, + "grad_norm": 1.2248514890670776, + "learning_rate": 5.815988986582745e-06, + "loss": 0.5335, + "step": 5100 + }, + { + "epoch": 0.2431768884227588, + "grad_norm": 1.5876938104629517, + "learning_rate": 5.812421679586569e-06, + "loss": 0.8773, + "step": 5101 + }, + { + "epoch": 0.2432245608180583, + "grad_norm": 1.6629010438919067, + "learning_rate": 5.808855018690607e-06, + "loss": 0.8254, + "step": 5102 + }, + { + "epoch": 0.2432722332133578, + "grad_norm": 2.62546706199646, + "learning_rate": 5.805289004445155e-06, + "loss": 0.4642, + "step": 5103 + }, + { + "epoch": 0.2433199056086573, + "grad_norm": 1.9097936153411865, + "learning_rate": 5.801723637400409e-06, + "loss": 0.513, + "step": 5104 + }, + { + "epoch": 0.2433675780039568, + "grad_norm": 2.2730860710144043, + "learning_rate": 
5.798158918106471e-06, + "loss": 0.7113, + "step": 5105 + }, + { + "epoch": 0.24341525039925632, + "grad_norm": 1.205796480178833, + "learning_rate": 5.7945948471133466e-06, + "loss": 0.729, + "step": 5106 + }, + { + "epoch": 0.2434629227945558, + "grad_norm": 1.7572075128555298, + "learning_rate": 5.791031424970926e-06, + "loss": 0.8275, + "step": 5107 + }, + { + "epoch": 0.24351059518985532, + "grad_norm": 1.4006179571151733, + "learning_rate": 5.787468652229012e-06, + "loss": 0.666, + "step": 5108 + }, + { + "epoch": 0.2435582675851548, + "grad_norm": 1.7280313968658447, + "learning_rate": 5.783906529437309e-06, + "loss": 0.5724, + "step": 5109 + }, + { + "epoch": 0.24360593998045432, + "grad_norm": 1.581921935081482, + "learning_rate": 5.7803450571454066e-06, + "loss": 0.8196, + "step": 5110 + }, + { + "epoch": 0.2436536123757538, + "grad_norm": 2.0756044387817383, + "learning_rate": 5.776784235902807e-06, + "loss": 1.0043, + "step": 5111 + }, + { + "epoch": 0.24370128477105332, + "grad_norm": 1.6152479648590088, + "learning_rate": 5.773224066258913e-06, + "loss": 0.6942, + "step": 5112 + }, + { + "epoch": 0.24374895716635284, + "grad_norm": 1.7283798456192017, + "learning_rate": 5.769664548763016e-06, + "loss": 0.7455, + "step": 5113 + }, + { + "epoch": 0.24379662956165232, + "grad_norm": 1.8666808605194092, + "learning_rate": 5.766105683964314e-06, + "loss": 0.868, + "step": 5114 + }, + { + "epoch": 0.24384430195695184, + "grad_norm": 5.257474422454834, + "learning_rate": 5.762547472411909e-06, + "loss": 1.6388, + "step": 5115 + }, + { + "epoch": 0.24389197435225132, + "grad_norm": 1.4893912076950073, + "learning_rate": 5.758989914654787e-06, + "loss": 0.305, + "step": 5116 + }, + { + "epoch": 0.24393964674755084, + "grad_norm": 1.581531047821045, + "learning_rate": 5.755433011241851e-06, + "loss": 0.7188, + "step": 5117 + }, + { + "epoch": 0.24398731914285032, + "grad_norm": 1.022304654121399, + "learning_rate": 5.751876762721887e-06, + "loss": 0.5168, + "step": 5118 + }, + { + "epoch": 0.24403499153814984, + "grad_norm": 1.4325313568115234, + "learning_rate": 5.748321169643596e-06, + "loss": 0.7221, + "step": 5119 + }, + { + "epoch": 0.24408266393344932, + "grad_norm": 3.6646790504455566, + "learning_rate": 5.744766232555561e-06, + "loss": 0.7195, + "step": 5120 + }, + { + "epoch": 0.24413033632874884, + "grad_norm": 1.5659929513931274, + "learning_rate": 5.741211952006278e-06, + "loss": 0.5926, + "step": 5121 + }, + { + "epoch": 0.24417800872404835, + "grad_norm": 1.4932897090911865, + "learning_rate": 5.737658328544131e-06, + "loss": 0.6188, + "step": 5122 + }, + { + "epoch": 0.24422568111934784, + "grad_norm": 1.1666932106018066, + "learning_rate": 5.73410536271741e-06, + "loss": 0.5176, + "step": 5123 + }, + { + "epoch": 0.24427335351464735, + "grad_norm": 2.5744693279266357, + "learning_rate": 5.730553055074306e-06, + "loss": 0.745, + "step": 5124 + }, + { + "epoch": 0.24432102590994684, + "grad_norm": 1.3289794921875, + "learning_rate": 5.7270014061628935e-06, + "loss": 0.5933, + "step": 5125 + }, + { + "epoch": 0.24436869830524635, + "grad_norm": 1.8544728755950928, + "learning_rate": 5.7234504165311626e-06, + "loss": 0.8654, + "step": 5126 + }, + { + "epoch": 0.24441637070054584, + "grad_norm": 1.8545337915420532, + "learning_rate": 5.71990008672699e-06, + "loss": 0.783, + "step": 5127 + }, + { + "epoch": 0.24446404309584535, + "grad_norm": 1.5942987203598022, + "learning_rate": 5.716350417298163e-06, + "loss": 0.6936, + "step": 5128 + }, + { + "epoch": 0.24451171549114487, 
+ "grad_norm": 1.5744047164916992, + "learning_rate": 5.71280140879235e-06, + "loss": 1.1856, + "step": 5129 + }, + { + "epoch": 0.24455938788644435, + "grad_norm": 7.246407508850098, + "learning_rate": 5.7092530617571284e-06, + "loss": 1.1715, + "step": 5130 + }, + { + "epoch": 0.24460706028174387, + "grad_norm": 1.7530832290649414, + "learning_rate": 5.7057053767399784e-06, + "loss": 0.8354, + "step": 5131 + }, + { + "epoch": 0.24465473267704335, + "grad_norm": 1.4511467218399048, + "learning_rate": 5.702158354288265e-06, + "loss": 0.5385, + "step": 5132 + }, + { + "epoch": 0.24470240507234287, + "grad_norm": 1.5028300285339355, + "learning_rate": 5.698611994949257e-06, + "loss": 0.7284, + "step": 5133 + }, + { + "epoch": 0.24475007746764235, + "grad_norm": 1.682446837425232, + "learning_rate": 5.6950662992701245e-06, + "loss": 0.8775, + "step": 5134 + }, + { + "epoch": 0.24479774986294187, + "grad_norm": 2.4769394397735596, + "learning_rate": 5.691521267797926e-06, + "loss": 1.1797, + "step": 5135 + }, + { + "epoch": 0.24484542225824135, + "grad_norm": 1.847987413406372, + "learning_rate": 5.687976901079626e-06, + "loss": 0.5466, + "step": 5136 + }, + { + "epoch": 0.24489309465354087, + "grad_norm": 1.2661330699920654, + "learning_rate": 5.684433199662091e-06, + "loss": 0.6113, + "step": 5137 + }, + { + "epoch": 0.24494076704884038, + "grad_norm": 1.9135509729385376, + "learning_rate": 5.680890164092065e-06, + "loss": 0.6979, + "step": 5138 + }, + { + "epoch": 0.24498843944413987, + "grad_norm": 1.2192476987838745, + "learning_rate": 5.67734779491621e-06, + "loss": 0.2719, + "step": 5139 + }, + { + "epoch": 0.24503611183943938, + "grad_norm": 1.6965047121047974, + "learning_rate": 5.67380609268108e-06, + "loss": 0.7954, + "step": 5140 + }, + { + "epoch": 0.24508378423473887, + "grad_norm": 1.8437730073928833, + "learning_rate": 5.670265057933114e-06, + "loss": 0.9539, + "step": 5141 + }, + { + "epoch": 0.24513145663003838, + "grad_norm": 1.9941530227661133, + "learning_rate": 5.666724691218663e-06, + "loss": 1.1797, + "step": 5142 + }, + { + "epoch": 0.24517912902533787, + "grad_norm": 3.011155128479004, + "learning_rate": 5.663184993083971e-06, + "loss": 0.7168, + "step": 5143 + }, + { + "epoch": 0.24522680142063738, + "grad_norm": 3.826796293258667, + "learning_rate": 5.65964596407517e-06, + "loss": 1.4929, + "step": 5144 + }, + { + "epoch": 0.2452744738159369, + "grad_norm": 1.3734126091003418, + "learning_rate": 5.6561076047383e-06, + "loss": 0.8289, + "step": 5145 + }, + { + "epoch": 0.24532214621123638, + "grad_norm": 3.8718600273132324, + "learning_rate": 5.652569915619297e-06, + "loss": 0.388, + "step": 5146 + }, + { + "epoch": 0.2453698186065359, + "grad_norm": 2.4398751258850098, + "learning_rate": 5.649032897263986e-06, + "loss": 1.4178, + "step": 5147 + }, + { + "epoch": 0.24541749100183538, + "grad_norm": 1.8063709735870361, + "learning_rate": 5.645496550218089e-06, + "loss": 0.7274, + "step": 5148 + }, + { + "epoch": 0.2454651633971349, + "grad_norm": 1.481279969215393, + "learning_rate": 5.6419608750272355e-06, + "loss": 0.7816, + "step": 5149 + }, + { + "epoch": 0.24551283579243438, + "grad_norm": 1.690308928489685, + "learning_rate": 5.638425872236937e-06, + "loss": 0.562, + "step": 5150 + }, + { + "epoch": 0.2455605081877339, + "grad_norm": 1.94339919090271, + "learning_rate": 5.634891542392608e-06, + "loss": 0.3454, + "step": 5151 + }, + { + "epoch": 0.24560818058303338, + "grad_norm": 1.3810763359069824, + "learning_rate": 5.631357886039568e-06, + "loss": 0.8602, + 
"step": 5152 + }, + { + "epoch": 0.2456558529783329, + "grad_norm": 1.471650242805481, + "learning_rate": 5.627824903723014e-06, + "loss": 0.822, + "step": 5153 + }, + { + "epoch": 0.2457035253736324, + "grad_norm": 2.2222437858581543, + "learning_rate": 5.624292595988052e-06, + "loss": 0.5791, + "step": 5154 + }, + { + "epoch": 0.2457511977689319, + "grad_norm": 1.2477086782455444, + "learning_rate": 5.620760963379686e-06, + "loss": 0.6096, + "step": 5155 + }, + { + "epoch": 0.2457988701642314, + "grad_norm": 1.5124863386154175, + "learning_rate": 5.617230006442802e-06, + "loss": 0.6669, + "step": 5156 + }, + { + "epoch": 0.2458465425595309, + "grad_norm": 1.1730290651321411, + "learning_rate": 5.6136997257221946e-06, + "loss": 0.3754, + "step": 5157 + }, + { + "epoch": 0.2458942149548304, + "grad_norm": 3.3835866451263428, + "learning_rate": 5.610170121762553e-06, + "loss": 1.0491, + "step": 5158 + }, + { + "epoch": 0.2459418873501299, + "grad_norm": 1.6068484783172607, + "learning_rate": 5.60664119510845e-06, + "loss": 0.561, + "step": 5159 + }, + { + "epoch": 0.2459895597454294, + "grad_norm": 1.995864987373352, + "learning_rate": 5.603112946304368e-06, + "loss": 1.1287, + "step": 5160 + }, + { + "epoch": 0.24603723214072892, + "grad_norm": 1.4355392456054688, + "learning_rate": 5.599585375894684e-06, + "loss": 0.855, + "step": 5161 + }, + { + "epoch": 0.2460849045360284, + "grad_norm": 5.265583038330078, + "learning_rate": 5.5960584844236565e-06, + "loss": 1.7132, + "step": 5162 + }, + { + "epoch": 0.24613257693132792, + "grad_norm": 1.4439984560012817, + "learning_rate": 5.592532272435458e-06, + "loss": 0.7141, + "step": 5163 + }, + { + "epoch": 0.2461802493266274, + "grad_norm": 11.15322494506836, + "learning_rate": 5.5890067404741365e-06, + "loss": 0.6837, + "step": 5164 + }, + { + "epoch": 0.24622792172192692, + "grad_norm": 4.258968353271484, + "learning_rate": 5.585481889083655e-06, + "loss": 1.7091, + "step": 5165 + }, + { + "epoch": 0.2462755941172264, + "grad_norm": 1.1728631258010864, + "learning_rate": 5.581957718807854e-06, + "loss": 0.6019, + "step": 5166 + }, + { + "epoch": 0.24632326651252592, + "grad_norm": 1.5525482892990112, + "learning_rate": 5.578434230190478e-06, + "loss": 0.7247, + "step": 5167 + }, + { + "epoch": 0.24637093890782544, + "grad_norm": 1.144457459449768, + "learning_rate": 5.574911423775173e-06, + "loss": 0.6033, + "step": 5168 + }, + { + "epoch": 0.24641861130312492, + "grad_norm": 1.0099642276763916, + "learning_rate": 5.571389300105461e-06, + "loss": 0.684, + "step": 5169 + }, + { + "epoch": 0.24646628369842444, + "grad_norm": 1.2308775186538696, + "learning_rate": 5.567867859724774e-06, + "loss": 0.4182, + "step": 5170 + }, + { + "epoch": 0.24651395609372392, + "grad_norm": 1.3812068700790405, + "learning_rate": 5.5643471031764375e-06, + "loss": 0.7686, + "step": 5171 + }, + { + "epoch": 0.24656162848902344, + "grad_norm": 1.4897170066833496, + "learning_rate": 5.560827031003661e-06, + "loss": 0.7183, + "step": 5172 + }, + { + "epoch": 0.24660930088432292, + "grad_norm": 2.8295412063598633, + "learning_rate": 5.557307643749559e-06, + "loss": 0.7561, + "step": 5173 + }, + { + "epoch": 0.24665697327962244, + "grad_norm": 2.257028579711914, + "learning_rate": 5.553788941957141e-06, + "loss": 0.9433, + "step": 5174 + }, + { + "epoch": 0.24670464567492192, + "grad_norm": 5.300502777099609, + "learning_rate": 5.550270926169298e-06, + "loss": 1.3377, + "step": 5175 + }, + { + "epoch": 0.24675231807022144, + "grad_norm": 2.70894455909729, + 
"learning_rate": 5.546753596928831e-06, + "loss": 0.9223, + "step": 5176 + }, + { + "epoch": 0.24679999046552095, + "grad_norm": 1.5173131227493286, + "learning_rate": 5.543236954778421e-06, + "loss": 0.686, + "step": 5177 + }, + { + "epoch": 0.24684766286082044, + "grad_norm": 1.0150842666625977, + "learning_rate": 5.539721000260658e-06, + "loss": 0.4478, + "step": 5178 + }, + { + "epoch": 0.24689533525611995, + "grad_norm": 1.1175934076309204, + "learning_rate": 5.5362057339180075e-06, + "loss": 0.7047, + "step": 5179 + }, + { + "epoch": 0.24694300765141944, + "grad_norm": 26.487831115722656, + "learning_rate": 5.532691156292849e-06, + "loss": 0.3799, + "step": 5180 + }, + { + "epoch": 0.24699068004671895, + "grad_norm": 2.0606038570404053, + "learning_rate": 5.529177267927437e-06, + "loss": 0.8949, + "step": 5181 + }, + { + "epoch": 0.24703835244201844, + "grad_norm": 1.6512166261672974, + "learning_rate": 5.52566406936393e-06, + "loss": 0.5806, + "step": 5182 + }, + { + "epoch": 0.24708602483731795, + "grad_norm": 2.8060967922210693, + "learning_rate": 5.522151561144386e-06, + "loss": 0.7323, + "step": 5183 + }, + { + "epoch": 0.24713369723261747, + "grad_norm": 2.039449691772461, + "learning_rate": 5.518639743810738e-06, + "loss": 0.8048, + "step": 5184 + }, + { + "epoch": 0.24718136962791695, + "grad_norm": 1.3262771368026733, + "learning_rate": 5.51512861790483e-06, + "loss": 0.696, + "step": 5185 + }, + { + "epoch": 0.24722904202321647, + "grad_norm": 1.9851527214050293, + "learning_rate": 5.5116181839683944e-06, + "loss": 0.7769, + "step": 5186 + }, + { + "epoch": 0.24727671441851595, + "grad_norm": 3.5180680751800537, + "learning_rate": 5.508108442543048e-06, + "loss": 0.3315, + "step": 5187 + }, + { + "epoch": 0.24732438681381547, + "grad_norm": 1.4458363056182861, + "learning_rate": 5.5045993941703094e-06, + "loss": 0.8182, + "step": 5188 + }, + { + "epoch": 0.24737205920911495, + "grad_norm": 2.6698710918426514, + "learning_rate": 5.501091039391596e-06, + "loss": 0.6063, + "step": 5189 + }, + { + "epoch": 0.24741973160441447, + "grad_norm": 1.4538973569869995, + "learning_rate": 5.497583378748201e-06, + "loss": 0.6288, + "step": 5190 + }, + { + "epoch": 0.24746740399971395, + "grad_norm": 1.2913610935211182, + "learning_rate": 5.49407641278133e-06, + "loss": 0.8647, + "step": 5191 + }, + { + "epoch": 0.24751507639501347, + "grad_norm": 1.0243335962295532, + "learning_rate": 5.490570142032061e-06, + "loss": 0.4859, + "step": 5192 + }, + { + "epoch": 0.24756274879031298, + "grad_norm": 1.1922807693481445, + "learning_rate": 5.487064567041387e-06, + "loss": 0.6223, + "step": 5193 + }, + { + "epoch": 0.24761042118561247, + "grad_norm": 1.1932332515716553, + "learning_rate": 5.48355968835017e-06, + "loss": 0.1719, + "step": 5194 + }, + { + "epoch": 0.24765809358091198, + "grad_norm": 1.3214937448501587, + "learning_rate": 5.480055506499187e-06, + "loss": 0.7055, + "step": 5195 + }, + { + "epoch": 0.24770576597621147, + "grad_norm": 1.7870721817016602, + "learning_rate": 5.476552022029089e-06, + "loss": 0.847, + "step": 5196 + }, + { + "epoch": 0.24775343837151098, + "grad_norm": 1.8140438795089722, + "learning_rate": 5.473049235480432e-06, + "loss": 0.8939, + "step": 5197 + }, + { + "epoch": 0.24780111076681047, + "grad_norm": 20.688039779663086, + "learning_rate": 5.4695471473936636e-06, + "loss": 0.1567, + "step": 5198 + }, + { + "epoch": 0.24784878316210998, + "grad_norm": 1.7145599126815796, + "learning_rate": 5.466045758309111e-06, + "loss": 0.8193, + "step": 5199 + }, + { + 
"epoch": 0.2478964555574095, + "grad_norm": 1.3122328519821167, + "learning_rate": 5.462545068767008e-06, + "loss": 0.7377, + "step": 5200 + }, + { + "epoch": 0.24794412795270898, + "grad_norm": 1.981058955192566, + "learning_rate": 5.459045079307473e-06, + "loss": 0.6913, + "step": 5201 + }, + { + "epoch": 0.2479918003480085, + "grad_norm": 1.760473370552063, + "learning_rate": 5.455545790470524e-06, + "loss": 0.929, + "step": 5202 + }, + { + "epoch": 0.24803947274330798, + "grad_norm": 1.4741325378417969, + "learning_rate": 5.452047202796058e-06, + "loss": 0.503, + "step": 5203 + }, + { + "epoch": 0.2480871451386075, + "grad_norm": 1.8108981847763062, + "learning_rate": 5.448549316823873e-06, + "loss": 0.5086, + "step": 5204 + }, + { + "epoch": 0.24813481753390698, + "grad_norm": 1.841205358505249, + "learning_rate": 5.44505213309366e-06, + "loss": 0.6914, + "step": 5205 + }, + { + "epoch": 0.2481824899292065, + "grad_norm": 3.5677199363708496, + "learning_rate": 5.4415556521449944e-06, + "loss": 0.3478, + "step": 5206 + }, + { + "epoch": 0.24823016232450598, + "grad_norm": 2.2605278491973877, + "learning_rate": 5.4380598745173495e-06, + "loss": 0.6591, + "step": 5207 + }, + { + "epoch": 0.2482778347198055, + "grad_norm": 1.3539484739303589, + "learning_rate": 5.434564800750091e-06, + "loss": 0.669, + "step": 5208 + }, + { + "epoch": 0.248325507115105, + "grad_norm": 1.6358530521392822, + "learning_rate": 5.431070431382461e-06, + "loss": 0.7204, + "step": 5209 + }, + { + "epoch": 0.2483731795104045, + "grad_norm": 1.1210711002349854, + "learning_rate": 5.427576766953615e-06, + "loss": 0.209, + "step": 5210 + }, + { + "epoch": 0.248420851905704, + "grad_norm": 1.661259651184082, + "learning_rate": 5.424083808002591e-06, + "loss": 0.7631, + "step": 5211 + }, + { + "epoch": 0.2484685243010035, + "grad_norm": 1.5903949737548828, + "learning_rate": 5.420591555068308e-06, + "loss": 0.4729, + "step": 5212 + }, + { + "epoch": 0.248516196696303, + "grad_norm": 2.0092742443084717, + "learning_rate": 5.417100008689588e-06, + "loss": 0.5033, + "step": 5213 + }, + { + "epoch": 0.2485638690916025, + "grad_norm": 1.7281888723373413, + "learning_rate": 5.413609169405148e-06, + "loss": 0.7902, + "step": 5214 + }, + { + "epoch": 0.248611541486902, + "grad_norm": 1.604560136795044, + "learning_rate": 5.4101190377535785e-06, + "loss": 0.4432, + "step": 5215 + }, + { + "epoch": 0.24865921388220152, + "grad_norm": 1.378257393836975, + "learning_rate": 5.406629614273373e-06, + "loss": 0.7872, + "step": 5216 + }, + { + "epoch": 0.248706886277501, + "grad_norm": 1.664696455001831, + "learning_rate": 5.403140899502921e-06, + "loss": 0.7958, + "step": 5217 + }, + { + "epoch": 0.24875455867280052, + "grad_norm": 1.842162847518921, + "learning_rate": 5.399652893980486e-06, + "loss": 1.2515, + "step": 5218 + }, + { + "epoch": 0.2488022310681, + "grad_norm": 1.205552101135254, + "learning_rate": 5.396165598244234e-06, + "loss": 0.5498, + "step": 5219 + }, + { + "epoch": 0.24884990346339952, + "grad_norm": 2.4317197799682617, + "learning_rate": 5.392679012832225e-06, + "loss": 0.6628, + "step": 5220 + }, + { + "epoch": 0.248897575858699, + "grad_norm": 1.5645935535430908, + "learning_rate": 5.389193138282393e-06, + "loss": 0.8413, + "step": 5221 + }, + { + "epoch": 0.24894524825399852, + "grad_norm": 0.9529103636741638, + "learning_rate": 5.385707975132582e-06, + "loss": 0.5029, + "step": 5222 + }, + { + "epoch": 0.248992920649298, + "grad_norm": 3.805121898651123, + "learning_rate": 5.382223523920511e-06, + "loss": 
1.1281, + "step": 5223 + }, + { + "epoch": 0.24904059304459752, + "grad_norm": 2.4295339584350586, + "learning_rate": 5.378739785183794e-06, + "loss": 0.6934, + "step": 5224 + }, + { + "epoch": 0.24908826543989704, + "grad_norm": 1.1411960124969482, + "learning_rate": 5.375256759459939e-06, + "loss": 0.7916, + "step": 5225 + }, + { + "epoch": 0.24913593783519652, + "grad_norm": 1.18546724319458, + "learning_rate": 5.371774447286343e-06, + "loss": 0.6397, + "step": 5226 + }, + { + "epoch": 0.24918361023049604, + "grad_norm": 0.9702255725860596, + "learning_rate": 5.368292849200285e-06, + "loss": 0.3293, + "step": 5227 + }, + { + "epoch": 0.24923128262579552, + "grad_norm": 1.247194766998291, + "learning_rate": 5.364811965738943e-06, + "loss": 0.5138, + "step": 5228 + }, + { + "epoch": 0.24927895502109504, + "grad_norm": 2.6932270526885986, + "learning_rate": 5.361331797439384e-06, + "loss": 0.5032, + "step": 5229 + }, + { + "epoch": 0.24932662741639453, + "grad_norm": 3.3012616634368896, + "learning_rate": 5.357852344838557e-06, + "loss": 0.986, + "step": 5230 + }, + { + "epoch": 0.24937429981169404, + "grad_norm": 2.384352922439575, + "learning_rate": 5.354373608473309e-06, + "loss": 1.0236, + "step": 5231 + }, + { + "epoch": 0.24942197220699355, + "grad_norm": 1.5132157802581787, + "learning_rate": 5.350895588880376e-06, + "loss": 0.746, + "step": 5232 + }, + { + "epoch": 0.24946964460229304, + "grad_norm": 1.6053189039230347, + "learning_rate": 5.347418286596372e-06, + "loss": 0.8488, + "step": 5233 + }, + { + "epoch": 0.24951731699759255, + "grad_norm": 1.4705743789672852, + "learning_rate": 5.3439417021578154e-06, + "loss": 0.8896, + "step": 5234 + }, + { + "epoch": 0.24956498939289204, + "grad_norm": 2.051260471343994, + "learning_rate": 5.340465836101109e-06, + "loss": 0.5652, + "step": 5235 + }, + { + "epoch": 0.24961266178819155, + "grad_norm": 1.3342063426971436, + "learning_rate": 5.336990688962537e-06, + "loss": 0.4792, + "step": 5236 + }, + { + "epoch": 0.24966033418349104, + "grad_norm": 2.822033405303955, + "learning_rate": 5.333516261278285e-06, + "loss": 0.9676, + "step": 5237 + }, + { + "epoch": 0.24970800657879055, + "grad_norm": 2.3854379653930664, + "learning_rate": 5.330042553584416e-06, + "loss": 1.1075, + "step": 5238 + }, + { + "epoch": 0.24975567897409004, + "grad_norm": 1.594692349433899, + "learning_rate": 5.3265695664168926e-06, + "loss": 0.7578, + "step": 5239 + }, + { + "epoch": 0.24980335136938955, + "grad_norm": 1.4988423585891724, + "learning_rate": 5.323097300311553e-06, + "loss": 0.8371, + "step": 5240 + }, + { + "epoch": 0.24985102376468907, + "grad_norm": 1.9015144109725952, + "learning_rate": 5.3196257558041386e-06, + "loss": 0.9726, + "step": 5241 + }, + { + "epoch": 0.24989869615998855, + "grad_norm": 1.7048072814941406, + "learning_rate": 5.316154933430276e-06, + "loss": 0.6322, + "step": 5242 + }, + { + "epoch": 0.24994636855528807, + "grad_norm": 3.4977493286132812, + "learning_rate": 5.312684833725468e-06, + "loss": 1.444, + "step": 5243 + }, + { + "epoch": 0.24999404095058755, + "grad_norm": 1.6783192157745361, + "learning_rate": 5.309215457225121e-06, + "loss": 0.5289, + "step": 5244 + }, + { + "epoch": 0.25004171334588704, + "grad_norm": 2.484001874923706, + "learning_rate": 5.305746804464526e-06, + "loss": 0.2851, + "step": 5245 + }, + { + "epoch": 0.25008938574118655, + "grad_norm": 1.3448243141174316, + "learning_rate": 5.302278875978855e-06, + "loss": 0.7044, + "step": 5246 + }, + { + "epoch": 0.25013705813648607, + "grad_norm": 
1.1596590280532837, + "learning_rate": 5.298811672303174e-06, + "loss": 0.6366, + "step": 5247 + }, + { + "epoch": 0.2501847305317856, + "grad_norm": 1.43930983543396, + "learning_rate": 5.295345193972445e-06, + "loss": 0.6953, + "step": 5248 + }, + { + "epoch": 0.2502324029270851, + "grad_norm": 2.0993826389312744, + "learning_rate": 5.291879441521499e-06, + "loss": 0.3127, + "step": 5249 + }, + { + "epoch": 0.25028007532238455, + "grad_norm": 1.432981014251709, + "learning_rate": 5.288414415485072e-06, + "loss": 0.775, + "step": 5250 + }, + { + "epoch": 0.25032774771768407, + "grad_norm": 2.131667137145996, + "learning_rate": 5.2849501163977846e-06, + "loss": 0.8331, + "step": 5251 + }, + { + "epoch": 0.2503754201129836, + "grad_norm": 1.0865910053253174, + "learning_rate": 5.281486544794139e-06, + "loss": 0.452, + "step": 5252 + }, + { + "epoch": 0.2504230925082831, + "grad_norm": 2.509237766265869, + "learning_rate": 5.278023701208523e-06, + "loss": 0.8609, + "step": 5253 + }, + { + "epoch": 0.25047076490358255, + "grad_norm": 1.3018155097961426, + "learning_rate": 5.274561586175226e-06, + "loss": 0.4136, + "step": 5254 + }, + { + "epoch": 0.25051843729888207, + "grad_norm": 3.1016173362731934, + "learning_rate": 5.271100200228412e-06, + "loss": 1.3016, + "step": 5255 + }, + { + "epoch": 0.2505661096941816, + "grad_norm": 1.2012239694595337, + "learning_rate": 5.2676395439021385e-06, + "loss": 0.691, + "step": 5256 + }, + { + "epoch": 0.2506137820894811, + "grad_norm": 1.8148443698883057, + "learning_rate": 5.264179617730353e-06, + "loss": 0.83, + "step": 5257 + }, + { + "epoch": 0.2506614544847806, + "grad_norm": 2.251467227935791, + "learning_rate": 5.260720422246879e-06, + "loss": 0.7924, + "step": 5258 + }, + { + "epoch": 0.25070912688008007, + "grad_norm": 2.4197072982788086, + "learning_rate": 5.257261957985438e-06, + "loss": 0.3565, + "step": 5259 + }, + { + "epoch": 0.2507567992753796, + "grad_norm": 1.0452011823654175, + "learning_rate": 5.253804225479642e-06, + "loss": 0.6552, + "step": 5260 + }, + { + "epoch": 0.2508044716706791, + "grad_norm": 1.5356444120407104, + "learning_rate": 5.250347225262972e-06, + "loss": 0.5288, + "step": 5261 + }, + { + "epoch": 0.2508521440659786, + "grad_norm": 2.0456509590148926, + "learning_rate": 5.246890957868813e-06, + "loss": 0.9027, + "step": 5262 + }, + { + "epoch": 0.2508998164612781, + "grad_norm": 5.187041282653809, + "learning_rate": 5.243435423830436e-06, + "loss": 0.8829, + "step": 5263 + }, + { + "epoch": 0.2509474888565776, + "grad_norm": 1.9498354196548462, + "learning_rate": 5.239980623680987e-06, + "loss": 1.0658, + "step": 5264 + }, + { + "epoch": 0.2509951612518771, + "grad_norm": 1.5801407098770142, + "learning_rate": 5.236526557953508e-06, + "loss": 1.1578, + "step": 5265 + }, + { + "epoch": 0.2510428336471766, + "grad_norm": 4.058075904846191, + "learning_rate": 5.233073227180932e-06, + "loss": 0.4922, + "step": 5266 + }, + { + "epoch": 0.2510905060424761, + "grad_norm": 1.8664467334747314, + "learning_rate": 5.229620631896065e-06, + "loss": 0.8485, + "step": 5267 + }, + { + "epoch": 0.2511381784377756, + "grad_norm": 2.2463502883911133, + "learning_rate": 5.226168772631606e-06, + "loss": 0.538, + "step": 5268 + }, + { + "epoch": 0.2511858508330751, + "grad_norm": 1.2202684879302979, + "learning_rate": 5.22271764992015e-06, + "loss": 0.665, + "step": 5269 + }, + { + "epoch": 0.2512335232283746, + "grad_norm": 1.7239629030227661, + "learning_rate": 5.219267264294159e-06, + "loss": 0.5625, + "step": 5270 + }, + { + 
"epoch": 0.2512811956236741, + "grad_norm": 1.5027018785476685, + "learning_rate": 5.215817616285996e-06, + "loss": 0.5832, + "step": 5271 + }, + { + "epoch": 0.25132886801897364, + "grad_norm": 1.3608691692352295, + "learning_rate": 5.212368706427913e-06, + "loss": 1.0427, + "step": 5272 + }, + { + "epoch": 0.2513765404142731, + "grad_norm": 1.505457878112793, + "learning_rate": 5.20892053525203e-06, + "loss": 0.7733, + "step": 5273 + }, + { + "epoch": 0.2514242128095726, + "grad_norm": 1.5560054779052734, + "learning_rate": 5.2054731032903704e-06, + "loss": 0.7243, + "step": 5274 + }, + { + "epoch": 0.2514718852048721, + "grad_norm": 1.2661142349243164, + "learning_rate": 5.202026411074841e-06, + "loss": 0.6936, + "step": 5275 + }, + { + "epoch": 0.25151955760017164, + "grad_norm": 1.3761965036392212, + "learning_rate": 5.198580459137224e-06, + "loss": 0.6419, + "step": 5276 + }, + { + "epoch": 0.2515672299954711, + "grad_norm": 1.3047969341278076, + "learning_rate": 5.195135248009196e-06, + "loss": 0.951, + "step": 5277 + }, + { + "epoch": 0.2516149023907706, + "grad_norm": 1.3792136907577515, + "learning_rate": 5.191690778222318e-06, + "loss": 0.6444, + "step": 5278 + }, + { + "epoch": 0.2516625747860701, + "grad_norm": 1.1357731819152832, + "learning_rate": 5.188247050308042e-06, + "loss": 0.7093, + "step": 5279 + }, + { + "epoch": 0.25171024718136964, + "grad_norm": 1.6982618570327759, + "learning_rate": 5.18480406479769e-06, + "loss": 0.7102, + "step": 5280 + }, + { + "epoch": 0.25175791957666915, + "grad_norm": 1.6509249210357666, + "learning_rate": 5.181361822222488e-06, + "loss": 0.5838, + "step": 5281 + }, + { + "epoch": 0.2518055919719686, + "grad_norm": 1.6463545560836792, + "learning_rate": 5.177920323113531e-06, + "loss": 0.7904, + "step": 5282 + }, + { + "epoch": 0.2518532643672681, + "grad_norm": 2.519686222076416, + "learning_rate": 5.174479568001813e-06, + "loss": 0.5023, + "step": 5283 + }, + { + "epoch": 0.25190093676256764, + "grad_norm": 2.3719160556793213, + "learning_rate": 5.1710395574182026e-06, + "loss": 1.0992, + "step": 5284 + }, + { + "epoch": 0.25194860915786715, + "grad_norm": 1.668278455734253, + "learning_rate": 5.167600291893462e-06, + "loss": 0.9031, + "step": 5285 + }, + { + "epoch": 0.2519962815531666, + "grad_norm": 1.5020698308944702, + "learning_rate": 5.16416177195823e-06, + "loss": 0.6652, + "step": 5286 + }, + { + "epoch": 0.2520439539484661, + "grad_norm": 1.4558539390563965, + "learning_rate": 5.1607239981430355e-06, + "loss": 0.6531, + "step": 5287 + }, + { + "epoch": 0.25209162634376564, + "grad_norm": 1.1254277229309082, + "learning_rate": 5.1572869709782965e-06, + "loss": 0.5503, + "step": 5288 + }, + { + "epoch": 0.25213929873906515, + "grad_norm": 2.740602731704712, + "learning_rate": 5.153850690994306e-06, + "loss": 0.7217, + "step": 5289 + }, + { + "epoch": 0.25218697113436467, + "grad_norm": 1.6636793613433838, + "learning_rate": 5.150415158721247e-06, + "loss": 0.6611, + "step": 5290 + }, + { + "epoch": 0.2522346435296641, + "grad_norm": 2.151529312133789, + "learning_rate": 5.146980374689192e-06, + "loss": 0.2378, + "step": 5291 + }, + { + "epoch": 0.25228231592496364, + "grad_norm": 1.524881362915039, + "learning_rate": 5.143546339428085e-06, + "loss": 0.4307, + "step": 5292 + }, + { + "epoch": 0.25232998832026315, + "grad_norm": 1.3537802696228027, + "learning_rate": 5.140113053467765e-06, + "loss": 0.5461, + "step": 5293 + }, + { + "epoch": 0.25237766071556267, + "grad_norm": 1.3754298686981201, + "learning_rate": 
5.1366805173379575e-06, + "loss": 0.5483, + "step": 5294 + }, + { + "epoch": 0.2524253331108622, + "grad_norm": 2.7808735370635986, + "learning_rate": 5.133248731568261e-06, + "loss": 0.5615, + "step": 5295 + }, + { + "epoch": 0.25247300550616164, + "grad_norm": 1.1643399000167847, + "learning_rate": 5.1298176966881705e-06, + "loss": 0.7063, + "step": 5296 + }, + { + "epoch": 0.25252067790146115, + "grad_norm": 3.938659906387329, + "learning_rate": 5.126387413227053e-06, + "loss": 0.7151, + "step": 5297 + }, + { + "epoch": 0.25256835029676067, + "grad_norm": 1.7053200006484985, + "learning_rate": 5.122957881714172e-06, + "loss": 0.7394, + "step": 5298 + }, + { + "epoch": 0.2526160226920602, + "grad_norm": 1.0608376264572144, + "learning_rate": 5.119529102678665e-06, + "loss": 0.599, + "step": 5299 + }, + { + "epoch": 0.25266369508735964, + "grad_norm": 1.681151032447815, + "learning_rate": 5.116101076649559e-06, + "loss": 0.8599, + "step": 5300 + }, + { + "epoch": 0.25271136748265915, + "grad_norm": 1.4083402156829834, + "learning_rate": 5.112673804155759e-06, + "loss": 0.746, + "step": 5301 + }, + { + "epoch": 0.25275903987795867, + "grad_norm": 1.378050684928894, + "learning_rate": 5.109247285726062e-06, + "loss": 0.5942, + "step": 5302 + }, + { + "epoch": 0.2528067122732582, + "grad_norm": 1.2420076131820679, + "learning_rate": 5.105821521889147e-06, + "loss": 0.6261, + "step": 5303 + }, + { + "epoch": 0.2528543846685577, + "grad_norm": 3.186793327331543, + "learning_rate": 5.102396513173569e-06, + "loss": 1.267, + "step": 5304 + }, + { + "epoch": 0.25290205706385716, + "grad_norm": 1.5938936471939087, + "learning_rate": 5.098972260107771e-06, + "loss": 0.9206, + "step": 5305 + }, + { + "epoch": 0.25294972945915667, + "grad_norm": 1.4803907871246338, + "learning_rate": 5.0955487632200885e-06, + "loss": 0.714, + "step": 5306 + }, + { + "epoch": 0.2529974018544562, + "grad_norm": 1.4852310419082642, + "learning_rate": 5.0921260230387195e-06, + "loss": 0.7737, + "step": 5307 + }, + { + "epoch": 0.2530450742497557, + "grad_norm": 3.1453282833099365, + "learning_rate": 5.088704040091765e-06, + "loss": 0.9843, + "step": 5308 + }, + { + "epoch": 0.25309274664505516, + "grad_norm": 1.649553656578064, + "learning_rate": 5.085282814907205e-06, + "loss": 0.7155, + "step": 5309 + }, + { + "epoch": 0.25314041904035467, + "grad_norm": 1.3235059976577759, + "learning_rate": 5.081862348012892e-06, + "loss": 0.9422, + "step": 5310 + }, + { + "epoch": 0.2531880914356542, + "grad_norm": 1.7374777793884277, + "learning_rate": 5.0784426399365725e-06, + "loss": 0.6619, + "step": 5311 + }, + { + "epoch": 0.2532357638309537, + "grad_norm": 1.2235597372055054, + "learning_rate": 5.075023691205869e-06, + "loss": 0.5919, + "step": 5312 + }, + { + "epoch": 0.2532834362262532, + "grad_norm": 2.3804304599761963, + "learning_rate": 5.071605502348297e-06, + "loss": 0.5582, + "step": 5313 + }, + { + "epoch": 0.25333110862155267, + "grad_norm": 4.60269021987915, + "learning_rate": 5.068188073891238e-06, + "loss": 0.601, + "step": 5314 + }, + { + "epoch": 0.2533787810168522, + "grad_norm": 1.9278857707977295, + "learning_rate": 5.064771406361973e-06, + "loss": 0.371, + "step": 5315 + }, + { + "epoch": 0.2534264534121517, + "grad_norm": 1.3372584581375122, + "learning_rate": 5.06135550028766e-06, + "loss": 0.7439, + "step": 5316 + }, + { + "epoch": 0.2534741258074512, + "grad_norm": 2.1416614055633545, + "learning_rate": 5.057940356195332e-06, + "loss": 0.6466, + "step": 5317 + }, + { + "epoch": 0.25352179820275067, + 
"grad_norm": 1.6502330303192139, + "learning_rate": 5.054525974611913e-06, + "loss": 0.8499, + "step": 5318 + }, + { + "epoch": 0.2535694705980502, + "grad_norm": 1.8139033317565918, + "learning_rate": 5.051112356064212e-06, + "loss": 0.5642, + "step": 5319 + }, + { + "epoch": 0.2536171429933497, + "grad_norm": 1.177715539932251, + "learning_rate": 5.047699501078905e-06, + "loss": 0.4434, + "step": 5320 + }, + { + "epoch": 0.2536648153886492, + "grad_norm": 2.207502603530884, + "learning_rate": 5.044287410182568e-06, + "loss": 0.8216, + "step": 5321 + }, + { + "epoch": 0.2537124877839487, + "grad_norm": 3.4811158180236816, + "learning_rate": 5.040876083901654e-06, + "loss": 1.068, + "step": 5322 + }, + { + "epoch": 0.2537601601792482, + "grad_norm": 1.4357497692108154, + "learning_rate": 5.037465522762486e-06, + "loss": 0.342, + "step": 5323 + }, + { + "epoch": 0.2538078325745477, + "grad_norm": 2.882291078567505, + "learning_rate": 5.034055727291283e-06, + "loss": 0.7484, + "step": 5324 + }, + { + "epoch": 0.2538555049698472, + "grad_norm": 2.103181838989258, + "learning_rate": 5.0306466980141475e-06, + "loss": 0.9806, + "step": 5325 + }, + { + "epoch": 0.2539031773651467, + "grad_norm": 3.2125372886657715, + "learning_rate": 5.027238435457047e-06, + "loss": 1.2797, + "step": 5326 + }, + { + "epoch": 0.25395084976044624, + "grad_norm": 2.267927646636963, + "learning_rate": 5.023830940145851e-06, + "loss": 0.9789, + "step": 5327 + }, + { + "epoch": 0.2539985221557457, + "grad_norm": 4.6167449951171875, + "learning_rate": 5.0204242126062964e-06, + "loss": 0.509, + "step": 5328 + }, + { + "epoch": 0.2540461945510452, + "grad_norm": 1.4431474208831787, + "learning_rate": 5.017018253364001e-06, + "loss": 0.5063, + "step": 5329 + }, + { + "epoch": 0.2540938669463447, + "grad_norm": 1.4122262001037598, + "learning_rate": 5.0136130629444755e-06, + "loss": 0.8872, + "step": 5330 + }, + { + "epoch": 0.25414153934164424, + "grad_norm": 3.4316749572753906, + "learning_rate": 5.010208641873109e-06, + "loss": 1.3353, + "step": 5331 + }, + { + "epoch": 0.2541892117369437, + "grad_norm": 1.7836222648620605, + "learning_rate": 5.006804990675158e-06, + "loss": 0.706, + "step": 5332 + }, + { + "epoch": 0.2542368841322432, + "grad_norm": 3.4742655754089355, + "learning_rate": 5.003402109875779e-06, + "loss": 1.4159, + "step": 5333 + }, + { + "epoch": 0.2542845565275427, + "grad_norm": 2.4239935874938965, + "learning_rate": 5.000000000000003e-06, + "loss": 0.7234, + "step": 5334 + }, + { + "epoch": 0.25433222892284224, + "grad_norm": 0.8686856627464294, + "learning_rate": 4.996598661572732e-06, + "loss": 0.3915, + "step": 5335 + }, + { + "epoch": 0.25437990131814175, + "grad_norm": 2.139826774597168, + "learning_rate": 4.993198095118763e-06, + "loss": 1.1792, + "step": 5336 + }, + { + "epoch": 0.2544275737134412, + "grad_norm": 1.431489109992981, + "learning_rate": 4.989798301162772e-06, + "loss": 0.612, + "step": 5337 + }, + { + "epoch": 0.2544752461087407, + "grad_norm": 2.7369344234466553, + "learning_rate": 4.986399280229304e-06, + "loss": 1.3041, + "step": 5338 + }, + { + "epoch": 0.25452291850404024, + "grad_norm": 1.7966467142105103, + "learning_rate": 4.983001032842797e-06, + "loss": 0.6154, + "step": 5339 + }, + { + "epoch": 0.25457059089933975, + "grad_norm": 1.8476978540420532, + "learning_rate": 4.979603559527569e-06, + "loss": 0.8012, + "step": 5340 + }, + { + "epoch": 0.2546182632946392, + "grad_norm": 1.9659759998321533, + "learning_rate": 4.976206860807808e-06, + "loss": 0.8354, + "step": 5341 
+ }, + { + "epoch": 0.2546659356899387, + "grad_norm": 2.0037567615509033, + "learning_rate": 4.972810937207599e-06, + "loss": 0.8945, + "step": 5342 + }, + { + "epoch": 0.25471360808523824, + "grad_norm": 2.5204577445983887, + "learning_rate": 4.96941578925089e-06, + "loss": 0.7901, + "step": 5343 + }, + { + "epoch": 0.25476128048053776, + "grad_norm": 3.08522629737854, + "learning_rate": 4.9660214174615165e-06, + "loss": 1.033, + "step": 5344 + }, + { + "epoch": 0.25480895287583727, + "grad_norm": 1.8553155660629272, + "learning_rate": 4.9626278223631985e-06, + "loss": 0.9732, + "step": 5345 + }, + { + "epoch": 0.2548566252711367, + "grad_norm": 2.0226073265075684, + "learning_rate": 4.959235004479537e-06, + "loss": 0.4234, + "step": 5346 + }, + { + "epoch": 0.25490429766643624, + "grad_norm": 1.4804707765579224, + "learning_rate": 4.955842964334e-06, + "loss": 0.5954, + "step": 5347 + }, + { + "epoch": 0.25495197006173576, + "grad_norm": 1.7300760746002197, + "learning_rate": 4.952451702449949e-06, + "loss": 0.8861, + "step": 5348 + }, + { + "epoch": 0.25499964245703527, + "grad_norm": 1.476699709892273, + "learning_rate": 4.949061219350624e-06, + "loss": 0.597, + "step": 5349 + }, + { + "epoch": 0.2550473148523348, + "grad_norm": 1.777708888053894, + "learning_rate": 4.945671515559135e-06, + "loss": 0.7481, + "step": 5350 + }, + { + "epoch": 0.25509498724763424, + "grad_norm": 1.6463401317596436, + "learning_rate": 4.942282591598481e-06, + "loss": 0.9888, + "step": 5351 + }, + { + "epoch": 0.25514265964293376, + "grad_norm": 1.3731558322906494, + "learning_rate": 4.938894447991544e-06, + "loss": 0.5485, + "step": 5352 + }, + { + "epoch": 0.25519033203823327, + "grad_norm": 1.1557267904281616, + "learning_rate": 4.935507085261069e-06, + "loss": 0.6922, + "step": 5353 + }, + { + "epoch": 0.2552380044335328, + "grad_norm": 1.6899571418762207, + "learning_rate": 4.932120503929696e-06, + "loss": 0.6992, + "step": 5354 + }, + { + "epoch": 0.25528567682883224, + "grad_norm": 1.7605578899383545, + "learning_rate": 4.928734704519945e-06, + "loss": 0.9358, + "step": 5355 + }, + { + "epoch": 0.25533334922413176, + "grad_norm": 1.504723072052002, + "learning_rate": 4.925349687554201e-06, + "loss": 0.7706, + "step": 5356 + }, + { + "epoch": 0.25538102161943127, + "grad_norm": 0.941290020942688, + "learning_rate": 4.921965453554747e-06, + "loss": 0.3572, + "step": 5357 + }, + { + "epoch": 0.2554286940147308, + "grad_norm": 1.3282690048217773, + "learning_rate": 4.918582003043724e-06, + "loss": 0.9303, + "step": 5358 + }, + { + "epoch": 0.2554763664100303, + "grad_norm": 1.637980580329895, + "learning_rate": 4.9151993365431735e-06, + "loss": 0.4201, + "step": 5359 + }, + { + "epoch": 0.25552403880532976, + "grad_norm": 1.318679690361023, + "learning_rate": 4.911817454575e-06, + "loss": 0.9315, + "step": 5360 + }, + { + "epoch": 0.25557171120062927, + "grad_norm": 2.373147487640381, + "learning_rate": 4.908436357660993e-06, + "loss": 0.4076, + "step": 5361 + }, + { + "epoch": 0.2556193835959288, + "grad_norm": 1.6679741144180298, + "learning_rate": 4.905056046322828e-06, + "loss": 0.3708, + "step": 5362 + }, + { + "epoch": 0.2556670559912283, + "grad_norm": 1.6805018186569214, + "learning_rate": 4.901676521082043e-06, + "loss": 0.8703, + "step": 5363 + }, + { + "epoch": 0.25571472838652776, + "grad_norm": 2.537540912628174, + "learning_rate": 4.8982977824600685e-06, + "loss": 0.4596, + "step": 5364 + }, + { + "epoch": 0.25576240078182727, + "grad_norm": 1.9816516637802124, + "learning_rate": 
4.894919830978212e-06, + "loss": 1.0274, + "step": 5365 + }, + { + "epoch": 0.2558100731771268, + "grad_norm": 2.5843825340270996, + "learning_rate": 4.89154266715765e-06, + "loss": 1.0977, + "step": 5366 + }, + { + "epoch": 0.2558577455724263, + "grad_norm": 1.8490378856658936, + "learning_rate": 4.888166291519449e-06, + "loss": 0.9713, + "step": 5367 + }, + { + "epoch": 0.2559054179677258, + "grad_norm": 2.8207590579986572, + "learning_rate": 4.884790704584549e-06, + "loss": 0.7729, + "step": 5368 + }, + { + "epoch": 0.25595309036302527, + "grad_norm": 1.501035451889038, + "learning_rate": 4.881415906873763e-06, + "loss": 0.473, + "step": 5369 + }, + { + "epoch": 0.2560007627583248, + "grad_norm": 2.073540210723877, + "learning_rate": 4.878041898907793e-06, + "loss": 0.678, + "step": 5370 + }, + { + "epoch": 0.2560484351536243, + "grad_norm": 1.812528133392334, + "learning_rate": 4.874668681207215e-06, + "loss": 0.9846, + "step": 5371 + }, + { + "epoch": 0.2560961075489238, + "grad_norm": 1.4102495908737183, + "learning_rate": 4.871296254292479e-06, + "loss": 0.5277, + "step": 5372 + }, + { + "epoch": 0.25614377994422327, + "grad_norm": 1.2758127450942993, + "learning_rate": 4.867924618683911e-06, + "loss": 0.8227, + "step": 5373 + }, + { + "epoch": 0.2561914523395228, + "grad_norm": 1.5390113592147827, + "learning_rate": 4.8645537749017295e-06, + "loss": 0.8553, + "step": 5374 + }, + { + "epoch": 0.2562391247348223, + "grad_norm": 1.7708622217178345, + "learning_rate": 4.861183723466011e-06, + "loss": 0.6534, + "step": 5375 + }, + { + "epoch": 0.2562867971301218, + "grad_norm": 1.7983982563018799, + "learning_rate": 4.857814464896724e-06, + "loss": 0.7854, + "step": 5376 + }, + { + "epoch": 0.2563344695254213, + "grad_norm": 8.485481262207031, + "learning_rate": 4.854445999713715e-06, + "loss": 0.7118, + "step": 5377 + }, + { + "epoch": 0.2563821419207208, + "grad_norm": 1.3313298225402832, + "learning_rate": 4.851078328436696e-06, + "loss": 0.745, + "step": 5378 + }, + { + "epoch": 0.2564298143160203, + "grad_norm": 6.466851234436035, + "learning_rate": 4.847711451585266e-06, + "loss": 1.2719, + "step": 5379 + }, + { + "epoch": 0.2564774867113198, + "grad_norm": 2.2267777919769287, + "learning_rate": 4.8443453696789055e-06, + "loss": 1.1769, + "step": 5380 + }, + { + "epoch": 0.2565251591066193, + "grad_norm": 1.290576696395874, + "learning_rate": 4.840980083236958e-06, + "loss": 0.9364, + "step": 5381 + }, + { + "epoch": 0.25657283150191884, + "grad_norm": 1.5591100454330444, + "learning_rate": 4.837615592778655e-06, + "loss": 0.5752, + "step": 5382 + }, + { + "epoch": 0.2566205038972183, + "grad_norm": 1.2779251337051392, + "learning_rate": 4.834251898823108e-06, + "loss": 0.5798, + "step": 5383 + }, + { + "epoch": 0.2566681762925178, + "grad_norm": 3.384645700454712, + "learning_rate": 4.8308890018892914e-06, + "loss": 1.7959, + "step": 5384 + }, + { + "epoch": 0.2567158486878173, + "grad_norm": 1.457976222038269, + "learning_rate": 4.827526902496073e-06, + "loss": 0.5435, + "step": 5385 + }, + { + "epoch": 0.25676352108311684, + "grad_norm": 1.695383906364441, + "learning_rate": 4.8241656011621886e-06, + "loss": 0.4573, + "step": 5386 + }, + { + "epoch": 0.2568111934784163, + "grad_norm": 5.1100287437438965, + "learning_rate": 4.8208050984062515e-06, + "loss": 0.7021, + "step": 5387 + }, + { + "epoch": 0.2568588658737158, + "grad_norm": 1.2557580471038818, + "learning_rate": 4.817445394746749e-06, + "loss": 0.5739, + "step": 5388 + }, + { + "epoch": 0.2569065382690153, + 
"grad_norm": 1.3719812631607056, + "learning_rate": 4.814086490702056e-06, + "loss": 0.4644, + "step": 5389 + }, + { + "epoch": 0.25695421066431484, + "grad_norm": 1.3114820718765259, + "learning_rate": 4.810728386790409e-06, + "loss": 0.9014, + "step": 5390 + }, + { + "epoch": 0.25700188305961436, + "grad_norm": 1.641396164894104, + "learning_rate": 4.807371083529933e-06, + "loss": 0.4296, + "step": 5391 + }, + { + "epoch": 0.2570495554549138, + "grad_norm": 3.38991379737854, + "learning_rate": 4.8040145814386245e-06, + "loss": 1.0257, + "step": 5392 + }, + { + "epoch": 0.25709722785021333, + "grad_norm": 2.380986452102661, + "learning_rate": 4.800658881034362e-06, + "loss": 0.2672, + "step": 5393 + }, + { + "epoch": 0.25714490024551284, + "grad_norm": 2.6957061290740967, + "learning_rate": 4.797303982834887e-06, + "loss": 0.694, + "step": 5394 + }, + { + "epoch": 0.25719257264081236, + "grad_norm": 1.5798816680908203, + "learning_rate": 4.79394988735783e-06, + "loss": 0.6173, + "step": 5395 + }, + { + "epoch": 0.2572402450361118, + "grad_norm": 2.6076266765594482, + "learning_rate": 4.790596595120699e-06, + "loss": 1.404, + "step": 5396 + }, + { + "epoch": 0.25728791743141133, + "grad_norm": 1.587837815284729, + "learning_rate": 4.787244106640861e-06, + "loss": 0.4521, + "step": 5397 + }, + { + "epoch": 0.25733558982671084, + "grad_norm": 1.3771586418151855, + "learning_rate": 4.783892422435577e-06, + "loss": 0.719, + "step": 5398 + }, + { + "epoch": 0.25738326222201036, + "grad_norm": 1.5652211904525757, + "learning_rate": 4.780541543021981e-06, + "loss": 0.6053, + "step": 5399 + }, + { + "epoch": 0.25743093461730987, + "grad_norm": 1.33492910861969, + "learning_rate": 4.7771914689170704e-06, + "loss": 0.6448, + "step": 5400 + }, + { + "epoch": 0.25747860701260933, + "grad_norm": 3.785646438598633, + "learning_rate": 4.773842200637736e-06, + "loss": 0.8005, + "step": 5401 + }, + { + "epoch": 0.25752627940790884, + "grad_norm": 1.115527868270874, + "learning_rate": 4.770493738700727e-06, + "loss": 0.724, + "step": 5402 + }, + { + "epoch": 0.25757395180320836, + "grad_norm": 1.87267005443573, + "learning_rate": 4.7671460836226845e-06, + "loss": 0.5195, + "step": 5403 + }, + { + "epoch": 0.25762162419850787, + "grad_norm": 1.386061429977417, + "learning_rate": 4.763799235920109e-06, + "loss": 0.8242, + "step": 5404 + }, + { + "epoch": 0.25766929659380733, + "grad_norm": 1.486469030380249, + "learning_rate": 4.760453196109394e-06, + "loss": 0.6666, + "step": 5405 + }, + { + "epoch": 0.25771696898910684, + "grad_norm": 1.8709571361541748, + "learning_rate": 4.757107964706788e-06, + "loss": 0.8234, + "step": 5406 + }, + { + "epoch": 0.25776464138440636, + "grad_norm": 1.7033472061157227, + "learning_rate": 4.753763542228433e-06, + "loss": 0.858, + "step": 5407 + }, + { + "epoch": 0.25781231377970587, + "grad_norm": 2.236379861831665, + "learning_rate": 4.750419929190342e-06, + "loss": 1.2991, + "step": 5408 + }, + { + "epoch": 0.2578599861750054, + "grad_norm": 2.5572237968444824, + "learning_rate": 4.7470771261083915e-06, + "loss": 0.869, + "step": 5409 + }, + { + "epoch": 0.25790765857030484, + "grad_norm": 1.9142961502075195, + "learning_rate": 4.743735133498346e-06, + "loss": 1.0696, + "step": 5410 + }, + { + "epoch": 0.25795533096560436, + "grad_norm": 1.1364426612854004, + "learning_rate": 4.740393951875843e-06, + "loss": 0.6, + "step": 5411 + }, + { + "epoch": 0.25800300336090387, + "grad_norm": 2.2477362155914307, + "learning_rate": 4.737053581756387e-06, + "loss": 1.0807, + "step": 
5412 + }, + { + "epoch": 0.2580506757562034, + "grad_norm": 1.9631001949310303, + "learning_rate": 4.733714023655366e-06, + "loss": 0.791, + "step": 5413 + }, + { + "epoch": 0.2580983481515029, + "grad_norm": 2.146395444869995, + "learning_rate": 4.730375278088042e-06, + "loss": 0.6613, + "step": 5414 + }, + { + "epoch": 0.25814602054680236, + "grad_norm": 1.670370101928711, + "learning_rate": 4.727037345569542e-06, + "loss": 0.912, + "step": 5415 + }, + { + "epoch": 0.25819369294210187, + "grad_norm": 1.6563982963562012, + "learning_rate": 4.723700226614882e-06, + "loss": 0.9118, + "step": 5416 + }, + { + "epoch": 0.2582413653374014, + "grad_norm": 3.493528366088867, + "learning_rate": 4.7203639217389385e-06, + "loss": 1.1187, + "step": 5417 + }, + { + "epoch": 0.2582890377327009, + "grad_norm": 1.8224412202835083, + "learning_rate": 4.717028431456475e-06, + "loss": 0.5434, + "step": 5418 + }, + { + "epoch": 0.25833671012800036, + "grad_norm": 2.786318778991699, + "learning_rate": 4.713693756282118e-06, + "loss": 0.8327, + "step": 5419 + }, + { + "epoch": 0.25838438252329987, + "grad_norm": 4.453581809997559, + "learning_rate": 4.710359896730379e-06, + "loss": 1.0295, + "step": 5420 + }, + { + "epoch": 0.2584320549185994, + "grad_norm": 1.4712653160095215, + "learning_rate": 4.7070268533156315e-06, + "loss": 0.6225, + "step": 5421 + }, + { + "epoch": 0.2584797273138989, + "grad_norm": 1.6703158617019653, + "learning_rate": 4.7036946265521335e-06, + "loss": 0.7127, + "step": 5422 + }, + { + "epoch": 0.2585273997091984, + "grad_norm": 1.2520533800125122, + "learning_rate": 4.700363216954017e-06, + "loss": 0.6962, + "step": 5423 + }, + { + "epoch": 0.25857507210449787, + "grad_norm": 1.633542537689209, + "learning_rate": 4.697032625035277e-06, + "loss": 0.5982, + "step": 5424 + }, + { + "epoch": 0.2586227444997974, + "grad_norm": 1.5557509660720825, + "learning_rate": 4.693702851309793e-06, + "loss": 0.3589, + "step": 5425 + }, + { + "epoch": 0.2586704168950969, + "grad_norm": 1.874098300933838, + "learning_rate": 4.690373896291318e-06, + "loss": 0.5059, + "step": 5426 + }, + { + "epoch": 0.2587180892903964, + "grad_norm": 1.8717933893203735, + "learning_rate": 4.687045760493468e-06, + "loss": 0.3555, + "step": 5427 + }, + { + "epoch": 0.25876576168569587, + "grad_norm": 1.6686381101608276, + "learning_rate": 4.683718444429746e-06, + "loss": 0.6425, + "step": 5428 + }, + { + "epoch": 0.2588134340809954, + "grad_norm": 1.3108073472976685, + "learning_rate": 4.680391948613523e-06, + "loss": 0.7588, + "step": 5429 + }, + { + "epoch": 0.2588611064762949, + "grad_norm": 1.8829936981201172, + "learning_rate": 4.677066273558038e-06, + "loss": 0.8808, + "step": 5430 + }, + { + "epoch": 0.2589087788715944, + "grad_norm": 0.9224647283554077, + "learning_rate": 4.673741419776414e-06, + "loss": 0.3131, + "step": 5431 + }, + { + "epoch": 0.25895645126689393, + "grad_norm": 1.2959907054901123, + "learning_rate": 4.670417387781638e-06, + "loss": 0.7307, + "step": 5432 + }, + { + "epoch": 0.2590041236621934, + "grad_norm": 1.546425700187683, + "learning_rate": 4.6670941780865765e-06, + "loss": 0.8808, + "step": 5433 + }, + { + "epoch": 0.2590517960574929, + "grad_norm": 4.931987762451172, + "learning_rate": 4.663771791203961e-06, + "loss": 0.2057, + "step": 5434 + }, + { + "epoch": 0.2590994684527924, + "grad_norm": 1.8837611675262451, + "learning_rate": 4.660450227646407e-06, + "loss": 1.178, + "step": 5435 + }, + { + "epoch": 0.25914714084809193, + "grad_norm": 2.875595808029175, + "learning_rate": 
4.657129487926398e-06, + "loss": 1.1141, + "step": 5436 + }, + { + "epoch": 0.2591948132433914, + "grad_norm": 1.40999436378479, + "learning_rate": 4.653809572556286e-06, + "loss": 0.7087, + "step": 5437 + }, + { + "epoch": 0.2592424856386909, + "grad_norm": 2.65108323097229, + "learning_rate": 4.650490482048302e-06, + "loss": 1.5518, + "step": 5438 + }, + { + "epoch": 0.2592901580339904, + "grad_norm": 2.2369801998138428, + "learning_rate": 4.647172216914551e-06, + "loss": 1.1065, + "step": 5439 + }, + { + "epoch": 0.25933783042928993, + "grad_norm": 1.1635276079177856, + "learning_rate": 4.643854777666998e-06, + "loss": 0.8449, + "step": 5440 + }, + { + "epoch": 0.25938550282458944, + "grad_norm": 3.396414041519165, + "learning_rate": 4.6405381648174976e-06, + "loss": 0.6044, + "step": 5441 + }, + { + "epoch": 0.2594331752198889, + "grad_norm": 2.0113134384155273, + "learning_rate": 4.637222378877768e-06, + "loss": 0.6767, + "step": 5442 + }, + { + "epoch": 0.2594808476151884, + "grad_norm": 1.8615201711654663, + "learning_rate": 4.633907420359397e-06, + "loss": 0.8034, + "step": 5443 + }, + { + "epoch": 0.25952852001048793, + "grad_norm": 2.0538718700408936, + "learning_rate": 4.630593289773852e-06, + "loss": 0.731, + "step": 5444 + }, + { + "epoch": 0.25957619240578744, + "grad_norm": 2.7262766361236572, + "learning_rate": 4.62727998763247e-06, + "loss": 1.2903, + "step": 5445 + }, + { + "epoch": 0.25962386480108696, + "grad_norm": 2.282655715942383, + "learning_rate": 4.623967514446455e-06, + "loss": 1.0324, + "step": 5446 + }, + { + "epoch": 0.2596715371963864, + "grad_norm": 1.3803765773773193, + "learning_rate": 4.620655870726893e-06, + "loss": 0.825, + "step": 5447 + }, + { + "epoch": 0.25971920959168593, + "grad_norm": 1.0821335315704346, + "learning_rate": 4.617345056984734e-06, + "loss": 0.553, + "step": 5448 + }, + { + "epoch": 0.25976688198698544, + "grad_norm": 1.1088868379592896, + "learning_rate": 4.614035073730798e-06, + "loss": 0.5277, + "step": 5449 + }, + { + "epoch": 0.25981455438228496, + "grad_norm": 4.25206184387207, + "learning_rate": 4.610725921475786e-06, + "loss": 0.4931, + "step": 5450 + }, + { + "epoch": 0.2598622267775844, + "grad_norm": 2.424816370010376, + "learning_rate": 4.60741760073027e-06, + "loss": 0.6865, + "step": 5451 + }, + { + "epoch": 0.25990989917288393, + "grad_norm": 0.9026040434837341, + "learning_rate": 4.60411011200468e-06, + "loss": 0.3713, + "step": 5452 + }, + { + "epoch": 0.25995757156818344, + "grad_norm": 1.8744639158248901, + "learning_rate": 4.600803455809334e-06, + "loss": 0.5632, + "step": 5453 + }, + { + "epoch": 0.26000524396348296, + "grad_norm": 1.4494572877883911, + "learning_rate": 4.597497632654416e-06, + "loss": 0.8144, + "step": 5454 + }, + { + "epoch": 0.26005291635878247, + "grad_norm": 3.379366159439087, + "learning_rate": 4.594192643049976e-06, + "loss": 0.7011, + "step": 5455 + }, + { + "epoch": 0.26010058875408193, + "grad_norm": 1.1887792348861694, + "learning_rate": 4.590888487505941e-06, + "loss": 0.7537, + "step": 5456 + }, + { + "epoch": 0.26014826114938144, + "grad_norm": 1.5102972984313965, + "learning_rate": 4.587585166532115e-06, + "loss": 0.5196, + "step": 5457 + }, + { + "epoch": 0.26019593354468096, + "grad_norm": 1.0989429950714111, + "learning_rate": 4.584282680638155e-06, + "loss": 0.4849, + "step": 5458 + }, + { + "epoch": 0.26024360593998047, + "grad_norm": 1.5081100463867188, + "learning_rate": 4.580981030333606e-06, + "loss": 0.8332, + "step": 5459 + }, + { + "epoch": 0.26029127833527993, + 
"grad_norm": 1.6823418140411377, + "learning_rate": 4.577680216127885e-06, + "loss": 1.0103, + "step": 5460 + }, + { + "epoch": 0.26033895073057944, + "grad_norm": 2.515145778656006, + "learning_rate": 4.574380238530262e-06, + "loss": 0.6499, + "step": 5461 + }, + { + "epoch": 0.26038662312587896, + "grad_norm": 1.6356525421142578, + "learning_rate": 4.5710810980498996e-06, + "loss": 0.5256, + "step": 5462 + }, + { + "epoch": 0.26043429552117847, + "grad_norm": 5.0318732261657715, + "learning_rate": 4.567782795195816e-06, + "loss": 0.7241, + "step": 5463 + }, + { + "epoch": 0.260481967916478, + "grad_norm": 1.9279476404190063, + "learning_rate": 4.564485330476903e-06, + "loss": 0.5398, + "step": 5464 + }, + { + "epoch": 0.26052964031177744, + "grad_norm": 1.2882212400436401, + "learning_rate": 4.561188704401929e-06, + "loss": 0.6823, + "step": 5465 + }, + { + "epoch": 0.26057731270707696, + "grad_norm": 1.5207080841064453, + "learning_rate": 4.557892917479532e-06, + "loss": 0.6403, + "step": 5466 + }, + { + "epoch": 0.26062498510237647, + "grad_norm": 0.802099347114563, + "learning_rate": 4.5545979702182105e-06, + "loss": 0.4393, + "step": 5467 + }, + { + "epoch": 0.260672657497676, + "grad_norm": 1.274107813835144, + "learning_rate": 4.551303863126346e-06, + "loss": 0.7238, + "step": 5468 + }, + { + "epoch": 0.2607203298929755, + "grad_norm": 3.5149638652801514, + "learning_rate": 4.5480105967121855e-06, + "loss": 0.6169, + "step": 5469 + }, + { + "epoch": 0.26076800228827496, + "grad_norm": 1.603751301765442, + "learning_rate": 4.544718171483849e-06, + "loss": 0.7579, + "step": 5470 + }, + { + "epoch": 0.26081567468357447, + "grad_norm": 1.1629642248153687, + "learning_rate": 4.541426587949315e-06, + "loss": 0.3967, + "step": 5471 + }, + { + "epoch": 0.260863347078874, + "grad_norm": 1.7083592414855957, + "learning_rate": 4.538135846616447e-06, + "loss": 0.6814, + "step": 5472 + }, + { + "epoch": 0.2609110194741735, + "grad_norm": 1.3178696632385254, + "learning_rate": 4.534845947992975e-06, + "loss": 0.8221, + "step": 5473 + }, + { + "epoch": 0.26095869186947296, + "grad_norm": 2.870692253112793, + "learning_rate": 4.53155689258649e-06, + "loss": 1.0803, + "step": 5474 + }, + { + "epoch": 0.2610063642647725, + "grad_norm": 1.6962885856628418, + "learning_rate": 4.528268680904465e-06, + "loss": 0.6986, + "step": 5475 + }, + { + "epoch": 0.261054036660072, + "grad_norm": 2.3852999210357666, + "learning_rate": 4.524981313454232e-06, + "loss": 1.0803, + "step": 5476 + }, + { + "epoch": 0.2611017090553715, + "grad_norm": 1.1798049211502075, + "learning_rate": 4.521694790743003e-06, + "loss": 0.6418, + "step": 5477 + }, + { + "epoch": 0.261149381450671, + "grad_norm": 1.8000279664993286, + "learning_rate": 4.51840911327785e-06, + "loss": 0.9297, + "step": 5478 + }, + { + "epoch": 0.2611970538459705, + "grad_norm": 1.672968864440918, + "learning_rate": 4.515124281565724e-06, + "loss": 0.3185, + "step": 5479 + }, + { + "epoch": 0.26124472624127, + "grad_norm": 1.692679524421692, + "learning_rate": 4.511840296113434e-06, + "loss": 1.0013, + "step": 5480 + }, + { + "epoch": 0.2612923986365695, + "grad_norm": 1.3779551982879639, + "learning_rate": 4.50855715742767e-06, + "loss": 0.7715, + "step": 5481 + }, + { + "epoch": 0.261340071031869, + "grad_norm": 1.5197752714157104, + "learning_rate": 4.505274866014989e-06, + "loss": 0.7554, + "step": 5482 + }, + { + "epoch": 0.2613877434271685, + "grad_norm": 1.719963788986206, + "learning_rate": 4.501993422381807e-06, + "loss": 0.821, + "step": 5483 + }, 
+ { + "epoch": 0.261435415822468, + "grad_norm": 2.0611915588378906, + "learning_rate": 4.4987128270344224e-06, + "loss": 0.7306, + "step": 5484 + }, + { + "epoch": 0.2614830882177675, + "grad_norm": 1.6833240985870361, + "learning_rate": 4.4954330804790004e-06, + "loss": 0.3735, + "step": 5485 + }, + { + "epoch": 0.261530760613067, + "grad_norm": 2.8871450424194336, + "learning_rate": 4.492154183221565e-06, + "loss": 0.9497, + "step": 5486 + }, + { + "epoch": 0.26157843300836653, + "grad_norm": 2.9115235805511475, + "learning_rate": 4.488876135768017e-06, + "loss": 1.0516, + "step": 5487 + }, + { + "epoch": 0.261626105403666, + "grad_norm": 2.3396236896514893, + "learning_rate": 4.485598938624133e-06, + "loss": 0.6537, + "step": 5488 + }, + { + "epoch": 0.2616737777989655, + "grad_norm": 1.2771376371383667, + "learning_rate": 4.482322592295541e-06, + "loss": 0.8405, + "step": 5489 + }, + { + "epoch": 0.261721450194265, + "grad_norm": 1.1913001537322998, + "learning_rate": 4.479047097287752e-06, + "loss": 0.5927, + "step": 5490 + }, + { + "epoch": 0.26176912258956453, + "grad_norm": 1.5550049543380737, + "learning_rate": 4.475772454106144e-06, + "loss": 0.8143, + "step": 5491 + }, + { + "epoch": 0.261816794984864, + "grad_norm": 1.6769016981124878, + "learning_rate": 4.47249866325596e-06, + "loss": 0.7834, + "step": 5492 + }, + { + "epoch": 0.2618644673801635, + "grad_norm": 2.9642839431762695, + "learning_rate": 4.469225725242304e-06, + "loss": 0.5986, + "step": 5493 + }, + { + "epoch": 0.261912139775463, + "grad_norm": 1.7533074617385864, + "learning_rate": 4.465953640570167e-06, + "loss": 1.1534, + "step": 5494 + }, + { + "epoch": 0.26195981217076253, + "grad_norm": 13.53782844543457, + "learning_rate": 4.462682409744391e-06, + "loss": 1.3399, + "step": 5495 + }, + { + "epoch": 0.26200748456606204, + "grad_norm": 1.1808326244354248, + "learning_rate": 4.459412033269695e-06, + "loss": 0.5596, + "step": 5496 + }, + { + "epoch": 0.2620551569613615, + "grad_norm": 1.5229246616363525, + "learning_rate": 4.456142511650669e-06, + "loss": 0.6204, + "step": 5497 + }, + { + "epoch": 0.262102829356661, + "grad_norm": 2.106015205383301, + "learning_rate": 4.452873845391759e-06, + "loss": 0.809, + "step": 5498 + }, + { + "epoch": 0.26215050175196053, + "grad_norm": 3.06536602973938, + "learning_rate": 4.44960603499729e-06, + "loss": 0.6219, + "step": 5499 + }, + { + "epoch": 0.26219817414726004, + "grad_norm": 1.0425934791564941, + "learning_rate": 4.4463390809714566e-06, + "loss": 0.608, + "step": 5500 + }, + { + "epoch": 0.26224584654255956, + "grad_norm": 2.1033732891082764, + "learning_rate": 4.4430729838183065e-06, + "loss": 0.9811, + "step": 5501 + }, + { + "epoch": 0.262293518937859, + "grad_norm": 1.278491735458374, + "learning_rate": 4.43980774404177e-06, + "loss": 0.3289, + "step": 5502 + }, + { + "epoch": 0.26234119133315853, + "grad_norm": 2.7843470573425293, + "learning_rate": 4.436543362145643e-06, + "loss": 0.813, + "step": 5503 + }, + { + "epoch": 0.26238886372845804, + "grad_norm": 1.3629486560821533, + "learning_rate": 4.433279838633581e-06, + "loss": 0.8361, + "step": 5504 + }, + { + "epoch": 0.26243653612375756, + "grad_norm": 1.8491785526275635, + "learning_rate": 4.430017174009111e-06, + "loss": 0.7298, + "step": 5505 + }, + { + "epoch": 0.262484208519057, + "grad_norm": 1.9057613611221313, + "learning_rate": 4.426755368775637e-06, + "loss": 0.8231, + "step": 5506 + }, + { + "epoch": 0.26253188091435653, + "grad_norm": 2.157076597213745, + "learning_rate": 
4.423494423436415e-06, + "loss": 0.9327, + "step": 5507 + }, + { + "epoch": 0.26257955330965604, + "grad_norm": 2.145273447036743, + "learning_rate": 4.420234338494574e-06, + "loss": 0.9537, + "step": 5508 + }, + { + "epoch": 0.26262722570495556, + "grad_norm": 2.3533525466918945, + "learning_rate": 4.416975114453114e-06, + "loss": 0.9066, + "step": 5509 + }, + { + "epoch": 0.26267489810025507, + "grad_norm": 2.4520652294158936, + "learning_rate": 4.4137167518149025e-06, + "loss": 0.821, + "step": 5510 + }, + { + "epoch": 0.26272257049555453, + "grad_norm": 2.2237584590911865, + "learning_rate": 4.410459251082666e-06, + "loss": 0.7576, + "step": 5511 + }, + { + "epoch": 0.26277024289085404, + "grad_norm": 5.048548221588135, + "learning_rate": 4.407202612759005e-06, + "loss": 0.4592, + "step": 5512 + }, + { + "epoch": 0.26281791528615356, + "grad_norm": 1.1977540254592896, + "learning_rate": 4.40394683734639e-06, + "loss": 0.607, + "step": 5513 + }, + { + "epoch": 0.2628655876814531, + "grad_norm": 1.9406508207321167, + "learning_rate": 4.400691925347147e-06, + "loss": 0.9795, + "step": 5514 + }, + { + "epoch": 0.26291326007675253, + "grad_norm": 1.7332086563110352, + "learning_rate": 4.397437877263478e-06, + "loss": 0.7935, + "step": 5515 + }, + { + "epoch": 0.26296093247205204, + "grad_norm": 1.489671230316162, + "learning_rate": 4.394184693597452e-06, + "loss": 0.9046, + "step": 5516 + }, + { + "epoch": 0.26300860486735156, + "grad_norm": 1.7054924964904785, + "learning_rate": 4.390932374850996e-06, + "loss": 0.6331, + "step": 5517 + }, + { + "epoch": 0.2630562772626511, + "grad_norm": 1.5960817337036133, + "learning_rate": 4.387680921525912e-06, + "loss": 0.9622, + "step": 5518 + }, + { + "epoch": 0.2631039496579506, + "grad_norm": 2.127215623855591, + "learning_rate": 4.38443033412387e-06, + "loss": 1.2882, + "step": 5519 + }, + { + "epoch": 0.26315162205325004, + "grad_norm": 1.7925151586532593, + "learning_rate": 4.381180613146396e-06, + "loss": 0.8053, + "step": 5520 + }, + { + "epoch": 0.26319929444854956, + "grad_norm": 1.0914833545684814, + "learning_rate": 4.377931759094892e-06, + "loss": 0.3883, + "step": 5521 + }, + { + "epoch": 0.2632469668438491, + "grad_norm": 1.1829966306686401, + "learning_rate": 4.374683772470619e-06, + "loss": 0.7149, + "step": 5522 + }, + { + "epoch": 0.2632946392391486, + "grad_norm": 2.2585456371307373, + "learning_rate": 4.371436653774714e-06, + "loss": 0.8343, + "step": 5523 + }, + { + "epoch": 0.26334231163444805, + "grad_norm": 2.3260622024536133, + "learning_rate": 4.368190403508167e-06, + "loss": 0.531, + "step": 5524 + }, + { + "epoch": 0.26338998402974756, + "grad_norm": 2.0901191234588623, + "learning_rate": 4.364945022171847e-06, + "loss": 0.6537, + "step": 5525 + }, + { + "epoch": 0.2634376564250471, + "grad_norm": 4.0685505867004395, + "learning_rate": 4.361700510266477e-06, + "loss": 1.0741, + "step": 5526 + }, + { + "epoch": 0.2634853288203466, + "grad_norm": 1.9662244319915771, + "learning_rate": 4.3584568682926555e-06, + "loss": 1.2344, + "step": 5527 + }, + { + "epoch": 0.2635330012156461, + "grad_norm": 1.5806111097335815, + "learning_rate": 4.355214096750846e-06, + "loss": 0.7201, + "step": 5528 + }, + { + "epoch": 0.26358067361094556, + "grad_norm": 2.1054883003234863, + "learning_rate": 4.351972196141368e-06, + "loss": 1.0543, + "step": 5529 + }, + { + "epoch": 0.2636283460062451, + "grad_norm": 1.5422677993774414, + "learning_rate": 4.348731166964415e-06, + "loss": 0.8393, + "step": 5530 + }, + { + "epoch": 0.2636760184015446, + 
"grad_norm": 3.49763822555542, + "learning_rate": 4.345491009720052e-06, + "loss": 0.7909, + "step": 5531 + }, + { + "epoch": 0.2637236907968441, + "grad_norm": 3.4069478511810303, + "learning_rate": 4.342251724908191e-06, + "loss": 1.0201, + "step": 5532 + }, + { + "epoch": 0.2637713631921436, + "grad_norm": 1.5217338800430298, + "learning_rate": 4.339013313028626e-06, + "loss": 0.3428, + "step": 5533 + }, + { + "epoch": 0.2638190355874431, + "grad_norm": 2.437505006790161, + "learning_rate": 4.3357757745810126e-06, + "loss": 1.3109, + "step": 5534 + }, + { + "epoch": 0.2638667079827426, + "grad_norm": 2.4502081871032715, + "learning_rate": 4.332539110064864e-06, + "loss": 0.8723, + "step": 5535 + }, + { + "epoch": 0.2639143803780421, + "grad_norm": 1.987545371055603, + "learning_rate": 4.329303319979571e-06, + "loss": 1.0509, + "step": 5536 + }, + { + "epoch": 0.2639620527733416, + "grad_norm": 2.6507227420806885, + "learning_rate": 4.326068404824375e-06, + "loss": 0.7547, + "step": 5537 + }, + { + "epoch": 0.2640097251686411, + "grad_norm": 1.219498872756958, + "learning_rate": 4.322834365098398e-06, + "loss": 0.9566, + "step": 5538 + }, + { + "epoch": 0.2640573975639406, + "grad_norm": 2.285651683807373, + "learning_rate": 4.319601201300611e-06, + "loss": 0.5146, + "step": 5539 + }, + { + "epoch": 0.2641050699592401, + "grad_norm": 1.547282099723816, + "learning_rate": 4.316368913929864e-06, + "loss": 0.872, + "step": 5540 + }, + { + "epoch": 0.2641527423545396, + "grad_norm": 1.4937717914581299, + "learning_rate": 4.3131375034848624e-06, + "loss": 0.3426, + "step": 5541 + }, + { + "epoch": 0.26420041474983913, + "grad_norm": 3.798511505126953, + "learning_rate": 4.30990697046418e-06, + "loss": 0.4038, + "step": 5542 + }, + { + "epoch": 0.2642480871451386, + "grad_norm": 2.3030643463134766, + "learning_rate": 4.306677315366258e-06, + "loss": 0.4771, + "step": 5543 + }, + { + "epoch": 0.2642957595404381, + "grad_norm": 2.7596776485443115, + "learning_rate": 4.303448538689393e-06, + "loss": 0.8721, + "step": 5544 + }, + { + "epoch": 0.2643434319357376, + "grad_norm": 2.4076898097991943, + "learning_rate": 4.300220640931756e-06, + "loss": 1.0586, + "step": 5545 + }, + { + "epoch": 0.26439110433103713, + "grad_norm": 1.0083461999893188, + "learning_rate": 4.296993622591377e-06, + "loss": 0.5221, + "step": 5546 + }, + { + "epoch": 0.2644387767263366, + "grad_norm": 3.0331618785858154, + "learning_rate": 4.293767484166157e-06, + "loss": 1.1717, + "step": 5547 + }, + { + "epoch": 0.2644864491216361, + "grad_norm": 1.4020642042160034, + "learning_rate": 4.290542226153847e-06, + "loss": 0.7718, + "step": 5548 + }, + { + "epoch": 0.2645341215169356, + "grad_norm": 1.8946406841278076, + "learning_rate": 4.287317849052075e-06, + "loss": 0.6381, + "step": 5549 + }, + { + "epoch": 0.26458179391223513, + "grad_norm": 1.953478217124939, + "learning_rate": 4.284094353358334e-06, + "loss": 0.7938, + "step": 5550 + }, + { + "epoch": 0.26462946630753464, + "grad_norm": 1.3801584243774414, + "learning_rate": 4.280871739569972e-06, + "loss": 0.7182, + "step": 5551 + }, + { + "epoch": 0.2646771387028341, + "grad_norm": 2.052669048309326, + "learning_rate": 4.277650008184201e-06, + "loss": 0.9194, + "step": 5552 + }, + { + "epoch": 0.2647248110981336, + "grad_norm": 2.213862895965576, + "learning_rate": 4.274429159698109e-06, + "loss": 0.4967, + "step": 5553 + }, + { + "epoch": 0.26477248349343313, + "grad_norm": 1.3609672784805298, + "learning_rate": 4.271209194608631e-06, + "loss": 0.7002, + "step": 5554 + 
}, + { + "epoch": 0.26482015588873264, + "grad_norm": 2.0467896461486816, + "learning_rate": 4.26799011341258e-06, + "loss": 0.6498, + "step": 5555 + }, + { + "epoch": 0.26486782828403216, + "grad_norm": 1.2618590593338013, + "learning_rate": 4.26477191660663e-06, + "loss": 0.5392, + "step": 5556 + }, + { + "epoch": 0.2649155006793316, + "grad_norm": 3.4503111839294434, + "learning_rate": 4.261554604687308e-06, + "loss": 0.5836, + "step": 5557 + }, + { + "epoch": 0.26496317307463113, + "grad_norm": 3.568570613861084, + "learning_rate": 4.2583381781510156e-06, + "loss": 0.6466, + "step": 5558 + }, + { + "epoch": 0.26501084546993064, + "grad_norm": 2.2931630611419678, + "learning_rate": 4.255122637494018e-06, + "loss": 1.0741, + "step": 5559 + }, + { + "epoch": 0.26505851786523016, + "grad_norm": 1.4329898357391357, + "learning_rate": 4.251907983212435e-06, + "loss": 1.2203, + "step": 5560 + }, + { + "epoch": 0.2651061902605296, + "grad_norm": 1.5274324417114258, + "learning_rate": 4.248694215802254e-06, + "loss": 0.7771, + "step": 5561 + }, + { + "epoch": 0.26515386265582913, + "grad_norm": 2.9416232109069824, + "learning_rate": 4.245481335759333e-06, + "loss": 0.31, + "step": 5562 + }, + { + "epoch": 0.26520153505112865, + "grad_norm": 2.5782439708709717, + "learning_rate": 4.2422693435793785e-06, + "loss": 0.2944, + "step": 5563 + }, + { + "epoch": 0.26524920744642816, + "grad_norm": 1.7043336629867554, + "learning_rate": 4.23905823975797e-06, + "loss": 0.9789, + "step": 5564 + }, + { + "epoch": 0.2652968798417277, + "grad_norm": 1.4189471006393433, + "learning_rate": 4.2358480247905535e-06, + "loss": 0.1828, + "step": 5565 + }, + { + "epoch": 0.26534455223702713, + "grad_norm": 1.8899550437927246, + "learning_rate": 4.2326386991724235e-06, + "loss": 0.885, + "step": 5566 + }, + { + "epoch": 0.26539222463232665, + "grad_norm": 1.5510401725769043, + "learning_rate": 4.229430263398754e-06, + "loss": 0.7526, + "step": 5567 + }, + { + "epoch": 0.26543989702762616, + "grad_norm": 6.7725982666015625, + "learning_rate": 4.2262227179645685e-06, + "loss": 1.264, + "step": 5568 + }, + { + "epoch": 0.2654875694229257, + "grad_norm": 1.9495378732681274, + "learning_rate": 4.2230160633647565e-06, + "loss": 0.5061, + "step": 5569 + }, + { + "epoch": 0.26553524181822513, + "grad_norm": 1.143641471862793, + "learning_rate": 4.2198103000940735e-06, + "loss": 0.7493, + "step": 5570 + }, + { + "epoch": 0.26558291421352465, + "grad_norm": 1.6225953102111816, + "learning_rate": 4.216605428647141e-06, + "loss": 0.51, + "step": 5571 + }, + { + "epoch": 0.26563058660882416, + "grad_norm": 1.806950330734253, + "learning_rate": 4.213401449518431e-06, + "loss": 0.5473, + "step": 5572 + }, + { + "epoch": 0.2656782590041237, + "grad_norm": 1.0339572429656982, + "learning_rate": 4.210198363202286e-06, + "loss": 0.422, + "step": 5573 + }, + { + "epoch": 0.2657259313994232, + "grad_norm": 2.8314456939697266, + "learning_rate": 4.206996170192913e-06, + "loss": 0.6091, + "step": 5574 + }, + { + "epoch": 0.26577360379472265, + "grad_norm": 1.6317073106765747, + "learning_rate": 4.203794870984371e-06, + "loss": 0.9413, + "step": 5575 + }, + { + "epoch": 0.26582127619002216, + "grad_norm": 4.767714500427246, + "learning_rate": 4.200594466070592e-06, + "loss": 1.8776, + "step": 5576 + }, + { + "epoch": 0.2658689485853217, + "grad_norm": 1.2538121938705444, + "learning_rate": 4.197394955945368e-06, + "loss": 0.9933, + "step": 5577 + }, + { + "epoch": 0.2659166209806212, + "grad_norm": 1.630173683166504, + "learning_rate": 
4.1941963411023425e-06, + "loss": 0.8027, + "step": 5578 + }, + { + "epoch": 0.26596429337592065, + "grad_norm": 1.1768262386322021, + "learning_rate": 4.190998622035034e-06, + "loss": 0.5997, + "step": 5579 + }, + { + "epoch": 0.26601196577122016, + "grad_norm": 1.3510141372680664, + "learning_rate": 4.1878017992368205e-06, + "loss": 0.6135, + "step": 5580 + }, + { + "epoch": 0.2660596381665197, + "grad_norm": 1.6025506258010864, + "learning_rate": 4.184605873200932e-06, + "loss": 0.8922, + "step": 5581 + }, + { + "epoch": 0.2661073105618192, + "grad_norm": 1.844774603843689, + "learning_rate": 4.181410844420473e-06, + "loss": 0.4993, + "step": 5582 + }, + { + "epoch": 0.2661549829571187, + "grad_norm": 1.4028141498565674, + "learning_rate": 4.1782167133883985e-06, + "loss": 0.6382, + "step": 5583 + }, + { + "epoch": 0.26620265535241816, + "grad_norm": 2.228483200073242, + "learning_rate": 4.1750234805975355e-06, + "loss": 0.4291, + "step": 5584 + }, + { + "epoch": 0.2662503277477177, + "grad_norm": 1.0068373680114746, + "learning_rate": 4.17183114654056e-06, + "loss": 0.2973, + "step": 5585 + }, + { + "epoch": 0.2662980001430172, + "grad_norm": 3.4178378582000732, + "learning_rate": 4.168639711710019e-06, + "loss": 0.5526, + "step": 5586 + }, + { + "epoch": 0.2663456725383167, + "grad_norm": 1.0401500463485718, + "learning_rate": 4.165449176598325e-06, + "loss": 0.574, + "step": 5587 + }, + { + "epoch": 0.2663933449336162, + "grad_norm": 2.2149927616119385, + "learning_rate": 4.162259541697734e-06, + "loss": 0.1844, + "step": 5588 + }, + { + "epoch": 0.2664410173289157, + "grad_norm": 1.817918062210083, + "learning_rate": 4.159070807500378e-06, + "loss": 0.5699, + "step": 5589 + }, + { + "epoch": 0.2664886897242152, + "grad_norm": 3.1390933990478516, + "learning_rate": 4.155882974498251e-06, + "loss": 0.4993, + "step": 5590 + }, + { + "epoch": 0.2665363621195147, + "grad_norm": 1.4505109786987305, + "learning_rate": 4.152696043183194e-06, + "loss": 0.6816, + "step": 5591 + }, + { + "epoch": 0.2665840345148142, + "grad_norm": 1.4374276399612427, + "learning_rate": 4.149510014046922e-06, + "loss": 0.7264, + "step": 5592 + }, + { + "epoch": 0.2666317069101137, + "grad_norm": 1.9963061809539795, + "learning_rate": 4.14632488758101e-06, + "loss": 1.2008, + "step": 5593 + }, + { + "epoch": 0.2666793793054132, + "grad_norm": 1.6760303974151611, + "learning_rate": 4.143140664276884e-06, + "loss": 0.6839, + "step": 5594 + }, + { + "epoch": 0.2667270517007127, + "grad_norm": 2.7018444538116455, + "learning_rate": 4.139957344625843e-06, + "loss": 0.3376, + "step": 5595 + }, + { + "epoch": 0.2667747240960122, + "grad_norm": 1.9463860988616943, + "learning_rate": 4.136774929119033e-06, + "loss": 0.6627, + "step": 5596 + }, + { + "epoch": 0.26682239649131173, + "grad_norm": 1.260648250579834, + "learning_rate": 4.133593418247474e-06, + "loss": 0.6292, + "step": 5597 + }, + { + "epoch": 0.2668700688866112, + "grad_norm": 1.5680603981018066, + "learning_rate": 4.130412812502037e-06, + "loss": 0.5229, + "step": 5598 + }, + { + "epoch": 0.2669177412819107, + "grad_norm": 1.5625911951065063, + "learning_rate": 4.12723311237346e-06, + "loss": 0.6718, + "step": 5599 + }, + { + "epoch": 0.2669654136772102, + "grad_norm": 3.8781931400299072, + "learning_rate": 4.124054318352333e-06, + "loss": 0.699, + "step": 5600 + }, + { + "epoch": 0.26701308607250973, + "grad_norm": 3.590725898742676, + "learning_rate": 4.120876430929115e-06, + "loss": 1.5023, + "step": 5601 + }, + { + "epoch": 0.2670607584678092, + 
"grad_norm": 5.640691757202148, + "learning_rate": 4.117699450594122e-06, + "loss": 0.4176, + "step": 5602 + }, + { + "epoch": 0.2671084308631087, + "grad_norm": 2.2057461738586426, + "learning_rate": 4.114523377837526e-06, + "loss": 0.7682, + "step": 5603 + }, + { + "epoch": 0.2671561032584082, + "grad_norm": 2.3418283462524414, + "learning_rate": 4.1113482131493635e-06, + "loss": 1.202, + "step": 5604 + }, + { + "epoch": 0.26720377565370773, + "grad_norm": 1.4537042379379272, + "learning_rate": 4.108173957019534e-06, + "loss": 0.7473, + "step": 5605 + }, + { + "epoch": 0.26725144804900725, + "grad_norm": 1.261121392250061, + "learning_rate": 4.1050006099377846e-06, + "loss": 0.6996, + "step": 5606 + }, + { + "epoch": 0.2672991204443067, + "grad_norm": 2.5251898765563965, + "learning_rate": 4.101828172393734e-06, + "loss": 0.7216, + "step": 5607 + }, + { + "epoch": 0.2673467928396062, + "grad_norm": 1.6198338270187378, + "learning_rate": 4.098656644876863e-06, + "loss": 0.7462, + "step": 5608 + }, + { + "epoch": 0.26739446523490573, + "grad_norm": 2.2518017292022705, + "learning_rate": 4.095486027876494e-06, + "loss": 0.6152, + "step": 5609 + }, + { + "epoch": 0.26744213763020525, + "grad_norm": 3.3212130069732666, + "learning_rate": 4.0923163218818265e-06, + "loss": 0.5339, + "step": 5610 + }, + { + "epoch": 0.2674898100255047, + "grad_norm": 2.3660006523132324, + "learning_rate": 4.089147527381917e-06, + "loss": 0.6379, + "step": 5611 + }, + { + "epoch": 0.2675374824208042, + "grad_norm": 1.339280605316162, + "learning_rate": 4.085979644865674e-06, + "loss": 0.6574, + "step": 5612 + }, + { + "epoch": 0.26758515481610373, + "grad_norm": 1.9048956632614136, + "learning_rate": 4.082812674821865e-06, + "loss": 1.1436, + "step": 5613 + }, + { + "epoch": 0.26763282721140325, + "grad_norm": 1.1836352348327637, + "learning_rate": 4.079646617739129e-06, + "loss": 0.4625, + "step": 5614 + }, + { + "epoch": 0.26768049960670276, + "grad_norm": 1.1403794288635254, + "learning_rate": 4.076481474105949e-06, + "loss": 0.5532, + "step": 5615 + }, + { + "epoch": 0.2677281720020022, + "grad_norm": 1.2208589315414429, + "learning_rate": 4.073317244410677e-06, + "loss": 0.6893, + "step": 5616 + }, + { + "epoch": 0.26777584439730173, + "grad_norm": 1.7238335609436035, + "learning_rate": 4.070153929141524e-06, + "loss": 0.7698, + "step": 5617 + }, + { + "epoch": 0.26782351679260125, + "grad_norm": 1.5462297201156616, + "learning_rate": 4.066991528786551e-06, + "loss": 0.5943, + "step": 5618 + }, + { + "epoch": 0.26787118918790076, + "grad_norm": 1.7230587005615234, + "learning_rate": 4.063830043833688e-06, + "loss": 0.7638, + "step": 5619 + }, + { + "epoch": 0.2679188615832003, + "grad_norm": 1.6850359439849854, + "learning_rate": 4.060669474770716e-06, + "loss": 0.5414, + "step": 5620 + }, + { + "epoch": 0.26796653397849973, + "grad_norm": 1.662850260734558, + "learning_rate": 4.057509822085286e-06, + "loss": 0.5417, + "step": 5621 + }, + { + "epoch": 0.26801420637379925, + "grad_norm": 1.8192800283432007, + "learning_rate": 4.054351086264891e-06, + "loss": 0.7306, + "step": 5622 + }, + { + "epoch": 0.26806187876909876, + "grad_norm": 1.3098952770233154, + "learning_rate": 4.051193267796894e-06, + "loss": 0.7311, + "step": 5623 + }, + { + "epoch": 0.2681095511643983, + "grad_norm": 1.6049048900604248, + "learning_rate": 4.048036367168521e-06, + "loss": 0.8549, + "step": 5624 + }, + { + "epoch": 0.26815722355969773, + "grad_norm": 1.8448766469955444, + "learning_rate": 4.0448803848668374e-06, + "loss": 
1.2292, + "step": 5625 + }, + { + "epoch": 0.26820489595499725, + "grad_norm": 1.0000351667404175, + "learning_rate": 4.0417253213787885e-06, + "loss": 0.2914, + "step": 5626 + }, + { + "epoch": 0.26825256835029676, + "grad_norm": 1.6012147665023804, + "learning_rate": 4.038571177191164e-06, + "loss": 0.6561, + "step": 5627 + }, + { + "epoch": 0.2683002407455963, + "grad_norm": 2.003044605255127, + "learning_rate": 4.035417952790613e-06, + "loss": 0.7424, + "step": 5628 + }, + { + "epoch": 0.2683479131408958, + "grad_norm": 1.7002331018447876, + "learning_rate": 4.032265648663649e-06, + "loss": 0.7706, + "step": 5629 + }, + { + "epoch": 0.26839558553619525, + "grad_norm": 3.057119131088257, + "learning_rate": 4.029114265296642e-06, + "loss": 1.4352, + "step": 5630 + }, + { + "epoch": 0.26844325793149476, + "grad_norm": 1.6593033075332642, + "learning_rate": 4.025963803175813e-06, + "loss": 0.5818, + "step": 5631 + }, + { + "epoch": 0.2684909303267943, + "grad_norm": 1.637102484703064, + "learning_rate": 4.022814262787248e-06, + "loss": 0.8918, + "step": 5632 + }, + { + "epoch": 0.2685386027220938, + "grad_norm": 1.545956015586853, + "learning_rate": 4.0196656446168925e-06, + "loss": 0.7567, + "step": 5633 + }, + { + "epoch": 0.26858627511739325, + "grad_norm": 1.71291983127594, + "learning_rate": 4.01651794915054e-06, + "loss": 0.9417, + "step": 5634 + }, + { + "epoch": 0.26863394751269276, + "grad_norm": 4.601542949676514, + "learning_rate": 4.013371176873849e-06, + "loss": 1.1523, + "step": 5635 + }, + { + "epoch": 0.2686816199079923, + "grad_norm": 1.7521055936813354, + "learning_rate": 4.0102253282723394e-06, + "loss": 0.8477, + "step": 5636 + }, + { + "epoch": 0.2687292923032918, + "grad_norm": 1.587024211883545, + "learning_rate": 4.007080403831374e-06, + "loss": 0.5925, + "step": 5637 + }, + { + "epoch": 0.2687769646985913, + "grad_norm": 1.24383544921875, + "learning_rate": 4.003936404036188e-06, + "loss": 0.6706, + "step": 5638 + }, + { + "epoch": 0.26882463709389076, + "grad_norm": 2.295158863067627, + "learning_rate": 4.000793329371872e-06, + "loss": 0.5453, + "step": 5639 + }, + { + "epoch": 0.2688723094891903, + "grad_norm": 3.3943676948547363, + "learning_rate": 3.99765118032336e-06, + "loss": 1.2696, + "step": 5640 + }, + { + "epoch": 0.2689199818844898, + "grad_norm": 2.7188167572021484, + "learning_rate": 3.9945099573754635e-06, + "loss": 1.3469, + "step": 5641 + }, + { + "epoch": 0.2689676542797893, + "grad_norm": 1.985461950302124, + "learning_rate": 3.991369661012831e-06, + "loss": 0.3006, + "step": 5642 + }, + { + "epoch": 0.2690153266750888, + "grad_norm": 1.2259762287139893, + "learning_rate": 3.988230291719987e-06, + "loss": 0.6294, + "step": 5643 + }, + { + "epoch": 0.2690629990703883, + "grad_norm": 1.5396753549575806, + "learning_rate": 3.9850918499812976e-06, + "loss": 0.5473, + "step": 5644 + }, + { + "epoch": 0.2691106714656878, + "grad_norm": 1.002305269241333, + "learning_rate": 3.981954336280996e-06, + "loss": 0.6129, + "step": 5645 + }, + { + "epoch": 0.2691583438609873, + "grad_norm": 3.2131707668304443, + "learning_rate": 3.978817751103163e-06, + "loss": 0.8664, + "step": 5646 + }, + { + "epoch": 0.2692060162562868, + "grad_norm": 1.183082938194275, + "learning_rate": 3.975682094931747e-06, + "loss": 0.2161, + "step": 5647 + }, + { + "epoch": 0.2692536886515863, + "grad_norm": 1.8746246099472046, + "learning_rate": 3.972547368250547e-06, + "loss": 0.6282, + "step": 5648 + }, + { + "epoch": 0.2693013610468858, + "grad_norm": 1.2480121850967407, + 
"learning_rate": 3.969413571543214e-06, + "loss": 0.7862, + "step": 5649 + }, + { + "epoch": 0.2693490334421853, + "grad_norm": 1.6659002304077148, + "learning_rate": 3.9662807052932625e-06, + "loss": 1.1763, + "step": 5650 + }, + { + "epoch": 0.2693967058374848, + "grad_norm": 1.125238299369812, + "learning_rate": 3.963148769984069e-06, + "loss": 0.6492, + "step": 5651 + }, + { + "epoch": 0.26944437823278433, + "grad_norm": 2.436222553253174, + "learning_rate": 3.960017766098847e-06, + "loss": 1.1086, + "step": 5652 + }, + { + "epoch": 0.2694920506280838, + "grad_norm": 1.7079180479049683, + "learning_rate": 3.956887694120685e-06, + "loss": 0.3463, + "step": 5653 + }, + { + "epoch": 0.2695397230233833, + "grad_norm": 2.0221309661865234, + "learning_rate": 3.953758554532523e-06, + "loss": 0.9493, + "step": 5654 + }, + { + "epoch": 0.2695873954186828, + "grad_norm": 1.6272809505462646, + "learning_rate": 3.950630347817148e-06, + "loss": 0.6734, + "step": 5655 + }, + { + "epoch": 0.26963506781398233, + "grad_norm": 1.688293695449829, + "learning_rate": 3.947503074457219e-06, + "loss": 0.9494, + "step": 5656 + }, + { + "epoch": 0.2696827402092818, + "grad_norm": 2.0697226524353027, + "learning_rate": 3.9443767349352315e-06, + "loss": 1.1549, + "step": 5657 + }, + { + "epoch": 0.2697304126045813, + "grad_norm": 3.1395578384399414, + "learning_rate": 3.9412513297335574e-06, + "loss": 0.7576, + "step": 5658 + }, + { + "epoch": 0.2697780849998808, + "grad_norm": 0.9031580090522766, + "learning_rate": 3.938126859334407e-06, + "loss": 0.6, + "step": 5659 + }, + { + "epoch": 0.26982575739518033, + "grad_norm": 1.6896086931228638, + "learning_rate": 3.935003324219856e-06, + "loss": 0.3832, + "step": 5660 + }, + { + "epoch": 0.26987342979047985, + "grad_norm": 2.8483963012695312, + "learning_rate": 3.931880724871838e-06, + "loss": 0.5517, + "step": 5661 + }, + { + "epoch": 0.2699211021857793, + "grad_norm": 1.4200193881988525, + "learning_rate": 3.928759061772132e-06, + "loss": 0.6245, + "step": 5662 + }, + { + "epoch": 0.2699687745810788, + "grad_norm": 3.688634157180786, + "learning_rate": 3.9256383354023804e-06, + "loss": 1.1761, + "step": 5663 + }, + { + "epoch": 0.27001644697637833, + "grad_norm": 1.4372247457504272, + "learning_rate": 3.922518546244084e-06, + "loss": 0.5194, + "step": 5664 + }, + { + "epoch": 0.27006411937167785, + "grad_norm": 2.5982208251953125, + "learning_rate": 3.919399694778586e-06, + "loss": 0.7174, + "step": 5665 + }, + { + "epoch": 0.2701117917669773, + "grad_norm": 1.8281327486038208, + "learning_rate": 3.916281781487098e-06, + "loss": 0.9557, + "step": 5666 + }, + { + "epoch": 0.2701594641622768, + "grad_norm": 1.798475980758667, + "learning_rate": 3.913164806850683e-06, + "loss": 0.9973, + "step": 5667 + }, + { + "epoch": 0.27020713655757633, + "grad_norm": 1.6262587308883667, + "learning_rate": 3.910048771350253e-06, + "loss": 0.7448, + "step": 5668 + }, + { + "epoch": 0.27025480895287585, + "grad_norm": 1.2724647521972656, + "learning_rate": 3.906933675466584e-06, + "loss": 0.8324, + "step": 5669 + }, + { + "epoch": 0.27030248134817536, + "grad_norm": 1.2882195711135864, + "learning_rate": 3.9038195196803055e-06, + "loss": 0.3717, + "step": 5670 + }, + { + "epoch": 0.2703501537434748, + "grad_norm": 2.791334867477417, + "learning_rate": 3.900706304471896e-06, + "loss": 0.5472, + "step": 5671 + }, + { + "epoch": 0.27039782613877433, + "grad_norm": 1.3858510255813599, + "learning_rate": 3.89759403032169e-06, + "loss": 1.0581, + "step": 5672 + }, + { + "epoch": 
0.27044549853407385, + "grad_norm": 1.329193115234375, + "learning_rate": 3.8944826977098856e-06, + "loss": 0.662, + "step": 5673 + }, + { + "epoch": 0.27049317092937336, + "grad_norm": 5.558101177215576, + "learning_rate": 3.891372307116523e-06, + "loss": 0.9928, + "step": 5674 + }, + { + "epoch": 0.2705408433246729, + "grad_norm": 1.665878415107727, + "learning_rate": 3.888262859021508e-06, + "loss": 0.6122, + "step": 5675 + }, + { + "epoch": 0.27058851571997233, + "grad_norm": 1.253283977508545, + "learning_rate": 3.885154353904598e-06, + "loss": 0.5924, + "step": 5676 + }, + { + "epoch": 0.27063618811527185, + "grad_norm": 1.6796064376831055, + "learning_rate": 3.882046792245395e-06, + "loss": 0.8122, + "step": 5677 + }, + { + "epoch": 0.27068386051057136, + "grad_norm": 1.6100490093231201, + "learning_rate": 3.878940174523371e-06, + "loss": 0.5877, + "step": 5678 + }, + { + "epoch": 0.2707315329058709, + "grad_norm": 1.7721821069717407, + "learning_rate": 3.875834501217847e-06, + "loss": 1.0471, + "step": 5679 + }, + { + "epoch": 0.27077920530117033, + "grad_norm": 1.3024680614471436, + "learning_rate": 3.872729772807989e-06, + "loss": 0.8119, + "step": 5680 + }, + { + "epoch": 0.27082687769646985, + "grad_norm": 1.2156068086624146, + "learning_rate": 3.869625989772828e-06, + "loss": 0.5751, + "step": 5681 + }, + { + "epoch": 0.27087455009176936, + "grad_norm": 4.203164100646973, + "learning_rate": 3.8665231525912505e-06, + "loss": 1.2568, + "step": 5682 + }, + { + "epoch": 0.2709222224870689, + "grad_norm": 2.334987163543701, + "learning_rate": 3.863421261741983e-06, + "loss": 1.3586, + "step": 5683 + }, + { + "epoch": 0.2709698948823684, + "grad_norm": 2.066728353500366, + "learning_rate": 3.860320317703622e-06, + "loss": 0.8259, + "step": 5684 + }, + { + "epoch": 0.27101756727766785, + "grad_norm": 1.5613700151443481, + "learning_rate": 3.857220320954612e-06, + "loss": 0.5541, + "step": 5685 + }, + { + "epoch": 0.27106523967296736, + "grad_norm": 2.7656795978546143, + "learning_rate": 3.854121271973245e-06, + "loss": 1.0216, + "step": 5686 + }, + { + "epoch": 0.2711129120682669, + "grad_norm": 1.158886194229126, + "learning_rate": 3.851023171237678e-06, + "loss": 0.4278, + "step": 5687 + }, + { + "epoch": 0.2711605844635664, + "grad_norm": 3.701150894165039, + "learning_rate": 3.8479260192259135e-06, + "loss": 1.0564, + "step": 5688 + }, + { + "epoch": 0.27120825685886585, + "grad_norm": 1.521830677986145, + "learning_rate": 3.844829816415808e-06, + "loss": 1.1317, + "step": 5689 + }, + { + "epoch": 0.27125592925416536, + "grad_norm": 2.0760726928710938, + "learning_rate": 3.841734563285076e-06, + "loss": 0.8162, + "step": 5690 + }, + { + "epoch": 0.2713036016494649, + "grad_norm": 1.6996026039123535, + "learning_rate": 3.8386402603112845e-06, + "loss": 0.8117, + "step": 5691 + }, + { + "epoch": 0.2713512740447644, + "grad_norm": 2.6809940338134766, + "learning_rate": 3.835546907971849e-06, + "loss": 0.7544, + "step": 5692 + }, + { + "epoch": 0.2713989464400639, + "grad_norm": 13.870550155639648, + "learning_rate": 3.832454506744043e-06, + "loss": 0.1932, + "step": 5693 + }, + { + "epoch": 0.27144661883536336, + "grad_norm": 2.608093500137329, + "learning_rate": 3.829363057104998e-06, + "loss": 0.523, + "step": 5694 + }, + { + "epoch": 0.2714942912306629, + "grad_norm": 2.375333786010742, + "learning_rate": 3.8262725595316845e-06, + "loss": 0.833, + "step": 5695 + }, + { + "epoch": 0.2715419636259624, + "grad_norm": 2.1272153854370117, + "learning_rate": 3.823183014500937e-06, + 
"loss": 0.8474, + "step": 5696 + }, + { + "epoch": 0.2715896360212619, + "grad_norm": 2.915160894393921, + "learning_rate": 3.820094422489442e-06, + "loss": 1.1539, + "step": 5697 + }, + { + "epoch": 0.27163730841656136, + "grad_norm": 1.6739840507507324, + "learning_rate": 3.81700678397374e-06, + "loss": 0.7797, + "step": 5698 + }, + { + "epoch": 0.2716849808118609, + "grad_norm": 1.1327998638153076, + "learning_rate": 3.813920099430215e-06, + "loss": 0.6174, + "step": 5699 + }, + { + "epoch": 0.2717326532071604, + "grad_norm": 1.7080106735229492, + "learning_rate": 3.810834369335118e-06, + "loss": 0.6037, + "step": 5700 + }, + { + "epoch": 0.2717803256024599, + "grad_norm": 1.248401165008545, + "learning_rate": 3.8077495941645392e-06, + "loss": 0.2708, + "step": 5701 + }, + { + "epoch": 0.2718279979977594, + "grad_norm": 1.445685863494873, + "learning_rate": 3.8046657743944327e-06, + "loss": 0.6664, + "step": 5702 + }, + { + "epoch": 0.2718756703930589, + "grad_norm": 1.719103455543518, + "learning_rate": 3.801582910500594e-06, + "loss": 0.6413, + "step": 5703 + }, + { + "epoch": 0.2719233427883584, + "grad_norm": 1.417418122291565, + "learning_rate": 3.7985010029586856e-06, + "loss": 0.7251, + "step": 5704 + }, + { + "epoch": 0.2719710151836579, + "grad_norm": 3.2522764205932617, + "learning_rate": 3.795420052244205e-06, + "loss": 1.0004, + "step": 5705 + }, + { + "epoch": 0.2720186875789574, + "grad_norm": 1.6385494470596313, + "learning_rate": 3.7923400588325156e-06, + "loss": 0.7247, + "step": 5706 + }, + { + "epoch": 0.27206635997425693, + "grad_norm": 1.4474941492080688, + "learning_rate": 3.7892610231988313e-06, + "loss": 0.3648, + "step": 5707 + }, + { + "epoch": 0.2721140323695564, + "grad_norm": 1.151902198791504, + "learning_rate": 3.786182945818211e-06, + "loss": 0.4966, + "step": 5708 + }, + { + "epoch": 0.2721617047648559, + "grad_norm": 1.6308925151824951, + "learning_rate": 3.7831058271655707e-06, + "loss": 0.8752, + "step": 5709 + }, + { + "epoch": 0.2722093771601554, + "grad_norm": 4.2674665451049805, + "learning_rate": 3.7800296677156844e-06, + "loss": 1.0404, + "step": 5710 + }, + { + "epoch": 0.27225704955545493, + "grad_norm": 2.5260775089263916, + "learning_rate": 3.7769544679431624e-06, + "loss": 0.4908, + "step": 5711 + }, + { + "epoch": 0.2723047219507544, + "grad_norm": 6.458588600158691, + "learning_rate": 3.773880228322482e-06, + "loss": 1.0295, + "step": 5712 + }, + { + "epoch": 0.2723523943460539, + "grad_norm": 1.3752353191375732, + "learning_rate": 3.7708069493279687e-06, + "loss": 0.8503, + "step": 5713 + }, + { + "epoch": 0.2724000667413534, + "grad_norm": 1.3675073385238647, + "learning_rate": 3.7677346314337913e-06, + "loss": 0.8296, + "step": 5714 + }, + { + "epoch": 0.27244773913665293, + "grad_norm": 3.524670362472534, + "learning_rate": 3.7646632751139844e-06, + "loss": 0.9025, + "step": 5715 + }, + { + "epoch": 0.27249541153195245, + "grad_norm": 2.0363285541534424, + "learning_rate": 3.7615928808424184e-06, + "loss": 0.6357, + "step": 5716 + }, + { + "epoch": 0.2725430839272519, + "grad_norm": 2.8297040462493896, + "learning_rate": 3.7585234490928313e-06, + "loss": 0.6437, + "step": 5717 + }, + { + "epoch": 0.2725907563225514, + "grad_norm": 1.991547703742981, + "learning_rate": 3.7554549803387984e-06, + "loss": 0.4893, + "step": 5718 + }, + { + "epoch": 0.27263842871785093, + "grad_norm": 1.7219820022583008, + "learning_rate": 3.7523874750537593e-06, + "loss": 0.9517, + "step": 5719 + }, + { + "epoch": 0.27268610111315045, + "grad_norm": 
2.149207353591919, + "learning_rate": 3.7493209337109904e-06, + "loss": 0.7515, + "step": 5720 + }, + { + "epoch": 0.2727337735084499, + "grad_norm": 1.328798532485962, + "learning_rate": 3.7462553567836324e-06, + "loss": 1.0679, + "step": 5721 + }, + { + "epoch": 0.2727814459037494, + "grad_norm": 2.60494065284729, + "learning_rate": 3.743190744744675e-06, + "loss": 0.5135, + "step": 5722 + }, + { + "epoch": 0.27282911829904893, + "grad_norm": 1.3431496620178223, + "learning_rate": 3.740127098066949e-06, + "loss": 0.6711, + "step": 5723 + }, + { + "epoch": 0.27287679069434845, + "grad_norm": 1.6929900646209717, + "learning_rate": 3.7370644172231485e-06, + "loss": 0.5891, + "step": 5724 + }, + { + "epoch": 0.27292446308964796, + "grad_norm": 3.088587999343872, + "learning_rate": 3.734002702685816e-06, + "loss": 0.6051, + "step": 5725 + }, + { + "epoch": 0.2729721354849474, + "grad_norm": 1.6434996128082275, + "learning_rate": 3.730941954927335e-06, + "loss": 0.5684, + "step": 5726 + }, + { + "epoch": 0.27301980788024693, + "grad_norm": 1.6733932495117188, + "learning_rate": 3.7278821744199524e-06, + "loss": 0.9915, + "step": 5727 + }, + { + "epoch": 0.27306748027554645, + "grad_norm": 2.2283565998077393, + "learning_rate": 3.7248233616357633e-06, + "loss": 0.7064, + "step": 5728 + }, + { + "epoch": 0.27311515267084596, + "grad_norm": 1.378372073173523, + "learning_rate": 3.7217655170467035e-06, + "loss": 0.8543, + "step": 5729 + }, + { + "epoch": 0.2731628250661455, + "grad_norm": 1.9921668767929077, + "learning_rate": 3.7187086411245723e-06, + "loss": 0.421, + "step": 5730 + }, + { + "epoch": 0.27321049746144493, + "grad_norm": 1.7613787651062012, + "learning_rate": 3.715652734341015e-06, + "loss": 0.8166, + "step": 5731 + }, + { + "epoch": 0.27325816985674445, + "grad_norm": 0.9894343018531799, + "learning_rate": 3.7125977971675264e-06, + "loss": 0.3724, + "step": 5732 + }, + { + "epoch": 0.27330584225204396, + "grad_norm": 1.6460390090942383, + "learning_rate": 3.709543830075445e-06, + "loss": 0.5897, + "step": 5733 + }, + { + "epoch": 0.2733535146473435, + "grad_norm": 1.3652790784835815, + "learning_rate": 3.7064908335359716e-06, + "loss": 0.2567, + "step": 5734 + }, + { + "epoch": 0.27340118704264293, + "grad_norm": 1.6392582654953003, + "learning_rate": 3.7034388080201557e-06, + "loss": 1.0321, + "step": 5735 + }, + { + "epoch": 0.27344885943794245, + "grad_norm": 2.154822826385498, + "learning_rate": 3.7003877539988866e-06, + "loss": 0.4373, + "step": 5736 + }, + { + "epoch": 0.27349653183324196, + "grad_norm": 1.1326566934585571, + "learning_rate": 3.6973376719429134e-06, + "loss": 0.6395, + "step": 5737 + }, + { + "epoch": 0.2735442042285415, + "grad_norm": 1.2641526460647583, + "learning_rate": 3.6942885623228353e-06, + "loss": 0.6231, + "step": 5738 + }, + { + "epoch": 0.273591876623841, + "grad_norm": 1.6372159719467163, + "learning_rate": 3.691240425609093e-06, + "loss": 0.6248, + "step": 5739 + }, + { + "epoch": 0.27363954901914045, + "grad_norm": 3.5587074756622314, + "learning_rate": 3.6881932622719853e-06, + "loss": 1.3179, + "step": 5740 + }, + { + "epoch": 0.27368722141443996, + "grad_norm": 2.114997148513794, + "learning_rate": 3.6851470727816617e-06, + "loss": 0.7741, + "step": 5741 + }, + { + "epoch": 0.2737348938097395, + "grad_norm": 2.527776002883911, + "learning_rate": 3.6821018576081114e-06, + "loss": 0.7492, + "step": 5742 + }, + { + "epoch": 0.273782566205039, + "grad_norm": 2.212334394454956, + "learning_rate": 3.679057617221181e-06, + "loss": 0.7236, + 
"step": 5743 + }, + { + "epoch": 0.27383023860033845, + "grad_norm": 6.127382755279541, + "learning_rate": 3.6760143520905724e-06, + "loss": 0.5891, + "step": 5744 + }, + { + "epoch": 0.27387791099563796, + "grad_norm": 1.1842920780181885, + "learning_rate": 3.6729720626858213e-06, + "loss": 0.8814, + "step": 5745 + }, + { + "epoch": 0.2739255833909375, + "grad_norm": 1.8540352582931519, + "learning_rate": 3.669930749476327e-06, + "loss": 0.424, + "step": 5746 + }, + { + "epoch": 0.273973255786237, + "grad_norm": 2.6331565380096436, + "learning_rate": 3.666890412931332e-06, + "loss": 0.7251, + "step": 5747 + }, + { + "epoch": 0.2740209281815365, + "grad_norm": 5.96897029876709, + "learning_rate": 3.6638510535199245e-06, + "loss": 0.3368, + "step": 5748 + }, + { + "epoch": 0.27406860057683596, + "grad_norm": 1.5467431545257568, + "learning_rate": 3.660812671711049e-06, + "loss": 0.4915, + "step": 5749 + }, + { + "epoch": 0.2741162729721355, + "grad_norm": 1.412362813949585, + "learning_rate": 3.6577752679735023e-06, + "loss": 0.774, + "step": 5750 + }, + { + "epoch": 0.274163945367435, + "grad_norm": 1.6401735544204712, + "learning_rate": 3.6547388427759144e-06, + "loss": 0.5427, + "step": 5751 + }, + { + "epoch": 0.2742116177627345, + "grad_norm": 2.3528993129730225, + "learning_rate": 3.651703396586781e-06, + "loss": 0.6584, + "step": 5752 + }, + { + "epoch": 0.27425929015803396, + "grad_norm": 1.5741047859191895, + "learning_rate": 3.6486689298744406e-06, + "loss": 0.9682, + "step": 5753 + }, + { + "epoch": 0.2743069625533335, + "grad_norm": 1.327043056488037, + "learning_rate": 3.645635443107076e-06, + "loss": 0.7082, + "step": 5754 + }, + { + "epoch": 0.274354634948633, + "grad_norm": 2.1352598667144775, + "learning_rate": 3.642602936752724e-06, + "loss": 0.778, + "step": 5755 + }, + { + "epoch": 0.2744023073439325, + "grad_norm": 1.15536630153656, + "learning_rate": 3.6395714112792744e-06, + "loss": 0.8755, + "step": 5756 + }, + { + "epoch": 0.274449979739232, + "grad_norm": 1.1322435140609741, + "learning_rate": 3.6365408671544534e-06, + "loss": 0.7989, + "step": 5757 + }, + { + "epoch": 0.2744976521345315, + "grad_norm": 0.9391312599182129, + "learning_rate": 3.633511304845845e-06, + "loss": 0.5099, + "step": 5758 + }, + { + "epoch": 0.274545324529831, + "grad_norm": 2.4861743450164795, + "learning_rate": 3.630482724820884e-06, + "loss": 1.2608, + "step": 5759 + }, + { + "epoch": 0.2745929969251305, + "grad_norm": 2.128649950027466, + "learning_rate": 3.627455127546842e-06, + "loss": 0.8045, + "step": 5760 + }, + { + "epoch": 0.27464066932043, + "grad_norm": 2.0263898372650146, + "learning_rate": 3.6244285134908517e-06, + "loss": 1.14, + "step": 5761 + }, + { + "epoch": 0.27468834171572953, + "grad_norm": 1.3106884956359863, + "learning_rate": 3.6214028831198833e-06, + "loss": 0.7058, + "step": 5762 + }, + { + "epoch": 0.274736014111029, + "grad_norm": 1.9334492683410645, + "learning_rate": 3.618378236900767e-06, + "loss": 0.8704, + "step": 5763 + }, + { + "epoch": 0.2747836865063285, + "grad_norm": 1.6236391067504883, + "learning_rate": 3.6153545753001663e-06, + "loss": 0.4789, + "step": 5764 + }, + { + "epoch": 0.274831358901628, + "grad_norm": 1.925201654434204, + "learning_rate": 3.612331898784609e-06, + "loss": 0.7876, + "step": 5765 + }, + { + "epoch": 0.27487903129692753, + "grad_norm": 2.073216438293457, + "learning_rate": 3.6093102078204566e-06, + "loss": 0.8201, + "step": 5766 + }, + { + "epoch": 0.274926703692227, + "grad_norm": 2.232060194015503, + "learning_rate": 
3.6062895028739287e-06, + "loss": 0.9654, + "step": 5767 + }, + { + "epoch": 0.2749743760875265, + "grad_norm": 8.443530082702637, + "learning_rate": 3.6032697844110896e-06, + "loss": 1.9204, + "step": 5768 + }, + { + "epoch": 0.275022048482826, + "grad_norm": 2.865467071533203, + "learning_rate": 3.6002510528978473e-06, + "loss": 1.1493, + "step": 5769 + }, + { + "epoch": 0.27506972087812553, + "grad_norm": 1.9604463577270508, + "learning_rate": 3.5972333087999622e-06, + "loss": 0.5984, + "step": 5770 + }, + { + "epoch": 0.27511739327342505, + "grad_norm": 0.8359993696212769, + "learning_rate": 3.594216552583045e-06, + "loss": 0.3696, + "step": 5771 + }, + { + "epoch": 0.2751650656687245, + "grad_norm": 2.306946277618408, + "learning_rate": 3.591200784712543e-06, + "loss": 0.7726, + "step": 5772 + }, + { + "epoch": 0.275212738064024, + "grad_norm": 1.7575749158859253, + "learning_rate": 3.588186005653763e-06, + "loss": 0.5739, + "step": 5773 + }, + { + "epoch": 0.27526041045932353, + "grad_norm": 2.016315221786499, + "learning_rate": 3.5851722158718537e-06, + "loss": 0.8791, + "step": 5774 + }, + { + "epoch": 0.27530808285462305, + "grad_norm": 1.2995957136154175, + "learning_rate": 3.582159415831814e-06, + "loss": 0.5061, + "step": 5775 + }, + { + "epoch": 0.2753557552499225, + "grad_norm": 2.3488354682922363, + "learning_rate": 3.5791476059984866e-06, + "loss": 0.6248, + "step": 5776 + }, + { + "epoch": 0.275403427645222, + "grad_norm": 1.2884950637817383, + "learning_rate": 3.576136786836557e-06, + "loss": 0.845, + "step": 5777 + }, + { + "epoch": 0.27545110004052153, + "grad_norm": 1.571522831916809, + "learning_rate": 3.5731269588105723e-06, + "loss": 0.5894, + "step": 5778 + }, + { + "epoch": 0.27549877243582105, + "grad_norm": 2.178201913833618, + "learning_rate": 3.57011812238491e-06, + "loss": 0.5167, + "step": 5779 + }, + { + "epoch": 0.27554644483112056, + "grad_norm": 1.9311840534210205, + "learning_rate": 3.5671102780238066e-06, + "loss": 0.4011, + "step": 5780 + }, + { + "epoch": 0.27559411722642, + "grad_norm": 1.3858994245529175, + "learning_rate": 3.5641034261913454e-06, + "loss": 0.3302, + "step": 5781 + }, + { + "epoch": 0.27564178962171954, + "grad_norm": 1.957581877708435, + "learning_rate": 3.561097567351445e-06, + "loss": 1.0391, + "step": 5782 + }, + { + "epoch": 0.27568946201701905, + "grad_norm": 1.8633755445480347, + "learning_rate": 3.5580927019678812e-06, + "loss": 0.2314, + "step": 5783 + }, + { + "epoch": 0.27573713441231856, + "grad_norm": 1.5867997407913208, + "learning_rate": 3.5550888305042785e-06, + "loss": 0.6767, + "step": 5784 + }, + { + "epoch": 0.275784806807618, + "grad_norm": 2.5515241622924805, + "learning_rate": 3.552085953424096e-06, + "loss": 0.8169, + "step": 5785 + }, + { + "epoch": 0.27583247920291754, + "grad_norm": 3.5643978118896484, + "learning_rate": 3.5490840711906506e-06, + "loss": 1.3515, + "step": 5786 + }, + { + "epoch": 0.27588015159821705, + "grad_norm": 2.232090711593628, + "learning_rate": 3.546083184267105e-06, + "loss": 0.6899, + "step": 5787 + }, + { + "epoch": 0.27592782399351656, + "grad_norm": 1.28727126121521, + "learning_rate": 3.5430832931164584e-06, + "loss": 0.6688, + "step": 5788 + }, + { + "epoch": 0.2759754963888161, + "grad_norm": 1.2497484683990479, + "learning_rate": 3.540084398201565e-06, + "loss": 0.6052, + "step": 5789 + }, + { + "epoch": 0.27602316878411554, + "grad_norm": 1.0611144304275513, + "learning_rate": 3.5370864999851296e-06, + "loss": 0.3656, + "step": 5790 + }, + { + "epoch": 
0.27607084117941505, + "grad_norm": 1.406622052192688, + "learning_rate": 3.534089598929691e-06, + "loss": 0.8445, + "step": 5791 + }, + { + "epoch": 0.27611851357471456, + "grad_norm": 1.173233151435852, + "learning_rate": 3.5310936954976383e-06, + "loss": 0.6267, + "step": 5792 + }, + { + "epoch": 0.2761661859700141, + "grad_norm": 1.7871273756027222, + "learning_rate": 3.5280987901512142e-06, + "loss": 0.3913, + "step": 5793 + }, + { + "epoch": 0.2762138583653136, + "grad_norm": 1.7539689540863037, + "learning_rate": 3.525104883352497e-06, + "loss": 0.9235, + "step": 5794 + }, + { + "epoch": 0.27626153076061305, + "grad_norm": 1.887739658355713, + "learning_rate": 3.522111975563417e-06, + "loss": 0.4819, + "step": 5795 + }, + { + "epoch": 0.27630920315591256, + "grad_norm": 1.73068368434906, + "learning_rate": 3.519120067245754e-06, + "loss": 0.6554, + "step": 5796 + }, + { + "epoch": 0.2763568755512121, + "grad_norm": 1.55825674533844, + "learning_rate": 3.51612915886112e-06, + "loss": 0.5789, + "step": 5797 + }, + { + "epoch": 0.2764045479465116, + "grad_norm": 2.273411512374878, + "learning_rate": 3.513139250870986e-06, + "loss": 0.4424, + "step": 5798 + }, + { + "epoch": 0.27645222034181105, + "grad_norm": 5.278494834899902, + "learning_rate": 3.5101503437366678e-06, + "loss": 0.7936, + "step": 5799 + }, + { + "epoch": 0.27649989273711056, + "grad_norm": 4.1573805809021, + "learning_rate": 3.507162437919316e-06, + "loss": 0.587, + "step": 5800 + }, + { + "epoch": 0.2765475651324101, + "grad_norm": 1.4418230056762695, + "learning_rate": 3.5041755338799354e-06, + "loss": 0.891, + "step": 5801 + }, + { + "epoch": 0.2765952375277096, + "grad_norm": 1.2996065616607666, + "learning_rate": 3.5011896320793802e-06, + "loss": 0.3958, + "step": 5802 + }, + { + "epoch": 0.2766429099230091, + "grad_norm": 3.44665265083313, + "learning_rate": 3.4982047329783362e-06, + "loss": 0.5077, + "step": 5803 + }, + { + "epoch": 0.27669058231830856, + "grad_norm": 1.3354766368865967, + "learning_rate": 3.4952208370373475e-06, + "loss": 0.9142, + "step": 5804 + }, + { + "epoch": 0.2767382547136081, + "grad_norm": 2.007514715194702, + "learning_rate": 3.4922379447167997e-06, + "loss": 1.032, + "step": 5805 + }, + { + "epoch": 0.2767859271089076, + "grad_norm": 1.655014157295227, + "learning_rate": 3.4892560564769164e-06, + "loss": 0.3604, + "step": 5806 + }, + { + "epoch": 0.2768335995042071, + "grad_norm": 2.406719923019409, + "learning_rate": 3.48627517277778e-06, + "loss": 0.4553, + "step": 5807 + }, + { + "epoch": 0.27688127189950656, + "grad_norm": 4.124279499053955, + "learning_rate": 3.4832952940793054e-06, + "loss": 0.5604, + "step": 5808 + }, + { + "epoch": 0.2769289442948061, + "grad_norm": 1.866684913635254, + "learning_rate": 3.4803164208412543e-06, + "loss": 1.1109, + "step": 5809 + }, + { + "epoch": 0.2769766166901056, + "grad_norm": 2.047614812850952, + "learning_rate": 3.4773385535232408e-06, + "loss": 0.9086, + "step": 5810 + }, + { + "epoch": 0.2770242890854051, + "grad_norm": 2.289114236831665, + "learning_rate": 3.4743616925847167e-06, + "loss": 0.7814, + "step": 5811 + }, + { + "epoch": 0.2770719614807046, + "grad_norm": 1.2168200016021729, + "learning_rate": 3.4713858384849873e-06, + "loss": 0.5156, + "step": 5812 + }, + { + "epoch": 0.2771196338760041, + "grad_norm": 1.6633448600769043, + "learning_rate": 3.4684109916831866e-06, + "loss": 0.5143, + "step": 5813 + }, + { + "epoch": 0.2771673062713036, + "grad_norm": 1.4360078573226929, + "learning_rate": 3.465437152638308e-06, + "loss": 
1.0772, + "step": 5814 + }, + { + "epoch": 0.2772149786666031, + "grad_norm": 2.982320785522461, + "learning_rate": 3.462464321809188e-06, + "loss": 1.3989, + "step": 5815 + }, + { + "epoch": 0.2772626510619026, + "grad_norm": 2.3763790130615234, + "learning_rate": 3.4594924996544952e-06, + "loss": 0.9511, + "step": 5816 + }, + { + "epoch": 0.27731032345720213, + "grad_norm": 1.4426672458648682, + "learning_rate": 3.4565216866327556e-06, + "loss": 0.8974, + "step": 5817 + }, + { + "epoch": 0.2773579958525016, + "grad_norm": 2.0337131023406982, + "learning_rate": 3.4535518832023383e-06, + "loss": 0.4508, + "step": 5818 + }, + { + "epoch": 0.2774056682478011, + "grad_norm": 7.215719699859619, + "learning_rate": 3.4505830898214466e-06, + "loss": 1.5514, + "step": 5819 + }, + { + "epoch": 0.2774533406431006, + "grad_norm": 2.007924795150757, + "learning_rate": 3.447615306948142e-06, + "loss": 1.0114, + "step": 5820 + }, + { + "epoch": 0.27750101303840014, + "grad_norm": 5.832476615905762, + "learning_rate": 3.4446485350403145e-06, + "loss": 1.5196, + "step": 5821 + }, + { + "epoch": 0.2775486854336996, + "grad_norm": 3.1122994422912598, + "learning_rate": 3.441682774555716e-06, + "loss": 0.4405, + "step": 5822 + }, + { + "epoch": 0.2775963578289991, + "grad_norm": 2.6448862552642822, + "learning_rate": 3.438718025951924e-06, + "loss": 0.2901, + "step": 5823 + }, + { + "epoch": 0.2776440302242986, + "grad_norm": 1.5366235971450806, + "learning_rate": 3.435754289686375e-06, + "loss": 0.678, + "step": 5824 + }, + { + "epoch": 0.27769170261959814, + "grad_norm": 1.1536791324615479, + "learning_rate": 3.432791566216338e-06, + "loss": 0.3196, + "step": 5825 + }, + { + "epoch": 0.27773937501489765, + "grad_norm": 2.690739631652832, + "learning_rate": 3.429829855998933e-06, + "loss": 1.1102, + "step": 5826 + }, + { + "epoch": 0.2777870474101971, + "grad_norm": 2.7489328384399414, + "learning_rate": 3.426869159491124e-06, + "loss": 1.1726, + "step": 5827 + }, + { + "epoch": 0.2778347198054966, + "grad_norm": 1.2362381219863892, + "learning_rate": 3.4239094771497104e-06, + "loss": 0.2277, + "step": 5828 + }, + { + "epoch": 0.27788239220079614, + "grad_norm": 1.2275652885437012, + "learning_rate": 3.420950809431345e-06, + "loss": 0.6145, + "step": 5829 + }, + { + "epoch": 0.27793006459609565, + "grad_norm": 2.2532424926757812, + "learning_rate": 3.4179931567925216e-06, + "loss": 0.7512, + "step": 5830 + }, + { + "epoch": 0.2779777369913951, + "grad_norm": 1.3912450075149536, + "learning_rate": 3.4150365196895686e-06, + "loss": 0.6163, + "step": 5831 + }, + { + "epoch": 0.2780254093866946, + "grad_norm": 0.9267414808273315, + "learning_rate": 3.412080898578669e-06, + "loss": 0.391, + "step": 5832 + }, + { + "epoch": 0.27807308178199414, + "grad_norm": 1.1409859657287598, + "learning_rate": 3.4091262939158477e-06, + "loss": 0.6547, + "step": 5833 + }, + { + "epoch": 0.27812075417729365, + "grad_norm": 4.11335563659668, + "learning_rate": 3.406172706156963e-06, + "loss": 0.4091, + "step": 5834 + }, + { + "epoch": 0.27816842657259316, + "grad_norm": 2.016162633895874, + "learning_rate": 3.4032201357577287e-06, + "loss": 0.5227, + "step": 5835 + }, + { + "epoch": 0.2782160989678926, + "grad_norm": 0.9952210187911987, + "learning_rate": 3.4002685831736917e-06, + "loss": 0.226, + "step": 5836 + }, + { + "epoch": 0.27826377136319214, + "grad_norm": 1.124870777130127, + "learning_rate": 3.3973180488602508e-06, + "loss": 0.4222, + "step": 5837 + }, + { + "epoch": 0.27831144375849165, + "grad_norm": 
1.807302474975586, + "learning_rate": 3.3943685332726385e-06, + "loss": 0.8145, + "step": 5838 + }, + { + "epoch": 0.27835911615379116, + "grad_norm": 1.9683709144592285, + "learning_rate": 3.391420036865939e-06, + "loss": 1.0791, + "step": 5839 + }, + { + "epoch": 0.2784067885490906, + "grad_norm": 2.589303493499756, + "learning_rate": 3.3884725600950687e-06, + "loss": 1.1757, + "step": 5840 + }, + { + "epoch": 0.27845446094439014, + "grad_norm": 1.9659584760665894, + "learning_rate": 3.385526103414798e-06, + "loss": 0.8821, + "step": 5841 + }, + { + "epoch": 0.27850213333968965, + "grad_norm": 2.3086142539978027, + "learning_rate": 3.3825806672797355e-06, + "loss": 0.5679, + "step": 5842 + }, + { + "epoch": 0.27854980573498916, + "grad_norm": 1.6591477394104004, + "learning_rate": 3.379636252144328e-06, + "loss": 0.7197, + "step": 5843 + }, + { + "epoch": 0.2785974781302887, + "grad_norm": 2.003530263900757, + "learning_rate": 3.37669285846287e-06, + "loss": 0.984, + "step": 5844 + }, + { + "epoch": 0.27864515052558814, + "grad_norm": 1.525742530822754, + "learning_rate": 3.3737504866895e-06, + "loss": 0.644, + "step": 5845 + }, + { + "epoch": 0.27869282292088765, + "grad_norm": 1.3682410717010498, + "learning_rate": 3.3708091372781893e-06, + "loss": 0.7348, + "step": 5846 + }, + { + "epoch": 0.27874049531618716, + "grad_norm": 1.2351282835006714, + "learning_rate": 3.3678688106827616e-06, + "loss": 0.6791, + "step": 5847 + }, + { + "epoch": 0.2787881677114867, + "grad_norm": 2.5197176933288574, + "learning_rate": 3.364929507356881e-06, + "loss": 0.8547, + "step": 5848 + }, + { + "epoch": 0.2788358401067862, + "grad_norm": 1.4743976593017578, + "learning_rate": 3.361991227754048e-06, + "loss": 0.7519, + "step": 5849 + }, + { + "epoch": 0.27888351250208565, + "grad_norm": 0.8373667001724243, + "learning_rate": 3.3590539723276083e-06, + "loss": 0.632, + "step": 5850 + }, + { + "epoch": 0.27893118489738516, + "grad_norm": 1.3392571210861206, + "learning_rate": 3.3561177415307566e-06, + "loss": 0.7121, + "step": 5851 + }, + { + "epoch": 0.2789788572926847, + "grad_norm": 1.9579188823699951, + "learning_rate": 3.3531825358165184e-06, + "loss": 0.8392, + "step": 5852 + }, + { + "epoch": 0.2790265296879842, + "grad_norm": 1.713509440422058, + "learning_rate": 3.3502483556377628e-06, + "loss": 0.4693, + "step": 5853 + }, + { + "epoch": 0.27907420208328365, + "grad_norm": 4.4942097663879395, + "learning_rate": 3.3473152014472064e-06, + "loss": 0.6598, + "step": 5854 + }, + { + "epoch": 0.27912187447858317, + "grad_norm": 1.400954008102417, + "learning_rate": 3.344383073697408e-06, + "loss": 0.5549, + "step": 5855 + }, + { + "epoch": 0.2791695468738827, + "grad_norm": 2.1655681133270264, + "learning_rate": 3.341451972840759e-06, + "loss": 0.1788, + "step": 5856 + }, + { + "epoch": 0.2792172192691822, + "grad_norm": 1.1546815633773804, + "learning_rate": 3.338521899329501e-06, + "loss": 0.3929, + "step": 5857 + }, + { + "epoch": 0.2792648916644817, + "grad_norm": 1.9194657802581787, + "learning_rate": 3.335592853615717e-06, + "loss": 0.6697, + "step": 5858 + }, + { + "epoch": 0.27931256405978117, + "grad_norm": 1.1898773908615112, + "learning_rate": 3.3326648361513227e-06, + "loss": 0.6485, + "step": 5859 + }, + { + "epoch": 0.2793602364550807, + "grad_norm": 3.210023880004883, + "learning_rate": 3.3297378473880836e-06, + "loss": 1.209, + "step": 5860 + }, + { + "epoch": 0.2794079088503802, + "grad_norm": 1.895900011062622, + "learning_rate": 3.326811887777607e-06, + "loss": 0.8399, + "step": 5861 
+ }, + { + "epoch": 0.2794555812456797, + "grad_norm": 1.4736049175262451, + "learning_rate": 3.323886957771333e-06, + "loss": 0.9595, + "step": 5862 + }, + { + "epoch": 0.27950325364097917, + "grad_norm": 2.3377034664154053, + "learning_rate": 3.32096305782055e-06, + "loss": 0.2446, + "step": 5863 + }, + { + "epoch": 0.2795509260362787, + "grad_norm": 1.1960664987564087, + "learning_rate": 3.31804018837639e-06, + "loss": 0.81, + "step": 5864 + }, + { + "epoch": 0.2795985984315782, + "grad_norm": 1.0660697221755981, + "learning_rate": 3.3151183498898155e-06, + "loss": 0.1664, + "step": 5865 + }, + { + "epoch": 0.2796462708268777, + "grad_norm": 1.5719130039215088, + "learning_rate": 3.3121975428116414e-06, + "loss": 0.7434, + "step": 5866 + }, + { + "epoch": 0.2796939432221772, + "grad_norm": 2.6992483139038086, + "learning_rate": 3.3092777675925145e-06, + "loss": 0.8804, + "step": 5867 + }, + { + "epoch": 0.2797416156174767, + "grad_norm": 2.4621758460998535, + "learning_rate": 3.306359024682925e-06, + "loss": 0.7341, + "step": 5868 + }, + { + "epoch": 0.2797892880127762, + "grad_norm": 1.3564366102218628, + "learning_rate": 3.3034413145332065e-06, + "loss": 0.6703, + "step": 5869 + }, + { + "epoch": 0.2798369604080757, + "grad_norm": 1.1252259016036987, + "learning_rate": 3.300524637593535e-06, + "loss": 0.4572, + "step": 5870 + }, + { + "epoch": 0.2798846328033752, + "grad_norm": 1.4198715686798096, + "learning_rate": 3.297608994313918e-06, + "loss": 0.5901, + "step": 5871 + }, + { + "epoch": 0.2799323051986747, + "grad_norm": 1.6074761152267456, + "learning_rate": 3.29469438514421e-06, + "loss": 0.8219, + "step": 5872 + }, + { + "epoch": 0.2799799775939742, + "grad_norm": 1.1399770975112915, + "learning_rate": 3.291780810534112e-06, + "loss": 0.2135, + "step": 5873 + }, + { + "epoch": 0.2800276499892737, + "grad_norm": 1.8449188470840454, + "learning_rate": 3.288868270933151e-06, + "loss": 0.8644, + "step": 5874 + }, + { + "epoch": 0.2800753223845732, + "grad_norm": 3.027866840362549, + "learning_rate": 3.285956766790703e-06, + "loss": 1.3406, + "step": 5875 + }, + { + "epoch": 0.28012299477987274, + "grad_norm": 1.4485067129135132, + "learning_rate": 3.2830462985559884e-06, + "loss": 0.8413, + "step": 5876 + }, + { + "epoch": 0.2801706671751722, + "grad_norm": 2.0163278579711914, + "learning_rate": 3.2801368666780552e-06, + "loss": 0.7529, + "step": 5877 + }, + { + "epoch": 0.2802183395704717, + "grad_norm": 2.1220314502716064, + "learning_rate": 3.2772284716058032e-06, + "loss": 0.8258, + "step": 5878 + }, + { + "epoch": 0.2802660119657712, + "grad_norm": 1.9134745597839355, + "learning_rate": 3.2743211137879693e-06, + "loss": 0.6428, + "step": 5879 + }, + { + "epoch": 0.28031368436107074, + "grad_norm": 2.6584293842315674, + "learning_rate": 3.2714147936731234e-06, + "loss": 0.9394, + "step": 5880 + }, + { + "epoch": 0.28036135675637025, + "grad_norm": 1.4212067127227783, + "learning_rate": 3.268509511709688e-06, + "loss": 0.7571, + "step": 5881 + }, + { + "epoch": 0.2804090291516697, + "grad_norm": 1.792913556098938, + "learning_rate": 3.2656052683459094e-06, + "loss": 0.6554, + "step": 5882 + }, + { + "epoch": 0.2804567015469692, + "grad_norm": 2.194751501083374, + "learning_rate": 3.26270206402989e-06, + "loss": 0.7524, + "step": 5883 + }, + { + "epoch": 0.28050437394226874, + "grad_norm": 2.3831229209899902, + "learning_rate": 3.259799899209559e-06, + "loss": 0.8005, + "step": 5884 + }, + { + "epoch": 0.28055204633756825, + "grad_norm": 1.1160222291946411, + "learning_rate": 
3.2568987743326964e-06, + "loss": 0.6014, + "step": 5885 + }, + { + "epoch": 0.2805997187328677, + "grad_norm": 1.176783561706543, + "learning_rate": 3.2539986898469088e-06, + "loss": 0.3688, + "step": 5886 + }, + { + "epoch": 0.2806473911281672, + "grad_norm": 1.5147234201431274, + "learning_rate": 3.2510996461996523e-06, + "loss": 0.9379, + "step": 5887 + }, + { + "epoch": 0.28069506352346674, + "grad_norm": 2.1405818462371826, + "learning_rate": 3.2482016438382215e-06, + "loss": 0.7928, + "step": 5888 + }, + { + "epoch": 0.28074273591876625, + "grad_norm": 1.048101782798767, + "learning_rate": 3.245304683209749e-06, + "loss": 0.4306, + "step": 5889 + }, + { + "epoch": 0.28079040831406576, + "grad_norm": 4.601113796234131, + "learning_rate": 3.242408764761201e-06, + "loss": 0.4628, + "step": 5890 + }, + { + "epoch": 0.2808380807093652, + "grad_norm": 3.8754489421844482, + "learning_rate": 3.2395138889393918e-06, + "loss": 0.475, + "step": 5891 + }, + { + "epoch": 0.28088575310466474, + "grad_norm": 1.3963401317596436, + "learning_rate": 3.236620056190972e-06, + "loss": 0.3811, + "step": 5892 + }, + { + "epoch": 0.28093342549996425, + "grad_norm": 1.5309783220291138, + "learning_rate": 3.233727266962425e-06, + "loss": 0.5391, + "step": 5893 + }, + { + "epoch": 0.28098109789526377, + "grad_norm": 2.2857825756073, + "learning_rate": 3.230835521700083e-06, + "loss": 0.7422, + "step": 5894 + }, + { + "epoch": 0.2810287702905632, + "grad_norm": 1.5979682207107544, + "learning_rate": 3.2279448208501128e-06, + "loss": 0.8812, + "step": 5895 + }, + { + "epoch": 0.28107644268586274, + "grad_norm": 1.2685645818710327, + "learning_rate": 3.2250551648585194e-06, + "loss": 0.4423, + "step": 5896 + }, + { + "epoch": 0.28112411508116225, + "grad_norm": 1.7394298315048218, + "learning_rate": 3.222166554171141e-06, + "loss": 0.8087, + "step": 5897 + }, + { + "epoch": 0.28117178747646177, + "grad_norm": 1.5419667959213257, + "learning_rate": 3.2192789892336694e-06, + "loss": 0.9385, + "step": 5898 + }, + { + "epoch": 0.2812194598717613, + "grad_norm": 1.1576173305511475, + "learning_rate": 3.216392470491618e-06, + "loss": 0.6088, + "step": 5899 + }, + { + "epoch": 0.28126713226706074, + "grad_norm": 5.5643768310546875, + "learning_rate": 3.213506998390351e-06, + "loss": 1.1993, + "step": 5900 + }, + { + "epoch": 0.28131480466236025, + "grad_norm": 0.9182067513465881, + "learning_rate": 3.2106225733750707e-06, + "loss": 0.4358, + "step": 5901 + }, + { + "epoch": 0.28136247705765977, + "grad_norm": 1.6869710683822632, + "learning_rate": 3.2077391958908065e-06, + "loss": 1.005, + "step": 5902 + }, + { + "epoch": 0.2814101494529593, + "grad_norm": 2.2454309463500977, + "learning_rate": 3.2048568663824375e-06, + "loss": 0.6729, + "step": 5903 + }, + { + "epoch": 0.2814578218482588, + "grad_norm": 1.4682514667510986, + "learning_rate": 3.20197558529468e-06, + "loss": 0.6519, + "step": 5904 + }, + { + "epoch": 0.28150549424355825, + "grad_norm": 1.3484890460968018, + "learning_rate": 3.199095353072081e-06, + "loss": 0.9018, + "step": 5905 + }, + { + "epoch": 0.28155316663885777, + "grad_norm": 1.8326994180679321, + "learning_rate": 3.1962161701590342e-06, + "loss": 1.1445, + "step": 5906 + }, + { + "epoch": 0.2816008390341573, + "grad_norm": 1.8259936571121216, + "learning_rate": 3.193338036999769e-06, + "loss": 0.577, + "step": 5907 + }, + { + "epoch": 0.2816485114294568, + "grad_norm": 1.3793479204177856, + "learning_rate": 3.1904609540383467e-06, + "loss": 0.5268, + "step": 5908 + }, + { + "epoch": 
0.28169618382475625, + "grad_norm": 7.305703639984131, + "learning_rate": 3.187584921718675e-06, + "loss": 0.8085, + "step": 5909 + }, + { + "epoch": 0.28174385622005577, + "grad_norm": 1.557979941368103, + "learning_rate": 3.1847099404844984e-06, + "loss": 0.8441, + "step": 5910 + }, + { + "epoch": 0.2817915286153553, + "grad_norm": 4.730111598968506, + "learning_rate": 3.1818360107793933e-06, + "loss": 0.3377, + "step": 5911 + }, + { + "epoch": 0.2818392010106548, + "grad_norm": 1.458540439605713, + "learning_rate": 3.178963133046776e-06, + "loss": 0.7024, + "step": 5912 + }, + { + "epoch": 0.2818868734059543, + "grad_norm": 1.733161449432373, + "learning_rate": 3.1760913077299072e-06, + "loss": 0.7407, + "step": 5913 + }, + { + "epoch": 0.28193454580125377, + "grad_norm": 1.6189063787460327, + "learning_rate": 3.173220535271874e-06, + "loss": 0.9442, + "step": 5914 + }, + { + "epoch": 0.2819822181965533, + "grad_norm": 1.5450819730758667, + "learning_rate": 3.1703508161156095e-06, + "loss": 0.4844, + "step": 5915 + }, + { + "epoch": 0.2820298905918528, + "grad_norm": 1.3176461458206177, + "learning_rate": 3.1674821507038857e-06, + "loss": 0.7086, + "step": 5916 + }, + { + "epoch": 0.2820775629871523, + "grad_norm": 1.8015121221542358, + "learning_rate": 3.1646145394793017e-06, + "loss": 0.7689, + "step": 5917 + }, + { + "epoch": 0.28212523538245177, + "grad_norm": 1.266664743423462, + "learning_rate": 3.1617479828843023e-06, + "loss": 0.858, + "step": 5918 + }, + { + "epoch": 0.2821729077777513, + "grad_norm": 2.7740542888641357, + "learning_rate": 3.158882481361173e-06, + "loss": 0.7104, + "step": 5919 + }, + { + "epoch": 0.2822205801730508, + "grad_norm": 2.237379789352417, + "learning_rate": 3.156018035352024e-06, + "loss": 1.5049, + "step": 5920 + }, + { + "epoch": 0.2822682525683503, + "grad_norm": 1.1840728521347046, + "learning_rate": 3.1531546452988127e-06, + "loss": 0.8268, + "step": 5921 + }, + { + "epoch": 0.2823159249636498, + "grad_norm": 2.702145576477051, + "learning_rate": 3.1502923116433324e-06, + "loss": 0.9402, + "step": 5922 + }, + { + "epoch": 0.2823635973589493, + "grad_norm": 1.544381856918335, + "learning_rate": 3.1474310348272084e-06, + "loss": 0.9294, + "step": 5923 + }, + { + "epoch": 0.2824112697542488, + "grad_norm": 1.5606869459152222, + "learning_rate": 3.1445708152919075e-06, + "loss": 0.9205, + "step": 5924 + }, + { + "epoch": 0.2824589421495483, + "grad_norm": 1.5174667835235596, + "learning_rate": 3.141711653478736e-06, + "loss": 0.9588, + "step": 5925 + }, + { + "epoch": 0.2825066145448478, + "grad_norm": 1.5139796733856201, + "learning_rate": 3.1388535498288265e-06, + "loss": 0.8664, + "step": 5926 + }, + { + "epoch": 0.2825542869401473, + "grad_norm": 1.7629603147506714, + "learning_rate": 3.135996504783161e-06, + "loss": 0.5387, + "step": 5927 + }, + { + "epoch": 0.2826019593354468, + "grad_norm": 3.60149884223938, + "learning_rate": 3.1331405187825457e-06, + "loss": 1.7011, + "step": 5928 + }, + { + "epoch": 0.2826496317307463, + "grad_norm": 4.884571075439453, + "learning_rate": 3.130285592267638e-06, + "loss": 0.5431, + "step": 5929 + }, + { + "epoch": 0.2826973041260458, + "grad_norm": 2.810913324356079, + "learning_rate": 3.1274317256789144e-06, + "loss": 0.7907, + "step": 5930 + }, + { + "epoch": 0.28274497652134534, + "grad_norm": 2.3158459663391113, + "learning_rate": 3.1245789194567024e-06, + "loss": 0.8285, + "step": 5931 + }, + { + "epoch": 0.2827926489166448, + "grad_norm": 3.1197385787963867, + "learning_rate": 3.1217271740411626e-06, 
+ "loss": 0.9111, + "step": 5932 + }, + { + "epoch": 0.2828403213119443, + "grad_norm": 1.5556949377059937, + "learning_rate": 3.1188764898722843e-06, + "loss": 0.4844, + "step": 5933 + }, + { + "epoch": 0.2828879937072438, + "grad_norm": 1.8289040327072144, + "learning_rate": 3.116026867389903e-06, + "loss": 0.7285, + "step": 5934 + }, + { + "epoch": 0.28293566610254334, + "grad_norm": 0.6461400389671326, + "learning_rate": 3.1131783070336872e-06, + "loss": 0.288, + "step": 5935 + }, + { + "epoch": 0.28298333849784285, + "grad_norm": 1.816648244857788, + "learning_rate": 3.110330809243134e-06, + "loss": 0.9826, + "step": 5936 + }, + { + "epoch": 0.2830310108931423, + "grad_norm": 1.8873894214630127, + "learning_rate": 3.1074843744575877e-06, + "loss": 0.7973, + "step": 5937 + }, + { + "epoch": 0.2830786832884418, + "grad_norm": 1.235554814338684, + "learning_rate": 3.1046390031162265e-06, + "loss": 0.5818, + "step": 5938 + }, + { + "epoch": 0.28312635568374134, + "grad_norm": 1.8525989055633545, + "learning_rate": 3.1017946956580557e-06, + "loss": 0.4799, + "step": 5939 + }, + { + "epoch": 0.28317402807904085, + "grad_norm": 3.0708417892456055, + "learning_rate": 3.098951452521929e-06, + "loss": 0.5568, + "step": 5940 + }, + { + "epoch": 0.2832217004743403, + "grad_norm": 2.0866968631744385, + "learning_rate": 3.0961092741465226e-06, + "loss": 0.5011, + "step": 5941 + }, + { + "epoch": 0.2832693728696398, + "grad_norm": 2.3611583709716797, + "learning_rate": 3.093268160970362e-06, + "loss": 0.5536, + "step": 5942 + }, + { + "epoch": 0.28331704526493934, + "grad_norm": 1.2229896783828735, + "learning_rate": 3.090428113431795e-06, + "loss": 0.5604, + "step": 5943 + }, + { + "epoch": 0.28336471766023885, + "grad_norm": 1.3708198070526123, + "learning_rate": 3.0875891319690188e-06, + "loss": 0.907, + "step": 5944 + }, + { + "epoch": 0.28341239005553837, + "grad_norm": 2.049001932144165, + "learning_rate": 3.0847512170200523e-06, + "loss": 0.6424, + "step": 5945 + }, + { + "epoch": 0.2834600624508378, + "grad_norm": 3.2906229496002197, + "learning_rate": 3.0819143690227602e-06, + "loss": 0.6675, + "step": 5946 + }, + { + "epoch": 0.28350773484613734, + "grad_norm": 1.4666670560836792, + "learning_rate": 3.0790785884148413e-06, + "loss": 0.5831, + "step": 5947 + }, + { + "epoch": 0.28355540724143685, + "grad_norm": 2.1309757232666016, + "learning_rate": 3.0762438756338207e-06, + "loss": 0.7127, + "step": 5948 + }, + { + "epoch": 0.28360307963673637, + "grad_norm": 3.729311466217041, + "learning_rate": 3.0734102311170697e-06, + "loss": 0.2881, + "step": 5949 + }, + { + "epoch": 0.2836507520320358, + "grad_norm": 1.0012531280517578, + "learning_rate": 3.070577655301793e-06, + "loss": 0.769, + "step": 5950 + }, + { + "epoch": 0.28369842442733534, + "grad_norm": 1.3662431240081787, + "learning_rate": 3.0677461486250226e-06, + "loss": 0.7394, + "step": 5951 + }, + { + "epoch": 0.28374609682263485, + "grad_norm": 2.267841339111328, + "learning_rate": 3.0649157115236315e-06, + "loss": 0.7723, + "step": 5952 + }, + { + "epoch": 0.28379376921793437, + "grad_norm": 3.1676759719848633, + "learning_rate": 3.062086344434333e-06, + "loss": 0.7729, + "step": 5953 + }, + { + "epoch": 0.2838414416132339, + "grad_norm": 1.3230173587799072, + "learning_rate": 3.0592580477936606e-06, + "loss": 0.6094, + "step": 5954 + }, + { + "epoch": 0.28388911400853334, + "grad_norm": 1.7950643301010132, + "learning_rate": 3.0564308220380003e-06, + "loss": 0.8171, + "step": 5955 + }, + { + "epoch": 0.28393678640383285, + 
"grad_norm": 1.1964622735977173, + "learning_rate": 3.0536046676035546e-06, + "loss": 0.7314, + "step": 5956 + }, + { + "epoch": 0.28398445879913237, + "grad_norm": 1.3880852460861206, + "learning_rate": 3.050779584926379e-06, + "loss": 0.681, + "step": 5957 + }, + { + "epoch": 0.2840321311944319, + "grad_norm": 1.5135201215744019, + "learning_rate": 3.0479555744423463e-06, + "loss": 0.5061, + "step": 5958 + }, + { + "epoch": 0.28407980358973134, + "grad_norm": 1.1748199462890625, + "learning_rate": 3.045132636587179e-06, + "loss": 0.8499, + "step": 5959 + }, + { + "epoch": 0.28412747598503085, + "grad_norm": 2.5828261375427246, + "learning_rate": 3.042310771796423e-06, + "loss": 0.8827, + "step": 5960 + }, + { + "epoch": 0.28417514838033037, + "grad_norm": 1.8506451845169067, + "learning_rate": 3.0394899805054635e-06, + "loss": 1.094, + "step": 5961 + }, + { + "epoch": 0.2842228207756299, + "grad_norm": 1.2826437950134277, + "learning_rate": 3.0366702631495237e-06, + "loss": 0.7094, + "step": 5962 + }, + { + "epoch": 0.2842704931709294, + "grad_norm": 1.3929500579833984, + "learning_rate": 3.0338516201636516e-06, + "loss": 0.7033, + "step": 5963 + }, + { + "epoch": 0.28431816556622885, + "grad_norm": 1.618619441986084, + "learning_rate": 3.031034051982735e-06, + "loss": 0.6553, + "step": 5964 + }, + { + "epoch": 0.28436583796152837, + "grad_norm": 2.004574775695801, + "learning_rate": 3.0282175590415e-06, + "loss": 0.8983, + "step": 5965 + }, + { + "epoch": 0.2844135103568279, + "grad_norm": 1.1381690502166748, + "learning_rate": 3.0254021417745027e-06, + "loss": 0.3409, + "step": 5966 + }, + { + "epoch": 0.2844611827521274, + "grad_norm": 1.8637462854385376, + "learning_rate": 3.022587800616127e-06, + "loss": 0.8184, + "step": 5967 + }, + { + "epoch": 0.2845088551474269, + "grad_norm": 2.8947017192840576, + "learning_rate": 3.0197745360006004e-06, + "loss": 0.9563, + "step": 5968 + }, + { + "epoch": 0.28455652754272637, + "grad_norm": 2.7389771938323975, + "learning_rate": 3.0169623483619824e-06, + "loss": 1.2036, + "step": 5969 + }, + { + "epoch": 0.2846041999380259, + "grad_norm": 1.9356426000595093, + "learning_rate": 3.014151238134161e-06, + "loss": 0.7647, + "step": 5970 + }, + { + "epoch": 0.2846518723333254, + "grad_norm": 1.2176311016082764, + "learning_rate": 3.011341205750866e-06, + "loss": 0.6346, + "step": 5971 + }, + { + "epoch": 0.2846995447286249, + "grad_norm": 1.3626267910003662, + "learning_rate": 3.0085322516456537e-06, + "loss": 0.7817, + "step": 5972 + }, + { + "epoch": 0.28474721712392437, + "grad_norm": 1.1688299179077148, + "learning_rate": 3.0057243762519137e-06, + "loss": 0.8457, + "step": 5973 + }, + { + "epoch": 0.2847948895192239, + "grad_norm": 2.106661796569824, + "learning_rate": 3.002917580002875e-06, + "loss": 0.4141, + "step": 5974 + }, + { + "epoch": 0.2848425619145234, + "grad_norm": 1.1825395822525024, + "learning_rate": 3.0001118633316018e-06, + "loss": 0.6835, + "step": 5975 + }, + { + "epoch": 0.2848902343098229, + "grad_norm": 1.325334906578064, + "learning_rate": 2.997307226670979e-06, + "loss": 0.5565, + "step": 5976 + }, + { + "epoch": 0.2849379067051224, + "grad_norm": 1.230260968208313, + "learning_rate": 2.9945036704537376e-06, + "loss": 0.6952, + "step": 5977 + }, + { + "epoch": 0.2849855791004219, + "grad_norm": 1.3571439981460571, + "learning_rate": 2.991701195112441e-06, + "loss": 0.3431, + "step": 5978 + }, + { + "epoch": 0.2850332514957214, + "grad_norm": 1.8420449495315552, + "learning_rate": 2.9888998010794745e-06, + "loss": 0.7724, 
+ "step": 5979 + }, + { + "epoch": 0.2850809238910209, + "grad_norm": 1.6307415962219238, + "learning_rate": 2.986099488787069e-06, + "loss": 0.6341, + "step": 5980 + }, + { + "epoch": 0.2851285962863204, + "grad_norm": 1.3189420700073242, + "learning_rate": 2.9833002586672855e-06, + "loss": 0.6408, + "step": 5981 + }, + { + "epoch": 0.2851762686816199, + "grad_norm": 1.9311996698379517, + "learning_rate": 2.9805021111520105e-06, + "loss": 0.2892, + "step": 5982 + }, + { + "epoch": 0.2852239410769194, + "grad_norm": 1.2384161949157715, + "learning_rate": 2.977705046672974e-06, + "loss": 0.7973, + "step": 5983 + }, + { + "epoch": 0.2852716134722189, + "grad_norm": 2.065157175064087, + "learning_rate": 2.9749090656617363e-06, + "loss": 0.8093, + "step": 5984 + }, + { + "epoch": 0.2853192858675184, + "grad_norm": 1.7547610998153687, + "learning_rate": 2.9721141685496825e-06, + "loss": 0.7952, + "step": 5985 + }, + { + "epoch": 0.28536695826281794, + "grad_norm": 4.806229114532471, + "learning_rate": 2.9693203557680415e-06, + "loss": 1.0673, + "step": 5986 + }, + { + "epoch": 0.2854146306581174, + "grad_norm": 2.534628391265869, + "learning_rate": 2.9665276277478672e-06, + "loss": 0.6868, + "step": 5987 + }, + { + "epoch": 0.2854623030534169, + "grad_norm": 1.5216633081436157, + "learning_rate": 2.9637359849200474e-06, + "loss": 0.6742, + "step": 5988 + }, + { + "epoch": 0.2855099754487164, + "grad_norm": 2.193467617034912, + "learning_rate": 2.960945427715305e-06, + "loss": 0.6474, + "step": 5989 + }, + { + "epoch": 0.28555764784401594, + "grad_norm": 1.6416079998016357, + "learning_rate": 2.9581559565641983e-06, + "loss": 0.2511, + "step": 5990 + }, + { + "epoch": 0.2856053202393154, + "grad_norm": 1.167321801185608, + "learning_rate": 2.9553675718971065e-06, + "loss": 0.7189, + "step": 5991 + }, + { + "epoch": 0.2856529926346149, + "grad_norm": 1.3199819326400757, + "learning_rate": 2.9525802741442532e-06, + "loss": 0.8042, + "step": 5992 + }, + { + "epoch": 0.2857006650299144, + "grad_norm": 1.2183806896209717, + "learning_rate": 2.9497940637356924e-06, + "loss": 0.8987, + "step": 5993 + }, + { + "epoch": 0.28574833742521394, + "grad_norm": 1.8851501941680908, + "learning_rate": 2.9470089411013014e-06, + "loss": 0.3647, + "step": 5994 + }, + { + "epoch": 0.28579600982051345, + "grad_norm": 1.4431147575378418, + "learning_rate": 2.9442249066707993e-06, + "loss": 0.7273, + "step": 5995 + }, + { + "epoch": 0.2858436822158129, + "grad_norm": 1.4173214435577393, + "learning_rate": 2.9414419608737366e-06, + "loss": 0.9537, + "step": 5996 + }, + { + "epoch": 0.2858913546111124, + "grad_norm": 2.244168996810913, + "learning_rate": 2.938660104139487e-06, + "loss": 0.9552, + "step": 5997 + }, + { + "epoch": 0.28593902700641194, + "grad_norm": 1.6251845359802246, + "learning_rate": 2.935879336897265e-06, + "loss": 1.3154, + "step": 5998 + }, + { + "epoch": 0.28598669940171145, + "grad_norm": 4.886594295501709, + "learning_rate": 2.9330996595761184e-06, + "loss": 1.2302, + "step": 5999 + }, + { + "epoch": 0.28603437179701097, + "grad_norm": 1.7000819444656372, + "learning_rate": 2.930321072604917e-06, + "loss": 0.9731, + "step": 6000 + }, + { + "epoch": 0.2860820441923104, + "grad_norm": 1.326775312423706, + "learning_rate": 2.927543576412373e-06, + "loss": 0.6907, + "step": 6001 + }, + { + "epoch": 0.28612971658760994, + "grad_norm": 2.368781566619873, + "learning_rate": 2.9247671714270198e-06, + "loss": 0.2685, + "step": 6002 + }, + { + "epoch": 0.28617738898290945, + "grad_norm": 
1.3293931484222412, + "learning_rate": 2.921991858077234e-06, + "loss": 0.7275, + "step": 6003 + }, + { + "epoch": 0.28622506137820897, + "grad_norm": 1.5075607299804688, + "learning_rate": 2.919217636791213e-06, + "loss": 0.6446, + "step": 6004 + }, + { + "epoch": 0.2862727337735084, + "grad_norm": 11.684954643249512, + "learning_rate": 2.916444507996993e-06, + "loss": 0.5891, + "step": 6005 + }, + { + "epoch": 0.28632040616880794, + "grad_norm": 1.08614182472229, + "learning_rate": 2.9136724721224406e-06, + "loss": 0.4969, + "step": 6006 + }, + { + "epoch": 0.28636807856410745, + "grad_norm": 1.8808544874191284, + "learning_rate": 2.910901529595248e-06, + "loss": 1.0098, + "step": 6007 + }, + { + "epoch": 0.28641575095940697, + "grad_norm": 2.4806671142578125, + "learning_rate": 2.908131680842946e-06, + "loss": 0.3836, + "step": 6008 + }, + { + "epoch": 0.2864634233547065, + "grad_norm": 2.0469717979431152, + "learning_rate": 2.9053629262928974e-06, + "loss": 0.7834, + "step": 6009 + }, + { + "epoch": 0.28651109575000594, + "grad_norm": 1.9556336402893066, + "learning_rate": 2.9025952663722845e-06, + "loss": 0.7412, + "step": 6010 + }, + { + "epoch": 0.28655876814530545, + "grad_norm": 1.4475878477096558, + "learning_rate": 2.899828701508133e-06, + "loss": 0.1685, + "step": 6011 + }, + { + "epoch": 0.28660644054060497, + "grad_norm": 1.3044952154159546, + "learning_rate": 2.8970632321272983e-06, + "loss": 1.0663, + "step": 6012 + }, + { + "epoch": 0.2866541129359045, + "grad_norm": 1.0869470834732056, + "learning_rate": 2.894298858656458e-06, + "loss": 0.3633, + "step": 6013 + }, + { + "epoch": 0.28670178533120394, + "grad_norm": 1.657095193862915, + "learning_rate": 2.8915355815221293e-06, + "loss": 0.5961, + "step": 6014 + }, + { + "epoch": 0.28674945772650345, + "grad_norm": 2.124903678894043, + "learning_rate": 2.88877340115066e-06, + "loss": 0.5006, + "step": 6015 + }, + { + "epoch": 0.28679713012180297, + "grad_norm": 1.6909003257751465, + "learning_rate": 2.8860123179682244e-06, + "loss": 0.6944, + "step": 6016 + }, + { + "epoch": 0.2868448025171025, + "grad_norm": 1.5790278911590576, + "learning_rate": 2.883252332400823e-06, + "loss": 0.8222, + "step": 6017 + }, + { + "epoch": 0.286892474912402, + "grad_norm": 1.340075969696045, + "learning_rate": 2.8804934448743037e-06, + "loss": 0.3271, + "step": 6018 + }, + { + "epoch": 0.28694014730770145, + "grad_norm": 1.6675771474838257, + "learning_rate": 2.8777356558143255e-06, + "loss": 0.7405, + "step": 6019 + }, + { + "epoch": 0.28698781970300097, + "grad_norm": 1.3836605548858643, + "learning_rate": 2.87497896564639e-06, + "loss": 0.614, + "step": 6020 + }, + { + "epoch": 0.2870354920983005, + "grad_norm": 1.848780632019043, + "learning_rate": 2.8722233747958295e-06, + "loss": 0.5663, + "step": 6021 + }, + { + "epoch": 0.2870831644936, + "grad_norm": 1.4647642374038696, + "learning_rate": 2.869468883687798e-06, + "loss": 0.6524, + "step": 6022 + }, + { + "epoch": 0.2871308368888995, + "grad_norm": 2.593418598175049, + "learning_rate": 2.8667154927472875e-06, + "loss": 0.3415, + "step": 6023 + }, + { + "epoch": 0.28717850928419897, + "grad_norm": 2.556431531906128, + "learning_rate": 2.8639632023991204e-06, + "loss": 0.9608, + "step": 6024 + }, + { + "epoch": 0.2872261816794985, + "grad_norm": 1.3714070320129395, + "learning_rate": 2.861212013067941e-06, + "loss": 0.4729, + "step": 6025 + }, + { + "epoch": 0.287273854074798, + "grad_norm": 1.4538246393203735, + "learning_rate": 2.858461925178233e-06, + "loss": 0.7408, + "step": 6026 + 
}, + { + "epoch": 0.2873215264700975, + "grad_norm": 1.4156399965286255, + "learning_rate": 2.855712939154309e-06, + "loss": 0.5633, + "step": 6027 + }, + { + "epoch": 0.28736919886539697, + "grad_norm": 1.7659258842468262, + "learning_rate": 2.852965055420305e-06, + "loss": 0.3994, + "step": 6028 + }, + { + "epoch": 0.2874168712606965, + "grad_norm": 1.9925000667572021, + "learning_rate": 2.8502182744001903e-06, + "loss": 0.5116, + "step": 6029 + }, + { + "epoch": 0.287464543655996, + "grad_norm": 2.9892420768737793, + "learning_rate": 2.8474725965177717e-06, + "loss": 0.8502, + "step": 6030 + }, + { + "epoch": 0.2875122160512955, + "grad_norm": 1.7563321590423584, + "learning_rate": 2.8447280221966754e-06, + "loss": 1.0132, + "step": 6031 + }, + { + "epoch": 0.287559888446595, + "grad_norm": 1.1654222011566162, + "learning_rate": 2.841984551860356e-06, + "loss": 0.4381, + "step": 6032 + }, + { + "epoch": 0.2876075608418945, + "grad_norm": 5.544343948364258, + "learning_rate": 2.8392421859321105e-06, + "loss": 1.0829, + "step": 6033 + }, + { + "epoch": 0.287655233237194, + "grad_norm": 1.2452433109283447, + "learning_rate": 2.8365009248350515e-06, + "loss": 0.6825, + "step": 6034 + }, + { + "epoch": 0.2877029056324935, + "grad_norm": 3.3150670528411865, + "learning_rate": 2.83376076899213e-06, + "loss": 0.8066, + "step": 6035 + }, + { + "epoch": 0.287750578027793, + "grad_norm": 4.442493915557861, + "learning_rate": 2.831021718826126e-06, + "loss": 0.2103, + "step": 6036 + }, + { + "epoch": 0.2877982504230925, + "grad_norm": 2.108241319656372, + "learning_rate": 2.8282837747596428e-06, + "loss": 0.8832, + "step": 6037 + }, + { + "epoch": 0.287845922818392, + "grad_norm": 4.231440544128418, + "learning_rate": 2.8255469372151178e-06, + "loss": 1.0792, + "step": 6038 + }, + { + "epoch": 0.2878935952136915, + "grad_norm": 1.7912887334823608, + "learning_rate": 2.8228112066148173e-06, + "loss": 0.4263, + "step": 6039 + }, + { + "epoch": 0.287941267608991, + "grad_norm": 1.8442174196243286, + "learning_rate": 2.8200765833808406e-06, + "loss": 0.4157, + "step": 6040 + }, + { + "epoch": 0.28798894000429054, + "grad_norm": 1.9798344373703003, + "learning_rate": 2.8173430679351055e-06, + "loss": 0.7357, + "step": 6041 + }, + { + "epoch": 0.28803661239959, + "grad_norm": 1.8778464794158936, + "learning_rate": 2.8146106606993674e-06, + "loss": 0.3929, + "step": 6042 + }, + { + "epoch": 0.2880842847948895, + "grad_norm": 2.1753764152526855, + "learning_rate": 2.8118793620952125e-06, + "loss": 0.4436, + "step": 6043 + }, + { + "epoch": 0.288131957190189, + "grad_norm": 1.4933431148529053, + "learning_rate": 2.8091491725440454e-06, + "loss": 0.8653, + "step": 6044 + }, + { + "epoch": 0.28817962958548854, + "grad_norm": 1.1439440250396729, + "learning_rate": 2.8064200924671137e-06, + "loss": 0.8711, + "step": 6045 + }, + { + "epoch": 0.288227301980788, + "grad_norm": 1.400071382522583, + "learning_rate": 2.8036921222854776e-06, + "loss": 0.6402, + "step": 6046 + }, + { + "epoch": 0.2882749743760875, + "grad_norm": 1.5463074445724487, + "learning_rate": 2.8009652624200436e-06, + "loss": 0.3914, + "step": 6047 + }, + { + "epoch": 0.288322646771387, + "grad_norm": 1.7884753942489624, + "learning_rate": 2.7982395132915295e-06, + "loss": 0.5871, + "step": 6048 + }, + { + "epoch": 0.28837031916668654, + "grad_norm": 2.3855931758880615, + "learning_rate": 2.7955148753204995e-06, + "loss": 1.2087, + "step": 6049 + }, + { + "epoch": 0.28841799156198605, + "grad_norm": 1.2375513315200806, + "learning_rate": 
2.7927913489273284e-06, + "loss": 0.7577, + "step": 6050 + }, + { + "epoch": 0.2884656639572855, + "grad_norm": 1.47097909450531, + "learning_rate": 2.790068934532232e-06, + "loss": 0.6823, + "step": 6051 + }, + { + "epoch": 0.288513336352585, + "grad_norm": 1.063920497894287, + "learning_rate": 2.7873476325552538e-06, + "loss": 0.5109, + "step": 6052 + }, + { + "epoch": 0.28856100874788454, + "grad_norm": 1.6459465026855469, + "learning_rate": 2.784627443416258e-06, + "loss": 0.4553, + "step": 6053 + }, + { + "epoch": 0.28860868114318405, + "grad_norm": 1.9386135339736938, + "learning_rate": 2.7819083675349436e-06, + "loss": 0.8236, + "step": 6054 + }, + { + "epoch": 0.28865635353848357, + "grad_norm": 2.30006742477417, + "learning_rate": 2.779190405330838e-06, + "loss": 0.4692, + "step": 6055 + }, + { + "epoch": 0.288704025933783, + "grad_norm": 2.1627304553985596, + "learning_rate": 2.7764735572232916e-06, + "loss": 0.9222, + "step": 6056 + }, + { + "epoch": 0.28875169832908254, + "grad_norm": 1.87648344039917, + "learning_rate": 2.773757823631487e-06, + "loss": 0.6527, + "step": 6057 + }, + { + "epoch": 0.28879937072438205, + "grad_norm": 2.5315017700195312, + "learning_rate": 2.7710432049744363e-06, + "loss": 1.4156, + "step": 6058 + }, + { + "epoch": 0.28884704311968157, + "grad_norm": 1.2173198461532593, + "learning_rate": 2.768329701670972e-06, + "loss": 0.664, + "step": 6059 + }, + { + "epoch": 0.288894715514981, + "grad_norm": 1.634997844696045, + "learning_rate": 2.765617314139767e-06, + "loss": 0.4119, + "step": 6060 + }, + { + "epoch": 0.28894238791028054, + "grad_norm": 4.035872936248779, + "learning_rate": 2.7629060427993072e-06, + "loss": 0.127, + "step": 6061 + }, + { + "epoch": 0.28899006030558005, + "grad_norm": 2.1331849098205566, + "learning_rate": 2.7601958880679204e-06, + "loss": 0.8079, + "step": 6062 + }, + { + "epoch": 0.28903773270087957, + "grad_norm": 1.4828959703445435, + "learning_rate": 2.7574868503637496e-06, + "loss": 0.3732, + "step": 6063 + }, + { + "epoch": 0.2890854050961791, + "grad_norm": 1.6004363298416138, + "learning_rate": 2.754778930104778e-06, + "loss": 0.6978, + "step": 6064 + }, + { + "epoch": 0.28913307749147854, + "grad_norm": 1.9005931615829468, + "learning_rate": 2.7520721277088023e-06, + "loss": 0.7738, + "step": 6065 + }, + { + "epoch": 0.28918074988677805, + "grad_norm": 1.1089800596237183, + "learning_rate": 2.7493664435934574e-06, + "loss": 0.9114, + "step": 6066 + }, + { + "epoch": 0.28922842228207757, + "grad_norm": 0.8061476945877075, + "learning_rate": 2.7466618781762077e-06, + "loss": 0.2514, + "step": 6067 + }, + { + "epoch": 0.2892760946773771, + "grad_norm": 1.8242262601852417, + "learning_rate": 2.743958431874332e-06, + "loss": 0.8326, + "step": 6068 + }, + { + "epoch": 0.28932376707267654, + "grad_norm": 2.1115779876708984, + "learning_rate": 2.7412561051049468e-06, + "loss": 0.8542, + "step": 6069 + }, + { + "epoch": 0.28937143946797605, + "grad_norm": 1.728063941001892, + "learning_rate": 2.7385548982849974e-06, + "loss": 0.8097, + "step": 6070 + }, + { + "epoch": 0.28941911186327557, + "grad_norm": 1.3411555290222168, + "learning_rate": 2.7358548118312455e-06, + "loss": 0.6236, + "step": 6071 + }, + { + "epoch": 0.2894667842585751, + "grad_norm": 1.8006856441497803, + "learning_rate": 2.7331558461602905e-06, + "loss": 0.9321, + "step": 6072 + }, + { + "epoch": 0.2895144566538746, + "grad_norm": 1.475117564201355, + "learning_rate": 2.7304580016885564e-06, + "loss": 0.6531, + "step": 6073 + }, + { + "epoch": 
0.28956212904917406, + "grad_norm": 1.6995662450790405, + "learning_rate": 2.727761278832288e-06, + "loss": 0.5523, + "step": 6074 + }, + { + "epoch": 0.28960980144447357, + "grad_norm": 1.3316490650177002, + "learning_rate": 2.725065678007568e-06, + "loss": 0.6085, + "step": 6075 + }, + { + "epoch": 0.2896574738397731, + "grad_norm": 1.3850513696670532, + "learning_rate": 2.7223711996302935e-06, + "loss": 0.4116, + "step": 6076 + }, + { + "epoch": 0.2897051462350726, + "grad_norm": 2.90434193611145, + "learning_rate": 2.719677844116202e-06, + "loss": 1.1875, + "step": 6077 + }, + { + "epoch": 0.28975281863037206, + "grad_norm": 1.1194543838500977, + "learning_rate": 2.7169856118808414e-06, + "loss": 0.7299, + "step": 6078 + }, + { + "epoch": 0.28980049102567157, + "grad_norm": 1.3235876560211182, + "learning_rate": 2.714294503339602e-06, + "loss": 0.525, + "step": 6079 + }, + { + "epoch": 0.2898481634209711, + "grad_norm": 13.684040069580078, + "learning_rate": 2.7116045189076946e-06, + "loss": 0.9644, + "step": 6080 + }, + { + "epoch": 0.2898958358162706, + "grad_norm": 3.838590145111084, + "learning_rate": 2.708915659000151e-06, + "loss": 1.1306, + "step": 6081 + }, + { + "epoch": 0.2899435082115701, + "grad_norm": 1.4317800998687744, + "learning_rate": 2.706227924031838e-06, + "loss": 0.9855, + "step": 6082 + }, + { + "epoch": 0.28999118060686957, + "grad_norm": 1.6434540748596191, + "learning_rate": 2.7035413144174472e-06, + "loss": 0.4999, + "step": 6083 + }, + { + "epoch": 0.2900388530021691, + "grad_norm": 1.2791095972061157, + "learning_rate": 2.7008558305714905e-06, + "loss": 0.8824, + "step": 6084 + }, + { + "epoch": 0.2900865253974686, + "grad_norm": 4.311280250549316, + "learning_rate": 2.698171472908312e-06, + "loss": 0.7368, + "step": 6085 + }, + { + "epoch": 0.2901341977927681, + "grad_norm": 1.7473076581954956, + "learning_rate": 2.6954882418420836e-06, + "loss": 1.4275, + "step": 6086 + }, + { + "epoch": 0.2901818701880676, + "grad_norm": 1.281606674194336, + "learning_rate": 2.6928061377867954e-06, + "loss": 0.5668, + "step": 6087 + }, + { + "epoch": 0.2902295425833671, + "grad_norm": 2.753815174102783, + "learning_rate": 2.6901251611562695e-06, + "loss": 0.6141, + "step": 6088 + }, + { + "epoch": 0.2902772149786666, + "grad_norm": 2.2690324783325195, + "learning_rate": 2.6874453123641585e-06, + "loss": 1.1821, + "step": 6089 + }, + { + "epoch": 0.2903248873739661, + "grad_norm": 1.9901797771453857, + "learning_rate": 2.6847665918239273e-06, + "loss": 1.2156, + "step": 6090 + }, + { + "epoch": 0.2903725597692656, + "grad_norm": 1.8178085088729858, + "learning_rate": 2.682088999948882e-06, + "loss": 1.0537, + "step": 6091 + }, + { + "epoch": 0.2904202321645651, + "grad_norm": 1.5463037490844727, + "learning_rate": 2.679412537152143e-06, + "loss": 0.8833, + "step": 6092 + }, + { + "epoch": 0.2904679045598646, + "grad_norm": 1.2732456922531128, + "learning_rate": 2.67673720384666e-06, + "loss": 0.6408, + "step": 6093 + }, + { + "epoch": 0.2905155769551641, + "grad_norm": 1.9538484811782837, + "learning_rate": 2.6740630004452115e-06, + "loss": 0.783, + "step": 6094 + }, + { + "epoch": 0.2905632493504636, + "grad_norm": 2.7993714809417725, + "learning_rate": 2.6713899273604027e-06, + "loss": 0.3994, + "step": 6095 + }, + { + "epoch": 0.29061092174576314, + "grad_norm": 1.2971073389053345, + "learning_rate": 2.668717985004654e-06, + "loss": 0.6819, + "step": 6096 + }, + { + "epoch": 0.2906585941410626, + "grad_norm": 1.4017008543014526, + "learning_rate": 
2.6660471737902228e-06, + "loss": 0.4674, + "step": 6097 + }, + { + "epoch": 0.2907062665363621, + "grad_norm": 1.4421530961990356, + "learning_rate": 2.6633774941291912e-06, + "loss": 0.5829, + "step": 6098 + }, + { + "epoch": 0.2907539389316616, + "grad_norm": 1.4370533227920532, + "learning_rate": 2.6607089464334567e-06, + "loss": 0.6989, + "step": 6099 + }, + { + "epoch": 0.29080161132696114, + "grad_norm": 1.6371572017669678, + "learning_rate": 2.658041531114751e-06, + "loss": 0.845, + "step": 6100 + }, + { + "epoch": 0.2908492837222606, + "grad_norm": 1.198086142539978, + "learning_rate": 2.6553752485846327e-06, + "loss": 0.4365, + "step": 6101 + }, + { + "epoch": 0.2908969561175601, + "grad_norm": 1.3670740127563477, + "learning_rate": 2.652710099254476e-06, + "loss": 0.6445, + "step": 6102 + }, + { + "epoch": 0.2909446285128596, + "grad_norm": 1.2577002048492432, + "learning_rate": 2.650046083535489e-06, + "loss": 0.6932, + "step": 6103 + }, + { + "epoch": 0.29099230090815914, + "grad_norm": 1.0973515510559082, + "learning_rate": 2.6473832018387034e-06, + "loss": 0.5731, + "step": 6104 + }, + { + "epoch": 0.29103997330345865, + "grad_norm": 1.904301643371582, + "learning_rate": 2.64472145457497e-06, + "loss": 0.7919, + "step": 6105 + }, + { + "epoch": 0.2910876456987581, + "grad_norm": 1.3597952127456665, + "learning_rate": 2.642060842154974e-06, + "loss": 0.4704, + "step": 6106 + }, + { + "epoch": 0.2911353180940576, + "grad_norm": 1.5735416412353516, + "learning_rate": 2.639401364989218e-06, + "loss": 0.5481, + "step": 6107 + }, + { + "epoch": 0.29118299048935714, + "grad_norm": 1.0483876466751099, + "learning_rate": 2.6367430234880286e-06, + "loss": 0.5559, + "step": 6108 + }, + { + "epoch": 0.29123066288465665, + "grad_norm": 2.730222225189209, + "learning_rate": 2.634085818061565e-06, + "loss": 0.6675, + "step": 6109 + }, + { + "epoch": 0.29127833527995617, + "grad_norm": 1.515252947807312, + "learning_rate": 2.631429749119807e-06, + "loss": 0.8489, + "step": 6110 + }, + { + "epoch": 0.2913260076752556, + "grad_norm": 1.6235744953155518, + "learning_rate": 2.6287748170725545e-06, + "loss": 0.6381, + "step": 6111 + }, + { + "epoch": 0.29137368007055514, + "grad_norm": 1.1834958791732788, + "learning_rate": 2.62612102232944e-06, + "loss": 0.6782, + "step": 6112 + }, + { + "epoch": 0.29142135246585466, + "grad_norm": 1.1149227619171143, + "learning_rate": 2.6234683652999173e-06, + "loss": 0.5681, + "step": 6113 + }, + { + "epoch": 0.29146902486115417, + "grad_norm": 1.7449109554290771, + "learning_rate": 2.6208168463932595e-06, + "loss": 0.5997, + "step": 6114 + }, + { + "epoch": 0.2915166972564536, + "grad_norm": 1.2367126941680908, + "learning_rate": 2.618166466018571e-06, + "loss": 0.3397, + "step": 6115 + }, + { + "epoch": 0.29156436965175314, + "grad_norm": 1.2803328037261963, + "learning_rate": 2.6155172245847793e-06, + "loss": 0.6679, + "step": 6116 + }, + { + "epoch": 0.29161204204705266, + "grad_norm": 1.7904669046401978, + "learning_rate": 2.6128691225006376e-06, + "loss": 0.6717, + "step": 6117 + }, + { + "epoch": 0.29165971444235217, + "grad_norm": 2.405367612838745, + "learning_rate": 2.6102221601747136e-06, + "loss": 0.933, + "step": 6118 + }, + { + "epoch": 0.2917073868376517, + "grad_norm": 2.131350517272949, + "learning_rate": 2.607576338015414e-06, + "loss": 0.84, + "step": 6119 + }, + { + "epoch": 0.29175505923295114, + "grad_norm": 1.3268470764160156, + "learning_rate": 2.6049316564309546e-06, + "loss": 0.923, + "step": 6120 + }, + { + "epoch": 
0.29180273162825066, + "grad_norm": 1.7321757078170776, + "learning_rate": 2.60228811582939e-06, + "loss": 0.7449, + "step": 6121 + }, + { + "epoch": 0.29185040402355017, + "grad_norm": 4.641642093658447, + "learning_rate": 2.599645716618584e-06, + "loss": 0.5682, + "step": 6122 + }, + { + "epoch": 0.2918980764188497, + "grad_norm": 2.070122003555298, + "learning_rate": 2.597004459206238e-06, + "loss": 1.0381, + "step": 6123 + }, + { + "epoch": 0.29194574881414914, + "grad_norm": 1.9797791242599487, + "learning_rate": 2.5943643439998644e-06, + "loss": 0.7218, + "step": 6124 + }, + { + "epoch": 0.29199342120944866, + "grad_norm": 1.7723979949951172, + "learning_rate": 2.5917253714068104e-06, + "loss": 0.5357, + "step": 6125 + }, + { + "epoch": 0.29204109360474817, + "grad_norm": 3.6121599674224854, + "learning_rate": 2.589087541834243e-06, + "loss": 0.3378, + "step": 6126 + }, + { + "epoch": 0.2920887660000477, + "grad_norm": 1.2610667943954468, + "learning_rate": 2.5864508556891475e-06, + "loss": 0.5998, + "step": 6127 + }, + { + "epoch": 0.2921364383953472, + "grad_norm": 1.3514364957809448, + "learning_rate": 2.5838153133783405e-06, + "loss": 0.5067, + "step": 6128 + }, + { + "epoch": 0.29218411079064666, + "grad_norm": 2.05029034614563, + "learning_rate": 2.581180915308461e-06, + "loss": 0.2832, + "step": 6129 + }, + { + "epoch": 0.29223178318594617, + "grad_norm": 1.4210838079452515, + "learning_rate": 2.578547661885965e-06, + "loss": 0.9129, + "step": 6130 + }, + { + "epoch": 0.2922794555812457, + "grad_norm": 2.062899112701416, + "learning_rate": 2.5759155535171388e-06, + "loss": 0.9497, + "step": 6131 + }, + { + "epoch": 0.2923271279765452, + "grad_norm": 1.331218957901001, + "learning_rate": 2.5732845906080915e-06, + "loss": 0.7308, + "step": 6132 + }, + { + "epoch": 0.29237480037184466, + "grad_norm": 5.356071472167969, + "learning_rate": 2.570654773564749e-06, + "loss": 1.0458, + "step": 6133 + }, + { + "epoch": 0.29242247276714417, + "grad_norm": 1.890952229499817, + "learning_rate": 2.5680261027928676e-06, + "loss": 0.6532, + "step": 6134 + }, + { + "epoch": 0.2924701451624437, + "grad_norm": 1.5121945142745972, + "learning_rate": 2.565398578698026e-06, + "loss": 0.7223, + "step": 6135 + }, + { + "epoch": 0.2925178175577432, + "grad_norm": 1.5600584745407104, + "learning_rate": 2.5627722016856237e-06, + "loss": 0.8201, + "step": 6136 + }, + { + "epoch": 0.2925654899530427, + "grad_norm": 1.4974292516708374, + "learning_rate": 2.5601469721608786e-06, + "loss": 1.0114, + "step": 6137 + }, + { + "epoch": 0.29261316234834217, + "grad_norm": 1.9437029361724854, + "learning_rate": 2.557522890528842e-06, + "loss": 0.8659, + "step": 6138 + }, + { + "epoch": 0.2926608347436417, + "grad_norm": 1.3690195083618164, + "learning_rate": 2.554899957194379e-06, + "loss": 0.8612, + "step": 6139 + }, + { + "epoch": 0.2927085071389412, + "grad_norm": 2.2465035915374756, + "learning_rate": 2.5522781725621814e-06, + "loss": 1.0236, + "step": 6140 + }, + { + "epoch": 0.2927561795342407, + "grad_norm": 1.2816989421844482, + "learning_rate": 2.549657537036769e-06, + "loss": 0.3911, + "step": 6141 + }, + { + "epoch": 0.2928038519295402, + "grad_norm": 0.9845332503318787, + "learning_rate": 2.547038051022472e-06, + "loss": 0.5499, + "step": 6142 + }, + { + "epoch": 0.2928515243248397, + "grad_norm": 1.6904677152633667, + "learning_rate": 2.544419714923454e-06, + "loss": 1.073, + "step": 6143 + }, + { + "epoch": 0.2928991967201392, + "grad_norm": 1.983976125717163, + "learning_rate": 
2.5418025291436976e-06, + "loss": 0.635, + "step": 6144 + }, + { + "epoch": 0.2929468691154387, + "grad_norm": 1.4890094995498657, + "learning_rate": 2.539186494087005e-06, + "loss": 0.7808, + "step": 6145 + }, + { + "epoch": 0.2929945415107382, + "grad_norm": 1.6880234479904175, + "learning_rate": 2.5365716101570036e-06, + "loss": 0.9901, + "step": 6146 + }, + { + "epoch": 0.2930422139060377, + "grad_norm": 5.635754585266113, + "learning_rate": 2.533957877757148e-06, + "loss": 0.5273, + "step": 6147 + }, + { + "epoch": 0.2930898863013372, + "grad_norm": 3.41428804397583, + "learning_rate": 2.5313452972907027e-06, + "loss": 0.7776, + "step": 6148 + }, + { + "epoch": 0.2931375586966367, + "grad_norm": 1.9443776607513428, + "learning_rate": 2.5287338691607664e-06, + "loss": 0.5359, + "step": 6149 + }, + { + "epoch": 0.2931852310919362, + "grad_norm": 1.348767638206482, + "learning_rate": 2.5261235937702576e-06, + "loss": 0.574, + "step": 6150 + }, + { + "epoch": 0.29323290348723574, + "grad_norm": 1.3055534362792969, + "learning_rate": 2.523514471521913e-06, + "loss": 0.6479, + "step": 6151 + }, + { + "epoch": 0.2932805758825352, + "grad_norm": 1.907870888710022, + "learning_rate": 2.520906502818289e-06, + "loss": 0.7861, + "step": 6152 + }, + { + "epoch": 0.2933282482778347, + "grad_norm": 4.606992244720459, + "learning_rate": 2.518299688061772e-06, + "loss": 0.0486, + "step": 6153 + }, + { + "epoch": 0.2933759206731342, + "grad_norm": 1.4165542125701904, + "learning_rate": 2.5156940276545692e-06, + "loss": 0.4829, + "step": 6154 + }, + { + "epoch": 0.29342359306843374, + "grad_norm": 1.6055858135223389, + "learning_rate": 2.5130895219987015e-06, + "loss": 0.6591, + "step": 6155 + }, + { + "epoch": 0.2934712654637332, + "grad_norm": 1.3969099521636963, + "learning_rate": 2.5104861714960207e-06, + "loss": 0.7502, + "step": 6156 + }, + { + "epoch": 0.2935189378590327, + "grad_norm": 2.8612236976623535, + "learning_rate": 2.507883976548199e-06, + "loss": 0.7497, + "step": 6157 + }, + { + "epoch": 0.2935666102543322, + "grad_norm": 1.6466960906982422, + "learning_rate": 2.5052829375567232e-06, + "loss": 0.7623, + "step": 6158 + }, + { + "epoch": 0.29361428264963174, + "grad_norm": 3.407519817352295, + "learning_rate": 2.5026830549229097e-06, + "loss": 0.796, + "step": 6159 + }, + { + "epoch": 0.29366195504493126, + "grad_norm": 1.9417352676391602, + "learning_rate": 2.500084329047896e-06, + "loss": 0.9675, + "step": 6160 + }, + { + "epoch": 0.2937096274402307, + "grad_norm": 1.7109984159469604, + "learning_rate": 2.4974867603326337e-06, + "loss": 0.933, + "step": 6161 + }, + { + "epoch": 0.29375729983553023, + "grad_norm": 1.6186312437057495, + "learning_rate": 2.4948903491779032e-06, + "loss": 0.6186, + "step": 6162 + }, + { + "epoch": 0.29380497223082974, + "grad_norm": 1.6657383441925049, + "learning_rate": 2.492295095984306e-06, + "loss": 0.8199, + "step": 6163 + }, + { + "epoch": 0.29385264462612926, + "grad_norm": 1.8358724117279053, + "learning_rate": 2.4897010011522595e-06, + "loss": 0.7381, + "step": 6164 + }, + { + "epoch": 0.2939003170214287, + "grad_norm": 3.0252318382263184, + "learning_rate": 2.48710806508201e-06, + "loss": 0.5331, + "step": 6165 + }, + { + "epoch": 0.29394798941672823, + "grad_norm": 2.4788169860839844, + "learning_rate": 2.484516288173615e-06, + "loss": 0.9319, + "step": 6166 + }, + { + "epoch": 0.29399566181202774, + "grad_norm": 0.9535640478134155, + "learning_rate": 2.4819256708269655e-06, + "loss": 0.4834, + "step": 6167 + }, + { + "epoch": 
0.29404333420732726, + "grad_norm": 2.011082887649536, + "learning_rate": 2.47933621344176e-06, + "loss": 0.7738, + "step": 6168 + }, + { + "epoch": 0.29409100660262677, + "grad_norm": 2.474637031555176, + "learning_rate": 2.4767479164175323e-06, + "loss": 0.9388, + "step": 6169 + }, + { + "epoch": 0.29413867899792623, + "grad_norm": 2.801311492919922, + "learning_rate": 2.474160780153624e-06, + "loss": 0.5579, + "step": 6170 + }, + { + "epoch": 0.29418635139322574, + "grad_norm": 4.237105846405029, + "learning_rate": 2.471574805049206e-06, + "loss": 0.6163, + "step": 6171 + }, + { + "epoch": 0.29423402378852526, + "grad_norm": 1.6840664148330688, + "learning_rate": 2.468989991503271e-06, + "loss": 0.659, + "step": 6172 + }, + { + "epoch": 0.29428169618382477, + "grad_norm": 2.800570011138916, + "learning_rate": 2.4664063399146232e-06, + "loss": 0.6551, + "step": 6173 + }, + { + "epoch": 0.2943293685791243, + "grad_norm": 1.232972502708435, + "learning_rate": 2.4638238506818958e-06, + "loss": 0.7185, + "step": 6174 + }, + { + "epoch": 0.29437704097442374, + "grad_norm": 2.1023988723754883, + "learning_rate": 2.4612425242035432e-06, + "loss": 1.0223, + "step": 6175 + }, + { + "epoch": 0.29442471336972326, + "grad_norm": 1.7870928049087524, + "learning_rate": 2.4586623608778324e-06, + "loss": 0.8333, + "step": 6176 + }, + { + "epoch": 0.29447238576502277, + "grad_norm": 1.6844615936279297, + "learning_rate": 2.456083361102858e-06, + "loss": 0.6692, + "step": 6177 + }, + { + "epoch": 0.2945200581603223, + "grad_norm": 4.017955303192139, + "learning_rate": 2.453505525276537e-06, + "loss": 0.7432, + "step": 6178 + }, + { + "epoch": 0.29456773055562174, + "grad_norm": 2.203190565109253, + "learning_rate": 2.450928853796597e-06, + "loss": 0.5097, + "step": 6179 + }, + { + "epoch": 0.29461540295092126, + "grad_norm": 1.717183232307434, + "learning_rate": 2.4483533470605967e-06, + "loss": 0.9019, + "step": 6180 + }, + { + "epoch": 0.29466307534622077, + "grad_norm": 3.6508407592773438, + "learning_rate": 2.4457790054659057e-06, + "loss": 1.2841, + "step": 6181 + }, + { + "epoch": 0.2947107477415203, + "grad_norm": 2.009955644607544, + "learning_rate": 2.443205829409724e-06, + "loss": 0.9626, + "step": 6182 + }, + { + "epoch": 0.2947584201368198, + "grad_norm": 1.492617130279541, + "learning_rate": 2.440633819289059e-06, + "loss": 0.9316, + "step": 6183 + }, + { + "epoch": 0.29480609253211926, + "grad_norm": 2.5129506587982178, + "learning_rate": 2.4380629755007524e-06, + "loss": 0.8387, + "step": 6184 + }, + { + "epoch": 0.29485376492741877, + "grad_norm": 2.931331157684326, + "learning_rate": 2.4354932984414527e-06, + "loss": 0.6627, + "step": 6185 + }, + { + "epoch": 0.2949014373227183, + "grad_norm": 1.0418578386306763, + "learning_rate": 2.432924788507638e-06, + "loss": 0.4271, + "step": 6186 + }, + { + "epoch": 0.2949491097180178, + "grad_norm": 4.242718696594238, + "learning_rate": 2.430357446095606e-06, + "loss": 1.7003, + "step": 6187 + }, + { + "epoch": 0.29499678211331726, + "grad_norm": 1.9347028732299805, + "learning_rate": 2.427791271601465e-06, + "loss": 0.6363, + "step": 6188 + }, + { + "epoch": 0.29504445450861677, + "grad_norm": 1.222937822341919, + "learning_rate": 2.425226265421151e-06, + "loss": 0.5721, + "step": 6189 + }, + { + "epoch": 0.2950921269039163, + "grad_norm": 1.9609845876693726, + "learning_rate": 2.422662427950423e-06, + "loss": 0.5534, + "step": 6190 + }, + { + "epoch": 0.2951397992992158, + "grad_norm": 1.2237188816070557, + "learning_rate": 
2.4200997595848474e-06, + "loss": 0.6456, + "step": 6191 + }, + { + "epoch": 0.2951874716945153, + "grad_norm": 2.6468327045440674, + "learning_rate": 2.4175382607198217e-06, + "loss": 0.9785, + "step": 6192 + }, + { + "epoch": 0.29523514408981477, + "grad_norm": 1.7278664112091064, + "learning_rate": 2.4149779317505574e-06, + "loss": 0.9488, + "step": 6193 + }, + { + "epoch": 0.2952828164851143, + "grad_norm": 7.507183074951172, + "learning_rate": 2.4124187730720916e-06, + "loss": 0.3669, + "step": 6194 + }, + { + "epoch": 0.2953304888804138, + "grad_norm": 1.6379116773605347, + "learning_rate": 2.4098607850792712e-06, + "loss": 0.3944, + "step": 6195 + }, + { + "epoch": 0.2953781612757133, + "grad_norm": 1.0105853080749512, + "learning_rate": 2.4073039681667653e-06, + "loss": 0.3334, + "step": 6196 + }, + { + "epoch": 0.2954258336710128, + "grad_norm": 2.0226950645446777, + "learning_rate": 2.4047483227290715e-06, + "loss": 0.6971, + "step": 6197 + }, + { + "epoch": 0.2954735060663123, + "grad_norm": 1.2883707284927368, + "learning_rate": 2.4021938491604912e-06, + "loss": 0.6105, + "step": 6198 + }, + { + "epoch": 0.2955211784616118, + "grad_norm": 0.8866240382194519, + "learning_rate": 2.3996405478551586e-06, + "loss": 0.3959, + "step": 6199 + }, + { + "epoch": 0.2955688508569113, + "grad_norm": 1.6138339042663574, + "learning_rate": 2.3970884192070232e-06, + "loss": 0.5984, + "step": 6200 + }, + { + "epoch": 0.2956165232522108, + "grad_norm": 1.867600440979004, + "learning_rate": 2.3945374636098474e-06, + "loss": 1.014, + "step": 6201 + }, + { + "epoch": 0.2956641956475103, + "grad_norm": 2.4693214893341064, + "learning_rate": 2.3919876814572197e-06, + "loss": 0.8062, + "step": 6202 + }, + { + "epoch": 0.2957118680428098, + "grad_norm": 2.3167264461517334, + "learning_rate": 2.3894390731425486e-06, + "loss": 0.7653, + "step": 6203 + }, + { + "epoch": 0.2957595404381093, + "grad_norm": 1.9118690490722656, + "learning_rate": 2.3868916390590524e-06, + "loss": 0.7576, + "step": 6204 + }, + { + "epoch": 0.29580721283340883, + "grad_norm": 1.512364387512207, + "learning_rate": 2.384345379599775e-06, + "loss": 0.6581, + "step": 6205 + }, + { + "epoch": 0.29585488522870834, + "grad_norm": 3.281097888946533, + "learning_rate": 2.3818002951575834e-06, + "loss": 0.566, + "step": 6206 + }, + { + "epoch": 0.2959025576240078, + "grad_norm": 1.3613389730453491, + "learning_rate": 2.3792563861251506e-06, + "loss": 0.7637, + "step": 6207 + }, + { + "epoch": 0.2959502300193073, + "grad_norm": 1.4592924118041992, + "learning_rate": 2.3767136528949797e-06, + "loss": 0.8039, + "step": 6208 + }, + { + "epoch": 0.29599790241460683, + "grad_norm": 1.6873000860214233, + "learning_rate": 2.3741720958593896e-06, + "loss": 0.6793, + "step": 6209 + }, + { + "epoch": 0.29604557480990634, + "grad_norm": 3.2447853088378906, + "learning_rate": 2.371631715410512e-06, + "loss": 0.295, + "step": 6210 + }, + { + "epoch": 0.2960932472052058, + "grad_norm": 3.141862392425537, + "learning_rate": 2.3690925119403065e-06, + "loss": 0.4293, + "step": 6211 + }, + { + "epoch": 0.2961409196005053, + "grad_norm": 1.427262783050537, + "learning_rate": 2.3665544858405433e-06, + "loss": 0.7697, + "step": 6212 + }, + { + "epoch": 0.29618859199580483, + "grad_norm": 1.0451107025146484, + "learning_rate": 2.3640176375028103e-06, + "loss": 0.3854, + "step": 6213 + }, + { + "epoch": 0.29623626439110434, + "grad_norm": 1.004555583000183, + "learning_rate": 2.361481967318521e-06, + "loss": 0.3633, + "step": 6214 + }, + { + "epoch": 
0.29628393678640386, + "grad_norm": 1.7891265153884888, + "learning_rate": 2.3589474756789045e-06, + "loss": 0.4989, + "step": 6215 + }, + { + "epoch": 0.2963316091817033, + "grad_norm": 4.669949531555176, + "learning_rate": 2.3564141629750026e-06, + "loss": 1.2666, + "step": 6216 + }, + { + "epoch": 0.29637928157700283, + "grad_norm": 0.9578444361686707, + "learning_rate": 2.3538820295976816e-06, + "loss": 0.5031, + "step": 6217 + }, + { + "epoch": 0.29642695397230234, + "grad_norm": 2.856482982635498, + "learning_rate": 2.3513510759376266e-06, + "loss": 0.53, + "step": 6218 + }, + { + "epoch": 0.29647462636760186, + "grad_norm": 1.3544644117355347, + "learning_rate": 2.3488213023853325e-06, + "loss": 0.7535, + "step": 6219 + }, + { + "epoch": 0.2965222987629013, + "grad_norm": 1.7900073528289795, + "learning_rate": 2.3462927093311183e-06, + "loss": 0.7144, + "step": 6220 + }, + { + "epoch": 0.29656997115820083, + "grad_norm": 1.481199860572815, + "learning_rate": 2.343765297165125e-06, + "loss": 0.6453, + "step": 6221 + }, + { + "epoch": 0.29661764355350034, + "grad_norm": 1.6059015989303589, + "learning_rate": 2.341239066277299e-06, + "loss": 0.5346, + "step": 6222 + }, + { + "epoch": 0.29666531594879986, + "grad_norm": 2.2869980335235596, + "learning_rate": 2.3387140170574154e-06, + "loss": 0.6164, + "step": 6223 + }, + { + "epoch": 0.29671298834409937, + "grad_norm": 2.2377076148986816, + "learning_rate": 2.3361901498950656e-06, + "loss": 0.4721, + "step": 6224 + }, + { + "epoch": 0.29676066073939883, + "grad_norm": 1.2980705499649048, + "learning_rate": 2.333667465179651e-06, + "loss": 0.7414, + "step": 6225 + }, + { + "epoch": 0.29680833313469834, + "grad_norm": 1.2358131408691406, + "learning_rate": 2.3311459633004006e-06, + "loss": 0.7175, + "step": 6226 + }, + { + "epoch": 0.29685600552999786, + "grad_norm": 1.0507140159606934, + "learning_rate": 2.328625644646355e-06, + "loss": 0.2351, + "step": 6227 + }, + { + "epoch": 0.29690367792529737, + "grad_norm": 1.6839592456817627, + "learning_rate": 2.3261065096063696e-06, + "loss": 1.3605, + "step": 6228 + }, + { + "epoch": 0.2969513503205969, + "grad_norm": 2.320051908493042, + "learning_rate": 2.3235885585691243e-06, + "loss": 0.8316, + "step": 6229 + }, + { + "epoch": 0.29699902271589634, + "grad_norm": 1.6431183815002441, + "learning_rate": 2.3210717919231117e-06, + "loss": 0.6969, + "step": 6230 + }, + { + "epoch": 0.29704669511119586, + "grad_norm": 1.236509919166565, + "learning_rate": 2.318556210056648e-06, + "loss": 0.7098, + "step": 6231 + }, + { + "epoch": 0.29709436750649537, + "grad_norm": 1.4272632598876953, + "learning_rate": 2.3160418133578544e-06, + "loss": 1.0649, + "step": 6232 + }, + { + "epoch": 0.2971420399017949, + "grad_norm": 1.1121752262115479, + "learning_rate": 2.3135286022146785e-06, + "loss": 0.5511, + "step": 6233 + }, + { + "epoch": 0.29718971229709434, + "grad_norm": 1.8114746809005737, + "learning_rate": 2.3110165770148873e-06, + "loss": 0.7566, + "step": 6234 + }, + { + "epoch": 0.29723738469239386, + "grad_norm": 1.8374031782150269, + "learning_rate": 2.308505738146055e-06, + "loss": 0.6684, + "step": 6235 + }, + { + "epoch": 0.29728505708769337, + "grad_norm": 2.0366532802581787, + "learning_rate": 2.3059960859955798e-06, + "loss": 0.7003, + "step": 6236 + }, + { + "epoch": 0.2973327294829929, + "grad_norm": 2.616689920425415, + "learning_rate": 2.303487620950677e-06, + "loss": 0.6459, + "step": 6237 + }, + { + "epoch": 0.2973804018782924, + "grad_norm": 1.5783463716506958, + "learning_rate": 
2.3009803433983744e-06, + "loss": 0.585, + "step": 6238 + }, + { + "epoch": 0.29742807427359186, + "grad_norm": 1.1030124425888062, + "learning_rate": 2.2984742537255233e-06, + "loss": 0.698, + "step": 6239 + }, + { + "epoch": 0.29747574666889137, + "grad_norm": 1.8951327800750732, + "learning_rate": 2.2959693523187808e-06, + "loss": 0.8113, + "step": 6240 + }, + { + "epoch": 0.2975234190641909, + "grad_norm": 1.2447819709777832, + "learning_rate": 2.2934656395646336e-06, + "loss": 0.3373, + "step": 6241 + }, + { + "epoch": 0.2975710914594904, + "grad_norm": 1.1514697074890137, + "learning_rate": 2.290963115849375e-06, + "loss": 0.5508, + "step": 6242 + }, + { + "epoch": 0.29761876385478986, + "grad_norm": 2.6068334579467773, + "learning_rate": 2.2884617815591213e-06, + "loss": 1.0128, + "step": 6243 + }, + { + "epoch": 0.2976664362500894, + "grad_norm": 2.0510177612304688, + "learning_rate": 2.285961637079799e-06, + "loss": 0.8102, + "step": 6244 + }, + { + "epoch": 0.2977141086453889, + "grad_norm": 1.5474066734313965, + "learning_rate": 2.283462682797156e-06, + "loss": 0.9855, + "step": 6245 + }, + { + "epoch": 0.2977617810406884, + "grad_norm": 1.4880986213684082, + "learning_rate": 2.2809649190967597e-06, + "loss": 0.9896, + "step": 6246 + }, + { + "epoch": 0.2978094534359879, + "grad_norm": 1.9403513669967651, + "learning_rate": 2.2784683463639832e-06, + "loss": 0.499, + "step": 6247 + }, + { + "epoch": 0.2978571258312874, + "grad_norm": 1.157433271408081, + "learning_rate": 2.2759729649840232e-06, + "loss": 0.7567, + "step": 6248 + }, + { + "epoch": 0.2979047982265869, + "grad_norm": 2.5560498237609863, + "learning_rate": 2.2734787753418965e-06, + "loss": 0.8808, + "step": 6249 + }, + { + "epoch": 0.2979524706218864, + "grad_norm": 2.112027168273926, + "learning_rate": 2.2709857778224244e-06, + "loss": 1.1075, + "step": 6250 + }, + { + "epoch": 0.2980001430171859, + "grad_norm": 1.8769203424453735, + "learning_rate": 2.2684939728102528e-06, + "loss": 0.696, + "step": 6251 + }, + { + "epoch": 0.2980478154124854, + "grad_norm": 1.9187896251678467, + "learning_rate": 2.2660033606898447e-06, + "loss": 0.4853, + "step": 6252 + }, + { + "epoch": 0.2980954878077849, + "grad_norm": 1.8584184646606445, + "learning_rate": 2.263513941845471e-06, + "loss": 0.8023, + "step": 6253 + }, + { + "epoch": 0.2981431602030844, + "grad_norm": 7.00104284286499, + "learning_rate": 2.261025716661225e-06, + "loss": 0.8411, + "step": 6254 + }, + { + "epoch": 0.2981908325983839, + "grad_norm": 1.5704630613327026, + "learning_rate": 2.2585386855210177e-06, + "loss": 0.6705, + "step": 6255 + }, + { + "epoch": 0.29823850499368343, + "grad_norm": 1.3708139657974243, + "learning_rate": 2.256052848808571e-06, + "loss": 0.6181, + "step": 6256 + }, + { + "epoch": 0.2982861773889829, + "grad_norm": 0.9261099696159363, + "learning_rate": 2.2535682069074183e-06, + "loss": 0.4078, + "step": 6257 + }, + { + "epoch": 0.2983338497842824, + "grad_norm": 1.676835060119629, + "learning_rate": 2.251084760200921e-06, + "loss": 0.9565, + "step": 6258 + }, + { + "epoch": 0.2983815221795819, + "grad_norm": 1.5448845624923706, + "learning_rate": 2.248602509072245e-06, + "loss": 0.7191, + "step": 6259 + }, + { + "epoch": 0.29842919457488143, + "grad_norm": 2.3571763038635254, + "learning_rate": 2.2461214539043773e-06, + "loss": 1.0151, + "step": 6260 + }, + { + "epoch": 0.29847686697018094, + "grad_norm": 1.806689739227295, + "learning_rate": 2.2436415950801228e-06, + "loss": 0.6715, + "step": 6261 + }, + { + "epoch": 
0.2985245393654804, + "grad_norm": 0.9933075308799744, + "learning_rate": 2.241162932982093e-06, + "loss": 0.2741, + "step": 6262 + }, + { + "epoch": 0.2985722117607799, + "grad_norm": 1.2085294723510742, + "learning_rate": 2.2386854679927215e-06, + "loss": 0.444, + "step": 6263 + }, + { + "epoch": 0.29861988415607943, + "grad_norm": 1.617403507232666, + "learning_rate": 2.2362092004942583e-06, + "loss": 0.8379, + "step": 6264 + }, + { + "epoch": 0.29866755655137894, + "grad_norm": 1.9029408693313599, + "learning_rate": 2.233734130868762e-06, + "loss": 0.7259, + "step": 6265 + }, + { + "epoch": 0.2987152289466784, + "grad_norm": 2.145231246948242, + "learning_rate": 2.2312602594981126e-06, + "loss": 0.6003, + "step": 6266 + }, + { + "epoch": 0.2987629013419779, + "grad_norm": 1.2889387607574463, + "learning_rate": 2.228787586764004e-06, + "loss": 0.6624, + "step": 6267 + }, + { + "epoch": 0.29881057373727743, + "grad_norm": 2.299807548522949, + "learning_rate": 2.2263161130479405e-06, + "loss": 0.7913, + "step": 6268 + }, + { + "epoch": 0.29885824613257694, + "grad_norm": 2.704833984375, + "learning_rate": 2.2238458387312476e-06, + "loss": 1.0125, + "step": 6269 + }, + { + "epoch": 0.29890591852787646, + "grad_norm": 1.1400604248046875, + "learning_rate": 2.2213767641950658e-06, + "loss": 0.4367, + "step": 6270 + }, + { + "epoch": 0.2989535909231759, + "grad_norm": 2.1746912002563477, + "learning_rate": 2.2189088898203446e-06, + "loss": 0.6238, + "step": 6271 + }, + { + "epoch": 0.29900126331847543, + "grad_norm": 1.5861088037490845, + "learning_rate": 2.2164422159878496e-06, + "loss": 0.8999, + "step": 6272 + }, + { + "epoch": 0.29904893571377494, + "grad_norm": 1.3148717880249023, + "learning_rate": 2.2139767430781654e-06, + "loss": 0.6231, + "step": 6273 + }, + { + "epoch": 0.29909660810907446, + "grad_norm": 1.5059646368026733, + "learning_rate": 2.211512471471692e-06, + "loss": 0.5306, + "step": 6274 + }, + { + "epoch": 0.2991442805043739, + "grad_norm": 1.476387619972229, + "learning_rate": 2.2090494015486354e-06, + "loss": 0.727, + "step": 6275 + }, + { + "epoch": 0.29919195289967343, + "grad_norm": 1.2458332777023315, + "learning_rate": 2.206587533689025e-06, + "loss": 0.8143, + "step": 6276 + }, + { + "epoch": 0.29923962529497294, + "grad_norm": 2.3385074138641357, + "learning_rate": 2.2041268682727034e-06, + "loss": 0.9914, + "step": 6277 + }, + { + "epoch": 0.29928729769027246, + "grad_norm": 1.990761637687683, + "learning_rate": 2.2016674056793232e-06, + "loss": 0.822, + "step": 6278 + }, + { + "epoch": 0.29933497008557197, + "grad_norm": 1.5822852849960327, + "learning_rate": 2.1992091462883537e-06, + "loss": 0.4592, + "step": 6279 + }, + { + "epoch": 0.29938264248087143, + "grad_norm": 1.595819354057312, + "learning_rate": 2.196752090479083e-06, + "loss": 0.7642, + "step": 6280 + }, + { + "epoch": 0.29943031487617094, + "grad_norm": 2.5592379570007324, + "learning_rate": 2.194296238630604e-06, + "loss": 0.4591, + "step": 6281 + }, + { + "epoch": 0.29947798727147046, + "grad_norm": 1.5908269882202148, + "learning_rate": 2.1918415911218327e-06, + "loss": 0.3705, + "step": 6282 + }, + { + "epoch": 0.29952565966676997, + "grad_norm": 1.236393690109253, + "learning_rate": 2.189388148331498e-06, + "loss": 0.6536, + "step": 6283 + }, + { + "epoch": 0.2995733320620695, + "grad_norm": 2.7467339038848877, + "learning_rate": 2.186935910638136e-06, + "loss": 1.3771, + "step": 6284 + }, + { + "epoch": 0.29962100445736894, + "grad_norm": 1.7299925088882446, + "learning_rate": 
2.1844848784201067e-06, + "loss": 0.9504, + "step": 6285 + }, + { + "epoch": 0.29966867685266846, + "grad_norm": 1.920395851135254, + "learning_rate": 2.182035052055573e-06, + "loss": 0.5913, + "step": 6286 + }, + { + "epoch": 0.299716349247968, + "grad_norm": 0.8589304089546204, + "learning_rate": 2.1795864319225246e-06, + "loss": 0.3922, + "step": 6287 + }, + { + "epoch": 0.2997640216432675, + "grad_norm": 1.6341394186019897, + "learning_rate": 2.177139018398752e-06, + "loss": 1.0774, + "step": 6288 + }, + { + "epoch": 0.29981169403856694, + "grad_norm": 1.6117527484893799, + "learning_rate": 2.1746928118618717e-06, + "loss": 0.7447, + "step": 6289 + }, + { + "epoch": 0.29985936643386646, + "grad_norm": 1.3888081312179565, + "learning_rate": 2.1722478126893022e-06, + "loss": 0.7575, + "step": 6290 + }, + { + "epoch": 0.299907038829166, + "grad_norm": 3.7830941677093506, + "learning_rate": 2.1698040212582862e-06, + "loss": 0.8668, + "step": 6291 + }, + { + "epoch": 0.2999547112244655, + "grad_norm": 1.5668525695800781, + "learning_rate": 2.167361437945876e-06, + "loss": 0.9243, + "step": 6292 + }, + { + "epoch": 0.300002383619765, + "grad_norm": 3.770625114440918, + "learning_rate": 2.1649200631289322e-06, + "loss": 0.2455, + "step": 6293 + }, + { + "epoch": 0.30005005601506446, + "grad_norm": 2.0723373889923096, + "learning_rate": 2.162479897184139e-06, + "loss": 0.9181, + "step": 6294 + }, + { + "epoch": 0.300097728410364, + "grad_norm": 3.0751869678497314, + "learning_rate": 2.1600409404879875e-06, + "loss": 0.5347, + "step": 6295 + }, + { + "epoch": 0.3001454008056635, + "grad_norm": 1.8688127994537354, + "learning_rate": 2.157603193416781e-06, + "loss": 0.6948, + "step": 6296 + }, + { + "epoch": 0.300193073200963, + "grad_norm": 1.7637250423431396, + "learning_rate": 2.1551666563466413e-06, + "loss": 0.674, + "step": 6297 + }, + { + "epoch": 0.30024074559626246, + "grad_norm": 2.5037527084350586, + "learning_rate": 2.152731329653502e-06, + "loss": 0.6819, + "step": 6298 + }, + { + "epoch": 0.300288417991562, + "grad_norm": 1.2383288145065308, + "learning_rate": 2.150297213713105e-06, + "loss": 0.6406, + "step": 6299 + }, + { + "epoch": 0.3003360903868615, + "grad_norm": 9.533638000488281, + "learning_rate": 2.1478643089010143e-06, + "loss": 1.0306, + "step": 6300 + }, + { + "epoch": 0.300383762782161, + "grad_norm": 3.421718120574951, + "learning_rate": 2.1454326155925966e-06, + "loss": 0.8606, + "step": 6301 + }, + { + "epoch": 0.3004314351774605, + "grad_norm": 2.745079278945923, + "learning_rate": 2.1430021341630424e-06, + "loss": 0.4594, + "step": 6302 + }, + { + "epoch": 0.30047910757276, + "grad_norm": 1.7435271739959717, + "learning_rate": 2.1405728649873458e-06, + "loss": 0.7596, + "step": 6303 + }, + { + "epoch": 0.3005267799680595, + "grad_norm": 1.6694341897964478, + "learning_rate": 2.138144808440321e-06, + "loss": 0.8125, + "step": 6304 + }, + { + "epoch": 0.300574452363359, + "grad_norm": 1.1263213157653809, + "learning_rate": 2.13571796489659e-06, + "loss": 0.6427, + "step": 6305 + }, + { + "epoch": 0.3006221247586585, + "grad_norm": 7.258553504943848, + "learning_rate": 2.133292334730589e-06, + "loss": 0.6564, + "step": 6306 + }, + { + "epoch": 0.300669797153958, + "grad_norm": 1.4790229797363281, + "learning_rate": 2.1308679183165693e-06, + "loss": 0.841, + "step": 6307 + }, + { + "epoch": 0.3007174695492575, + "grad_norm": 2.84000301361084, + "learning_rate": 2.128444716028597e-06, + "loss": 1.2394, + "step": 6308 + }, + { + "epoch": 0.300765141944557, + 
"grad_norm": 1.6635762453079224, + "learning_rate": 2.12602272824054e-06, + "loss": 0.8609, + "step": 6309 + }, + { + "epoch": 0.3008128143398565, + "grad_norm": 1.3851314783096313, + "learning_rate": 2.123601955326091e-06, + "loss": 0.6644, + "step": 6310 + }, + { + "epoch": 0.30086048673515603, + "grad_norm": 1.7767314910888672, + "learning_rate": 2.1211823976587508e-06, + "loss": 0.8282, + "step": 6311 + }, + { + "epoch": 0.3009081591304555, + "grad_norm": 1.740729808807373, + "learning_rate": 2.118764055611828e-06, + "loss": 0.829, + "step": 6312 + }, + { + "epoch": 0.300955831525755, + "grad_norm": 1.134680986404419, + "learning_rate": 2.1163469295584504e-06, + "loss": 0.4486, + "step": 6313 + }, + { + "epoch": 0.3010035039210545, + "grad_norm": 1.057878017425537, + "learning_rate": 2.113931019871559e-06, + "loss": 0.4592, + "step": 6314 + }, + { + "epoch": 0.30105117631635403, + "grad_norm": 2.1879589557647705, + "learning_rate": 2.1115163269238992e-06, + "loss": 1.0933, + "step": 6315 + }, + { + "epoch": 0.30109884871165354, + "grad_norm": 2.1401851177215576, + "learning_rate": 2.109102851088033e-06, + "loss": 0.4979, + "step": 6316 + }, + { + "epoch": 0.301146521106953, + "grad_norm": 1.6667444705963135, + "learning_rate": 2.106690592736338e-06, + "loss": 0.8235, + "step": 6317 + }, + { + "epoch": 0.3011941935022525, + "grad_norm": 0.9434000253677368, + "learning_rate": 2.1042795522409977e-06, + "loss": 0.0076, + "step": 6318 + }, + { + "epoch": 0.30124186589755203, + "grad_norm": 1.4988133907318115, + "learning_rate": 2.101869729974011e-06, + "loss": 0.7448, + "step": 6319 + }, + { + "epoch": 0.30128953829285154, + "grad_norm": 1.2174264192581177, + "learning_rate": 2.099461126307194e-06, + "loss": 0.5325, + "step": 6320 + }, + { + "epoch": 0.301337210688151, + "grad_norm": 1.5915915966033936, + "learning_rate": 2.0970537416121617e-06, + "loss": 0.795, + "step": 6321 + }, + { + "epoch": 0.3013848830834505, + "grad_norm": 2.000627040863037, + "learning_rate": 2.0946475762603525e-06, + "loss": 0.5215, + "step": 6322 + }, + { + "epoch": 0.30143255547875003, + "grad_norm": 1.5578094720840454, + "learning_rate": 2.092242630623016e-06, + "loss": 0.8091, + "step": 6323 + }, + { + "epoch": 0.30148022787404954, + "grad_norm": 1.8483134508132935, + "learning_rate": 2.0898389050712044e-06, + "loss": 0.6688, + "step": 6324 + }, + { + "epoch": 0.30152790026934906, + "grad_norm": 1.9468728303909302, + "learning_rate": 2.0874363999757906e-06, + "loss": 0.7893, + "step": 6325 + }, + { + "epoch": 0.3015755726646485, + "grad_norm": 1.6190681457519531, + "learning_rate": 2.08503511570746e-06, + "loss": 0.8619, + "step": 6326 + }, + { + "epoch": 0.30162324505994803, + "grad_norm": 2.691166400909424, + "learning_rate": 2.0826350526367e-06, + "loss": 1.1333, + "step": 6327 + }, + { + "epoch": 0.30167091745524754, + "grad_norm": 2.630610466003418, + "learning_rate": 2.0802362111338183e-06, + "loss": 1.0579, + "step": 6328 + }, + { + "epoch": 0.30171858985054706, + "grad_norm": 1.3825013637542725, + "learning_rate": 2.0778385915689336e-06, + "loss": 0.6147, + "step": 6329 + }, + { + "epoch": 0.3017662622458465, + "grad_norm": 1.8687647581100464, + "learning_rate": 2.0754421943119695e-06, + "loss": 0.7765, + "step": 6330 + }, + { + "epoch": 0.30181393464114603, + "grad_norm": 4.715604782104492, + "learning_rate": 2.0730470197326702e-06, + "loss": 1.0323, + "step": 6331 + }, + { + "epoch": 0.30186160703644555, + "grad_norm": 1.8607902526855469, + "learning_rate": 2.0706530682005833e-06, + "loss": 0.441, + 
"step": 6332 + }, + { + "epoch": 0.30190927943174506, + "grad_norm": 2.3508448600769043, + "learning_rate": 2.06826034008507e-06, + "loss": 0.3873, + "step": 6333 + }, + { + "epoch": 0.3019569518270446, + "grad_norm": 5.49057674407959, + "learning_rate": 2.0658688357553036e-06, + "loss": 0.4204, + "step": 6334 + }, + { + "epoch": 0.30200462422234403, + "grad_norm": 1.716123342514038, + "learning_rate": 2.063478555580274e-06, + "loss": 0.6531, + "step": 6335 + }, + { + "epoch": 0.30205229661764355, + "grad_norm": 1.1376465559005737, + "learning_rate": 2.06108949992877e-06, + "loss": 0.6853, + "step": 6336 + }, + { + "epoch": 0.30209996901294306, + "grad_norm": 1.116325855255127, + "learning_rate": 2.0587016691694006e-06, + "loss": 0.8409, + "step": 6337 + }, + { + "epoch": 0.3021476414082426, + "grad_norm": 1.5302807092666626, + "learning_rate": 2.0563150636705873e-06, + "loss": 0.6467, + "step": 6338 + }, + { + "epoch": 0.30219531380354203, + "grad_norm": 1.5142567157745361, + "learning_rate": 2.053929683800553e-06, + "loss": 0.9118, + "step": 6339 + }, + { + "epoch": 0.30224298619884155, + "grad_norm": 3.894611120223999, + "learning_rate": 2.05154552992734e-06, + "loss": 0.9286, + "step": 6340 + }, + { + "epoch": 0.30229065859414106, + "grad_norm": 1.4084447622299194, + "learning_rate": 2.0491626024188005e-06, + "loss": 0.5777, + "step": 6341 + }, + { + "epoch": 0.3023383309894406, + "grad_norm": 2.5469160079956055, + "learning_rate": 2.046780901642591e-06, + "loss": 0.9876, + "step": 6342 + }, + { + "epoch": 0.3023860033847401, + "grad_norm": 1.5295445919036865, + "learning_rate": 2.0444004279661866e-06, + "loss": 0.7402, + "step": 6343 + }, + { + "epoch": 0.30243367578003955, + "grad_norm": 1.7182564735412598, + "learning_rate": 2.0420211817568724e-06, + "loss": 0.7154, + "step": 6344 + }, + { + "epoch": 0.30248134817533906, + "grad_norm": 6.909396648406982, + "learning_rate": 2.0396431633817348e-06, + "loss": 0.459, + "step": 6345 + }, + { + "epoch": 0.3025290205706386, + "grad_norm": 1.3105229139328003, + "learning_rate": 2.0372663732076847e-06, + "loss": 0.9715, + "step": 6346 + }, + { + "epoch": 0.3025766929659381, + "grad_norm": 3.02335786819458, + "learning_rate": 2.03489081160143e-06, + "loss": 0.295, + "step": 6347 + }, + { + "epoch": 0.3026243653612376, + "grad_norm": 1.4509479999542236, + "learning_rate": 2.0325164789295004e-06, + "loss": 0.8722, + "step": 6348 + }, + { + "epoch": 0.30267203775653706, + "grad_norm": 3.1525590419769287, + "learning_rate": 2.0301433755582266e-06, + "loss": 0.6557, + "step": 6349 + }, + { + "epoch": 0.3027197101518366, + "grad_norm": 1.3639476299285889, + "learning_rate": 2.027771501853757e-06, + "loss": 0.7237, + "step": 6350 + }, + { + "epoch": 0.3027673825471361, + "grad_norm": 1.5575635433197021, + "learning_rate": 2.025400858182048e-06, + "loss": 0.6297, + "step": 6351 + }, + { + "epoch": 0.3028150549424356, + "grad_norm": 1.4706881046295166, + "learning_rate": 2.0230314449088626e-06, + "loss": 0.7896, + "step": 6352 + }, + { + "epoch": 0.30286272733773506, + "grad_norm": 1.6276911497116089, + "learning_rate": 2.020663262399778e-06, + "loss": 0.9242, + "step": 6353 + }, + { + "epoch": 0.3029103997330346, + "grad_norm": 1.9563448429107666, + "learning_rate": 2.0182963110201823e-06, + "loss": 0.7302, + "step": 6354 + }, + { + "epoch": 0.3029580721283341, + "grad_norm": 1.1066612005233765, + "learning_rate": 2.0159305911352688e-06, + "loss": 0.5124, + "step": 6355 + }, + { + "epoch": 0.3030057445236336, + "grad_norm": 1.9852733612060547, + 
"learning_rate": 2.013566103110045e-06, + "loss": 0.6534, + "step": 6356 + }, + { + "epoch": 0.3030534169189331, + "grad_norm": 3.199556350708008, + "learning_rate": 2.0112028473093294e-06, + "loss": 1.087, + "step": 6357 + }, + { + "epoch": 0.3031010893142326, + "grad_norm": 1.620105266571045, + "learning_rate": 2.008840824097743e-06, + "loss": 0.4853, + "step": 6358 + }, + { + "epoch": 0.3031487617095321, + "grad_norm": 0.9042674899101257, + "learning_rate": 2.006480033839728e-06, + "loss": 0.4665, + "step": 6359 + }, + { + "epoch": 0.3031964341048316, + "grad_norm": 3.8102097511291504, + "learning_rate": 2.0041204768995225e-06, + "loss": 1.7765, + "step": 6360 + }, + { + "epoch": 0.3032441065001311, + "grad_norm": 1.1288888454437256, + "learning_rate": 2.001762153641189e-06, + "loss": 0.6047, + "step": 6361 + }, + { + "epoch": 0.3032917788954306, + "grad_norm": 1.6786080598831177, + "learning_rate": 1.999405064428587e-06, + "loss": 0.6556, + "step": 6362 + }, + { + "epoch": 0.3033394512907301, + "grad_norm": 1.3288825750350952, + "learning_rate": 1.9970492096253955e-06, + "loss": 0.5948, + "step": 6363 + }, + { + "epoch": 0.3033871236860296, + "grad_norm": 1.5415656566619873, + "learning_rate": 1.9946945895950943e-06, + "loss": 0.8503, + "step": 6364 + }, + { + "epoch": 0.3034347960813291, + "grad_norm": 1.058382511138916, + "learning_rate": 1.9923412047009794e-06, + "loss": 0.5966, + "step": 6365 + }, + { + "epoch": 0.30348246847662863, + "grad_norm": 1.4036825895309448, + "learning_rate": 1.9899890553061565e-06, + "loss": 0.7594, + "step": 6366 + }, + { + "epoch": 0.3035301408719281, + "grad_norm": 1.6590728759765625, + "learning_rate": 1.9876381417735312e-06, + "loss": 0.6407, + "step": 6367 + }, + { + "epoch": 0.3035778132672276, + "grad_norm": 1.2488852739334106, + "learning_rate": 1.98528846446583e-06, + "loss": 0.5549, + "step": 6368 + }, + { + "epoch": 0.3036254856625271, + "grad_norm": 1.3086576461791992, + "learning_rate": 1.9829400237455865e-06, + "loss": 0.6662, + "step": 6369 + }, + { + "epoch": 0.30367315805782663, + "grad_norm": 1.4790866374969482, + "learning_rate": 1.9805928199751336e-06, + "loss": 0.6447, + "step": 6370 + }, + { + "epoch": 0.30372083045312614, + "grad_norm": 4.150673866271973, + "learning_rate": 1.9782468535166253e-06, + "loss": 0.4286, + "step": 6371 + }, + { + "epoch": 0.3037685028484256, + "grad_norm": 1.3232091665267944, + "learning_rate": 1.975902124732022e-06, + "loss": 0.7318, + "step": 6372 + }, + { + "epoch": 0.3038161752437251, + "grad_norm": 1.5176620483398438, + "learning_rate": 1.973558633983087e-06, + "loss": 1.1292, + "step": 6373 + }, + { + "epoch": 0.30386384763902463, + "grad_norm": 2.3007333278656006, + "learning_rate": 1.971216381631397e-06, + "loss": 0.6538, + "step": 6374 + }, + { + "epoch": 0.30391152003432415, + "grad_norm": 1.425936222076416, + "learning_rate": 1.968875368038342e-06, + "loss": 0.8703, + "step": 6375 + }, + { + "epoch": 0.3039591924296236, + "grad_norm": 2.337597370147705, + "learning_rate": 1.9665355935651133e-06, + "loss": 0.3978, + "step": 6376 + }, + { + "epoch": 0.3040068648249231, + "grad_norm": 1.9509705305099487, + "learning_rate": 1.964197058572711e-06, + "loss": 0.7741, + "step": 6377 + }, + { + "epoch": 0.30405453722022263, + "grad_norm": 1.8793994188308716, + "learning_rate": 1.961859763421953e-06, + "loss": 0.6466, + "step": 6378 + }, + { + "epoch": 0.30410220961552215, + "grad_norm": 3.5728471279144287, + "learning_rate": 1.959523708473453e-06, + "loss": 1.5575, + "step": 6379 + }, + { + "epoch": 
0.30414988201082166, + "grad_norm": 2.1158430576324463, + "learning_rate": 1.9571888940876436e-06, + "loss": 0.5644, + "step": 6380 + }, + { + "epoch": 0.3041975544061211, + "grad_norm": 1.3865089416503906, + "learning_rate": 1.9548553206247667e-06, + "loss": 0.7426, + "step": 6381 + }, + { + "epoch": 0.30424522680142063, + "grad_norm": 1.958948016166687, + "learning_rate": 1.9525229884448624e-06, + "loss": 0.905, + "step": 6382 + }, + { + "epoch": 0.30429289919672015, + "grad_norm": 1.395885705947876, + "learning_rate": 1.9501918979077874e-06, + "loss": 0.6477, + "step": 6383 + }, + { + "epoch": 0.30434057159201966, + "grad_norm": 1.6745498180389404, + "learning_rate": 1.947862049373206e-06, + "loss": 0.7298, + "step": 6384 + }, + { + "epoch": 0.3043882439873191, + "grad_norm": 1.6946300268173218, + "learning_rate": 1.945533443200591e-06, + "loss": 0.5748, + "step": 6385 + }, + { + "epoch": 0.30443591638261863, + "grad_norm": 1.435052752494812, + "learning_rate": 1.9432060797492193e-06, + "loss": 0.7157, + "step": 6386 + }, + { + "epoch": 0.30448358877791815, + "grad_norm": 1.2148957252502441, + "learning_rate": 1.94087995937818e-06, + "loss": 0.768, + "step": 6387 + }, + { + "epoch": 0.30453126117321766, + "grad_norm": 1.5300604104995728, + "learning_rate": 1.9385550824463727e-06, + "loss": 0.7784, + "step": 6388 + }, + { + "epoch": 0.3045789335685172, + "grad_norm": 1.5830923318862915, + "learning_rate": 1.9362314493124965e-06, + "loss": 0.9742, + "step": 6389 + }, + { + "epoch": 0.30462660596381663, + "grad_norm": 1.7905744314193726, + "learning_rate": 1.9339090603350698e-06, + "loss": 0.677, + "step": 6390 + }, + { + "epoch": 0.30467427835911615, + "grad_norm": 1.3788014650344849, + "learning_rate": 1.9315879158724106e-06, + "loss": 0.5513, + "step": 6391 + }, + { + "epoch": 0.30472195075441566, + "grad_norm": 2.3503267765045166, + "learning_rate": 1.929268016282645e-06, + "loss": 0.6906, + "step": 6392 + }, + { + "epoch": 0.3047696231497152, + "grad_norm": 1.0267317295074463, + "learning_rate": 1.9269493619237114e-06, + "loss": 0.5549, + "step": 6393 + }, + { + "epoch": 0.30481729554501463, + "grad_norm": 2.3521690368652344, + "learning_rate": 1.9246319531533574e-06, + "loss": 0.8716, + "step": 6394 + }, + { + "epoch": 0.30486496794031415, + "grad_norm": 5.018568515777588, + "learning_rate": 1.9223157903291313e-06, + "loss": 0.613, + "step": 6395 + }, + { + "epoch": 0.30491264033561366, + "grad_norm": 1.5714318752288818, + "learning_rate": 1.920000873808394e-06, + "loss": 0.633, + "step": 6396 + }, + { + "epoch": 0.3049603127309132, + "grad_norm": 3.5810441970825195, + "learning_rate": 1.917687203948316e-06, + "loss": 1.115, + "step": 6397 + }, + { + "epoch": 0.3050079851262127, + "grad_norm": 2.0197510719299316, + "learning_rate": 1.91537478110587e-06, + "loss": 0.6015, + "step": 6398 + }, + { + "epoch": 0.30505565752151215, + "grad_norm": 1.6447203159332275, + "learning_rate": 1.913063605637838e-06, + "loss": 0.8743, + "step": 6399 + }, + { + "epoch": 0.30510332991681166, + "grad_norm": 1.374510645866394, + "learning_rate": 1.9107536779008153e-06, + "loss": 0.4009, + "step": 6400 + }, + { + "epoch": 0.3051510023121112, + "grad_norm": 1.245357871055603, + "learning_rate": 1.908444998251194e-06, + "loss": 0.6025, + "step": 6401 + }, + { + "epoch": 0.3051986747074107, + "grad_norm": 1.340909481048584, + "learning_rate": 1.9061375670451831e-06, + "loss": 0.6918, + "step": 6402 + }, + { + "epoch": 0.3052463471027102, + "grad_norm": 1.7540247440338135, + "learning_rate": 
1.903831384638798e-06, + "loss": 0.8732, + "step": 6403 + }, + { + "epoch": 0.30529401949800966, + "grad_norm": 1.3919272422790527, + "learning_rate": 1.9015264513878528e-06, + "loss": 0.6712, + "step": 6404 + }, + { + "epoch": 0.3053416918933092, + "grad_norm": 2.0504024028778076, + "learning_rate": 1.8992227676479803e-06, + "loss": 0.7113, + "step": 6405 + }, + { + "epoch": 0.3053893642886087, + "grad_norm": 2.407709836959839, + "learning_rate": 1.8969203337746101e-06, + "loss": 0.7342, + "step": 6406 + }, + { + "epoch": 0.3054370366839082, + "grad_norm": 1.4397438764572144, + "learning_rate": 1.8946191501229905e-06, + "loss": 0.9156, + "step": 6407 + }, + { + "epoch": 0.30548470907920766, + "grad_norm": 1.8526594638824463, + "learning_rate": 1.892319217048163e-06, + "loss": 0.1576, + "step": 6408 + }, + { + "epoch": 0.3055323814745072, + "grad_norm": 1.4310426712036133, + "learning_rate": 1.8900205349049904e-06, + "loss": 0.8332, + "step": 6409 + }, + { + "epoch": 0.3055800538698067, + "grad_norm": 1.1129051446914673, + "learning_rate": 1.8877231040481302e-06, + "loss": 0.498, + "step": 6410 + }, + { + "epoch": 0.3056277262651062, + "grad_norm": 1.3999230861663818, + "learning_rate": 1.8854269248320545e-06, + "loss": 0.7744, + "step": 6411 + }, + { + "epoch": 0.3056753986604057, + "grad_norm": 1.8585573434829712, + "learning_rate": 1.883131997611043e-06, + "loss": 0.6155, + "step": 6412 + }, + { + "epoch": 0.3057230710557052, + "grad_norm": 1.9685242176055908, + "learning_rate": 1.8808383227391747e-06, + "loss": 0.8717, + "step": 6413 + }, + { + "epoch": 0.3057707434510047, + "grad_norm": 1.6285715103149414, + "learning_rate": 1.8785459005703411e-06, + "loss": 0.6105, + "step": 6414 + }, + { + "epoch": 0.3058184158463042, + "grad_norm": 1.0559170246124268, + "learning_rate": 1.8762547314582435e-06, + "loss": 0.593, + "step": 6415 + }, + { + "epoch": 0.3058660882416037, + "grad_norm": 1.283506989479065, + "learning_rate": 1.8739648157563794e-06, + "loss": 0.4569, + "step": 6416 + }, + { + "epoch": 0.3059137606369032, + "grad_norm": 1.752828598022461, + "learning_rate": 1.8716761538180627e-06, + "loss": 0.271, + "step": 6417 + }, + { + "epoch": 0.3059614330322027, + "grad_norm": 2.224846839904785, + "learning_rate": 1.8693887459964123e-06, + "loss": 0.1549, + "step": 6418 + }, + { + "epoch": 0.3060091054275022, + "grad_norm": 1.3899378776550293, + "learning_rate": 1.8671025926443464e-06, + "loss": 0.5116, + "step": 6419 + }, + { + "epoch": 0.3060567778228017, + "grad_norm": 0.9118931293487549, + "learning_rate": 1.8648176941146012e-06, + "loss": 0.4819, + "step": 6420 + }, + { + "epoch": 0.30610445021810123, + "grad_norm": 1.6946864128112793, + "learning_rate": 1.8625340507597056e-06, + "loss": 0.8566, + "step": 6421 + }, + { + "epoch": 0.3061521226134007, + "grad_norm": 1.7777106761932373, + "learning_rate": 1.86025166293201e-06, + "loss": 0.8144, + "step": 6422 + }, + { + "epoch": 0.3061997950087002, + "grad_norm": 3.625394344329834, + "learning_rate": 1.8579705309836571e-06, + "loss": 0.8559, + "step": 6423 + }, + { + "epoch": 0.3062474674039997, + "grad_norm": 1.3907591104507446, + "learning_rate": 1.8556906552666042e-06, + "loss": 0.6744, + "step": 6424 + }, + { + "epoch": 0.30629513979929923, + "grad_norm": 1.1208232641220093, + "learning_rate": 1.8534120361326159e-06, + "loss": 0.6212, + "step": 6425 + }, + { + "epoch": 0.3063428121945987, + "grad_norm": 1.2047077417373657, + "learning_rate": 1.8511346739332535e-06, + "loss": 0.6538, + "step": 6426 + }, + { + "epoch": 
0.3063904845898982, + "grad_norm": 1.06620192527771, + "learning_rate": 1.8488585690198946e-06, + "loss": 0.5665, + "step": 6427 + }, + { + "epoch": 0.3064381569851977, + "grad_norm": 0.919082522392273, + "learning_rate": 1.8465837217437199e-06, + "loss": 0.3602, + "step": 6428 + }, + { + "epoch": 0.30648582938049723, + "grad_norm": 1.3466386795043945, + "learning_rate": 1.8443101324557111e-06, + "loss": 0.624, + "step": 6429 + }, + { + "epoch": 0.30653350177579675, + "grad_norm": 3.57694935798645, + "learning_rate": 1.842037801506661e-06, + "loss": 1.2356, + "step": 6430 + }, + { + "epoch": 0.3065811741710962, + "grad_norm": 2.216155767440796, + "learning_rate": 1.839766729247171e-06, + "loss": 0.9596, + "step": 6431 + }, + { + "epoch": 0.3066288465663957, + "grad_norm": 0.8382368683815002, + "learning_rate": 1.8374969160276368e-06, + "loss": 0.3866, + "step": 6432 + }, + { + "epoch": 0.30667651896169523, + "grad_norm": 1.6770670413970947, + "learning_rate": 1.8352283621982713e-06, + "loss": 1.1551, + "step": 6433 + }, + { + "epoch": 0.30672419135699475, + "grad_norm": 1.2856043577194214, + "learning_rate": 1.8329610681090914e-06, + "loss": 0.279, + "step": 6434 + }, + { + "epoch": 0.30677186375229426, + "grad_norm": 1.3624242544174194, + "learning_rate": 1.8306950341099138e-06, + "loss": 0.6864, + "step": 6435 + }, + { + "epoch": 0.3068195361475937, + "grad_norm": 1.1764171123504639, + "learning_rate": 1.8284302605503624e-06, + "loss": 0.536, + "step": 6436 + }, + { + "epoch": 0.30686720854289323, + "grad_norm": 1.6602095365524292, + "learning_rate": 1.826166747779874e-06, + "loss": 0.8386, + "step": 6437 + }, + { + "epoch": 0.30691488093819275, + "grad_norm": 1.0563730001449585, + "learning_rate": 1.8239044961476794e-06, + "loss": 0.6951, + "step": 6438 + }, + { + "epoch": 0.30696255333349226, + "grad_norm": 1.2942439317703247, + "learning_rate": 1.8216435060028237e-06, + "loss": 0.447, + "step": 6439 + }, + { + "epoch": 0.3070102257287917, + "grad_norm": 1.9161652326583862, + "learning_rate": 1.819383777694157e-06, + "loss": 0.7364, + "step": 6440 + }, + { + "epoch": 0.30705789812409123, + "grad_norm": 2.8646817207336426, + "learning_rate": 1.817125311570327e-06, + "loss": 0.7922, + "step": 6441 + }, + { + "epoch": 0.30710557051939075, + "grad_norm": 1.85245680809021, + "learning_rate": 1.8148681079797925e-06, + "loss": 0.9882, + "step": 6442 + }, + { + "epoch": 0.30715324291469026, + "grad_norm": 1.95215904712677, + "learning_rate": 1.812612167270823e-06, + "loss": 0.8003, + "step": 6443 + }, + { + "epoch": 0.3072009153099898, + "grad_norm": 1.0741435289382935, + "learning_rate": 1.810357489791479e-06, + "loss": 0.4089, + "step": 6444 + }, + { + "epoch": 0.30724858770528923, + "grad_norm": 1.8579949140548706, + "learning_rate": 1.8081040758896361e-06, + "loss": 0.4324, + "step": 6445 + }, + { + "epoch": 0.30729626010058875, + "grad_norm": 2.041034698486328, + "learning_rate": 1.805851925912978e-06, + "loss": 1.3024, + "step": 6446 + }, + { + "epoch": 0.30734393249588826, + "grad_norm": 1.525901198387146, + "learning_rate": 1.803601040208981e-06, + "loss": 0.7191, + "step": 6447 + }, + { + "epoch": 0.3073916048911878, + "grad_norm": 1.84542715549469, + "learning_rate": 1.801351419124938e-06, + "loss": 0.6522, + "step": 6448 + }, + { + "epoch": 0.30743927728648723, + "grad_norm": 1.6180534362792969, + "learning_rate": 1.7991030630079431e-06, + "loss": 0.649, + "step": 6449 + }, + { + "epoch": 0.30748694968178675, + "grad_norm": 2.1256611347198486, + "learning_rate": 
1.7968559722048906e-06, + "loss": 0.9058, + "step": 6450 + }, + { + "epoch": 0.30753462207708626, + "grad_norm": 3.1187829971313477, + "learning_rate": 1.7946101470624877e-06, + "loss": 0.3534, + "step": 6451 + }, + { + "epoch": 0.3075822944723858, + "grad_norm": 4.317934513092041, + "learning_rate": 1.7923655879272395e-06, + "loss": 0.1973, + "step": 6452 + }, + { + "epoch": 0.3076299668676853, + "grad_norm": 1.0948309898376465, + "learning_rate": 1.7901222951454566e-06, + "loss": 0.4622, + "step": 6453 + }, + { + "epoch": 0.30767763926298475, + "grad_norm": 1.3716005086898804, + "learning_rate": 1.7878802690632579e-06, + "loss": 0.7425, + "step": 6454 + }, + { + "epoch": 0.30772531165828426, + "grad_norm": 4.103541374206543, + "learning_rate": 1.785639510026569e-06, + "loss": 0.7147, + "step": 6455 + }, + { + "epoch": 0.3077729840535838, + "grad_norm": 2.311466693878174, + "learning_rate": 1.7834000183811085e-06, + "loss": 1.0938, + "step": 6456 + }, + { + "epoch": 0.3078206564488833, + "grad_norm": 2.1428282260894775, + "learning_rate": 1.7811617944724103e-06, + "loss": 0.7077, + "step": 6457 + }, + { + "epoch": 0.3078683288441828, + "grad_norm": 1.4838676452636719, + "learning_rate": 1.7789248386458102e-06, + "loss": 0.4631, + "step": 6458 + }, + { + "epoch": 0.30791600123948226, + "grad_norm": 4.157537460327148, + "learning_rate": 1.7766891512464491e-06, + "loss": 0.7323, + "step": 6459 + }, + { + "epoch": 0.3079636736347818, + "grad_norm": 2.3386669158935547, + "learning_rate": 1.7744547326192662e-06, + "loss": 0.2231, + "step": 6460 + }, + { + "epoch": 0.3080113460300813, + "grad_norm": 1.9055111408233643, + "learning_rate": 1.7722215831090106e-06, + "loss": 0.7469, + "step": 6461 + }, + { + "epoch": 0.3080590184253808, + "grad_norm": 0.92010098695755, + "learning_rate": 1.7699897030602376e-06, + "loss": 0.3568, + "step": 6462 + }, + { + "epoch": 0.30810669082068026, + "grad_norm": 1.552308440208435, + "learning_rate": 1.7677590928172994e-06, + "loss": 0.5546, + "step": 6463 + }, + { + "epoch": 0.3081543632159798, + "grad_norm": 1.7717546224594116, + "learning_rate": 1.7655297527243587e-06, + "loss": 0.9173, + "step": 6464 + }, + { + "epoch": 0.3082020356112793, + "grad_norm": 1.4038217067718506, + "learning_rate": 1.7633016831253757e-06, + "loss": 0.6635, + "step": 6465 + }, + { + "epoch": 0.3082497080065788, + "grad_norm": 2.149308919906616, + "learning_rate": 1.7610748843641245e-06, + "loss": 0.6601, + "step": 6466 + }, + { + "epoch": 0.3082973804018783, + "grad_norm": 1.7162526845932007, + "learning_rate": 1.7588493567841724e-06, + "loss": 0.9118, + "step": 6467 + }, + { + "epoch": 0.3083450527971778, + "grad_norm": 1.1733648777008057, + "learning_rate": 1.7566251007288992e-06, + "loss": 0.5365, + "step": 6468 + }, + { + "epoch": 0.3083927251924773, + "grad_norm": 1.8050142526626587, + "learning_rate": 1.7544021165414793e-06, + "loss": 0.7149, + "step": 6469 + }, + { + "epoch": 0.3084403975877768, + "grad_norm": 0.983949601650238, + "learning_rate": 1.7521804045649005e-06, + "loss": 0.5518, + "step": 6470 + }, + { + "epoch": 0.3084880699830763, + "grad_norm": 1.735499620437622, + "learning_rate": 1.7499599651419508e-06, + "loss": 0.8126, + "step": 6471 + }, + { + "epoch": 0.3085357423783758, + "grad_norm": 1.454580307006836, + "learning_rate": 1.7477407986152174e-06, + "loss": 0.8105, + "step": 6472 + }, + { + "epoch": 0.3085834147736753, + "grad_norm": 1.1425175666809082, + "learning_rate": 1.7455229053270973e-06, + "loss": 0.3065, + "step": 6473 + }, + { + "epoch": 
0.3086310871689748, + "grad_norm": 0.9951463937759399, + "learning_rate": 1.7433062856197902e-06, + "loss": 0.4717, + "step": 6474 + }, + { + "epoch": 0.3086787595642743, + "grad_norm": 2.938292980194092, + "learning_rate": 1.7410909398352937e-06, + "loss": 0.4628, + "step": 6475 + }, + { + "epoch": 0.30872643195957383, + "grad_norm": 1.60874605178833, + "learning_rate": 1.7388768683154145e-06, + "loss": 0.7068, + "step": 6476 + }, + { + "epoch": 0.3087741043548733, + "grad_norm": 4.252487659454346, + "learning_rate": 1.7366640714017647e-06, + "loss": 1.1043, + "step": 6477 + }, + { + "epoch": 0.3088217767501728, + "grad_norm": 1.7481223344802856, + "learning_rate": 1.734452549435749e-06, + "loss": 0.8181, + "step": 6478 + }, + { + "epoch": 0.3088694491454723, + "grad_norm": 1.2348800897598267, + "learning_rate": 1.73224230275859e-06, + "loss": 0.8256, + "step": 6479 + }, + { + "epoch": 0.30891712154077183, + "grad_norm": 2.4656639099121094, + "learning_rate": 1.7300333317112983e-06, + "loss": 0.8434, + "step": 6480 + }, + { + "epoch": 0.3089647939360713, + "grad_norm": 1.6827641725540161, + "learning_rate": 1.7278256366347034e-06, + "loss": 1.0099, + "step": 6481 + }, + { + "epoch": 0.3090124663313708, + "grad_norm": 1.728666067123413, + "learning_rate": 1.725619217869422e-06, + "loss": 0.8378, + "step": 6482 + }, + { + "epoch": 0.3090601387266703, + "grad_norm": 1.216159701347351, + "learning_rate": 1.7234140757558892e-06, + "loss": 0.5811, + "step": 6483 + }, + { + "epoch": 0.30910781112196983, + "grad_norm": 1.6066782474517822, + "learning_rate": 1.7212102106343287e-06, + "loss": 0.5712, + "step": 6484 + }, + { + "epoch": 0.30915548351726935, + "grad_norm": 2.0662741661071777, + "learning_rate": 1.7190076228447782e-06, + "loss": 0.3293, + "step": 6485 + }, + { + "epoch": 0.3092031559125688, + "grad_norm": 1.6938399076461792, + "learning_rate": 1.7168063127270762e-06, + "loss": 1.0509, + "step": 6486 + }, + { + "epoch": 0.3092508283078683, + "grad_norm": 3.1264359951019287, + "learning_rate": 1.7146062806208573e-06, + "loss": 0.6917, + "step": 6487 + }, + { + "epoch": 0.30929850070316783, + "grad_norm": 1.0653722286224365, + "learning_rate": 1.7124075268655672e-06, + "loss": 0.7217, + "step": 6488 + }, + { + "epoch": 0.30934617309846735, + "grad_norm": 1.3021435737609863, + "learning_rate": 1.7102100518004517e-06, + "loss": 0.6076, + "step": 6489 + }, + { + "epoch": 0.30939384549376686, + "grad_norm": 2.4369189739227295, + "learning_rate": 1.7080138557645543e-06, + "loss": 0.3072, + "step": 6490 + }, + { + "epoch": 0.3094415178890663, + "grad_norm": 5.125096321105957, + "learning_rate": 1.7058189390967272e-06, + "loss": 1.3237, + "step": 6491 + }, + { + "epoch": 0.30948919028436583, + "grad_norm": 1.2146793603897095, + "learning_rate": 1.7036253021356275e-06, + "loss": 0.9807, + "step": 6492 + }, + { + "epoch": 0.30953686267966535, + "grad_norm": 1.4144781827926636, + "learning_rate": 1.7014329452197054e-06, + "loss": 0.5788, + "step": 6493 + }, + { + "epoch": 0.30958453507496486, + "grad_norm": 1.1346683502197266, + "learning_rate": 1.6992418686872203e-06, + "loss": 0.6866, + "step": 6494 + }, + { + "epoch": 0.3096322074702643, + "grad_norm": 1.5384643077850342, + "learning_rate": 1.6970520728762374e-06, + "loss": 0.5163, + "step": 6495 + }, + { + "epoch": 0.30967987986556383, + "grad_norm": 1.5738346576690674, + "learning_rate": 1.6948635581246142e-06, + "loss": 0.881, + "step": 6496 + }, + { + "epoch": 0.30972755226086335, + "grad_norm": 0.8995949029922485, + "learning_rate": 
1.6926763247700163e-06, + "loss": 0.3336, + "step": 6497 + }, + { + "epoch": 0.30977522465616286, + "grad_norm": 1.5127801895141602, + "learning_rate": 1.6904903731499122e-06, + "loss": 0.608, + "step": 6498 + }, + { + "epoch": 0.3098228970514624, + "grad_norm": 1.5991016626358032, + "learning_rate": 1.688305703601575e-06, + "loss": 0.554, + "step": 6499 + }, + { + "epoch": 0.30987056944676183, + "grad_norm": 2.663707971572876, + "learning_rate": 1.686122316462071e-06, + "loss": 0.7762, + "step": 6500 + }, + { + "epoch": 0.30991824184206135, + "grad_norm": 1.9118679761886597, + "learning_rate": 1.6839402120682768e-06, + "loss": 0.981, + "step": 6501 + }, + { + "epoch": 0.30996591423736086, + "grad_norm": 1.5477867126464844, + "learning_rate": 1.681759390756873e-06, + "loss": 0.7308, + "step": 6502 + }, + { + "epoch": 0.3100135866326604, + "grad_norm": 1.5888001918792725, + "learning_rate": 1.6795798528643304e-06, + "loss": 0.6634, + "step": 6503 + }, + { + "epoch": 0.31006125902795983, + "grad_norm": 1.959020733833313, + "learning_rate": 1.677401598726932e-06, + "loss": 0.6814, + "step": 6504 + }, + { + "epoch": 0.31010893142325935, + "grad_norm": 1.2314298152923584, + "learning_rate": 1.6752246286807638e-06, + "loss": 0.6576, + "step": 6505 + }, + { + "epoch": 0.31015660381855886, + "grad_norm": 2.1980175971984863, + "learning_rate": 1.6730489430617048e-06, + "loss": 0.5543, + "step": 6506 + }, + { + "epoch": 0.3102042762138584, + "grad_norm": 3.7818870544433594, + "learning_rate": 1.670874542205443e-06, + "loss": 0.726, + "step": 6507 + }, + { + "epoch": 0.3102519486091579, + "grad_norm": 2.2076447010040283, + "learning_rate": 1.6687014264474677e-06, + "loss": 0.8583, + "step": 6508 + }, + { + "epoch": 0.31029962100445735, + "grad_norm": 5.225788116455078, + "learning_rate": 1.6665295961230644e-06, + "loss": 1.1672, + "step": 6509 + }, + { + "epoch": 0.31034729339975686, + "grad_norm": 1.57306969165802, + "learning_rate": 1.664359051567328e-06, + "loss": 0.6766, + "step": 6510 + }, + { + "epoch": 0.3103949657950564, + "grad_norm": 1.4917519092559814, + "learning_rate": 1.6621897931151498e-06, + "loss": 0.8296, + "step": 6511 + }, + { + "epoch": 0.3104426381903559, + "grad_norm": 2.040132761001587, + "learning_rate": 1.660021821101222e-06, + "loss": 0.582, + "step": 6512 + }, + { + "epoch": 0.31049031058565535, + "grad_norm": 1.456680178642273, + "learning_rate": 1.6578551358600415e-06, + "loss": 0.8104, + "step": 6513 + }, + { + "epoch": 0.31053798298095486, + "grad_norm": 2.2867209911346436, + "learning_rate": 1.6556897377259085e-06, + "loss": 0.9111, + "step": 6514 + }, + { + "epoch": 0.3105856553762544, + "grad_norm": 2.4910178184509277, + "learning_rate": 1.653525627032917e-06, + "loss": 1.0848, + "step": 6515 + }, + { + "epoch": 0.3106333277715539, + "grad_norm": 3.2589223384857178, + "learning_rate": 1.6513628041149688e-06, + "loss": 0.7544, + "step": 6516 + }, + { + "epoch": 0.3106810001668534, + "grad_norm": 1.4992516040802002, + "learning_rate": 1.649201269305768e-06, + "loss": 0.9809, + "step": 6517 + }, + { + "epoch": 0.31072867256215286, + "grad_norm": 1.5746660232543945, + "learning_rate": 1.6470410229388134e-06, + "loss": 0.8504, + "step": 6518 + }, + { + "epoch": 0.3107763449574524, + "grad_norm": 1.524914264678955, + "learning_rate": 1.6448820653474084e-06, + "loss": 0.5504, + "step": 6519 + }, + { + "epoch": 0.3108240173527519, + "grad_norm": 2.3994109630584717, + "learning_rate": 1.6427243968646632e-06, + "loss": 0.5918, + "step": 6520 + }, + { + "epoch": 
0.3108716897480514, + "grad_norm": 1.3910152912139893, + "learning_rate": 1.6405680178234784e-06, + "loss": 0.655, + "step": 6521 + }, + { + "epoch": 0.3109193621433509, + "grad_norm": 1.3168928623199463, + "learning_rate": 1.638412928556562e-06, + "loss": 0.7157, + "step": 6522 + }, + { + "epoch": 0.3109670345386504, + "grad_norm": 1.7572548389434814, + "learning_rate": 1.6362591293964247e-06, + "loss": 0.3562, + "step": 6523 + }, + { + "epoch": 0.3110147069339499, + "grad_norm": 1.877868413925171, + "learning_rate": 1.634106620675373e-06, + "loss": 0.7416, + "step": 6524 + }, + { + "epoch": 0.3110623793292494, + "grad_norm": 12.239848136901855, + "learning_rate": 1.631955402725519e-06, + "loss": 0.0823, + "step": 6525 + }, + { + "epoch": 0.3111100517245489, + "grad_norm": 3.99135684967041, + "learning_rate": 1.6298054758787707e-06, + "loss": 1.5696, + "step": 6526 + }, + { + "epoch": 0.3111577241198484, + "grad_norm": 2.9434146881103516, + "learning_rate": 1.6276568404668425e-06, + "loss": 1.2763, + "step": 6527 + }, + { + "epoch": 0.3112053965151479, + "grad_norm": 0.8180282115936279, + "learning_rate": 1.6255094968212436e-06, + "loss": 0.604, + "step": 6528 + }, + { + "epoch": 0.3112530689104474, + "grad_norm": 1.1855956315994263, + "learning_rate": 1.6233634452732916e-06, + "loss": 0.6412, + "step": 6529 + }, + { + "epoch": 0.3113007413057469, + "grad_norm": 2.7904341220855713, + "learning_rate": 1.6212186861540946e-06, + "loss": 0.6747, + "step": 6530 + }, + { + "epoch": 0.31134841370104643, + "grad_norm": 1.178819179534912, + "learning_rate": 1.619075219794569e-06, + "loss": 0.6261, + "step": 6531 + }, + { + "epoch": 0.3113960860963459, + "grad_norm": 2.6483869552612305, + "learning_rate": 1.616933046525433e-06, + "loss": 1.2557, + "step": 6532 + }, + { + "epoch": 0.3114437584916454, + "grad_norm": 1.7681313753128052, + "learning_rate": 1.614792166677197e-06, + "loss": 0.9145, + "step": 6533 + }, + { + "epoch": 0.3114914308869449, + "grad_norm": 1.420824408531189, + "learning_rate": 1.6126525805801786e-06, + "loss": 0.8921, + "step": 6534 + }, + { + "epoch": 0.31153910328224443, + "grad_norm": 2.487520456314087, + "learning_rate": 1.610514288564493e-06, + "loss": 0.8742, + "step": 6535 + }, + { + "epoch": 0.3115867756775439, + "grad_norm": 1.4098377227783203, + "learning_rate": 1.6083772909600614e-06, + "loss": 1.0423, + "step": 6536 + }, + { + "epoch": 0.3116344480728434, + "grad_norm": 1.9439759254455566, + "learning_rate": 1.6062415880965932e-06, + "loss": 0.7453, + "step": 6537 + }, + { + "epoch": 0.3116821204681429, + "grad_norm": 1.2742236852645874, + "learning_rate": 1.60410718030361e-06, + "loss": 1.0144, + "step": 6538 + }, + { + "epoch": 0.31172979286344243, + "grad_norm": 1.2759597301483154, + "learning_rate": 1.6019740679104301e-06, + "loss": 0.8142, + "step": 6539 + }, + { + "epoch": 0.31177746525874195, + "grad_norm": 3.0621325969696045, + "learning_rate": 1.5998422512461687e-06, + "loss": 0.7816, + "step": 6540 + }, + { + "epoch": 0.3118251376540414, + "grad_norm": 1.4142330884933472, + "learning_rate": 1.5977117306397394e-06, + "loss": 0.7853, + "step": 6541 + }, + { + "epoch": 0.3118728100493409, + "grad_norm": 1.821967363357544, + "learning_rate": 1.5955825064198671e-06, + "loss": 0.8065, + "step": 6542 + }, + { + "epoch": 0.31192048244464043, + "grad_norm": 2.012448310852051, + "learning_rate": 1.5934545789150625e-06, + "loss": 0.8001, + "step": 6543 + }, + { + "epoch": 0.31196815483993995, + "grad_norm": 2.912635326385498, + "learning_rate": 
1.591327948453646e-06, + "loss": 1.2998, + "step": 6544 + }, + { + "epoch": 0.3120158272352394, + "grad_norm": 2.260390043258667, + "learning_rate": 1.5892026153637363e-06, + "loss": 1.197, + "step": 6545 + }, + { + "epoch": 0.3120634996305389, + "grad_norm": 1.7087609767913818, + "learning_rate": 1.5870785799732459e-06, + "loss": 0.7234, + "step": 6546 + }, + { + "epoch": 0.31211117202583843, + "grad_norm": 1.6463042497634888, + "learning_rate": 1.5849558426098955e-06, + "loss": 0.5573, + "step": 6547 + }, + { + "epoch": 0.31215884442113795, + "grad_norm": 1.0706912279129028, + "learning_rate": 1.5828344036012012e-06, + "loss": 0.7365, + "step": 6548 + }, + { + "epoch": 0.31220651681643746, + "grad_norm": 1.188310146331787, + "learning_rate": 1.5807142632744776e-06, + "loss": 0.7697, + "step": 6549 + }, + { + "epoch": 0.3122541892117369, + "grad_norm": 2.6883914470672607, + "learning_rate": 1.57859542195684e-06, + "loss": 0.6252, + "step": 6550 + }, + { + "epoch": 0.31230186160703644, + "grad_norm": 3.5239317417144775, + "learning_rate": 1.5764778799752079e-06, + "loss": 0.6836, + "step": 6551 + }, + { + "epoch": 0.31234953400233595, + "grad_norm": 1.307760238647461, + "learning_rate": 1.5743616376562921e-06, + "loss": 0.8936, + "step": 6552 + }, + { + "epoch": 0.31239720639763546, + "grad_norm": 1.4934606552124023, + "learning_rate": 1.5722466953266068e-06, + "loss": 0.7663, + "step": 6553 + }, + { + "epoch": 0.312444878792935, + "grad_norm": 1.3050966262817383, + "learning_rate": 1.5701330533124704e-06, + "loss": 0.8025, + "step": 6554 + }, + { + "epoch": 0.31249255118823444, + "grad_norm": 2.0975804328918457, + "learning_rate": 1.5680207119399926e-06, + "loss": 0.7393, + "step": 6555 + }, + { + "epoch": 0.31254022358353395, + "grad_norm": 2.0402796268463135, + "learning_rate": 1.5659096715350842e-06, + "loss": 0.7473, + "step": 6556 + }, + { + "epoch": 0.31258789597883346, + "grad_norm": 2.0081799030303955, + "learning_rate": 1.563799932423462e-06, + "loss": 0.745, + "step": 6557 + }, + { + "epoch": 0.312635568374133, + "grad_norm": 2.052009105682373, + "learning_rate": 1.5616914949306316e-06, + "loss": 0.397, + "step": 6558 + }, + { + "epoch": 0.31268324076943244, + "grad_norm": 1.552558422088623, + "learning_rate": 1.559584359381906e-06, + "loss": 0.6583, + "step": 6559 + }, + { + "epoch": 0.31273091316473195, + "grad_norm": 1.081641435623169, + "learning_rate": 1.557478526102396e-06, + "loss": 0.6207, + "step": 6560 + }, + { + "epoch": 0.31277858556003146, + "grad_norm": 1.64644455909729, + "learning_rate": 1.5553739954170055e-06, + "loss": 0.4479, + "step": 6561 + }, + { + "epoch": 0.312826257955331, + "grad_norm": 1.3025169372558594, + "learning_rate": 1.5532707676504455e-06, + "loss": 0.6613, + "step": 6562 + }, + { + "epoch": 0.3128739303506305, + "grad_norm": 1.5401815176010132, + "learning_rate": 1.5511688431272242e-06, + "loss": 0.9271, + "step": 6563 + }, + { + "epoch": 0.31292160274592995, + "grad_norm": 2.020066738128662, + "learning_rate": 1.5490682221716413e-06, + "loss": 0.659, + "step": 6564 + }, + { + "epoch": 0.31296927514122946, + "grad_norm": 1.3544154167175293, + "learning_rate": 1.5469689051078041e-06, + "loss": 0.6343, + "step": 6565 + }, + { + "epoch": 0.313016947536529, + "grad_norm": 1.442132592201233, + "learning_rate": 1.5448708922596178e-06, + "loss": 0.961, + "step": 6566 + }, + { + "epoch": 0.3130646199318285, + "grad_norm": 4.535737991333008, + "learning_rate": 1.5427741839507804e-06, + "loss": 1.6344, + "step": 6567 + }, + { + "epoch": 
0.31311229232712795, + "grad_norm": 2.2868452072143555, + "learning_rate": 1.540678780504793e-06, + "loss": 0.8728, + "step": 6568 + }, + { + "epoch": 0.31315996472242746, + "grad_norm": 2.1181046962738037, + "learning_rate": 1.538584682244958e-06, + "loss": 0.6222, + "step": 6569 + }, + { + "epoch": 0.313207637117727, + "grad_norm": 1.742482304573059, + "learning_rate": 1.5364918894943682e-06, + "loss": 0.7478, + "step": 6570 + }, + { + "epoch": 0.3132553095130265, + "grad_norm": 1.5931150913238525, + "learning_rate": 1.534400402575925e-06, + "loss": 0.6558, + "step": 6571 + }, + { + "epoch": 0.313302981908326, + "grad_norm": 1.9566168785095215, + "learning_rate": 1.5323102218123186e-06, + "loss": 0.4564, + "step": 6572 + }, + { + "epoch": 0.31335065430362546, + "grad_norm": 2.7765796184539795, + "learning_rate": 1.5302213475260475e-06, + "loss": 0.5948, + "step": 6573 + }, + { + "epoch": 0.313398326698925, + "grad_norm": 1.4216275215148926, + "learning_rate": 1.528133780039397e-06, + "loss": 0.641, + "step": 6574 + }, + { + "epoch": 0.3134459990942245, + "grad_norm": 1.6456646919250488, + "learning_rate": 1.5260475196744618e-06, + "loss": 0.955, + "step": 6575 + }, + { + "epoch": 0.313493671489524, + "grad_norm": 1.4795328378677368, + "learning_rate": 1.5239625667531322e-06, + "loss": 0.6831, + "step": 6576 + }, + { + "epoch": 0.3135413438848235, + "grad_norm": 1.8460081815719604, + "learning_rate": 1.5218789215970897e-06, + "loss": 0.8814, + "step": 6577 + }, + { + "epoch": 0.313589016280123, + "grad_norm": 1.3663274049758911, + "learning_rate": 1.5197965845278217e-06, + "loss": 0.5449, + "step": 6578 + }, + { + "epoch": 0.3136366886754225, + "grad_norm": 1.4252835512161255, + "learning_rate": 1.5177155558666135e-06, + "loss": 0.2409, + "step": 6579 + }, + { + "epoch": 0.313684361070722, + "grad_norm": 3.0009138584136963, + "learning_rate": 1.5156358359345425e-06, + "loss": 0.5513, + "step": 6580 + }, + { + "epoch": 0.3137320334660215, + "grad_norm": 2.6511809825897217, + "learning_rate": 1.5135574250524898e-06, + "loss": 0.8639, + "step": 6581 + }, + { + "epoch": 0.313779705861321, + "grad_norm": 0.9627076983451843, + "learning_rate": 1.5114803235411346e-06, + "loss": 0.4883, + "step": 6582 + }, + { + "epoch": 0.3138273782566205, + "grad_norm": 1.0760561227798462, + "learning_rate": 1.5094045317209493e-06, + "loss": 0.4245, + "step": 6583 + }, + { + "epoch": 0.31387505065192, + "grad_norm": 0.994811475276947, + "learning_rate": 1.5073300499122113e-06, + "loss": 0.4963, + "step": 6584 + }, + { + "epoch": 0.3139227230472195, + "grad_norm": 1.3928691148757935, + "learning_rate": 1.5052568784349852e-06, + "loss": 0.6735, + "step": 6585 + }, + { + "epoch": 0.31397039544251903, + "grad_norm": 1.8815174102783203, + "learning_rate": 1.5031850176091467e-06, + "loss": 0.757, + "step": 6586 + }, + { + "epoch": 0.3140180678378185, + "grad_norm": 1.2746731042861938, + "learning_rate": 1.5011144677543576e-06, + "loss": 0.7011, + "step": 6587 + }, + { + "epoch": 0.314065740233118, + "grad_norm": 1.5576210021972656, + "learning_rate": 1.499045229190087e-06, + "loss": 0.5558, + "step": 6588 + }, + { + "epoch": 0.3141134126284175, + "grad_norm": 1.9429413080215454, + "learning_rate": 1.4969773022355927e-06, + "loss": 1.0802, + "step": 6589 + }, + { + "epoch": 0.31416108502371703, + "grad_norm": 1.713625192642212, + "learning_rate": 1.494910687209935e-06, + "loss": 0.6591, + "step": 6590 + }, + { + "epoch": 0.3142087574190165, + "grad_norm": 1.3166879415512085, + "learning_rate": 1.4928453844319769e-06, 
+ "loss": 0.6429, + "step": 6591 + }, + { + "epoch": 0.314256429814316, + "grad_norm": 2.6583194732666016, + "learning_rate": 1.4907813942203652e-06, + "loss": 0.8937, + "step": 6592 + }, + { + "epoch": 0.3143041022096155, + "grad_norm": 2.1903364658355713, + "learning_rate": 1.4887187168935579e-06, + "loss": 0.4286, + "step": 6593 + }, + { + "epoch": 0.31435177460491504, + "grad_norm": 3.6487207412719727, + "learning_rate": 1.4866573527698047e-06, + "loss": 0.5949, + "step": 6594 + }, + { + "epoch": 0.31439944700021455, + "grad_norm": 1.8561707735061646, + "learning_rate": 1.48459730216715e-06, + "loss": 0.5996, + "step": 6595 + }, + { + "epoch": 0.314447119395514, + "grad_norm": 1.1115163564682007, + "learning_rate": 1.4825385654034386e-06, + "loss": 0.7894, + "step": 6596 + }, + { + "epoch": 0.3144947917908135, + "grad_norm": 4.910205364227295, + "learning_rate": 1.4804811427963173e-06, + "loss": 0.7863, + "step": 6597 + }, + { + "epoch": 0.31454246418611304, + "grad_norm": 2.453122854232788, + "learning_rate": 1.478425034663219e-06, + "loss": 0.9238, + "step": 6598 + }, + { + "epoch": 0.31459013658141255, + "grad_norm": 1.2896822690963745, + "learning_rate": 1.4763702413213843e-06, + "loss": 0.6852, + "step": 6599 + }, + { + "epoch": 0.314637808976712, + "grad_norm": 1.6508816480636597, + "learning_rate": 1.474316763087843e-06, + "loss": 0.6804, + "step": 6600 + }, + { + "epoch": 0.3146854813720115, + "grad_norm": 1.734481930732727, + "learning_rate": 1.4722646002794294e-06, + "loss": 0.6244, + "step": 6601 + }, + { + "epoch": 0.31473315376731104, + "grad_norm": 0.9304144978523254, + "learning_rate": 1.470213753212768e-06, + "loss": 0.1639, + "step": 6602 + }, + { + "epoch": 0.31478082616261055, + "grad_norm": 2.0326039791107178, + "learning_rate": 1.468164222204287e-06, + "loss": 0.6725, + "step": 6603 + }, + { + "epoch": 0.31482849855791006, + "grad_norm": 1.7764116525650024, + "learning_rate": 1.4661160075702018e-06, + "loss": 0.3881, + "step": 6604 + }, + { + "epoch": 0.3148761709532095, + "grad_norm": 1.1036642789840698, + "learning_rate": 1.4640691096265358e-06, + "loss": 0.6321, + "step": 6605 + }, + { + "epoch": 0.31492384334850904, + "grad_norm": 3.8955821990966797, + "learning_rate": 1.4620235286891049e-06, + "loss": 1.0674, + "step": 6606 + }, + { + "epoch": 0.31497151574380855, + "grad_norm": 1.4753177165985107, + "learning_rate": 1.4599792650735179e-06, + "loss": 0.5748, + "step": 6607 + }, + { + "epoch": 0.31501918813910806, + "grad_norm": 1.3320003747940063, + "learning_rate": 1.4579363190951845e-06, + "loss": 0.7962, + "step": 6608 + }, + { + "epoch": 0.3150668605344076, + "grad_norm": 1.2139029502868652, + "learning_rate": 1.4558946910693127e-06, + "loss": 0.7905, + "step": 6609 + }, + { + "epoch": 0.31511453292970704, + "grad_norm": 1.4506020545959473, + "learning_rate": 1.453854381310902e-06, + "loss": 0.6546, + "step": 6610 + }, + { + "epoch": 0.31516220532500655, + "grad_norm": 1.3403805494308472, + "learning_rate": 1.451815390134751e-06, + "loss": 0.7319, + "step": 6611 + }, + { + "epoch": 0.31520987772030606, + "grad_norm": 1.6739354133605957, + "learning_rate": 1.449777717855455e-06, + "loss": 0.681, + "step": 6612 + }, + { + "epoch": 0.3152575501156056, + "grad_norm": 1.7977980375289917, + "learning_rate": 1.4477413647874106e-06, + "loss": 1.1114, + "step": 6613 + }, + { + "epoch": 0.31530522251090504, + "grad_norm": 2.4470624923706055, + "learning_rate": 1.4457063312447995e-06, + "loss": 0.9177, + "step": 6614 + }, + { + "epoch": 0.31535289490620455, + 
"grad_norm": 2.64920973777771, + "learning_rate": 1.4436726175416116e-06, + "loss": 1.0572, + "step": 6615 + }, + { + "epoch": 0.31540056730150406, + "grad_norm": 1.1840165853500366, + "learning_rate": 1.4416402239916261e-06, + "loss": 0.4655, + "step": 6616 + }, + { + "epoch": 0.3154482396968036, + "grad_norm": 1.142012357711792, + "learning_rate": 1.4396091509084175e-06, + "loss": 0.6814, + "step": 6617 + }, + { + "epoch": 0.3154959120921031, + "grad_norm": 2.9277257919311523, + "learning_rate": 1.4375793986053622e-06, + "loss": 1.0581, + "step": 6618 + }, + { + "epoch": 0.31554358448740255, + "grad_norm": 2.0775747299194336, + "learning_rate": 1.4355509673956313e-06, + "loss": 0.7016, + "step": 6619 + }, + { + "epoch": 0.31559125688270206, + "grad_norm": 1.4216697216033936, + "learning_rate": 1.4335238575921884e-06, + "loss": 0.7154, + "step": 6620 + }, + { + "epoch": 0.3156389292780016, + "grad_norm": 2.9023005962371826, + "learning_rate": 1.431498069507795e-06, + "loss": 0.6624, + "step": 6621 + }, + { + "epoch": 0.3156866016733011, + "grad_norm": 3.2430179119110107, + "learning_rate": 1.429473603455015e-06, + "loss": 0.8766, + "step": 6622 + }, + { + "epoch": 0.31573427406860055, + "grad_norm": 1.7777272462844849, + "learning_rate": 1.4274504597461946e-06, + "loss": 0.7014, + "step": 6623 + }, + { + "epoch": 0.31578194646390007, + "grad_norm": 1.048928141593933, + "learning_rate": 1.425428638693489e-06, + "loss": 0.5853, + "step": 6624 + }, + { + "epoch": 0.3158296188591996, + "grad_norm": 2.6440327167510986, + "learning_rate": 1.4234081406088463e-06, + "loss": 0.7435, + "step": 6625 + }, + { + "epoch": 0.3158772912544991, + "grad_norm": 1.7596545219421387, + "learning_rate": 1.4213889658040026e-06, + "loss": 0.7789, + "step": 6626 + }, + { + "epoch": 0.3159249636497986, + "grad_norm": 1.6288104057312012, + "learning_rate": 1.4193711145904988e-06, + "loss": 0.9032, + "step": 6627 + }, + { + "epoch": 0.31597263604509807, + "grad_norm": 2.528653144836426, + "learning_rate": 1.4173545872796713e-06, + "loss": 0.1689, + "step": 6628 + }, + { + "epoch": 0.3160203084403976, + "grad_norm": 1.156921625137329, + "learning_rate": 1.4153393841826446e-06, + "loss": 0.7119, + "step": 6629 + }, + { + "epoch": 0.3160679808356971, + "grad_norm": 1.2893879413604736, + "learning_rate": 1.4133255056103478e-06, + "loss": 0.4464, + "step": 6630 + }, + { + "epoch": 0.3161156532309966, + "grad_norm": 1.8145041465759277, + "learning_rate": 1.4113129518735002e-06, + "loss": 0.7846, + "step": 6631 + }, + { + "epoch": 0.31616332562629607, + "grad_norm": 2.8903684616088867, + "learning_rate": 1.4093017232826155e-06, + "loss": 0.5727, + "step": 6632 + }, + { + "epoch": 0.3162109980215956, + "grad_norm": 3.1536953449249268, + "learning_rate": 1.4072918201480078e-06, + "loss": 0.6654, + "step": 6633 + }, + { + "epoch": 0.3162586704168951, + "grad_norm": 2.0173208713531494, + "learning_rate": 1.405283242779787e-06, + "loss": 1.0501, + "step": 6634 + }, + { + "epoch": 0.3163063428121946, + "grad_norm": 1.5940346717834473, + "learning_rate": 1.4032759914878501e-06, + "loss": 0.9135, + "step": 6635 + }, + { + "epoch": 0.3163540152074941, + "grad_norm": 1.376216173171997, + "learning_rate": 1.401270066581899e-06, + "loss": 0.3843, + "step": 6636 + }, + { + "epoch": 0.3164016876027936, + "grad_norm": 2.7516770362854004, + "learning_rate": 1.3992654683714303e-06, + "loss": 1.0433, + "step": 6637 + }, + { + "epoch": 0.3164493599980931, + "grad_norm": 3.3411176204681396, + "learning_rate": 1.397262197165725e-06, + "loss": 
0.5809, + "step": 6638 + }, + { + "epoch": 0.3164970323933926, + "grad_norm": 2.1718568801879883, + "learning_rate": 1.3952602532738734e-06, + "loss": 0.497, + "step": 6639 + }, + { + "epoch": 0.3165447047886921, + "grad_norm": 2.2344655990600586, + "learning_rate": 1.3932596370047547e-06, + "loss": 0.9163, + "step": 6640 + }, + { + "epoch": 0.31659237718399164, + "grad_norm": 1.270484447479248, + "learning_rate": 1.3912603486670396e-06, + "loss": 0.8538, + "step": 6641 + }, + { + "epoch": 0.3166400495792911, + "grad_norm": 1.4166371822357178, + "learning_rate": 1.3892623885692003e-06, + "loss": 0.6165, + "step": 6642 + }, + { + "epoch": 0.3166877219745906, + "grad_norm": 1.5425513982772827, + "learning_rate": 1.3872657570195025e-06, + "loss": 0.4812, + "step": 6643 + }, + { + "epoch": 0.3167353943698901, + "grad_norm": 1.6991596221923828, + "learning_rate": 1.385270454326002e-06, + "loss": 0.8697, + "step": 6644 + }, + { + "epoch": 0.31678306676518964, + "grad_norm": 2.2527730464935303, + "learning_rate": 1.3832764807965582e-06, + "loss": 0.6057, + "step": 6645 + }, + { + "epoch": 0.3168307391604891, + "grad_norm": 0.9060356616973877, + "learning_rate": 1.3812838367388171e-06, + "loss": 0.48, + "step": 6646 + }, + { + "epoch": 0.3168784115557886, + "grad_norm": 1.4182647466659546, + "learning_rate": 1.379292522460225e-06, + "loss": 0.3352, + "step": 6647 + }, + { + "epoch": 0.3169260839510881, + "grad_norm": 3.791928291320801, + "learning_rate": 1.3773025382680195e-06, + "loss": 0.8462, + "step": 6648 + }, + { + "epoch": 0.31697375634638764, + "grad_norm": 1.4614535570144653, + "learning_rate": 1.3753138844692348e-06, + "loss": 0.6568, + "step": 6649 + }, + { + "epoch": 0.31702142874168715, + "grad_norm": 1.1077797412872314, + "learning_rate": 1.3733265613707037e-06, + "loss": 0.4703, + "step": 6650 + }, + { + "epoch": 0.3170691011369866, + "grad_norm": 8.790328025817871, + "learning_rate": 1.3713405692790448e-06, + "loss": 0.5406, + "step": 6651 + }, + { + "epoch": 0.3171167735322861, + "grad_norm": 1.914686679840088, + "learning_rate": 1.3693559085006768e-06, + "loss": 0.6805, + "step": 6652 + }, + { + "epoch": 0.31716444592758564, + "grad_norm": 1.6303683519363403, + "learning_rate": 1.367372579341817e-06, + "loss": 0.4999, + "step": 6653 + }, + { + "epoch": 0.31721211832288515, + "grad_norm": 1.5619796514511108, + "learning_rate": 1.3653905821084668e-06, + "loss": 0.6573, + "step": 6654 + }, + { + "epoch": 0.3172597907181846, + "grad_norm": 2.1431751251220703, + "learning_rate": 1.3634099171064297e-06, + "loss": 0.9527, + "step": 6655 + }, + { + "epoch": 0.3173074631134841, + "grad_norm": 1.3966847658157349, + "learning_rate": 1.3614305846413056e-06, + "loss": 0.8538, + "step": 6656 + }, + { + "epoch": 0.31735513550878364, + "grad_norm": 1.201562523841858, + "learning_rate": 1.3594525850184803e-06, + "loss": 0.8162, + "step": 6657 + }, + { + "epoch": 0.31740280790408315, + "grad_norm": 2.9309046268463135, + "learning_rate": 1.3574759185431408e-06, + "loss": 0.4181, + "step": 6658 + }, + { + "epoch": 0.31745048029938266, + "grad_norm": 2.211801052093506, + "learning_rate": 1.3555005855202674e-06, + "loss": 0.9168, + "step": 6659 + }, + { + "epoch": 0.3174981526946821, + "grad_norm": 2.020312786102295, + "learning_rate": 1.3535265862546333e-06, + "loss": 0.9857, + "step": 6660 + }, + { + "epoch": 0.31754582508998164, + "grad_norm": 1.5531432628631592, + "learning_rate": 1.3515539210508033e-06, + "loss": 0.5855, + "step": 6661 + }, + { + "epoch": 0.31759349748528115, + "grad_norm": 
2.2170095443725586, + "learning_rate": 1.3495825902131443e-06, + "loss": 0.8875, + "step": 6662 + }, + { + "epoch": 0.31764116988058066, + "grad_norm": 1.1779119968414307, + "learning_rate": 1.3476125940458062e-06, + "loss": 0.284, + "step": 6663 + }, + { + "epoch": 0.3176888422758802, + "grad_norm": 1.4040954113006592, + "learning_rate": 1.3456439328527426e-06, + "loss": 0.5142, + "step": 6664 + }, + { + "epoch": 0.31773651467117964, + "grad_norm": 1.2396644353866577, + "learning_rate": 1.3436766069377006e-06, + "loss": 0.3101, + "step": 6665 + }, + { + "epoch": 0.31778418706647915, + "grad_norm": 1.9658242464065552, + "learning_rate": 1.3417106166042127e-06, + "loss": 0.6576, + "step": 6666 + }, + { + "epoch": 0.31783185946177867, + "grad_norm": 3.4210305213928223, + "learning_rate": 1.339745962155613e-06, + "loss": 0.3107, + "step": 6667 + }, + { + "epoch": 0.3178795318570782, + "grad_norm": 1.7822257280349731, + "learning_rate": 1.3377826438950315e-06, + "loss": 0.6504, + "step": 6668 + }, + { + "epoch": 0.31792720425237764, + "grad_norm": 1.1864222288131714, + "learning_rate": 1.3358206621253812e-06, + "loss": 0.4216, + "step": 6669 + }, + { + "epoch": 0.31797487664767715, + "grad_norm": 2.1839072704315186, + "learning_rate": 1.3338600171493787e-06, + "loss": 0.7742, + "step": 6670 + }, + { + "epoch": 0.31802254904297667, + "grad_norm": 2.5582492351531982, + "learning_rate": 1.3319007092695346e-06, + "loss": 1.1295, + "step": 6671 + }, + { + "epoch": 0.3180702214382762, + "grad_norm": 1.7420909404754639, + "learning_rate": 1.3299427387881436e-06, + "loss": 0.7763, + "step": 6672 + }, + { + "epoch": 0.3181178938335757, + "grad_norm": 1.87956702709198, + "learning_rate": 1.327986106007305e-06, + "loss": 0.9161, + "step": 6673 + }, + { + "epoch": 0.31816556622887515, + "grad_norm": 2.6080970764160156, + "learning_rate": 1.3260308112289066e-06, + "loss": 1.44, + "step": 6674 + }, + { + "epoch": 0.31821323862417467, + "grad_norm": 3.1158368587493896, + "learning_rate": 1.3240768547546302e-06, + "loss": 1.006, + "step": 6675 + }, + { + "epoch": 0.3182609110194742, + "grad_norm": 1.2281715869903564, + "learning_rate": 1.3221242368859489e-06, + "loss": 0.7667, + "step": 6676 + }, + { + "epoch": 0.3183085834147737, + "grad_norm": 1.4579689502716064, + "learning_rate": 1.320172957924134e-06, + "loss": 0.6034, + "step": 6677 + }, + { + "epoch": 0.31835625581007315, + "grad_norm": 1.3187172412872314, + "learning_rate": 1.318223018170245e-06, + "loss": 0.3434, + "step": 6678 + }, + { + "epoch": 0.31840392820537267, + "grad_norm": 1.7120447158813477, + "learning_rate": 1.3162744179251396e-06, + "loss": 0.6513, + "step": 6679 + }, + { + "epoch": 0.3184516006006722, + "grad_norm": 4.401436805725098, + "learning_rate": 1.3143271574894677e-06, + "loss": 1.8345, + "step": 6680 + }, + { + "epoch": 0.3184992729959717, + "grad_norm": 0.9458251595497131, + "learning_rate": 1.3123812371636691e-06, + "loss": 0.5839, + "step": 6681 + }, + { + "epoch": 0.3185469453912712, + "grad_norm": 1.7610641717910767, + "learning_rate": 1.3104366572479798e-06, + "loss": 0.7357, + "step": 6682 + }, + { + "epoch": 0.31859461778657067, + "grad_norm": 2.6764273643493652, + "learning_rate": 1.3084934180424324e-06, + "loss": 0.9112, + "step": 6683 + }, + { + "epoch": 0.3186422901818702, + "grad_norm": 2.67384672164917, + "learning_rate": 1.3065515198468425e-06, + "loss": 0.8334, + "step": 6684 + }, + { + "epoch": 0.3186899625771697, + "grad_norm": 2.8001441955566406, + "learning_rate": 1.3046109629608273e-06, + "loss": 0.9927, + 
"step": 6685 + }, + { + "epoch": 0.3187376349724692, + "grad_norm": 1.3272708654403687, + "learning_rate": 1.302671747683798e-06, + "loss": 0.7976, + "step": 6686 + }, + { + "epoch": 0.31878530736776867, + "grad_norm": 2.6087403297424316, + "learning_rate": 1.3007338743149511e-06, + "loss": 1.1775, + "step": 6687 + }, + { + "epoch": 0.3188329797630682, + "grad_norm": 1.4888510704040527, + "learning_rate": 1.2987973431532818e-06, + "loss": 0.7254, + "step": 6688 + }, + { + "epoch": 0.3188806521583677, + "grad_norm": 1.4774237871170044, + "learning_rate": 1.296862154497579e-06, + "loss": 0.6758, + "step": 6689 + }, + { + "epoch": 0.3189283245536672, + "grad_norm": 1.4781259298324585, + "learning_rate": 1.2949283086464192e-06, + "loss": 0.6349, + "step": 6690 + }, + { + "epoch": 0.3189759969489667, + "grad_norm": 0.810100257396698, + "learning_rate": 1.2929958058981796e-06, + "loss": 0.5273, + "step": 6691 + }, + { + "epoch": 0.3190236693442662, + "grad_norm": 1.7063322067260742, + "learning_rate": 1.291064646551019e-06, + "loss": 0.8014, + "step": 6692 + }, + { + "epoch": 0.3190713417395657, + "grad_norm": 2.5724058151245117, + "learning_rate": 1.2891348309029005e-06, + "loss": 0.5987, + "step": 6693 + }, + { + "epoch": 0.3191190141348652, + "grad_norm": 2.300093173980713, + "learning_rate": 1.2872063592515716e-06, + "loss": 0.6093, + "step": 6694 + }, + { + "epoch": 0.3191666865301647, + "grad_norm": 1.4376941919326782, + "learning_rate": 1.2852792318945773e-06, + "loss": 0.7723, + "step": 6695 + }, + { + "epoch": 0.31921435892546424, + "grad_norm": 2.926440477371216, + "learning_rate": 1.2833534491292554e-06, + "loss": 1.0, + "step": 6696 + }, + { + "epoch": 0.3192620313207637, + "grad_norm": 1.99180006980896, + "learning_rate": 1.2814290112527295e-06, + "loss": 0.6256, + "step": 6697 + }, + { + "epoch": 0.3193097037160632, + "grad_norm": 2.6852056980133057, + "learning_rate": 1.279505918561923e-06, + "loss": 1.0226, + "step": 6698 + }, + { + "epoch": 0.3193573761113627, + "grad_norm": 1.7871979475021362, + "learning_rate": 1.2775841713535532e-06, + "loss": 0.503, + "step": 6699 + }, + { + "epoch": 0.31940504850666224, + "grad_norm": 2.6340057849884033, + "learning_rate": 1.2756637699241181e-06, + "loss": 1.2569, + "step": 6700 + }, + { + "epoch": 0.3194527209019617, + "grad_norm": 2.8678417205810547, + "learning_rate": 1.273744714569921e-06, + "loss": 0.7783, + "step": 6701 + }, + { + "epoch": 0.3195003932972612, + "grad_norm": 25.8961181640625, + "learning_rate": 1.271827005587054e-06, + "loss": 0.9727, + "step": 6702 + }, + { + "epoch": 0.3195480656925607, + "grad_norm": 2.0242180824279785, + "learning_rate": 1.2699106432713947e-06, + "loss": 0.8678, + "step": 6703 + }, + { + "epoch": 0.31959573808786024, + "grad_norm": 1.177913784980774, + "learning_rate": 1.2679956279186234e-06, + "loss": 0.6598, + "step": 6704 + }, + { + "epoch": 0.31964341048315975, + "grad_norm": 1.3602626323699951, + "learning_rate": 1.2660819598242013e-06, + "loss": 0.4607, + "step": 6705 + }, + { + "epoch": 0.3196910828784592, + "grad_norm": 2.882046937942505, + "learning_rate": 1.2641696392833935e-06, + "loss": 1.0607, + "step": 6706 + }, + { + "epoch": 0.3197387552737587, + "grad_norm": 1.8627959489822388, + "learning_rate": 1.262258666591246e-06, + "loss": 0.7894, + "step": 6707 + }, + { + "epoch": 0.31978642766905824, + "grad_norm": 3.0506558418273926, + "learning_rate": 1.260349042042608e-06, + "loss": 1.368, + "step": 6708 + }, + { + "epoch": 0.31983410006435775, + "grad_norm": 1.4230533838272095, + 
"learning_rate": 1.2584407659321086e-06, + "loss": 0.807, + "step": 6709 + }, + { + "epoch": 0.3198817724596572, + "grad_norm": 1.2494001388549805, + "learning_rate": 1.2565338385541792e-06, + "loss": 0.5562, + "step": 6710 + }, + { + "epoch": 0.3199294448549567, + "grad_norm": 2.4897875785827637, + "learning_rate": 1.2546282602030402e-06, + "loss": 1.0848, + "step": 6711 + }, + { + "epoch": 0.31997711725025624, + "grad_norm": 0.945253312587738, + "learning_rate": 1.2527240311726985e-06, + "loss": 0.2731, + "step": 6712 + }, + { + "epoch": 0.32002478964555575, + "grad_norm": 1.7186779975891113, + "learning_rate": 1.2508211517569592e-06, + "loss": 0.9098, + "step": 6713 + }, + { + "epoch": 0.32007246204085527, + "grad_norm": 1.7863162755966187, + "learning_rate": 1.2489196222494193e-06, + "loss": 0.535, + "step": 6714 + }, + { + "epoch": 0.3201201344361547, + "grad_norm": 1.260798454284668, + "learning_rate": 1.2470194429434601e-06, + "loss": 0.818, + "step": 6715 + }, + { + "epoch": 0.32016780683145424, + "grad_norm": 2.9051167964935303, + "learning_rate": 1.2451206141322635e-06, + "loss": 0.6785, + "step": 6716 + }, + { + "epoch": 0.32021547922675375, + "grad_norm": 2.0958476066589355, + "learning_rate": 1.243223136108801e-06, + "loss": 0.9215, + "step": 6717 + }, + { + "epoch": 0.32026315162205327, + "grad_norm": 2.3783397674560547, + "learning_rate": 1.241327009165828e-06, + "loss": 0.5876, + "step": 6718 + }, + { + "epoch": 0.3203108240173527, + "grad_norm": 1.8721314668655396, + "learning_rate": 1.239432233595903e-06, + "loss": 0.9631, + "step": 6719 + }, + { + "epoch": 0.32035849641265224, + "grad_norm": 3.1029164791107178, + "learning_rate": 1.2375388096913666e-06, + "loss": 0.2165, + "step": 6720 + }, + { + "epoch": 0.32040616880795175, + "grad_norm": 2.7242085933685303, + "learning_rate": 1.235646737744357e-06, + "loss": 0.5013, + "step": 6721 + }, + { + "epoch": 0.32045384120325127, + "grad_norm": 1.874029278755188, + "learning_rate": 1.2337560180467988e-06, + "loss": 0.65, + "step": 6722 + }, + { + "epoch": 0.3205015135985508, + "grad_norm": 1.5420160293579102, + "learning_rate": 1.2318666508904143e-06, + "loss": 0.6374, + "step": 6723 + }, + { + "epoch": 0.32054918599385024, + "grad_norm": 6.614696502685547, + "learning_rate": 1.2299786365667088e-06, + "loss": 0.3752, + "step": 6724 + }, + { + "epoch": 0.32059685838914975, + "grad_norm": 4.222172260284424, + "learning_rate": 1.2280919753669863e-06, + "loss": 0.3931, + "step": 6725 + }, + { + "epoch": 0.32064453078444927, + "grad_norm": 2.2805488109588623, + "learning_rate": 1.226206667582338e-06, + "loss": 0.8805, + "step": 6726 + }, + { + "epoch": 0.3206922031797488, + "grad_norm": 2.7942402362823486, + "learning_rate": 1.2243227135036517e-06, + "loss": 1.2933, + "step": 6727 + }, + { + "epoch": 0.3207398755750483, + "grad_norm": 2.5557541847229004, + "learning_rate": 1.2224401134215957e-06, + "loss": 0.6964, + "step": 6728 + }, + { + "epoch": 0.32078754797034775, + "grad_norm": 1.5781320333480835, + "learning_rate": 1.220558867626639e-06, + "loss": 0.7256, + "step": 6729 + }, + { + "epoch": 0.32083522036564727, + "grad_norm": 1.4425690174102783, + "learning_rate": 1.2186789764090412e-06, + "loss": 0.9856, + "step": 6730 + }, + { + "epoch": 0.3208828927609468, + "grad_norm": 2.1655056476593018, + "learning_rate": 1.216800440058844e-06, + "loss": 0.8455, + "step": 6731 + }, + { + "epoch": 0.3209305651562463, + "grad_norm": 1.1953462362289429, + "learning_rate": 1.21492325886589e-06, + "loss": 0.4475, + "step": 6732 + }, + { + 
"epoch": 0.32097823755154575, + "grad_norm": 3.858290195465088, + "learning_rate": 1.2130474331198106e-06, + "loss": 0.1909, + "step": 6733 + }, + { + "epoch": 0.32102590994684527, + "grad_norm": 1.1652231216430664, + "learning_rate": 1.2111729631100211e-06, + "loss": 0.4421, + "step": 6734 + }, + { + "epoch": 0.3210735823421448, + "grad_norm": 1.5531299114227295, + "learning_rate": 1.209299849125739e-06, + "loss": 0.906, + "step": 6735 + }, + { + "epoch": 0.3211212547374443, + "grad_norm": 1.4137942790985107, + "learning_rate": 1.2074280914559634e-06, + "loss": 0.6741, + "step": 6736 + }, + { + "epoch": 0.3211689271327438, + "grad_norm": 2.3914706707000732, + "learning_rate": 1.205557690389485e-06, + "loss": 0.2413, + "step": 6737 + }, + { + "epoch": 0.32121659952804327, + "grad_norm": 7.511162757873535, + "learning_rate": 1.20368864621489e-06, + "loss": 0.5593, + "step": 6738 + }, + { + "epoch": 0.3212642719233428, + "grad_norm": 2.186293363571167, + "learning_rate": 1.2018209592205542e-06, + "loss": 0.1307, + "step": 6739 + }, + { + "epoch": 0.3213119443186423, + "grad_norm": 1.970169186592102, + "learning_rate": 1.1999546296946386e-06, + "loss": 0.391, + "step": 6740 + }, + { + "epoch": 0.3213596167139418, + "grad_norm": 1.7000867128372192, + "learning_rate": 1.198089657925101e-06, + "loss": 0.5987, + "step": 6741 + }, + { + "epoch": 0.32140728910924127, + "grad_norm": 6.522705554962158, + "learning_rate": 1.1962260441996888e-06, + "loss": 0.6656, + "step": 6742 + }, + { + "epoch": 0.3214549615045408, + "grad_norm": 1.88416588306427, + "learning_rate": 1.1943637888059346e-06, + "loss": 0.8614, + "step": 6743 + }, + { + "epoch": 0.3215026338998403, + "grad_norm": 1.5290926694869995, + "learning_rate": 1.1925028920311676e-06, + "loss": 0.9489, + "step": 6744 + }, + { + "epoch": 0.3215503062951398, + "grad_norm": 3.1635072231292725, + "learning_rate": 1.1906433541625063e-06, + "loss": 0.7795, + "step": 6745 + }, + { + "epoch": 0.3215979786904393, + "grad_norm": 1.7384538650512695, + "learning_rate": 1.1887851754868551e-06, + "loss": 0.5209, + "step": 6746 + }, + { + "epoch": 0.3216456510857388, + "grad_norm": 0.8195374011993408, + "learning_rate": 1.1869283562909128e-06, + "loss": 0.2465, + "step": 6747 + }, + { + "epoch": 0.3216933234810383, + "grad_norm": 1.2265413999557495, + "learning_rate": 1.1850728968611702e-06, + "loss": 0.5326, + "step": 6748 + }, + { + "epoch": 0.3217409958763378, + "grad_norm": 1.1900591850280762, + "learning_rate": 1.1832187974839015e-06, + "loss": 0.5931, + "step": 6749 + }, + { + "epoch": 0.3217886682716373, + "grad_norm": 1.3428053855895996, + "learning_rate": 1.181366058445179e-06, + "loss": 0.5351, + "step": 6750 + }, + { + "epoch": 0.32183634066693684, + "grad_norm": 1.355079174041748, + "learning_rate": 1.17951468003086e-06, + "loss": 0.8475, + "step": 6751 + }, + { + "epoch": 0.3218840130622363, + "grad_norm": 2.929975986480713, + "learning_rate": 1.1776646625265897e-06, + "loss": 1.1311, + "step": 6752 + }, + { + "epoch": 0.3219316854575358, + "grad_norm": 1.4964652061462402, + "learning_rate": 1.1758160062178093e-06, + "loss": 0.7599, + "step": 6753 + }, + { + "epoch": 0.3219793578528353, + "grad_norm": 3.3741443157196045, + "learning_rate": 1.1739687113897501e-06, + "loss": 1.1203, + "step": 6754 + }, + { + "epoch": 0.32202703024813484, + "grad_norm": 2.0097317695617676, + "learning_rate": 1.1721227783274259e-06, + "loss": 0.6591, + "step": 6755 + }, + { + "epoch": 0.3220747026434343, + "grad_norm": 2.9421274662017822, + "learning_rate": 
1.1702782073156482e-06, + "loss": 0.9512, + "step": 6756 + }, + { + "epoch": 0.3221223750387338, + "grad_norm": 1.4162436723709106, + "learning_rate": 1.1684349986390154e-06, + "loss": 0.8846, + "step": 6757 + }, + { + "epoch": 0.3221700474340333, + "grad_norm": 2.548219680786133, + "learning_rate": 1.166593152581914e-06, + "loss": 0.9899, + "step": 6758 + }, + { + "epoch": 0.32221771982933284, + "grad_norm": 1.7047868967056274, + "learning_rate": 1.1647526694285216e-06, + "loss": 0.7265, + "step": 6759 + }, + { + "epoch": 0.32226539222463235, + "grad_norm": 1.1510308980941772, + "learning_rate": 1.1629135494628097e-06, + "loss": 0.7055, + "step": 6760 + }, + { + "epoch": 0.3223130646199318, + "grad_norm": 2.1071419715881348, + "learning_rate": 1.1610757929685301e-06, + "loss": 0.6876, + "step": 6761 + }, + { + "epoch": 0.3223607370152313, + "grad_norm": 1.4728981256484985, + "learning_rate": 1.1592394002292328e-06, + "loss": 0.6636, + "step": 6762 + }, + { + "epoch": 0.32240840941053084, + "grad_norm": 2.160444974899292, + "learning_rate": 1.1574043715282557e-06, + "loss": 0.951, + "step": 6763 + }, + { + "epoch": 0.32245608180583035, + "grad_norm": 2.8908801078796387, + "learning_rate": 1.155570707148721e-06, + "loss": 1.1914, + "step": 6764 + }, + { + "epoch": 0.3225037542011298, + "grad_norm": 1.327877402305603, + "learning_rate": 1.153738407373548e-06, + "loss": 0.6152, + "step": 6765 + }, + { + "epoch": 0.3225514265964293, + "grad_norm": 1.8120874166488647, + "learning_rate": 1.1519074724854373e-06, + "loss": 0.6147, + "step": 6766 + }, + { + "epoch": 0.32259909899172884, + "grad_norm": 1.7921767234802246, + "learning_rate": 1.1500779027668885e-06, + "loss": 0.7633, + "step": 6767 + }, + { + "epoch": 0.32264677138702835, + "grad_norm": 1.6870146989822388, + "learning_rate": 1.1482496985001812e-06, + "loss": 0.7307, + "step": 6768 + }, + { + "epoch": 0.32269444378232787, + "grad_norm": 1.9880839586257935, + "learning_rate": 1.1464228599673889e-06, + "loss": 0.9944, + "step": 6769 + }, + { + "epoch": 0.3227421161776273, + "grad_norm": 1.5080907344818115, + "learning_rate": 1.144597387450378e-06, + "loss": 0.8737, + "step": 6770 + }, + { + "epoch": 0.32278978857292684, + "grad_norm": 1.4312371015548706, + "learning_rate": 1.1427732812307945e-06, + "loss": 0.9552, + "step": 6771 + }, + { + "epoch": 0.32283746096822635, + "grad_norm": 2.1981687545776367, + "learning_rate": 1.1409505415900823e-06, + "loss": 0.9216, + "step": 6772 + }, + { + "epoch": 0.32288513336352587, + "grad_norm": 1.8863228559494019, + "learning_rate": 1.139129168809473e-06, + "loss": 0.9831, + "step": 6773 + }, + { + "epoch": 0.3229328057588253, + "grad_norm": 0.7570465207099915, + "learning_rate": 1.1373091631699817e-06, + "loss": 0.1448, + "step": 6774 + }, + { + "epoch": 0.32298047815412484, + "grad_norm": 2.2315726280212402, + "learning_rate": 1.1354905249524184e-06, + "loss": 0.274, + "step": 6775 + }, + { + "epoch": 0.32302815054942435, + "grad_norm": 3.448110818862915, + "learning_rate": 1.133673254437383e-06, + "loss": 1.1285, + "step": 6776 + }, + { + "epoch": 0.32307582294472387, + "grad_norm": 1.4017457962036133, + "learning_rate": 1.1318573519052556e-06, + "loss": 0.841, + "step": 6777 + }, + { + "epoch": 0.3231234953400234, + "grad_norm": 2.547318935394287, + "learning_rate": 1.1300428176362155e-06, + "loss": 1.6176, + "step": 6778 + }, + { + "epoch": 0.32317116773532284, + "grad_norm": 2.899115800857544, + "learning_rate": 1.1282296519102277e-06, + "loss": 0.3613, + "step": 6779 + }, + { + "epoch": 
0.32321884013062235, + "grad_norm": 1.8724870681762695, + "learning_rate": 1.1264178550070427e-06, + "loss": 0.7452, + "step": 6780 + }, + { + "epoch": 0.32326651252592187, + "grad_norm": 1.8066425323486328, + "learning_rate": 1.1246074272062012e-06, + "loss": 0.7663, + "step": 6781 + }, + { + "epoch": 0.3233141849212214, + "grad_norm": 1.3729069232940674, + "learning_rate": 1.1227983687870358e-06, + "loss": 0.6915, + "step": 6782 + }, + { + "epoch": 0.3233618573165209, + "grad_norm": 1.281703233718872, + "learning_rate": 1.120990680028663e-06, + "loss": 0.7031, + "step": 6783 + }, + { + "epoch": 0.32340952971182035, + "grad_norm": 2.4891021251678467, + "learning_rate": 1.119184361209993e-06, + "loss": 0.5118, + "step": 6784 + }, + { + "epoch": 0.32345720210711987, + "grad_norm": 3.2797603607177734, + "learning_rate": 1.1173794126097226e-06, + "loss": 0.1452, + "step": 6785 + }, + { + "epoch": 0.3235048745024194, + "grad_norm": 2.2303245067596436, + "learning_rate": 1.1155758345063328e-06, + "loss": 0.9356, + "step": 6786 + }, + { + "epoch": 0.3235525468977189, + "grad_norm": 2.5446243286132812, + "learning_rate": 1.1137736271781007e-06, + "loss": 0.7394, + "step": 6787 + }, + { + "epoch": 0.32360021929301835, + "grad_norm": 1.061785101890564, + "learning_rate": 1.1119727909030897e-06, + "loss": 0.5001, + "step": 6788 + }, + { + "epoch": 0.32364789168831787, + "grad_norm": 3.6874563694000244, + "learning_rate": 1.1101733259591453e-06, + "loss": 0.4842, + "step": 6789 + }, + { + "epoch": 0.3236955640836174, + "grad_norm": 1.8435956239700317, + "learning_rate": 1.1083752326239094e-06, + "loss": 0.7229, + "step": 6790 + }, + { + "epoch": 0.3237432364789169, + "grad_norm": 3.200688362121582, + "learning_rate": 1.1065785111748117e-06, + "loss": 0.6165, + "step": 6791 + }, + { + "epoch": 0.3237909088742164, + "grad_norm": 2.1181397438049316, + "learning_rate": 1.1047831618890625e-06, + "loss": 0.9953, + "step": 6792 + }, + { + "epoch": 0.32383858126951587, + "grad_norm": 2.3873510360717773, + "learning_rate": 1.1029891850436691e-06, + "loss": 0.6344, + "step": 6793 + }, + { + "epoch": 0.3238862536648154, + "grad_norm": 6.07340669631958, + "learning_rate": 1.1011965809154245e-06, + "loss": 1.1416, + "step": 6794 + }, + { + "epoch": 0.3239339260601149, + "grad_norm": 1.9123923778533936, + "learning_rate": 1.0994053497809077e-06, + "loss": 0.8761, + "step": 6795 + }, + { + "epoch": 0.3239815984554144, + "grad_norm": 3.459238052368164, + "learning_rate": 1.097615491916485e-06, + "loss": 0.8978, + "step": 6796 + }, + { + "epoch": 0.32402927085071387, + "grad_norm": 2.479867935180664, + "learning_rate": 1.0958270075983167e-06, + "loss": 1.0981, + "step": 6797 + }, + { + "epoch": 0.3240769432460134, + "grad_norm": 4.167263031005859, + "learning_rate": 1.0940398971023447e-06, + "loss": 0.5798, + "step": 6798 + }, + { + "epoch": 0.3241246156413129, + "grad_norm": 1.8770060539245605, + "learning_rate": 1.0922541607043024e-06, + "loss": 1.0562, + "step": 6799 + }, + { + "epoch": 0.3241722880366124, + "grad_norm": 2.877319812774658, + "learning_rate": 1.0904697986797131e-06, + "loss": 0.831, + "step": 6800 + }, + { + "epoch": 0.3242199604319119, + "grad_norm": 0.9527406692504883, + "learning_rate": 1.0886868113038817e-06, + "loss": 0.3654, + "step": 6801 + }, + { + "epoch": 0.3242676328272114, + "grad_norm": 1.6785608530044556, + "learning_rate": 1.0869051988519063e-06, + "loss": 0.7435, + "step": 6802 + }, + { + "epoch": 0.3243153052225109, + "grad_norm": 1.3738272190093994, + "learning_rate": 
1.0851249615986715e-06, + "loss": 0.653, + "step": 6803 + }, + { + "epoch": 0.3243629776178104, + "grad_norm": 3.6544318199157715, + "learning_rate": 1.0833460998188516e-06, + "loss": 0.6799, + "step": 6804 + }, + { + "epoch": 0.3244106500131099, + "grad_norm": 1.6429892778396606, + "learning_rate": 1.081568613786903e-06, + "loss": 1.0817, + "step": 6805 + }, + { + "epoch": 0.3244583224084094, + "grad_norm": 1.6231443881988525, + "learning_rate": 1.079792503777075e-06, + "loss": 0.506, + "step": 6806 + }, + { + "epoch": 0.3245059948037089, + "grad_norm": 1.4431722164154053, + "learning_rate": 1.0780177700634053e-06, + "loss": 0.6893, + "step": 6807 + }, + { + "epoch": 0.3245536671990084, + "grad_norm": 1.261094331741333, + "learning_rate": 1.0762444129197136e-06, + "loss": 0.2204, + "step": 6808 + }, + { + "epoch": 0.3246013395943079, + "grad_norm": 4.121915340423584, + "learning_rate": 1.0744724326196133e-06, + "loss": 0.9662, + "step": 6809 + }, + { + "epoch": 0.32464901198960744, + "grad_norm": 1.5398530960083008, + "learning_rate": 1.0727018294364999e-06, + "loss": 0.5778, + "step": 6810 + }, + { + "epoch": 0.3246966843849069, + "grad_norm": 1.2922009229660034, + "learning_rate": 1.070932603643563e-06, + "loss": 0.6491, + "step": 6811 + }, + { + "epoch": 0.3247443567802064, + "grad_norm": 1.8126953840255737, + "learning_rate": 1.0691647555137719e-06, + "loss": 0.8161, + "step": 6812 + }, + { + "epoch": 0.3247920291755059, + "grad_norm": 1.652815341949463, + "learning_rate": 1.0673982853198906e-06, + "loss": 0.7883, + "step": 6813 + }, + { + "epoch": 0.32483970157080544, + "grad_norm": 1.5472791194915771, + "learning_rate": 1.0656331933344643e-06, + "loss": 0.7809, + "step": 6814 + }, + { + "epoch": 0.32488737396610495, + "grad_norm": 1.0428762435913086, + "learning_rate": 1.06386947982983e-06, + "loss": 0.39, + "step": 6815 + }, + { + "epoch": 0.3249350463614044, + "grad_norm": 1.4960932731628418, + "learning_rate": 1.0621071450781118e-06, + "loss": 0.9479, + "step": 6816 + }, + { + "epoch": 0.3249827187567039, + "grad_norm": 1.3536639213562012, + "learning_rate": 1.060346189351218e-06, + "loss": 0.7426, + "step": 6817 + }, + { + "epoch": 0.32503039115200344, + "grad_norm": 2.0714895725250244, + "learning_rate": 1.0585866129208456e-06, + "loss": 0.4976, + "step": 6818 + }, + { + "epoch": 0.32507806354730295, + "grad_norm": 2.729680061340332, + "learning_rate": 1.0568284160584818e-06, + "loss": 0.8672, + "step": 6819 + }, + { + "epoch": 0.3251257359426024, + "grad_norm": 2.2662999629974365, + "learning_rate": 1.0550715990353955e-06, + "loss": 1.1256, + "step": 6820 + }, + { + "epoch": 0.3251734083379019, + "grad_norm": 0.9396215081214905, + "learning_rate": 1.0533161621226463e-06, + "loss": 0.3311, + "step": 6821 + }, + { + "epoch": 0.32522108073320144, + "grad_norm": 1.3509668111801147, + "learning_rate": 1.051562105591082e-06, + "loss": 0.7507, + "step": 6822 + }, + { + "epoch": 0.32526875312850095, + "grad_norm": 3.611696720123291, + "learning_rate": 1.0498094297113314e-06, + "loss": 0.1463, + "step": 6823 + }, + { + "epoch": 0.32531642552380047, + "grad_norm": 1.558960199356079, + "learning_rate": 1.0480581347538199e-06, + "loss": 0.4143, + "step": 6824 + }, + { + "epoch": 0.3253640979190999, + "grad_norm": 1.4471542835235596, + "learning_rate": 1.0463082209887477e-06, + "loss": 0.641, + "step": 6825 + }, + { + "epoch": 0.32541177031439944, + "grad_norm": 2.056330919265747, + "learning_rate": 1.0445596886861143e-06, + "loss": 0.819, + "step": 6826 + }, + { + "epoch": 
0.32545944270969895, + "grad_norm": 2.165382146835327, + "learning_rate": 1.0428125381156962e-06, + "loss": 0.6796, + "step": 6827 + }, + { + "epoch": 0.32550711510499847, + "grad_norm": 1.7400048971176147, + "learning_rate": 1.0410667695470633e-06, + "loss": 0.785, + "step": 6828 + }, + { + "epoch": 0.3255547875002979, + "grad_norm": 2.4737558364868164, + "learning_rate": 1.039322383249568e-06, + "loss": 0.5966, + "step": 6829 + }, + { + "epoch": 0.32560245989559744, + "grad_norm": 2.14703631401062, + "learning_rate": 1.0375793794923505e-06, + "loss": 0.5545, + "step": 6830 + }, + { + "epoch": 0.32565013229089695, + "grad_norm": 2.659785509109497, + "learning_rate": 1.0358377585443424e-06, + "loss": 0.517, + "step": 6831 + }, + { + "epoch": 0.32569780468619647, + "grad_norm": 1.8968234062194824, + "learning_rate": 1.0340975206742531e-06, + "loss": 0.6559, + "step": 6832 + }, + { + "epoch": 0.325745477081496, + "grad_norm": 1.6274470090866089, + "learning_rate": 1.0323586661505858e-06, + "loss": 0.6857, + "step": 6833 + }, + { + "epoch": 0.32579314947679544, + "grad_norm": 2.677960157394409, + "learning_rate": 1.030621195241629e-06, + "loss": 1.0955, + "step": 6834 + }, + { + "epoch": 0.32584082187209495, + "grad_norm": 2.5976550579071045, + "learning_rate": 1.0288851082154528e-06, + "loss": 0.7696, + "step": 6835 + }, + { + "epoch": 0.32588849426739447, + "grad_norm": 1.0764391422271729, + "learning_rate": 1.0271504053399195e-06, + "loss": 0.7978, + "step": 6836 + }, + { + "epoch": 0.325936166662694, + "grad_norm": 1.5717785358428955, + "learning_rate": 1.0254170868826796e-06, + "loss": 1.0062, + "step": 6837 + }, + { + "epoch": 0.3259838390579935, + "grad_norm": 1.081852912902832, + "learning_rate": 1.0236851531111592e-06, + "loss": 0.4168, + "step": 6838 + }, + { + "epoch": 0.32603151145329295, + "grad_norm": 1.7853864431381226, + "learning_rate": 1.0219546042925842e-06, + "loss": 0.7182, + "step": 6839 + }, + { + "epoch": 0.32607918384859247, + "grad_norm": 1.3578150272369385, + "learning_rate": 1.020225440693956e-06, + "loss": 0.8344, + "step": 6840 + }, + { + "epoch": 0.326126856243892, + "grad_norm": 1.8048217296600342, + "learning_rate": 1.0184976625820707e-06, + "loss": 0.889, + "step": 6841 + }, + { + "epoch": 0.3261745286391915, + "grad_norm": 1.670409917831421, + "learning_rate": 1.0167712702235023e-06, + "loss": 0.4762, + "step": 6842 + }, + { + "epoch": 0.32622220103449096, + "grad_norm": 1.2257848978042603, + "learning_rate": 1.015046263884617e-06, + "loss": 0.5661, + "step": 6843 + }, + { + "epoch": 0.32626987342979047, + "grad_norm": 1.3278084993362427, + "learning_rate": 1.013322643831569e-06, + "loss": 0.526, + "step": 6844 + }, + { + "epoch": 0.32631754582509, + "grad_norm": 1.7151070833206177, + "learning_rate": 1.011600410330289e-06, + "loss": 1.1557, + "step": 6845 + }, + { + "epoch": 0.3263652182203895, + "grad_norm": 1.1286561489105225, + "learning_rate": 1.0098795636465042e-06, + "loss": 0.7666, + "step": 6846 + }, + { + "epoch": 0.326412890615689, + "grad_norm": 3.2658588886260986, + "learning_rate": 1.0081601040457246e-06, + "loss": 1.3919, + "step": 6847 + }, + { + "epoch": 0.32646056301098847, + "grad_norm": 1.2767378091812134, + "learning_rate": 1.00644203179324e-06, + "loss": 0.7611, + "step": 6848 + }, + { + "epoch": 0.326508235406288, + "grad_norm": 3.6402831077575684, + "learning_rate": 1.004725347154134e-06, + "loss": 0.1893, + "step": 6849 + }, + { + "epoch": 0.3265559078015875, + "grad_norm": 1.2136021852493286, + "learning_rate": 
1.0030100503932761e-06, + "loss": 0.583, + "step": 6850 + }, + { + "epoch": 0.326603580196887, + "grad_norm": 1.1178150177001953, + "learning_rate": 1.0012961417753142e-06, + "loss": 0.8505, + "step": 6851 + }, + { + "epoch": 0.32665125259218647, + "grad_norm": 1.1358115673065186, + "learning_rate": 9.995836215646892e-07, + "loss": 0.8125, + "step": 6852 + }, + { + "epoch": 0.326698924987486, + "grad_norm": 2.1993634700775146, + "learning_rate": 9.978724900256265e-07, + "loss": 1.2156, + "step": 6853 + }, + { + "epoch": 0.3267465973827855, + "grad_norm": 5.942317962646484, + "learning_rate": 9.961627474221324e-07, + "loss": 0.4882, + "step": 6854 + }, + { + "epoch": 0.326794269778085, + "grad_norm": 1.7636799812316895, + "learning_rate": 9.944543940180074e-07, + "loss": 0.6522, + "step": 6855 + }, + { + "epoch": 0.3268419421733845, + "grad_norm": 1.9472734928131104, + "learning_rate": 9.927474300768303e-07, + "loss": 0.6525, + "step": 6856 + }, + { + "epoch": 0.326889614568684, + "grad_norm": 1.8993598222732544, + "learning_rate": 9.91041855861965e-07, + "loss": 1.3042, + "step": 6857 + }, + { + "epoch": 0.3269372869639835, + "grad_norm": 1.998908519744873, + "learning_rate": 9.893376716365677e-07, + "loss": 0.8146, + "step": 6858 + }, + { + "epoch": 0.326984959359283, + "grad_norm": 1.3649706840515137, + "learning_rate": 9.87634877663578e-07, + "loss": 0.6859, + "step": 6859 + }, + { + "epoch": 0.3270326317545825, + "grad_norm": 1.9363383054733276, + "learning_rate": 9.859334742057158e-07, + "loss": 0.7996, + "step": 6860 + }, + { + "epoch": 0.327080304149882, + "grad_norm": 1.5329622030258179, + "learning_rate": 9.842334615254901e-07, + "loss": 0.8392, + "step": 6861 + }, + { + "epoch": 0.3271279765451815, + "grad_norm": 1.3983296155929565, + "learning_rate": 9.825348398851998e-07, + "loss": 0.5687, + "step": 6862 + }, + { + "epoch": 0.327175648940481, + "grad_norm": 2.9042091369628906, + "learning_rate": 9.808376095469196e-07, + "loss": 0.3973, + "step": 6863 + }, + { + "epoch": 0.3272233213357805, + "grad_norm": 1.4467089176177979, + "learning_rate": 9.791417707725171e-07, + "loss": 0.4728, + "step": 6864 + }, + { + "epoch": 0.32727099373108004, + "grad_norm": 1.39895498752594, + "learning_rate": 9.774473238236449e-07, + "loss": 0.9346, + "step": 6865 + }, + { + "epoch": 0.3273186661263795, + "grad_norm": 2.028965473175049, + "learning_rate": 9.757542689617328e-07, + "loss": 0.8223, + "step": 6866 + }, + { + "epoch": 0.327366338521679, + "grad_norm": 1.8963208198547363, + "learning_rate": 9.740626064480063e-07, + "loss": 1.005, + "step": 6867 + }, + { + "epoch": 0.3274140109169785, + "grad_norm": 1.631163239479065, + "learning_rate": 9.723723365434722e-07, + "loss": 0.7241, + "step": 6868 + }, + { + "epoch": 0.32746168331227804, + "grad_norm": 1.5895754098892212, + "learning_rate": 9.706834595089187e-07, + "loss": 0.713, + "step": 6869 + }, + { + "epoch": 0.32750935570757755, + "grad_norm": 1.2809149026870728, + "learning_rate": 9.68995975604925e-07, + "loss": 0.2593, + "step": 6870 + }, + { + "epoch": 0.327557028102877, + "grad_norm": 1.493955373764038, + "learning_rate": 9.673098850918506e-07, + "loss": 0.9036, + "step": 6871 + }, + { + "epoch": 0.3276047004981765, + "grad_norm": 1.843968391418457, + "learning_rate": 9.656251882298394e-07, + "loss": 1.0281, + "step": 6872 + }, + { + "epoch": 0.32765237289347604, + "grad_norm": 1.4267148971557617, + "learning_rate": 9.639418852788274e-07, + "loss": 0.6061, + "step": 6873 + }, + { + "epoch": 0.32770004528877555, + "grad_norm": 
2.1960649490356445, + "learning_rate": 9.622599764985297e-07, + "loss": 1.0589, + "step": 6874 + }, + { + "epoch": 0.327747717684075, + "grad_norm": 1.7546650171279907, + "learning_rate": 9.605794621484455e-07, + "loss": 0.5425, + "step": 6875 + }, + { + "epoch": 0.3277953900793745, + "grad_norm": 0.9336969256401062, + "learning_rate": 9.589003424878618e-07, + "loss": 0.2844, + "step": 6876 + }, + { + "epoch": 0.32784306247467404, + "grad_norm": 1.0836050510406494, + "learning_rate": 9.572226177758514e-07, + "loss": 0.7409, + "step": 6877 + }, + { + "epoch": 0.32789073486997355, + "grad_norm": 1.8114019632339478, + "learning_rate": 9.555462882712684e-07, + "loss": 0.6375, + "step": 6878 + }, + { + "epoch": 0.32793840726527307, + "grad_norm": 1.5859652757644653, + "learning_rate": 9.538713542327527e-07, + "loss": 0.582, + "step": 6879 + }, + { + "epoch": 0.3279860796605725, + "grad_norm": 1.504881501197815, + "learning_rate": 9.521978159187295e-07, + "loss": 0.7675, + "step": 6880 + }, + { + "epoch": 0.32803375205587204, + "grad_norm": 2.2370870113372803, + "learning_rate": 9.505256735874113e-07, + "loss": 0.8555, + "step": 6881 + }, + { + "epoch": 0.32808142445117155, + "grad_norm": 1.9561861753463745, + "learning_rate": 9.488549274967873e-07, + "loss": 0.6115, + "step": 6882 + }, + { + "epoch": 0.32812909684647107, + "grad_norm": 2.1800622940063477, + "learning_rate": 9.471855779046424e-07, + "loss": 0.5097, + "step": 6883 + }, + { + "epoch": 0.3281767692417705, + "grad_norm": 2.48305344581604, + "learning_rate": 9.455176250685338e-07, + "loss": 1.3924, + "step": 6884 + }, + { + "epoch": 0.32822444163707004, + "grad_norm": 1.8112921714782715, + "learning_rate": 9.438510692458147e-07, + "loss": 0.8998, + "step": 6885 + }, + { + "epoch": 0.32827211403236956, + "grad_norm": 0.9631018042564392, + "learning_rate": 9.421859106936138e-07, + "loss": 0.748, + "step": 6886 + }, + { + "epoch": 0.32831978642766907, + "grad_norm": 2.037118673324585, + "learning_rate": 9.40522149668851e-07, + "loss": 0.8314, + "step": 6887 + }, + { + "epoch": 0.3283674588229686, + "grad_norm": 1.617737889289856, + "learning_rate": 9.388597864282245e-07, + "loss": 0.6754, + "step": 6888 + }, + { + "epoch": 0.32841513121826804, + "grad_norm": 2.9875071048736572, + "learning_rate": 9.371988212282212e-07, + "loss": 1.4998, + "step": 6889 + }, + { + "epoch": 0.32846280361356756, + "grad_norm": 2.425280809402466, + "learning_rate": 9.355392543251119e-07, + "loss": 0.8703, + "step": 6890 + }, + { + "epoch": 0.32851047600886707, + "grad_norm": 1.9492497444152832, + "learning_rate": 9.338810859749492e-07, + "loss": 0.5463, + "step": 6891 + }, + { + "epoch": 0.3285581484041666, + "grad_norm": 5.014599323272705, + "learning_rate": 9.322243164335709e-07, + "loss": 0.3037, + "step": 6892 + }, + { + "epoch": 0.32860582079946604, + "grad_norm": 2.4838085174560547, + "learning_rate": 9.305689459566025e-07, + "loss": 1.0198, + "step": 6893 + }, + { + "epoch": 0.32865349319476556, + "grad_norm": 5.7384934425354, + "learning_rate": 9.289149747994475e-07, + "loss": 0.4571, + "step": 6894 + }, + { + "epoch": 0.32870116559006507, + "grad_norm": 1.1955310106277466, + "learning_rate": 9.272624032172972e-07, + "loss": 0.6792, + "step": 6895 + }, + { + "epoch": 0.3287488379853646, + "grad_norm": 2.9281866550445557, + "learning_rate": 9.2561123146513e-07, + "loss": 1.546, + "step": 6896 + }, + { + "epoch": 0.3287965103806641, + "grad_norm": 6.846766471862793, + "learning_rate": 9.239614597976987e-07, + "loss": 0.6851, + "step": 6897 + }, + { + 
"epoch": 0.32884418277596356, + "grad_norm": 0.7876847386360168, + "learning_rate": 9.223130884695486e-07, + "loss": 0.3575, + "step": 6898 + }, + { + "epoch": 0.32889185517126307, + "grad_norm": 2.3945159912109375, + "learning_rate": 9.206661177350096e-07, + "loss": 1.3135, + "step": 6899 + }, + { + "epoch": 0.3289395275665626, + "grad_norm": 1.7309443950653076, + "learning_rate": 9.190205478481895e-07, + "loss": 0.9645, + "step": 6900 + }, + { + "epoch": 0.3289871999618621, + "grad_norm": 2.170653820037842, + "learning_rate": 9.173763790629808e-07, + "loss": 0.4641, + "step": 6901 + }, + { + "epoch": 0.3290348723571616, + "grad_norm": 1.2487143278121948, + "learning_rate": 9.15733611633065e-07, + "loss": 0.6012, + "step": 6902 + }, + { + "epoch": 0.32908254475246107, + "grad_norm": 3.0707266330718994, + "learning_rate": 9.140922458119028e-07, + "loss": 0.2333, + "step": 6903 + }, + { + "epoch": 0.3291302171477606, + "grad_norm": 1.6412420272827148, + "learning_rate": 9.124522818527393e-07, + "loss": 0.5698, + "step": 6904 + }, + { + "epoch": 0.3291778895430601, + "grad_norm": 1.5062675476074219, + "learning_rate": 9.108137200086076e-07, + "loss": 0.6577, + "step": 6905 + }, + { + "epoch": 0.3292255619383596, + "grad_norm": 2.7576942443847656, + "learning_rate": 9.091765605323155e-07, + "loss": 0.7413, + "step": 6906 + }, + { + "epoch": 0.32927323433365907, + "grad_norm": 1.5102096796035767, + "learning_rate": 9.075408036764633e-07, + "loss": 0.6647, + "step": 6907 + }, + { + "epoch": 0.3293209067289586, + "grad_norm": 1.599633812904358, + "learning_rate": 9.059064496934333e-07, + "loss": 0.6397, + "step": 6908 + }, + { + "epoch": 0.3293685791242581, + "grad_norm": 1.797092080116272, + "learning_rate": 9.042734988353841e-07, + "loss": 0.4712, + "step": 6909 + }, + { + "epoch": 0.3294162515195576, + "grad_norm": 1.429047703742981, + "learning_rate": 9.026419513542673e-07, + "loss": 0.7813, + "step": 6910 + }, + { + "epoch": 0.3294639239148571, + "grad_norm": 2.5892581939697266, + "learning_rate": 9.010118075018137e-07, + "loss": 0.8682, + "step": 6911 + }, + { + "epoch": 0.3295115963101566, + "grad_norm": 2.959779739379883, + "learning_rate": 8.993830675295345e-07, + "loss": 1.1225, + "step": 6912 + }, + { + "epoch": 0.3295592687054561, + "grad_norm": 2.6877310276031494, + "learning_rate": 8.977557316887309e-07, + "loss": 0.5966, + "step": 6913 + }, + { + "epoch": 0.3296069411007556, + "grad_norm": 2.6235692501068115, + "learning_rate": 8.961298002304841e-07, + "loss": 0.5057, + "step": 6914 + }, + { + "epoch": 0.3296546134960551, + "grad_norm": 1.8561090230941772, + "learning_rate": 8.945052734056581e-07, + "loss": 0.8651, + "step": 6915 + }, + { + "epoch": 0.3297022858913546, + "grad_norm": 2.475501537322998, + "learning_rate": 8.928821514648977e-07, + "loss": 0.8263, + "step": 6916 + }, + { + "epoch": 0.3297499582866541, + "grad_norm": 1.637777328491211, + "learning_rate": 8.912604346586362e-07, + "loss": 0.9822, + "step": 6917 + }, + { + "epoch": 0.3297976306819536, + "grad_norm": 1.7201780080795288, + "learning_rate": 8.896401232370889e-07, + "loss": 0.7964, + "step": 6918 + }, + { + "epoch": 0.3298453030772531, + "grad_norm": 2.49106502532959, + "learning_rate": 8.880212174502512e-07, + "loss": 0.8571, + "step": 6919 + }, + { + "epoch": 0.32989297547255264, + "grad_norm": 0.8593341112136841, + "learning_rate": 8.864037175479034e-07, + "loss": 0.4201, + "step": 6920 + }, + { + "epoch": 0.3299406478678521, + "grad_norm": 3.763322353363037, + "learning_rate": 8.847876237796127e-07, + 
"loss": 0.6488, + "step": 6921 + }, + { + "epoch": 0.3299883202631516, + "grad_norm": 1.6638611555099487, + "learning_rate": 8.831729363947216e-07, + "loss": 0.7638, + "step": 6922 + }, + { + "epoch": 0.3300359926584511, + "grad_norm": 0.7484946846961975, + "learning_rate": 8.815596556423611e-07, + "loss": 0.2955, + "step": 6923 + }, + { + "epoch": 0.33008366505375064, + "grad_norm": 1.2401622533798218, + "learning_rate": 8.799477817714452e-07, + "loss": 0.6298, + "step": 6924 + }, + { + "epoch": 0.33013133744905016, + "grad_norm": 1.2771800756454468, + "learning_rate": 8.783373150306663e-07, + "loss": 0.6583, + "step": 6925 + }, + { + "epoch": 0.3301790098443496, + "grad_norm": 1.4808845520019531, + "learning_rate": 8.767282556685053e-07, + "loss": 0.6071, + "step": 6926 + }, + { + "epoch": 0.3302266822396491, + "grad_norm": 1.4974414110183716, + "learning_rate": 8.75120603933225e-07, + "loss": 0.9551, + "step": 6927 + }, + { + "epoch": 0.33027435463494864, + "grad_norm": 1.3905906677246094, + "learning_rate": 8.735143600728646e-07, + "loss": 0.5626, + "step": 6928 + }, + { + "epoch": 0.33032202703024816, + "grad_norm": 1.1375762224197388, + "learning_rate": 8.71909524335256e-07, + "loss": 0.5293, + "step": 6929 + }, + { + "epoch": 0.3303696994255476, + "grad_norm": 2.1217849254608154, + "learning_rate": 8.703060969680055e-07, + "loss": 0.3791, + "step": 6930 + }, + { + "epoch": 0.33041737182084713, + "grad_norm": 1.729552149772644, + "learning_rate": 8.687040782185074e-07, + "loss": 0.6536, + "step": 6931 + }, + { + "epoch": 0.33046504421614664, + "grad_norm": 1.1514723300933838, + "learning_rate": 8.671034683339352e-07, + "loss": 0.757, + "step": 6932 + }, + { + "epoch": 0.33051271661144616, + "grad_norm": 1.419379472732544, + "learning_rate": 8.65504267561248e-07, + "loss": 0.8308, + "step": 6933 + }, + { + "epoch": 0.33056038900674567, + "grad_norm": 1.2579851150512695, + "learning_rate": 8.639064761471838e-07, + "loss": 0.9242, + "step": 6934 + }, + { + "epoch": 0.33060806140204513, + "grad_norm": 1.2018386125564575, + "learning_rate": 8.623100943382667e-07, + "loss": 0.5524, + "step": 6935 + }, + { + "epoch": 0.33065573379734464, + "grad_norm": 1.9304592609405518, + "learning_rate": 8.607151223808041e-07, + "loss": 0.903, + "step": 6936 + }, + { + "epoch": 0.33070340619264416, + "grad_norm": 1.6968860626220703, + "learning_rate": 8.591215605208791e-07, + "loss": 0.7001, + "step": 6937 + }, + { + "epoch": 0.33075107858794367, + "grad_norm": 2.054097890853882, + "learning_rate": 8.575294090043651e-07, + "loss": 0.6849, + "step": 6938 + }, + { + "epoch": 0.33079875098324313, + "grad_norm": 0.950583279132843, + "learning_rate": 8.559386680769166e-07, + "loss": 0.6726, + "step": 6939 + }, + { + "epoch": 0.33084642337854264, + "grad_norm": 1.2018409967422485, + "learning_rate": 8.543493379839629e-07, + "loss": 0.5115, + "step": 6940 + }, + { + "epoch": 0.33089409577384216, + "grad_norm": 0.8816591501235962, + "learning_rate": 8.527614189707245e-07, + "loss": 0.3203, + "step": 6941 + }, + { + "epoch": 0.33094176816914167, + "grad_norm": 1.4427143335342407, + "learning_rate": 8.511749112822032e-07, + "loss": 0.982, + "step": 6942 + }, + { + "epoch": 0.3309894405644412, + "grad_norm": 1.4803297519683838, + "learning_rate": 8.495898151631765e-07, + "loss": 0.8211, + "step": 6943 + }, + { + "epoch": 0.33103711295974064, + "grad_norm": 2.8168013095855713, + "learning_rate": 8.480061308582122e-07, + "loss": 0.4794, + "step": 6944 + }, + { + "epoch": 0.33108478535504016, + "grad_norm": 
1.8412179946899414, + "learning_rate": 8.464238586116524e-07, + "loss": 0.304, + "step": 6945 + }, + { + "epoch": 0.33113245775033967, + "grad_norm": 1.322967767715454, + "learning_rate": 8.448429986676298e-07, + "loss": 0.5554, + "step": 6946 + }, + { + "epoch": 0.3311801301456392, + "grad_norm": 1.396625280380249, + "learning_rate": 8.432635512700505e-07, + "loss": 0.391, + "step": 6947 + }, + { + "epoch": 0.33122780254093864, + "grad_norm": 1.2411015033721924, + "learning_rate": 8.416855166626114e-07, + "loss": 0.8087, + "step": 6948 + }, + { + "epoch": 0.33127547493623816, + "grad_norm": 1.0039184093475342, + "learning_rate": 8.401088950887826e-07, + "loss": 0.2497, + "step": 6949 + }, + { + "epoch": 0.33132314733153767, + "grad_norm": 1.602081537246704, + "learning_rate": 8.385336867918226e-07, + "loss": 0.6764, + "step": 6950 + }, + { + "epoch": 0.3313708197268372, + "grad_norm": 2.843942165374756, + "learning_rate": 8.369598920147715e-07, + "loss": 0.4815, + "step": 6951 + }, + { + "epoch": 0.3314184921221367, + "grad_norm": 1.881803035736084, + "learning_rate": 8.353875110004462e-07, + "loss": 0.7745, + "step": 6952 + }, + { + "epoch": 0.33146616451743616, + "grad_norm": 2.328432321548462, + "learning_rate": 8.338165439914514e-07, + "loss": 1.0419, + "step": 6953 + }, + { + "epoch": 0.33151383691273567, + "grad_norm": 1.8506278991699219, + "learning_rate": 8.3224699123017e-07, + "loss": 0.6811, + "step": 6954 + }, + { + "epoch": 0.3315615093080352, + "grad_norm": 1.2373707294464111, + "learning_rate": 8.306788529587695e-07, + "loss": 0.7214, + "step": 6955 + }, + { + "epoch": 0.3316091817033347, + "grad_norm": 1.1605180501937866, + "learning_rate": 8.291121294191951e-07, + "loss": 0.6749, + "step": 6956 + }, + { + "epoch": 0.3316568540986342, + "grad_norm": 1.6606429815292358, + "learning_rate": 8.275468208531767e-07, + "loss": 0.8797, + "step": 6957 + }, + { + "epoch": 0.33170452649393367, + "grad_norm": 1.0410281419754028, + "learning_rate": 8.25982927502228e-07, + "loss": 0.5226, + "step": 6958 + }, + { + "epoch": 0.3317521988892332, + "grad_norm": 3.8279051780700684, + "learning_rate": 8.244204496076402e-07, + "loss": 0.5769, + "step": 6959 + }, + { + "epoch": 0.3317998712845327, + "grad_norm": 2.1523706912994385, + "learning_rate": 8.22859387410484e-07, + "loss": 1.017, + "step": 6960 + }, + { + "epoch": 0.3318475436798322, + "grad_norm": 2.217524766921997, + "learning_rate": 8.212997411516199e-07, + "loss": 1.0985, + "step": 6961 + }, + { + "epoch": 0.33189521607513167, + "grad_norm": 1.2286893129348755, + "learning_rate": 8.197415110716822e-07, + "loss": 0.7791, + "step": 6962 + }, + { + "epoch": 0.3319428884704312, + "grad_norm": 1.9869701862335205, + "learning_rate": 8.181846974110907e-07, + "loss": 1.0268, + "step": 6963 + }, + { + "epoch": 0.3319905608657307, + "grad_norm": 3.1760213375091553, + "learning_rate": 8.166293004100478e-07, + "loss": 1.088, + "step": 6964 + }, + { + "epoch": 0.3320382332610302, + "grad_norm": 1.2840080261230469, + "learning_rate": 8.150753203085315e-07, + "loss": 0.7817, + "step": 6965 + }, + { + "epoch": 0.3320859056563297, + "grad_norm": 1.7720043659210205, + "learning_rate": 8.135227573463067e-07, + "loss": 0.7465, + "step": 6966 + }, + { + "epoch": 0.3321335780516292, + "grad_norm": 1.7507137060165405, + "learning_rate": 8.119716117629206e-07, + "loss": 0.6792, + "step": 6967 + }, + { + "epoch": 0.3321812504469287, + "grad_norm": 2.602229118347168, + "learning_rate": 8.10421883797694e-07, + "loss": 1.1148, + "step": 6968 + }, + { + 
"epoch": 0.3322289228422282, + "grad_norm": 2.1618120670318604, + "learning_rate": 8.088735736897369e-07, + "loss": 0.4714, + "step": 6969 + }, + { + "epoch": 0.3322765952375277, + "grad_norm": 1.3942264318466187, + "learning_rate": 8.07326681677938e-07, + "loss": 0.758, + "step": 6970 + }, + { + "epoch": 0.3323242676328272, + "grad_norm": 1.5200979709625244, + "learning_rate": 8.057812080009641e-07, + "loss": 0.6898, + "step": 6971 + }, + { + "epoch": 0.3323719400281267, + "grad_norm": 1.68550443649292, + "learning_rate": 8.042371528972681e-07, + "loss": 0.6427, + "step": 6972 + }, + { + "epoch": 0.3324196124234262, + "grad_norm": 1.460419774055481, + "learning_rate": 8.026945166050837e-07, + "loss": 0.5732, + "step": 6973 + }, + { + "epoch": 0.33246728481872573, + "grad_norm": 1.9466084241867065, + "learning_rate": 8.011532993624194e-07, + "loss": 0.8939, + "step": 6974 + }, + { + "epoch": 0.33251495721402524, + "grad_norm": 1.8210314512252808, + "learning_rate": 7.996135014070727e-07, + "loss": 1.0337, + "step": 6975 + }, + { + "epoch": 0.3325626296093247, + "grad_norm": 3.750234842300415, + "learning_rate": 7.98075122976617e-07, + "loss": 0.8677, + "step": 6976 + }, + { + "epoch": 0.3326103020046242, + "grad_norm": 2.1335513591766357, + "learning_rate": 7.965381643084069e-07, + "loss": 0.9474, + "step": 6977 + }, + { + "epoch": 0.33265797439992373, + "grad_norm": 2.9676480293273926, + "learning_rate": 7.950026256395804e-07, + "loss": 0.8712, + "step": 6978 + }, + { + "epoch": 0.33270564679522324, + "grad_norm": 2.2473292350769043, + "learning_rate": 7.934685072070569e-07, + "loss": 0.9024, + "step": 6979 + }, + { + "epoch": 0.3327533191905227, + "grad_norm": 1.7809481620788574, + "learning_rate": 7.919358092475326e-07, + "loss": 0.5509, + "step": 6980 + }, + { + "epoch": 0.3328009915858222, + "grad_norm": 3.3433854579925537, + "learning_rate": 7.904045319974885e-07, + "loss": 0.8467, + "step": 6981 + }, + { + "epoch": 0.33284866398112173, + "grad_norm": 1.1155240535736084, + "learning_rate": 7.888746756931865e-07, + "loss": 0.8283, + "step": 6982 + }, + { + "epoch": 0.33289633637642124, + "grad_norm": 1.5901422500610352, + "learning_rate": 7.873462405706633e-07, + "loss": 0.9563, + "step": 6983 + }, + { + "epoch": 0.33294400877172076, + "grad_norm": 1.781674861907959, + "learning_rate": 7.858192268657438e-07, + "loss": 0.9848, + "step": 6984 + }, + { + "epoch": 0.3329916811670202, + "grad_norm": 1.6295862197875977, + "learning_rate": 7.842936348140317e-07, + "loss": 0.9112, + "step": 6985 + }, + { + "epoch": 0.33303935356231973, + "grad_norm": 1.6032387018203735, + "learning_rate": 7.827694646509065e-07, + "loss": 0.7037, + "step": 6986 + }, + { + "epoch": 0.33308702595761924, + "grad_norm": 2.0810365676879883, + "learning_rate": 7.812467166115334e-07, + "loss": 0.514, + "step": 6987 + }, + { + "epoch": 0.33313469835291876, + "grad_norm": 1.4824879169464111, + "learning_rate": 7.797253909308588e-07, + "loss": 0.7197, + "step": 6988 + }, + { + "epoch": 0.33318237074821827, + "grad_norm": 4.814878463745117, + "learning_rate": 7.782054878436051e-07, + "loss": 0.3217, + "step": 6989 + }, + { + "epoch": 0.33323004314351773, + "grad_norm": 3.686336040496826, + "learning_rate": 7.766870075842792e-07, + "loss": 0.7873, + "step": 6990 + }, + { + "epoch": 0.33327771553881724, + "grad_norm": 2.2819459438323975, + "learning_rate": 7.751699503871646e-07, + "loss": 1.2062, + "step": 6991 + }, + { + "epoch": 0.33332538793411676, + "grad_norm": 1.5199083089828491, + "learning_rate": 
7.736543164863319e-07, + "loss": 0.7416, + "step": 6992 + }, + { + "epoch": 0.33337306032941627, + "grad_norm": 3.20569109916687, + "learning_rate": 7.721401061156231e-07, + "loss": 0.4969, + "step": 6993 + }, + { + "epoch": 0.33342073272471573, + "grad_norm": 3.330939531326294, + "learning_rate": 7.706273195086667e-07, + "loss": 1.5155, + "step": 6994 + }, + { + "epoch": 0.33346840512001524, + "grad_norm": 13.722299575805664, + "learning_rate": 7.691159568988727e-07, + "loss": 0.0921, + "step": 6995 + }, + { + "epoch": 0.33351607751531476, + "grad_norm": 1.3149386644363403, + "learning_rate": 7.676060185194256e-07, + "loss": 0.7787, + "step": 6996 + }, + { + "epoch": 0.33356374991061427, + "grad_norm": 1.5102317333221436, + "learning_rate": 7.660975046032948e-07, + "loss": 0.5024, + "step": 6997 + }, + { + "epoch": 0.3336114223059138, + "grad_norm": 1.3673630952835083, + "learning_rate": 7.645904153832295e-07, + "loss": 0.7202, + "step": 6998 + }, + { + "epoch": 0.33365909470121324, + "grad_norm": 2.5012240409851074, + "learning_rate": 7.63084751091755e-07, + "loss": 0.0183, + "step": 6999 + }, + { + "epoch": 0.33370676709651276, + "grad_norm": 1.5828908681869507, + "learning_rate": 7.615805119611818e-07, + "loss": 0.7323, + "step": 7000 + }, + { + "epoch": 0.33375443949181227, + "grad_norm": 1.606491208076477, + "learning_rate": 7.600776982235992e-07, + "loss": 1.0206, + "step": 7001 + }, + { + "epoch": 0.3338021118871118, + "grad_norm": 1.3894623517990112, + "learning_rate": 7.585763101108746e-07, + "loss": 0.8505, + "step": 7002 + }, + { + "epoch": 0.33384978428241124, + "grad_norm": 2.2927088737487793, + "learning_rate": 7.570763478546572e-07, + "loss": 0.5769, + "step": 7003 + }, + { + "epoch": 0.33389745667771076, + "grad_norm": 2.522141695022583, + "learning_rate": 7.555778116863755e-07, + "loss": 0.9323, + "step": 7004 + }, + { + "epoch": 0.33394512907301027, + "grad_norm": 2.126577615737915, + "learning_rate": 7.540807018372387e-07, + "loss": 1.2636, + "step": 7005 + }, + { + "epoch": 0.3339928014683098, + "grad_norm": 2.773179292678833, + "learning_rate": 7.525850185382344e-07, + "loss": 1.0102, + "step": 7006 + }, + { + "epoch": 0.3340404738636093, + "grad_norm": 2.7864255905151367, + "learning_rate": 7.510907620201335e-07, + "loss": 1.0874, + "step": 7007 + }, + { + "epoch": 0.33408814625890876, + "grad_norm": 1.3754595518112183, + "learning_rate": 7.495979325134806e-07, + "loss": 0.8089, + "step": 7008 + }, + { + "epoch": 0.33413581865420827, + "grad_norm": 3.7733309268951416, + "learning_rate": 7.481065302486057e-07, + "loss": 0.8764, + "step": 7009 + }, + { + "epoch": 0.3341834910495078, + "grad_norm": 1.4038552045822144, + "learning_rate": 7.466165554556193e-07, + "loss": 0.9035, + "step": 7010 + }, + { + "epoch": 0.3342311634448073, + "grad_norm": 1.0519099235534668, + "learning_rate": 7.451280083644052e-07, + "loss": 0.2934, + "step": 7011 + }, + { + "epoch": 0.3342788358401068, + "grad_norm": 1.473276138305664, + "learning_rate": 7.436408892046321e-07, + "loss": 0.5284, + "step": 7012 + }, + { + "epoch": 0.3343265082354063, + "grad_norm": 2.0723464488983154, + "learning_rate": 7.421551982057496e-07, + "loss": 1.2248, + "step": 7013 + }, + { + "epoch": 0.3343741806307058, + "grad_norm": 1.693808913230896, + "learning_rate": 7.406709355969821e-07, + "loss": 0.581, + "step": 7014 + }, + { + "epoch": 0.3344218530260053, + "grad_norm": 2.9343862533569336, + "learning_rate": 7.391881016073354e-07, + "loss": 0.5937, + "step": 7015 + }, + { + "epoch": 0.3344695254213048, + 
"grad_norm": 1.7389940023422241, + "learning_rate": 7.377066964655987e-07, + "loss": 0.7204, + "step": 7016 + }, + { + "epoch": 0.3345171978166043, + "grad_norm": 1.8688080310821533, + "learning_rate": 7.362267204003337e-07, + "loss": 0.7314, + "step": 7017 + }, + { + "epoch": 0.3345648702119038, + "grad_norm": 1.577251672744751, + "learning_rate": 7.347481736398876e-07, + "loss": 0.9979, + "step": 7018 + }, + { + "epoch": 0.3346125426072033, + "grad_norm": 2.4060111045837402, + "learning_rate": 7.332710564123869e-07, + "loss": 0.2635, + "step": 7019 + }, + { + "epoch": 0.3346602150025028, + "grad_norm": 0.9208526611328125, + "learning_rate": 7.317953689457325e-07, + "loss": 0.5298, + "step": 7020 + }, + { + "epoch": 0.33470788739780233, + "grad_norm": 1.8221689462661743, + "learning_rate": 7.303211114676067e-07, + "loss": 0.8214, + "step": 7021 + }, + { + "epoch": 0.3347555597931018, + "grad_norm": 1.2249267101287842, + "learning_rate": 7.288482842054767e-07, + "loss": 1.0256, + "step": 7022 + }, + { + "epoch": 0.3348032321884013, + "grad_norm": 2.083839178085327, + "learning_rate": 7.273768873865794e-07, + "loss": 0.9193, + "step": 7023 + }, + { + "epoch": 0.3348509045837008, + "grad_norm": 1.6141384840011597, + "learning_rate": 7.259069212379399e-07, + "loss": 0.6222, + "step": 7024 + }, + { + "epoch": 0.33489857697900033, + "grad_norm": 1.3766950368881226, + "learning_rate": 7.244383859863591e-07, + "loss": 0.5655, + "step": 7025 + }, + { + "epoch": 0.3349462493742998, + "grad_norm": 1.5624011754989624, + "learning_rate": 7.229712818584134e-07, + "loss": 0.9671, + "step": 7026 + }, + { + "epoch": 0.3349939217695993, + "grad_norm": 25.637557983398438, + "learning_rate": 7.215056090804651e-07, + "loss": 0.8141, + "step": 7027 + }, + { + "epoch": 0.3350415941648988, + "grad_norm": 2.3002846240997314, + "learning_rate": 7.200413678786522e-07, + "loss": 0.9535, + "step": 7028 + }, + { + "epoch": 0.33508926656019833, + "grad_norm": 1.5308042764663696, + "learning_rate": 7.185785584788896e-07, + "loss": 0.2675, + "step": 7029 + }, + { + "epoch": 0.33513693895549784, + "grad_norm": 2.349839210510254, + "learning_rate": 7.171171811068744e-07, + "loss": 0.7634, + "step": 7030 + }, + { + "epoch": 0.3351846113507973, + "grad_norm": 1.5442345142364502, + "learning_rate": 7.156572359880842e-07, + "loss": 0.7412, + "step": 7031 + }, + { + "epoch": 0.3352322837460968, + "grad_norm": 1.4362175464630127, + "learning_rate": 7.141987233477732e-07, + "loss": 0.9757, + "step": 7032 + }, + { + "epoch": 0.33527995614139633, + "grad_norm": 2.019601583480835, + "learning_rate": 7.127416434109724e-07, + "loss": 0.862, + "step": 7033 + }, + { + "epoch": 0.33532762853669584, + "grad_norm": 1.7395135164260864, + "learning_rate": 7.112859964024977e-07, + "loss": 0.7656, + "step": 7034 + }, + { + "epoch": 0.3353753009319953, + "grad_norm": 0.9082579016685486, + "learning_rate": 7.098317825469381e-07, + "loss": 0.3733, + "step": 7035 + }, + { + "epoch": 0.3354229733272948, + "grad_norm": 1.9751553535461426, + "learning_rate": 7.083790020686632e-07, + "loss": 0.7905, + "step": 7036 + }, + { + "epoch": 0.33547064572259433, + "grad_norm": 1.4965424537658691, + "learning_rate": 7.069276551918225e-07, + "loss": 0.6882, + "step": 7037 + }, + { + "epoch": 0.33551831811789384, + "grad_norm": 2.9734511375427246, + "learning_rate": 7.054777421403469e-07, + "loss": 0.7582, + "step": 7038 + }, + { + "epoch": 0.33556599051319336, + "grad_norm": 1.3904688358306885, + "learning_rate": 7.040292631379386e-07, + "loss": 0.3592, + 
"step": 7039 + }, + { + "epoch": 0.3356136629084928, + "grad_norm": 5.98747444152832, + "learning_rate": 7.025822184080844e-07, + "loss": 2.0757, + "step": 7040 + }, + { + "epoch": 0.33566133530379233, + "grad_norm": 1.2837018966674805, + "learning_rate": 7.011366081740512e-07, + "loss": 0.7061, + "step": 7041 + }, + { + "epoch": 0.33570900769909184, + "grad_norm": 1.4962846040725708, + "learning_rate": 6.996924326588772e-07, + "loss": 0.5799, + "step": 7042 + }, + { + "epoch": 0.33575668009439136, + "grad_norm": 4.407690525054932, + "learning_rate": 6.982496920853876e-07, + "loss": 0.5775, + "step": 7043 + }, + { + "epoch": 0.33580435248969087, + "grad_norm": 2.727773427963257, + "learning_rate": 6.968083866761821e-07, + "loss": 1.1992, + "step": 7044 + }, + { + "epoch": 0.33585202488499033, + "grad_norm": 1.1141144037246704, + "learning_rate": 6.953685166536361e-07, + "loss": 0.5329, + "step": 7045 + }, + { + "epoch": 0.33589969728028984, + "grad_norm": 1.6083498001098633, + "learning_rate": 6.939300822399086e-07, + "loss": 0.569, + "step": 7046 + }, + { + "epoch": 0.33594736967558936, + "grad_norm": 3.362513780593872, + "learning_rate": 6.924930836569377e-07, + "loss": 0.8628, + "step": 7047 + }, + { + "epoch": 0.33599504207088887, + "grad_norm": 2.602198362350464, + "learning_rate": 6.910575211264336e-07, + "loss": 0.5938, + "step": 7048 + }, + { + "epoch": 0.33604271446618833, + "grad_norm": 1.5819512605667114, + "learning_rate": 6.896233948698916e-07, + "loss": 0.7165, + "step": 7049 + }, + { + "epoch": 0.33609038686148784, + "grad_norm": 2.2062723636627197, + "learning_rate": 6.881907051085801e-07, + "loss": 0.8791, + "step": 7050 + }, + { + "epoch": 0.33613805925678736, + "grad_norm": 1.5825045108795166, + "learning_rate": 6.867594520635512e-07, + "loss": 0.6868, + "step": 7051 + }, + { + "epoch": 0.33618573165208687, + "grad_norm": 1.9158004522323608, + "learning_rate": 6.853296359556294e-07, + "loss": 0.7825, + "step": 7052 + }, + { + "epoch": 0.3362334040473864, + "grad_norm": 1.7363232374191284, + "learning_rate": 6.839012570054249e-07, + "loss": 0.8877, + "step": 7053 + }, + { + "epoch": 0.33628107644268584, + "grad_norm": 1.525704026222229, + "learning_rate": 6.824743154333157e-07, + "loss": 0.842, + "step": 7054 + }, + { + "epoch": 0.33632874883798536, + "grad_norm": 1.6364781856536865, + "learning_rate": 6.810488114594694e-07, + "loss": 0.5471, + "step": 7055 + }, + { + "epoch": 0.3363764212332849, + "grad_norm": 2.1185033321380615, + "learning_rate": 6.796247453038252e-07, + "loss": 0.848, + "step": 7056 + }, + { + "epoch": 0.3364240936285844, + "grad_norm": 1.9405720233917236, + "learning_rate": 6.782021171861008e-07, + "loss": 0.7518, + "step": 7057 + }, + { + "epoch": 0.33647176602388384, + "grad_norm": 1.6473499536514282, + "learning_rate": 6.76780927325793e-07, + "loss": 0.7728, + "step": 7058 + }, + { + "epoch": 0.33651943841918336, + "grad_norm": 1.209946870803833, + "learning_rate": 6.753611759421796e-07, + "loss": 0.7729, + "step": 7059 + }, + { + "epoch": 0.3365671108144829, + "grad_norm": 1.3223323822021484, + "learning_rate": 6.739428632543099e-07, + "loss": 0.7023, + "step": 7060 + }, + { + "epoch": 0.3366147832097824, + "grad_norm": 1.9575542211532593, + "learning_rate": 6.725259894810165e-07, + "loss": 0.9558, + "step": 7061 + }, + { + "epoch": 0.3366624556050819, + "grad_norm": 1.2555452585220337, + "learning_rate": 6.711105548409103e-07, + "loss": 0.6409, + "step": 7062 + }, + { + "epoch": 0.33671012800038136, + "grad_norm": 1.4947121143341064, + 
"learning_rate": 6.696965595523741e-07, + "loss": 0.6378, + "step": 7063 + }, + { + "epoch": 0.3367578003956809, + "grad_norm": 3.0194287300109863, + "learning_rate": 6.682840038335781e-07, + "loss": 0.4027, + "step": 7064 + }, + { + "epoch": 0.3368054727909804, + "grad_norm": 2.0665180683135986, + "learning_rate": 6.6687288790246e-07, + "loss": 0.6601, + "step": 7065 + }, + { + "epoch": 0.3368531451862799, + "grad_norm": 3.0973949432373047, + "learning_rate": 6.654632119767446e-07, + "loss": 0.5614, + "step": 7066 + }, + { + "epoch": 0.33690081758157936, + "grad_norm": 1.4588905572891235, + "learning_rate": 6.640549762739257e-07, + "loss": 0.6588, + "step": 7067 + }, + { + "epoch": 0.3369484899768789, + "grad_norm": 1.5161415338516235, + "learning_rate": 6.62648181011284e-07, + "loss": 0.6512, + "step": 7068 + }, + { + "epoch": 0.3369961623721784, + "grad_norm": 1.6196343898773193, + "learning_rate": 6.612428264058723e-07, + "loss": 0.5214, + "step": 7069 + }, + { + "epoch": 0.3370438347674779, + "grad_norm": 1.4347542524337769, + "learning_rate": 6.598389126745209e-07, + "loss": 0.7676, + "step": 7070 + }, + { + "epoch": 0.3370915071627774, + "grad_norm": 2.336092233657837, + "learning_rate": 6.584364400338395e-07, + "loss": 0.6583, + "step": 7071 + }, + { + "epoch": 0.3371391795580769, + "grad_norm": 2.322232723236084, + "learning_rate": 6.570354087002173e-07, + "loss": 0.6857, + "step": 7072 + }, + { + "epoch": 0.3371868519533764, + "grad_norm": 1.3281875848770142, + "learning_rate": 6.55635818889817e-07, + "loss": 0.6177, + "step": 7073 + }, + { + "epoch": 0.3372345243486759, + "grad_norm": 1.6594889163970947, + "learning_rate": 6.542376708185816e-07, + "loss": 0.6738, + "step": 7074 + }, + { + "epoch": 0.3372821967439754, + "grad_norm": 2.193472385406494, + "learning_rate": 6.528409647022316e-07, + "loss": 1.0536, + "step": 7075 + }, + { + "epoch": 0.33732986913927493, + "grad_norm": 1.5280077457427979, + "learning_rate": 6.514457007562625e-07, + "loss": 0.748, + "step": 7076 + }, + { + "epoch": 0.3373775415345744, + "grad_norm": 1.2045466899871826, + "learning_rate": 6.500518791959498e-07, + "loss": 0.6522, + "step": 7077 + }, + { + "epoch": 0.3374252139298739, + "grad_norm": 2.187865734100342, + "learning_rate": 6.486595002363494e-07, + "loss": 1.0395, + "step": 7078 + }, + { + "epoch": 0.3374728863251734, + "grad_norm": 1.5545551776885986, + "learning_rate": 6.47268564092286e-07, + "loss": 0.5877, + "step": 7079 + }, + { + "epoch": 0.33752055872047293, + "grad_norm": 4.3865532875061035, + "learning_rate": 6.45879070978368e-07, + "loss": 0.9466, + "step": 7080 + }, + { + "epoch": 0.3375682311157724, + "grad_norm": 1.7052801847457886, + "learning_rate": 6.444910211089827e-07, + "loss": 0.477, + "step": 7081 + }, + { + "epoch": 0.3376159035110719, + "grad_norm": 1.2200794219970703, + "learning_rate": 6.431044146982868e-07, + "loss": 0.5847, + "step": 7082 + }, + { + "epoch": 0.3376635759063714, + "grad_norm": 1.0847972631454468, + "learning_rate": 6.417192519602233e-07, + "loss": 0.477, + "step": 7083 + }, + { + "epoch": 0.33771124830167093, + "grad_norm": 3.3016562461853027, + "learning_rate": 6.403355331085092e-07, + "loss": 0.6994, + "step": 7084 + }, + { + "epoch": 0.33775892069697044, + "grad_norm": 2.6363039016723633, + "learning_rate": 6.389532583566338e-07, + "loss": 1.2867, + "step": 7085 + }, + { + "epoch": 0.3378065930922699, + "grad_norm": 2.580029010772705, + "learning_rate": 6.375724279178719e-07, + "loss": 0.8993, + "step": 7086 + }, + { + "epoch": 0.3378542654875694, 
+ "grad_norm": 4.548689365386963, + "learning_rate": 6.361930420052709e-07, + "loss": 1.0246, + "step": 7087 + }, + { + "epoch": 0.33790193788286893, + "grad_norm": 1.1172839403152466, + "learning_rate": 6.348151008316539e-07, + "loss": 0.899, + "step": 7088 + }, + { + "epoch": 0.33794961027816844, + "grad_norm": 1.4501323699951172, + "learning_rate": 6.334386046096231e-07, + "loss": 0.4915, + "step": 7089 + }, + { + "epoch": 0.3379972826734679, + "grad_norm": 1.4368114471435547, + "learning_rate": 6.320635535515607e-07, + "loss": 0.811, + "step": 7090 + }, + { + "epoch": 0.3380449550687674, + "grad_norm": 1.2407218217849731, + "learning_rate": 6.306899478696193e-07, + "loss": 0.4421, + "step": 7091 + }, + { + "epoch": 0.33809262746406693, + "grad_norm": 2.2550017833709717, + "learning_rate": 6.293177877757339e-07, + "loss": 0.7693, + "step": 7092 + }, + { + "epoch": 0.33814029985936644, + "grad_norm": 2.1144356727600098, + "learning_rate": 6.279470734816162e-07, + "loss": 0.8044, + "step": 7093 + }, + { + "epoch": 0.33818797225466596, + "grad_norm": 1.921338677406311, + "learning_rate": 6.265778051987492e-07, + "loss": 0.7113, + "step": 7094 + }, + { + "epoch": 0.3382356446499654, + "grad_norm": 1.4199682474136353, + "learning_rate": 6.252099831384018e-07, + "loss": 0.6867, + "step": 7095 + }, + { + "epoch": 0.33828331704526493, + "grad_norm": 2.187979221343994, + "learning_rate": 6.238436075116117e-07, + "loss": 0.2012, + "step": 7096 + }, + { + "epoch": 0.33833098944056444, + "grad_norm": 2.5692050457000732, + "learning_rate": 6.22478678529197e-07, + "loss": 1.2634, + "step": 7097 + }, + { + "epoch": 0.33837866183586396, + "grad_norm": 1.5426928997039795, + "learning_rate": 6.211151964017503e-07, + "loss": 0.7481, + "step": 7098 + }, + { + "epoch": 0.3384263342311634, + "grad_norm": 1.6039663553237915, + "learning_rate": 6.197531613396479e-07, + "loss": 0.6206, + "step": 7099 + }, + { + "epoch": 0.33847400662646293, + "grad_norm": 3.3389742374420166, + "learning_rate": 6.183925735530327e-07, + "loss": 0.7603, + "step": 7100 + }, + { + "epoch": 0.33852167902176244, + "grad_norm": 2.830254077911377, + "learning_rate": 6.170334332518325e-07, + "loss": 0.8227, + "step": 7101 + }, + { + "epoch": 0.33856935141706196, + "grad_norm": 1.7223087549209595, + "learning_rate": 6.156757406457481e-07, + "loss": 1.0832, + "step": 7102 + }, + { + "epoch": 0.3386170238123615, + "grad_norm": 3.009058713912964, + "learning_rate": 6.143194959442566e-07, + "loss": 1.1031, + "step": 7103 + }, + { + "epoch": 0.33866469620766093, + "grad_norm": 6.602575778961182, + "learning_rate": 6.129646993566118e-07, + "loss": 0.2292, + "step": 7104 + }, + { + "epoch": 0.33871236860296045, + "grad_norm": 3.132328510284424, + "learning_rate": 6.116113510918476e-07, + "loss": 0.3777, + "step": 7105 + }, + { + "epoch": 0.33876004099825996, + "grad_norm": 1.1135035753250122, + "learning_rate": 6.102594513587701e-07, + "loss": 0.598, + "step": 7106 + }, + { + "epoch": 0.3388077133935595, + "grad_norm": 1.2780905961990356, + "learning_rate": 6.089090003659637e-07, + "loss": 0.4626, + "step": 7107 + }, + { + "epoch": 0.338855385788859, + "grad_norm": 1.6975395679473877, + "learning_rate": 6.075599983217895e-07, + "loss": 0.4814, + "step": 7108 + }, + { + "epoch": 0.33890305818415845, + "grad_norm": 1.5749510526657104, + "learning_rate": 6.062124454343832e-07, + "loss": 0.631, + "step": 7109 + }, + { + "epoch": 0.33895073057945796, + "grad_norm": 6.589550971984863, + "learning_rate": 6.048663419116607e-07, + "loss": 0.3112, + 
"step": 7110 + }, + { + "epoch": 0.3389984029747575, + "grad_norm": 2.9055511951446533, + "learning_rate": 6.035216879613082e-07, + "loss": 0.5406, + "step": 7111 + }, + { + "epoch": 0.339046075370057, + "grad_norm": 1.5288498401641846, + "learning_rate": 6.021784837907962e-07, + "loss": 0.3707, + "step": 7112 + }, + { + "epoch": 0.33909374776535645, + "grad_norm": 1.8829283714294434, + "learning_rate": 6.008367296073636e-07, + "loss": 0.9509, + "step": 7113 + }, + { + "epoch": 0.33914142016065596, + "grad_norm": 3.490995407104492, + "learning_rate": 5.994964256180313e-07, + "loss": 0.4782, + "step": 7114 + }, + { + "epoch": 0.3391890925559555, + "grad_norm": 1.3826745748519897, + "learning_rate": 5.981575720295963e-07, + "loss": 0.9863, + "step": 7115 + }, + { + "epoch": 0.339236764951255, + "grad_norm": 1.6758770942687988, + "learning_rate": 5.968201690486252e-07, + "loss": 0.5993, + "step": 7116 + }, + { + "epoch": 0.3392844373465545, + "grad_norm": 1.2160645723342896, + "learning_rate": 5.954842168814679e-07, + "loss": 0.7267, + "step": 7117 + }, + { + "epoch": 0.33933210974185396, + "grad_norm": 1.6106619834899902, + "learning_rate": 5.941497157342502e-07, + "loss": 0.7206, + "step": 7118 + }, + { + "epoch": 0.3393797821371535, + "grad_norm": 2.9641261100769043, + "learning_rate": 5.928166658128687e-07, + "loss": 0.7746, + "step": 7119 + }, + { + "epoch": 0.339427454532453, + "grad_norm": 1.2934762239456177, + "learning_rate": 5.914850673229988e-07, + "loss": 0.7819, + "step": 7120 + }, + { + "epoch": 0.3394751269277525, + "grad_norm": 1.3298192024230957, + "learning_rate": 5.901549204700974e-07, + "loss": 0.7705, + "step": 7121 + }, + { + "epoch": 0.33952279932305196, + "grad_norm": 3.0106053352355957, + "learning_rate": 5.888262254593869e-07, + "loss": 1.1803, + "step": 7122 + }, + { + "epoch": 0.3395704717183515, + "grad_norm": 1.988847017288208, + "learning_rate": 5.874989824958744e-07, + "loss": 0.7232, + "step": 7123 + }, + { + "epoch": 0.339618144113651, + "grad_norm": 1.336850643157959, + "learning_rate": 5.861731917843383e-07, + "loss": 0.4295, + "step": 7124 + }, + { + "epoch": 0.3396658165089505, + "grad_norm": 2.070082187652588, + "learning_rate": 5.848488535293362e-07, + "loss": 0.5574, + "step": 7125 + }, + { + "epoch": 0.33971348890425, + "grad_norm": 5.496239185333252, + "learning_rate": 5.835259679351968e-07, + "loss": 0.5636, + "step": 7126 + }, + { + "epoch": 0.3397611612995495, + "grad_norm": 3.804555654525757, + "learning_rate": 5.822045352060313e-07, + "loss": 0.7409, + "step": 7127 + }, + { + "epoch": 0.339808833694849, + "grad_norm": 2.286628484725952, + "learning_rate": 5.808845555457198e-07, + "loss": 0.9169, + "step": 7128 + }, + { + "epoch": 0.3398565060901485, + "grad_norm": 1.5711488723754883, + "learning_rate": 5.795660291579241e-07, + "loss": 0.7299, + "step": 7129 + }, + { + "epoch": 0.339904178485448, + "grad_norm": 1.5138362646102905, + "learning_rate": 5.782489562460791e-07, + "loss": 0.3377, + "step": 7130 + }, + { + "epoch": 0.33995185088074753, + "grad_norm": 1.4402294158935547, + "learning_rate": 5.769333370133933e-07, + "loss": 1.0856, + "step": 7131 + }, + { + "epoch": 0.339999523276047, + "grad_norm": 1.6308043003082275, + "learning_rate": 5.756191716628556e-07, + "loss": 0.6157, + "step": 7132 + }, + { + "epoch": 0.3400471956713465, + "grad_norm": 2.3055005073547363, + "learning_rate": 5.743064603972282e-07, + "loss": 0.3677, + "step": 7133 + }, + { + "epoch": 0.340094868066646, + "grad_norm": 1.1508523225784302, + "learning_rate": 
5.729952034190467e-07, + "loss": 0.316, + "step": 7134 + }, + { + "epoch": 0.34014254046194553, + "grad_norm": 1.9965412616729736, + "learning_rate": 5.71685400930626e-07, + "loss": 0.7114, + "step": 7135 + }, + { + "epoch": 0.340190212857245, + "grad_norm": 2.4253644943237305, + "learning_rate": 5.703770531340569e-07, + "loss": 0.9511, + "step": 7136 + }, + { + "epoch": 0.3402378852525445, + "grad_norm": 4.56763219833374, + "learning_rate": 5.69070160231201e-07, + "loss": 1.3591, + "step": 7137 + }, + { + "epoch": 0.340285557647844, + "grad_norm": 3.3468735218048096, + "learning_rate": 5.677647224236982e-07, + "loss": 0.6144, + "step": 7138 + }, + { + "epoch": 0.34033323004314353, + "grad_norm": 1.9734158515930176, + "learning_rate": 5.664607399129684e-07, + "loss": 0.2928, + "step": 7139 + }, + { + "epoch": 0.34038090243844304, + "grad_norm": 1.6915615797042847, + "learning_rate": 5.651582129001987e-07, + "loss": 0.5172, + "step": 7140 + }, + { + "epoch": 0.3404285748337425, + "grad_norm": 5.406301021575928, + "learning_rate": 5.638571415863559e-07, + "loss": 2.348, + "step": 7141 + }, + { + "epoch": 0.340476247229042, + "grad_norm": 0.9483609795570374, + "learning_rate": 5.625575261721838e-07, + "loss": 0.5575, + "step": 7142 + }, + { + "epoch": 0.34052391962434153, + "grad_norm": 1.6920337677001953, + "learning_rate": 5.612593668581978e-07, + "loss": 0.6074, + "step": 7143 + }, + { + "epoch": 0.34057159201964105, + "grad_norm": 1.9882586002349854, + "learning_rate": 5.599626638446898e-07, + "loss": 0.7159, + "step": 7144 + }, + { + "epoch": 0.3406192644149405, + "grad_norm": 2.160698175430298, + "learning_rate": 5.586674173317308e-07, + "loss": 0.947, + "step": 7145 + }, + { + "epoch": 0.34066693681024, + "grad_norm": 1.9571664333343506, + "learning_rate": 5.573736275191622e-07, + "loss": 0.6159, + "step": 7146 + }, + { + "epoch": 0.34071460920553953, + "grad_norm": 1.8392515182495117, + "learning_rate": 5.560812946066029e-07, + "loss": 0.9505, + "step": 7147 + }, + { + "epoch": 0.34076228160083905, + "grad_norm": 1.9170050621032715, + "learning_rate": 5.54790418793445e-07, + "loss": 0.7896, + "step": 7148 + }, + { + "epoch": 0.34080995399613856, + "grad_norm": 1.3238599300384521, + "learning_rate": 5.53501000278861e-07, + "loss": 1.0377, + "step": 7149 + }, + { + "epoch": 0.340857626391438, + "grad_norm": 7.5345377922058105, + "learning_rate": 5.522130392617908e-07, + "loss": 0.4883, + "step": 7150 + }, + { + "epoch": 0.34090529878673753, + "grad_norm": 1.5058772563934326, + "learning_rate": 5.509265359409544e-07, + "loss": 0.7266, + "step": 7151 + }, + { + "epoch": 0.34095297118203705, + "grad_norm": 1.70972740650177, + "learning_rate": 5.496414905148495e-07, + "loss": 0.756, + "step": 7152 + }, + { + "epoch": 0.34100064357733656, + "grad_norm": 1.86093008518219, + "learning_rate": 5.48357903181741e-07, + "loss": 0.6973, + "step": 7153 + }, + { + "epoch": 0.341048315972636, + "grad_norm": 2.4055392742156982, + "learning_rate": 5.47075774139676e-07, + "loss": 1.2593, + "step": 7154 + }, + { + "epoch": 0.34109598836793553, + "grad_norm": 5.15859317779541, + "learning_rate": 5.457951035864729e-07, + "loss": 0.4831, + "step": 7155 + }, + { + "epoch": 0.34114366076323505, + "grad_norm": 1.8685381412506104, + "learning_rate": 5.445158917197246e-07, + "loss": 0.6988, + "step": 7156 + }, + { + "epoch": 0.34119133315853456, + "grad_norm": 4.353698253631592, + "learning_rate": 5.432381387368014e-07, + "loss": 1.0267, + "step": 7157 + }, + { + "epoch": 0.3412390055538341, + "grad_norm": 
1.3330374956130981, + "learning_rate": 5.419618448348485e-07, + "loss": 0.9164, + "step": 7158 + }, + { + "epoch": 0.34128667794913353, + "grad_norm": 1.1249141693115234, + "learning_rate": 5.40687010210783e-07, + "loss": 0.5959, + "step": 7159 + }, + { + "epoch": 0.34133435034443305, + "grad_norm": 4.510800838470459, + "learning_rate": 5.394136350613e-07, + "loss": 0.7041, + "step": 7160 + }, + { + "epoch": 0.34138202273973256, + "grad_norm": 2.3502354621887207, + "learning_rate": 5.381417195828698e-07, + "loss": 0.5844, + "step": 7161 + }, + { + "epoch": 0.3414296951350321, + "grad_norm": 2.194605588912964, + "learning_rate": 5.368712639717311e-07, + "loss": 0.9033, + "step": 7162 + }, + { + "epoch": 0.3414773675303316, + "grad_norm": 1.9759695529937744, + "learning_rate": 5.35602268423906e-07, + "loss": 0.734, + "step": 7163 + }, + { + "epoch": 0.34152503992563105, + "grad_norm": 1.3834506273269653, + "learning_rate": 5.343347331351878e-07, + "loss": 0.6274, + "step": 7164 + }, + { + "epoch": 0.34157271232093056, + "grad_norm": 1.0868414640426636, + "learning_rate": 5.330686583011413e-07, + "loss": 0.6798, + "step": 7165 + }, + { + "epoch": 0.3416203847162301, + "grad_norm": 2.306903600692749, + "learning_rate": 5.318040441171101e-07, + "loss": 1.0728, + "step": 7166 + }, + { + "epoch": 0.3416680571115296, + "grad_norm": 1.7810614109039307, + "learning_rate": 5.305408907782128e-07, + "loss": 0.7936, + "step": 7167 + }, + { + "epoch": 0.34171572950682905, + "grad_norm": 1.8817362785339355, + "learning_rate": 5.292791984793388e-07, + "loss": 0.2779, + "step": 7168 + }, + { + "epoch": 0.34176340190212856, + "grad_norm": 1.3534526824951172, + "learning_rate": 5.280189674151559e-07, + "loss": 0.6725, + "step": 7169 + }, + { + "epoch": 0.3418110742974281, + "grad_norm": 1.385118007659912, + "learning_rate": 5.267601977801018e-07, + "loss": 0.558, + "step": 7170 + }, + { + "epoch": 0.3418587466927276, + "grad_norm": 1.1696964502334595, + "learning_rate": 5.255028897683956e-07, + "loss": 0.9574, + "step": 7171 + }, + { + "epoch": 0.3419064190880271, + "grad_norm": 0.910984992980957, + "learning_rate": 5.242470435740232e-07, + "loss": 0.7166, + "step": 7172 + }, + { + "epoch": 0.34195409148332656, + "grad_norm": 1.6158665418624878, + "learning_rate": 5.229926593907531e-07, + "loss": 0.697, + "step": 7173 + }, + { + "epoch": 0.3420017638786261, + "grad_norm": 1.2293812036514282, + "learning_rate": 5.217397374121192e-07, + "loss": 0.5826, + "step": 7174 + }, + { + "epoch": 0.3420494362739256, + "grad_norm": 1.4363961219787598, + "learning_rate": 5.204882778314358e-07, + "loss": 0.4648, + "step": 7175 + }, + { + "epoch": 0.3420971086692251, + "grad_norm": 2.4854228496551514, + "learning_rate": 5.192382808417939e-07, + "loss": 0.9834, + "step": 7176 + }, + { + "epoch": 0.34214478106452456, + "grad_norm": 1.4465450048446655, + "learning_rate": 5.179897466360495e-07, + "loss": 0.9246, + "step": 7177 + }, + { + "epoch": 0.3421924534598241, + "grad_norm": 1.1187666654586792, + "learning_rate": 5.167426754068427e-07, + "loss": 0.8393, + "step": 7178 + }, + { + "epoch": 0.3422401258551236, + "grad_norm": 1.1752023696899414, + "learning_rate": 5.154970673465831e-07, + "loss": 0.2504, + "step": 7179 + }, + { + "epoch": 0.3422877982504231, + "grad_norm": 1.0947926044464111, + "learning_rate": 5.142529226474536e-07, + "loss": 0.4632, + "step": 7180 + }, + { + "epoch": 0.3423354706457226, + "grad_norm": 1.5549540519714355, + "learning_rate": 5.130102415014137e-07, + "loss": 0.8201, + "step": 7181 + }, + { + 
"epoch": 0.3423831430410221, + "grad_norm": 3.417045831680298, + "learning_rate": 5.11769024100196e-07, + "loss": 0.8468, + "step": 7182 + }, + { + "epoch": 0.3424308154363216, + "grad_norm": 1.71469247341156, + "learning_rate": 5.105292706353093e-07, + "loss": 0.9623, + "step": 7183 + }, + { + "epoch": 0.3424784878316211, + "grad_norm": 3.3496880531311035, + "learning_rate": 5.09290981298034e-07, + "loss": 1.0611, + "step": 7184 + }, + { + "epoch": 0.3425261602269206, + "grad_norm": 1.8131933212280273, + "learning_rate": 5.080541562794239e-07, + "loss": 0.8119, + "step": 7185 + }, + { + "epoch": 0.3425738326222201, + "grad_norm": 1.277723789215088, + "learning_rate": 5.068187957703097e-07, + "loss": 0.6334, + "step": 7186 + }, + { + "epoch": 0.3426215050175196, + "grad_norm": 1.1603819131851196, + "learning_rate": 5.055848999612934e-07, + "loss": 0.7065, + "step": 7187 + }, + { + "epoch": 0.3426691774128191, + "grad_norm": 1.8818080425262451, + "learning_rate": 5.043524690427537e-07, + "loss": 0.5874, + "step": 7188 + }, + { + "epoch": 0.3427168498081186, + "grad_norm": 1.458407998085022, + "learning_rate": 5.031215032048431e-07, + "loss": 0.8327, + "step": 7189 + }, + { + "epoch": 0.34276452220341813, + "grad_norm": 1.8858898878097534, + "learning_rate": 5.018920026374841e-07, + "loss": 1.028, + "step": 7190 + }, + { + "epoch": 0.3428121945987176, + "grad_norm": 0.910244345664978, + "learning_rate": 5.006639675303781e-07, + "loss": 0.4734, + "step": 7191 + }, + { + "epoch": 0.3428598669940171, + "grad_norm": 2.176889657974243, + "learning_rate": 4.994373980729983e-07, + "loss": 0.9091, + "step": 7192 + }, + { + "epoch": 0.3429075393893166, + "grad_norm": 3.477884292602539, + "learning_rate": 4.982122944545908e-07, + "loss": 0.936, + "step": 7193 + }, + { + "epoch": 0.34295521178461613, + "grad_norm": 1.5500426292419434, + "learning_rate": 4.969886568641757e-07, + "loss": 0.5856, + "step": 7194 + }, + { + "epoch": 0.34300288417991565, + "grad_norm": 1.4851431846618652, + "learning_rate": 4.957664854905508e-07, + "loss": 0.6471, + "step": 7195 + }, + { + "epoch": 0.3430505565752151, + "grad_norm": 1.420636534690857, + "learning_rate": 4.945457805222809e-07, + "loss": 0.1592, + "step": 7196 + }, + { + "epoch": 0.3430982289705146, + "grad_norm": 2.4242300987243652, + "learning_rate": 4.933265421477096e-07, + "loss": 0.5091, + "step": 7197 + }, + { + "epoch": 0.34314590136581413, + "grad_norm": 1.2386995553970337, + "learning_rate": 4.921087705549544e-07, + "loss": 0.6226, + "step": 7198 + }, + { + "epoch": 0.34319357376111365, + "grad_norm": 3.5894792079925537, + "learning_rate": 4.908924659319037e-07, + "loss": 0.7458, + "step": 7199 + }, + { + "epoch": 0.3432412461564131, + "grad_norm": 3.0469913482666016, + "learning_rate": 4.896776284662186e-07, + "loss": 0.8975, + "step": 7200 + }, + { + "epoch": 0.3432889185517126, + "grad_norm": 1.621962070465088, + "learning_rate": 4.884642583453403e-07, + "loss": 0.9469, + "step": 7201 + }, + { + "epoch": 0.34333659094701213, + "grad_norm": 1.3070720434188843, + "learning_rate": 4.872523557564756e-07, + "loss": 0.6808, + "step": 7202 + }, + { + "epoch": 0.34338426334231165, + "grad_norm": 2.817579507827759, + "learning_rate": 4.860419208866096e-07, + "loss": 0.698, + "step": 7203 + }, + { + "epoch": 0.34343193573761116, + "grad_norm": 4.396624565124512, + "learning_rate": 4.848329539225027e-07, + "loss": 0.4839, + "step": 7204 + }, + { + "epoch": 0.3434796081329106, + "grad_norm": 3.2510831356048584, + "learning_rate": 4.836254550506814e-07, + 
"loss": 0.7595, + "step": 7205 + }, + { + "epoch": 0.34352728052821013, + "grad_norm": 1.2895548343658447, + "learning_rate": 4.824194244574531e-07, + "loss": 0.8418, + "step": 7206 + }, + { + "epoch": 0.34357495292350965, + "grad_norm": 2.766206741333008, + "learning_rate": 4.81214862328897e-07, + "loss": 1.4206, + "step": 7207 + }, + { + "epoch": 0.34362262531880916, + "grad_norm": 1.9874190092086792, + "learning_rate": 4.80011768850862e-07, + "loss": 1.0453, + "step": 7208 + }, + { + "epoch": 0.3436702977141086, + "grad_norm": 2.1299922466278076, + "learning_rate": 4.788101442089732e-07, + "loss": 0.7787, + "step": 7209 + }, + { + "epoch": 0.34371797010940813, + "grad_norm": 3.9610226154327393, + "learning_rate": 4.77609988588632e-07, + "loss": 1.3, + "step": 7210 + }, + { + "epoch": 0.34376564250470765, + "grad_norm": 1.107771635055542, + "learning_rate": 4.764113021750061e-07, + "loss": 0.6033, + "step": 7211 + }, + { + "epoch": 0.34381331490000716, + "grad_norm": 1.764366865158081, + "learning_rate": 4.752140851530429e-07, + "loss": 0.514, + "step": 7212 + }, + { + "epoch": 0.3438609872953067, + "grad_norm": 2.4293935298919678, + "learning_rate": 4.740183377074603e-07, + "loss": 1.0554, + "step": 7213 + }, + { + "epoch": 0.34390865969060613, + "grad_norm": 1.5060569047927856, + "learning_rate": 4.728240600227496e-07, + "loss": 0.5778, + "step": 7214 + }, + { + "epoch": 0.34395633208590565, + "grad_norm": 1.059520959854126, + "learning_rate": 4.7163125228317565e-07, + "loss": 0.4218, + "step": 7215 + }, + { + "epoch": 0.34400400448120516, + "grad_norm": 1.587288737297058, + "learning_rate": 4.704399146727767e-07, + "loss": 0.6765, + "step": 7216 + }, + { + "epoch": 0.3440516768765047, + "grad_norm": 2.7077524662017822, + "learning_rate": 4.692500473753625e-07, + "loss": 0.8471, + "step": 7217 + }, + { + "epoch": 0.3440993492718042, + "grad_norm": 3.374375343322754, + "learning_rate": 4.6806165057451833e-07, + "loss": 0.5849, + "step": 7218 + }, + { + "epoch": 0.34414702166710365, + "grad_norm": 1.5658568143844604, + "learning_rate": 4.6687472445360206e-07, + "loss": 0.609, + "step": 7219 + }, + { + "epoch": 0.34419469406240316, + "grad_norm": 1.7451024055480957, + "learning_rate": 4.656892691957426e-07, + "loss": 0.7883, + "step": 7220 + }, + { + "epoch": 0.3442423664577027, + "grad_norm": 3.42445969581604, + "learning_rate": 4.6450528498384493e-07, + "loss": 1.556, + "step": 7221 + }, + { + "epoch": 0.3442900388530022, + "grad_norm": 3.019050121307373, + "learning_rate": 4.6332277200058397e-07, + "loss": 1.5462, + "step": 7222 + }, + { + "epoch": 0.34433771124830165, + "grad_norm": 1.2758978605270386, + "learning_rate": 4.621417304284126e-07, + "loss": 0.7431, + "step": 7223 + }, + { + "epoch": 0.34438538364360116, + "grad_norm": 2.1080386638641357, + "learning_rate": 4.609621604495507e-07, + "loss": 0.9957, + "step": 7224 + }, + { + "epoch": 0.3444330560389007, + "grad_norm": 2.187030792236328, + "learning_rate": 4.597840622459937e-07, + "loss": 0.8028, + "step": 7225 + }, + { + "epoch": 0.3444807284342002, + "grad_norm": 1.9063327312469482, + "learning_rate": 4.5860743599951186e-07, + "loss": 0.9747, + "step": 7226 + }, + { + "epoch": 0.3445284008294997, + "grad_norm": 1.3499354124069214, + "learning_rate": 4.574322818916443e-07, + "loss": 0.5217, + "step": 7227 + }, + { + "epoch": 0.34457607322479916, + "grad_norm": 1.9013001918792725, + "learning_rate": 4.5625860010370726e-07, + "loss": 0.6729, + "step": 7228 + }, + { + "epoch": 0.3446237456200987, + "grad_norm": 
1.1611921787261963, + "learning_rate": 4.550863908167846e-07, + "loss": 0.4916, + "step": 7229 + }, + { + "epoch": 0.3446714180153982, + "grad_norm": 1.9586031436920166, + "learning_rate": 4.5391565421174065e-07, + "loss": 0.8037, + "step": 7230 + }, + { + "epoch": 0.3447190904106977, + "grad_norm": 1.3925200700759888, + "learning_rate": 4.527463904692042e-07, + "loss": 0.7632, + "step": 7231 + }, + { + "epoch": 0.34476676280599716, + "grad_norm": 2.207425832748413, + "learning_rate": 4.515785997695832e-07, + "loss": 0.9885, + "step": 7232 + }, + { + "epoch": 0.3448144352012967, + "grad_norm": 1.337417483329773, + "learning_rate": 4.5041228229305343e-07, + "loss": 0.6356, + "step": 7233 + }, + { + "epoch": 0.3448621075965962, + "grad_norm": 1.5648466348648071, + "learning_rate": 4.492474382195666e-07, + "loss": 0.8432, + "step": 7234 + }, + { + "epoch": 0.3449097799918957, + "grad_norm": 1.0981671810150146, + "learning_rate": 4.480840677288478e-07, + "loss": 0.3318, + "step": 7235 + }, + { + "epoch": 0.3449574523871952, + "grad_norm": 1.4383518695831299, + "learning_rate": 4.4692217100039013e-07, + "loss": 0.8603, + "step": 7236 + }, + { + "epoch": 0.3450051247824947, + "grad_norm": 1.3654465675354004, + "learning_rate": 4.457617482134635e-07, + "loss": 0.9705, + "step": 7237 + }, + { + "epoch": 0.3450527971777942, + "grad_norm": 2.0730645656585693, + "learning_rate": 4.446027995471114e-07, + "loss": 0.8829, + "step": 7238 + }, + { + "epoch": 0.3451004695730937, + "grad_norm": 1.2500892877578735, + "learning_rate": 4.4344532518014405e-07, + "loss": 0.8822, + "step": 7239 + }, + { + "epoch": 0.3451481419683932, + "grad_norm": 2.913982629776001, + "learning_rate": 4.4228932529114975e-07, + "loss": 1.0263, + "step": 7240 + }, + { + "epoch": 0.3451958143636927, + "grad_norm": 1.9293720722198486, + "learning_rate": 4.411348000584881e-07, + "loss": 0.5467, + "step": 7241 + }, + { + "epoch": 0.3452434867589922, + "grad_norm": 1.5334426164627075, + "learning_rate": 4.3998174966028875e-07, + "loss": 0.7143, + "step": 7242 + }, + { + "epoch": 0.3452911591542917, + "grad_norm": 1.1827961206436157, + "learning_rate": 4.3883017427445717e-07, + "loss": 0.669, + "step": 7243 + }, + { + "epoch": 0.3453388315495912, + "grad_norm": 2.1932945251464844, + "learning_rate": 4.3768007407866685e-07, + "loss": 0.8165, + "step": 7244 + }, + { + "epoch": 0.34538650394489073, + "grad_norm": 0.846015214920044, + "learning_rate": 4.3653144925037025e-07, + "loss": 0.5453, + "step": 7245 + }, + { + "epoch": 0.3454341763401902, + "grad_norm": 1.4479868412017822, + "learning_rate": 4.3538429996678567e-07, + "loss": 0.8784, + "step": 7246 + }, + { + "epoch": 0.3454818487354897, + "grad_norm": 1.8788586854934692, + "learning_rate": 4.342386264049081e-07, + "loss": 0.8506, + "step": 7247 + }, + { + "epoch": 0.3455295211307892, + "grad_norm": 1.759360671043396, + "learning_rate": 4.3309442874150063e-07, + "loss": 0.724, + "step": 7248 + }, + { + "epoch": 0.34557719352608873, + "grad_norm": 17.884838104248047, + "learning_rate": 4.319517071531021e-07, + "loss": 0.9161, + "step": 7249 + }, + { + "epoch": 0.34562486592138825, + "grad_norm": 1.5192292928695679, + "learning_rate": 4.3081046181602583e-07, + "loss": 0.5112, + "step": 7250 + }, + { + "epoch": 0.3456725383166877, + "grad_norm": 5.040566444396973, + "learning_rate": 4.296706929063499e-07, + "loss": 0.3561, + "step": 7251 + }, + { + "epoch": 0.3457202107119872, + "grad_norm": 2.5041663646698, + "learning_rate": 4.285324005999303e-07, + "loss": 0.5347, + "step": 7252 + }, 
+ { + "epoch": 0.34576788310728673, + "grad_norm": 2.3320980072021484, + "learning_rate": 4.2739558507239543e-07, + "loss": 1.0952, + "step": 7253 + }, + { + "epoch": 0.34581555550258625, + "grad_norm": 2.034547805786133, + "learning_rate": 4.2626024649914275e-07, + "loss": 0.3327, + "step": 7254 + }, + { + "epoch": 0.3458632278978857, + "grad_norm": 1.6025238037109375, + "learning_rate": 4.251263850553433e-07, + "loss": 0.6959, + "step": 7255 + }, + { + "epoch": 0.3459109002931852, + "grad_norm": 1.8562968969345093, + "learning_rate": 4.2399400091594154e-07, + "loss": 0.679, + "step": 7256 + }, + { + "epoch": 0.34595857268848473, + "grad_norm": 2.029865264892578, + "learning_rate": 4.2286309425564997e-07, + "loss": 0.6528, + "step": 7257 + }, + { + "epoch": 0.34600624508378425, + "grad_norm": 0.9812089800834656, + "learning_rate": 4.2173366524895787e-07, + "loss": 0.5244, + "step": 7258 + }, + { + "epoch": 0.34605391747908376, + "grad_norm": 1.5033107995986938, + "learning_rate": 4.2060571407012583e-07, + "loss": 0.6437, + "step": 7259 + }, + { + "epoch": 0.3461015898743832, + "grad_norm": 1.2605870962142944, + "learning_rate": 4.1947924089318247e-07, + "loss": 0.8642, + "step": 7260 + }, + { + "epoch": 0.34614926226968273, + "grad_norm": 3.5696067810058594, + "learning_rate": 4.1835424589193096e-07, + "loss": 0.6863, + "step": 7261 + }, + { + "epoch": 0.34619693466498225, + "grad_norm": 2.35353684425354, + "learning_rate": 4.17230729239948e-07, + "loss": 1.0326, + "step": 7262 + }, + { + "epoch": 0.34624460706028176, + "grad_norm": 2.9793784618377686, + "learning_rate": 4.161086911105816e-07, + "loss": 1.1812, + "step": 7263 + }, + { + "epoch": 0.3462922794555812, + "grad_norm": 1.5557667016983032, + "learning_rate": 4.1498813167694776e-07, + "loss": 1.0101, + "step": 7264 + }, + { + "epoch": 0.34633995185088073, + "grad_norm": 1.3478249311447144, + "learning_rate": 4.138690511119381e-07, + "loss": 0.74, + "step": 7265 + }, + { + "epoch": 0.34638762424618025, + "grad_norm": 2.1713931560516357, + "learning_rate": 4.127514495882168e-07, + "loss": 0.5199, + "step": 7266 + }, + { + "epoch": 0.34643529664147976, + "grad_norm": 1.6194018125534058, + "learning_rate": 4.1163532727821696e-07, + "loss": 0.8982, + "step": 7267 + }, + { + "epoch": 0.3464829690367793, + "grad_norm": 2.487565279006958, + "learning_rate": 4.1052068435414426e-07, + "loss": 0.8187, + "step": 7268 + }, + { + "epoch": 0.34653064143207873, + "grad_norm": 1.365309476852417, + "learning_rate": 4.094075209879789e-07, + "loss": 0.7434, + "step": 7269 + }, + { + "epoch": 0.34657831382737825, + "grad_norm": 2.3606138229370117, + "learning_rate": 4.082958373514689e-07, + "loss": 1.1335, + "step": 7270 + }, + { + "epoch": 0.34662598622267776, + "grad_norm": 1.2571296691894531, + "learning_rate": 4.0718563361613396e-07, + "loss": 0.7632, + "step": 7271 + }, + { + "epoch": 0.3466736586179773, + "grad_norm": 1.0391098260879517, + "learning_rate": 4.060769099532713e-07, + "loss": 0.3259, + "step": 7272 + }, + { + "epoch": 0.34672133101327673, + "grad_norm": 1.6397120952606201, + "learning_rate": 4.04969666533942e-07, + "loss": 0.6466, + "step": 7273 + }, + { + "epoch": 0.34676900340857625, + "grad_norm": 2.494081974029541, + "learning_rate": 4.0386390352898376e-07, + "loss": 0.9244, + "step": 7274 + }, + { + "epoch": 0.34681667580387576, + "grad_norm": 1.679502010345459, + "learning_rate": 4.0275962110900455e-07, + "loss": 0.8833, + "step": 7275 + }, + { + "epoch": 0.3468643481991753, + "grad_norm": 1.6045907735824585, + 
"learning_rate": 4.016568194443826e-07, + "loss": 0.6828, + "step": 7276 + }, + { + "epoch": 0.3469120205944748, + "grad_norm": 1.292438268661499, + "learning_rate": 4.0055549870526955e-07, + "loss": 0.6701, + "step": 7277 + }, + { + "epoch": 0.34695969298977425, + "grad_norm": 1.9425338506698608, + "learning_rate": 3.9945565906158833e-07, + "loss": 0.9244, + "step": 7278 + }, + { + "epoch": 0.34700736538507376, + "grad_norm": 1.5062963962554932, + "learning_rate": 3.9835730068303215e-07, + "loss": 0.5208, + "step": 7279 + }, + { + "epoch": 0.3470550377803733, + "grad_norm": 1.8522506952285767, + "learning_rate": 3.9726042373906536e-07, + "loss": 0.6404, + "step": 7280 + }, + { + "epoch": 0.3471027101756728, + "grad_norm": 1.5433088541030884, + "learning_rate": 3.961650283989282e-07, + "loss": 0.5705, + "step": 7281 + }, + { + "epoch": 0.3471503825709723, + "grad_norm": 3.9353222846984863, + "learning_rate": 3.9507111483162554e-07, + "loss": 1.1046, + "step": 7282 + }, + { + "epoch": 0.34719805496627176, + "grad_norm": 2.1075096130371094, + "learning_rate": 3.939786832059389e-07, + "loss": 0.9925, + "step": 7283 + }, + { + "epoch": 0.3472457273615713, + "grad_norm": 2.5660717487335205, + "learning_rate": 3.928877336904191e-07, + "loss": 1.3565, + "step": 7284 + }, + { + "epoch": 0.3472933997568708, + "grad_norm": 3.5466341972351074, + "learning_rate": 3.9179826645338594e-07, + "loss": 0.2571, + "step": 7285 + }, + { + "epoch": 0.3473410721521703, + "grad_norm": 1.2514286041259766, + "learning_rate": 3.90710281662936e-07, + "loss": 0.706, + "step": 7286 + }, + { + "epoch": 0.34738874454746976, + "grad_norm": 1.27994966506958, + "learning_rate": 3.8962377948693395e-07, + "loss": 0.8089, + "step": 7287 + }, + { + "epoch": 0.3474364169427693, + "grad_norm": 1.498083472251892, + "learning_rate": 3.885387600930135e-07, + "loss": 0.5761, + "step": 7288 + }, + { + "epoch": 0.3474840893380688, + "grad_norm": 3.6953487396240234, + "learning_rate": 3.8745522364858513e-07, + "loss": 0.9285, + "step": 7289 + }, + { + "epoch": 0.3475317617333683, + "grad_norm": 1.5337369441986084, + "learning_rate": 3.86373170320824e-07, + "loss": 0.8, + "step": 7290 + }, + { + "epoch": 0.3475794341286678, + "grad_norm": 2.4488043785095215, + "learning_rate": 3.8529260027668325e-07, + "loss": 0.4423, + "step": 7291 + }, + { + "epoch": 0.3476271065239673, + "grad_norm": 1.5359479188919067, + "learning_rate": 3.842135136828806e-07, + "loss": 0.5747, + "step": 7292 + }, + { + "epoch": 0.3476747789192668, + "grad_norm": 2.4388535022735596, + "learning_rate": 3.831359107059096e-07, + "loss": 0.6063, + "step": 7293 + }, + { + "epoch": 0.3477224513145663, + "grad_norm": 3.1549928188323975, + "learning_rate": 3.8205979151203274e-07, + "loss": 1.0557, + "step": 7294 + }, + { + "epoch": 0.3477701237098658, + "grad_norm": 1.7099775075912476, + "learning_rate": 3.809851562672839e-07, + "loss": 0.4977, + "step": 7295 + }, + { + "epoch": 0.3478177961051653, + "grad_norm": 1.210395336151123, + "learning_rate": 3.799120051374694e-07, + "loss": 0.4558, + "step": 7296 + }, + { + "epoch": 0.3478654685004648, + "grad_norm": 1.3912314176559448, + "learning_rate": 3.7884033828816556e-07, + "loss": 0.7312, + "step": 7297 + }, + { + "epoch": 0.3479131408957643, + "grad_norm": 2.0125679969787598, + "learning_rate": 3.77770155884718e-07, + "loss": 0.609, + "step": 7298 + }, + { + "epoch": 0.3479608132910638, + "grad_norm": 3.271801233291626, + "learning_rate": 3.7670145809224567e-07, + "loss": 0.5904, + "step": 7299 + }, + { + "epoch": 
0.34800848568636333, + "grad_norm": 1.4114795923233032, + "learning_rate": 3.7563424507563785e-07, + "loss": 0.751, + "step": 7300 + }, + { + "epoch": 0.3480561580816628, + "grad_norm": 1.7076895236968994, + "learning_rate": 3.745685169995539e-07, + "loss": 0.9077, + "step": 7301 + }, + { + "epoch": 0.3481038304769623, + "grad_norm": 1.2552075386047363, + "learning_rate": 3.7350427402842446e-07, + "loss": 0.5939, + "step": 7302 + }, + { + "epoch": 0.3481515028722618, + "grad_norm": 1.6350072622299194, + "learning_rate": 3.7244151632645387e-07, + "loss": 0.8362, + "step": 7303 + }, + { + "epoch": 0.34819917526756133, + "grad_norm": 3.420283555984497, + "learning_rate": 3.7138024405761197e-07, + "loss": 0.512, + "step": 7304 + }, + { + "epoch": 0.34824684766286085, + "grad_norm": 2.005946159362793, + "learning_rate": 3.7032045738564114e-07, + "loss": 0.4772, + "step": 7305 + }, + { + "epoch": 0.3482945200581603, + "grad_norm": 1.7450584173202515, + "learning_rate": 3.692621564740584e-07, + "loss": 1.0413, + "step": 7306 + }, + { + "epoch": 0.3483421924534598, + "grad_norm": 2.4184982776641846, + "learning_rate": 3.682053414861475e-07, + "loss": 0.9437, + "step": 7307 + }, + { + "epoch": 0.34838986484875933, + "grad_norm": 1.6064496040344238, + "learning_rate": 3.6715001258496365e-07, + "loss": 0.3939, + "step": 7308 + }, + { + "epoch": 0.34843753724405885, + "grad_norm": 1.8300976753234863, + "learning_rate": 3.660961699333343e-07, + "loss": 0.8987, + "step": 7309 + }, + { + "epoch": 0.3484852096393583, + "grad_norm": 1.73401939868927, + "learning_rate": 3.65043813693855e-07, + "loss": 0.7753, + "step": 7310 + }, + { + "epoch": 0.3485328820346578, + "grad_norm": 2.3060131072998047, + "learning_rate": 3.6399294402889473e-07, + "loss": 0.6279, + "step": 7311 + }, + { + "epoch": 0.34858055442995733, + "grad_norm": 2.206881284713745, + "learning_rate": 3.629435611005916e-07, + "loss": 0.6554, + "step": 7312 + }, + { + "epoch": 0.34862822682525685, + "grad_norm": 3.581489086151123, + "learning_rate": 3.618956650708549e-07, + "loss": 1.2522, + "step": 7313 + }, + { + "epoch": 0.34867589922055636, + "grad_norm": 1.3536385297775269, + "learning_rate": 3.608492561013632e-07, + "loss": 0.4236, + "step": 7314 + }, + { + "epoch": 0.3487235716158558, + "grad_norm": 1.4357553720474243, + "learning_rate": 3.598043343535673e-07, + "loss": 0.7097, + "step": 7315 + }, + { + "epoch": 0.34877124401115533, + "grad_norm": 1.215166687965393, + "learning_rate": 3.5876089998868825e-07, + "loss": 0.6834, + "step": 7316 + }, + { + "epoch": 0.34881891640645485, + "grad_norm": 1.62375009059906, + "learning_rate": 3.577189531677161e-07, + "loss": 0.4891, + "step": 7317 + }, + { + "epoch": 0.34886658880175436, + "grad_norm": 4.09118127822876, + "learning_rate": 3.566784940514145e-07, + "loss": 0.6916, + "step": 7318 + }, + { + "epoch": 0.3489142611970538, + "grad_norm": 1.5236778259277344, + "learning_rate": 3.55639522800314e-07, + "loss": 0.5245, + "step": 7319 + }, + { + "epoch": 0.34896193359235333, + "grad_norm": 2.2369823455810547, + "learning_rate": 3.546020395747163e-07, + "loss": 0.8088, + "step": 7320 + }, + { + "epoch": 0.34900960598765285, + "grad_norm": 4.75441312789917, + "learning_rate": 3.5356604453469665e-07, + "loss": 0.412, + "step": 7321 + }, + { + "epoch": 0.34905727838295236, + "grad_norm": 2.548539400100708, + "learning_rate": 3.525315378400962e-07, + "loss": 0.7128, + "step": 7322 + }, + { + "epoch": 0.3491049507782519, + "grad_norm": 2.4372811317443848, + "learning_rate": 3.514985196505305e-07, + 
"loss": 0.6168, + "step": 7323 + }, + { + "epoch": 0.34915262317355134, + "grad_norm": 1.3216582536697388, + "learning_rate": 3.504669901253832e-07, + "loss": 0.7532, + "step": 7324 + }, + { + "epoch": 0.34920029556885085, + "grad_norm": 1.550697922706604, + "learning_rate": 3.4943694942380704e-07, + "loss": 0.6972, + "step": 7325 + }, + { + "epoch": 0.34924796796415036, + "grad_norm": 2.0736286640167236, + "learning_rate": 3.484083977047281e-07, + "loss": 0.9249, + "step": 7326 + }, + { + "epoch": 0.3492956403594499, + "grad_norm": 1.6073224544525146, + "learning_rate": 3.473813351268429e-07, + "loss": 0.8632, + "step": 7327 + }, + { + "epoch": 0.34934331275474934, + "grad_norm": 1.408355474472046, + "learning_rate": 3.463557618486135e-07, + "loss": 0.2324, + "step": 7328 + }, + { + "epoch": 0.34939098515004885, + "grad_norm": 1.9746421575546265, + "learning_rate": 3.453316780282767e-07, + "loss": 0.7641, + "step": 7329 + }, + { + "epoch": 0.34943865754534836, + "grad_norm": 1.434739351272583, + "learning_rate": 3.4430908382383944e-07, + "loss": 0.7244, + "step": 7330 + }, + { + "epoch": 0.3494863299406479, + "grad_norm": 1.6043431758880615, + "learning_rate": 3.4328797939307435e-07, + "loss": 0.7228, + "step": 7331 + }, + { + "epoch": 0.3495340023359474, + "grad_norm": 2.1351242065429688, + "learning_rate": 3.4226836489352987e-07, + "loss": 0.3226, + "step": 7332 + }, + { + "epoch": 0.34958167473124685, + "grad_norm": 1.3221118450164795, + "learning_rate": 3.412502404825224e-07, + "loss": 0.791, + "step": 7333 + }, + { + "epoch": 0.34962934712654636, + "grad_norm": 1.9221763610839844, + "learning_rate": 3.402336063171352e-07, + "loss": 0.6204, + "step": 7334 + }, + { + "epoch": 0.3496770195218459, + "grad_norm": 1.3127278089523315, + "learning_rate": 3.392184625542283e-07, + "loss": 0.7301, + "step": 7335 + }, + { + "epoch": 0.3497246919171454, + "grad_norm": 2.440467357635498, + "learning_rate": 3.382048093504242e-07, + "loss": 0.9175, + "step": 7336 + }, + { + "epoch": 0.3497723643124449, + "grad_norm": 2.429494857788086, + "learning_rate": 3.371926468621212e-07, + "loss": 0.8546, + "step": 7337 + }, + { + "epoch": 0.34982003670774436, + "grad_norm": 2.4686343669891357, + "learning_rate": 3.3618197524548534e-07, + "loss": 0.7445, + "step": 7338 + }, + { + "epoch": 0.3498677091030439, + "grad_norm": 1.4753676652908325, + "learning_rate": 3.3517279465645204e-07, + "loss": 0.5532, + "step": 7339 + }, + { + "epoch": 0.3499153814983434, + "grad_norm": 5.123486042022705, + "learning_rate": 3.3416510525072886e-07, + "loss": 1.1981, + "step": 7340 + }, + { + "epoch": 0.3499630538936429, + "grad_norm": 1.894282579421997, + "learning_rate": 3.331589071837904e-07, + "loss": 0.7651, + "step": 7341 + }, + { + "epoch": 0.35001072628894236, + "grad_norm": 1.8569226264953613, + "learning_rate": 3.3215420061088245e-07, + "loss": 0.8511, + "step": 7342 + }, + { + "epoch": 0.3500583986842419, + "grad_norm": 2.3166210651397705, + "learning_rate": 3.311509856870243e-07, + "loss": 0.7973, + "step": 7343 + }, + { + "epoch": 0.3501060710795414, + "grad_norm": 1.7932738065719604, + "learning_rate": 3.3014926256699665e-07, + "loss": 0.6125, + "step": 7344 + }, + { + "epoch": 0.3501537434748409, + "grad_norm": 1.515670895576477, + "learning_rate": 3.2914903140535914e-07, + "loss": 0.8435, + "step": 7345 + }, + { + "epoch": 0.3502014158701404, + "grad_norm": 3.0000510215759277, + "learning_rate": 3.2815029235643505e-07, + "loss": 0.6665, + "step": 7346 + }, + { + "epoch": 0.3502490882654399, + "grad_norm": 
1.9593795537948608, + "learning_rate": 3.2715304557431994e-07, + "loss": 0.493, + "step": 7347 + }, + { + "epoch": 0.3502967606607394, + "grad_norm": 1.7829817533493042, + "learning_rate": 3.261572912128796e-07, + "loss": 0.2872, + "step": 7348 + }, + { + "epoch": 0.3503444330560389, + "grad_norm": 1.2360681295394897, + "learning_rate": 3.2516302942574794e-07, + "loss": 0.4964, + "step": 7349 + }, + { + "epoch": 0.3503921054513384, + "grad_norm": 1.6936216354370117, + "learning_rate": 3.241702603663288e-07, + "loss": 0.888, + "step": 7350 + }, + { + "epoch": 0.3504397778466379, + "grad_norm": 2.7934439182281494, + "learning_rate": 3.2317898418779634e-07, + "loss": 0.7825, + "step": 7351 + }, + { + "epoch": 0.3504874502419374, + "grad_norm": 2.769547939300537, + "learning_rate": 3.2218920104309605e-07, + "loss": 0.62, + "step": 7352 + }, + { + "epoch": 0.3505351226372369, + "grad_norm": 1.5975010395050049, + "learning_rate": 3.212009110849379e-07, + "loss": 1.0273, + "step": 7353 + }, + { + "epoch": 0.3505827950325364, + "grad_norm": 1.4424394369125366, + "learning_rate": 3.2021411446580774e-07, + "loss": 0.8914, + "step": 7354 + }, + { + "epoch": 0.35063046742783593, + "grad_norm": 2.261087656021118, + "learning_rate": 3.1922881133795827e-07, + "loss": 0.9779, + "step": 7355 + }, + { + "epoch": 0.3506781398231354, + "grad_norm": 6.544918537139893, + "learning_rate": 3.182450018534089e-07, + "loss": 0.6098, + "step": 7356 + }, + { + "epoch": 0.3507258122184349, + "grad_norm": 1.3822834491729736, + "learning_rate": 3.1726268616395273e-07, + "loss": 0.8185, + "step": 7357 + }, + { + "epoch": 0.3507734846137344, + "grad_norm": 1.5750081539154053, + "learning_rate": 3.1628186442115294e-07, + "loss": 0.7656, + "step": 7358 + }, + { + "epoch": 0.35082115700903393, + "grad_norm": 1.9999806880950928, + "learning_rate": 3.1530253677633625e-07, + "loss": 0.874, + "step": 7359 + }, + { + "epoch": 0.3508688294043334, + "grad_norm": 1.9656327962875366, + "learning_rate": 3.143247033806063e-07, + "loss": 1.2236, + "step": 7360 + }, + { + "epoch": 0.3509165017996329, + "grad_norm": 1.8565436601638794, + "learning_rate": 3.133483643848323e-07, + "loss": 0.8105, + "step": 7361 + }, + { + "epoch": 0.3509641741949324, + "grad_norm": 2.700535774230957, + "learning_rate": 3.123735199396516e-07, + "loss": 1.1901, + "step": 7362 + }, + { + "epoch": 0.35101184659023194, + "grad_norm": 1.5758708715438843, + "learning_rate": 3.1140017019547385e-07, + "loss": 0.5274, + "step": 7363 + }, + { + "epoch": 0.35105951898553145, + "grad_norm": 2.287595748901367, + "learning_rate": 3.1042831530247566e-07, + "loss": 0.7682, + "step": 7364 + }, + { + "epoch": 0.3511071913808309, + "grad_norm": 2.339594602584839, + "learning_rate": 3.0945795541060696e-07, + "loss": 0.8061, + "step": 7365 + }, + { + "epoch": 0.3511548637761304, + "grad_norm": 3.9156291484832764, + "learning_rate": 3.0848909066958035e-07, + "loss": 0.433, + "step": 7366 + }, + { + "epoch": 0.35120253617142994, + "grad_norm": 1.3135124444961548, + "learning_rate": 3.07521721228885e-07, + "loss": 0.8699, + "step": 7367 + }, + { + "epoch": 0.35125020856672945, + "grad_norm": 3.769801616668701, + "learning_rate": 3.06555847237775e-07, + "loss": 0.4134, + "step": 7368 + }, + { + "epoch": 0.35129788096202896, + "grad_norm": 2.105705738067627, + "learning_rate": 3.0559146884527324e-07, + "loss": 0.7116, + "step": 7369 + }, + { + "epoch": 0.3513455533573284, + "grad_norm": 1.5594096183776855, + "learning_rate": 3.0462858620017633e-07, + "loss": 0.7299, + "step": 7370 + 
}, + { + "epoch": 0.35139322575262794, + "grad_norm": 2.128023386001587, + "learning_rate": 3.0366719945104427e-07, + "loss": 1.5402, + "step": 7371 + }, + { + "epoch": 0.35144089814792745, + "grad_norm": 1.2129205465316772, + "learning_rate": 3.027073087462107e-07, + "loss": 0.4955, + "step": 7372 + }, + { + "epoch": 0.35148857054322696, + "grad_norm": 1.6877939701080322, + "learning_rate": 3.0174891423377595e-07, + "loss": 0.6331, + "step": 7373 + }, + { + "epoch": 0.3515362429385264, + "grad_norm": 1.6392323970794678, + "learning_rate": 3.007920160616129e-07, + "loss": 0.7552, + "step": 7374 + }, + { + "epoch": 0.35158391533382594, + "grad_norm": 2.563945770263672, + "learning_rate": 2.998366143773579e-07, + "loss": 0.6561, + "step": 7375 + }, + { + "epoch": 0.35163158772912545, + "grad_norm": 5.188019752502441, + "learning_rate": 2.988827093284219e-07, + "loss": 0.9177, + "step": 7376 + }, + { + "epoch": 0.35167926012442496, + "grad_norm": 1.811829924583435, + "learning_rate": 2.9793030106198164e-07, + "loss": 0.6108, + "step": 7377 + }, + { + "epoch": 0.3517269325197245, + "grad_norm": 1.7487602233886719, + "learning_rate": 2.9697938972498287e-07, + "loss": 0.6295, + "step": 7378 + }, + { + "epoch": 0.35177460491502394, + "grad_norm": 1.8587628602981567, + "learning_rate": 2.960299754641438e-07, + "loss": 0.7385, + "step": 7379 + }, + { + "epoch": 0.35182227731032345, + "grad_norm": 2.0606086254119873, + "learning_rate": 2.9508205842594727e-07, + "loss": 0.5784, + "step": 7380 + }, + { + "epoch": 0.35186994970562296, + "grad_norm": 1.1561492681503296, + "learning_rate": 2.941356387566474e-07, + "loss": 0.5408, + "step": 7381 + }, + { + "epoch": 0.3519176221009225, + "grad_norm": 1.6568877696990967, + "learning_rate": 2.9319071660226737e-07, + "loss": 0.8053, + "step": 7382 + }, + { + "epoch": 0.35196529449622194, + "grad_norm": 1.2379618883132935, + "learning_rate": 2.922472921086006e-07, + "loss": 0.5469, + "step": 7383 + }, + { + "epoch": 0.35201296689152145, + "grad_norm": 4.19793176651001, + "learning_rate": 2.913053654212039e-07, + "loss": 0.3475, + "step": 7384 + }, + { + "epoch": 0.35206063928682096, + "grad_norm": 1.9430595636367798, + "learning_rate": 2.9036493668541e-07, + "loss": 0.9096, + "step": 7385 + }, + { + "epoch": 0.3521083116821205, + "grad_norm": 3.407219171524048, + "learning_rate": 2.894260060463172e-07, + "loss": 0.6603, + "step": 7386 + }, + { + "epoch": 0.35215598407742, + "grad_norm": 1.756489634513855, + "learning_rate": 2.884885736487919e-07, + "loss": 0.7108, + "step": 7387 + }, + { + "epoch": 0.35220365647271945, + "grad_norm": 2.1382393836975098, + "learning_rate": 2.875526396374695e-07, + "loss": 0.1815, + "step": 7388 + }, + { + "epoch": 0.35225132886801896, + "grad_norm": 1.0845798254013062, + "learning_rate": 2.866182041567567e-07, + "loss": 0.5873, + "step": 7389 + }, + { + "epoch": 0.3522990012633185, + "grad_norm": 1.7142635583877563, + "learning_rate": 2.856852673508259e-07, + "loss": 0.9046, + "step": 7390 + }, + { + "epoch": 0.352346673658618, + "grad_norm": 0.9403621554374695, + "learning_rate": 2.8475382936362095e-07, + "loss": 0.3357, + "step": 7391 + }, + { + "epoch": 0.3523943460539175, + "grad_norm": 1.4498497247695923, + "learning_rate": 2.838238903388524e-07, + "loss": 0.8775, + "step": 7392 + }, + { + "epoch": 0.35244201844921696, + "grad_norm": 1.181903600692749, + "learning_rate": 2.828954504199999e-07, + "loss": 0.7466, + "step": 7393 + }, + { + "epoch": 0.3524896908445165, + "grad_norm": 1.3346874713897705, + "learning_rate": 
2.819685097503133e-07, + "loss": 0.7473, + "step": 7394 + }, + { + "epoch": 0.352537363239816, + "grad_norm": 3.614489793777466, + "learning_rate": 2.810430684728094e-07, + "loss": 1.2224, + "step": 7395 + }, + { + "epoch": 0.3525850356351155, + "grad_norm": 1.2386598587036133, + "learning_rate": 2.8011912673027274e-07, + "loss": 0.6399, + "step": 7396 + }, + { + "epoch": 0.35263270803041497, + "grad_norm": 2.0606119632720947, + "learning_rate": 2.791966846652594e-07, + "loss": 1.0436, + "step": 7397 + }, + { + "epoch": 0.3526803804257145, + "grad_norm": 3.0070900917053223, + "learning_rate": 2.7827574242009434e-07, + "loss": 0.9309, + "step": 7398 + }, + { + "epoch": 0.352728052821014, + "grad_norm": 1.8840301036834717, + "learning_rate": 2.773563001368673e-07, + "loss": 0.6555, + "step": 7399 + }, + { + "epoch": 0.3527757252163135, + "grad_norm": 10.14394474029541, + "learning_rate": 2.764383579574381e-07, + "loss": 0.2678, + "step": 7400 + }, + { + "epoch": 0.352823397611613, + "grad_norm": 1.8069679737091064, + "learning_rate": 2.75521916023439e-07, + "loss": 0.7415, + "step": 7401 + }, + { + "epoch": 0.3528710700069125, + "grad_norm": 1.3674018383026123, + "learning_rate": 2.7460697447626363e-07, + "loss": 0.6104, + "step": 7402 + }, + { + "epoch": 0.352918742402212, + "grad_norm": 1.916283130645752, + "learning_rate": 2.7369353345708006e-07, + "loss": 0.8912, + "step": 7403 + }, + { + "epoch": 0.3529664147975115, + "grad_norm": 3.7305352687835693, + "learning_rate": 2.727815931068234e-07, + "loss": 0.6913, + "step": 7404 + }, + { + "epoch": 0.353014087192811, + "grad_norm": 1.9718384742736816, + "learning_rate": 2.7187115356619553e-07, + "loss": 0.6206, + "step": 7405 + }, + { + "epoch": 0.3530617595881105, + "grad_norm": 1.2205933332443237, + "learning_rate": 2.7096221497566853e-07, + "loss": 0.6263, + "step": 7406 + }, + { + "epoch": 0.35310943198341, + "grad_norm": 2.4745607376098633, + "learning_rate": 2.7005477747548245e-07, + "loss": 0.9387, + "step": 7407 + }, + { + "epoch": 0.3531571043787095, + "grad_norm": 0.9382553696632385, + "learning_rate": 2.691488412056442e-07, + "loss": 0.3974, + "step": 7408 + }, + { + "epoch": 0.353204776774009, + "grad_norm": 2.874303102493286, + "learning_rate": 2.682444063059331e-07, + "loss": 0.952, + "step": 7409 + }, + { + "epoch": 0.35325244916930854, + "grad_norm": 1.4257711172103882, + "learning_rate": 2.6734147291589075e-07, + "loss": 0.6688, + "step": 7410 + }, + { + "epoch": 0.353300121564608, + "grad_norm": 2.292304754257202, + "learning_rate": 2.6644004117483357e-07, + "loss": 0.7714, + "step": 7411 + }, + { + "epoch": 0.3533477939599075, + "grad_norm": 1.4404491186141968, + "learning_rate": 2.655401112218403e-07, + "loss": 0.4795, + "step": 7412 + }, + { + "epoch": 0.353395466355207, + "grad_norm": 1.6066272258758545, + "learning_rate": 2.646416831957621e-07, + "loss": 0.7284, + "step": 7413 + }, + { + "epoch": 0.35344313875050654, + "grad_norm": 1.872521996498108, + "learning_rate": 2.637447572352192e-07, + "loss": 0.8242, + "step": 7414 + }, + { + "epoch": 0.353490811145806, + "grad_norm": 1.0502004623413086, + "learning_rate": 2.6284933347859534e-07, + "loss": 0.573, + "step": 7415 + }, + { + "epoch": 0.3535384835411055, + "grad_norm": 4.0295586585998535, + "learning_rate": 2.619554120640455e-07, + "loss": 1.4406, + "step": 7416 + }, + { + "epoch": 0.353586155936405, + "grad_norm": 2.506178617477417, + "learning_rate": 2.610629931294939e-07, + "loss": 0.6825, + "step": 7417 + }, + { + "epoch": 0.35363382833170454, + "grad_norm": 
1.5094168186187744, + "learning_rate": 2.6017207681263033e-07, + "loss": 0.689, + "step": 7418 + }, + { + "epoch": 0.35368150072700405, + "grad_norm": 1.7175939083099365, + "learning_rate": 2.5928266325091377e-07, + "loss": 1.0043, + "step": 7419 + }, + { + "epoch": 0.3537291731223035, + "grad_norm": 2.555274248123169, + "learning_rate": 2.583947525815733e-07, + "loss": 0.4071, + "step": 7420 + }, + { + "epoch": 0.353776845517603, + "grad_norm": 1.615445613861084, + "learning_rate": 2.575083449416038e-07, + "loss": 0.5633, + "step": 7421 + }, + { + "epoch": 0.35382451791290254, + "grad_norm": 3.040313482284546, + "learning_rate": 2.5662344046776697e-07, + "loss": 0.2669, + "step": 7422 + }, + { + "epoch": 0.35387219030820205, + "grad_norm": 2.82405424118042, + "learning_rate": 2.5574003929659697e-07, + "loss": 0.6578, + "step": 7423 + }, + { + "epoch": 0.35391986270350156, + "grad_norm": 1.4721612930297852, + "learning_rate": 2.548581415643936e-07, + "loss": 0.5962, + "step": 7424 + }, + { + "epoch": 0.353967535098801, + "grad_norm": 1.5721673965454102, + "learning_rate": 2.5397774740722134e-07, + "loss": 0.4572, + "step": 7425 + }, + { + "epoch": 0.35401520749410054, + "grad_norm": 1.8483844995498657, + "learning_rate": 2.5309885696091943e-07, + "loss": 0.7054, + "step": 7426 + }, + { + "epoch": 0.35406287988940005, + "grad_norm": 1.6586484909057617, + "learning_rate": 2.5222147036108925e-07, + "loss": 0.695, + "step": 7427 + }, + { + "epoch": 0.35411055228469956, + "grad_norm": 1.7140986919403076, + "learning_rate": 2.513455877431037e-07, + "loss": 0.7036, + "step": 7428 + }, + { + "epoch": 0.354158224679999, + "grad_norm": 1.3403490781784058, + "learning_rate": 2.5047120924210243e-07, + "loss": 0.8366, + "step": 7429 + }, + { + "epoch": 0.35420589707529854, + "grad_norm": 1.1276532411575317, + "learning_rate": 2.4959833499299314e-07, + "loss": 0.6857, + "step": 7430 + }, + { + "epoch": 0.35425356947059805, + "grad_norm": 2.1477253437042236, + "learning_rate": 2.4872696513045025e-07, + "loss": 0.8715, + "step": 7431 + }, + { + "epoch": 0.35430124186589756, + "grad_norm": 1.7946161031723022, + "learning_rate": 2.478570997889185e-07, + "loss": 0.5852, + "step": 7432 + }, + { + "epoch": 0.3543489142611971, + "grad_norm": 1.7223455905914307, + "learning_rate": 2.4698873910260824e-07, + "loss": 0.3981, + "step": 7433 + }, + { + "epoch": 0.35439658665649654, + "grad_norm": 1.918836236000061, + "learning_rate": 2.46121883205499e-07, + "loss": 0.7247, + "step": 7434 + }, + { + "epoch": 0.35444425905179605, + "grad_norm": 2.0484137535095215, + "learning_rate": 2.452565322313383e-07, + "loss": 0.6482, + "step": 7435 + }, + { + "epoch": 0.35449193144709557, + "grad_norm": 1.259306788444519, + "learning_rate": 2.4439268631363924e-07, + "loss": 0.5333, + "step": 7436 + }, + { + "epoch": 0.3545396038423951, + "grad_norm": 1.4508843421936035, + "learning_rate": 2.435303455856863e-07, + "loss": 1.1334, + "step": 7437 + }, + { + "epoch": 0.35458727623769454, + "grad_norm": 2.5092461109161377, + "learning_rate": 2.426695101805288e-07, + "loss": 0.9257, + "step": 7438 + }, + { + "epoch": 0.35463494863299405, + "grad_norm": 1.3373315334320068, + "learning_rate": 2.418101802309847e-07, + "loss": 0.7458, + "step": 7439 + }, + { + "epoch": 0.35468262102829357, + "grad_norm": 1.5031764507293701, + "learning_rate": 2.4095235586963916e-07, + "loss": 0.7301, + "step": 7440 + }, + { + "epoch": 0.3547302934235931, + "grad_norm": 3.019455671310425, + "learning_rate": 2.4009603722884745e-07, + "loss": 0.8141, + 
"step": 7441 + }, + { + "epoch": 0.3547779658188926, + "grad_norm": 1.7121657133102417, + "learning_rate": 2.392412244407294e-07, + "loss": 0.7722, + "step": 7442 + }, + { + "epoch": 0.35482563821419205, + "grad_norm": 1.6741019487380981, + "learning_rate": 2.3838791763717283e-07, + "loss": 0.7177, + "step": 7443 + }, + { + "epoch": 0.35487331060949157, + "grad_norm": 2.3587350845336914, + "learning_rate": 2.3753611694983693e-07, + "loss": 1.0375, + "step": 7444 + }, + { + "epoch": 0.3549209830047911, + "grad_norm": 2.358966827392578, + "learning_rate": 2.3668582251014316e-07, + "loss": 1.105, + "step": 7445 + }, + { + "epoch": 0.3549686554000906, + "grad_norm": 1.7531901597976685, + "learning_rate": 2.3583703444928442e-07, + "loss": 0.6243, + "step": 7446 + }, + { + "epoch": 0.35501632779539005, + "grad_norm": 1.5743778944015503, + "learning_rate": 2.3498975289822035e-07, + "loss": 0.6565, + "step": 7447 + }, + { + "epoch": 0.35506400019068957, + "grad_norm": 1.841613531112671, + "learning_rate": 2.341439779876775e-07, + "loss": 0.3976, + "step": 7448 + }, + { + "epoch": 0.3551116725859891, + "grad_norm": 1.8073867559432983, + "learning_rate": 2.3329970984814932e-07, + "loss": 1.0816, + "step": 7449 + }, + { + "epoch": 0.3551593449812886, + "grad_norm": 5.236608982086182, + "learning_rate": 2.324569486098982e-07, + "loss": 1.1874, + "step": 7450 + }, + { + "epoch": 0.3552070173765881, + "grad_norm": 2.4420433044433594, + "learning_rate": 2.3161569440295462e-07, + "loss": 1.1185, + "step": 7451 + }, + { + "epoch": 0.35525468977188757, + "grad_norm": 1.7360776662826538, + "learning_rate": 2.307759473571136e-07, + "loss": 0.7692, + "step": 7452 + }, + { + "epoch": 0.3553023621671871, + "grad_norm": 0.9339656233787537, + "learning_rate": 2.2993770760194044e-07, + "loss": 0.2735, + "step": 7453 + }, + { + "epoch": 0.3553500345624866, + "grad_norm": 1.9637995958328247, + "learning_rate": 2.2910097526676723e-07, + "loss": 0.8263, + "step": 7454 + }, + { + "epoch": 0.3553977069577861, + "grad_norm": 1.7706027030944824, + "learning_rate": 2.2826575048069287e-07, + "loss": 0.784, + "step": 7455 + }, + { + "epoch": 0.3554453793530856, + "grad_norm": 1.8194026947021484, + "learning_rate": 2.2743203337258323e-07, + "loss": 0.9633, + "step": 7456 + }, + { + "epoch": 0.3554930517483851, + "grad_norm": 1.6137317419052124, + "learning_rate": 2.2659982407107427e-07, + "loss": 0.9154, + "step": 7457 + }, + { + "epoch": 0.3555407241436846, + "grad_norm": 1.2820813655853271, + "learning_rate": 2.2576912270456442e-07, + "loss": 0.6613, + "step": 7458 + }, + { + "epoch": 0.3555883965389841, + "grad_norm": 1.43150794506073, + "learning_rate": 2.2493992940122334e-07, + "loss": 0.8796, + "step": 7459 + }, + { + "epoch": 0.3556360689342836, + "grad_norm": 1.364145040512085, + "learning_rate": 2.241122442889887e-07, + "loss": 0.9293, + "step": 7460 + }, + { + "epoch": 0.3556837413295831, + "grad_norm": 2.3687691688537598, + "learning_rate": 2.232860674955617e-07, + "loss": 0.6651, + "step": 7461 + }, + { + "epoch": 0.3557314137248826, + "grad_norm": 1.84059739112854, + "learning_rate": 2.224613991484148e-07, + "loss": 0.7915, + "step": 7462 + }, + { + "epoch": 0.3557790861201821, + "grad_norm": 1.2504510879516602, + "learning_rate": 2.2163823937478512e-07, + "loss": 1.0127, + "step": 7463 + }, + { + "epoch": 0.3558267585154816, + "grad_norm": 4.887526035308838, + "learning_rate": 2.2081658830167552e-07, + "loss": 0.745, + "step": 7464 + }, + { + "epoch": 0.35587443091078114, + "grad_norm": 2.2528183460235596, + 
"learning_rate": 2.1999644605586122e-07, + "loss": 0.8482, + "step": 7465 + }, + { + "epoch": 0.3559221033060806, + "grad_norm": 1.9308404922485352, + "learning_rate": 2.1917781276388217e-07, + "loss": 0.7044, + "step": 7466 + }, + { + "epoch": 0.3559697757013801, + "grad_norm": 3.6642274856567383, + "learning_rate": 2.1836068855204174e-07, + "loss": 1.0077, + "step": 7467 + }, + { + "epoch": 0.3560174480966796, + "grad_norm": 1.5573298931121826, + "learning_rate": 2.1754507354641686e-07, + "loss": 0.9672, + "step": 7468 + }, + { + "epoch": 0.35606512049197914, + "grad_norm": 1.3317711353302002, + "learning_rate": 2.1673096787284686e-07, + "loss": 0.8061, + "step": 7469 + }, + { + "epoch": 0.3561127928872786, + "grad_norm": 1.3915892839431763, + "learning_rate": 2.1591837165694018e-07, + "loss": 0.5843, + "step": 7470 + }, + { + "epoch": 0.3561604652825781, + "grad_norm": 2.037578582763672, + "learning_rate": 2.1510728502407206e-07, + "loss": 0.8099, + "step": 7471 + }, + { + "epoch": 0.3562081376778776, + "grad_norm": 1.5721982717514038, + "learning_rate": 2.1429770809938577e-07, + "loss": 0.8496, + "step": 7472 + }, + { + "epoch": 0.35625581007317714, + "grad_norm": 2.135315179824829, + "learning_rate": 2.1348964100778914e-07, + "loss": 0.7808, + "step": 7473 + }, + { + "epoch": 0.35630348246847665, + "grad_norm": 1.717445731163025, + "learning_rate": 2.1268308387395908e-07, + "loss": 0.7428, + "step": 7474 + }, + { + "epoch": 0.3563511548637761, + "grad_norm": 7.698898792266846, + "learning_rate": 2.1187803682234055e-07, + "loss": 1.355, + "step": 7475 + }, + { + "epoch": 0.3563988272590756, + "grad_norm": 1.9036109447479248, + "learning_rate": 2.110744999771419e-07, + "loss": 0.7164, + "step": 7476 + }, + { + "epoch": 0.35644649965437514, + "grad_norm": 1.195804238319397, + "learning_rate": 2.102724734623407e-07, + "loss": 0.7788, + "step": 7477 + }, + { + "epoch": 0.35649417204967465, + "grad_norm": 1.4893684387207031, + "learning_rate": 2.0947195740168347e-07, + "loss": 0.4109, + "step": 7478 + }, + { + "epoch": 0.35654184444497417, + "grad_norm": 1.3249890804290771, + "learning_rate": 2.086729519186803e-07, + "loss": 0.7046, + "step": 7479 + }, + { + "epoch": 0.3565895168402736, + "grad_norm": 1.275384783744812, + "learning_rate": 2.0787545713660817e-07, + "loss": 0.6835, + "step": 7480 + }, + { + "epoch": 0.35663718923557314, + "grad_norm": 1.1132960319519043, + "learning_rate": 2.0707947317851528e-07, + "loss": 0.5525, + "step": 7481 + }, + { + "epoch": 0.35668486163087265, + "grad_norm": 3.5884432792663574, + "learning_rate": 2.062850001672112e-07, + "loss": 0.7763, + "step": 7482 + }, + { + "epoch": 0.35673253402617217, + "grad_norm": 2.164539098739624, + "learning_rate": 2.0549203822527675e-07, + "loss": 0.716, + "step": 7483 + }, + { + "epoch": 0.3567802064214716, + "grad_norm": 1.525444507598877, + "learning_rate": 2.0470058747505516e-07, + "loss": 0.6758, + "step": 7484 + }, + { + "epoch": 0.35682787881677114, + "grad_norm": 1.6143829822540283, + "learning_rate": 2.0391064803866213e-07, + "loss": 0.6231, + "step": 7485 + }, + { + "epoch": 0.35687555121207065, + "grad_norm": 1.754177451133728, + "learning_rate": 2.0312222003797565e-07, + "loss": 0.3647, + "step": 7486 + }, + { + "epoch": 0.35692322360737017, + "grad_norm": 1.4544939994812012, + "learning_rate": 2.0233530359464183e-07, + "loss": 0.3695, + "step": 7487 + }, + { + "epoch": 0.3569708960026697, + "grad_norm": 1.8666043281555176, + "learning_rate": 2.0154989883007458e-07, + "loss": 0.4506, + "step": 7488 + }, + { + 
"epoch": 0.35701856839796914, + "grad_norm": 3.311408042907715, + "learning_rate": 2.007660058654537e-07, + "loss": 0.7648, + "step": 7489 + }, + { + "epoch": 0.35706624079326865, + "grad_norm": 2.4828367233276367, + "learning_rate": 1.9998362482172462e-07, + "loss": 0.5591, + "step": 7490 + }, + { + "epoch": 0.35711391318856817, + "grad_norm": 3.6499619483947754, + "learning_rate": 1.9920275581960303e-07, + "loss": 0.3009, + "step": 7491 + }, + { + "epoch": 0.3571615855838677, + "grad_norm": 2.7179312705993652, + "learning_rate": 1.9842339897956585e-07, + "loss": 0.6891, + "step": 7492 + }, + { + "epoch": 0.35720925797916714, + "grad_norm": 2.9030332565307617, + "learning_rate": 1.976455544218625e-07, + "loss": 0.8243, + "step": 7493 + }, + { + "epoch": 0.35725693037446665, + "grad_norm": 10.5089693069458, + "learning_rate": 1.9686922226650584e-07, + "loss": 0.7606, + "step": 7494 + }, + { + "epoch": 0.35730460276976617, + "grad_norm": 1.1255024671554565, + "learning_rate": 1.960944026332745e-07, + "loss": 0.6785, + "step": 7495 + }, + { + "epoch": 0.3573522751650657, + "grad_norm": 1.5652834177017212, + "learning_rate": 1.953210956417162e-07, + "loss": 0.776, + "step": 7496 + }, + { + "epoch": 0.3573999475603652, + "grad_norm": 1.469054937362671, + "learning_rate": 1.9454930141114546e-07, + "loss": 0.88, + "step": 7497 + }, + { + "epoch": 0.35744761995566465, + "grad_norm": 1.9115630388259888, + "learning_rate": 1.9377902006063932e-07, + "loss": 0.6954, + "step": 7498 + }, + { + "epoch": 0.35749529235096417, + "grad_norm": 1.575408697128296, + "learning_rate": 1.930102517090471e-07, + "loss": 0.7336, + "step": 7499 + }, + { + "epoch": 0.3575429647462637, + "grad_norm": 2.5314137935638428, + "learning_rate": 1.9224299647498058e-07, + "loss": 0.6655, + "step": 7500 + }, + { + "epoch": 0.3575906371415632, + "grad_norm": 1.3152841329574585, + "learning_rate": 1.9147725447681841e-07, + "loss": 0.2922, + "step": 7501 + }, + { + "epoch": 0.35763830953686265, + "grad_norm": 1.0879100561141968, + "learning_rate": 1.9071302583270724e-07, + "loss": 0.8709, + "step": 7502 + }, + { + "epoch": 0.35768598193216217, + "grad_norm": 5.678489685058594, + "learning_rate": 1.8995031066056157e-07, + "loss": 0.6401, + "step": 7503 + }, + { + "epoch": 0.3577336543274617, + "grad_norm": 2.0973715782165527, + "learning_rate": 1.8918910907805733e-07, + "loss": 0.6481, + "step": 7504 + }, + { + "epoch": 0.3577813267227612, + "grad_norm": 1.4970685243606567, + "learning_rate": 1.8842942120264272e-07, + "loss": 0.8114, + "step": 7505 + }, + { + "epoch": 0.3578289991180607, + "grad_norm": 1.6479661464691162, + "learning_rate": 1.8767124715152962e-07, + "loss": 0.7586, + "step": 7506 + }, + { + "epoch": 0.35787667151336017, + "grad_norm": 1.0549038648605347, + "learning_rate": 1.8691458704169442e-07, + "loss": 0.7447, + "step": 7507 + }, + { + "epoch": 0.3579243439086597, + "grad_norm": 2.5356662273406982, + "learning_rate": 1.861594409898826e-07, + "loss": 0.4792, + "step": 7508 + }, + { + "epoch": 0.3579720163039592, + "grad_norm": 2.0688745975494385, + "learning_rate": 1.8540580911260764e-07, + "loss": 1.3452, + "step": 7509 + }, + { + "epoch": 0.3580196886992587, + "grad_norm": 2.8586885929107666, + "learning_rate": 1.846536915261443e-07, + "loss": 0.87, + "step": 7510 + }, + { + "epoch": 0.3580673610945582, + "grad_norm": 1.8569843769073486, + "learning_rate": 1.839030883465387e-07, + "loss": 0.7651, + "step": 7511 + }, + { + "epoch": 0.3581150334898577, + "grad_norm": 1.5996110439300537, + "learning_rate": 
1.8315399968960036e-07, + "loss": 1.2527, + "step": 7512 + }, + { + "epoch": 0.3581627058851572, + "grad_norm": 1.7045109272003174, + "learning_rate": 1.824064256709046e-07, + "loss": 0.8132, + "step": 7513 + }, + { + "epoch": 0.3582103782804567, + "grad_norm": 1.5336089134216309, + "learning_rate": 1.8166036640579697e-07, + "loss": 0.8086, + "step": 7514 + }, + { + "epoch": 0.3582580506757562, + "grad_norm": 1.3751543760299683, + "learning_rate": 1.8091582200938652e-07, + "loss": 0.72, + "step": 7515 + }, + { + "epoch": 0.3583057230710557, + "grad_norm": 2.3323755264282227, + "learning_rate": 1.8017279259654574e-07, + "loss": 0.8241, + "step": 7516 + }, + { + "epoch": 0.3583533954663552, + "grad_norm": 2.1635584831237793, + "learning_rate": 1.7943127828191852e-07, + "loss": 0.2504, + "step": 7517 + }, + { + "epoch": 0.3584010678616547, + "grad_norm": 2.0600357055664062, + "learning_rate": 1.7869127917991446e-07, + "loss": 0.9565, + "step": 7518 + }, + { + "epoch": 0.3584487402569542, + "grad_norm": 2.327176094055176, + "learning_rate": 1.7795279540470446e-07, + "loss": 0.5631, + "step": 7519 + }, + { + "epoch": 0.35849641265225374, + "grad_norm": 1.288893699645996, + "learning_rate": 1.7721582707023065e-07, + "loss": 0.5399, + "step": 7520 + }, + { + "epoch": 0.3585440850475532, + "grad_norm": 1.3500419855117798, + "learning_rate": 1.7648037429019993e-07, + "loss": 0.7824, + "step": 7521 + }, + { + "epoch": 0.3585917574428527, + "grad_norm": 1.9134793281555176, + "learning_rate": 1.7574643717808483e-07, + "loss": 0.4218, + "step": 7522 + }, + { + "epoch": 0.3586394298381522, + "grad_norm": 1.9522933959960938, + "learning_rate": 1.7501401584712475e-07, + "loss": 0.7751, + "step": 7523 + }, + { + "epoch": 0.35868710223345174, + "grad_norm": 1.2565008401870728, + "learning_rate": 1.7428311041032264e-07, + "loss": 0.3415, + "step": 7524 + }, + { + "epoch": 0.3587347746287512, + "grad_norm": 2.849976062774658, + "learning_rate": 1.7355372098045274e-07, + "loss": 0.7996, + "step": 7525 + }, + { + "epoch": 0.3587824470240507, + "grad_norm": 2.3293449878692627, + "learning_rate": 1.7282584767005062e-07, + "loss": 0.8856, + "step": 7526 + }, + { + "epoch": 0.3588301194193502, + "grad_norm": 1.8338991403579712, + "learning_rate": 1.7209949059142084e-07, + "loss": 0.7683, + "step": 7527 + }, + { + "epoch": 0.35887779181464974, + "grad_norm": 1.9646183252334595, + "learning_rate": 1.7137464985663045e-07, + "loss": 0.6699, + "step": 7528 + }, + { + "epoch": 0.35892546420994925, + "grad_norm": 1.0771929025650024, + "learning_rate": 1.7065132557751662e-07, + "loss": 0.2353, + "step": 7529 + }, + { + "epoch": 0.3589731366052487, + "grad_norm": 1.8474793434143066, + "learning_rate": 1.6992951786568123e-07, + "loss": 0.8408, + "step": 7530 + }, + { + "epoch": 0.3590208090005482, + "grad_norm": 1.6009588241577148, + "learning_rate": 1.6920922683249076e-07, + "loss": 1.0103, + "step": 7531 + }, + { + "epoch": 0.35906848139584774, + "grad_norm": 3.1210105419158936, + "learning_rate": 1.6849045258907848e-07, + "loss": 0.7174, + "step": 7532 + }, + { + "epoch": 0.35911615379114725, + "grad_norm": 2.085829496383667, + "learning_rate": 1.677731952463446e-07, + "loss": 0.9586, + "step": 7533 + }, + { + "epoch": 0.3591638261864467, + "grad_norm": 1.3121978044509888, + "learning_rate": 1.6705745491495394e-07, + "loss": 0.6006, + "step": 7534 + }, + { + "epoch": 0.3592114985817462, + "grad_norm": 2.7522873878479004, + "learning_rate": 1.6634323170533928e-07, + "loss": 0.6786, + "step": 7535 + }, + { + "epoch": 
0.35925917097704574, + "grad_norm": 1.1993236541748047, + "learning_rate": 1.6563052572769578e-07, + "loss": 0.1711, + "step": 7536 + }, + { + "epoch": 0.35930684337234525, + "grad_norm": 1.1851755380630493, + "learning_rate": 1.649193370919888e-07, + "loss": 0.7118, + "step": 7537 + }, + { + "epoch": 0.35935451576764477, + "grad_norm": 1.2754336595535278, + "learning_rate": 1.6420966590794617e-07, + "loss": 0.5486, + "step": 7538 + }, + { + "epoch": 0.3594021881629442, + "grad_norm": 1.8204423189163208, + "learning_rate": 1.6350151228506251e-07, + "loss": 0.8391, + "step": 7539 + }, + { + "epoch": 0.35944986055824374, + "grad_norm": 1.747872233390808, + "learning_rate": 1.6279487633259926e-07, + "loss": 0.7039, + "step": 7540 + }, + { + "epoch": 0.35949753295354325, + "grad_norm": 1.7669453620910645, + "learning_rate": 1.620897581595826e-07, + "loss": 1.0443, + "step": 7541 + }, + { + "epoch": 0.35954520534884277, + "grad_norm": 3.060089111328125, + "learning_rate": 1.613861578748066e-07, + "loss": 1.1797, + "step": 7542 + }, + { + "epoch": 0.3595928777441423, + "grad_norm": 2.1141858100891113, + "learning_rate": 1.6068407558682775e-07, + "loss": 0.42, + "step": 7543 + }, + { + "epoch": 0.35964055013944174, + "grad_norm": 4.877460956573486, + "learning_rate": 1.599835114039705e-07, + "loss": 1.0632, + "step": 7544 + }, + { + "epoch": 0.35968822253474125, + "grad_norm": 2.6571197509765625, + "learning_rate": 1.5928446543432507e-07, + "loss": 0.5854, + "step": 7545 + }, + { + "epoch": 0.35973589493004077, + "grad_norm": 1.6897437572479248, + "learning_rate": 1.585869377857474e-07, + "loss": 1.1083, + "step": 7546 + }, + { + "epoch": 0.3597835673253403, + "grad_norm": 2.7866263389587402, + "learning_rate": 1.5789092856585697e-07, + "loss": 0.6052, + "step": 7547 + }, + { + "epoch": 0.35983123972063974, + "grad_norm": 1.2658756971359253, + "learning_rate": 1.571964378820434e-07, + "loss": 0.2581, + "step": 7548 + }, + { + "epoch": 0.35987891211593925, + "grad_norm": 3.687223196029663, + "learning_rate": 1.565034658414577e-07, + "loss": 1.6637, + "step": 7549 + }, + { + "epoch": 0.35992658451123877, + "grad_norm": 1.7222639322280884, + "learning_rate": 1.5581201255101874e-07, + "loss": 0.3713, + "step": 7550 + }, + { + "epoch": 0.3599742569065383, + "grad_norm": 1.7825167179107666, + "learning_rate": 1.551220781174101e-07, + "loss": 0.6362, + "step": 7551 + }, + { + "epoch": 0.3600219293018378, + "grad_norm": 0.963318943977356, + "learning_rate": 1.5443366264708326e-07, + "loss": 0.3692, + "step": 7552 + }, + { + "epoch": 0.36006960169713725, + "grad_norm": 1.2702791690826416, + "learning_rate": 1.5374676624625218e-07, + "loss": 0.4363, + "step": 7553 + }, + { + "epoch": 0.36011727409243677, + "grad_norm": 3.5927670001983643, + "learning_rate": 1.5306138902089763e-07, + "loss": 1.5814, + "step": 7554 + }, + { + "epoch": 0.3601649464877363, + "grad_norm": 1.0734039545059204, + "learning_rate": 1.5237753107676721e-07, + "loss": 0.3872, + "step": 7555 + }, + { + "epoch": 0.3602126188830358, + "grad_norm": 1.6010364294052124, + "learning_rate": 1.5169519251937325e-07, + "loss": 0.9508, + "step": 7556 + }, + { + "epoch": 0.36026029127833525, + "grad_norm": 2.6194820404052734, + "learning_rate": 1.5101437345399262e-07, + "loss": 0.6893, + "step": 7557 + }, + { + "epoch": 0.36030796367363477, + "grad_norm": 1.6924545764923096, + "learning_rate": 1.5033507398567017e-07, + "loss": 0.4745, + "step": 7558 + }, + { + "epoch": 0.3603556360689343, + "grad_norm": 1.4772309064865112, + "learning_rate": 
1.4965729421921425e-07, + "loss": 0.6517, + "step": 7559 + }, + { + "epoch": 0.3604033084642338, + "grad_norm": 2.222808837890625, + "learning_rate": 1.4898103425919687e-07, + "loss": 0.7971, + "step": 7560 + }, + { + "epoch": 0.3604509808595333, + "grad_norm": 1.4040281772613525, + "learning_rate": 1.4830629420996222e-07, + "loss": 0.6371, + "step": 7561 + }, + { + "epoch": 0.36049865325483277, + "grad_norm": 1.7208927869796753, + "learning_rate": 1.4763307417561157e-07, + "loss": 0.7422, + "step": 7562 + }, + { + "epoch": 0.3605463256501323, + "grad_norm": 4.075765609741211, + "learning_rate": 1.4696137426001844e-07, + "loss": 0.9115, + "step": 7563 + }, + { + "epoch": 0.3605939980454318, + "grad_norm": 1.319954752922058, + "learning_rate": 1.4629119456681884e-07, + "loss": 0.4583, + "step": 7564 + }, + { + "epoch": 0.3606416704407313, + "grad_norm": 1.3423572778701782, + "learning_rate": 1.456225351994156e-07, + "loss": 0.79, + "step": 7565 + }, + { + "epoch": 0.3606893428360308, + "grad_norm": 1.6340581178665161, + "learning_rate": 1.4495539626097289e-07, + "loss": 0.925, + "step": 7566 + }, + { + "epoch": 0.3607370152313303, + "grad_norm": 1.685856819152832, + "learning_rate": 1.44289777854425e-07, + "loss": 1.08, + "step": 7567 + }, + { + "epoch": 0.3607846876266298, + "grad_norm": 1.3200937509536743, + "learning_rate": 1.4362568008247202e-07, + "loss": 0.5431, + "step": 7568 + }, + { + "epoch": 0.3608323600219293, + "grad_norm": 1.620684266090393, + "learning_rate": 1.4296310304757423e-07, + "loss": 0.449, + "step": 7569 + }, + { + "epoch": 0.3608800324172288, + "grad_norm": 1.924071192741394, + "learning_rate": 1.4230204685196202e-07, + "loss": 0.6961, + "step": 7570 + }, + { + "epoch": 0.3609277048125283, + "grad_norm": 1.5358649492263794, + "learning_rate": 1.4164251159762944e-07, + "loss": 0.8894, + "step": 7571 + }, + { + "epoch": 0.3609753772078278, + "grad_norm": 5.417590141296387, + "learning_rate": 1.4098449738633614e-07, + "loss": 0.243, + "step": 7572 + }, + { + "epoch": 0.3610230496031273, + "grad_norm": 2.6760740280151367, + "learning_rate": 1.4032800431960647e-07, + "loss": 0.737, + "step": 7573 + }, + { + "epoch": 0.3610707219984268, + "grad_norm": 3.250913143157959, + "learning_rate": 1.3967303249873053e-07, + "loss": 0.5661, + "step": 7574 + }, + { + "epoch": 0.36111839439372634, + "grad_norm": 2.3834540843963623, + "learning_rate": 1.390195820247653e-07, + "loss": 0.9905, + "step": 7575 + }, + { + "epoch": 0.3611660667890258, + "grad_norm": 0.9095104932785034, + "learning_rate": 1.3836765299852894e-07, + "loss": 0.2751, + "step": 7576 + }, + { + "epoch": 0.3612137391843253, + "grad_norm": 1.722669243812561, + "learning_rate": 1.3771724552060885e-07, + "loss": 0.928, + "step": 7577 + }, + { + "epoch": 0.3612614115796248, + "grad_norm": 1.4960683584213257, + "learning_rate": 1.3706835969135467e-07, + "loss": 0.5281, + "step": 7578 + }, + { + "epoch": 0.36130908397492434, + "grad_norm": 7.444228649139404, + "learning_rate": 1.3642099561088528e-07, + "loss": 0.0836, + "step": 7579 + }, + { + "epoch": 0.3613567563702238, + "grad_norm": 0.9885214567184448, + "learning_rate": 1.3577515337908076e-07, + "loss": 0.435, + "step": 7580 + }, + { + "epoch": 0.3614044287655233, + "grad_norm": 1.0258896350860596, + "learning_rate": 1.3513083309558806e-07, + "loss": 0.4495, + "step": 7581 + }, + { + "epoch": 0.3614521011608228, + "grad_norm": 1.1365811824798584, + "learning_rate": 1.3448803485981986e-07, + "loss": 0.8391, + "step": 7582 + }, + { + "epoch": 0.36149977355612234, + 
"grad_norm": 1.7019453048706055, + "learning_rate": 1.3384675877095244e-07, + "loss": 0.8294, + "step": 7583 + }, + { + "epoch": 0.36154744595142185, + "grad_norm": 1.7103431224822998, + "learning_rate": 1.3320700492792771e-07, + "loss": 0.6849, + "step": 7584 + }, + { + "epoch": 0.3615951183467213, + "grad_norm": 1.6652214527130127, + "learning_rate": 1.3256877342945452e-07, + "loss": 0.6364, + "step": 7585 + }, + { + "epoch": 0.3616427907420208, + "grad_norm": 1.4972835779190063, + "learning_rate": 1.319320643740052e-07, + "loss": 0.8808, + "step": 7586 + }, + { + "epoch": 0.36169046313732034, + "grad_norm": 1.9461411237716675, + "learning_rate": 1.312968778598167e-07, + "loss": 1.2322, + "step": 7587 + }, + { + "epoch": 0.36173813553261985, + "grad_norm": 1.602824330329895, + "learning_rate": 1.3066321398489178e-07, + "loss": 0.7117, + "step": 7588 + }, + { + "epoch": 0.3617858079279193, + "grad_norm": 1.647096037864685, + "learning_rate": 1.3003107284699777e-07, + "loss": 0.6963, + "step": 7589 + }, + { + "epoch": 0.3618334803232188, + "grad_norm": 2.3326754570007324, + "learning_rate": 1.294004545436689e-07, + "loss": 0.7142, + "step": 7590 + }, + { + "epoch": 0.36188115271851834, + "grad_norm": 0.892280638217926, + "learning_rate": 1.2877135917220173e-07, + "loss": 0.492, + "step": 7591 + }, + { + "epoch": 0.36192882511381785, + "grad_norm": 3.4787204265594482, + "learning_rate": 1.281437868296609e-07, + "loss": 1.3923, + "step": 7592 + }, + { + "epoch": 0.36197649750911737, + "grad_norm": 3.67329478263855, + "learning_rate": 1.2751773761287333e-07, + "loss": 1.7699, + "step": 7593 + }, + { + "epoch": 0.3620241699044168, + "grad_norm": 1.018886923789978, + "learning_rate": 1.2689321161843071e-07, + "loss": 0.6659, + "step": 7594 + }, + { + "epoch": 0.36207184229971634, + "grad_norm": 1.7219388484954834, + "learning_rate": 1.262702089426926e-07, + "loss": 0.6259, + "step": 7595 + }, + { + "epoch": 0.36211951469501585, + "grad_norm": 0.9731652736663818, + "learning_rate": 1.256487296817821e-07, + "loss": 0.6641, + "step": 7596 + }, + { + "epoch": 0.36216718709031537, + "grad_norm": 2.942791700363159, + "learning_rate": 1.2502877393158587e-07, + "loss": 0.738, + "step": 7597 + }, + { + "epoch": 0.3622148594856149, + "grad_norm": 1.5527029037475586, + "learning_rate": 1.2441034178775735e-07, + "loss": 1.2505, + "step": 7598 + }, + { + "epoch": 0.36226253188091434, + "grad_norm": 1.733080267906189, + "learning_rate": 1.237934333457147e-07, + "loss": 0.8964, + "step": 7599 + }, + { + "epoch": 0.36231020427621385, + "grad_norm": 1.1859900951385498, + "learning_rate": 1.2317804870063954e-07, + "loss": 0.6388, + "step": 7600 + }, + { + "epoch": 0.36235787667151337, + "grad_norm": 1.1545878648757935, + "learning_rate": 1.2256418794747925e-07, + "loss": 0.5054, + "step": 7601 + }, + { + "epoch": 0.3624055490668129, + "grad_norm": 2.0713119506835938, + "learning_rate": 1.219518511809481e-07, + "loss": 0.7267, + "step": 7602 + }, + { + "epoch": 0.36245322146211234, + "grad_norm": 2.233213424682617, + "learning_rate": 1.213410384955227e-07, + "loss": 0.7532, + "step": 7603 + }, + { + "epoch": 0.36250089385741185, + "grad_norm": 1.4822250604629517, + "learning_rate": 1.2073174998544323e-07, + "loss": 0.8033, + "step": 7604 + }, + { + "epoch": 0.36254856625271137, + "grad_norm": 1.719848394393921, + "learning_rate": 1.2012398574471785e-07, + "loss": 0.6197, + "step": 7605 + }, + { + "epoch": 0.3625962386480109, + "grad_norm": 1.4159936904907227, + "learning_rate": 1.1951774586711927e-07, + "loss": 
0.5608, + "step": 7606 + }, + { + "epoch": 0.3626439110433104, + "grad_norm": 2.1619670391082764, + "learning_rate": 1.1891303044618275e-07, + "loss": 0.9584, + "step": 7607 + }, + { + "epoch": 0.36269158343860985, + "grad_norm": 3.1314663887023926, + "learning_rate": 1.1830983957521024e-07, + "loss": 1.42, + "step": 7608 + }, + { + "epoch": 0.36273925583390937, + "grad_norm": 1.7015684843063354, + "learning_rate": 1.1770817334726736e-07, + "loss": 0.8257, + "step": 7609 + }, + { + "epoch": 0.3627869282292089, + "grad_norm": 2.1228387355804443, + "learning_rate": 1.1710803185518537e-07, + "loss": 0.3531, + "step": 7610 + }, + { + "epoch": 0.3628346006245084, + "grad_norm": 2.5854666233062744, + "learning_rate": 1.1650941519156023e-07, + "loss": 0.7416, + "step": 7611 + }, + { + "epoch": 0.36288227301980785, + "grad_norm": 4.184905529022217, + "learning_rate": 1.1591232344875248e-07, + "loss": 0.601, + "step": 7612 + }, + { + "epoch": 0.36292994541510737, + "grad_norm": 1.911863923072815, + "learning_rate": 1.1531675671888621e-07, + "loss": 0.8122, + "step": 7613 + }, + { + "epoch": 0.3629776178104069, + "grad_norm": 6.793514728546143, + "learning_rate": 1.1472271509385235e-07, + "loss": 0.8045, + "step": 7614 + }, + { + "epoch": 0.3630252902057064, + "grad_norm": 3.0839173793792725, + "learning_rate": 1.1413019866530429e-07, + "loss": 0.5305, + "step": 7615 + }, + { + "epoch": 0.3630729626010059, + "grad_norm": 2.522343635559082, + "learning_rate": 1.1353920752466219e-07, + "loss": 1.1702, + "step": 7616 + }, + { + "epoch": 0.36312063499630537, + "grad_norm": 2.205889940261841, + "learning_rate": 1.129497417631098e-07, + "loss": 0.8711, + "step": 7617 + }, + { + "epoch": 0.3631683073916049, + "grad_norm": 7.297919273376465, + "learning_rate": 1.1236180147159437e-07, + "loss": 0.1326, + "step": 7618 + }, + { + "epoch": 0.3632159797869044, + "grad_norm": 1.5162936449050903, + "learning_rate": 1.117753867408311e-07, + "loss": 0.6353, + "step": 7619 + }, + { + "epoch": 0.3632636521822039, + "grad_norm": 1.6964155435562134, + "learning_rate": 1.1119049766129652e-07, + "loss": 0.9892, + "step": 7620 + }, + { + "epoch": 0.36331132457750337, + "grad_norm": 2.7673709392547607, + "learning_rate": 1.1060713432323288e-07, + "loss": 0.6602, + "step": 7621 + }, + { + "epoch": 0.3633589969728029, + "grad_norm": 2.292043685913086, + "learning_rate": 1.1002529681664598e-07, + "loss": 1.2501, + "step": 7622 + }, + { + "epoch": 0.3634066693681024, + "grad_norm": 1.9295915365219116, + "learning_rate": 1.0944498523131064e-07, + "loss": 0.7653, + "step": 7623 + }, + { + "epoch": 0.3634543417634019, + "grad_norm": 8.562410354614258, + "learning_rate": 1.0886619965676082e-07, + "loss": 0.5469, + "step": 7624 + }, + { + "epoch": 0.3635020141587014, + "grad_norm": 2.2834036350250244, + "learning_rate": 1.0828894018229619e-07, + "loss": 0.5907, + "step": 7625 + }, + { + "epoch": 0.3635496865540009, + "grad_norm": 1.7092336416244507, + "learning_rate": 1.0771320689698439e-07, + "loss": 0.8352, + "step": 7626 + }, + { + "epoch": 0.3635973589493004, + "grad_norm": 1.4992412328720093, + "learning_rate": 1.0713899988965326e-07, + "loss": 0.7809, + "step": 7627 + }, + { + "epoch": 0.3636450313445999, + "grad_norm": 1.4399573802947998, + "learning_rate": 1.0656631924889749e-07, + "loss": 0.8174, + "step": 7628 + }, + { + "epoch": 0.3636927037398994, + "grad_norm": 1.6076726913452148, + "learning_rate": 1.059951650630775e-07, + "loss": 0.5906, + "step": 7629 + }, + { + "epoch": 0.36374037613519894, + "grad_norm": 
2.066838502883911, + "learning_rate": 1.0542553742031392e-07, + "loss": 0.6151, + "step": 7630 + }, + { + "epoch": 0.3637880485304984, + "grad_norm": 1.2221015691757202, + "learning_rate": 1.0485743640849533e-07, + "loss": 0.7351, + "step": 7631 + }, + { + "epoch": 0.3638357209257979, + "grad_norm": 1.4276305437088013, + "learning_rate": 1.0429086211527385e-07, + "loss": 0.6746, + "step": 7632 + }, + { + "epoch": 0.3638833933210974, + "grad_norm": 2.0552382469177246, + "learning_rate": 1.037258146280673e-07, + "loss": 1.1447, + "step": 7633 + }, + { + "epoch": 0.36393106571639694, + "grad_norm": 5.339925765991211, + "learning_rate": 1.0316229403405487e-07, + "loss": 0.8708, + "step": 7634 + }, + { + "epoch": 0.3639787381116964, + "grad_norm": 1.6786081790924072, + "learning_rate": 1.0260030042018365e-07, + "loss": 0.6252, + "step": 7635 + }, + { + "epoch": 0.3640264105069959, + "grad_norm": 1.6302331686019897, + "learning_rate": 1.0203983387316097e-07, + "loss": 1.0601, + "step": 7636 + }, + { + "epoch": 0.3640740829022954, + "grad_norm": 1.5396068096160889, + "learning_rate": 1.0148089447946319e-07, + "loss": 0.6368, + "step": 7637 + }, + { + "epoch": 0.36412175529759494, + "grad_norm": 1.9746489524841309, + "learning_rate": 1.0092348232532911e-07, + "loss": 0.5671, + "step": 7638 + }, + { + "epoch": 0.36416942769289445, + "grad_norm": 1.757285475730896, + "learning_rate": 1.0036759749676106e-07, + "loss": 0.57, + "step": 7639 + }, + { + "epoch": 0.3642171000881939, + "grad_norm": 5.034051895141602, + "learning_rate": 9.981324007952486e-08, + "loss": 0.388, + "step": 7640 + }, + { + "epoch": 0.3642647724834934, + "grad_norm": 1.8331844806671143, + "learning_rate": 9.926041015915434e-08, + "loss": 0.8112, + "step": 7641 + }, + { + "epoch": 0.36431244487879294, + "grad_norm": 1.561575174331665, + "learning_rate": 9.870910782094456e-08, + "loss": 0.6361, + "step": 7642 + }, + { + "epoch": 0.36436011727409245, + "grad_norm": 4.549339771270752, + "learning_rate": 9.81593331499564e-08, + "loss": 0.858, + "step": 7643 + }, + { + "epoch": 0.3644077896693919, + "grad_norm": 1.0300893783569336, + "learning_rate": 9.761108623101312e-08, + "loss": 0.5238, + "step": 7644 + }, + { + "epoch": 0.3644554620646914, + "grad_norm": 2.3920083045959473, + "learning_rate": 9.706436714870482e-08, + "loss": 0.1993, + "step": 7645 + }, + { + "epoch": 0.36450313445999094, + "grad_norm": 1.635974407196045, + "learning_rate": 9.651917598738402e-08, + "loss": 0.6545, + "step": 7646 + }, + { + "epoch": 0.36455080685529045, + "grad_norm": 1.5133309364318848, + "learning_rate": 9.597551283116901e-08, + "loss": 1.3502, + "step": 7647 + }, + { + "epoch": 0.36459847925058997, + "grad_norm": 1.1626709699630737, + "learning_rate": 9.543337776393936e-08, + "loss": 0.735, + "step": 7648 + }, + { + "epoch": 0.3646461516458894, + "grad_norm": 1.8082032203674316, + "learning_rate": 9.489277086934257e-08, + "loss": 0.7641, + "step": 7649 + }, + { + "epoch": 0.36469382404118894, + "grad_norm": 1.3511099815368652, + "learning_rate": 9.435369223078861e-08, + "loss": 0.6228, + "step": 7650 + }, + { + "epoch": 0.36474149643648845, + "grad_norm": 1.3195937871932983, + "learning_rate": 9.381614193145206e-08, + "loss": 0.7517, + "step": 7651 + }, + { + "epoch": 0.36478916883178797, + "grad_norm": 2.796147108078003, + "learning_rate": 9.32801200542699e-08, + "loss": 0.3394, + "step": 7652 + }, + { + "epoch": 0.3648368412270875, + "grad_norm": 0.8492860794067383, + "learning_rate": 9.274562668194598e-08, + "loss": 0.1851, + "step": 7653 + }, 
+ { + "epoch": 0.36488451362238694, + "grad_norm": 1.235487699508667, + "learning_rate": 9.221266189694767e-08, + "loss": 0.6891, + "step": 7654 + }, + { + "epoch": 0.36493218601768646, + "grad_norm": 1.2746144533157349, + "learning_rate": 9.168122578150363e-08, + "loss": 0.7241, + "step": 7655 + }, + { + "epoch": 0.36497985841298597, + "grad_norm": 1.5769391059875488, + "learning_rate": 9.11513184176116e-08, + "loss": 0.9307, + "step": 7656 + }, + { + "epoch": 0.3650275308082855, + "grad_norm": 1.8162281513214111, + "learning_rate": 9.062293988702953e-08, + "loss": 0.6523, + "step": 7657 + }, + { + "epoch": 0.36507520320358494, + "grad_norm": 3.195460319519043, + "learning_rate": 9.009609027128108e-08, + "loss": 1.0291, + "step": 7658 + }, + { + "epoch": 0.36512287559888446, + "grad_norm": 2.0703372955322266, + "learning_rate": 8.957076965165234e-08, + "loss": 1.1073, + "step": 7659 + }, + { + "epoch": 0.36517054799418397, + "grad_norm": 2.353217124938965, + "learning_rate": 8.904697810919848e-08, + "loss": 0.5342, + "step": 7660 + }, + { + "epoch": 0.3652182203894835, + "grad_norm": 1.5540796518325806, + "learning_rate": 8.852471572473153e-08, + "loss": 0.6694, + "step": 7661 + }, + { + "epoch": 0.365265892784783, + "grad_norm": 1.4673951864242554, + "learning_rate": 8.800398257883146e-08, + "loss": 0.7309, + "step": 7662 + }, + { + "epoch": 0.36531356518008246, + "grad_norm": 1.7575181722640991, + "learning_rate": 8.748477875184514e-08, + "loss": 0.7067, + "step": 7663 + }, + { + "epoch": 0.36536123757538197, + "grad_norm": 1.5961129665374756, + "learning_rate": 8.696710432387733e-08, + "loss": 0.4317, + "step": 7664 + }, + { + "epoch": 0.3654089099706815, + "grad_norm": 1.01649010181427, + "learning_rate": 8.645095937480086e-08, + "loss": 0.3314, + "step": 7665 + }, + { + "epoch": 0.365456582365981, + "grad_norm": 1.9757106304168701, + "learning_rate": 8.593634398425199e-08, + "loss": 0.7827, + "step": 7666 + }, + { + "epoch": 0.36550425476128046, + "grad_norm": 1.3617305755615234, + "learning_rate": 8.542325823162945e-08, + "loss": 0.7212, + "step": 7667 + }, + { + "epoch": 0.36555192715657997, + "grad_norm": 2.044442892074585, + "learning_rate": 8.491170219609767e-08, + "loss": 0.356, + "step": 7668 + }, + { + "epoch": 0.3655995995518795, + "grad_norm": 1.8485454320907593, + "learning_rate": 8.440167595658577e-08, + "loss": 0.8628, + "step": 7669 + }, + { + "epoch": 0.365647271947179, + "grad_norm": 1.822192907333374, + "learning_rate": 8.3893179591783e-08, + "loss": 0.7577, + "step": 7670 + }, + { + "epoch": 0.3656949443424785, + "grad_norm": 1.7000517845153809, + "learning_rate": 8.338621318014662e-08, + "loss": 0.7554, + "step": 7671 + }, + { + "epoch": 0.36574261673777797, + "grad_norm": 1.1006102561950684, + "learning_rate": 8.288077679989737e-08, + "loss": 0.6658, + "step": 7672 + }, + { + "epoch": 0.3657902891330775, + "grad_norm": 1.5818777084350586, + "learning_rate": 8.237687052901622e-08, + "loss": 0.8453, + "step": 7673 + }, + { + "epoch": 0.365837961528377, + "grad_norm": 3.3108766078948975, + "learning_rate": 8.187449444525319e-08, + "loss": 0.6104, + "step": 7674 + }, + { + "epoch": 0.3658856339236765, + "grad_norm": 1.7844374179840088, + "learning_rate": 8.137364862611851e-08, + "loss": 0.4677, + "step": 7675 + }, + { + "epoch": 0.36593330631897597, + "grad_norm": 1.8921712636947632, + "learning_rate": 8.087433314888815e-08, + "loss": 0.9077, + "step": 7676 + }, + { + "epoch": 0.3659809787142755, + "grad_norm": 1.867358922958374, + "learning_rate": 
8.037654809059937e-08, + "loss": 0.9268, + "step": 7677 + }, + { + "epoch": 0.366028651109575, + "grad_norm": 1.1022671461105347, + "learning_rate": 7.988029352805849e-08, + "loss": 0.6435, + "step": 7678 + }, + { + "epoch": 0.3660763235048745, + "grad_norm": 1.313672661781311, + "learning_rate": 7.938556953783095e-08, + "loss": 0.6473, + "step": 7679 + }, + { + "epoch": 0.366123995900174, + "grad_norm": 2.8724846839904785, + "learning_rate": 7.889237619624679e-08, + "loss": 0.9885, + "step": 7680 + }, + { + "epoch": 0.3661716682954735, + "grad_norm": 2.8557121753692627, + "learning_rate": 7.840071357940072e-08, + "loss": 0.749, + "step": 7681 + }, + { + "epoch": 0.366219340690773, + "grad_norm": 1.3713778257369995, + "learning_rate": 7.791058176315313e-08, + "loss": 0.5679, + "step": 7682 + }, + { + "epoch": 0.3662670130860725, + "grad_norm": 1.297159194946289, + "learning_rate": 7.742198082312357e-08, + "loss": 0.4253, + "step": 7683 + }, + { + "epoch": 0.366314685481372, + "grad_norm": 3.1610944271087646, + "learning_rate": 7.693491083470062e-08, + "loss": 1.438, + "step": 7684 + }, + { + "epoch": 0.36636235787667154, + "grad_norm": 2.023857831954956, + "learning_rate": 7.644937187303303e-08, + "loss": 0.4606, + "step": 7685 + }, + { + "epoch": 0.366410030271971, + "grad_norm": 3.6449878215789795, + "learning_rate": 7.596536401303422e-08, + "loss": 0.6084, + "step": 7686 + }, + { + "epoch": 0.3664577026672705, + "grad_norm": 2.608428716659546, + "learning_rate": 7.548288732938225e-08, + "loss": 0.5937, + "step": 7687 + }, + { + "epoch": 0.36650537506257, + "grad_norm": 1.8767002820968628, + "learning_rate": 7.500194189651866e-08, + "loss": 0.62, + "step": 7688 + }, + { + "epoch": 0.36655304745786954, + "grad_norm": 1.3451383113861084, + "learning_rate": 7.452252778864632e-08, + "loss": 0.7668, + "step": 7689 + }, + { + "epoch": 0.366600719853169, + "grad_norm": 2.4728777408599854, + "learning_rate": 7.404464507973608e-08, + "loss": 0.3595, + "step": 7690 + }, + { + "epoch": 0.3666483922484685, + "grad_norm": 1.694606900215149, + "learning_rate": 7.356829384351893e-08, + "loss": 0.3838, + "step": 7691 + }, + { + "epoch": 0.366696064643768, + "grad_norm": 1.0287492275238037, + "learning_rate": 7.309347415349278e-08, + "loss": 0.2514, + "step": 7692 + }, + { + "epoch": 0.36674373703906754, + "grad_norm": 1.0105295181274414, + "learning_rate": 7.262018608291566e-08, + "loss": 0.5265, + "step": 7693 + }, + { + "epoch": 0.36679140943436706, + "grad_norm": 1.980093002319336, + "learning_rate": 7.214842970481139e-08, + "loss": 1.0236, + "step": 7694 + }, + { + "epoch": 0.3668390818296665, + "grad_norm": 2.581512451171875, + "learning_rate": 7.167820509196732e-08, + "loss": 0.5972, + "step": 7695 + }, + { + "epoch": 0.366886754224966, + "grad_norm": 1.16861093044281, + "learning_rate": 7.12095123169343e-08, + "loss": 0.5906, + "step": 7696 + }, + { + "epoch": 0.36693442662026554, + "grad_norm": 3.0507681369781494, + "learning_rate": 7.074235145202668e-08, + "loss": 0.5603, + "step": 7697 + }, + { + "epoch": 0.36698209901556506, + "grad_norm": 1.3943735361099243, + "learning_rate": 7.027672256932238e-08, + "loss": 0.7102, + "step": 7698 + }, + { + "epoch": 0.3670297714108645, + "grad_norm": 2.4650938510894775, + "learning_rate": 6.981262574066395e-08, + "loss": 1.1235, + "step": 7699 + }, + { + "epoch": 0.367077443806164, + "grad_norm": 2.6008851528167725, + "learning_rate": 6.93500610376563e-08, + "loss": 1.0542, + "step": 7700 + }, + { + "epoch": 0.36712511620146354, + "grad_norm": 
1.6996532678604126, + "learning_rate": 6.88890285316679e-08, + "loss": 0.7084, + "step": 7701 + }, + { + "epoch": 0.36717278859676306, + "grad_norm": 1.408275842666626, + "learning_rate": 6.842952829383187e-08, + "loss": 0.7608, + "step": 7702 + }, + { + "epoch": 0.36722046099206257, + "grad_norm": 1.5945616960525513, + "learning_rate": 6.797156039504482e-08, + "loss": 0.9326, + "step": 7703 + }, + { + "epoch": 0.36726813338736203, + "grad_norm": 1.2416167259216309, + "learning_rate": 6.751512490596467e-08, + "loss": 0.82, + "step": 7704 + }, + { + "epoch": 0.36731580578266154, + "grad_norm": 3.048807382583618, + "learning_rate": 6.706022189701622e-08, + "loss": 1.0169, + "step": 7705 + }, + { + "epoch": 0.36736347817796106, + "grad_norm": 1.791715383529663, + "learning_rate": 6.660685143838664e-08, + "loss": 1.1747, + "step": 7706 + }, + { + "epoch": 0.36741115057326057, + "grad_norm": 2.7997539043426514, + "learning_rate": 6.615501360002552e-08, + "loss": 0.8987, + "step": 7707 + }, + { + "epoch": 0.36745882296856003, + "grad_norm": 1.8299788236618042, + "learning_rate": 6.570470845164712e-08, + "loss": 1.0327, + "step": 7708 + }, + { + "epoch": 0.36750649536385954, + "grad_norm": 2.873364210128784, + "learning_rate": 6.525593606272917e-08, + "loss": 0.2513, + "step": 7709 + }, + { + "epoch": 0.36755416775915906, + "grad_norm": 2.3427250385284424, + "learning_rate": 6.480869650251187e-08, + "loss": 0.837, + "step": 7710 + }, + { + "epoch": 0.36760184015445857, + "grad_norm": 2.1455252170562744, + "learning_rate": 6.436298983999889e-08, + "loss": 1.3452, + "step": 7711 + }, + { + "epoch": 0.3676495125497581, + "grad_norm": 1.2994019985198975, + "learning_rate": 6.391881614396078e-08, + "loss": 0.2553, + "step": 7712 + }, + { + "epoch": 0.36769718494505754, + "grad_norm": 1.720689058303833, + "learning_rate": 6.347617548292717e-08, + "loss": 1.2418, + "step": 7713 + }, + { + "epoch": 0.36774485734035706, + "grad_norm": 1.196730375289917, + "learning_rate": 6.303506792519232e-08, + "loss": 0.5118, + "step": 7714 + }, + { + "epoch": 0.36779252973565657, + "grad_norm": 13.853625297546387, + "learning_rate": 6.259549353881623e-08, + "loss": 0.902, + "step": 7715 + }, + { + "epoch": 0.3678402021309561, + "grad_norm": 2.442233085632324, + "learning_rate": 6.215745239162018e-08, + "loss": 0.9641, + "step": 7716 + }, + { + "epoch": 0.3678878745262556, + "grad_norm": 1.2612642049789429, + "learning_rate": 6.172094455118904e-08, + "loss": 0.782, + "step": 7717 + }, + { + "epoch": 0.36793554692155506, + "grad_norm": 1.414482831954956, + "learning_rate": 6.128597008487225e-08, + "loss": 0.7849, + "step": 7718 + }, + { + "epoch": 0.36798321931685457, + "grad_norm": 3.647113561630249, + "learning_rate": 6.085252905978056e-08, + "loss": 0.6292, + "step": 7719 + }, + { + "epoch": 0.3680308917121541, + "grad_norm": 2.5834009647369385, + "learning_rate": 6.042062154279049e-08, + "loss": 1.1189, + "step": 7720 + }, + { + "epoch": 0.3680785641074536, + "grad_norm": 1.6807844638824463, + "learning_rate": 5.999024760054095e-08, + "loss": 0.6304, + "step": 7721 + }, + { + "epoch": 0.36812623650275306, + "grad_norm": 1.5649101734161377, + "learning_rate": 5.9561407299433274e-08, + "loss": 0.8199, + "step": 7722 + }, + { + "epoch": 0.36817390889805257, + "grad_norm": 1.6391302347183228, + "learning_rate": 5.9134100705634525e-08, + "loss": 0.8994, + "step": 7723 + }, + { + "epoch": 0.3682215812933521, + "grad_norm": 1.3756425380706787, + "learning_rate": 5.8708327885071966e-08, + "loss": 0.6368, + "step": 7724 + }, 
+ { + "epoch": 0.3682692536886516, + "grad_norm": 1.3108423948287964, + "learning_rate": 5.8284088903439726e-08, + "loss": 0.8382, + "step": 7725 + }, + { + "epoch": 0.3683169260839511, + "grad_norm": 1.292011022567749, + "learning_rate": 5.786138382619322e-08, + "loss": 0.7141, + "step": 7726 + }, + { + "epoch": 0.36836459847925057, + "grad_norm": 2.530026435852051, + "learning_rate": 5.744021271854916e-08, + "loss": 0.5991, + "step": 7727 + }, + { + "epoch": 0.3684122708745501, + "grad_norm": 1.1009055376052856, + "learning_rate": 5.702057564549335e-08, + "loss": 0.481, + "step": 7728 + }, + { + "epoch": 0.3684599432698496, + "grad_norm": 1.625542163848877, + "learning_rate": 5.660247267176844e-08, + "loss": 0.9559, + "step": 7729 + }, + { + "epoch": 0.3685076156651491, + "grad_norm": 1.6544519662857056, + "learning_rate": 5.618590386188616e-08, + "loss": 0.7864, + "step": 7730 + }, + { + "epoch": 0.36855528806044857, + "grad_norm": 1.352328896522522, + "learning_rate": 5.577086928011732e-08, + "loss": 0.7382, + "step": 7731 + }, + { + "epoch": 0.3686029604557481, + "grad_norm": 1.5142983198165894, + "learning_rate": 5.535736899049626e-08, + "loss": 0.6611, + "step": 7732 + }, + { + "epoch": 0.3686506328510476, + "grad_norm": 1.2521287202835083, + "learning_rate": 5.4945403056824164e-08, + "loss": 0.8079, + "step": 7733 + }, + { + "epoch": 0.3686983052463471, + "grad_norm": 3.0747945308685303, + "learning_rate": 5.453497154266241e-08, + "loss": 0.6277, + "step": 7734 + }, + { + "epoch": 0.3687459776416466, + "grad_norm": 2.9783384799957275, + "learning_rate": 5.412607451133478e-08, + "loss": 0.5548, + "step": 7735 + }, + { + "epoch": 0.3687936500369461, + "grad_norm": 5.556096076965332, + "learning_rate": 5.371871202593193e-08, + "loss": 0.3213, + "step": 7736 + }, + { + "epoch": 0.3688413224322456, + "grad_norm": 2.926870584487915, + "learning_rate": 5.33128841493058e-08, + "loss": 0.5459, + "step": 7737 + }, + { + "epoch": 0.3688889948275451, + "grad_norm": 1.4682596921920776, + "learning_rate": 5.290859094406964e-08, + "loss": 0.7457, + "step": 7738 + }, + { + "epoch": 0.3689366672228446, + "grad_norm": 1.818982720375061, + "learning_rate": 5.250583247260355e-08, + "loss": 0.9037, + "step": 7739 + }, + { + "epoch": 0.3689843396181441, + "grad_norm": 1.2708405256271362, + "learning_rate": 5.2104608797047816e-08, + "loss": 0.5607, + "step": 7740 + }, + { + "epoch": 0.3690320120134436, + "grad_norm": 1.9208078384399414, + "learning_rate": 5.170491997930627e-08, + "loss": 0.5822, + "step": 7741 + }, + { + "epoch": 0.3690796844087431, + "grad_norm": 2.572831392288208, + "learning_rate": 5.1306766081048456e-08, + "loss": 0.721, + "step": 7742 + }, + { + "epoch": 0.36912735680404263, + "grad_norm": 1.138107419013977, + "learning_rate": 5.091014716370524e-08, + "loss": 0.519, + "step": 7743 + }, + { + "epoch": 0.36917502919934214, + "grad_norm": 4.329532146453857, + "learning_rate": 5.0515063288471e-08, + "loss": 1.4959, + "step": 7744 + }, + { + "epoch": 0.3692227015946416, + "grad_norm": 1.845920205116272, + "learning_rate": 5.012151451630143e-08, + "loss": 0.9106, + "step": 7745 + }, + { + "epoch": 0.3692703739899411, + "grad_norm": 2.029707908630371, + "learning_rate": 4.972950090791906e-08, + "loss": 0.945, + "step": 7746 + }, + { + "epoch": 0.36931804638524063, + "grad_norm": 2.0834524631500244, + "learning_rate": 4.933902252380662e-08, + "loss": 0.392, + "step": 7747 + }, + { + "epoch": 0.36936571878054014, + "grad_norm": 1.1270073652267456, + "learning_rate": 4.8950079424211484e-08, + 
"loss": 0.4158, + "step": 7748 + }, + { + "epoch": 0.36941339117583966, + "grad_norm": 4.812641143798828, + "learning_rate": 4.8562671669142304e-08, + "loss": 0.4369, + "step": 7749 + }, + { + "epoch": 0.3694610635711391, + "grad_norm": 1.43878972530365, + "learning_rate": 4.8176799318373494e-08, + "loss": 0.6074, + "step": 7750 + }, + { + "epoch": 0.36950873596643863, + "grad_norm": 1.7358713150024414, + "learning_rate": 4.7792462431439643e-08, + "loss": 0.5948, + "step": 7751 + }, + { + "epoch": 0.36955640836173814, + "grad_norm": 1.4982954263687134, + "learning_rate": 4.740966106764222e-08, + "loss": 0.6141, + "step": 7752 + }, + { + "epoch": 0.36960408075703766, + "grad_norm": 1.4248689413070679, + "learning_rate": 4.702839528604064e-08, + "loss": 0.5833, + "step": 7753 + }, + { + "epoch": 0.3696517531523371, + "grad_norm": 1.2393172979354858, + "learning_rate": 4.66486651454634e-08, + "loss": 0.7083, + "step": 7754 + }, + { + "epoch": 0.36969942554763663, + "grad_norm": 1.2555162906646729, + "learning_rate": 4.627047070449697e-08, + "loss": 0.6786, + "step": 7755 + }, + { + "epoch": 0.36974709794293614, + "grad_norm": 1.9703701734542847, + "learning_rate": 4.589381202149357e-08, + "loss": 0.9608, + "step": 7756 + }, + { + "epoch": 0.36979477033823566, + "grad_norm": 1.3421674966812134, + "learning_rate": 4.55186891545667e-08, + "loss": 0.7369, + "step": 7757 + }, + { + "epoch": 0.36984244273353517, + "grad_norm": 1.944890022277832, + "learning_rate": 4.514510216159562e-08, + "loss": 0.8617, + "step": 7758 + }, + { + "epoch": 0.36989011512883463, + "grad_norm": 1.885948657989502, + "learning_rate": 4.4773051100219787e-08, + "loss": 0.6023, + "step": 7759 + }, + { + "epoch": 0.36993778752413414, + "grad_norm": 1.1965134143829346, + "learning_rate": 4.440253602784328e-08, + "loss": 0.3921, + "step": 7760 + }, + { + "epoch": 0.36998545991943366, + "grad_norm": 1.602170705795288, + "learning_rate": 4.4033557001631475e-08, + "loss": 0.9863, + "step": 7761 + }, + { + "epoch": 0.37003313231473317, + "grad_norm": 2.1995394229888916, + "learning_rate": 4.366611407851662e-08, + "loss": 0.6922, + "step": 7762 + }, + { + "epoch": 0.37008080471003263, + "grad_norm": 2.047574996948242, + "learning_rate": 4.3300207315190026e-08, + "loss": 0.9125, + "step": 7763 + }, + { + "epoch": 0.37012847710533214, + "grad_norm": 1.2888671159744263, + "learning_rate": 4.293583676810653e-08, + "loss": 0.8809, + "step": 7764 + }, + { + "epoch": 0.37017614950063166, + "grad_norm": 1.490317463874817, + "learning_rate": 4.257300249348562e-08, + "loss": 0.6894, + "step": 7765 + }, + { + "epoch": 0.37022382189593117, + "grad_norm": 6.8432841300964355, + "learning_rate": 4.221170454730916e-08, + "loss": 0.2892, + "step": 7766 + }, + { + "epoch": 0.3702714942912307, + "grad_norm": 1.2396644353866577, + "learning_rate": 4.185194298532147e-08, + "loss": 0.8975, + "step": 7767 + }, + { + "epoch": 0.37031916668653014, + "grad_norm": 1.871390700340271, + "learning_rate": 4.149371786302925e-08, + "loss": 0.6949, + "step": 7768 + }, + { + "epoch": 0.37036683908182966, + "grad_norm": 1.211944580078125, + "learning_rate": 4.113702923570384e-08, + "loss": 0.7685, + "step": 7769 + }, + { + "epoch": 0.37041451147712917, + "grad_norm": 0.8955574035644531, + "learning_rate": 4.0781877158377894e-08, + "loss": 0.5975, + "step": 7770 + }, + { + "epoch": 0.3704621838724287, + "grad_norm": 1.8945472240447998, + "learning_rate": 4.042826168584868e-08, + "loss": 0.7167, + "step": 7771 + }, + { + "epoch": 0.3705098562677282, + "grad_norm": 
2.005890130996704, + "learning_rate": 4.0076182872674785e-08, + "loss": 0.6805, + "step": 7772 + }, + { + "epoch": 0.37055752866302766, + "grad_norm": 1.0825318098068237, + "learning_rate": 3.972564077317831e-08, + "loss": 0.7917, + "step": 7773 + }, + { + "epoch": 0.37060520105832717, + "grad_norm": 2.385758638381958, + "learning_rate": 3.9376635441444874e-08, + "loss": 0.8841, + "step": 7774 + }, + { + "epoch": 0.3706528734536267, + "grad_norm": 1.4121153354644775, + "learning_rate": 3.9029166931322524e-08, + "loss": 0.6241, + "step": 7775 + }, + { + "epoch": 0.3707005458489262, + "grad_norm": 1.2842817306518555, + "learning_rate": 3.86832352964206e-08, + "loss": 0.8474, + "step": 7776 + }, + { + "epoch": 0.37074821824422566, + "grad_norm": 1.3103934526443481, + "learning_rate": 3.833884059011417e-08, + "loss": 0.7723, + "step": 7777 + }, + { + "epoch": 0.37079589063952517, + "grad_norm": 1.2943569421768188, + "learning_rate": 3.7995982865539624e-08, + "loss": 0.7501, + "step": 7778 + }, + { + "epoch": 0.3708435630348247, + "grad_norm": 3.371055841445923, + "learning_rate": 3.765466217559577e-08, + "loss": 1.0109, + "step": 7779 + }, + { + "epoch": 0.3708912354301242, + "grad_norm": 2.4353268146514893, + "learning_rate": 3.731487857294491e-08, + "loss": 1.1428, + "step": 7780 + }, + { + "epoch": 0.3709389078254237, + "grad_norm": 2.3010752201080322, + "learning_rate": 3.69766321100129e-08, + "loss": 0.6788, + "step": 7781 + }, + { + "epoch": 0.37098658022072317, + "grad_norm": 2.7268319129943848, + "learning_rate": 3.663992283898687e-08, + "loss": 0.674, + "step": 7782 + }, + { + "epoch": 0.3710342526160227, + "grad_norm": 5.783518314361572, + "learning_rate": 3.630475081181861e-08, + "loss": 0.9479, + "step": 7783 + }, + { + "epoch": 0.3710819250113222, + "grad_norm": 1.2917919158935547, + "learning_rate": 3.597111608022119e-08, + "loss": 0.9517, + "step": 7784 + }, + { + "epoch": 0.3711295974066217, + "grad_norm": 1.5208823680877686, + "learning_rate": 3.56390186956701e-08, + "loss": 0.6202, + "step": 7785 + }, + { + "epoch": 0.3711772698019212, + "grad_norm": 1.5522223711013794, + "learning_rate": 3.530845870940658e-08, + "loss": 0.7482, + "step": 7786 + }, + { + "epoch": 0.3712249421972207, + "grad_norm": 2.059873104095459, + "learning_rate": 3.497943617242983e-08, + "loss": 0.7872, + "step": 7787 + }, + { + "epoch": 0.3712726145925202, + "grad_norm": 3.6424009799957275, + "learning_rate": 3.465195113550701e-08, + "loss": 0.3467, + "step": 7788 + }, + { + "epoch": 0.3713202869878197, + "grad_norm": 1.4958759546279907, + "learning_rate": 3.43260036491655e-08, + "loss": 0.8711, + "step": 7789 + }, + { + "epoch": 0.37136795938311923, + "grad_norm": 2.646958112716675, + "learning_rate": 3.400159376369394e-08, + "loss": 0.6322, + "step": 7790 + }, + { + "epoch": 0.3714156317784187, + "grad_norm": 3.456530809402466, + "learning_rate": 3.367872152914675e-08, + "loss": 1.3635, + "step": 7791 + }, + { + "epoch": 0.3714633041737182, + "grad_norm": 1.3279197216033936, + "learning_rate": 3.335738699533964e-08, + "loss": 0.6843, + "step": 7792 + }, + { + "epoch": 0.3715109765690177, + "grad_norm": 4.989992618560791, + "learning_rate": 3.3037590211851823e-08, + "loss": 0.0866, + "step": 7793 + }, + { + "epoch": 0.37155864896431723, + "grad_norm": 2.787041425704956, + "learning_rate": 3.271933122802273e-08, + "loss": 1.0779, + "step": 7794 + }, + { + "epoch": 0.3716063213596167, + "grad_norm": 1.2259422540664673, + "learning_rate": 3.240261009295864e-08, + "loss": 0.7496, + "step": 7795 + }, + { + 
"epoch": 0.3716539937549162, + "grad_norm": 1.8526363372802734, + "learning_rate": 3.208742685552602e-08, + "loss": 0.4648, + "step": 7796 + }, + { + "epoch": 0.3717016661502157, + "grad_norm": 1.7674260139465332, + "learning_rate": 3.1773781564352625e-08, + "loss": 0.6233, + "step": 7797 + }, + { + "epoch": 0.37174933854551523, + "grad_norm": 1.2214933633804321, + "learning_rate": 3.146167426783198e-08, + "loss": 0.5241, + "step": 7798 + }, + { + "epoch": 0.37179701094081474, + "grad_norm": 5.145979881286621, + "learning_rate": 3.1151105014119995e-08, + "loss": 1.8501, + "step": 7799 + }, + { + "epoch": 0.3718446833361142, + "grad_norm": 1.1357462406158447, + "learning_rate": 3.084207385113169e-08, + "loss": 0.7689, + "step": 7800 + }, + { + "epoch": 0.3718923557314137, + "grad_norm": 1.7186403274536133, + "learning_rate": 3.053458082655003e-08, + "loss": 0.8398, + "step": 7801 + }, + { + "epoch": 0.37194002812671323, + "grad_norm": 1.5030573606491089, + "learning_rate": 3.0228625987817064e-08, + "loss": 0.7932, + "step": 7802 + }, + { + "epoch": 0.37198770052201274, + "grad_norm": 1.028721809387207, + "learning_rate": 2.992420938213725e-08, + "loss": 0.7488, + "step": 7803 + }, + { + "epoch": 0.37203537291731226, + "grad_norm": 2.824293375015259, + "learning_rate": 2.9621331056480796e-08, + "loss": 0.3448, + "step": 7804 + }, + { + "epoch": 0.3720830453126117, + "grad_norm": 3.2116827964782715, + "learning_rate": 2.931999105757699e-08, + "loss": 0.3277, + "step": 7805 + }, + { + "epoch": 0.37213071770791123, + "grad_norm": 1.632785677909851, + "learning_rate": 2.9020189431920865e-08, + "loss": 0.7778, + "step": 7806 + }, + { + "epoch": 0.37217839010321074, + "grad_norm": 1.9857631921768188, + "learning_rate": 2.8721926225768748e-08, + "loss": 0.7373, + "step": 7807 + }, + { + "epoch": 0.37222606249851026, + "grad_norm": 1.6291468143463135, + "learning_rate": 2.8425201485139387e-08, + "loss": 0.4981, + "step": 7808 + }, + { + "epoch": 0.3722737348938097, + "grad_norm": 2.2106332778930664, + "learning_rate": 2.8130015255812827e-08, + "loss": 0.59, + "step": 7809 + }, + { + "epoch": 0.37232140728910923, + "grad_norm": 4.437781810760498, + "learning_rate": 2.7836367583335967e-08, + "loss": 0.3724, + "step": 7810 + }, + { + "epoch": 0.37236907968440874, + "grad_norm": 2.524610996246338, + "learning_rate": 2.7544258513013678e-08, + "loss": 0.8235, + "step": 7811 + }, + { + "epoch": 0.37241675207970826, + "grad_norm": 1.413041591644287, + "learning_rate": 2.7253688089915466e-08, + "loss": 0.5837, + "step": 7812 + }, + { + "epoch": 0.37246442447500777, + "grad_norm": 1.1875849962234497, + "learning_rate": 2.6964656358874353e-08, + "loss": 0.5419, + "step": 7813 + }, + { + "epoch": 0.37251209687030723, + "grad_norm": 4.12817907333374, + "learning_rate": 2.667716336448356e-08, + "loss": 1.4011, + "step": 7814 + }, + { + "epoch": 0.37255976926560674, + "grad_norm": 1.5169075727462769, + "learning_rate": 2.639120915110094e-08, + "loss": 0.6373, + "step": 7815 + }, + { + "epoch": 0.37260744166090626, + "grad_norm": 2.392352819442749, + "learning_rate": 2.6106793762847858e-08, + "loss": 0.9112, + "step": 7816 + }, + { + "epoch": 0.37265511405620577, + "grad_norm": 3.835285186767578, + "learning_rate": 2.5823917243603668e-08, + "loss": 0.8103, + "step": 7817 + }, + { + "epoch": 0.37270278645150523, + "grad_norm": 2.305617332458496, + "learning_rate": 2.5542579637015675e-08, + "loss": 0.7076, + "step": 7818 + }, + { + "epoch": 0.37275045884680474, + "grad_norm": 1.2971391677856445, + "learning_rate": 
2.5262780986491375e-08, + "loss": 0.3653, + "step": 7819 + }, + { + "epoch": 0.37279813124210426, + "grad_norm": 1.72105872631073, + "learning_rate": 2.4984521335198464e-08, + "loss": 0.6844, + "step": 7820 + }, + { + "epoch": 0.37284580363740377, + "grad_norm": 2.416649103164673, + "learning_rate": 2.4707800726072594e-08, + "loss": 0.961, + "step": 7821 + }, + { + "epoch": 0.3728934760327033, + "grad_norm": 3.378453254699707, + "learning_rate": 2.4432619201806283e-08, + "loss": 0.9911, + "step": 7822 + }, + { + "epoch": 0.37294114842800274, + "grad_norm": 2.2994165420532227, + "learning_rate": 2.4158976804858903e-08, + "loss": 1.026, + "step": 7823 + }, + { + "epoch": 0.37298882082330226, + "grad_norm": 2.1853156089782715, + "learning_rate": 2.3886873577450008e-08, + "loss": 0.7153, + "step": 7824 + }, + { + "epoch": 0.3730364932186018, + "grad_norm": 2.522794485092163, + "learning_rate": 2.3616309561562688e-08, + "loss": 0.8567, + "step": 7825 + }, + { + "epoch": 0.3730841656139013, + "grad_norm": 2.4500808715820312, + "learning_rate": 2.3347284798941327e-08, + "loss": 1.0215, + "step": 7826 + }, + { + "epoch": 0.37313183800920074, + "grad_norm": 2.114877939224243, + "learning_rate": 2.3079799331094943e-08, + "loss": 0.3447, + "step": 7827 + }, + { + "epoch": 0.37317951040450026, + "grad_norm": 1.5318366289138794, + "learning_rate": 2.2813853199292745e-08, + "loss": 0.6472, + "step": 7828 + }, + { + "epoch": 0.3732271827997998, + "grad_norm": 1.7850221395492554, + "learning_rate": 2.2549446444567468e-08, + "loss": 0.6343, + "step": 7829 + }, + { + "epoch": 0.3732748551950993, + "grad_norm": 1.692326307296753, + "learning_rate": 2.2286579107716476e-08, + "loss": 0.772, + "step": 7830 + }, + { + "epoch": 0.3733225275903988, + "grad_norm": 2.600684881210327, + "learning_rate": 2.2025251229293997e-08, + "loss": 0.5936, + "step": 7831 + }, + { + "epoch": 0.37337019998569826, + "grad_norm": 1.0904062986373901, + "learning_rate": 2.176546284962222e-08, + "loss": 0.4863, + "step": 7832 + }, + { + "epoch": 0.3734178723809978, + "grad_norm": 1.6171411275863647, + "learning_rate": 2.1507214008783527e-08, + "loss": 1.1358, + "step": 7833 + }, + { + "epoch": 0.3734655447762973, + "grad_norm": 1.8634965419769287, + "learning_rate": 2.1250504746623822e-08, + "loss": 0.6109, + "step": 7834 + }, + { + "epoch": 0.3735132171715968, + "grad_norm": 1.39500892162323, + "learning_rate": 2.0995335102749204e-08, + "loss": 0.5307, + "step": 7835 + }, + { + "epoch": 0.3735608895668963, + "grad_norm": 1.882065773010254, + "learning_rate": 2.0741705116531507e-08, + "loss": 0.7883, + "step": 7836 + }, + { + "epoch": 0.3736085619621958, + "grad_norm": 3.0995166301727295, + "learning_rate": 2.0489614827101656e-08, + "loss": 0.8638, + "step": 7837 + }, + { + "epoch": 0.3736562343574953, + "grad_norm": 2.771300792694092, + "learning_rate": 2.02390642733552e-08, + "loss": 0.3219, + "step": 7838 + }, + { + "epoch": 0.3737039067527948, + "grad_norm": 1.7595773935317993, + "learning_rate": 1.9990053493949003e-08, + "loss": 0.6482, + "step": 7839 + }, + { + "epoch": 0.3737515791480943, + "grad_norm": 1.6298259496688843, + "learning_rate": 1.9742582527303433e-08, + "loss": 0.918, + "step": 7840 + }, + { + "epoch": 0.3737992515433938, + "grad_norm": 1.8266521692276, + "learning_rate": 1.9496651411601285e-08, + "loss": 0.8181, + "step": 7841 + }, + { + "epoch": 0.3738469239386933, + "grad_norm": 4.214056015014648, + "learning_rate": 1.9252260184786652e-08, + "loss": 0.3962, + "step": 7842 + }, + { + "epoch": 0.3738945963339928, 
+ "grad_norm": 1.3586162328720093, + "learning_rate": 1.900940888456604e-08, + "loss": 0.7629, + "step": 7843 + }, + { + "epoch": 0.3739422687292923, + "grad_norm": 1.6599266529083252, + "learning_rate": 1.876809754840836e-08, + "loss": 0.8508, + "step": 7844 + }, + { + "epoch": 0.37398994112459183, + "grad_norm": 1.5912233591079712, + "learning_rate": 1.8528326213548276e-08, + "loss": 0.9134, + "step": 7845 + }, + { + "epoch": 0.3740376135198913, + "grad_norm": 2.298814535140991, + "learning_rate": 1.829009491697731e-08, + "loss": 0.8547, + "step": 7846 + }, + { + "epoch": 0.3740852859151908, + "grad_norm": 2.1455953121185303, + "learning_rate": 1.805340369545272e-08, + "loss": 1.0501, + "step": 7847 + }, + { + "epoch": 0.3741329583104903, + "grad_norm": 2.4369187355041504, + "learning_rate": 1.781825258549419e-08, + "loss": 0.9414, + "step": 7848 + }, + { + "epoch": 0.37418063070578983, + "grad_norm": 2.005153179168701, + "learning_rate": 1.7584641623381583e-08, + "loss": 0.8507, + "step": 7849 + }, + { + "epoch": 0.3742283031010893, + "grad_norm": 3.369074583053589, + "learning_rate": 1.735257084516051e-08, + "loss": 0.8743, + "step": 7850 + }, + { + "epoch": 0.3742759754963888, + "grad_norm": 2.91791033744812, + "learning_rate": 1.7122040286636775e-08, + "loss": 1.1068, + "step": 7851 + }, + { + "epoch": 0.3743236478916883, + "grad_norm": 2.0598747730255127, + "learning_rate": 1.6893049983378597e-08, + "loss": 0.1639, + "step": 7852 + }, + { + "epoch": 0.37437132028698783, + "grad_norm": 0.9947049617767334, + "learning_rate": 1.6665599970715484e-08, + "loss": 0.5559, + "step": 7853 + }, + { + "epoch": 0.37441899268228734, + "grad_norm": 1.0994200706481934, + "learning_rate": 1.6439690283742704e-08, + "loss": 0.6166, + "step": 7854 + }, + { + "epoch": 0.3744666650775868, + "grad_norm": 1.6171073913574219, + "learning_rate": 1.6215320957315707e-08, + "loss": 0.7666, + "step": 7855 + }, + { + "epoch": 0.3745143374728863, + "grad_norm": 1.652296543121338, + "learning_rate": 1.5992492026050134e-08, + "loss": 0.4892, + "step": 7856 + }, + { + "epoch": 0.37456200986818583, + "grad_norm": 1.50570809841156, + "learning_rate": 1.5771203524328483e-08, + "loss": 0.4487, + "step": 7857 + }, + { + "epoch": 0.37460968226348534, + "grad_norm": 1.7177133560180664, + "learning_rate": 1.5551455486292333e-08, + "loss": 0.7813, + "step": 7858 + }, + { + "epoch": 0.37465735465878486, + "grad_norm": 2.157076358795166, + "learning_rate": 1.5333247945846787e-08, + "loss": 0.9098, + "step": 7859 + }, + { + "epoch": 0.3747050270540843, + "grad_norm": 2.961280107498169, + "learning_rate": 1.5116580936658242e-08, + "loss": 0.6165, + "step": 7860 + }, + { + "epoch": 0.37475269944938383, + "grad_norm": 2.552809953689575, + "learning_rate": 1.4901454492157742e-08, + "loss": 0.6839, + "step": 7861 + }, + { + "epoch": 0.37480037184468334, + "grad_norm": 1.4613701105117798, + "learning_rate": 1.4687868645535398e-08, + "loss": 0.4725, + "step": 7862 + }, + { + "epoch": 0.37484804423998286, + "grad_norm": 2.817723512649536, + "learning_rate": 1.4475823429747071e-08, + "loss": 0.635, + "step": 7863 + }, + { + "epoch": 0.3748957166352823, + "grad_norm": 2.326052188873291, + "learning_rate": 1.4265318877507705e-08, + "loss": 1.053, + "step": 7864 + }, + { + "epoch": 0.37494338903058183, + "grad_norm": 3.5985987186431885, + "learning_rate": 1.4056355021295764e-08, + "loss": 0.4344, + "step": 7865 + }, + { + "epoch": 0.37499106142588134, + "grad_norm": 1.4357486963272095, + "learning_rate": 1.3848931893353235e-08, + "loss": 
1.0112, + "step": 7866 + }, + { + "epoch": 0.37503873382118086, + "grad_norm": 1.815887212753296, + "learning_rate": 1.3643049525683405e-08, + "loss": 0.5912, + "step": 7867 + }, + { + "epoch": 0.3750864062164804, + "grad_norm": 1.2245608568191528, + "learning_rate": 1.3438707950051978e-08, + "loss": 0.6778, + "step": 7868 + }, + { + "epoch": 0.37513407861177983, + "grad_norm": 1.2549368143081665, + "learning_rate": 1.3235907197984843e-08, + "loss": 0.349, + "step": 7869 + }, + { + "epoch": 0.37518175100707934, + "grad_norm": 1.3882089853286743, + "learning_rate": 1.303464730077475e-08, + "loss": 0.7041, + "step": 7870 + }, + { + "epoch": 0.37522942340237886, + "grad_norm": 1.3840465545654297, + "learning_rate": 1.2834928289472415e-08, + "loss": 0.8589, + "step": 7871 + }, + { + "epoch": 0.3752770957976784, + "grad_norm": 1.1382063627243042, + "learning_rate": 1.2636750194892078e-08, + "loss": 0.195, + "step": 7872 + }, + { + "epoch": 0.37532476819297783, + "grad_norm": 1.5908664464950562, + "learning_rate": 1.2440113047611502e-08, + "loss": 0.7147, + "step": 7873 + }, + { + "epoch": 0.37537244058827735, + "grad_norm": 1.399793267250061, + "learning_rate": 1.224501687796975e-08, + "loss": 0.5627, + "step": 7874 + }, + { + "epoch": 0.37542011298357686, + "grad_norm": 1.5485873222351074, + "learning_rate": 1.2051461716068302e-08, + "loss": 0.8783, + "step": 7875 + }, + { + "epoch": 0.3754677853788764, + "grad_norm": 2.1138932704925537, + "learning_rate": 1.1859447591769934e-08, + "loss": 0.8431, + "step": 7876 + }, + { + "epoch": 0.3755154577741759, + "grad_norm": 1.1249518394470215, + "learning_rate": 1.166897453470095e-08, + "loss": 0.5855, + "step": 7877 + }, + { + "epoch": 0.37556313016947535, + "grad_norm": 3.097949743270874, + "learning_rate": 1.148004257424895e-08, + "loss": 0.5385, + "step": 7878 + }, + { + "epoch": 0.37561080256477486, + "grad_norm": 2.025190830230713, + "learning_rate": 1.1292651739565063e-08, + "loss": 0.9114, + "step": 7879 + }, + { + "epoch": 0.3756584749600744, + "grad_norm": 3.2264833450317383, + "learning_rate": 1.1106802059560607e-08, + "loss": 0.0937, + "step": 7880 + }, + { + "epoch": 0.3757061473553739, + "grad_norm": 1.4958502054214478, + "learning_rate": 1.092249356291042e-08, + "loss": 0.7134, + "step": 7881 + }, + { + "epoch": 0.37575381975067335, + "grad_norm": 1.5587936639785767, + "learning_rate": 1.0739726278052864e-08, + "loss": 0.639, + "step": 7882 + }, + { + "epoch": 0.37580149214597286, + "grad_norm": 1.427049160003662, + "learning_rate": 1.0558500233186498e-08, + "loss": 0.7456, + "step": 7883 + }, + { + "epoch": 0.3758491645412724, + "grad_norm": 1.3080843687057495, + "learning_rate": 1.0378815456271174e-08, + "loss": 0.589, + "step": 7884 + }, + { + "epoch": 0.3758968369365719, + "grad_norm": 2.67425799369812, + "learning_rate": 1.0200671975031384e-08, + "loss": 1.0191, + "step": 7885 + }, + { + "epoch": 0.3759445093318714, + "grad_norm": 1.9029661417007446, + "learning_rate": 1.002406981695292e-08, + "loss": 0.7613, + "step": 7886 + }, + { + "epoch": 0.37599218172717086, + "grad_norm": 1.3547945022583008, + "learning_rate": 9.849009009285093e-09, + "loss": 0.8375, + "step": 7887 + }, + { + "epoch": 0.3760398541224704, + "grad_norm": 1.3373538255691528, + "learning_rate": 9.675489579035191e-09, + "loss": 0.6045, + "step": 7888 + }, + { + "epoch": 0.3760875265177699, + "grad_norm": 2.000943660736084, + "learning_rate": 9.503511552977351e-09, + "loss": 0.9951, + "step": 7889 + }, + { + "epoch": 0.3761351989130694, + "grad_norm": 
7.923572540283203, + "learning_rate": 9.333074957644795e-09, + "loss": 2.2791, + "step": 7890 + }, + { + "epoch": 0.3761828713083689, + "grad_norm": 1.4883496761322021, + "learning_rate": 9.164179819335373e-09, + "loss": 0.7747, + "step": 7891 + }, + { + "epoch": 0.3762305437036684, + "grad_norm": 3.276047468185425, + "learning_rate": 8.996826164107131e-09, + "loss": 0.5226, + "step": 7892 + }, + { + "epoch": 0.3762782160989679, + "grad_norm": 1.549985647201538, + "learning_rate": 8.831014017780526e-09, + "loss": 0.6295, + "step": 7893 + }, + { + "epoch": 0.3763258884942674, + "grad_norm": 1.7809767723083496, + "learning_rate": 8.666743405940647e-09, + "loss": 0.8068, + "step": 7894 + }, + { + "epoch": 0.3763735608895669, + "grad_norm": 3.950303077697754, + "learning_rate": 8.504014353930557e-09, + "loss": 1.8445, + "step": 7895 + }, + { + "epoch": 0.3764212332848664, + "grad_norm": 1.5578914880752563, + "learning_rate": 8.342826886857946e-09, + "loss": 0.8914, + "step": 7896 + }, + { + "epoch": 0.3764689056801659, + "grad_norm": 8.306102752685547, + "learning_rate": 8.183181029594034e-09, + "loss": 0.6091, + "step": 7897 + }, + { + "epoch": 0.3765165780754654, + "grad_norm": 1.7017185688018799, + "learning_rate": 8.025076806769117e-09, + "loss": 0.7321, + "step": 7898 + }, + { + "epoch": 0.3765642504707649, + "grad_norm": 2.048006534576416, + "learning_rate": 7.868514242777015e-09, + "loss": 0.761, + "step": 7899 + }, + { + "epoch": 0.37661192286606443, + "grad_norm": 1.3152148723602295, + "learning_rate": 7.71349336177507e-09, + "loss": 0.2279, + "step": 7900 + }, + { + "epoch": 0.3766595952613639, + "grad_norm": 1.1558787822723389, + "learning_rate": 7.56001418767971e-09, + "loss": 0.8182, + "step": 7901 + }, + { + "epoch": 0.3767072676566634, + "grad_norm": 1.4645040035247803, + "learning_rate": 7.408076744171988e-09, + "loss": 0.6101, + "step": 7902 + }, + { + "epoch": 0.3767549400519629, + "grad_norm": 1.4637107849121094, + "learning_rate": 7.257681054695375e-09, + "loss": 0.7785, + "step": 7903 + }, + { + "epoch": 0.37680261244726243, + "grad_norm": 1.8055284023284912, + "learning_rate": 7.108827142452423e-09, + "loss": 0.5899, + "step": 7904 + }, + { + "epoch": 0.3768502848425619, + "grad_norm": 1.9140489101409912, + "learning_rate": 6.961515030410315e-09, + "loss": 0.7417, + "step": 7905 + }, + { + "epoch": 0.3768979572378614, + "grad_norm": 1.172397255897522, + "learning_rate": 6.8157447412975365e-09, + "loss": 0.6935, + "step": 7906 + }, + { + "epoch": 0.3769456296331609, + "grad_norm": 1.1835538148880005, + "learning_rate": 6.671516297606095e-09, + "loss": 1.0454, + "step": 7907 + }, + { + "epoch": 0.37699330202846043, + "grad_norm": 1.076431393623352, + "learning_rate": 6.528829721588193e-09, + "loss": 0.547, + "step": 7908 + }, + { + "epoch": 0.37704097442375994, + "grad_norm": 2.151554584503174, + "learning_rate": 6.38768503525955e-09, + "loss": 0.5472, + "step": 7909 + }, + { + "epoch": 0.3770886468190594, + "grad_norm": 1.8558036088943481, + "learning_rate": 6.2480822603960825e-09, + "loss": 0.342, + "step": 7910 + }, + { + "epoch": 0.3771363192143589, + "grad_norm": 1.9963477849960327, + "learning_rate": 6.110021418538337e-09, + "loss": 0.6691, + "step": 7911 + }, + { + "epoch": 0.37718399160965843, + "grad_norm": 1.7131110429763794, + "learning_rate": 5.973502530987052e-09, + "loss": 0.6267, + "step": 7912 + }, + { + "epoch": 0.37723166400495795, + "grad_norm": 1.512924313545227, + "learning_rate": 5.83852561880538e-09, + "loss": 0.5045, + "step": 7913 + }, + { + 
"epoch": 0.3772793364002574, + "grad_norm": 1.1113539934158325, + "learning_rate": 5.705090702819993e-09, + "loss": 0.4778, + "step": 7914 + }, + { + "epoch": 0.3773270087955569, + "grad_norm": 1.4719964265823364, + "learning_rate": 5.573197803616648e-09, + "loss": 1.0057, + "step": 7915 + }, + { + "epoch": 0.37737468119085643, + "grad_norm": 1.2357630729675293, + "learning_rate": 5.442846941546842e-09, + "loss": 0.7613, + "step": 7916 + }, + { + "epoch": 0.37742235358615595, + "grad_norm": 1.7606688737869263, + "learning_rate": 5.314038136722266e-09, + "loss": 0.8544, + "step": 7917 + }, + { + "epoch": 0.37747002598145546, + "grad_norm": 1.3121763467788696, + "learning_rate": 5.1867714090148016e-09, + "loss": 0.5183, + "step": 7918 + }, + { + "epoch": 0.3775176983767549, + "grad_norm": 1.782693862915039, + "learning_rate": 5.061046778063183e-09, + "loss": 0.918, + "step": 7919 + }, + { + "epoch": 0.37756537077205443, + "grad_norm": 1.8094278573989868, + "learning_rate": 4.936864263264119e-09, + "loss": 0.9862, + "step": 7920 + }, + { + "epoch": 0.37761304316735395, + "grad_norm": 1.605290412902832, + "learning_rate": 4.814223883776725e-09, + "loss": 1.0203, + "step": 7921 + }, + { + "epoch": 0.37766071556265346, + "grad_norm": 1.2488490343093872, + "learning_rate": 4.693125658524755e-09, + "loss": 0.7673, + "step": 7922 + }, + { + "epoch": 0.377708387957953, + "grad_norm": 1.2578039169311523, + "learning_rate": 4.573569606191042e-09, + "loss": 0.5042, + "step": 7923 + }, + { + "epoch": 0.37775606035325243, + "grad_norm": 1.093462347984314, + "learning_rate": 4.45555574522305e-09, + "loss": 0.6331, + "step": 7924 + }, + { + "epoch": 0.37780373274855195, + "grad_norm": 1.4226394891738892, + "learning_rate": 4.339084093828438e-09, + "loss": 1.0386, + "step": 7925 + }, + { + "epoch": 0.37785140514385146, + "grad_norm": 1.2890040874481201, + "learning_rate": 4.224154669978386e-09, + "loss": 0.7303, + "step": 7926 + }, + { + "epoch": 0.377899077539151, + "grad_norm": 2.2560782432556152, + "learning_rate": 4.1107674914042665e-09, + "loss": 0.357, + "step": 7927 + }, + { + "epoch": 0.37794674993445043, + "grad_norm": 1.2863467931747437, + "learning_rate": 3.998922575600972e-09, + "loss": 0.6226, + "step": 7928 + }, + { + "epoch": 0.37799442232974995, + "grad_norm": 5.049374580383301, + "learning_rate": 3.8886199398247005e-09, + "loss": 0.4091, + "step": 7929 + }, + { + "epoch": 0.37804209472504946, + "grad_norm": 3.6122961044311523, + "learning_rate": 3.77985960109517e-09, + "loss": 1.4993, + "step": 7930 + }, + { + "epoch": 0.378089767120349, + "grad_norm": 1.6522828340530396, + "learning_rate": 3.6726415761911826e-09, + "loss": 0.9534, + "step": 7931 + }, + { + "epoch": 0.3781374395156485, + "grad_norm": 1.0905990600585938, + "learning_rate": 3.5669658816572803e-09, + "loss": 0.434, + "step": 7932 + }, + { + "epoch": 0.37818511191094795, + "grad_norm": 1.8408175706863403, + "learning_rate": 3.462832533795979e-09, + "loss": 0.6149, + "step": 7933 + }, + { + "epoch": 0.37823278430624746, + "grad_norm": 3.0016887187957764, + "learning_rate": 3.360241548676646e-09, + "loss": 0.9752, + "step": 7934 + }, + { + "epoch": 0.378280456701547, + "grad_norm": 1.4469727277755737, + "learning_rate": 3.259192942125511e-09, + "loss": 0.3066, + "step": 7935 + }, + { + "epoch": 0.3783281290968465, + "grad_norm": 3.8612887859344482, + "learning_rate": 3.1596867297345457e-09, + "loss": 1.1331, + "step": 7936 + }, + { + "epoch": 0.37837580149214595, + "grad_norm": 1.598137378692627, + "learning_rate": 
3.0617229268570248e-09, + "loss": 0.3896, + "step": 7937 + }, + { + "epoch": 0.37842347388744546, + "grad_norm": 1.9002468585968018, + "learning_rate": 2.9653015486064143e-09, + "loss": 0.6132, + "step": 7938 + }, + { + "epoch": 0.378471146282745, + "grad_norm": 1.2709959745407104, + "learning_rate": 2.8704226098597023e-09, + "loss": 0.624, + "step": 7939 + }, + { + "epoch": 0.3785188186780445, + "grad_norm": 1.3499418497085571, + "learning_rate": 2.7770861252574e-09, + "loss": 0.8452, + "step": 7940 + }, + { + "epoch": 0.378566491073344, + "grad_norm": 4.122645378112793, + "learning_rate": 2.6852921091991e-09, + "loss": 0.8923, + "step": 7941 + }, + { + "epoch": 0.37861416346864346, + "grad_norm": 0.9164470434188843, + "learning_rate": 2.595040575846808e-09, + "loss": 0.564, + "step": 7942 + }, + { + "epoch": 0.378661835863943, + "grad_norm": 1.244675636291504, + "learning_rate": 2.5063315391271605e-09, + "loss": 0.5326, + "step": 7943 + }, + { + "epoch": 0.3787095082592425, + "grad_norm": 1.5859483480453491, + "learning_rate": 2.4191650127269873e-09, + "loss": 0.7998, + "step": 7944 + }, + { + "epoch": 0.378757180654542, + "grad_norm": 2.027064323425293, + "learning_rate": 2.3335410100933096e-09, + "loss": 0.2413, + "step": 7945 + }, + { + "epoch": 0.3788048530498415, + "grad_norm": 1.0773860216140747, + "learning_rate": 2.249459544438892e-09, + "loss": 0.5909, + "step": 7946 + }, + { + "epoch": 0.378852525445141, + "grad_norm": 1.8932267427444458, + "learning_rate": 2.1669206287355803e-09, + "loss": 0.66, + "step": 7947 + }, + { + "epoch": 0.3789001978404405, + "grad_norm": 1.4737772941589355, + "learning_rate": 2.0859242757187425e-09, + "loss": 1.037, + "step": 7948 + }, + { + "epoch": 0.37894787023574, + "grad_norm": 1.2279188632965088, + "learning_rate": 2.006470497885049e-09, + "loss": 0.8493, + "step": 7949 + }, + { + "epoch": 0.3789955426310395, + "grad_norm": 1.3951424360275269, + "learning_rate": 1.9285593074935826e-09, + "loss": 0.9716, + "step": 7950 + }, + { + "epoch": 0.379043215026339, + "grad_norm": 1.5984203815460205, + "learning_rate": 1.8521907165658382e-09, + "loss": 0.7161, + "step": 7951 + }, + { + "epoch": 0.3790908874216385, + "grad_norm": 1.5910447835922241, + "learning_rate": 1.7773647368835023e-09, + "loss": 0.3812, + "step": 7952 + }, + { + "epoch": 0.379138559816938, + "grad_norm": 1.8767844438552856, + "learning_rate": 1.7040813799917844e-09, + "loss": 0.3736, + "step": 7953 + }, + { + "epoch": 0.3791862322122375, + "grad_norm": 7.024414539337158, + "learning_rate": 1.6323406571983058e-09, + "loss": 0.4531, + "step": 7954 + }, + { + "epoch": 0.37923390460753703, + "grad_norm": 1.522065281867981, + "learning_rate": 1.56214257957088e-09, + "loss": 0.7135, + "step": 7955 + }, + { + "epoch": 0.3792815770028365, + "grad_norm": 1.781845211982727, + "learning_rate": 1.4934871579408428e-09, + "loss": 0.9186, + "step": 7956 + }, + { + "epoch": 0.379329249398136, + "grad_norm": 1.024221658706665, + "learning_rate": 1.4263744029019422e-09, + "loss": 0.7709, + "step": 7957 + }, + { + "epoch": 0.3793769217934355, + "grad_norm": 1.4704159498214722, + "learning_rate": 1.360804324807008e-09, + "loss": 0.6995, + "step": 7958 + }, + { + "epoch": 0.37942459418873503, + "grad_norm": 1.6574232578277588, + "learning_rate": 1.2967769337746128e-09, + "loss": 0.7073, + "step": 7959 + }, + { + "epoch": 0.3794722665840345, + "grad_norm": 1.122309684753418, + "learning_rate": 1.2342922396824108e-09, + "loss": 0.8538, + "step": 7960 + }, + { + "epoch": 0.379519938979334, + "grad_norm": 
1.404675841331482, + "learning_rate": 1.173350252171579e-09, + "loss": 0.6119, + "step": 7961 + }, + { + "epoch": 0.3795676113746335, + "grad_norm": 2.6873176097869873, + "learning_rate": 1.113950980645706e-09, + "loss": 0.7034, + "step": 7962 + }, + { + "epoch": 0.37961528376993303, + "grad_norm": 1.3230880498886108, + "learning_rate": 1.0560944342674627e-09, + "loss": 0.6361, + "step": 7963 + }, + { + "epoch": 0.37966295616523255, + "grad_norm": 1.7919119596481323, + "learning_rate": 9.997806219652628e-10, + "loss": 0.4311, + "step": 7964 + }, + { + "epoch": 0.379710628560532, + "grad_norm": 1.6529461145401, + "learning_rate": 9.450095524266012e-10, + "loss": 0.8398, + "step": 7965 + }, + { + "epoch": 0.3797583009558315, + "grad_norm": 1.2438713312149048, + "learning_rate": 8.917812341024956e-10, + "loss": 0.3345, + "step": 7966 + }, + { + "epoch": 0.37980597335113103, + "grad_norm": 1.432558298110962, + "learning_rate": 8.400956752063761e-10, + "loss": 0.8017, + "step": 7967 + }, + { + "epoch": 0.37985364574643055, + "grad_norm": 0.9874585270881653, + "learning_rate": 7.899528837118642e-10, + "loss": 0.6413, + "step": 7968 + }, + { + "epoch": 0.37990131814173, + "grad_norm": 1.2063425779342651, + "learning_rate": 7.413528673549941e-10, + "loss": 0.4471, + "step": 7969 + }, + { + "epoch": 0.3799489905370295, + "grad_norm": 1.660598635673523, + "learning_rate": 6.942956336353224e-10, + "loss": 1.0374, + "step": 7970 + }, + { + "epoch": 0.37999666293232903, + "grad_norm": 2.485856294631958, + "learning_rate": 6.487811898137075e-10, + "loss": 0.9347, + "step": 7971 + }, + { + "epoch": 0.38004433532762855, + "grad_norm": 2.2753517627716064, + "learning_rate": 6.048095429111999e-10, + "loss": 0.8869, + "step": 7972 + }, + { + "epoch": 0.38009200772292806, + "grad_norm": 2.6413257122039795, + "learning_rate": 5.623806997123726e-10, + "loss": 1.1618, + "step": 7973 + }, + { + "epoch": 0.3801396801182275, + "grad_norm": 1.2967902421951294, + "learning_rate": 5.214946667642106e-10, + "loss": 0.7868, + "step": 7974 + }, + { + "epoch": 0.38018735251352703, + "grad_norm": 1.7386435270309448, + "learning_rate": 4.821514503750013e-10, + "loss": 0.6167, + "step": 7975 + }, + { + "epoch": 0.38023502490882655, + "grad_norm": 2.0070996284484863, + "learning_rate": 4.4435105661433387e-10, + "loss": 0.7583, + "step": 7976 + }, + { + "epoch": 0.38028269730412606, + "grad_norm": 1.1420079469680786, + "learning_rate": 4.0809349131420984e-10, + "loss": 0.5281, + "step": 7977 + }, + { + "epoch": 0.3803303696994256, + "grad_norm": 1.9545698165893555, + "learning_rate": 3.7337876007015325e-10, + "loss": 0.7945, + "step": 7978 + }, + { + "epoch": 0.38037804209472503, + "grad_norm": 1.9201194047927856, + "learning_rate": 3.4020686823788007e-10, + "loss": 0.5952, + "step": 7979 + }, + { + "epoch": 0.38042571449002455, + "grad_norm": 0.9844833612442017, + "learning_rate": 3.0857782093440813e-10, + "loss": 0.6198, + "step": 7980 + }, + { + "epoch": 0.38047338688532406, + "grad_norm": 1.3375505208969116, + "learning_rate": 2.784916230402779e-10, + "loss": 1.0815, + "step": 7981 + }, + { + "epoch": 0.3805210592806236, + "grad_norm": 2.765907049179077, + "learning_rate": 2.49948279198442e-10, + "loss": 1.1167, + "step": 7982 + }, + { + "epoch": 0.38056873167592303, + "grad_norm": 1.541002869606018, + "learning_rate": 2.2294779381204502e-10, + "loss": 0.6469, + "step": 7983 + }, + { + "epoch": 0.38061640407122255, + "grad_norm": 3.628169536590576, + "learning_rate": 1.974901710466437e-10, + "loss": 1.2214, + "step": 7984 + 
}, + { + "epoch": 0.38066407646652206, + "grad_norm": 2.3656258583068848, + "learning_rate": 1.7357541483020712e-10, + "loss": 0.727, + "step": 7985 + }, + { + "epoch": 0.3807117488618216, + "grad_norm": 1.9193719625473022, + "learning_rate": 1.5120352885311663e-10, + "loss": 0.785, + "step": 7986 + }, + { + "epoch": 0.3807594212571211, + "grad_norm": 3.311270236968994, + "learning_rate": 1.3037451656705558e-10, + "loss": 0.6728, + "step": 7987 + }, + { + "epoch": 0.38080709365242055, + "grad_norm": 1.8771862983703613, + "learning_rate": 1.1108838118500942e-10, + "loss": 0.7411, + "step": 7988 + }, + { + "epoch": 0.38085476604772006, + "grad_norm": 2.5218522548675537, + "learning_rate": 9.334512568348608e-11, + "loss": 0.9595, + "step": 7989 + }, + { + "epoch": 0.3809024384430196, + "grad_norm": 1.7077536582946777, + "learning_rate": 7.714475279918531e-11, + "loss": 0.731, + "step": 7990 + }, + { + "epoch": 0.3809501108383191, + "grad_norm": 1.424237847328186, + "learning_rate": 6.248726503232938e-11, + "loss": 0.6481, + "step": 7991 + }, + { + "epoch": 0.38099778323361855, + "grad_norm": 1.0298527479171753, + "learning_rate": 4.937266464444257e-11, + "loss": 0.1631, + "step": 7992 + }, + { + "epoch": 0.38104545562891806, + "grad_norm": 3.5515754222869873, + "learning_rate": 3.7800953658351236e-11, + "loss": 0.4999, + "step": 7993 + }, + { + "epoch": 0.3810931280242176, + "grad_norm": 1.9379377365112305, + "learning_rate": 2.7772133860404227e-11, + "loss": 0.7415, + "step": 7994 + }, + { + "epoch": 0.3811408004195171, + "grad_norm": 3.0354959964752197, + "learning_rate": 1.9286206797142214e-11, + "loss": 1.2463, + "step": 7995 + }, + { + "epoch": 0.3811884728148166, + "grad_norm": 4.965470314025879, + "learning_rate": 1.234317377862837e-11, + "loss": 0.9729, + "step": 7996 + }, + { + "epoch": 0.38123614521011606, + "grad_norm": 1.3164920806884766, + "learning_rate": 6.943035875117688e-12, + "loss": 0.6675, + "step": 7997 + }, + { + "epoch": 0.3812838176054156, + "grad_norm": 1.4870984554290771, + "learning_rate": 3.0857939203876584e-12, + "loss": 0.8669, + "step": 7998 + }, + { + "epoch": 0.3813314900007151, + "grad_norm": 1.6616401672363281, + "learning_rate": 7.714485095178248e-13, + "loss": 0.5931, + "step": 7999 + }, + { + "epoch": 0.3813791623960146, + "grad_norm": 1.6352343559265137, + "learning_rate": 0.0, + "loss": 0.7391, + "step": 8000 + }, + { + "epoch": 0.3813791623960146, + "eval_loss": 0.38078346848487854, + "eval_runtime": 4278.4295, + "eval_samples_per_second": 1.396, + "eval_steps_per_second": 1.396, + "step": 8000 + } + ], + "logging_steps": 1, + "max_steps": 8000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.2236011257856e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}