diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,44834 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5119897602047959, + "eval_steps": 500, + "global_step": 6400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.999840003199936e-05, + "grad_norm": 0.33082761638311287, + "learning_rate": 0.0, + "loss": 0.3004, + "step": 1 + }, + { + "epoch": 0.00015999680006399873, + "grad_norm": 0.2913215150810986, + "learning_rate": 1.3333333333333334e-08, + "loss": 0.3327, + "step": 2 + }, + { + "epoch": 0.00023999520009599807, + "grad_norm": 0.37934142492177475, + "learning_rate": 2.6666666666666667e-08, + "loss": 0.3023, + "step": 3 + }, + { + "epoch": 0.00031999360012799745, + "grad_norm": 0.25570776418714214, + "learning_rate": 4e-08, + "loss": 0.3572, + "step": 4 + }, + { + "epoch": 0.0003999920001599968, + "grad_norm": 0.4048563084271804, + "learning_rate": 5.3333333333333334e-08, + "loss": 0.2973, + "step": 5 + }, + { + "epoch": 0.00047999040019199615, + "grad_norm": 0.294296710431263, + "learning_rate": 6.666666666666668e-08, + "loss": 0.3336, + "step": 6 + }, + { + "epoch": 0.0005599888002239956, + "grad_norm": 0.2970810285479718, + "learning_rate": 8e-08, + "loss": 0.3015, + "step": 7 + }, + { + "epoch": 0.0006399872002559949, + "grad_norm": 0.2801138817171433, + "learning_rate": 9.333333333333335e-08, + "loss": 0.3129, + "step": 8 + }, + { + "epoch": 0.0007199856002879943, + "grad_norm": 0.2967514174191538, + "learning_rate": 1.0666666666666667e-07, + "loss": 0.311, + "step": 9 + }, + { + "epoch": 0.0007999840003199936, + "grad_norm": 0.3437099305921091, + "learning_rate": 1.2000000000000002e-07, + "loss": 0.3253, + "step": 10 + }, + { + "epoch": 0.000879982400351993, + "grad_norm": 0.5397417630661548, + "learning_rate": 1.3333333333333336e-07, + "loss": 0.3175, + "step": 11 + }, + { + "epoch": 0.0009599808003839923, + "grad_norm": 0.3224658652779924, + "learning_rate": 1.4666666666666668e-07, + "loss": 0.289, + "step": 12 + }, + { + "epoch": 0.0010399792004159916, + "grad_norm": 0.2897167268442966, + "learning_rate": 1.6e-07, + "loss": 0.3194, + "step": 13 + }, + { + "epoch": 0.001119977600447991, + "grad_norm": 0.24959442619549038, + "learning_rate": 1.7333333333333335e-07, + "loss": 0.3221, + "step": 14 + }, + { + "epoch": 0.0011999760004799903, + "grad_norm": 0.29492771533023515, + "learning_rate": 1.866666666666667e-07, + "loss": 0.2977, + "step": 15 + }, + { + "epoch": 0.0012799744005119898, + "grad_norm": 0.22293301820824413, + "learning_rate": 2.0000000000000002e-07, + "loss": 0.3693, + "step": 16 + }, + { + "epoch": 0.001359972800543989, + "grad_norm": 0.6168528884798137, + "learning_rate": 2.1333333333333334e-07, + "loss": 0.3079, + "step": 17 + }, + { + "epoch": 0.0014399712005759885, + "grad_norm": 0.41567461031521735, + "learning_rate": 2.266666666666667e-07, + "loss": 0.3335, + "step": 18 + }, + { + "epoch": 0.0015199696006079877, + "grad_norm": 0.292472875842971, + "learning_rate": 2.4000000000000003e-07, + "loss": 0.3392, + "step": 19 + }, + { + "epoch": 0.0015999680006399872, + "grad_norm": 0.34853964356449135, + "learning_rate": 2.533333333333333e-07, + "loss": 0.2795, + "step": 20 + }, + { + "epoch": 0.0016799664006719867, + "grad_norm": 0.2780087291513081, + "learning_rate": 2.666666666666667e-07, + "loss": 0.3044, + "step": 21 + }, + { + "epoch": 0.001759964800703986, + "grad_norm": 0.2827647097216766, + "learning_rate": 2.8e-07, + "loss": 0.3239, + "step": 22 + }, + { + "epoch": 0.0018399632007359854, + "grad_norm": 0.28414530182467007, + "learning_rate": 2.9333333333333337e-07, + "loss": 0.3287, + "step": 23 + }, + { + "epoch": 0.0019199616007679846, + "grad_norm": 0.31806015664635406, + "learning_rate": 3.0666666666666666e-07, + "loss": 0.3391, + "step": 24 + }, + { + "epoch": 0.001999960000799984, + "grad_norm": 0.2520307529365584, + "learning_rate": 3.2e-07, + "loss": 0.3457, + "step": 25 + }, + { + "epoch": 0.0020799584008319833, + "grad_norm": 0.23216855173011924, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.3713, + "step": 26 + }, + { + "epoch": 0.0021599568008639825, + "grad_norm": 0.31912182901643477, + "learning_rate": 3.466666666666667e-07, + "loss": 0.3166, + "step": 27 + }, + { + "epoch": 0.002239955200895982, + "grad_norm": 0.32092495180662667, + "learning_rate": 3.6e-07, + "loss": 0.3344, + "step": 28 + }, + { + "epoch": 0.0023199536009279815, + "grad_norm": 0.319796041750652, + "learning_rate": 3.733333333333334e-07, + "loss": 0.303, + "step": 29 + }, + { + "epoch": 0.0023999520009599807, + "grad_norm": 0.3035068407364502, + "learning_rate": 3.8666666666666674e-07, + "loss": 0.3238, + "step": 30 + }, + { + "epoch": 0.0024799504009919804, + "grad_norm": 0.5933202783110759, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.2862, + "step": 31 + }, + { + "epoch": 0.0025599488010239796, + "grad_norm": 0.41253800734855595, + "learning_rate": 4.133333333333334e-07, + "loss": 0.3092, + "step": 32 + }, + { + "epoch": 0.002639947201055979, + "grad_norm": 0.293001215764292, + "learning_rate": 4.266666666666667e-07, + "loss": 0.3142, + "step": 33 + }, + { + "epoch": 0.002719945601087978, + "grad_norm": 0.2510822578099165, + "learning_rate": 4.4e-07, + "loss": 0.3713, + "step": 34 + }, + { + "epoch": 0.0027999440011199778, + "grad_norm": 0.23189303424493002, + "learning_rate": 4.533333333333334e-07, + "loss": 0.3416, + "step": 35 + }, + { + "epoch": 0.002879942401151977, + "grad_norm": 0.30229246491134404, + "learning_rate": 4.666666666666667e-07, + "loss": 0.2961, + "step": 36 + }, + { + "epoch": 0.0029599408011839762, + "grad_norm": 0.3205739844872136, + "learning_rate": 4.800000000000001e-07, + "loss": 0.3484, + "step": 37 + }, + { + "epoch": 0.0030399392012159755, + "grad_norm": 0.3273582334140839, + "learning_rate": 4.933333333333334e-07, + "loss": 0.3024, + "step": 38 + }, + { + "epoch": 0.003119937601247975, + "grad_norm": 0.26615517021880947, + "learning_rate": 5.066666666666667e-07, + "loss": 0.3244, + "step": 39 + }, + { + "epoch": 0.0031999360012799744, + "grad_norm": 0.3068949619534162, + "learning_rate": 5.2e-07, + "loss": 0.3079, + "step": 40 + }, + { + "epoch": 0.0032799344013119736, + "grad_norm": 0.2803283985047437, + "learning_rate": 5.333333333333335e-07, + "loss": 0.337, + "step": 41 + }, + { + "epoch": 0.0033599328013439733, + "grad_norm": 0.31672734122837837, + "learning_rate": 5.466666666666667e-07, + "loss": 0.338, + "step": 42 + }, + { + "epoch": 0.0034399312013759726, + "grad_norm": 0.3393482800519383, + "learning_rate": 5.6e-07, + "loss": 0.2953, + "step": 43 + }, + { + "epoch": 0.003519929601407972, + "grad_norm": 0.3025747711461973, + "learning_rate": 5.733333333333334e-07, + "loss": 0.304, + "step": 44 + }, + { + "epoch": 0.003599928001439971, + "grad_norm": 0.2720493658372177, + "learning_rate": 5.866666666666667e-07, + "loss": 0.3414, + "step": 45 + }, + { + "epoch": 0.0036799264014719707, + "grad_norm": 0.3142370602638329, + "learning_rate": 6.000000000000001e-07, + "loss": 0.3004, + "step": 46 + }, + { + "epoch": 0.00375992480150397, + "grad_norm": 0.3323679423072982, + "learning_rate": 6.133333333333333e-07, + "loss": 0.3186, + "step": 47 + }, + { + "epoch": 0.003839923201535969, + "grad_norm": 0.32612763196974376, + "learning_rate": 6.266666666666667e-07, + "loss": 0.3048, + "step": 48 + }, + { + "epoch": 0.003919921601567969, + "grad_norm": 0.28751297529858044, + "learning_rate": 6.4e-07, + "loss": 0.3247, + "step": 49 + }, + { + "epoch": 0.003999920001599968, + "grad_norm": 0.3824429393686504, + "learning_rate": 6.533333333333334e-07, + "loss": 0.2935, + "step": 50 + }, + { + "epoch": 0.004079918401631967, + "grad_norm": 0.3018533488251579, + "learning_rate": 6.666666666666667e-07, + "loss": 0.316, + "step": 51 + }, + { + "epoch": 0.004159916801663967, + "grad_norm": 0.30195343332232594, + "learning_rate": 6.800000000000001e-07, + "loss": 0.3115, + "step": 52 + }, + { + "epoch": 0.004239915201695966, + "grad_norm": 0.3506805684458278, + "learning_rate": 6.933333333333334e-07, + "loss": 0.3171, + "step": 53 + }, + { + "epoch": 0.004319913601727965, + "grad_norm": 0.3185810107485491, + "learning_rate": 7.066666666666667e-07, + "loss": 0.3006, + "step": 54 + }, + { + "epoch": 0.004399912001759965, + "grad_norm": 0.2693358810664699, + "learning_rate": 7.2e-07, + "loss": 0.3206, + "step": 55 + }, + { + "epoch": 0.004479910401791964, + "grad_norm": 0.24392659455219035, + "learning_rate": 7.333333333333334e-07, + "loss": 0.3347, + "step": 56 + }, + { + "epoch": 0.004559908801823964, + "grad_norm": 0.31015163744900837, + "learning_rate": 7.466666666666668e-07, + "loss": 0.3381, + "step": 57 + }, + { + "epoch": 0.004639907201855963, + "grad_norm": 0.3102769361351094, + "learning_rate": 7.6e-07, + "loss": 0.3075, + "step": 58 + }, + { + "epoch": 0.004719905601887962, + "grad_norm": 0.5538706349504963, + "learning_rate": 7.733333333333335e-07, + "loss": 0.2794, + "step": 59 + }, + { + "epoch": 0.004799904001919961, + "grad_norm": 0.2523934291589158, + "learning_rate": 7.866666666666667e-07, + "loss": 0.3431, + "step": 60 + }, + { + "epoch": 0.004879902401951961, + "grad_norm": 0.32621842899414777, + "learning_rate": 8.000000000000001e-07, + "loss": 0.2905, + "step": 61 + }, + { + "epoch": 0.004959900801983961, + "grad_norm": 0.29215921145765517, + "learning_rate": 8.133333333333333e-07, + "loss": 0.3196, + "step": 62 + }, + { + "epoch": 0.00503989920201596, + "grad_norm": 0.2780688840346979, + "learning_rate": 8.266666666666668e-07, + "loss": 0.3203, + "step": 63 + }, + { + "epoch": 0.005119897602047959, + "grad_norm": 0.3351603443264391, + "learning_rate": 8.400000000000001e-07, + "loss": 0.2977, + "step": 64 + }, + { + "epoch": 0.0051998960020799585, + "grad_norm": 0.26823219956448324, + "learning_rate": 8.533333333333334e-07, + "loss": 0.3367, + "step": 65 + }, + { + "epoch": 0.005279894402111958, + "grad_norm": 0.25880892021145024, + "learning_rate": 8.666666666666668e-07, + "loss": 0.3182, + "step": 66 + }, + { + "epoch": 0.005359892802143957, + "grad_norm": 0.3010709774746063, + "learning_rate": 8.8e-07, + "loss": 0.323, + "step": 67 + }, + { + "epoch": 0.005439891202175956, + "grad_norm": 0.3162791184817396, + "learning_rate": 8.933333333333334e-07, + "loss": 0.3075, + "step": 68 + }, + { + "epoch": 0.005519889602207955, + "grad_norm": 0.34011821812360454, + "learning_rate": 9.066666666666668e-07, + "loss": 0.3203, + "step": 69 + }, + { + "epoch": 0.0055998880022399555, + "grad_norm": 0.31415942388867163, + "learning_rate": 9.200000000000001e-07, + "loss": 0.2991, + "step": 70 + }, + { + "epoch": 0.005679886402271955, + "grad_norm": 0.6463148407796293, + "learning_rate": 9.333333333333334e-07, + "loss": 0.3208, + "step": 71 + }, + { + "epoch": 0.005759884802303954, + "grad_norm": 0.29028764402440893, + "learning_rate": 9.466666666666667e-07, + "loss": 0.2996, + "step": 72 + }, + { + "epoch": 0.005839883202335953, + "grad_norm": 0.26343528569421293, + "learning_rate": 9.600000000000001e-07, + "loss": 0.2955, + "step": 73 + }, + { + "epoch": 0.0059198816023679525, + "grad_norm": 0.24518518576843773, + "learning_rate": 9.733333333333333e-07, + "loss": 0.3423, + "step": 74 + }, + { + "epoch": 0.005999880002399952, + "grad_norm": 0.3090146188968856, + "learning_rate": 9.866666666666668e-07, + "loss": 0.2769, + "step": 75 + }, + { + "epoch": 0.006079878402431951, + "grad_norm": 0.32206118968734543, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.2957, + "step": 76 + }, + { + "epoch": 0.006159876802463951, + "grad_norm": 0.31197354570045155, + "learning_rate": 1.0133333333333333e-06, + "loss": 0.3135, + "step": 77 + }, + { + "epoch": 0.00623987520249595, + "grad_norm": 0.26084204240920733, + "learning_rate": 1.0266666666666669e-06, + "loss": 0.3344, + "step": 78 + }, + { + "epoch": 0.0063198736025279496, + "grad_norm": 0.5554863834434538, + "learning_rate": 1.04e-06, + "loss": 0.2986, + "step": 79 + }, + { + "epoch": 0.006399872002559949, + "grad_norm": 0.3437186634680115, + "learning_rate": 1.0533333333333333e-06, + "loss": 0.2904, + "step": 80 + }, + { + "epoch": 0.006479870402591948, + "grad_norm": 0.3449567726972603, + "learning_rate": 1.066666666666667e-06, + "loss": 0.3552, + "step": 81 + }, + { + "epoch": 0.006559868802623947, + "grad_norm": 0.34546186732465545, + "learning_rate": 1.08e-06, + "loss": 0.3182, + "step": 82 + }, + { + "epoch": 0.0066398672026559465, + "grad_norm": 0.2914646622905985, + "learning_rate": 1.0933333333333334e-06, + "loss": 0.3257, + "step": 83 + }, + { + "epoch": 0.006719865602687947, + "grad_norm": 0.31653187802750293, + "learning_rate": 1.1066666666666667e-06, + "loss": 0.3203, + "step": 84 + }, + { + "epoch": 0.006799864002719946, + "grad_norm": 0.24428581351680245, + "learning_rate": 1.12e-06, + "loss": 0.3329, + "step": 85 + }, + { + "epoch": 0.006879862402751945, + "grad_norm": 0.3264501191696231, + "learning_rate": 1.1333333333333334e-06, + "loss": 0.2985, + "step": 86 + }, + { + "epoch": 0.006959860802783944, + "grad_norm": 0.331657711723914, + "learning_rate": 1.1466666666666668e-06, + "loss": 0.296, + "step": 87 + }, + { + "epoch": 0.007039859202815944, + "grad_norm": 0.3482337100975519, + "learning_rate": 1.1600000000000001e-06, + "loss": 0.3218, + "step": 88 + }, + { + "epoch": 0.007119857602847943, + "grad_norm": 0.3231501703266636, + "learning_rate": 1.1733333333333335e-06, + "loss": 0.328, + "step": 89 + }, + { + "epoch": 0.007199856002879942, + "grad_norm": 0.26271769233482944, + "learning_rate": 1.1866666666666668e-06, + "loss": 0.3523, + "step": 90 + }, + { + "epoch": 0.007279854402911942, + "grad_norm": 0.3271718103928566, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.2787, + "step": 91 + }, + { + "epoch": 0.007359852802943941, + "grad_norm": 0.2926708153372138, + "learning_rate": 1.2133333333333335e-06, + "loss": 0.3081, + "step": 92 + }, + { + "epoch": 0.007439851202975941, + "grad_norm": 0.35069411873426115, + "learning_rate": 1.2266666666666666e-06, + "loss": 0.3011, + "step": 93 + }, + { + "epoch": 0.00751984960300794, + "grad_norm": 0.2632834455201133, + "learning_rate": 1.2400000000000002e-06, + "loss": 0.3406, + "step": 94 + }, + { + "epoch": 0.007599848003039939, + "grad_norm": 0.28331254478310913, + "learning_rate": 1.2533333333333333e-06, + "loss": 0.3247, + "step": 95 + }, + { + "epoch": 0.007679846403071938, + "grad_norm": 0.38123973784502396, + "learning_rate": 1.2666666666666669e-06, + "loss": 0.295, + "step": 96 + }, + { + "epoch": 0.007759844803103938, + "grad_norm": 0.26652668798241846, + "learning_rate": 1.28e-06, + "loss": 0.3176, + "step": 97 + }, + { + "epoch": 0.007839843203135938, + "grad_norm": 0.24613725839199013, + "learning_rate": 1.2933333333333334e-06, + "loss": 0.3442, + "step": 98 + }, + { + "epoch": 0.007919841603167936, + "grad_norm": 0.3396773807987067, + "learning_rate": 1.3066666666666667e-06, + "loss": 0.3068, + "step": 99 + }, + { + "epoch": 0.007999840003199936, + "grad_norm": 0.2711808553445019, + "learning_rate": 1.32e-06, + "loss": 0.3127, + "step": 100 + }, + { + "epoch": 0.008079838403231935, + "grad_norm": 0.32447170953166365, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.298, + "step": 101 + }, + { + "epoch": 0.008159836803263935, + "grad_norm": 0.34664107574619685, + "learning_rate": 1.3466666666666668e-06, + "loss": 0.2921, + "step": 102 + }, + { + "epoch": 0.008239835203295935, + "grad_norm": 0.3120193430526548, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.3122, + "step": 103 + }, + { + "epoch": 0.008319833603327933, + "grad_norm": 0.22406001378841364, + "learning_rate": 1.3733333333333335e-06, + "loss": 0.3299, + "step": 104 + }, + { + "epoch": 0.008399832003359933, + "grad_norm": 0.2073963799155983, + "learning_rate": 1.3866666666666668e-06, + "loss": 0.3748, + "step": 105 + }, + { + "epoch": 0.008479830403391932, + "grad_norm": 0.3018638438274108, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.3232, + "step": 106 + }, + { + "epoch": 0.008559828803423932, + "grad_norm": 0.2682173287690338, + "learning_rate": 1.4133333333333335e-06, + "loss": 0.303, + "step": 107 + }, + { + "epoch": 0.00863982720345593, + "grad_norm": 0.3308859366521956, + "learning_rate": 1.4266666666666668e-06, + "loss": 0.265, + "step": 108 + }, + { + "epoch": 0.00871982560348793, + "grad_norm": 0.3261373953947699, + "learning_rate": 1.44e-06, + "loss": 0.3287, + "step": 109 + }, + { + "epoch": 0.00879982400351993, + "grad_norm": 0.24759434596082106, + "learning_rate": 1.4533333333333335e-06, + "loss": 0.3327, + "step": 110 + }, + { + "epoch": 0.008879822403551929, + "grad_norm": 0.2861526656612504, + "learning_rate": 1.4666666666666669e-06, + "loss": 0.331, + "step": 111 + }, + { + "epoch": 0.008959820803583929, + "grad_norm": 0.3907065597658347, + "learning_rate": 1.48e-06, + "loss": 0.3278, + "step": 112 + }, + { + "epoch": 0.009039819203615927, + "grad_norm": 0.3361061876380628, + "learning_rate": 1.4933333333333336e-06, + "loss": 0.2867, + "step": 113 + }, + { + "epoch": 0.009119817603647927, + "grad_norm": 0.2737033370360045, + "learning_rate": 1.506666666666667e-06, + "loss": 0.3141, + "step": 114 + }, + { + "epoch": 0.009199816003679926, + "grad_norm": 0.32057287945733126, + "learning_rate": 1.52e-06, + "loss": 0.3078, + "step": 115 + }, + { + "epoch": 0.009279814403711926, + "grad_norm": 0.2880083112543209, + "learning_rate": 1.5333333333333334e-06, + "loss": 0.3265, + "step": 116 + }, + { + "epoch": 0.009359812803743926, + "grad_norm": 0.3068585166271396, + "learning_rate": 1.546666666666667e-06, + "loss": 0.3072, + "step": 117 + }, + { + "epoch": 0.009439811203775924, + "grad_norm": 0.3310708719534575, + "learning_rate": 1.56e-06, + "loss": 0.3308, + "step": 118 + }, + { + "epoch": 0.009519809603807924, + "grad_norm": 0.35319095655987787, + "learning_rate": 1.5733333333333334e-06, + "loss": 0.2979, + "step": 119 + }, + { + "epoch": 0.009599808003839923, + "grad_norm": 0.38291022014761195, + "learning_rate": 1.586666666666667e-06, + "loss": 0.3106, + "step": 120 + }, + { + "epoch": 0.009679806403871923, + "grad_norm": 0.28039860675214856, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.3451, + "step": 121 + }, + { + "epoch": 0.009759804803903921, + "grad_norm": 0.30434607561402777, + "learning_rate": 1.6133333333333335e-06, + "loss": 0.2993, + "step": 122 + }, + { + "epoch": 0.009839803203935921, + "grad_norm": 0.2814533190285778, + "learning_rate": 1.6266666666666666e-06, + "loss": 0.3162, + "step": 123 + }, + { + "epoch": 0.009919801603967921, + "grad_norm": 0.3068835840774817, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.3153, + "step": 124 + }, + { + "epoch": 0.00999980000399992, + "grad_norm": 0.30208460253548947, + "learning_rate": 1.6533333333333335e-06, + "loss": 0.3212, + "step": 125 + }, + { + "epoch": 0.01007979840403192, + "grad_norm": 0.23661646141165826, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.3918, + "step": 126 + }, + { + "epoch": 0.010159796804063918, + "grad_norm": 0.2982549924909667, + "learning_rate": 1.6800000000000002e-06, + "loss": 0.3343, + "step": 127 + }, + { + "epoch": 0.010239795204095918, + "grad_norm": 0.2762048716470555, + "learning_rate": 1.6933333333333336e-06, + "loss": 0.3016, + "step": 128 + }, + { + "epoch": 0.010319793604127917, + "grad_norm": 0.28168626997008966, + "learning_rate": 1.7066666666666667e-06, + "loss": 0.3308, + "step": 129 + }, + { + "epoch": 0.010399792004159917, + "grad_norm": 0.2717980676095484, + "learning_rate": 1.72e-06, + "loss": 0.3179, + "step": 130 + }, + { + "epoch": 0.010479790404191917, + "grad_norm": 0.2769762403645166, + "learning_rate": 1.7333333333333336e-06, + "loss": 0.3048, + "step": 131 + }, + { + "epoch": 0.010559788804223915, + "grad_norm": 0.7155079454198295, + "learning_rate": 1.7466666666666667e-06, + "loss": 0.3132, + "step": 132 + }, + { + "epoch": 0.010639787204255916, + "grad_norm": 0.32604722004289993, + "learning_rate": 1.76e-06, + "loss": 0.2984, + "step": 133 + }, + { + "epoch": 0.010719785604287914, + "grad_norm": 0.28082026435241464, + "learning_rate": 1.7733333333333336e-06, + "loss": 0.3271, + "step": 134 + }, + { + "epoch": 0.010799784004319914, + "grad_norm": 0.2580492809834986, + "learning_rate": 1.7866666666666668e-06, + "loss": 0.3271, + "step": 135 + }, + { + "epoch": 0.010879782404351912, + "grad_norm": 0.23186860912525978, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.3393, + "step": 136 + }, + { + "epoch": 0.010959780804383912, + "grad_norm": 0.24050673417079582, + "learning_rate": 1.8133333333333337e-06, + "loss": 0.3286, + "step": 137 + }, + { + "epoch": 0.01103977920441591, + "grad_norm": 0.33944889101431586, + "learning_rate": 1.8266666666666668e-06, + "loss": 0.3256, + "step": 138 + }, + { + "epoch": 0.011119777604447911, + "grad_norm": 0.34861335103444774, + "learning_rate": 1.8400000000000002e-06, + "loss": 0.3147, + "step": 139 + }, + { + "epoch": 0.011199776004479911, + "grad_norm": 0.3054947283984174, + "learning_rate": 1.8533333333333333e-06, + "loss": 0.277, + "step": 140 + }, + { + "epoch": 0.01127977440451191, + "grad_norm": 0.25707925448175856, + "learning_rate": 1.8666666666666669e-06, + "loss": 0.3382, + "step": 141 + }, + { + "epoch": 0.01135977280454391, + "grad_norm": 0.33699829514137974, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.2822, + "step": 142 + }, + { + "epoch": 0.011439771204575908, + "grad_norm": 0.35804014393151257, + "learning_rate": 1.8933333333333333e-06, + "loss": 0.3177, + "step": 143 + }, + { + "epoch": 0.011519769604607908, + "grad_norm": 0.32390316520716644, + "learning_rate": 1.906666666666667e-06, + "loss": 0.2732, + "step": 144 + }, + { + "epoch": 0.011599768004639906, + "grad_norm": 0.2814263862802696, + "learning_rate": 1.9200000000000003e-06, + "loss": 0.3068, + "step": 145 + }, + { + "epoch": 0.011679766404671907, + "grad_norm": 0.2824335737318314, + "learning_rate": 1.9333333333333336e-06, + "loss": 0.3458, + "step": 146 + }, + { + "epoch": 0.011759764804703907, + "grad_norm": 0.2526526201951397, + "learning_rate": 1.9466666666666665e-06, + "loss": 0.3573, + "step": 147 + }, + { + "epoch": 0.011839763204735905, + "grad_norm": 0.3315184280731441, + "learning_rate": 1.9600000000000003e-06, + "loss": 0.327, + "step": 148 + }, + { + "epoch": 0.011919761604767905, + "grad_norm": 0.2996394862715356, + "learning_rate": 1.9733333333333336e-06, + "loss": 0.297, + "step": 149 + }, + { + "epoch": 0.011999760004799903, + "grad_norm": 0.28557116476251826, + "learning_rate": 1.9866666666666666e-06, + "loss": 0.3164, + "step": 150 + }, + { + "epoch": 0.012079758404831904, + "grad_norm": 0.3474585203182206, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.3063, + "step": 151 + }, + { + "epoch": 0.012159756804863902, + "grad_norm": 0.2838233309413948, + "learning_rate": 2.0133333333333337e-06, + "loss": 0.3179, + "step": 152 + }, + { + "epoch": 0.012239755204895902, + "grad_norm": 0.27615612365628756, + "learning_rate": 2.0266666666666666e-06, + "loss": 0.3108, + "step": 153 + }, + { + "epoch": 0.012319753604927902, + "grad_norm": 0.24142417286479195, + "learning_rate": 2.04e-06, + "loss": 0.3283, + "step": 154 + }, + { + "epoch": 0.0123997520049599, + "grad_norm": 0.290926430945227, + "learning_rate": 2.0533333333333337e-06, + "loss": 0.3385, + "step": 155 + }, + { + "epoch": 0.0124797504049919, + "grad_norm": 0.3132276711601257, + "learning_rate": 2.0666666666666666e-06, + "loss": 0.2807, + "step": 156 + }, + { + "epoch": 0.012559748805023899, + "grad_norm": 0.34458617055163127, + "learning_rate": 2.08e-06, + "loss": 0.3206, + "step": 157 + }, + { + "epoch": 0.012639747205055899, + "grad_norm": 0.3023273595363232, + "learning_rate": 2.0933333333333338e-06, + "loss": 0.3014, + "step": 158 + }, + { + "epoch": 0.012719745605087898, + "grad_norm": 0.25745242084237335, + "learning_rate": 2.1066666666666667e-06, + "loss": 0.3254, + "step": 159 + }, + { + "epoch": 0.012799744005119898, + "grad_norm": 0.32493655477608646, + "learning_rate": 2.12e-06, + "loss": 0.2883, + "step": 160 + }, + { + "epoch": 0.012879742405151898, + "grad_norm": 0.28080304349663776, + "learning_rate": 2.133333333333334e-06, + "loss": 0.3022, + "step": 161 + }, + { + "epoch": 0.012959740805183896, + "grad_norm": 0.32367069405436416, + "learning_rate": 2.1466666666666667e-06, + "loss": 0.298, + "step": 162 + }, + { + "epoch": 0.013039739205215896, + "grad_norm": 0.4864549162381663, + "learning_rate": 2.16e-06, + "loss": 0.328, + "step": 163 + }, + { + "epoch": 0.013119737605247895, + "grad_norm": 0.29610513405004935, + "learning_rate": 2.1733333333333334e-06, + "loss": 0.3253, + "step": 164 + }, + { + "epoch": 0.013199736005279895, + "grad_norm": 0.3151967084316128, + "learning_rate": 2.1866666666666668e-06, + "loss": 0.2984, + "step": 165 + }, + { + "epoch": 0.013279734405311893, + "grad_norm": 0.3916853549551865, + "learning_rate": 2.2e-06, + "loss": 0.2801, + "step": 166 + }, + { + "epoch": 0.013359732805343893, + "grad_norm": 0.30601304519609385, + "learning_rate": 2.2133333333333335e-06, + "loss": 0.2888, + "step": 167 + }, + { + "epoch": 0.013439731205375893, + "grad_norm": 0.3381241678567355, + "learning_rate": 2.226666666666667e-06, + "loss": 0.2906, + "step": 168 + }, + { + "epoch": 0.013519729605407892, + "grad_norm": 0.28894608440476305, + "learning_rate": 2.24e-06, + "loss": 0.3187, + "step": 169 + }, + { + "epoch": 0.013599728005439892, + "grad_norm": 0.22955562250062986, + "learning_rate": 2.2533333333333335e-06, + "loss": 0.3267, + "step": 170 + }, + { + "epoch": 0.01367972640547189, + "grad_norm": 0.30934833995723104, + "learning_rate": 2.266666666666667e-06, + "loss": 0.2846, + "step": 171 + }, + { + "epoch": 0.01375972480550389, + "grad_norm": 0.30865767644143693, + "learning_rate": 2.28e-06, + "loss": 0.2863, + "step": 172 + }, + { + "epoch": 0.013839723205535889, + "grad_norm": 0.34305395987637777, + "learning_rate": 2.2933333333333335e-06, + "loss": 0.3204, + "step": 173 + }, + { + "epoch": 0.013919721605567889, + "grad_norm": 0.6126986499342134, + "learning_rate": 2.306666666666667e-06, + "loss": 0.2926, + "step": 174 + }, + { + "epoch": 0.013999720005599889, + "grad_norm": 0.2469095808190779, + "learning_rate": 2.3200000000000002e-06, + "loss": 0.3504, + "step": 175 + }, + { + "epoch": 0.014079718405631887, + "grad_norm": 0.5492167929499058, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.3077, + "step": 176 + }, + { + "epoch": 0.014159716805663887, + "grad_norm": 0.25598488910292205, + "learning_rate": 2.346666666666667e-06, + "loss": 0.3102, + "step": 177 + }, + { + "epoch": 0.014239715205695886, + "grad_norm": 0.3168172064990585, + "learning_rate": 2.3600000000000003e-06, + "loss": 0.2991, + "step": 178 + }, + { + "epoch": 0.014319713605727886, + "grad_norm": 0.3075481865433467, + "learning_rate": 2.3733333333333336e-06, + "loss": 0.2994, + "step": 179 + }, + { + "epoch": 0.014399712005759884, + "grad_norm": 0.3251897713263359, + "learning_rate": 2.386666666666667e-06, + "loss": 0.2996, + "step": 180 + }, + { + "epoch": 0.014479710405791884, + "grad_norm": 0.27217892447144404, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.3358, + "step": 181 + }, + { + "epoch": 0.014559708805823884, + "grad_norm": 0.3455998928690932, + "learning_rate": 2.4133333333333337e-06, + "loss": 0.3328, + "step": 182 + }, + { + "epoch": 0.014639707205855883, + "grad_norm": 0.2731181827584468, + "learning_rate": 2.426666666666667e-06, + "loss": 0.3282, + "step": 183 + }, + { + "epoch": 0.014719705605887883, + "grad_norm": 0.28232584736296523, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.3025, + "step": 184 + }, + { + "epoch": 0.014799704005919881, + "grad_norm": 0.30617747120959193, + "learning_rate": 2.4533333333333333e-06, + "loss": 0.2767, + "step": 185 + }, + { + "epoch": 0.014879702405951881, + "grad_norm": 0.26242050632790365, + "learning_rate": 2.466666666666667e-06, + "loss": 0.386, + "step": 186 + }, + { + "epoch": 0.01495970080598388, + "grad_norm": 0.33121529206856665, + "learning_rate": 2.4800000000000004e-06, + "loss": 0.2838, + "step": 187 + }, + { + "epoch": 0.01503969920601588, + "grad_norm": 0.249645253775799, + "learning_rate": 2.4933333333333333e-06, + "loss": 0.3549, + "step": 188 + }, + { + "epoch": 0.015119697606047878, + "grad_norm": 0.2342341076903019, + "learning_rate": 2.5066666666666667e-06, + "loss": 0.3365, + "step": 189 + }, + { + "epoch": 0.015199696006079878, + "grad_norm": 0.3104847568722879, + "learning_rate": 2.52e-06, + "loss": 0.2963, + "step": 190 + }, + { + "epoch": 0.015279694406111878, + "grad_norm": 0.3170305859768765, + "learning_rate": 2.5333333333333338e-06, + "loss": 0.276, + "step": 191 + }, + { + "epoch": 0.015359692806143877, + "grad_norm": 0.3483537982466923, + "learning_rate": 2.5466666666666667e-06, + "loss": 0.2738, + "step": 192 + }, + { + "epoch": 0.015439691206175877, + "grad_norm": 0.3105215828837361, + "learning_rate": 2.56e-06, + "loss": 0.33, + "step": 193 + }, + { + "epoch": 0.015519689606207875, + "grad_norm": 0.27326728991656857, + "learning_rate": 2.573333333333334e-06, + "loss": 0.3052, + "step": 194 + }, + { + "epoch": 0.015599688006239875, + "grad_norm": 0.33989069089174095, + "learning_rate": 2.5866666666666667e-06, + "loss": 0.3009, + "step": 195 + }, + { + "epoch": 0.015679686406271875, + "grad_norm": 0.28612282268628964, + "learning_rate": 2.6e-06, + "loss": 0.3059, + "step": 196 + }, + { + "epoch": 0.015759684806303874, + "grad_norm": 0.2718603913846913, + "learning_rate": 2.6133333333333334e-06, + "loss": 0.3011, + "step": 197 + }, + { + "epoch": 0.015839683206335872, + "grad_norm": 0.2614884604702425, + "learning_rate": 2.6266666666666668e-06, + "loss": 0.3548, + "step": 198 + }, + { + "epoch": 0.015919681606367874, + "grad_norm": 0.27931581114935067, + "learning_rate": 2.64e-06, + "loss": 0.3019, + "step": 199 + }, + { + "epoch": 0.015999680006399872, + "grad_norm": 0.2915159304805085, + "learning_rate": 2.6533333333333335e-06, + "loss": 0.3341, + "step": 200 + }, + { + "epoch": 0.01607967840643187, + "grad_norm": 0.2926927148294065, + "learning_rate": 2.666666666666667e-06, + "loss": 0.319, + "step": 201 + }, + { + "epoch": 0.01615967680646387, + "grad_norm": 0.3312925133896497, + "learning_rate": 2.68e-06, + "loss": 0.2729, + "step": 202 + }, + { + "epoch": 0.01623967520649587, + "grad_norm": 0.2179060164387047, + "learning_rate": 2.6933333333333335e-06, + "loss": 0.3681, + "step": 203 + }, + { + "epoch": 0.01631967360652787, + "grad_norm": 0.3347584423990397, + "learning_rate": 2.706666666666667e-06, + "loss": 0.2767, + "step": 204 + }, + { + "epoch": 0.016399672006559868, + "grad_norm": 0.33366562549913675, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.2604, + "step": 205 + }, + { + "epoch": 0.01647967040659187, + "grad_norm": 0.26367098633160124, + "learning_rate": 2.7333333333333336e-06, + "loss": 0.3546, + "step": 206 + }, + { + "epoch": 0.016559668806623868, + "grad_norm": 0.27008295534598054, + "learning_rate": 2.746666666666667e-06, + "loss": 0.3559, + "step": 207 + }, + { + "epoch": 0.016639667206655866, + "grad_norm": 0.3291417108207968, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.2895, + "step": 208 + }, + { + "epoch": 0.016719665606687865, + "grad_norm": 0.32160900992127883, + "learning_rate": 2.7733333333333336e-06, + "loss": 0.3265, + "step": 209 + }, + { + "epoch": 0.016799664006719867, + "grad_norm": 0.3248210046270003, + "learning_rate": 2.786666666666667e-06, + "loss": 0.2655, + "step": 210 + }, + { + "epoch": 0.016879662406751865, + "grad_norm": 0.3115821033219484, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.3139, + "step": 211 + }, + { + "epoch": 0.016959660806783863, + "grad_norm": 0.3563420955853021, + "learning_rate": 2.8133333333333336e-06, + "loss": 0.2991, + "step": 212 + }, + { + "epoch": 0.017039659206815865, + "grad_norm": 0.3328634897760365, + "learning_rate": 2.826666666666667e-06, + "loss": 0.2986, + "step": 213 + }, + { + "epoch": 0.017119657606847864, + "grad_norm": 0.33190746734616233, + "learning_rate": 2.84e-06, + "loss": 0.3162, + "step": 214 + }, + { + "epoch": 0.017199656006879862, + "grad_norm": 0.3198933164469127, + "learning_rate": 2.8533333333333337e-06, + "loss": 0.3012, + "step": 215 + }, + { + "epoch": 0.01727965440691186, + "grad_norm": 0.27769053245211817, + "learning_rate": 2.866666666666667e-06, + "loss": 0.2983, + "step": 216 + }, + { + "epoch": 0.017359652806943862, + "grad_norm": 0.2343062711718492, + "learning_rate": 2.88e-06, + "loss": 0.34, + "step": 217 + }, + { + "epoch": 0.01743965120697586, + "grad_norm": 0.24325308505926416, + "learning_rate": 2.8933333333333337e-06, + "loss": 0.3422, + "step": 218 + }, + { + "epoch": 0.01751964960700786, + "grad_norm": 0.308235214312061, + "learning_rate": 2.906666666666667e-06, + "loss": 0.3263, + "step": 219 + }, + { + "epoch": 0.01759964800703986, + "grad_norm": 0.34781277296217233, + "learning_rate": 2.92e-06, + "loss": 0.2939, + "step": 220 + }, + { + "epoch": 0.01767964640707186, + "grad_norm": 0.34775821006879976, + "learning_rate": 2.9333333333333338e-06, + "loss": 0.3055, + "step": 221 + }, + { + "epoch": 0.017759644807103857, + "grad_norm": 0.27283654468903656, + "learning_rate": 2.946666666666667e-06, + "loss": 0.3121, + "step": 222 + }, + { + "epoch": 0.017839643207135856, + "grad_norm": 0.2615045670797844, + "learning_rate": 2.96e-06, + "loss": 0.3394, + "step": 223 + }, + { + "epoch": 0.017919641607167858, + "grad_norm": 0.28460403063393813, + "learning_rate": 2.973333333333334e-06, + "loss": 0.2963, + "step": 224 + }, + { + "epoch": 0.017999640007199856, + "grad_norm": 0.2769691341015105, + "learning_rate": 2.986666666666667e-06, + "loss": 0.3057, + "step": 225 + }, + { + "epoch": 0.018079638407231854, + "grad_norm": 0.3349651366664022, + "learning_rate": 3e-06, + "loss": 0.3228, + "step": 226 + }, + { + "epoch": 0.018159636807263856, + "grad_norm": 0.26785345594532933, + "learning_rate": 3.013333333333334e-06, + "loss": 0.3615, + "step": 227 + }, + { + "epoch": 0.018239635207295855, + "grad_norm": 0.2591169067604971, + "learning_rate": 3.0266666666666668e-06, + "loss": 0.3542, + "step": 228 + }, + { + "epoch": 0.018319633607327853, + "grad_norm": 0.22763209752794938, + "learning_rate": 3.04e-06, + "loss": 0.3627, + "step": 229 + }, + { + "epoch": 0.01839963200735985, + "grad_norm": 0.3218481945467067, + "learning_rate": 3.053333333333334e-06, + "loss": 0.3161, + "step": 230 + }, + { + "epoch": 0.018479630407391853, + "grad_norm": 0.3462518883470296, + "learning_rate": 3.066666666666667e-06, + "loss": 0.2941, + "step": 231 + }, + { + "epoch": 0.01855962880742385, + "grad_norm": 0.3051645638246147, + "learning_rate": 3.08e-06, + "loss": 0.2911, + "step": 232 + }, + { + "epoch": 0.01863962720745585, + "grad_norm": 0.28833639974298064, + "learning_rate": 3.093333333333334e-06, + "loss": 0.3289, + "step": 233 + }, + { + "epoch": 0.018719625607487852, + "grad_norm": 0.2919252438417406, + "learning_rate": 3.106666666666667e-06, + "loss": 0.3028, + "step": 234 + }, + { + "epoch": 0.01879962400751985, + "grad_norm": 0.3060051978435512, + "learning_rate": 3.12e-06, + "loss": 0.3081, + "step": 235 + }, + { + "epoch": 0.01887962240755185, + "grad_norm": 0.26465653917640974, + "learning_rate": 3.133333333333334e-06, + "loss": 0.3343, + "step": 236 + }, + { + "epoch": 0.018959620807583847, + "grad_norm": 0.26619948422474793, + "learning_rate": 3.146666666666667e-06, + "loss": 0.3242, + "step": 237 + }, + { + "epoch": 0.01903961920761585, + "grad_norm": 0.4546845613794587, + "learning_rate": 3.1600000000000002e-06, + "loss": 0.2977, + "step": 238 + }, + { + "epoch": 0.019119617607647847, + "grad_norm": 0.2817255172513679, + "learning_rate": 3.173333333333334e-06, + "loss": 0.3271, + "step": 239 + }, + { + "epoch": 0.019199616007679846, + "grad_norm": 0.19494477035554048, + "learning_rate": 3.186666666666667e-06, + "loss": 0.3957, + "step": 240 + }, + { + "epoch": 0.019279614407711847, + "grad_norm": 0.31053646058844525, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.2888, + "step": 241 + }, + { + "epoch": 0.019359612807743846, + "grad_norm": 0.3102409308782587, + "learning_rate": 3.213333333333334e-06, + "loss": 0.2894, + "step": 242 + }, + { + "epoch": 0.019439611207775844, + "grad_norm": 0.31258675349254783, + "learning_rate": 3.226666666666667e-06, + "loss": 0.3442, + "step": 243 + }, + { + "epoch": 0.019519609607807843, + "grad_norm": 0.24097887922992603, + "learning_rate": 3.2400000000000003e-06, + "loss": 0.3371, + "step": 244 + }, + { + "epoch": 0.019599608007839844, + "grad_norm": 0.2714303907685429, + "learning_rate": 3.2533333333333332e-06, + "loss": 0.3193, + "step": 245 + }, + { + "epoch": 0.019679606407871843, + "grad_norm": 0.3043463393020328, + "learning_rate": 3.266666666666667e-06, + "loss": 0.3245, + "step": 246 + }, + { + "epoch": 0.01975960480790384, + "grad_norm": 0.22279006432469095, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.3263, + "step": 247 + }, + { + "epoch": 0.019839603207935843, + "grad_norm": 0.27346275057272, + "learning_rate": 3.2933333333333333e-06, + "loss": 0.2966, + "step": 248 + }, + { + "epoch": 0.01991960160796784, + "grad_norm": 0.2338181782737284, + "learning_rate": 3.306666666666667e-06, + "loss": 0.3484, + "step": 249 + }, + { + "epoch": 0.01999960000799984, + "grad_norm": 0.5860536705677688, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.3222, + "step": 250 + }, + { + "epoch": 0.020079598408031838, + "grad_norm": 0.285745082606006, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.3388, + "step": 251 + }, + { + "epoch": 0.02015959680806384, + "grad_norm": 0.28754198669729975, + "learning_rate": 3.346666666666667e-06, + "loss": 0.3248, + "step": 252 + }, + { + "epoch": 0.02023959520809584, + "grad_norm": 0.29454098602666395, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.3337, + "step": 253 + }, + { + "epoch": 0.020319593608127837, + "grad_norm": 0.31440723855089225, + "learning_rate": 3.3733333333333334e-06, + "loss": 0.2931, + "step": 254 + }, + { + "epoch": 0.02039959200815984, + "grad_norm": 0.2466671934492438, + "learning_rate": 3.386666666666667e-06, + "loss": 0.3158, + "step": 255 + }, + { + "epoch": 0.020479590408191837, + "grad_norm": 0.35549219828394024, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.2902, + "step": 256 + }, + { + "epoch": 0.020559588808223835, + "grad_norm": 0.26736274497055573, + "learning_rate": 3.4133333333333334e-06, + "loss": 0.3105, + "step": 257 + }, + { + "epoch": 0.020639587208255834, + "grad_norm": 0.24379641065263255, + "learning_rate": 3.426666666666667e-06, + "loss": 0.337, + "step": 258 + }, + { + "epoch": 0.020719585608287835, + "grad_norm": 0.26720088550300264, + "learning_rate": 3.44e-06, + "loss": 0.2836, + "step": 259 + }, + { + "epoch": 0.020799584008319834, + "grad_norm": 0.29918995189544784, + "learning_rate": 3.4533333333333334e-06, + "loss": 0.3187, + "step": 260 + }, + { + "epoch": 0.020879582408351832, + "grad_norm": 0.2908575401893748, + "learning_rate": 3.4666666666666672e-06, + "loss": 0.3534, + "step": 261 + }, + { + "epoch": 0.020959580808383834, + "grad_norm": 0.20321947169654234, + "learning_rate": 3.48e-06, + "loss": 0.3518, + "step": 262 + }, + { + "epoch": 0.021039579208415832, + "grad_norm": 0.27410965008950094, + "learning_rate": 3.4933333333333335e-06, + "loss": 0.308, + "step": 263 + }, + { + "epoch": 0.02111957760844783, + "grad_norm": 0.3236580142952976, + "learning_rate": 3.5066666666666673e-06, + "loss": 0.3167, + "step": 264 + }, + { + "epoch": 0.02119957600847983, + "grad_norm": 0.33185429800955873, + "learning_rate": 3.52e-06, + "loss": 0.3114, + "step": 265 + }, + { + "epoch": 0.02127957440851183, + "grad_norm": 0.2561620835312766, + "learning_rate": 3.5333333333333335e-06, + "loss": 0.3494, + "step": 266 + }, + { + "epoch": 0.02135957280854383, + "grad_norm": 0.3412288207221602, + "learning_rate": 3.5466666666666673e-06, + "loss": 0.3035, + "step": 267 + }, + { + "epoch": 0.021439571208575828, + "grad_norm": 0.29847707968616566, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.2871, + "step": 268 + }, + { + "epoch": 0.021519569608607826, + "grad_norm": 0.28849358223805194, + "learning_rate": 3.5733333333333336e-06, + "loss": 0.3141, + "step": 269 + }, + { + "epoch": 0.021599568008639828, + "grad_norm": 0.3262848070532352, + "learning_rate": 3.5866666666666673e-06, + "loss": 0.2829, + "step": 270 + }, + { + "epoch": 0.021679566408671826, + "grad_norm": 0.2810840132422428, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.3146, + "step": 271 + }, + { + "epoch": 0.021759564808703825, + "grad_norm": 0.2507903447564924, + "learning_rate": 3.6133333333333336e-06, + "loss": 0.3319, + "step": 272 + }, + { + "epoch": 0.021839563208735827, + "grad_norm": 0.25539464054896616, + "learning_rate": 3.6266666666666674e-06, + "loss": 0.3488, + "step": 273 + }, + { + "epoch": 0.021919561608767825, + "grad_norm": 0.2850274726617135, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.318, + "step": 274 + }, + { + "epoch": 0.021999560008799823, + "grad_norm": 0.2888100552701197, + "learning_rate": 3.6533333333333336e-06, + "loss": 0.3178, + "step": 275 + }, + { + "epoch": 0.02207955840883182, + "grad_norm": 0.31353112975835506, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.3249, + "step": 276 + }, + { + "epoch": 0.022159556808863824, + "grad_norm": 0.2468922489531051, + "learning_rate": 3.6800000000000003e-06, + "loss": 0.3302, + "step": 277 + }, + { + "epoch": 0.022239555208895822, + "grad_norm": 0.25082212114864855, + "learning_rate": 3.6933333333333337e-06, + "loss": 0.348, + "step": 278 + }, + { + "epoch": 0.02231955360892782, + "grad_norm": 0.2829664029581924, + "learning_rate": 3.7066666666666666e-06, + "loss": 0.2986, + "step": 279 + }, + { + "epoch": 0.022399552008959822, + "grad_norm": 0.32273623185519507, + "learning_rate": 3.7200000000000004e-06, + "loss": 0.2948, + "step": 280 + }, + { + "epoch": 0.02247955040899182, + "grad_norm": 0.23166747886010355, + "learning_rate": 3.7333333333333337e-06, + "loss": 0.3714, + "step": 281 + }, + { + "epoch": 0.02255954880902382, + "grad_norm": 0.29795140352711585, + "learning_rate": 3.7466666666666667e-06, + "loss": 0.3234, + "step": 282 + }, + { + "epoch": 0.022639547209055817, + "grad_norm": 0.2843383420956478, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.3229, + "step": 283 + }, + { + "epoch": 0.02271954560908782, + "grad_norm": 0.3224715369377984, + "learning_rate": 3.7733333333333338e-06, + "loss": 0.2782, + "step": 284 + }, + { + "epoch": 0.022799544009119817, + "grad_norm": 0.25562952206538647, + "learning_rate": 3.7866666666666667e-06, + "loss": 0.3489, + "step": 285 + }, + { + "epoch": 0.022879542409151816, + "grad_norm": 0.3166754235113625, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.2673, + "step": 286 + }, + { + "epoch": 0.022959540809183818, + "grad_norm": 0.2997155781122687, + "learning_rate": 3.813333333333334e-06, + "loss": 0.3084, + "step": 287 + }, + { + "epoch": 0.023039539209215816, + "grad_norm": 0.30596102593439195, + "learning_rate": 3.826666666666667e-06, + "loss": 0.3125, + "step": 288 + }, + { + "epoch": 0.023119537609247814, + "grad_norm": 0.3209867136184037, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.2846, + "step": 289 + }, + { + "epoch": 0.023199536009279813, + "grad_norm": 0.19881600823457748, + "learning_rate": 3.853333333333334e-06, + "loss": 0.382, + "step": 290 + }, + { + "epoch": 0.023279534409311815, + "grad_norm": 0.310407455101293, + "learning_rate": 3.866666666666667e-06, + "loss": 0.291, + "step": 291 + }, + { + "epoch": 0.023359532809343813, + "grad_norm": 0.3007535639216627, + "learning_rate": 3.88e-06, + "loss": 0.3378, + "step": 292 + }, + { + "epoch": 0.02343953120937581, + "grad_norm": 0.29921480775916937, + "learning_rate": 3.893333333333333e-06, + "loss": 0.3118, + "step": 293 + }, + { + "epoch": 0.023519529609407813, + "grad_norm": 0.38457980029073435, + "learning_rate": 3.906666666666667e-06, + "loss": 0.2927, + "step": 294 + }, + { + "epoch": 0.02359952800943981, + "grad_norm": 0.30980457406830697, + "learning_rate": 3.920000000000001e-06, + "loss": 0.2821, + "step": 295 + }, + { + "epoch": 0.02367952640947181, + "grad_norm": 0.31204294392915943, + "learning_rate": 3.9333333333333335e-06, + "loss": 0.3232, + "step": 296 + }, + { + "epoch": 0.02375952480950381, + "grad_norm": 0.37429849506928003, + "learning_rate": 3.946666666666667e-06, + "loss": 0.2985, + "step": 297 + }, + { + "epoch": 0.02383952320953581, + "grad_norm": 0.28736993796258586, + "learning_rate": 3.96e-06, + "loss": 0.3193, + "step": 298 + }, + { + "epoch": 0.02391952160956781, + "grad_norm": 0.29400874352009126, + "learning_rate": 3.973333333333333e-06, + "loss": 0.3001, + "step": 299 + }, + { + "epoch": 0.023999520009599807, + "grad_norm": 0.30070683796841075, + "learning_rate": 3.986666666666667e-06, + "loss": 0.3235, + "step": 300 + }, + { + "epoch": 0.02407951840963181, + "grad_norm": 0.33673323996864846, + "learning_rate": 4.000000000000001e-06, + "loss": 0.3149, + "step": 301 + }, + { + "epoch": 0.024159516809663807, + "grad_norm": 0.5915871723118653, + "learning_rate": 4.013333333333334e-06, + "loss": 0.2946, + "step": 302 + }, + { + "epoch": 0.024239515209695806, + "grad_norm": 0.34529717455075243, + "learning_rate": 4.026666666666667e-06, + "loss": 0.3, + "step": 303 + }, + { + "epoch": 0.024319513609727804, + "grad_norm": 0.3446956777151793, + "learning_rate": 4.04e-06, + "loss": 0.2934, + "step": 304 + }, + { + "epoch": 0.024399512009759806, + "grad_norm": 0.2984152835975952, + "learning_rate": 4.053333333333333e-06, + "loss": 0.3182, + "step": 305 + }, + { + "epoch": 0.024479510409791804, + "grad_norm": 0.2508654937902377, + "learning_rate": 4.066666666666667e-06, + "loss": 0.3331, + "step": 306 + }, + { + "epoch": 0.024559508809823802, + "grad_norm": 0.3086005504622366, + "learning_rate": 4.08e-06, + "loss": 0.28, + "step": 307 + }, + { + "epoch": 0.024639507209855804, + "grad_norm": 0.3250461069503589, + "learning_rate": 4.093333333333334e-06, + "loss": 0.3038, + "step": 308 + }, + { + "epoch": 0.024719505609887803, + "grad_norm": 0.2974487401261438, + "learning_rate": 4.1066666666666674e-06, + "loss": 0.281, + "step": 309 + }, + { + "epoch": 0.0247995040099198, + "grad_norm": 0.2826114481260454, + "learning_rate": 4.12e-06, + "loss": 0.3027, + "step": 310 + }, + { + "epoch": 0.0248795024099518, + "grad_norm": 0.26723447852426074, + "learning_rate": 4.133333333333333e-06, + "loss": 0.3315, + "step": 311 + }, + { + "epoch": 0.0249595008099838, + "grad_norm": 0.3308253861904821, + "learning_rate": 4.146666666666667e-06, + "loss": 0.3034, + "step": 312 + }, + { + "epoch": 0.0250394992100158, + "grad_norm": 0.3011186680573111, + "learning_rate": 4.16e-06, + "loss": 0.3112, + "step": 313 + }, + { + "epoch": 0.025119497610047798, + "grad_norm": 0.29767619437973025, + "learning_rate": 4.173333333333334e-06, + "loss": 0.315, + "step": 314 + }, + { + "epoch": 0.0251994960100798, + "grad_norm": 0.3317550541125557, + "learning_rate": 4.1866666666666675e-06, + "loss": 0.2821, + "step": 315 + }, + { + "epoch": 0.025279494410111798, + "grad_norm": 0.29864154407830745, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.3086, + "step": 316 + }, + { + "epoch": 0.025359492810143797, + "grad_norm": 0.31372143566377436, + "learning_rate": 4.213333333333333e-06, + "loss": 0.334, + "step": 317 + }, + { + "epoch": 0.025439491210175795, + "grad_norm": 0.28286029132169016, + "learning_rate": 4.226666666666667e-06, + "loss": 0.3124, + "step": 318 + }, + { + "epoch": 0.025519489610207797, + "grad_norm": 0.28343166514808793, + "learning_rate": 4.24e-06, + "loss": 0.2962, + "step": 319 + }, + { + "epoch": 0.025599488010239795, + "grad_norm": 0.30091351251625587, + "learning_rate": 4.253333333333334e-06, + "loss": 0.3192, + "step": 320 + }, + { + "epoch": 0.025679486410271794, + "grad_norm": 0.4069227954473359, + "learning_rate": 4.266666666666668e-06, + "loss": 0.3123, + "step": 321 + }, + { + "epoch": 0.025759484810303795, + "grad_norm": 0.30520424644249466, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.3165, + "step": 322 + }, + { + "epoch": 0.025839483210335794, + "grad_norm": 0.3106393666632331, + "learning_rate": 4.2933333333333334e-06, + "loss": 0.3106, + "step": 323 + }, + { + "epoch": 0.025919481610367792, + "grad_norm": 0.31385237368859015, + "learning_rate": 4.306666666666666e-06, + "loss": 0.2776, + "step": 324 + }, + { + "epoch": 0.02599948001039979, + "grad_norm": 0.3595754742533826, + "learning_rate": 4.32e-06, + "loss": 0.2775, + "step": 325 + }, + { + "epoch": 0.026079478410431792, + "grad_norm": 0.29918408080553893, + "learning_rate": 4.333333333333334e-06, + "loss": 0.2779, + "step": 326 + }, + { + "epoch": 0.02615947681046379, + "grad_norm": 0.3137959946957865, + "learning_rate": 4.346666666666667e-06, + "loss": 0.2743, + "step": 327 + }, + { + "epoch": 0.02623947521049579, + "grad_norm": 0.2971425585858211, + "learning_rate": 4.360000000000001e-06, + "loss": 0.3164, + "step": 328 + }, + { + "epoch": 0.02631947361052779, + "grad_norm": 0.3002236460836409, + "learning_rate": 4.3733333333333335e-06, + "loss": 0.3046, + "step": 329 + }, + { + "epoch": 0.02639947201055979, + "grad_norm": 0.32795878075933393, + "learning_rate": 4.3866666666666665e-06, + "loss": 0.298, + "step": 330 + }, + { + "epoch": 0.026479470410591788, + "grad_norm": 0.3198183211880077, + "learning_rate": 4.4e-06, + "loss": 0.2778, + "step": 331 + }, + { + "epoch": 0.026559468810623786, + "grad_norm": 0.8442752941754909, + "learning_rate": 4.413333333333334e-06, + "loss": 0.316, + "step": 332 + }, + { + "epoch": 0.026639467210655788, + "grad_norm": 0.3285862245822231, + "learning_rate": 4.426666666666667e-06, + "loss": 0.3014, + "step": 333 + }, + { + "epoch": 0.026719465610687786, + "grad_norm": 0.3268362991602705, + "learning_rate": 4.440000000000001e-06, + "loss": 0.2932, + "step": 334 + }, + { + "epoch": 0.026799464010719785, + "grad_norm": 0.34828548322000935, + "learning_rate": 4.453333333333334e-06, + "loss": 0.2936, + "step": 335 + }, + { + "epoch": 0.026879462410751787, + "grad_norm": 0.36401067477247345, + "learning_rate": 4.4666666666666665e-06, + "loss": 0.2669, + "step": 336 + }, + { + "epoch": 0.026959460810783785, + "grad_norm": 0.30458351664481537, + "learning_rate": 4.48e-06, + "loss": 0.2808, + "step": 337 + }, + { + "epoch": 0.027039459210815783, + "grad_norm": 0.2827353121459644, + "learning_rate": 4.493333333333333e-06, + "loss": 0.3226, + "step": 338 + }, + { + "epoch": 0.02711945761084778, + "grad_norm": 0.3917300253429823, + "learning_rate": 4.506666666666667e-06, + "loss": 0.3077, + "step": 339 + }, + { + "epoch": 0.027199456010879784, + "grad_norm": 0.35563710777596164, + "learning_rate": 4.520000000000001e-06, + "loss": 0.2813, + "step": 340 + }, + { + "epoch": 0.027279454410911782, + "grad_norm": 0.36915547194942855, + "learning_rate": 4.533333333333334e-06, + "loss": 0.286, + "step": 341 + }, + { + "epoch": 0.02735945281094378, + "grad_norm": 0.33409295314600185, + "learning_rate": 4.546666666666667e-06, + "loss": 0.2822, + "step": 342 + }, + { + "epoch": 0.027439451210975782, + "grad_norm": 0.31416386957118053, + "learning_rate": 4.56e-06, + "loss": 0.3153, + "step": 343 + }, + { + "epoch": 0.02751944961100778, + "grad_norm": 0.3019158422957072, + "learning_rate": 4.573333333333333e-06, + "loss": 0.3024, + "step": 344 + }, + { + "epoch": 0.02759944801103978, + "grad_norm": 0.3100351093467754, + "learning_rate": 4.586666666666667e-06, + "loss": 0.3185, + "step": 345 + }, + { + "epoch": 0.027679446411071777, + "grad_norm": 0.3014305733934458, + "learning_rate": 4.600000000000001e-06, + "loss": 0.3339, + "step": 346 + }, + { + "epoch": 0.02775944481110378, + "grad_norm": 0.28706731839929256, + "learning_rate": 4.613333333333334e-06, + "loss": 0.2886, + "step": 347 + }, + { + "epoch": 0.027839443211135777, + "grad_norm": 0.29222632215345756, + "learning_rate": 4.626666666666667e-06, + "loss": 0.3016, + "step": 348 + }, + { + "epoch": 0.027919441611167776, + "grad_norm": 0.2782406768670366, + "learning_rate": 4.6400000000000005e-06, + "loss": 0.3402, + "step": 349 + }, + { + "epoch": 0.027999440011199778, + "grad_norm": 0.38339644650291627, + "learning_rate": 4.653333333333333e-06, + "loss": 0.2998, + "step": 350 + }, + { + "epoch": 0.028079438411231776, + "grad_norm": 0.3317118638530234, + "learning_rate": 4.666666666666667e-06, + "loss": 0.3235, + "step": 351 + }, + { + "epoch": 0.028159436811263774, + "grad_norm": 0.32472311190940556, + "learning_rate": 4.680000000000001e-06, + "loss": 0.326, + "step": 352 + }, + { + "epoch": 0.028239435211295773, + "grad_norm": 0.3076013504127327, + "learning_rate": 4.693333333333334e-06, + "loss": 0.2994, + "step": 353 + }, + { + "epoch": 0.028319433611327775, + "grad_norm": 0.3304973225502543, + "learning_rate": 4.706666666666667e-06, + "loss": 0.3097, + "step": 354 + }, + { + "epoch": 0.028399432011359773, + "grad_norm": 1.3978856065414307, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.3172, + "step": 355 + }, + { + "epoch": 0.02847943041139177, + "grad_norm": 0.3491918327195412, + "learning_rate": 4.7333333333333335e-06, + "loss": 0.307, + "step": 356 + }, + { + "epoch": 0.028559428811423773, + "grad_norm": 0.25924244706907446, + "learning_rate": 4.746666666666667e-06, + "loss": 0.3225, + "step": 357 + }, + { + "epoch": 0.02863942721145577, + "grad_norm": 0.3101495324155655, + "learning_rate": 4.76e-06, + "loss": 0.3293, + "step": 358 + }, + { + "epoch": 0.02871942561148777, + "grad_norm": 0.35678900095091365, + "learning_rate": 4.773333333333334e-06, + "loss": 0.2805, + "step": 359 + }, + { + "epoch": 0.02879942401151977, + "grad_norm": 0.3067994625744902, + "learning_rate": 4.786666666666667e-06, + "loss": 0.3176, + "step": 360 + }, + { + "epoch": 0.02887942241155177, + "grad_norm": 0.24092945275116656, + "learning_rate": 4.800000000000001e-06, + "loss": 0.3469, + "step": 361 + }, + { + "epoch": 0.02895942081158377, + "grad_norm": 0.34334792749258597, + "learning_rate": 4.8133333333333336e-06, + "loss": 0.295, + "step": 362 + }, + { + "epoch": 0.029039419211615767, + "grad_norm": 0.3321900781436285, + "learning_rate": 4.826666666666667e-06, + "loss": 0.2995, + "step": 363 + }, + { + "epoch": 0.02911941761164777, + "grad_norm": 0.25049811240530573, + "learning_rate": 4.84e-06, + "loss": 0.3508, + "step": 364 + }, + { + "epoch": 0.029199416011679767, + "grad_norm": 0.3607572045401755, + "learning_rate": 4.853333333333334e-06, + "loss": 0.3096, + "step": 365 + }, + { + "epoch": 0.029279414411711766, + "grad_norm": 0.3294829442818441, + "learning_rate": 4.866666666666667e-06, + "loss": 0.2691, + "step": 366 + }, + { + "epoch": 0.029359412811743764, + "grad_norm": 0.5526739565126829, + "learning_rate": 4.880000000000001e-06, + "loss": 0.298, + "step": 367 + }, + { + "epoch": 0.029439411211775766, + "grad_norm": 0.3335696256707955, + "learning_rate": 4.893333333333334e-06, + "loss": 0.2933, + "step": 368 + }, + { + "epoch": 0.029519409611807764, + "grad_norm": 0.3663640552692995, + "learning_rate": 4.9066666666666666e-06, + "loss": 0.3128, + "step": 369 + }, + { + "epoch": 0.029599408011839762, + "grad_norm": 0.37571911507081424, + "learning_rate": 4.92e-06, + "loss": 0.3072, + "step": 370 + }, + { + "epoch": 0.02967940641187176, + "grad_norm": 0.31555521230821104, + "learning_rate": 4.933333333333334e-06, + "loss": 0.2768, + "step": 371 + }, + { + "epoch": 0.029759404811903763, + "grad_norm": 0.245298954856371, + "learning_rate": 4.946666666666667e-06, + "loss": 0.3602, + "step": 372 + }, + { + "epoch": 0.02983940321193576, + "grad_norm": 0.2456806518200736, + "learning_rate": 4.960000000000001e-06, + "loss": 0.3594, + "step": 373 + }, + { + "epoch": 0.02991940161196776, + "grad_norm": 0.3005615123583751, + "learning_rate": 4.973333333333334e-06, + "loss": 0.3053, + "step": 374 + }, + { + "epoch": 0.02999940001199976, + "grad_norm": 0.3201933337711833, + "learning_rate": 4.986666666666667e-06, + "loss": 0.2999, + "step": 375 + }, + { + "epoch": 0.03007939841203176, + "grad_norm": 0.3502857133296534, + "learning_rate": 5e-06, + "loss": 0.3083, + "step": 376 + }, + { + "epoch": 0.030159396812063758, + "grad_norm": 0.35536163911687196, + "learning_rate": 5.013333333333333e-06, + "loss": 0.2998, + "step": 377 + }, + { + "epoch": 0.030239395212095756, + "grad_norm": 0.2929929658030338, + "learning_rate": 5.026666666666667e-06, + "loss": 0.3055, + "step": 378 + }, + { + "epoch": 0.030319393612127758, + "grad_norm": 0.2841828850712374, + "learning_rate": 5.04e-06, + "loss": 0.3142, + "step": 379 + }, + { + "epoch": 0.030399392012159757, + "grad_norm": 0.2610288400838785, + "learning_rate": 5.053333333333334e-06, + "loss": 0.3478, + "step": 380 + }, + { + "epoch": 0.030479390412191755, + "grad_norm": 0.31758189304705964, + "learning_rate": 5.0666666666666676e-06, + "loss": 0.2909, + "step": 381 + }, + { + "epoch": 0.030559388812223757, + "grad_norm": 0.2931222908516946, + "learning_rate": 5.0800000000000005e-06, + "loss": 0.3001, + "step": 382 + }, + { + "epoch": 0.030639387212255755, + "grad_norm": 0.3203454950065711, + "learning_rate": 5.093333333333333e-06, + "loss": 0.2707, + "step": 383 + }, + { + "epoch": 0.030719385612287754, + "grad_norm": 0.3714027708987862, + "learning_rate": 5.106666666666667e-06, + "loss": 0.2929, + "step": 384 + }, + { + "epoch": 0.030799384012319752, + "grad_norm": 0.31815491416426595, + "learning_rate": 5.12e-06, + "loss": 0.3094, + "step": 385 + }, + { + "epoch": 0.030879382412351754, + "grad_norm": 0.3342354971381128, + "learning_rate": 5.133333333333334e-06, + "loss": 0.2717, + "step": 386 + }, + { + "epoch": 0.030959380812383752, + "grad_norm": 0.3082120376726215, + "learning_rate": 5.146666666666668e-06, + "loss": 0.3074, + "step": 387 + }, + { + "epoch": 0.03103937921241575, + "grad_norm": 0.3066371516027656, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.3079, + "step": 388 + }, + { + "epoch": 0.031119377612447752, + "grad_norm": 0.281534543660837, + "learning_rate": 5.1733333333333335e-06, + "loss": 0.3537, + "step": 389 + }, + { + "epoch": 0.03119937601247975, + "grad_norm": 0.357506865511471, + "learning_rate": 5.186666666666667e-06, + "loss": 0.2948, + "step": 390 + }, + { + "epoch": 0.03127937441251175, + "grad_norm": 0.33973068675563084, + "learning_rate": 5.2e-06, + "loss": 0.2879, + "step": 391 + }, + { + "epoch": 0.03135937281254375, + "grad_norm": 0.43378779358682595, + "learning_rate": 5.213333333333334e-06, + "loss": 0.2907, + "step": 392 + }, + { + "epoch": 0.03143937121257575, + "grad_norm": 0.2753200794392037, + "learning_rate": 5.226666666666667e-06, + "loss": 0.3461, + "step": 393 + }, + { + "epoch": 0.03151936961260775, + "grad_norm": 0.31597559998812197, + "learning_rate": 5.240000000000001e-06, + "loss": 0.3296, + "step": 394 + }, + { + "epoch": 0.031599368012639746, + "grad_norm": 1.9657686448382365, + "learning_rate": 5.2533333333333336e-06, + "loss": 0.2876, + "step": 395 + }, + { + "epoch": 0.031679366412671744, + "grad_norm": 0.6104971373102391, + "learning_rate": 5.2666666666666665e-06, + "loss": 0.332, + "step": 396 + }, + { + "epoch": 0.03175936481270374, + "grad_norm": 0.5223884639865138, + "learning_rate": 5.28e-06, + "loss": 0.2847, + "step": 397 + }, + { + "epoch": 0.03183936321273575, + "grad_norm": 1.0127908583483225, + "learning_rate": 5.293333333333334e-06, + "loss": 0.3041, + "step": 398 + }, + { + "epoch": 0.031919361612767747, + "grad_norm": 0.2896248684319602, + "learning_rate": 5.306666666666667e-06, + "loss": 0.3153, + "step": 399 + }, + { + "epoch": 0.031999360012799745, + "grad_norm": 0.3111725505787792, + "learning_rate": 5.320000000000001e-06, + "loss": 0.3069, + "step": 400 + }, + { + "epoch": 0.03207935841283174, + "grad_norm": 0.2612742530323411, + "learning_rate": 5.333333333333334e-06, + "loss": 0.3485, + "step": 401 + }, + { + "epoch": 0.03215935681286374, + "grad_norm": 0.355851186485992, + "learning_rate": 5.346666666666667e-06, + "loss": 0.2903, + "step": 402 + }, + { + "epoch": 0.03223935521289574, + "grad_norm": 0.3321772891090357, + "learning_rate": 5.36e-06, + "loss": 0.303, + "step": 403 + }, + { + "epoch": 0.03231935361292774, + "grad_norm": 0.31126794542814573, + "learning_rate": 5.373333333333334e-06, + "loss": 0.3146, + "step": 404 + }, + { + "epoch": 0.032399352012959744, + "grad_norm": 0.22591448085272395, + "learning_rate": 5.386666666666667e-06, + "loss": 0.3594, + "step": 405 + }, + { + "epoch": 0.03247935041299174, + "grad_norm": 0.3308646714495262, + "learning_rate": 5.400000000000001e-06, + "loss": 0.2796, + "step": 406 + }, + { + "epoch": 0.03255934881302374, + "grad_norm": 0.292154591870389, + "learning_rate": 5.413333333333334e-06, + "loss": 0.3175, + "step": 407 + }, + { + "epoch": 0.03263934721305574, + "grad_norm": 0.3788761336953055, + "learning_rate": 5.426666666666667e-06, + "loss": 0.3106, + "step": 408 + }, + { + "epoch": 0.03271934561308774, + "grad_norm": 0.23266665825540928, + "learning_rate": 5.4400000000000004e-06, + "loss": 0.3814, + "step": 409 + }, + { + "epoch": 0.032799344013119736, + "grad_norm": 0.37785222072422875, + "learning_rate": 5.453333333333334e-06, + "loss": 0.2828, + "step": 410 + }, + { + "epoch": 0.032879342413151734, + "grad_norm": 0.3752555153860843, + "learning_rate": 5.466666666666667e-06, + "loss": 0.2992, + "step": 411 + }, + { + "epoch": 0.03295934081318374, + "grad_norm": 0.3241619726591749, + "learning_rate": 5.480000000000001e-06, + "loss": 0.2767, + "step": 412 + }, + { + "epoch": 0.03303933921321574, + "grad_norm": 0.20911832526144952, + "learning_rate": 5.493333333333334e-06, + "loss": 0.3501, + "step": 413 + }, + { + "epoch": 0.033119337613247736, + "grad_norm": 0.35445576794204775, + "learning_rate": 5.506666666666667e-06, + "loss": 0.285, + "step": 414 + }, + { + "epoch": 0.033199336013279734, + "grad_norm": 0.3490736265715893, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.3003, + "step": 415 + }, + { + "epoch": 0.03327933441331173, + "grad_norm": 0.296556437643436, + "learning_rate": 5.533333333333334e-06, + "loss": 0.324, + "step": 416 + }, + { + "epoch": 0.03335933281334373, + "grad_norm": 0.29025738851167604, + "learning_rate": 5.546666666666667e-06, + "loss": 0.3215, + "step": 417 + }, + { + "epoch": 0.03343933121337573, + "grad_norm": 0.28164931440429375, + "learning_rate": 5.560000000000001e-06, + "loss": 0.3412, + "step": 418 + }, + { + "epoch": 0.033519329613407735, + "grad_norm": 0.31967845861110017, + "learning_rate": 5.573333333333334e-06, + "loss": 0.3107, + "step": 419 + }, + { + "epoch": 0.03359932801343973, + "grad_norm": 0.358408808675009, + "learning_rate": 5.586666666666667e-06, + "loss": 0.2924, + "step": 420 + }, + { + "epoch": 0.03367932641347173, + "grad_norm": 0.35010371693620035, + "learning_rate": 5.600000000000001e-06, + "loss": 0.308, + "step": 421 + }, + { + "epoch": 0.03375932481350373, + "grad_norm": 0.2989354809093555, + "learning_rate": 5.613333333333334e-06, + "loss": 0.3066, + "step": 422 + }, + { + "epoch": 0.03383932321353573, + "grad_norm": 0.47528340359375043, + "learning_rate": 5.626666666666667e-06, + "loss": 0.3008, + "step": 423 + }, + { + "epoch": 0.03391932161356773, + "grad_norm": 0.35241261808016133, + "learning_rate": 5.64e-06, + "loss": 0.2965, + "step": 424 + }, + { + "epoch": 0.033999320013599725, + "grad_norm": 0.33834186394947136, + "learning_rate": 5.653333333333334e-06, + "loss": 0.3227, + "step": 425 + }, + { + "epoch": 0.03407931841363173, + "grad_norm": 0.35873235767341694, + "learning_rate": 5.666666666666667e-06, + "loss": 0.2735, + "step": 426 + }, + { + "epoch": 0.03415931681366373, + "grad_norm": 0.361019283306883, + "learning_rate": 5.68e-06, + "loss": 0.318, + "step": 427 + }, + { + "epoch": 0.03423931521369573, + "grad_norm": 0.3491865418351152, + "learning_rate": 5.6933333333333344e-06, + "loss": 0.3186, + "step": 428 + }, + { + "epoch": 0.034319313613727725, + "grad_norm": 0.27729888276670156, + "learning_rate": 5.706666666666667e-06, + "loss": 0.3601, + "step": 429 + }, + { + "epoch": 0.034399312013759724, + "grad_norm": 0.29064256849850584, + "learning_rate": 5.72e-06, + "loss": 0.3138, + "step": 430 + }, + { + "epoch": 0.03447931041379172, + "grad_norm": 0.3508156937459735, + "learning_rate": 5.733333333333334e-06, + "loss": 0.2651, + "step": 431 + }, + { + "epoch": 0.03455930881382372, + "grad_norm": 0.28511748103168333, + "learning_rate": 5.746666666666667e-06, + "loss": 0.3234, + "step": 432 + }, + { + "epoch": 0.034639307213855726, + "grad_norm": 0.3144218865965763, + "learning_rate": 5.76e-06, + "loss": 0.2989, + "step": 433 + }, + { + "epoch": 0.034719305613887724, + "grad_norm": 0.3366749921569193, + "learning_rate": 5.7733333333333345e-06, + "loss": 0.2801, + "step": 434 + }, + { + "epoch": 0.03479930401391972, + "grad_norm": 0.31012021904677867, + "learning_rate": 5.7866666666666674e-06, + "loss": 0.2949, + "step": 435 + }, + { + "epoch": 0.03487930241395172, + "grad_norm": 0.3347669197981209, + "learning_rate": 5.8e-06, + "loss": 0.2909, + "step": 436 + }, + { + "epoch": 0.03495930081398372, + "grad_norm": 0.3075000005306856, + "learning_rate": 5.813333333333334e-06, + "loss": 0.2854, + "step": 437 + }, + { + "epoch": 0.03503929921401572, + "grad_norm": 0.27842169718598414, + "learning_rate": 5.826666666666667e-06, + "loss": 0.3276, + "step": 438 + }, + { + "epoch": 0.035119297614047716, + "grad_norm": 0.332358261271771, + "learning_rate": 5.84e-06, + "loss": 0.3129, + "step": 439 + }, + { + "epoch": 0.03519929601407972, + "grad_norm": 0.3250165592387918, + "learning_rate": 5.853333333333335e-06, + "loss": 0.2736, + "step": 440 + }, + { + "epoch": 0.03527929441411172, + "grad_norm": 0.2903957305908445, + "learning_rate": 5.8666666666666675e-06, + "loss": 0.3028, + "step": 441 + }, + { + "epoch": 0.03535929281414372, + "grad_norm": 0.3022783193404657, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.3237, + "step": 442 + }, + { + "epoch": 0.03543929121417572, + "grad_norm": 0.3501135428170432, + "learning_rate": 5.893333333333334e-06, + "loss": 0.2902, + "step": 443 + }, + { + "epoch": 0.035519289614207715, + "grad_norm": 0.2827949040230491, + "learning_rate": 5.906666666666667e-06, + "loss": 0.3109, + "step": 444 + }, + { + "epoch": 0.03559928801423971, + "grad_norm": 0.27880451846861004, + "learning_rate": 5.92e-06, + "loss": 0.3103, + "step": 445 + }, + { + "epoch": 0.03567928641427171, + "grad_norm": 0.2890679709946553, + "learning_rate": 5.933333333333335e-06, + "loss": 0.2921, + "step": 446 + }, + { + "epoch": 0.03575928481430372, + "grad_norm": 0.3712738466813804, + "learning_rate": 5.946666666666668e-06, + "loss": 0.2905, + "step": 447 + }, + { + "epoch": 0.035839283214335715, + "grad_norm": 0.27790778029213054, + "learning_rate": 5.9600000000000005e-06, + "loss": 0.3553, + "step": 448 + }, + { + "epoch": 0.035919281614367714, + "grad_norm": 0.24502036618265147, + "learning_rate": 5.973333333333334e-06, + "loss": 0.3341, + "step": 449 + }, + { + "epoch": 0.03599928001439971, + "grad_norm": 0.27160629781154755, + "learning_rate": 5.986666666666667e-06, + "loss": 0.3496, + "step": 450 + }, + { + "epoch": 0.03607927841443171, + "grad_norm": 0.33303463131843214, + "learning_rate": 6e-06, + "loss": 0.3185, + "step": 451 + }, + { + "epoch": 0.03615927681446371, + "grad_norm": 0.3316125161555937, + "learning_rate": 6.013333333333335e-06, + "loss": 0.3098, + "step": 452 + }, + { + "epoch": 0.03623927521449571, + "grad_norm": 0.3397618672938498, + "learning_rate": 6.026666666666668e-06, + "loss": 0.2824, + "step": 453 + }, + { + "epoch": 0.03631927361452771, + "grad_norm": 0.33597321735886654, + "learning_rate": 6.040000000000001e-06, + "loss": 0.3003, + "step": 454 + }, + { + "epoch": 0.03639927201455971, + "grad_norm": 0.3125718610236431, + "learning_rate": 6.0533333333333335e-06, + "loss": 0.2748, + "step": 455 + }, + { + "epoch": 0.03647927041459171, + "grad_norm": 0.2870651408313308, + "learning_rate": 6.066666666666667e-06, + "loss": 0.3474, + "step": 456 + }, + { + "epoch": 0.03655926881462371, + "grad_norm": 0.33301081350451345, + "learning_rate": 6.08e-06, + "loss": 0.274, + "step": 457 + }, + { + "epoch": 0.036639267214655706, + "grad_norm": 0.3354120476704053, + "learning_rate": 6.093333333333333e-06, + "loss": 0.3103, + "step": 458 + }, + { + "epoch": 0.036719265614687704, + "grad_norm": 0.30463034017489843, + "learning_rate": 6.106666666666668e-06, + "loss": 0.3112, + "step": 459 + }, + { + "epoch": 0.0367992640147197, + "grad_norm": 0.4397832576246099, + "learning_rate": 6.120000000000001e-06, + "loss": 0.2616, + "step": 460 + }, + { + "epoch": 0.03687926241475171, + "grad_norm": 0.3266197113937444, + "learning_rate": 6.133333333333334e-06, + "loss": 0.3131, + "step": 461 + }, + { + "epoch": 0.036959260814783707, + "grad_norm": 0.33484207703049224, + "learning_rate": 6.146666666666667e-06, + "loss": 0.3084, + "step": 462 + }, + { + "epoch": 0.037039259214815705, + "grad_norm": 0.29669930459354066, + "learning_rate": 6.16e-06, + "loss": 0.3237, + "step": 463 + }, + { + "epoch": 0.0371192576148477, + "grad_norm": 0.2974988619046092, + "learning_rate": 6.173333333333333e-06, + "loss": 0.314, + "step": 464 + }, + { + "epoch": 0.0371992560148797, + "grad_norm": 0.2705470479205864, + "learning_rate": 6.186666666666668e-06, + "loss": 0.3229, + "step": 465 + }, + { + "epoch": 0.0372792544149117, + "grad_norm": 0.29645599925760746, + "learning_rate": 6.200000000000001e-06, + "loss": 0.2933, + "step": 466 + }, + { + "epoch": 0.0373592528149437, + "grad_norm": 0.37792065493147536, + "learning_rate": 6.213333333333334e-06, + "loss": 0.2998, + "step": 467 + }, + { + "epoch": 0.037439251214975704, + "grad_norm": 0.31915362480869564, + "learning_rate": 6.2266666666666675e-06, + "loss": 0.2722, + "step": 468 + }, + { + "epoch": 0.0375192496150077, + "grad_norm": 0.3349686116525976, + "learning_rate": 6.24e-06, + "loss": 0.2854, + "step": 469 + }, + { + "epoch": 0.0375992480150397, + "grad_norm": 0.31042995091475517, + "learning_rate": 6.253333333333333e-06, + "loss": 0.2925, + "step": 470 + }, + { + "epoch": 0.0376792464150717, + "grad_norm": 0.2588527472137994, + "learning_rate": 6.266666666666668e-06, + "loss": 0.3326, + "step": 471 + }, + { + "epoch": 0.0377592448151037, + "grad_norm": 0.4097898202167709, + "learning_rate": 6.280000000000001e-06, + "loss": 0.2951, + "step": 472 + }, + { + "epoch": 0.037839243215135696, + "grad_norm": 0.31931766298662156, + "learning_rate": 6.293333333333334e-06, + "loss": 0.3149, + "step": 473 + }, + { + "epoch": 0.037919241615167694, + "grad_norm": 0.28354492178943036, + "learning_rate": 6.3066666666666676e-06, + "loss": 0.2982, + "step": 474 + }, + { + "epoch": 0.0379992400151997, + "grad_norm": 0.5919782257025219, + "learning_rate": 6.3200000000000005e-06, + "loss": 0.2767, + "step": 475 + }, + { + "epoch": 0.0380792384152317, + "grad_norm": 0.3497273734790228, + "learning_rate": 6.333333333333333e-06, + "loss": 0.3049, + "step": 476 + }, + { + "epoch": 0.038159236815263696, + "grad_norm": 0.33520106800548843, + "learning_rate": 6.346666666666668e-06, + "loss": 0.2928, + "step": 477 + }, + { + "epoch": 0.038239235215295694, + "grad_norm": 0.2997709642492014, + "learning_rate": 6.360000000000001e-06, + "loss": 0.3049, + "step": 478 + }, + { + "epoch": 0.03831923361532769, + "grad_norm": 0.3176268010674167, + "learning_rate": 6.373333333333334e-06, + "loss": 0.3095, + "step": 479 + }, + { + "epoch": 0.03839923201535969, + "grad_norm": 0.35906242550966316, + "learning_rate": 6.386666666666668e-06, + "loss": 0.3206, + "step": 480 + }, + { + "epoch": 0.03847923041539169, + "grad_norm": 0.33384146928757163, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.3095, + "step": 481 + }, + { + "epoch": 0.038559228815423695, + "grad_norm": 0.2982741594194618, + "learning_rate": 6.4133333333333335e-06, + "loss": 0.31, + "step": 482 + }, + { + "epoch": 0.03863922721545569, + "grad_norm": 0.3019238865273718, + "learning_rate": 6.426666666666668e-06, + "loss": 0.3168, + "step": 483 + }, + { + "epoch": 0.03871922561548769, + "grad_norm": 0.3541025012123874, + "learning_rate": 6.440000000000001e-06, + "loss": 0.293, + "step": 484 + }, + { + "epoch": 0.03879922401551969, + "grad_norm": 0.2921419503259735, + "learning_rate": 6.453333333333334e-06, + "loss": 0.3191, + "step": 485 + }, + { + "epoch": 0.03887922241555169, + "grad_norm": 0.33533179895737614, + "learning_rate": 6.466666666666667e-06, + "loss": 0.3178, + "step": 486 + }, + { + "epoch": 0.03895922081558369, + "grad_norm": 0.303441875254397, + "learning_rate": 6.480000000000001e-06, + "loss": 0.2938, + "step": 487 + }, + { + "epoch": 0.039039219215615685, + "grad_norm": 0.41029584738632285, + "learning_rate": 6.4933333333333336e-06, + "loss": 0.3036, + "step": 488 + }, + { + "epoch": 0.03911921761564769, + "grad_norm": 0.43390006219856647, + "learning_rate": 6.5066666666666665e-06, + "loss": 0.3036, + "step": 489 + }, + { + "epoch": 0.03919921601567969, + "grad_norm": 0.2962197281284441, + "learning_rate": 6.520000000000001e-06, + "loss": 0.317, + "step": 490 + }, + { + "epoch": 0.03927921441571169, + "grad_norm": 0.3029760826691946, + "learning_rate": 6.533333333333334e-06, + "loss": 0.3189, + "step": 491 + }, + { + "epoch": 0.039359212815743685, + "grad_norm": 0.3043303861548939, + "learning_rate": 6.546666666666667e-06, + "loss": 0.318, + "step": 492 + }, + { + "epoch": 0.039439211215775684, + "grad_norm": 0.2607504640285908, + "learning_rate": 6.560000000000001e-06, + "loss": 0.3419, + "step": 493 + }, + { + "epoch": 0.03951920961580768, + "grad_norm": 0.3363186674243666, + "learning_rate": 6.573333333333334e-06, + "loss": 0.3118, + "step": 494 + }, + { + "epoch": 0.03959920801583968, + "grad_norm": 0.28470788566264854, + "learning_rate": 6.5866666666666666e-06, + "loss": 0.3438, + "step": 495 + }, + { + "epoch": 0.039679206415871686, + "grad_norm": 0.346026521635282, + "learning_rate": 6.600000000000001e-06, + "loss": 0.2824, + "step": 496 + }, + { + "epoch": 0.039759204815903684, + "grad_norm": 0.34707869461125024, + "learning_rate": 6.613333333333334e-06, + "loss": 0.2756, + "step": 497 + }, + { + "epoch": 0.03983920321593568, + "grad_norm": 0.39467823751709674, + "learning_rate": 6.626666666666667e-06, + "loss": 0.3012, + "step": 498 + }, + { + "epoch": 0.03991920161596768, + "grad_norm": 0.3358019278275616, + "learning_rate": 6.640000000000001e-06, + "loss": 0.3089, + "step": 499 + }, + { + "epoch": 0.03999920001599968, + "grad_norm": 0.24376858347215066, + "learning_rate": 6.653333333333334e-06, + "loss": 0.365, + "step": 500 + }, + { + "epoch": 0.04007919841603168, + "grad_norm": 0.3046401025688762, + "learning_rate": 6.666666666666667e-06, + "loss": 0.331, + "step": 501 + }, + { + "epoch": 0.040159196816063676, + "grad_norm": 0.2693221662620079, + "learning_rate": 6.680000000000001e-06, + "loss": 0.3293, + "step": 502 + }, + { + "epoch": 0.04023919521609568, + "grad_norm": 0.25247424245155686, + "learning_rate": 6.693333333333334e-06, + "loss": 0.333, + "step": 503 + }, + { + "epoch": 0.04031919361612768, + "grad_norm": 0.3169724396118525, + "learning_rate": 6.706666666666667e-06, + "loss": 0.3187, + "step": 504 + }, + { + "epoch": 0.04039919201615968, + "grad_norm": 0.5881998972475462, + "learning_rate": 6.720000000000001e-06, + "loss": 0.2752, + "step": 505 + }, + { + "epoch": 0.04047919041619168, + "grad_norm": 0.3002963988601855, + "learning_rate": 6.733333333333334e-06, + "loss": 0.2935, + "step": 506 + }, + { + "epoch": 0.040559188816223675, + "grad_norm": 0.29552095066249523, + "learning_rate": 6.746666666666667e-06, + "loss": 0.3152, + "step": 507 + }, + { + "epoch": 0.04063918721625567, + "grad_norm": 0.2537159447710243, + "learning_rate": 6.760000000000001e-06, + "loss": 0.3399, + "step": 508 + }, + { + "epoch": 0.04071918561628767, + "grad_norm": 0.33234160937933743, + "learning_rate": 6.773333333333334e-06, + "loss": 0.2989, + "step": 509 + }, + { + "epoch": 0.04079918401631968, + "grad_norm": 0.32268573186034016, + "learning_rate": 6.786666666666667e-06, + "loss": 0.2855, + "step": 510 + }, + { + "epoch": 0.040879182416351675, + "grad_norm": 0.30733727664969035, + "learning_rate": 6.800000000000001e-06, + "loss": 0.3271, + "step": 511 + }, + { + "epoch": 0.040959180816383674, + "grad_norm": 0.2888758745237012, + "learning_rate": 6.813333333333334e-06, + "loss": 0.3099, + "step": 512 + }, + { + "epoch": 0.04103917921641567, + "grad_norm": 0.258533115813079, + "learning_rate": 6.826666666666667e-06, + "loss": 0.3634, + "step": 513 + }, + { + "epoch": 0.04111917761644767, + "grad_norm": 0.32797077931041857, + "learning_rate": 6.8400000000000014e-06, + "loss": 0.2737, + "step": 514 + }, + { + "epoch": 0.04119917601647967, + "grad_norm": 0.33155121183837055, + "learning_rate": 6.853333333333334e-06, + "loss": 0.2865, + "step": 515 + }, + { + "epoch": 0.04127917441651167, + "grad_norm": 0.3108357589344553, + "learning_rate": 6.866666666666667e-06, + "loss": 0.3167, + "step": 516 + }, + { + "epoch": 0.04135917281654367, + "grad_norm": 0.24991574927473204, + "learning_rate": 6.88e-06, + "loss": 0.328, + "step": 517 + }, + { + "epoch": 0.04143917121657567, + "grad_norm": 0.43485053216596936, + "learning_rate": 6.893333333333334e-06, + "loss": 0.2946, + "step": 518 + }, + { + "epoch": 0.04151916961660767, + "grad_norm": 0.8428664264426944, + "learning_rate": 6.906666666666667e-06, + "loss": 0.3041, + "step": 519 + }, + { + "epoch": 0.04159916801663967, + "grad_norm": 0.4140750267696265, + "learning_rate": 6.92e-06, + "loss": 0.2796, + "step": 520 + }, + { + "epoch": 0.041679166416671666, + "grad_norm": 0.28133627665652994, + "learning_rate": 6.9333333333333344e-06, + "loss": 0.3108, + "step": 521 + }, + { + "epoch": 0.041759164816703664, + "grad_norm": 0.29082274810357817, + "learning_rate": 6.946666666666667e-06, + "loss": 0.3402, + "step": 522 + }, + { + "epoch": 0.04183916321673566, + "grad_norm": 0.3733187336059807, + "learning_rate": 6.96e-06, + "loss": 0.2879, + "step": 523 + }, + { + "epoch": 0.04191916161676767, + "grad_norm": 0.318706739739197, + "learning_rate": 6.973333333333334e-06, + "loss": 0.308, + "step": 524 + }, + { + "epoch": 0.041999160016799666, + "grad_norm": 0.2832623430930091, + "learning_rate": 6.986666666666667e-06, + "loss": 0.2935, + "step": 525 + }, + { + "epoch": 0.042079158416831665, + "grad_norm": 0.3362264764755571, + "learning_rate": 7e-06, + "loss": 0.2954, + "step": 526 + }, + { + "epoch": 0.04215915681686366, + "grad_norm": 0.314784032088021, + "learning_rate": 7.0133333333333345e-06, + "loss": 0.3209, + "step": 527 + }, + { + "epoch": 0.04223915521689566, + "grad_norm": 0.28819551985251074, + "learning_rate": 7.0266666666666674e-06, + "loss": 0.3094, + "step": 528 + }, + { + "epoch": 0.04231915361692766, + "grad_norm": 0.29946809520844103, + "learning_rate": 7.04e-06, + "loss": 0.3222, + "step": 529 + }, + { + "epoch": 0.04239915201695966, + "grad_norm": 0.2786109400664497, + "learning_rate": 7.053333333333334e-06, + "loss": 0.3049, + "step": 530 + }, + { + "epoch": 0.04247915041699166, + "grad_norm": 0.3197022273747613, + "learning_rate": 7.066666666666667e-06, + "loss": 0.298, + "step": 531 + }, + { + "epoch": 0.04255914881702366, + "grad_norm": 0.33746979463956367, + "learning_rate": 7.08e-06, + "loss": 0.2831, + "step": 532 + }, + { + "epoch": 0.04263914721705566, + "grad_norm": 0.31856036505044, + "learning_rate": 7.093333333333335e-06, + "loss": 0.309, + "step": 533 + }, + { + "epoch": 0.04271914561708766, + "grad_norm": 0.3660656461056129, + "learning_rate": 7.1066666666666675e-06, + "loss": 0.3041, + "step": 534 + }, + { + "epoch": 0.04279914401711966, + "grad_norm": 0.2954128931866404, + "learning_rate": 7.1200000000000004e-06, + "loss": 0.328, + "step": 535 + }, + { + "epoch": 0.042879142417151656, + "grad_norm": 0.3363128939125697, + "learning_rate": 7.133333333333334e-06, + "loss": 0.3146, + "step": 536 + }, + { + "epoch": 0.042959140817183654, + "grad_norm": 0.29953174973848057, + "learning_rate": 7.146666666666667e-06, + "loss": 0.3333, + "step": 537 + }, + { + "epoch": 0.04303913921721565, + "grad_norm": 0.36975823450622436, + "learning_rate": 7.16e-06, + "loss": 0.2887, + "step": 538 + }, + { + "epoch": 0.04311913761724766, + "grad_norm": 0.32247062717335323, + "learning_rate": 7.173333333333335e-06, + "loss": 0.2782, + "step": 539 + }, + { + "epoch": 0.043199136017279656, + "grad_norm": 0.2786142635386788, + "learning_rate": 7.186666666666668e-06, + "loss": 0.3361, + "step": 540 + }, + { + "epoch": 0.043279134417311654, + "grad_norm": 0.27537515037340293, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.3135, + "step": 541 + }, + { + "epoch": 0.04335913281734365, + "grad_norm": 0.3589920861126592, + "learning_rate": 7.213333333333334e-06, + "loss": 0.2829, + "step": 542 + }, + { + "epoch": 0.04343913121737565, + "grad_norm": 0.29938013437954697, + "learning_rate": 7.226666666666667e-06, + "loss": 0.2973, + "step": 543 + }, + { + "epoch": 0.04351912961740765, + "grad_norm": 0.30316694415023465, + "learning_rate": 7.24e-06, + "loss": 0.3121, + "step": 544 + }, + { + "epoch": 0.04359912801743965, + "grad_norm": 0.31917214871252914, + "learning_rate": 7.253333333333335e-06, + "loss": 0.3232, + "step": 545 + }, + { + "epoch": 0.04367912641747165, + "grad_norm": 0.35032567603107734, + "learning_rate": 7.266666666666668e-06, + "loss": 0.287, + "step": 546 + }, + { + "epoch": 0.04375912481750365, + "grad_norm": 0.40656293117119846, + "learning_rate": 7.280000000000001e-06, + "loss": 0.2785, + "step": 547 + }, + { + "epoch": 0.04383912321753565, + "grad_norm": 0.3141356092079716, + "learning_rate": 7.2933333333333335e-06, + "loss": 0.3143, + "step": 548 + }, + { + "epoch": 0.04391912161756765, + "grad_norm": 0.7744596350911961, + "learning_rate": 7.306666666666667e-06, + "loss": 0.2694, + "step": 549 + }, + { + "epoch": 0.04399912001759965, + "grad_norm": 0.31971476112755937, + "learning_rate": 7.32e-06, + "loss": 0.2868, + "step": 550 + }, + { + "epoch": 0.044079118417631645, + "grad_norm": 0.28496312422337494, + "learning_rate": 7.333333333333333e-06, + "loss": 0.3111, + "step": 551 + }, + { + "epoch": 0.04415911681766364, + "grad_norm": 0.3567713398018572, + "learning_rate": 7.346666666666668e-06, + "loss": 0.2888, + "step": 552 + }, + { + "epoch": 0.04423911521769565, + "grad_norm": 0.29427192379710804, + "learning_rate": 7.360000000000001e-06, + "loss": 0.3384, + "step": 553 + }, + { + "epoch": 0.04431911361772765, + "grad_norm": 0.24743843822184564, + "learning_rate": 7.373333333333334e-06, + "loss": 0.3318, + "step": 554 + }, + { + "epoch": 0.044399112017759645, + "grad_norm": 0.358752055188852, + "learning_rate": 7.386666666666667e-06, + "loss": 0.2932, + "step": 555 + }, + { + "epoch": 0.044479110417791644, + "grad_norm": 0.3206421976582925, + "learning_rate": 7.4e-06, + "loss": 0.3094, + "step": 556 + }, + { + "epoch": 0.04455910881782364, + "grad_norm": 0.26761742225044916, + "learning_rate": 7.413333333333333e-06, + "loss": 0.3399, + "step": 557 + }, + { + "epoch": 0.04463910721785564, + "grad_norm": 0.27064350330456893, + "learning_rate": 7.426666666666668e-06, + "loss": 0.3345, + "step": 558 + }, + { + "epoch": 0.04471910561788764, + "grad_norm": 0.29101192174270596, + "learning_rate": 7.440000000000001e-06, + "loss": 0.3305, + "step": 559 + }, + { + "epoch": 0.044799104017919644, + "grad_norm": 0.3917772721807302, + "learning_rate": 7.453333333333334e-06, + "loss": 0.2923, + "step": 560 + }, + { + "epoch": 0.04487910241795164, + "grad_norm": 0.3620528877940992, + "learning_rate": 7.4666666666666675e-06, + "loss": 0.311, + "step": 561 + }, + { + "epoch": 0.04495910081798364, + "grad_norm": 0.3230945435774983, + "learning_rate": 7.48e-06, + "loss": 0.3032, + "step": 562 + }, + { + "epoch": 0.04503909921801564, + "grad_norm": 0.350005362355763, + "learning_rate": 7.493333333333333e-06, + "loss": 0.3022, + "step": 563 + }, + { + "epoch": 0.04511909761804764, + "grad_norm": 0.2958772164710638, + "learning_rate": 7.506666666666668e-06, + "loss": 0.2971, + "step": 564 + }, + { + "epoch": 0.045199096018079636, + "grad_norm": 0.3247961180824908, + "learning_rate": 7.520000000000001e-06, + "loss": 0.3245, + "step": 565 + }, + { + "epoch": 0.045279094418111634, + "grad_norm": 0.2694177380329182, + "learning_rate": 7.533333333333334e-06, + "loss": 0.2914, + "step": 566 + }, + { + "epoch": 0.04535909281814364, + "grad_norm": 0.2518577316045925, + "learning_rate": 7.5466666666666675e-06, + "loss": 0.3827, + "step": 567 + }, + { + "epoch": 0.04543909121817564, + "grad_norm": 0.3502034514282393, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.2766, + "step": 568 + }, + { + "epoch": 0.04551908961820764, + "grad_norm": 0.3091282899274977, + "learning_rate": 7.573333333333333e-06, + "loss": 0.3095, + "step": 569 + }, + { + "epoch": 0.045599088018239635, + "grad_norm": 0.2442219176585478, + "learning_rate": 7.586666666666668e-06, + "loss": 0.3272, + "step": 570 + }, + { + "epoch": 0.04567908641827163, + "grad_norm": 0.32889910123831523, + "learning_rate": 7.600000000000001e-06, + "loss": 0.3205, + "step": 571 + }, + { + "epoch": 0.04575908481830363, + "grad_norm": 0.25534793694505276, + "learning_rate": 7.613333333333334e-06, + "loss": 0.3391, + "step": 572 + }, + { + "epoch": 0.04583908321833563, + "grad_norm": 0.3545177108243395, + "learning_rate": 7.626666666666668e-06, + "loss": 0.2839, + "step": 573 + }, + { + "epoch": 0.045919081618367635, + "grad_norm": 0.32473986777049285, + "learning_rate": 7.640000000000001e-06, + "loss": 0.2834, + "step": 574 + }, + { + "epoch": 0.045999080018399634, + "grad_norm": 0.31906170470992173, + "learning_rate": 7.653333333333333e-06, + "loss": 0.3231, + "step": 575 + }, + { + "epoch": 0.04607907841843163, + "grad_norm": 0.2687398201356283, + "learning_rate": 7.666666666666667e-06, + "loss": 0.3427, + "step": 576 + }, + { + "epoch": 0.04615907681846363, + "grad_norm": 0.27100988341265847, + "learning_rate": 7.680000000000001e-06, + "loss": 0.3388, + "step": 577 + }, + { + "epoch": 0.04623907521849563, + "grad_norm": 0.31030817272159894, + "learning_rate": 7.693333333333333e-06, + "loss": 0.3132, + "step": 578 + }, + { + "epoch": 0.04631907361852763, + "grad_norm": 0.36354142939370093, + "learning_rate": 7.706666666666669e-06, + "loss": 0.2946, + "step": 579 + }, + { + "epoch": 0.046399072018559626, + "grad_norm": 0.3426449246017012, + "learning_rate": 7.72e-06, + "loss": 0.3292, + "step": 580 + }, + { + "epoch": 0.04647907041859163, + "grad_norm": 0.3088370390840374, + "learning_rate": 7.733333333333334e-06, + "loss": 0.3173, + "step": 581 + }, + { + "epoch": 0.04655906881862363, + "grad_norm": 0.28759815818156115, + "learning_rate": 7.746666666666666e-06, + "loss": 0.3194, + "step": 582 + }, + { + "epoch": 0.04663906721865563, + "grad_norm": 0.344707868764643, + "learning_rate": 7.76e-06, + "loss": 0.2831, + "step": 583 + }, + { + "epoch": 0.046719065618687626, + "grad_norm": 0.29742355530798625, + "learning_rate": 7.773333333333334e-06, + "loss": 0.3244, + "step": 584 + }, + { + "epoch": 0.046799064018719624, + "grad_norm": 0.2836383142386181, + "learning_rate": 7.786666666666666e-06, + "loss": 0.3128, + "step": 585 + }, + { + "epoch": 0.04687906241875162, + "grad_norm": 0.2545638071639253, + "learning_rate": 7.800000000000002e-06, + "loss": 0.3374, + "step": 586 + }, + { + "epoch": 0.04695906081878362, + "grad_norm": 0.3261965294333678, + "learning_rate": 7.813333333333334e-06, + "loss": 0.2777, + "step": 587 + }, + { + "epoch": 0.047039059218815626, + "grad_norm": 0.2852167773669578, + "learning_rate": 7.826666666666667e-06, + "loss": 0.3279, + "step": 588 + }, + { + "epoch": 0.047119057618847625, + "grad_norm": 0.3393587054160582, + "learning_rate": 7.840000000000001e-06, + "loss": 0.2837, + "step": 589 + }, + { + "epoch": 0.04719905601887962, + "grad_norm": 0.3247099685645798, + "learning_rate": 7.853333333333333e-06, + "loss": 0.2864, + "step": 590 + }, + { + "epoch": 0.04727905441891162, + "grad_norm": 0.2944077670255197, + "learning_rate": 7.866666666666667e-06, + "loss": 0.305, + "step": 591 + }, + { + "epoch": 0.04735905281894362, + "grad_norm": 0.2954680839159076, + "learning_rate": 7.88e-06, + "loss": 0.2996, + "step": 592 + }, + { + "epoch": 0.04743905121897562, + "grad_norm": 0.2985561061963635, + "learning_rate": 7.893333333333335e-06, + "loss": 0.3208, + "step": 593 + }, + { + "epoch": 0.04751904961900762, + "grad_norm": 0.3007988746030726, + "learning_rate": 7.906666666666667e-06, + "loss": 0.3007, + "step": 594 + }, + { + "epoch": 0.04759904801903962, + "grad_norm": 0.25398057694135, + "learning_rate": 7.92e-06, + "loss": 0.344, + "step": 595 + }, + { + "epoch": 0.04767904641907162, + "grad_norm": 0.2679593313964034, + "learning_rate": 7.933333333333334e-06, + "loss": 0.3506, + "step": 596 + }, + { + "epoch": 0.04775904481910362, + "grad_norm": 0.301174678656074, + "learning_rate": 7.946666666666666e-06, + "loss": 0.3115, + "step": 597 + }, + { + "epoch": 0.04783904321913562, + "grad_norm": 0.3163709906123511, + "learning_rate": 7.960000000000002e-06, + "loss": 0.334, + "step": 598 + }, + { + "epoch": 0.047919041619167616, + "grad_norm": 0.2835034971944169, + "learning_rate": 7.973333333333334e-06, + "loss": 0.3268, + "step": 599 + }, + { + "epoch": 0.047999040019199614, + "grad_norm": 0.3274329488304935, + "learning_rate": 7.986666666666668e-06, + "loss": 0.2909, + "step": 600 + }, + { + "epoch": 0.04807903841923161, + "grad_norm": 0.2991466638559298, + "learning_rate": 8.000000000000001e-06, + "loss": 0.333, + "step": 601 + }, + { + "epoch": 0.04815903681926362, + "grad_norm": 0.24739835322229659, + "learning_rate": 8.013333333333333e-06, + "loss": 0.3557, + "step": 602 + }, + { + "epoch": 0.048239035219295616, + "grad_norm": 0.3268090809918588, + "learning_rate": 8.026666666666667e-06, + "loss": 0.3015, + "step": 603 + }, + { + "epoch": 0.048319033619327614, + "grad_norm": 0.2529108025233396, + "learning_rate": 8.040000000000001e-06, + "loss": 0.3354, + "step": 604 + }, + { + "epoch": 0.04839903201935961, + "grad_norm": 0.28297194060246894, + "learning_rate": 8.053333333333335e-06, + "loss": 0.3263, + "step": 605 + }, + { + "epoch": 0.04847903041939161, + "grad_norm": 0.27272261674002124, + "learning_rate": 8.066666666666667e-06, + "loss": 0.3036, + "step": 606 + }, + { + "epoch": 0.04855902881942361, + "grad_norm": 0.2910811610239878, + "learning_rate": 8.08e-06, + "loss": 0.3054, + "step": 607 + }, + { + "epoch": 0.04863902721945561, + "grad_norm": 0.3253114566305926, + "learning_rate": 8.093333333333334e-06, + "loss": 0.3206, + "step": 608 + }, + { + "epoch": 0.04871902561948761, + "grad_norm": 0.3152132203792218, + "learning_rate": 8.106666666666666e-06, + "loss": 0.2833, + "step": 609 + }, + { + "epoch": 0.04879902401951961, + "grad_norm": 0.30025638241841135, + "learning_rate": 8.120000000000002e-06, + "loss": 0.324, + "step": 610 + }, + { + "epoch": 0.04887902241955161, + "grad_norm": 0.28742975904796864, + "learning_rate": 8.133333333333334e-06, + "loss": 0.3175, + "step": 611 + }, + { + "epoch": 0.04895902081958361, + "grad_norm": 0.3359495630777932, + "learning_rate": 8.146666666666668e-06, + "loss": 0.2843, + "step": 612 + }, + { + "epoch": 0.04903901921961561, + "grad_norm": 0.26493287194458254, + "learning_rate": 8.16e-06, + "loss": 0.3458, + "step": 613 + }, + { + "epoch": 0.049119017619647605, + "grad_norm": 0.3408797805899529, + "learning_rate": 8.173333333333334e-06, + "loss": 0.2916, + "step": 614 + }, + { + "epoch": 0.0491990160196796, + "grad_norm": 0.33809001556753493, + "learning_rate": 8.186666666666667e-06, + "loss": 0.3062, + "step": 615 + }, + { + "epoch": 0.04927901441971161, + "grad_norm": 0.2845092295614473, + "learning_rate": 8.2e-06, + "loss": 0.3206, + "step": 616 + }, + { + "epoch": 0.04935901281974361, + "grad_norm": 0.25195126164698894, + "learning_rate": 8.213333333333335e-06, + "loss": 0.3354, + "step": 617 + }, + { + "epoch": 0.049439011219775605, + "grad_norm": 0.3394270003618235, + "learning_rate": 8.226666666666667e-06, + "loss": 0.3106, + "step": 618 + }, + { + "epoch": 0.049519009619807604, + "grad_norm": 0.2935637552361525, + "learning_rate": 8.24e-06, + "loss": 0.29, + "step": 619 + }, + { + "epoch": 0.0495990080198396, + "grad_norm": 0.26339841567707817, + "learning_rate": 8.253333333333334e-06, + "loss": 0.3446, + "step": 620 + }, + { + "epoch": 0.0496790064198716, + "grad_norm": 0.3438598627089339, + "learning_rate": 8.266666666666667e-06, + "loss": 0.2687, + "step": 621 + }, + { + "epoch": 0.0497590048199036, + "grad_norm": 0.31207737407635, + "learning_rate": 8.28e-06, + "loss": 0.2871, + "step": 622 + }, + { + "epoch": 0.049839003219935604, + "grad_norm": 0.31511180418472057, + "learning_rate": 8.293333333333334e-06, + "loss": 0.3274, + "step": 623 + }, + { + "epoch": 0.0499190016199676, + "grad_norm": 0.29772904984139775, + "learning_rate": 8.306666666666668e-06, + "loss": 0.3299, + "step": 624 + }, + { + "epoch": 0.0499990000199996, + "grad_norm": 0.33777227246528174, + "learning_rate": 8.32e-06, + "loss": 0.3214, + "step": 625 + }, + { + "epoch": 0.0500789984200316, + "grad_norm": 0.3364393345256382, + "learning_rate": 8.333333333333334e-06, + "loss": 0.308, + "step": 626 + }, + { + "epoch": 0.0501589968200636, + "grad_norm": 0.34732079135653326, + "learning_rate": 8.346666666666668e-06, + "loss": 0.2888, + "step": 627 + }, + { + "epoch": 0.050238995220095596, + "grad_norm": 0.2930826081982636, + "learning_rate": 8.36e-06, + "loss": 0.3033, + "step": 628 + }, + { + "epoch": 0.050318993620127594, + "grad_norm": 0.25469903467158445, + "learning_rate": 8.373333333333335e-06, + "loss": 0.3325, + "step": 629 + }, + { + "epoch": 0.0503989920201596, + "grad_norm": 0.2842973464502169, + "learning_rate": 8.386666666666667e-06, + "loss": 0.3279, + "step": 630 + }, + { + "epoch": 0.0504789904201916, + "grad_norm": 0.22195792662400626, + "learning_rate": 8.400000000000001e-06, + "loss": 0.3773, + "step": 631 + }, + { + "epoch": 0.050558988820223597, + "grad_norm": 0.27740979336171634, + "learning_rate": 8.413333333333335e-06, + "loss": 0.3384, + "step": 632 + }, + { + "epoch": 0.050638987220255595, + "grad_norm": 0.3732154224552938, + "learning_rate": 8.426666666666667e-06, + "loss": 0.2965, + "step": 633 + }, + { + "epoch": 0.05071898562028759, + "grad_norm": 0.28266568420922794, + "learning_rate": 8.44e-06, + "loss": 0.3302, + "step": 634 + }, + { + "epoch": 0.05079898402031959, + "grad_norm": 0.27510971592016004, + "learning_rate": 8.453333333333334e-06, + "loss": 0.3511, + "step": 635 + }, + { + "epoch": 0.05087898242035159, + "grad_norm": 0.28020366293303306, + "learning_rate": 8.466666666666668e-06, + "loss": 0.3422, + "step": 636 + }, + { + "epoch": 0.050958980820383595, + "grad_norm": 0.8196942413526801, + "learning_rate": 8.48e-06, + "loss": 0.3521, + "step": 637 + }, + { + "epoch": 0.051038979220415594, + "grad_norm": 0.36368109926784503, + "learning_rate": 8.493333333333334e-06, + "loss": 0.3071, + "step": 638 + }, + { + "epoch": 0.05111897762044759, + "grad_norm": 0.35538100915584425, + "learning_rate": 8.506666666666668e-06, + "loss": 0.3404, + "step": 639 + }, + { + "epoch": 0.05119897602047959, + "grad_norm": 0.3619070679729844, + "learning_rate": 8.52e-06, + "loss": 0.2795, + "step": 640 + }, + { + "epoch": 0.05127897442051159, + "grad_norm": 0.31923621030184124, + "learning_rate": 8.533333333333335e-06, + "loss": 0.3077, + "step": 641 + }, + { + "epoch": 0.05135897282054359, + "grad_norm": 0.41536988325033963, + "learning_rate": 8.546666666666667e-06, + "loss": 0.2871, + "step": 642 + }, + { + "epoch": 0.051438971220575586, + "grad_norm": 0.29831981671162405, + "learning_rate": 8.560000000000001e-06, + "loss": 0.3182, + "step": 643 + }, + { + "epoch": 0.05151896962060759, + "grad_norm": 0.3253842821903802, + "learning_rate": 8.573333333333333e-06, + "loss": 0.29, + "step": 644 + }, + { + "epoch": 0.05159896802063959, + "grad_norm": 0.3124173721292992, + "learning_rate": 8.586666666666667e-06, + "loss": 0.3298, + "step": 645 + }, + { + "epoch": 0.05167896642067159, + "grad_norm": 0.29722686222543593, + "learning_rate": 8.6e-06, + "loss": 0.3067, + "step": 646 + }, + { + "epoch": 0.051758964820703586, + "grad_norm": 0.2283915129641728, + "learning_rate": 8.613333333333333e-06, + "loss": 0.368, + "step": 647 + }, + { + "epoch": 0.051838963220735584, + "grad_norm": 0.340931476791012, + "learning_rate": 8.626666666666668e-06, + "loss": 0.3222, + "step": 648 + }, + { + "epoch": 0.05191896162076758, + "grad_norm": 0.3005000002803097, + "learning_rate": 8.64e-06, + "loss": 0.362, + "step": 649 + }, + { + "epoch": 0.05199896002079958, + "grad_norm": 0.2879090524411462, + "learning_rate": 8.653333333333334e-06, + "loss": 0.3257, + "step": 650 + }, + { + "epoch": 0.052078958420831586, + "grad_norm": 0.256151410774499, + "learning_rate": 8.666666666666668e-06, + "loss": 0.3463, + "step": 651 + }, + { + "epoch": 0.052158956820863585, + "grad_norm": 0.35244110018552693, + "learning_rate": 8.68e-06, + "loss": 0.3078, + "step": 652 + }, + { + "epoch": 0.05223895522089558, + "grad_norm": 0.29949410599902526, + "learning_rate": 8.693333333333334e-06, + "loss": 0.3236, + "step": 653 + }, + { + "epoch": 0.05231895362092758, + "grad_norm": 0.31571726448551496, + "learning_rate": 8.706666666666667e-06, + "loss": 0.3203, + "step": 654 + }, + { + "epoch": 0.05239895202095958, + "grad_norm": 0.3009203092976193, + "learning_rate": 8.720000000000001e-06, + "loss": 0.3388, + "step": 655 + }, + { + "epoch": 0.05247895042099158, + "grad_norm": 0.27707513386666965, + "learning_rate": 8.733333333333333e-06, + "loss": 0.3357, + "step": 656 + }, + { + "epoch": 0.05255894882102358, + "grad_norm": 0.3327018041628904, + "learning_rate": 8.746666666666667e-06, + "loss": 0.2939, + "step": 657 + }, + { + "epoch": 0.05263894722105558, + "grad_norm": 0.31569376380067976, + "learning_rate": 8.76e-06, + "loss": 0.2864, + "step": 658 + }, + { + "epoch": 0.05271894562108758, + "grad_norm": 0.3372875049163226, + "learning_rate": 8.773333333333333e-06, + "loss": 0.2851, + "step": 659 + }, + { + "epoch": 0.05279894402111958, + "grad_norm": 0.35105142744664414, + "learning_rate": 8.786666666666668e-06, + "loss": 0.3019, + "step": 660 + }, + { + "epoch": 0.05287894242115158, + "grad_norm": 0.2992783020773157, + "learning_rate": 8.8e-06, + "loss": 0.3409, + "step": 661 + }, + { + "epoch": 0.052958940821183575, + "grad_norm": 0.29530772009626577, + "learning_rate": 8.813333333333334e-06, + "loss": 0.339, + "step": 662 + }, + { + "epoch": 0.053038939221215574, + "grad_norm": 0.34242968981634675, + "learning_rate": 8.826666666666668e-06, + "loss": 0.3031, + "step": 663 + }, + { + "epoch": 0.05311893762124757, + "grad_norm": 0.3276982950077085, + "learning_rate": 8.84e-06, + "loss": 0.3344, + "step": 664 + }, + { + "epoch": 0.05319893602127958, + "grad_norm": 0.25124228591032477, + "learning_rate": 8.853333333333334e-06, + "loss": 0.345, + "step": 665 + }, + { + "epoch": 0.053278934421311576, + "grad_norm": 0.2707954693498505, + "learning_rate": 8.866666666666668e-06, + "loss": 0.3589, + "step": 666 + }, + { + "epoch": 0.053358932821343574, + "grad_norm": 0.27919039500251086, + "learning_rate": 8.880000000000001e-06, + "loss": 0.3086, + "step": 667 + }, + { + "epoch": 0.05343893122137557, + "grad_norm": 0.3299186580410715, + "learning_rate": 8.893333333333333e-06, + "loss": 0.3082, + "step": 668 + }, + { + "epoch": 0.05351892962140757, + "grad_norm": 0.41990335225219644, + "learning_rate": 8.906666666666667e-06, + "loss": 0.3102, + "step": 669 + }, + { + "epoch": 0.05359892802143957, + "grad_norm": 0.28599965643564496, + "learning_rate": 8.920000000000001e-06, + "loss": 0.3164, + "step": 670 + }, + { + "epoch": 0.05367892642147157, + "grad_norm": 0.34808391734402944, + "learning_rate": 8.933333333333333e-06, + "loss": 0.3217, + "step": 671 + }, + { + "epoch": 0.05375892482150357, + "grad_norm": 0.254439022609339, + "learning_rate": 8.946666666666669e-06, + "loss": 0.322, + "step": 672 + }, + { + "epoch": 0.05383892322153557, + "grad_norm": 0.37020766567084756, + "learning_rate": 8.96e-06, + "loss": 0.2955, + "step": 673 + }, + { + "epoch": 0.05391892162156757, + "grad_norm": 0.2849948791622441, + "learning_rate": 8.973333333333334e-06, + "loss": 0.3409, + "step": 674 + }, + { + "epoch": 0.05399892002159957, + "grad_norm": 0.2507800065229427, + "learning_rate": 8.986666666666666e-06, + "loss": 0.3189, + "step": 675 + }, + { + "epoch": 0.05407891842163157, + "grad_norm": 0.3572472676386684, + "learning_rate": 9e-06, + "loss": 0.2764, + "step": 676 + }, + { + "epoch": 0.054158916821663565, + "grad_norm": 0.2720468222677146, + "learning_rate": 9.013333333333334e-06, + "loss": 0.3474, + "step": 677 + }, + { + "epoch": 0.05423891522169556, + "grad_norm": 0.3092085221001216, + "learning_rate": 9.026666666666666e-06, + "loss": 0.3042, + "step": 678 + }, + { + "epoch": 0.05431891362172757, + "grad_norm": 0.5220723866063631, + "learning_rate": 9.040000000000002e-06, + "loss": 0.3025, + "step": 679 + }, + { + "epoch": 0.05439891202175957, + "grad_norm": 0.2952834986414631, + "learning_rate": 9.053333333333334e-06, + "loss": 0.3322, + "step": 680 + }, + { + "epoch": 0.054478910421791565, + "grad_norm": 0.28083119974941356, + "learning_rate": 9.066666666666667e-06, + "loss": 0.2991, + "step": 681 + }, + { + "epoch": 0.054558908821823564, + "grad_norm": 0.33732197148992027, + "learning_rate": 9.080000000000001e-06, + "loss": 0.2905, + "step": 682 + }, + { + "epoch": 0.05463890722185556, + "grad_norm": 0.3984458691919837, + "learning_rate": 9.093333333333333e-06, + "loss": 0.2884, + "step": 683 + }, + { + "epoch": 0.05471890562188756, + "grad_norm": 0.2510279050525228, + "learning_rate": 9.106666666666667e-06, + "loss": 0.3415, + "step": 684 + }, + { + "epoch": 0.05479890402191956, + "grad_norm": 0.32958914924248883, + "learning_rate": 9.12e-06, + "loss": 0.283, + "step": 685 + }, + { + "epoch": 0.054878902421951564, + "grad_norm": 0.30974303957340654, + "learning_rate": 9.133333333333335e-06, + "loss": 0.3015, + "step": 686 + }, + { + "epoch": 0.05495890082198356, + "grad_norm": 0.34455977206231336, + "learning_rate": 9.146666666666667e-06, + "loss": 0.2768, + "step": 687 + }, + { + "epoch": 0.05503889922201556, + "grad_norm": 0.3332914051398539, + "learning_rate": 9.16e-06, + "loss": 0.3043, + "step": 688 + }, + { + "epoch": 0.05511889762204756, + "grad_norm": 0.25870417417674735, + "learning_rate": 9.173333333333334e-06, + "loss": 0.3549, + "step": 689 + }, + { + "epoch": 0.05519889602207956, + "grad_norm": 0.27113924945741075, + "learning_rate": 9.186666666666666e-06, + "loss": 0.3326, + "step": 690 + }, + { + "epoch": 0.055278894422111556, + "grad_norm": 0.35059384838302976, + "learning_rate": 9.200000000000002e-06, + "loss": 0.2838, + "step": 691 + }, + { + "epoch": 0.055358892822143554, + "grad_norm": 1.8528370686227036, + "learning_rate": 9.213333333333334e-06, + "loss": 0.3087, + "step": 692 + }, + { + "epoch": 0.05543889122217556, + "grad_norm": 0.4118964907754694, + "learning_rate": 9.226666666666668e-06, + "loss": 0.2844, + "step": 693 + }, + { + "epoch": 0.05551888962220756, + "grad_norm": 0.30237810948877875, + "learning_rate": 9.240000000000001e-06, + "loss": 0.3092, + "step": 694 + }, + { + "epoch": 0.055598888022239557, + "grad_norm": 0.3392299230775422, + "learning_rate": 9.253333333333333e-06, + "loss": 0.2874, + "step": 695 + }, + { + "epoch": 0.055678886422271555, + "grad_norm": 0.3128477107597521, + "learning_rate": 9.266666666666667e-06, + "loss": 0.2906, + "step": 696 + }, + { + "epoch": 0.05575888482230355, + "grad_norm": 0.307323897115656, + "learning_rate": 9.280000000000001e-06, + "loss": 0.3326, + "step": 697 + }, + { + "epoch": 0.05583888322233555, + "grad_norm": 0.3244991064595367, + "learning_rate": 9.293333333333335e-06, + "loss": 0.3306, + "step": 698 + }, + { + "epoch": 0.05591888162236755, + "grad_norm": 0.2926053870136065, + "learning_rate": 9.306666666666667e-06, + "loss": 0.3141, + "step": 699 + }, + { + "epoch": 0.055998880022399555, + "grad_norm": 0.2421416783310131, + "learning_rate": 9.32e-06, + "loss": 0.3441, + "step": 700 + }, + { + "epoch": 0.056078878422431554, + "grad_norm": 0.3690062702559569, + "learning_rate": 9.333333333333334e-06, + "loss": 0.3224, + "step": 701 + }, + { + "epoch": 0.05615887682246355, + "grad_norm": 0.36388521151178416, + "learning_rate": 9.346666666666666e-06, + "loss": 0.3045, + "step": 702 + }, + { + "epoch": 0.05623887522249555, + "grad_norm": 0.2527855713869621, + "learning_rate": 9.360000000000002e-06, + "loss": 0.3288, + "step": 703 + }, + { + "epoch": 0.05631887362252755, + "grad_norm": 0.2585172517011842, + "learning_rate": 9.373333333333334e-06, + "loss": 0.3428, + "step": 704 + }, + { + "epoch": 0.05639887202255955, + "grad_norm": 0.2604854393414402, + "learning_rate": 9.386666666666668e-06, + "loss": 0.3439, + "step": 705 + }, + { + "epoch": 0.056478870422591546, + "grad_norm": 0.3183776494016085, + "learning_rate": 9.4e-06, + "loss": 0.3031, + "step": 706 + }, + { + "epoch": 0.05655886882262355, + "grad_norm": 0.23453220047838064, + "learning_rate": 9.413333333333334e-06, + "loss": 0.3445, + "step": 707 + }, + { + "epoch": 0.05663886722265555, + "grad_norm": 0.3072292931743912, + "learning_rate": 9.426666666666667e-06, + "loss": 0.3194, + "step": 708 + }, + { + "epoch": 0.05671886562268755, + "grad_norm": 0.29486187302040784, + "learning_rate": 9.440000000000001e-06, + "loss": 0.3, + "step": 709 + }, + { + "epoch": 0.056798864022719546, + "grad_norm": 0.30440180669587297, + "learning_rate": 9.453333333333335e-06, + "loss": 0.3104, + "step": 710 + }, + { + "epoch": 0.056878862422751544, + "grad_norm": 0.33928836943813184, + "learning_rate": 9.466666666666667e-06, + "loss": 0.3202, + "step": 711 + }, + { + "epoch": 0.05695886082278354, + "grad_norm": 0.32879885154079164, + "learning_rate": 9.48e-06, + "loss": 0.2803, + "step": 712 + }, + { + "epoch": 0.05703885922281554, + "grad_norm": 0.26156127123572537, + "learning_rate": 9.493333333333334e-06, + "loss": 0.3397, + "step": 713 + }, + { + "epoch": 0.057118857622847546, + "grad_norm": 0.2962002522808446, + "learning_rate": 9.506666666666667e-06, + "loss": 0.3196, + "step": 714 + }, + { + "epoch": 0.057198856022879545, + "grad_norm": 0.25384069215962385, + "learning_rate": 9.52e-06, + "loss": 0.3138, + "step": 715 + }, + { + "epoch": 0.05727885442291154, + "grad_norm": 0.350693396161863, + "learning_rate": 9.533333333333334e-06, + "loss": 0.2698, + "step": 716 + }, + { + "epoch": 0.05735885282294354, + "grad_norm": 0.28837578069701103, + "learning_rate": 9.546666666666668e-06, + "loss": 0.3151, + "step": 717 + }, + { + "epoch": 0.05743885122297554, + "grad_norm": 0.27555420357623844, + "learning_rate": 9.56e-06, + "loss": 0.2947, + "step": 718 + }, + { + "epoch": 0.05751884962300754, + "grad_norm": 0.33600184027855773, + "learning_rate": 9.573333333333334e-06, + "loss": 0.2912, + "step": 719 + }, + { + "epoch": 0.05759884802303954, + "grad_norm": 0.3131343156054618, + "learning_rate": 9.586666666666667e-06, + "loss": 0.2845, + "step": 720 + }, + { + "epoch": 0.05767884642307154, + "grad_norm": 0.32839701241404035, + "learning_rate": 9.600000000000001e-06, + "loss": 0.2799, + "step": 721 + }, + { + "epoch": 0.05775884482310354, + "grad_norm": 0.3212106140341321, + "learning_rate": 9.613333333333335e-06, + "loss": 0.3217, + "step": 722 + }, + { + "epoch": 0.05783884322313554, + "grad_norm": 0.25252461776454266, + "learning_rate": 9.626666666666667e-06, + "loss": 0.3353, + "step": 723 + }, + { + "epoch": 0.05791884162316754, + "grad_norm": 0.24037883844139904, + "learning_rate": 9.640000000000001e-06, + "loss": 0.3451, + "step": 724 + }, + { + "epoch": 0.057998840023199535, + "grad_norm": 0.31095952895557355, + "learning_rate": 9.653333333333335e-06, + "loss": 0.305, + "step": 725 + }, + { + "epoch": 0.058078838423231534, + "grad_norm": 0.33485933628816067, + "learning_rate": 9.666666666666667e-06, + "loss": 0.3334, + "step": 726 + }, + { + "epoch": 0.05815883682326353, + "grad_norm": 0.31985505907968914, + "learning_rate": 9.68e-06, + "loss": 0.328, + "step": 727 + }, + { + "epoch": 0.05823883522329554, + "grad_norm": 0.3774757991027118, + "learning_rate": 9.693333333333334e-06, + "loss": 0.3069, + "step": 728 + }, + { + "epoch": 0.058318833623327536, + "grad_norm": 0.32543648008073467, + "learning_rate": 9.706666666666668e-06, + "loss": 0.3202, + "step": 729 + }, + { + "epoch": 0.058398832023359534, + "grad_norm": 0.35242843957213776, + "learning_rate": 9.72e-06, + "loss": 0.2975, + "step": 730 + }, + { + "epoch": 0.05847883042339153, + "grad_norm": 0.27290230400053705, + "learning_rate": 9.733333333333334e-06, + "loss": 0.3033, + "step": 731 + }, + { + "epoch": 0.05855882882342353, + "grad_norm": 0.21370320862999415, + "learning_rate": 9.746666666666668e-06, + "loss": 0.4041, + "step": 732 + }, + { + "epoch": 0.05863882722345553, + "grad_norm": 0.3063261628055901, + "learning_rate": 9.760000000000001e-06, + "loss": 0.2943, + "step": 733 + }, + { + "epoch": 0.05871882562348753, + "grad_norm": 0.2958569286224122, + "learning_rate": 9.773333333333335e-06, + "loss": 0.3331, + "step": 734 + }, + { + "epoch": 0.05879882402351953, + "grad_norm": 0.5851582264072526, + "learning_rate": 9.786666666666667e-06, + "loss": 0.2883, + "step": 735 + }, + { + "epoch": 0.05887882242355153, + "grad_norm": 0.3256544060883519, + "learning_rate": 9.800000000000001e-06, + "loss": 0.2764, + "step": 736 + }, + { + "epoch": 0.05895882082358353, + "grad_norm": 0.3329371427491858, + "learning_rate": 9.813333333333333e-06, + "loss": 0.2837, + "step": 737 + }, + { + "epoch": 0.05903881922361553, + "grad_norm": 0.3101892312902451, + "learning_rate": 9.826666666666667e-06, + "loss": 0.3351, + "step": 738 + }, + { + "epoch": 0.05911881762364753, + "grad_norm": 0.33160958979871064, + "learning_rate": 9.84e-06, + "loss": 0.272, + "step": 739 + }, + { + "epoch": 0.059198816023679525, + "grad_norm": 0.3329108268349368, + "learning_rate": 9.853333333333334e-06, + "loss": 0.3371, + "step": 740 + }, + { + "epoch": 0.05927881442371152, + "grad_norm": 0.3233304504191216, + "learning_rate": 9.866666666666668e-06, + "loss": 0.2868, + "step": 741 + }, + { + "epoch": 0.05935881282374352, + "grad_norm": 0.30563238052838554, + "learning_rate": 9.88e-06, + "loss": 0.2778, + "step": 742 + }, + { + "epoch": 0.05943881122377553, + "grad_norm": 0.33017617138630784, + "learning_rate": 9.893333333333334e-06, + "loss": 0.3032, + "step": 743 + }, + { + "epoch": 0.059518809623807525, + "grad_norm": 0.34472193365708464, + "learning_rate": 9.906666666666668e-06, + "loss": 0.278, + "step": 744 + }, + { + "epoch": 0.059598808023839524, + "grad_norm": 0.33325594383614676, + "learning_rate": 9.920000000000002e-06, + "loss": 0.2658, + "step": 745 + }, + { + "epoch": 0.05967880642387152, + "grad_norm": 0.3160160101487113, + "learning_rate": 9.933333333333334e-06, + "loss": 0.317, + "step": 746 + }, + { + "epoch": 0.05975880482390352, + "grad_norm": 0.2902744659795111, + "learning_rate": 9.946666666666667e-06, + "loss": 0.3025, + "step": 747 + }, + { + "epoch": 0.05983880322393552, + "grad_norm": 0.2791714670562879, + "learning_rate": 9.960000000000001e-06, + "loss": 0.3458, + "step": 748 + }, + { + "epoch": 0.05991880162396752, + "grad_norm": 0.33271426430696055, + "learning_rate": 9.973333333333333e-06, + "loss": 0.2788, + "step": 749 + }, + { + "epoch": 0.05999880002399952, + "grad_norm": 0.28766011324461654, + "learning_rate": 9.986666666666667e-06, + "loss": 0.3159, + "step": 750 + }, + { + "epoch": 0.06007879842403152, + "grad_norm": 0.3526844716665321, + "learning_rate": 1e-05, + "loss": 0.2866, + "step": 751 + }, + { + "epoch": 0.06015879682406352, + "grad_norm": 0.34454130736303923, + "learning_rate": 9.999999958041858e-06, + "loss": 0.3015, + "step": 752 + }, + { + "epoch": 0.06023879522409552, + "grad_norm": 0.30040506594110644, + "learning_rate": 9.999999832167426e-06, + "loss": 0.3348, + "step": 753 + }, + { + "epoch": 0.060318793624127516, + "grad_norm": 0.304284929154205, + "learning_rate": 9.99999962237671e-06, + "loss": 0.3215, + "step": 754 + }, + { + "epoch": 0.060398792024159514, + "grad_norm": 0.29279485871493055, + "learning_rate": 9.999999328669713e-06, + "loss": 0.3369, + "step": 755 + }, + { + "epoch": 0.06047879042419151, + "grad_norm": 0.3386829020012594, + "learning_rate": 9.999998951046439e-06, + "loss": 0.2944, + "step": 756 + }, + { + "epoch": 0.06055878882422352, + "grad_norm": 0.29152306993283794, + "learning_rate": 9.999998489506897e-06, + "loss": 0.3564, + "step": 757 + }, + { + "epoch": 0.060638787224255516, + "grad_norm": 0.33479080452143656, + "learning_rate": 9.999997944051089e-06, + "loss": 0.2823, + "step": 758 + }, + { + "epoch": 0.060718785624287515, + "grad_norm": 0.29891410542596814, + "learning_rate": 9.999997314679031e-06, + "loss": 0.2931, + "step": 759 + }, + { + "epoch": 0.06079878402431951, + "grad_norm": 0.3065257606257566, + "learning_rate": 9.99999660139073e-06, + "loss": 0.285, + "step": 760 + }, + { + "epoch": 0.06087878242435151, + "grad_norm": 0.3465903381136881, + "learning_rate": 9.999995804186196e-06, + "loss": 0.2694, + "step": 761 + }, + { + "epoch": 0.06095878082438351, + "grad_norm": 0.2949167845266175, + "learning_rate": 9.999994923065446e-06, + "loss": 0.3105, + "step": 762 + }, + { + "epoch": 0.06103877922441551, + "grad_norm": 0.35584238865904855, + "learning_rate": 9.999993958028495e-06, + "loss": 0.3081, + "step": 763 + }, + { + "epoch": 0.061118777624447514, + "grad_norm": 0.33811528199246277, + "learning_rate": 9.999992909075355e-06, + "loss": 0.2744, + "step": 764 + }, + { + "epoch": 0.06119877602447951, + "grad_norm": 0.28601475491219797, + "learning_rate": 9.999991776206049e-06, + "loss": 0.3231, + "step": 765 + }, + { + "epoch": 0.06127877442451151, + "grad_norm": 0.32422075004554857, + "learning_rate": 9.999990559420591e-06, + "loss": 0.32, + "step": 766 + }, + { + "epoch": 0.06135877282454351, + "grad_norm": 0.3026685579515361, + "learning_rate": 9.999989258719005e-06, + "loss": 0.3241, + "step": 767 + }, + { + "epoch": 0.06143877122457551, + "grad_norm": 0.3338005490502487, + "learning_rate": 9.999987874101312e-06, + "loss": 0.2935, + "step": 768 + }, + { + "epoch": 0.061518769624607506, + "grad_norm": 0.27249934087171196, + "learning_rate": 9.999986405567535e-06, + "loss": 0.3425, + "step": 769 + }, + { + "epoch": 0.061598768024639504, + "grad_norm": 0.4260320079450853, + "learning_rate": 9.999984853117697e-06, + "loss": 0.3268, + "step": 770 + }, + { + "epoch": 0.06167876642467151, + "grad_norm": 0.4692360536772432, + "learning_rate": 9.999983216751826e-06, + "loss": 0.303, + "step": 771 + }, + { + "epoch": 0.06175876482470351, + "grad_norm": 0.5665946202088298, + "learning_rate": 9.99998149646995e-06, + "loss": 0.3169, + "step": 772 + }, + { + "epoch": 0.061838763224735506, + "grad_norm": 0.3633678361199125, + "learning_rate": 9.999979692272095e-06, + "loss": 0.2867, + "step": 773 + }, + { + "epoch": 0.061918761624767504, + "grad_norm": 0.27166581987426736, + "learning_rate": 9.999977804158294e-06, + "loss": 0.3083, + "step": 774 + }, + { + "epoch": 0.0619987600247995, + "grad_norm": 0.2731532669064598, + "learning_rate": 9.999975832128578e-06, + "loss": 0.3336, + "step": 775 + }, + { + "epoch": 0.0620787584248315, + "grad_norm": 0.28301758958492174, + "learning_rate": 9.99997377618298e-06, + "loss": 0.3149, + "step": 776 + }, + { + "epoch": 0.0621587568248635, + "grad_norm": 0.28343763143867584, + "learning_rate": 9.999971636321535e-06, + "loss": 0.3325, + "step": 777 + }, + { + "epoch": 0.062238755224895505, + "grad_norm": 0.34574030222504953, + "learning_rate": 9.999969412544278e-06, + "loss": 0.2844, + "step": 778 + }, + { + "epoch": 0.0623187536249275, + "grad_norm": 0.2830608915094326, + "learning_rate": 9.999967104851244e-06, + "loss": 0.3435, + "step": 779 + }, + { + "epoch": 0.0623987520249595, + "grad_norm": 0.3365611821912759, + "learning_rate": 9.999964713242478e-06, + "loss": 0.3173, + "step": 780 + }, + { + "epoch": 0.0624787504249915, + "grad_norm": 0.2858577253840155, + "learning_rate": 9.999962237718015e-06, + "loss": 0.3093, + "step": 781 + }, + { + "epoch": 0.0625587488250235, + "grad_norm": 0.4533288064383276, + "learning_rate": 9.9999596782779e-06, + "loss": 0.3051, + "step": 782 + }, + { + "epoch": 0.0626387472250555, + "grad_norm": 0.34049192356213587, + "learning_rate": 9.99995703492217e-06, + "loss": 0.2879, + "step": 783 + }, + { + "epoch": 0.0627187456250875, + "grad_norm": 0.3012409240608873, + "learning_rate": 9.999954307650876e-06, + "loss": 0.34, + "step": 784 + }, + { + "epoch": 0.0627987440251195, + "grad_norm": 0.2968526314610915, + "learning_rate": 9.999951496464062e-06, + "loss": 0.3266, + "step": 785 + }, + { + "epoch": 0.0628787424251515, + "grad_norm": 0.3210616140957018, + "learning_rate": 9.999948601361773e-06, + "loss": 0.3067, + "step": 786 + }, + { + "epoch": 0.06295874082518349, + "grad_norm": 0.3521520582792855, + "learning_rate": 9.999945622344058e-06, + "loss": 0.3015, + "step": 787 + }, + { + "epoch": 0.0630387392252155, + "grad_norm": 0.31400764020618854, + "learning_rate": 9.99994255941097e-06, + "loss": 0.2778, + "step": 788 + }, + { + "epoch": 0.0631187376252475, + "grad_norm": 1.1320894505713925, + "learning_rate": 9.999939412562558e-06, + "loss": 0.296, + "step": 789 + }, + { + "epoch": 0.06319873602527949, + "grad_norm": 0.4474003210841208, + "learning_rate": 9.999936181798874e-06, + "loss": 0.2894, + "step": 790 + }, + { + "epoch": 0.0632787344253115, + "grad_norm": 0.33719606241734273, + "learning_rate": 9.999932867119974e-06, + "loss": 0.2848, + "step": 791 + }, + { + "epoch": 0.06335873282534349, + "grad_norm": 0.32164119945542424, + "learning_rate": 9.999929468525914e-06, + "loss": 0.2966, + "step": 792 + }, + { + "epoch": 0.0634387312253755, + "grad_norm": 0.35482592497914334, + "learning_rate": 9.999925986016748e-06, + "loss": 0.3034, + "step": 793 + }, + { + "epoch": 0.06351872962540749, + "grad_norm": 0.32487914889490105, + "learning_rate": 9.999922419592537e-06, + "loss": 0.3028, + "step": 794 + }, + { + "epoch": 0.06359872802543949, + "grad_norm": 0.3343689325929624, + "learning_rate": 9.99991876925334e-06, + "loss": 0.3105, + "step": 795 + }, + { + "epoch": 0.0636787264254715, + "grad_norm": 0.26959506618921114, + "learning_rate": 9.99991503499922e-06, + "loss": 0.3778, + "step": 796 + }, + { + "epoch": 0.06375872482550349, + "grad_norm": 0.2615678084045503, + "learning_rate": 9.999911216830239e-06, + "loss": 0.3158, + "step": 797 + }, + { + "epoch": 0.06383872322553549, + "grad_norm": 0.29678374459825013, + "learning_rate": 9.999907314746457e-06, + "loss": 0.3116, + "step": 798 + }, + { + "epoch": 0.06391872162556748, + "grad_norm": 0.3122237853890146, + "learning_rate": 9.999903328747946e-06, + "loss": 0.3171, + "step": 799 + }, + { + "epoch": 0.06399872002559949, + "grad_norm": 0.35289122075626556, + "learning_rate": 9.999899258834769e-06, + "loss": 0.2967, + "step": 800 + }, + { + "epoch": 0.06407871842563148, + "grad_norm": 0.3339393468164776, + "learning_rate": 9.999895105006995e-06, + "loss": 0.2837, + "step": 801 + }, + { + "epoch": 0.06415871682566349, + "grad_norm": 0.33901067013543945, + "learning_rate": 9.999890867264693e-06, + "loss": 0.2929, + "step": 802 + }, + { + "epoch": 0.06423871522569549, + "grad_norm": 0.49212948840384874, + "learning_rate": 9.999886545607935e-06, + "loss": 0.2754, + "step": 803 + }, + { + "epoch": 0.06431871362572748, + "grad_norm": 0.3613725725946296, + "learning_rate": 9.999882140036794e-06, + "loss": 0.3092, + "step": 804 + }, + { + "epoch": 0.06439871202575949, + "grad_norm": 0.3122380182663793, + "learning_rate": 9.999877650551344e-06, + "loss": 0.2954, + "step": 805 + }, + { + "epoch": 0.06447871042579148, + "grad_norm": 0.35452566924458495, + "learning_rate": 9.999873077151659e-06, + "loss": 0.2986, + "step": 806 + }, + { + "epoch": 0.06455870882582349, + "grad_norm": 0.32091417039466086, + "learning_rate": 9.999868419837818e-06, + "loss": 0.3032, + "step": 807 + }, + { + "epoch": 0.06463870722585548, + "grad_norm": 0.33589821110072005, + "learning_rate": 9.999863678609895e-06, + "loss": 0.2753, + "step": 808 + }, + { + "epoch": 0.06471870562588748, + "grad_norm": 0.3885732555710655, + "learning_rate": 9.999858853467972e-06, + "loss": 0.2885, + "step": 809 + }, + { + "epoch": 0.06479870402591949, + "grad_norm": 0.3000688135501107, + "learning_rate": 9.999853944412133e-06, + "loss": 0.3396, + "step": 810 + }, + { + "epoch": 0.06487870242595148, + "grad_norm": 0.31908088386782457, + "learning_rate": 9.999848951442455e-06, + "loss": 0.2678, + "step": 811 + }, + { + "epoch": 0.06495870082598348, + "grad_norm": 0.32998759142285744, + "learning_rate": 9.999843874559026e-06, + "loss": 0.2746, + "step": 812 + }, + { + "epoch": 0.06503869922601548, + "grad_norm": 0.28369174102258066, + "learning_rate": 9.99983871376193e-06, + "loss": 0.3381, + "step": 813 + }, + { + "epoch": 0.06511869762604748, + "grad_norm": 0.36153579901620225, + "learning_rate": 9.999833469051251e-06, + "loss": 0.2793, + "step": 814 + }, + { + "epoch": 0.06519869602607947, + "grad_norm": 0.304507852677711, + "learning_rate": 9.999828140427082e-06, + "loss": 0.3347, + "step": 815 + }, + { + "epoch": 0.06527869442611148, + "grad_norm": 0.3749989558189015, + "learning_rate": 9.999822727889507e-06, + "loss": 0.2984, + "step": 816 + }, + { + "epoch": 0.06535869282614348, + "grad_norm": 0.3097756171081361, + "learning_rate": 9.99981723143862e-06, + "loss": 0.3062, + "step": 817 + }, + { + "epoch": 0.06543869122617547, + "grad_norm": 0.31722382003466015, + "learning_rate": 9.999811651074513e-06, + "loss": 0.292, + "step": 818 + }, + { + "epoch": 0.06551868962620748, + "grad_norm": 0.2597025573948709, + "learning_rate": 9.99980598679728e-06, + "loss": 0.3213, + "step": 819 + }, + { + "epoch": 0.06559868802623947, + "grad_norm": 0.21773321585146505, + "learning_rate": 9.999800238607017e-06, + "loss": 0.376, + "step": 820 + }, + { + "epoch": 0.06567868642627148, + "grad_norm": 0.24386620927410277, + "learning_rate": 9.999794406503816e-06, + "loss": 0.3245, + "step": 821 + }, + { + "epoch": 0.06575868482630347, + "grad_norm": 0.2836226912888357, + "learning_rate": 9.99978849048778e-06, + "loss": 0.3109, + "step": 822 + }, + { + "epoch": 0.06583868322633547, + "grad_norm": 0.30392952183385813, + "learning_rate": 9.999782490559004e-06, + "loss": 0.3184, + "step": 823 + }, + { + "epoch": 0.06591868162636748, + "grad_norm": 0.25329092060401626, + "learning_rate": 9.999776406717592e-06, + "loss": 0.3421, + "step": 824 + }, + { + "epoch": 0.06599868002639947, + "grad_norm": 0.3286974482373493, + "learning_rate": 9.999770238963646e-06, + "loss": 0.3007, + "step": 825 + }, + { + "epoch": 0.06607867842643148, + "grad_norm": 0.3357618677234781, + "learning_rate": 9.999763987297266e-06, + "loss": 0.2904, + "step": 826 + }, + { + "epoch": 0.06615867682646347, + "grad_norm": 0.288379407718952, + "learning_rate": 9.999757651718562e-06, + "loss": 0.2949, + "step": 827 + }, + { + "epoch": 0.06623867522649547, + "grad_norm": 0.3307194139454798, + "learning_rate": 9.999751232227636e-06, + "loss": 0.3177, + "step": 828 + }, + { + "epoch": 0.06631867362652746, + "grad_norm": 0.2858103872043677, + "learning_rate": 9.999744728824599e-06, + "loss": 0.3374, + "step": 829 + }, + { + "epoch": 0.06639867202655947, + "grad_norm": 0.28295822925883035, + "learning_rate": 9.999738141509557e-06, + "loss": 0.3543, + "step": 830 + }, + { + "epoch": 0.06647867042659147, + "grad_norm": 0.3407271227598823, + "learning_rate": 9.999731470282621e-06, + "loss": 0.2914, + "step": 831 + }, + { + "epoch": 0.06655866882662347, + "grad_norm": 0.2771287086614404, + "learning_rate": 9.999724715143908e-06, + "loss": 0.3129, + "step": 832 + }, + { + "epoch": 0.06663866722665547, + "grad_norm": 0.28489926698097295, + "learning_rate": 9.999717876093525e-06, + "loss": 0.3131, + "step": 833 + }, + { + "epoch": 0.06671866562668746, + "grad_norm": 0.3325025628924701, + "learning_rate": 9.999710953131589e-06, + "loss": 0.2799, + "step": 834 + }, + { + "epoch": 0.06679866402671947, + "grad_norm": 0.30464812119369733, + "learning_rate": 9.999703946258217e-06, + "loss": 0.2861, + "step": 835 + }, + { + "epoch": 0.06687866242675146, + "grad_norm": 0.26117995376252917, + "learning_rate": 9.999696855473525e-06, + "loss": 0.3587, + "step": 836 + }, + { + "epoch": 0.06695866082678346, + "grad_norm": 0.30527132252110845, + "learning_rate": 9.999689680777634e-06, + "loss": 0.3159, + "step": 837 + }, + { + "epoch": 0.06703865922681547, + "grad_norm": 0.2961056326196707, + "learning_rate": 9.999682422170663e-06, + "loss": 0.3187, + "step": 838 + }, + { + "epoch": 0.06711865762684746, + "grad_norm": 0.316315169906044, + "learning_rate": 9.999675079652736e-06, + "loss": 0.3196, + "step": 839 + }, + { + "epoch": 0.06719865602687947, + "grad_norm": 0.31220143140115564, + "learning_rate": 9.999667653223972e-06, + "loss": 0.3285, + "step": 840 + }, + { + "epoch": 0.06727865442691146, + "grad_norm": 0.3268976513845286, + "learning_rate": 9.9996601428845e-06, + "loss": 0.2869, + "step": 841 + }, + { + "epoch": 0.06735865282694346, + "grad_norm": 0.24482625318580367, + "learning_rate": 9.999652548634443e-06, + "loss": 0.3519, + "step": 842 + }, + { + "epoch": 0.06743865122697545, + "grad_norm": 0.20743034223042126, + "learning_rate": 9.99964487047393e-06, + "loss": 0.3782, + "step": 843 + }, + { + "epoch": 0.06751864962700746, + "grad_norm": 0.4346332051521387, + "learning_rate": 9.999637108403091e-06, + "loss": 0.3216, + "step": 844 + }, + { + "epoch": 0.06759864802703947, + "grad_norm": 0.2880547520027332, + "learning_rate": 9.999629262422053e-06, + "loss": 0.3263, + "step": 845 + }, + { + "epoch": 0.06767864642707146, + "grad_norm": 0.30184781898213153, + "learning_rate": 9.99962133253095e-06, + "loss": 0.3083, + "step": 846 + }, + { + "epoch": 0.06775864482710346, + "grad_norm": 0.29343891247936804, + "learning_rate": 9.999613318729915e-06, + "loss": 0.3287, + "step": 847 + }, + { + "epoch": 0.06783864322713545, + "grad_norm": 0.3073275633377766, + "learning_rate": 9.999605221019082e-06, + "loss": 0.3199, + "step": 848 + }, + { + "epoch": 0.06791864162716746, + "grad_norm": 0.3560461036806395, + "learning_rate": 9.999597039398586e-06, + "loss": 0.3202, + "step": 849 + }, + { + "epoch": 0.06799864002719945, + "grad_norm": 0.6114206388246446, + "learning_rate": 9.999588773868566e-06, + "loss": 0.2802, + "step": 850 + }, + { + "epoch": 0.06807863842723146, + "grad_norm": 0.2923793972898796, + "learning_rate": 9.99958042442916e-06, + "loss": 0.3103, + "step": 851 + }, + { + "epoch": 0.06815863682726346, + "grad_norm": 0.2984755197747074, + "learning_rate": 9.999571991080508e-06, + "loss": 0.2977, + "step": 852 + }, + { + "epoch": 0.06823863522729545, + "grad_norm": 0.32157105170811334, + "learning_rate": 9.999563473822752e-06, + "loss": 0.3207, + "step": 853 + }, + { + "epoch": 0.06831863362732746, + "grad_norm": 0.32848645814233124, + "learning_rate": 9.999554872656034e-06, + "loss": 0.2515, + "step": 854 + }, + { + "epoch": 0.06839863202735945, + "grad_norm": 0.3585774135921888, + "learning_rate": 9.9995461875805e-06, + "loss": 0.3213, + "step": 855 + }, + { + "epoch": 0.06847863042739145, + "grad_norm": 0.3349066780603757, + "learning_rate": 9.999537418596294e-06, + "loss": 0.3034, + "step": 856 + }, + { + "epoch": 0.06855862882742345, + "grad_norm": 0.250654871136222, + "learning_rate": 9.999528565703564e-06, + "loss": 0.333, + "step": 857 + }, + { + "epoch": 0.06863862722745545, + "grad_norm": 0.2797080485039279, + "learning_rate": 9.99951962890246e-06, + "loss": 0.3367, + "step": 858 + }, + { + "epoch": 0.06871862562748746, + "grad_norm": 0.35543046820929536, + "learning_rate": 9.999510608193128e-06, + "loss": 0.2826, + "step": 859 + }, + { + "epoch": 0.06879862402751945, + "grad_norm": 0.47537950747569424, + "learning_rate": 9.999501503575723e-06, + "loss": 0.3151, + "step": 860 + }, + { + "epoch": 0.06887862242755145, + "grad_norm": 0.2941357899505503, + "learning_rate": 9.999492315050396e-06, + "loss": 0.327, + "step": 861 + }, + { + "epoch": 0.06895862082758344, + "grad_norm": 0.3710929799698191, + "learning_rate": 9.999483042617304e-06, + "loss": 0.3071, + "step": 862 + }, + { + "epoch": 0.06903861922761545, + "grad_norm": 0.3155009973514162, + "learning_rate": 9.999473686276598e-06, + "loss": 0.2707, + "step": 863 + }, + { + "epoch": 0.06911861762764744, + "grad_norm": 0.4628012801933148, + "learning_rate": 9.999464246028439e-06, + "loss": 0.2817, + "step": 864 + }, + { + "epoch": 0.06919861602767945, + "grad_norm": 0.32032826179603335, + "learning_rate": 9.999454721872983e-06, + "loss": 0.2963, + "step": 865 + }, + { + "epoch": 0.06927861442771145, + "grad_norm": 0.7661699979234026, + "learning_rate": 9.999445113810392e-06, + "loss": 0.3102, + "step": 866 + }, + { + "epoch": 0.06935861282774344, + "grad_norm": 0.321374265349741, + "learning_rate": 9.999435421840826e-06, + "loss": 0.3147, + "step": 867 + }, + { + "epoch": 0.06943861122777545, + "grad_norm": 0.31874388243033475, + "learning_rate": 9.999425645964447e-06, + "loss": 0.3316, + "step": 868 + }, + { + "epoch": 0.06951860962780744, + "grad_norm": 0.29860104988455355, + "learning_rate": 9.99941578618142e-06, + "loss": 0.319, + "step": 869 + }, + { + "epoch": 0.06959860802783945, + "grad_norm": 0.30054519230834287, + "learning_rate": 9.999405842491912e-06, + "loss": 0.3456, + "step": 870 + }, + { + "epoch": 0.06967860642787144, + "grad_norm": 0.41801898325084913, + "learning_rate": 9.999395814896086e-06, + "loss": 0.3022, + "step": 871 + }, + { + "epoch": 0.06975860482790344, + "grad_norm": 0.3626328267978701, + "learning_rate": 9.999385703394113e-06, + "loss": 0.277, + "step": 872 + }, + { + "epoch": 0.06983860322793545, + "grad_norm": 0.3015972166288483, + "learning_rate": 9.999375507986163e-06, + "loss": 0.3279, + "step": 873 + }, + { + "epoch": 0.06991860162796744, + "grad_norm": 0.28319933045952733, + "learning_rate": 9.999365228672404e-06, + "loss": 0.3538, + "step": 874 + }, + { + "epoch": 0.06999860002799944, + "grad_norm": 0.29970802715120304, + "learning_rate": 9.999354865453012e-06, + "loss": 0.31, + "step": 875 + }, + { + "epoch": 0.07007859842803144, + "grad_norm": 0.30350342791761536, + "learning_rate": 9.999344418328161e-06, + "loss": 0.3384, + "step": 876 + }, + { + "epoch": 0.07015859682806344, + "grad_norm": 0.31810227023067783, + "learning_rate": 9.999333887298025e-06, + "loss": 0.3004, + "step": 877 + }, + { + "epoch": 0.07023859522809543, + "grad_norm": 0.6866076307206432, + "learning_rate": 9.999323272362779e-06, + "loss": 0.2921, + "step": 878 + }, + { + "epoch": 0.07031859362812744, + "grad_norm": 0.29716429362144453, + "learning_rate": 9.999312573522606e-06, + "loss": 0.3121, + "step": 879 + }, + { + "epoch": 0.07039859202815944, + "grad_norm": 0.2925023029285221, + "learning_rate": 9.99930179077768e-06, + "loss": 0.2964, + "step": 880 + }, + { + "epoch": 0.07047859042819143, + "grad_norm": 0.29065379543897074, + "learning_rate": 9.999290924128186e-06, + "loss": 0.3079, + "step": 881 + }, + { + "epoch": 0.07055858882822344, + "grad_norm": 0.3273714302332476, + "learning_rate": 9.999279973574303e-06, + "loss": 0.3218, + "step": 882 + }, + { + "epoch": 0.07063858722825543, + "grad_norm": 0.28966850261552146, + "learning_rate": 9.999268939116218e-06, + "loss": 0.3225, + "step": 883 + }, + { + "epoch": 0.07071858562828744, + "grad_norm": 0.33603343368876815, + "learning_rate": 9.999257820754116e-06, + "loss": 0.2906, + "step": 884 + }, + { + "epoch": 0.07079858402831943, + "grad_norm": 0.22707881918763348, + "learning_rate": 9.999246618488181e-06, + "loss": 0.359, + "step": 885 + }, + { + "epoch": 0.07087858242835143, + "grad_norm": 0.23913340314025489, + "learning_rate": 9.999235332318603e-06, + "loss": 0.3883, + "step": 886 + }, + { + "epoch": 0.07095858082838344, + "grad_norm": 0.39838311567870205, + "learning_rate": 9.99922396224557e-06, + "loss": 0.3028, + "step": 887 + }, + { + "epoch": 0.07103857922841543, + "grad_norm": 0.3186128109962625, + "learning_rate": 9.999212508269274e-06, + "loss": 0.3126, + "step": 888 + }, + { + "epoch": 0.07111857762844744, + "grad_norm": 0.2825226338939764, + "learning_rate": 9.999200970389909e-06, + "loss": 0.315, + "step": 889 + }, + { + "epoch": 0.07119857602847943, + "grad_norm": 0.31523118531663585, + "learning_rate": 9.999189348607664e-06, + "loss": 0.2787, + "step": 890 + }, + { + "epoch": 0.07127857442851143, + "grad_norm": 0.2847158304797025, + "learning_rate": 9.999177642922736e-06, + "loss": 0.3411, + "step": 891 + }, + { + "epoch": 0.07135857282854342, + "grad_norm": 0.33357590620540634, + "learning_rate": 9.999165853335325e-06, + "loss": 0.2822, + "step": 892 + }, + { + "epoch": 0.07143857122857543, + "grad_norm": 0.34322851374654223, + "learning_rate": 9.999153979845625e-06, + "loss": 0.3483, + "step": 893 + }, + { + "epoch": 0.07151856962860743, + "grad_norm": 0.30048330516808835, + "learning_rate": 9.999142022453836e-06, + "loss": 0.3176, + "step": 894 + }, + { + "epoch": 0.07159856802863943, + "grad_norm": 0.35094029625908313, + "learning_rate": 9.999129981160159e-06, + "loss": 0.3135, + "step": 895 + }, + { + "epoch": 0.07167856642867143, + "grad_norm": 0.3405289734734757, + "learning_rate": 9.999117855964797e-06, + "loss": 0.3066, + "step": 896 + }, + { + "epoch": 0.07175856482870342, + "grad_norm": 0.531654700275743, + "learning_rate": 9.99910564686795e-06, + "loss": 0.33, + "step": 897 + }, + { + "epoch": 0.07183856322873543, + "grad_norm": 0.36689265164701906, + "learning_rate": 9.999093353869828e-06, + "loss": 0.3317, + "step": 898 + }, + { + "epoch": 0.07191856162876742, + "grad_norm": 0.2884737955362762, + "learning_rate": 9.999080976970635e-06, + "loss": 0.3278, + "step": 899 + }, + { + "epoch": 0.07199856002879942, + "grad_norm": 0.28923390447366376, + "learning_rate": 9.999068516170577e-06, + "loss": 0.3086, + "step": 900 + }, + { + "epoch": 0.07207855842883143, + "grad_norm": 0.3538448417863541, + "learning_rate": 9.999055971469864e-06, + "loss": 0.3066, + "step": 901 + }, + { + "epoch": 0.07215855682886342, + "grad_norm": 0.2694161396908422, + "learning_rate": 9.999043342868708e-06, + "loss": 0.3449, + "step": 902 + }, + { + "epoch": 0.07223855522889543, + "grad_norm": 0.26685647147984126, + "learning_rate": 9.99903063036732e-06, + "loss": 0.3352, + "step": 903 + }, + { + "epoch": 0.07231855362892742, + "grad_norm": 0.3259867131932054, + "learning_rate": 9.999017833965914e-06, + "loss": 0.3023, + "step": 904 + }, + { + "epoch": 0.07239855202895942, + "grad_norm": 0.28033383984958504, + "learning_rate": 9.999004953664703e-06, + "loss": 0.3279, + "step": 905 + }, + { + "epoch": 0.07247855042899141, + "grad_norm": 0.2516773645713914, + "learning_rate": 9.998991989463906e-06, + "loss": 0.3197, + "step": 906 + }, + { + "epoch": 0.07255854882902342, + "grad_norm": 0.36273981028380786, + "learning_rate": 9.998978941363739e-06, + "loss": 0.2977, + "step": 907 + }, + { + "epoch": 0.07263854722905543, + "grad_norm": 0.30825315485564986, + "learning_rate": 9.998965809364421e-06, + "loss": 0.3185, + "step": 908 + }, + { + "epoch": 0.07271854562908742, + "grad_norm": 0.23737112162017282, + "learning_rate": 9.998952593466171e-06, + "loss": 0.3293, + "step": 909 + }, + { + "epoch": 0.07279854402911942, + "grad_norm": 0.2833876914512136, + "learning_rate": 9.998939293669213e-06, + "loss": 0.3038, + "step": 910 + }, + { + "epoch": 0.07287854242915141, + "grad_norm": 0.42855843550103295, + "learning_rate": 9.998925909973769e-06, + "loss": 0.3134, + "step": 911 + }, + { + "epoch": 0.07295854082918342, + "grad_norm": 0.28711370902613625, + "learning_rate": 9.998912442380065e-06, + "loss": 0.3365, + "step": 912 + }, + { + "epoch": 0.07303853922921541, + "grad_norm": 0.317298845026507, + "learning_rate": 9.998898890888325e-06, + "loss": 0.3232, + "step": 913 + }, + { + "epoch": 0.07311853762924742, + "grad_norm": 0.3042676846050397, + "learning_rate": 9.998885255498778e-06, + "loss": 0.3413, + "step": 914 + }, + { + "epoch": 0.07319853602927942, + "grad_norm": 0.2641136670631214, + "learning_rate": 9.998871536211652e-06, + "loss": 0.3665, + "step": 915 + }, + { + "epoch": 0.07327853442931141, + "grad_norm": 0.3202256601343914, + "learning_rate": 9.998857733027179e-06, + "loss": 0.3122, + "step": 916 + }, + { + "epoch": 0.07335853282934342, + "grad_norm": 0.36247827930015253, + "learning_rate": 9.998843845945587e-06, + "loss": 0.257, + "step": 917 + }, + { + "epoch": 0.07343853122937541, + "grad_norm": 0.28295246226779514, + "learning_rate": 9.998829874967114e-06, + "loss": 0.3201, + "step": 918 + }, + { + "epoch": 0.07351852962940741, + "grad_norm": 0.30030632259180257, + "learning_rate": 9.99881582009199e-06, + "loss": 0.3289, + "step": 919 + }, + { + "epoch": 0.0735985280294394, + "grad_norm": 0.3642857165626749, + "learning_rate": 9.998801681320452e-06, + "loss": 0.3055, + "step": 920 + }, + { + "epoch": 0.07367852642947141, + "grad_norm": 0.35244807062529443, + "learning_rate": 9.99878745865274e-06, + "loss": 0.2926, + "step": 921 + }, + { + "epoch": 0.07375852482950342, + "grad_norm": 0.26925829576299365, + "learning_rate": 9.99877315208909e-06, + "loss": 0.3355, + "step": 922 + }, + { + "epoch": 0.07383852322953541, + "grad_norm": 0.31693613062825665, + "learning_rate": 9.99875876162974e-06, + "loss": 0.2737, + "step": 923 + }, + { + "epoch": 0.07391852162956741, + "grad_norm": 0.3339030755538431, + "learning_rate": 9.998744287274937e-06, + "loss": 0.2828, + "step": 924 + }, + { + "epoch": 0.0739985200295994, + "grad_norm": 0.3091429003619955, + "learning_rate": 9.998729729024922e-06, + "loss": 0.3198, + "step": 925 + }, + { + "epoch": 0.07407851842963141, + "grad_norm": 0.3029008515891064, + "learning_rate": 9.998715086879938e-06, + "loss": 0.3425, + "step": 926 + }, + { + "epoch": 0.0741585168296634, + "grad_norm": 0.2743957663640204, + "learning_rate": 9.998700360840231e-06, + "loss": 0.3498, + "step": 927 + }, + { + "epoch": 0.0742385152296954, + "grad_norm": 0.3194125909333428, + "learning_rate": 9.998685550906048e-06, + "loss": 0.2926, + "step": 928 + }, + { + "epoch": 0.07431851362972741, + "grad_norm": 0.3153345989993005, + "learning_rate": 9.998670657077638e-06, + "loss": 0.3284, + "step": 929 + }, + { + "epoch": 0.0743985120297594, + "grad_norm": 0.2962340588450138, + "learning_rate": 9.998655679355252e-06, + "loss": 0.3281, + "step": 930 + }, + { + "epoch": 0.07447851042979141, + "grad_norm": 0.34204540319260884, + "learning_rate": 9.99864061773914e-06, + "loss": 0.3392, + "step": 931 + }, + { + "epoch": 0.0745585088298234, + "grad_norm": 0.3460179890160004, + "learning_rate": 9.998625472229555e-06, + "loss": 0.3106, + "step": 932 + }, + { + "epoch": 0.0746385072298554, + "grad_norm": 0.35612331114946627, + "learning_rate": 9.998610242826752e-06, + "loss": 0.2849, + "step": 933 + }, + { + "epoch": 0.0747185056298874, + "grad_norm": 0.2805762961982535, + "learning_rate": 9.998594929530985e-06, + "loss": 0.3151, + "step": 934 + }, + { + "epoch": 0.0747985040299194, + "grad_norm": 0.3305900574893122, + "learning_rate": 9.998579532342511e-06, + "loss": 0.2812, + "step": 935 + }, + { + "epoch": 0.07487850242995141, + "grad_norm": 0.31614811632016304, + "learning_rate": 9.998564051261593e-06, + "loss": 0.3121, + "step": 936 + }, + { + "epoch": 0.0749585008299834, + "grad_norm": 0.32616497993418414, + "learning_rate": 9.998548486288483e-06, + "loss": 0.3382, + "step": 937 + }, + { + "epoch": 0.0750384992300154, + "grad_norm": 0.32374662058488274, + "learning_rate": 9.998532837423448e-06, + "loss": 0.2949, + "step": 938 + }, + { + "epoch": 0.0751184976300474, + "grad_norm": 0.2639322522516908, + "learning_rate": 9.998517104666749e-06, + "loss": 0.3375, + "step": 939 + }, + { + "epoch": 0.0751984960300794, + "grad_norm": 0.3429512898494253, + "learning_rate": 9.998501288018651e-06, + "loss": 0.3107, + "step": 940 + }, + { + "epoch": 0.07527849443011139, + "grad_norm": 0.2961043802990138, + "learning_rate": 9.998485387479418e-06, + "loss": 0.3151, + "step": 941 + }, + { + "epoch": 0.0753584928301434, + "grad_norm": 0.3207619441986614, + "learning_rate": 9.998469403049318e-06, + "loss": 0.3191, + "step": 942 + }, + { + "epoch": 0.0754384912301754, + "grad_norm": 0.25259197092792135, + "learning_rate": 9.998453334728619e-06, + "loss": 0.3492, + "step": 943 + }, + { + "epoch": 0.0755184896302074, + "grad_norm": 0.26607035336731205, + "learning_rate": 9.998437182517589e-06, + "loss": 0.3109, + "step": 944 + }, + { + "epoch": 0.0755984880302394, + "grad_norm": 0.3095723706810741, + "learning_rate": 9.9984209464165e-06, + "loss": 0.3197, + "step": 945 + }, + { + "epoch": 0.07567848643027139, + "grad_norm": 0.2906994902380521, + "learning_rate": 9.998404626425627e-06, + "loss": 0.3208, + "step": 946 + }, + { + "epoch": 0.0757584848303034, + "grad_norm": 0.37095484390605793, + "learning_rate": 9.998388222545242e-06, + "loss": 0.2908, + "step": 947 + }, + { + "epoch": 0.07583848323033539, + "grad_norm": 0.33311119159043867, + "learning_rate": 9.998371734775618e-06, + "loss": 0.2865, + "step": 948 + }, + { + "epoch": 0.0759184816303674, + "grad_norm": 0.31047094509315915, + "learning_rate": 9.998355163117035e-06, + "loss": 0.3425, + "step": 949 + }, + { + "epoch": 0.0759984800303994, + "grad_norm": 0.34316326352164156, + "learning_rate": 9.99833850756977e-06, + "loss": 0.3123, + "step": 950 + }, + { + "epoch": 0.07607847843043139, + "grad_norm": 0.26416866802349737, + "learning_rate": 9.998321768134101e-06, + "loss": 0.3353, + "step": 951 + }, + { + "epoch": 0.0761584768304634, + "grad_norm": 0.31700262119304584, + "learning_rate": 9.998304944810314e-06, + "loss": 0.3215, + "step": 952 + }, + { + "epoch": 0.07623847523049539, + "grad_norm": 0.3323728655426343, + "learning_rate": 9.998288037598684e-06, + "loss": 0.291, + "step": 953 + }, + { + "epoch": 0.07631847363052739, + "grad_norm": 0.300428098076248, + "learning_rate": 9.998271046499501e-06, + "loss": 0.312, + "step": 954 + }, + { + "epoch": 0.07639847203055938, + "grad_norm": 0.40117205074825596, + "learning_rate": 9.998253971513048e-06, + "loss": 0.2845, + "step": 955 + }, + { + "epoch": 0.07647847043059139, + "grad_norm": 0.2934713167443252, + "learning_rate": 9.99823681263961e-06, + "loss": 0.336, + "step": 956 + }, + { + "epoch": 0.0765584688306234, + "grad_norm": 0.3046555655299077, + "learning_rate": 9.998219569879476e-06, + "loss": 0.2986, + "step": 957 + }, + { + "epoch": 0.07663846723065539, + "grad_norm": 0.34447567097937876, + "learning_rate": 9.998202243232937e-06, + "loss": 0.3142, + "step": 958 + }, + { + "epoch": 0.07671846563068739, + "grad_norm": 0.32552140178348066, + "learning_rate": 9.998184832700282e-06, + "loss": 0.2872, + "step": 959 + }, + { + "epoch": 0.07679846403071938, + "grad_norm": 0.414640257479251, + "learning_rate": 9.998167338281803e-06, + "loss": 0.3002, + "step": 960 + }, + { + "epoch": 0.07687846243075139, + "grad_norm": 0.3005059383881909, + "learning_rate": 9.998149759977795e-06, + "loss": 0.3236, + "step": 961 + }, + { + "epoch": 0.07695846083078338, + "grad_norm": 0.28314142435755596, + "learning_rate": 9.998132097788554e-06, + "loss": 0.3271, + "step": 962 + }, + { + "epoch": 0.07703845923081538, + "grad_norm": 0.330800544738117, + "learning_rate": 9.998114351714373e-06, + "loss": 0.3339, + "step": 963 + }, + { + "epoch": 0.07711845763084739, + "grad_norm": 0.35807438473415937, + "learning_rate": 9.998096521755552e-06, + "loss": 0.3101, + "step": 964 + }, + { + "epoch": 0.07719845603087938, + "grad_norm": 0.26743430200408586, + "learning_rate": 9.99807860791239e-06, + "loss": 0.334, + "step": 965 + }, + { + "epoch": 0.07727845443091139, + "grad_norm": 0.3060667747402942, + "learning_rate": 9.998060610185187e-06, + "loss": 0.331, + "step": 966 + }, + { + "epoch": 0.07735845283094338, + "grad_norm": 0.2954348052423245, + "learning_rate": 9.998042528574246e-06, + "loss": 0.3258, + "step": 967 + }, + { + "epoch": 0.07743845123097538, + "grad_norm": 0.31044693004102286, + "learning_rate": 9.99802436307987e-06, + "loss": 0.3329, + "step": 968 + }, + { + "epoch": 0.07751844963100737, + "grad_norm": 0.2884319489948687, + "learning_rate": 9.998006113702363e-06, + "loss": 0.308, + "step": 969 + }, + { + "epoch": 0.07759844803103938, + "grad_norm": 0.3128652725221715, + "learning_rate": 9.997987780442033e-06, + "loss": 0.3143, + "step": 970 + }, + { + "epoch": 0.07767844643107139, + "grad_norm": 0.27977003156194064, + "learning_rate": 9.997969363299187e-06, + "loss": 0.303, + "step": 971 + }, + { + "epoch": 0.07775844483110338, + "grad_norm": 0.30627445888831595, + "learning_rate": 9.997950862274134e-06, + "loss": 0.3093, + "step": 972 + }, + { + "epoch": 0.07783844323113538, + "grad_norm": 0.25513555037593205, + "learning_rate": 9.997932277367183e-06, + "loss": 0.343, + "step": 973 + }, + { + "epoch": 0.07791844163116737, + "grad_norm": 0.27506729459930485, + "learning_rate": 9.997913608578651e-06, + "loss": 0.3421, + "step": 974 + }, + { + "epoch": 0.07799844003119938, + "grad_norm": 0.30285205423854594, + "learning_rate": 9.997894855908844e-06, + "loss": 0.3244, + "step": 975 + }, + { + "epoch": 0.07807843843123137, + "grad_norm": 0.3408747757138861, + "learning_rate": 9.997876019358083e-06, + "loss": 0.2707, + "step": 976 + }, + { + "epoch": 0.07815843683126338, + "grad_norm": 0.23530762933875393, + "learning_rate": 9.997857098926679e-06, + "loss": 0.3429, + "step": 977 + }, + { + "epoch": 0.07823843523129538, + "grad_norm": 0.38361673504863586, + "learning_rate": 9.997838094614956e-06, + "loss": 0.3373, + "step": 978 + }, + { + "epoch": 0.07831843363132737, + "grad_norm": 0.2881876184207208, + "learning_rate": 9.997819006423227e-06, + "loss": 0.3015, + "step": 979 + }, + { + "epoch": 0.07839843203135938, + "grad_norm": 0.28910724951259187, + "learning_rate": 9.997799834351814e-06, + "loss": 0.3093, + "step": 980 + }, + { + "epoch": 0.07847843043139137, + "grad_norm": 0.34745793904767547, + "learning_rate": 9.99778057840104e-06, + "loss": 0.3017, + "step": 981 + }, + { + "epoch": 0.07855842883142337, + "grad_norm": 0.3217752144288568, + "learning_rate": 9.997761238571227e-06, + "loss": 0.2913, + "step": 982 + }, + { + "epoch": 0.07863842723145537, + "grad_norm": 0.3298380423069833, + "learning_rate": 9.9977418148627e-06, + "loss": 0.2823, + "step": 983 + }, + { + "epoch": 0.07871842563148737, + "grad_norm": 0.33207326996160563, + "learning_rate": 9.997722307275785e-06, + "loss": 0.2762, + "step": 984 + }, + { + "epoch": 0.07879842403151938, + "grad_norm": 0.3369495613230937, + "learning_rate": 9.99770271581081e-06, + "loss": 0.3013, + "step": 985 + }, + { + "epoch": 0.07887842243155137, + "grad_norm": 0.3078921477860902, + "learning_rate": 9.997683040468103e-06, + "loss": 0.3285, + "step": 986 + }, + { + "epoch": 0.07895842083158337, + "grad_norm": 0.2906765881690975, + "learning_rate": 9.997663281247993e-06, + "loss": 0.3404, + "step": 987 + }, + { + "epoch": 0.07903841923161536, + "grad_norm": 0.3200641043001228, + "learning_rate": 9.997643438150814e-06, + "loss": 0.3293, + "step": 988 + }, + { + "epoch": 0.07911841763164737, + "grad_norm": 0.27999887105401894, + "learning_rate": 9.9976235111769e-06, + "loss": 0.3069, + "step": 989 + }, + { + "epoch": 0.07919841603167936, + "grad_norm": 0.35132035991432753, + "learning_rate": 9.99760350032658e-06, + "loss": 0.3143, + "step": 990 + }, + { + "epoch": 0.07927841443171137, + "grad_norm": 0.36487130571936177, + "learning_rate": 9.997583405600194e-06, + "loss": 0.3009, + "step": 991 + }, + { + "epoch": 0.07935841283174337, + "grad_norm": 0.3386057666259531, + "learning_rate": 9.997563226998082e-06, + "loss": 0.3071, + "step": 992 + }, + { + "epoch": 0.07943841123177536, + "grad_norm": 0.29096601909665687, + "learning_rate": 9.997542964520576e-06, + "loss": 0.338, + "step": 993 + }, + { + "epoch": 0.07951840963180737, + "grad_norm": 0.2799918723267478, + "learning_rate": 9.99752261816802e-06, + "loss": 0.3246, + "step": 994 + }, + { + "epoch": 0.07959840803183936, + "grad_norm": 0.3305568355038778, + "learning_rate": 9.997502187940757e-06, + "loss": 0.3111, + "step": 995 + }, + { + "epoch": 0.07967840643187137, + "grad_norm": 0.29763518156057117, + "learning_rate": 9.997481673839125e-06, + "loss": 0.3686, + "step": 996 + }, + { + "epoch": 0.07975840483190336, + "grad_norm": 0.30780614834520464, + "learning_rate": 9.997461075863473e-06, + "loss": 0.3406, + "step": 997 + }, + { + "epoch": 0.07983840323193536, + "grad_norm": 0.339429540242941, + "learning_rate": 9.997440394014143e-06, + "loss": 0.3098, + "step": 998 + }, + { + "epoch": 0.07991840163196737, + "grad_norm": 0.31230674056657237, + "learning_rate": 9.997419628291485e-06, + "loss": 0.319, + "step": 999 + }, + { + "epoch": 0.07999840003199936, + "grad_norm": 0.32994648197852433, + "learning_rate": 9.997398778695847e-06, + "loss": 0.2853, + "step": 1000 + }, + { + "epoch": 0.08007839843203136, + "grad_norm": 0.33815039042345024, + "learning_rate": 9.997377845227577e-06, + "loss": 0.2982, + "step": 1001 + }, + { + "epoch": 0.08015839683206336, + "grad_norm": 0.29187753039875036, + "learning_rate": 9.997356827887026e-06, + "loss": 0.3394, + "step": 1002 + }, + { + "epoch": 0.08023839523209536, + "grad_norm": 0.279755247934047, + "learning_rate": 9.99733572667455e-06, + "loss": 0.3221, + "step": 1003 + }, + { + "epoch": 0.08031839363212735, + "grad_norm": 0.25185514220515154, + "learning_rate": 9.997314541590502e-06, + "loss": 0.3441, + "step": 1004 + }, + { + "epoch": 0.08039839203215936, + "grad_norm": 0.4059835828636773, + "learning_rate": 9.997293272635236e-06, + "loss": 0.339, + "step": 1005 + }, + { + "epoch": 0.08047839043219136, + "grad_norm": 0.27775640057768064, + "learning_rate": 9.99727191980911e-06, + "loss": 0.3442, + "step": 1006 + }, + { + "epoch": 0.08055838883222335, + "grad_norm": 0.32869754471714496, + "learning_rate": 9.997250483112483e-06, + "loss": 0.3082, + "step": 1007 + }, + { + "epoch": 0.08063838723225536, + "grad_norm": 0.32743063042423676, + "learning_rate": 9.997228962545715e-06, + "loss": 0.2681, + "step": 1008 + }, + { + "epoch": 0.08071838563228735, + "grad_norm": 0.2551864563966594, + "learning_rate": 9.997207358109166e-06, + "loss": 0.3501, + "step": 1009 + }, + { + "epoch": 0.08079838403231936, + "grad_norm": 0.3719108021606949, + "learning_rate": 9.997185669803197e-06, + "loss": 0.301, + "step": 1010 + }, + { + "epoch": 0.08087838243235135, + "grad_norm": 0.3240776724396333, + "learning_rate": 9.997163897628175e-06, + "loss": 0.322, + "step": 1011 + }, + { + "epoch": 0.08095838083238335, + "grad_norm": 0.30233982071991194, + "learning_rate": 9.997142041584467e-06, + "loss": 0.3085, + "step": 1012 + }, + { + "epoch": 0.08103837923241536, + "grad_norm": 0.3442648379441969, + "learning_rate": 9.997120101672434e-06, + "loss": 0.2874, + "step": 1013 + }, + { + "epoch": 0.08111837763244735, + "grad_norm": 0.36091755267418346, + "learning_rate": 9.99709807789245e-06, + "loss": 0.311, + "step": 1014 + }, + { + "epoch": 0.08119837603247936, + "grad_norm": 0.34588376053132763, + "learning_rate": 9.997075970244878e-06, + "loss": 0.2886, + "step": 1015 + }, + { + "epoch": 0.08127837443251135, + "grad_norm": 0.33104857713866476, + "learning_rate": 9.997053778730095e-06, + "loss": 0.2953, + "step": 1016 + }, + { + "epoch": 0.08135837283254335, + "grad_norm": 0.3435035243083412, + "learning_rate": 9.997031503348473e-06, + "loss": 0.3189, + "step": 1017 + }, + { + "epoch": 0.08143837123257534, + "grad_norm": 0.31372190170406855, + "learning_rate": 9.997009144100383e-06, + "loss": 0.3067, + "step": 1018 + }, + { + "epoch": 0.08151836963260735, + "grad_norm": 0.32410564866357006, + "learning_rate": 9.996986700986201e-06, + "loss": 0.3234, + "step": 1019 + }, + { + "epoch": 0.08159836803263935, + "grad_norm": 0.28854055757672614, + "learning_rate": 9.996964174006304e-06, + "loss": 0.3176, + "step": 1020 + }, + { + "epoch": 0.08167836643267135, + "grad_norm": 0.293423545515452, + "learning_rate": 9.996941563161071e-06, + "loss": 0.314, + "step": 1021 + }, + { + "epoch": 0.08175836483270335, + "grad_norm": 0.3463719825025683, + "learning_rate": 9.996918868450882e-06, + "loss": 0.2933, + "step": 1022 + }, + { + "epoch": 0.08183836323273534, + "grad_norm": 0.4392795367408748, + "learning_rate": 9.996896089876116e-06, + "loss": 0.3108, + "step": 1023 + }, + { + "epoch": 0.08191836163276735, + "grad_norm": 0.28900803444265566, + "learning_rate": 9.996873227437156e-06, + "loss": 0.3258, + "step": 1024 + }, + { + "epoch": 0.08199836003279934, + "grad_norm": 0.30641802228300485, + "learning_rate": 9.996850281134385e-06, + "loss": 0.2722, + "step": 1025 + }, + { + "epoch": 0.08207835843283134, + "grad_norm": 0.3722525131418996, + "learning_rate": 9.99682725096819e-06, + "loss": 0.3452, + "step": 1026 + }, + { + "epoch": 0.08215835683286335, + "grad_norm": 0.30291628271957177, + "learning_rate": 9.996804136938956e-06, + "loss": 0.315, + "step": 1027 + }, + { + "epoch": 0.08223835523289534, + "grad_norm": 0.33402926294676216, + "learning_rate": 9.99678093904707e-06, + "loss": 0.2876, + "step": 1028 + }, + { + "epoch": 0.08231835363292735, + "grad_norm": 0.4079412386440936, + "learning_rate": 9.996757657292923e-06, + "loss": 0.3215, + "step": 1029 + }, + { + "epoch": 0.08239835203295934, + "grad_norm": 0.32744933019605826, + "learning_rate": 9.996734291676907e-06, + "loss": 0.3225, + "step": 1030 + }, + { + "epoch": 0.08247835043299134, + "grad_norm": 0.3230424291470875, + "learning_rate": 9.996710842199412e-06, + "loss": 0.3256, + "step": 1031 + }, + { + "epoch": 0.08255834883302333, + "grad_norm": 0.25603658860631406, + "learning_rate": 9.996687308860832e-06, + "loss": 0.3371, + "step": 1032 + }, + { + "epoch": 0.08263834723305534, + "grad_norm": 0.3872535367644594, + "learning_rate": 9.99666369166156e-06, + "loss": 0.2943, + "step": 1033 + }, + { + "epoch": 0.08271834563308735, + "grad_norm": 0.26362564867370236, + "learning_rate": 9.996639990601998e-06, + "loss": 0.331, + "step": 1034 + }, + { + "epoch": 0.08279834403311934, + "grad_norm": 0.3032944292791754, + "learning_rate": 9.996616205682538e-06, + "loss": 0.3254, + "step": 1035 + }, + { + "epoch": 0.08287834243315134, + "grad_norm": 0.2558898622870006, + "learning_rate": 9.99659233690358e-06, + "loss": 0.3235, + "step": 1036 + }, + { + "epoch": 0.08295834083318333, + "grad_norm": 0.27861790488206184, + "learning_rate": 9.996568384265529e-06, + "loss": 0.3053, + "step": 1037 + }, + { + "epoch": 0.08303833923321534, + "grad_norm": 0.26966444516082927, + "learning_rate": 9.996544347768782e-06, + "loss": 0.3645, + "step": 1038 + }, + { + "epoch": 0.08311833763324733, + "grad_norm": 0.2679090232752216, + "learning_rate": 9.996520227413747e-06, + "loss": 0.3417, + "step": 1039 + }, + { + "epoch": 0.08319833603327934, + "grad_norm": 0.33835459429485487, + "learning_rate": 9.996496023200823e-06, + "loss": 0.3066, + "step": 1040 + }, + { + "epoch": 0.08327833443331134, + "grad_norm": 0.3118287245188823, + "learning_rate": 9.996471735130422e-06, + "loss": 0.2878, + "step": 1041 + }, + { + "epoch": 0.08335833283334333, + "grad_norm": 0.32156335759034027, + "learning_rate": 9.996447363202947e-06, + "loss": 0.2684, + "step": 1042 + }, + { + "epoch": 0.08343833123337534, + "grad_norm": 0.5325381218929562, + "learning_rate": 9.99642290741881e-06, + "loss": 0.2872, + "step": 1043 + }, + { + "epoch": 0.08351832963340733, + "grad_norm": 0.335879638456767, + "learning_rate": 9.99639836777842e-06, + "loss": 0.3004, + "step": 1044 + }, + { + "epoch": 0.08359832803343933, + "grad_norm": 0.2736386369854071, + "learning_rate": 9.99637374428219e-06, + "loss": 0.3418, + "step": 1045 + }, + { + "epoch": 0.08367832643347133, + "grad_norm": 0.2609782590444062, + "learning_rate": 9.996349036930533e-06, + "loss": 0.3431, + "step": 1046 + }, + { + "epoch": 0.08375832483350333, + "grad_norm": 0.38467484126179985, + "learning_rate": 9.996324245723863e-06, + "loss": 0.2792, + "step": 1047 + }, + { + "epoch": 0.08383832323353534, + "grad_norm": 0.39280502041402643, + "learning_rate": 9.996299370662597e-06, + "loss": 0.287, + "step": 1048 + }, + { + "epoch": 0.08391832163356733, + "grad_norm": 0.3569748640965383, + "learning_rate": 9.99627441174715e-06, + "loss": 0.2861, + "step": 1049 + }, + { + "epoch": 0.08399832003359933, + "grad_norm": 0.31244605421410254, + "learning_rate": 9.996249368977945e-06, + "loss": 0.3103, + "step": 1050 + }, + { + "epoch": 0.08407831843363132, + "grad_norm": 0.35281129255342325, + "learning_rate": 9.9962242423554e-06, + "loss": 0.3156, + "step": 1051 + }, + { + "epoch": 0.08415831683366333, + "grad_norm": 0.31764325727093257, + "learning_rate": 9.996199031879935e-06, + "loss": 0.3364, + "step": 1052 + }, + { + "epoch": 0.08423831523369532, + "grad_norm": 0.4216973364516531, + "learning_rate": 9.996173737551976e-06, + "loss": 0.303, + "step": 1053 + }, + { + "epoch": 0.08431831363372733, + "grad_norm": 0.27700852381156565, + "learning_rate": 9.996148359371946e-06, + "loss": 0.3418, + "step": 1054 + }, + { + "epoch": 0.08439831203375932, + "grad_norm": 0.38200298126868704, + "learning_rate": 9.996122897340273e-06, + "loss": 0.2956, + "step": 1055 + }, + { + "epoch": 0.08447831043379132, + "grad_norm": 0.28078025921946115, + "learning_rate": 9.996097351457381e-06, + "loss": 0.3735, + "step": 1056 + }, + { + "epoch": 0.08455830883382333, + "grad_norm": 0.32466865172090137, + "learning_rate": 9.9960717217237e-06, + "loss": 0.3188, + "step": 1057 + }, + { + "epoch": 0.08463830723385532, + "grad_norm": 0.23519534372554735, + "learning_rate": 9.996046008139663e-06, + "loss": 0.3741, + "step": 1058 + }, + { + "epoch": 0.08471830563388733, + "grad_norm": 0.30483733855312045, + "learning_rate": 9.996020210705697e-06, + "loss": 0.3372, + "step": 1059 + }, + { + "epoch": 0.08479830403391932, + "grad_norm": 0.3124553947291308, + "learning_rate": 9.995994329422239e-06, + "loss": 0.3349, + "step": 1060 + }, + { + "epoch": 0.08487830243395132, + "grad_norm": 0.30392874542338777, + "learning_rate": 9.995968364289719e-06, + "loss": 0.3241, + "step": 1061 + }, + { + "epoch": 0.08495830083398331, + "grad_norm": 0.32245049888837485, + "learning_rate": 9.995942315308577e-06, + "loss": 0.3178, + "step": 1062 + }, + { + "epoch": 0.08503829923401532, + "grad_norm": 0.26471003103982005, + "learning_rate": 9.995916182479248e-06, + "loss": 0.3593, + "step": 1063 + }, + { + "epoch": 0.08511829763404732, + "grad_norm": 0.30199577439109876, + "learning_rate": 9.995889965802171e-06, + "loss": 0.3182, + "step": 1064 + }, + { + "epoch": 0.08519829603407932, + "grad_norm": 0.31274590719312867, + "learning_rate": 9.995863665277787e-06, + "loss": 0.2941, + "step": 1065 + }, + { + "epoch": 0.08527829443411132, + "grad_norm": 0.342629350138759, + "learning_rate": 9.995837280906535e-06, + "loss": 0.2967, + "step": 1066 + }, + { + "epoch": 0.08535829283414331, + "grad_norm": 0.34296909050739893, + "learning_rate": 9.99581081268886e-06, + "loss": 0.3181, + "step": 1067 + }, + { + "epoch": 0.08543829123417532, + "grad_norm": 0.34849293164447015, + "learning_rate": 9.995784260625205e-06, + "loss": 0.2875, + "step": 1068 + }, + { + "epoch": 0.08551828963420731, + "grad_norm": 0.29009933804078714, + "learning_rate": 9.995757624716019e-06, + "loss": 0.3161, + "step": 1069 + }, + { + "epoch": 0.08559828803423931, + "grad_norm": 0.19585532204086822, + "learning_rate": 9.995730904961743e-06, + "loss": 0.3801, + "step": 1070 + }, + { + "epoch": 0.08567828643427132, + "grad_norm": 0.2633059695105179, + "learning_rate": 9.99570410136283e-06, + "loss": 0.3482, + "step": 1071 + }, + { + "epoch": 0.08575828483430331, + "grad_norm": 0.3052516543601372, + "learning_rate": 9.995677213919726e-06, + "loss": 0.314, + "step": 1072 + }, + { + "epoch": 0.08583828323433532, + "grad_norm": 0.29045963629524285, + "learning_rate": 9.995650242632887e-06, + "loss": 0.3165, + "step": 1073 + }, + { + "epoch": 0.08591828163436731, + "grad_norm": 0.3328544237953998, + "learning_rate": 9.995623187502763e-06, + "loss": 0.3368, + "step": 1074 + }, + { + "epoch": 0.08599828003439931, + "grad_norm": 0.3053682523089739, + "learning_rate": 9.99559604852981e-06, + "loss": 0.2751, + "step": 1075 + }, + { + "epoch": 0.0860782784344313, + "grad_norm": 0.2714134172409879, + "learning_rate": 9.995568825714479e-06, + "loss": 0.2974, + "step": 1076 + }, + { + "epoch": 0.08615827683446331, + "grad_norm": 0.3438440508974596, + "learning_rate": 9.995541519057231e-06, + "loss": 0.2836, + "step": 1077 + }, + { + "epoch": 0.08623827523449532, + "grad_norm": 0.310393650792298, + "learning_rate": 9.995514128558523e-06, + "loss": 0.3027, + "step": 1078 + }, + { + "epoch": 0.0863182736345273, + "grad_norm": 0.33711186993885395, + "learning_rate": 9.995486654218815e-06, + "loss": 0.3142, + "step": 1079 + }, + { + "epoch": 0.08639827203455931, + "grad_norm": 0.32224263976181167, + "learning_rate": 9.995459096038568e-06, + "loss": 0.3031, + "step": 1080 + }, + { + "epoch": 0.0864782704345913, + "grad_norm": 0.3418395734553405, + "learning_rate": 9.995431454018246e-06, + "loss": 0.2704, + "step": 1081 + }, + { + "epoch": 0.08655826883462331, + "grad_norm": 0.297789685680779, + "learning_rate": 9.995403728158311e-06, + "loss": 0.2975, + "step": 1082 + }, + { + "epoch": 0.0866382672346553, + "grad_norm": 0.29348981360478565, + "learning_rate": 9.995375918459227e-06, + "loss": 0.3194, + "step": 1083 + }, + { + "epoch": 0.0867182656346873, + "grad_norm": 0.4902866863571245, + "learning_rate": 9.995348024921463e-06, + "loss": 0.2855, + "step": 1084 + }, + { + "epoch": 0.08679826403471931, + "grad_norm": 0.33330277755786136, + "learning_rate": 9.995320047545488e-06, + "loss": 0.3258, + "step": 1085 + }, + { + "epoch": 0.0868782624347513, + "grad_norm": 0.2740249763002998, + "learning_rate": 9.995291986331767e-06, + "loss": 0.3325, + "step": 1086 + }, + { + "epoch": 0.08695826083478331, + "grad_norm": 0.3408933373620087, + "learning_rate": 9.995263841280776e-06, + "loss": 0.2993, + "step": 1087 + }, + { + "epoch": 0.0870382592348153, + "grad_norm": 0.3031063900963532, + "learning_rate": 9.995235612392986e-06, + "loss": 0.3222, + "step": 1088 + }, + { + "epoch": 0.0871182576348473, + "grad_norm": 0.2502837778770749, + "learning_rate": 9.99520729966887e-06, + "loss": 0.3364, + "step": 1089 + }, + { + "epoch": 0.0871982560348793, + "grad_norm": 0.35748485496614607, + "learning_rate": 9.995178903108904e-06, + "loss": 0.2942, + "step": 1090 + }, + { + "epoch": 0.0872782544349113, + "grad_norm": 0.3555823361678222, + "learning_rate": 9.995150422713561e-06, + "loss": 0.3036, + "step": 1091 + }, + { + "epoch": 0.0873582528349433, + "grad_norm": 0.3516767614264302, + "learning_rate": 9.995121858483326e-06, + "loss": 0.2775, + "step": 1092 + }, + { + "epoch": 0.0874382512349753, + "grad_norm": 0.3342221376356437, + "learning_rate": 9.995093210418672e-06, + "loss": 0.2896, + "step": 1093 + }, + { + "epoch": 0.0875182496350073, + "grad_norm": 0.38175787639915015, + "learning_rate": 9.995064478520083e-06, + "loss": 0.3225, + "step": 1094 + }, + { + "epoch": 0.0875982480350393, + "grad_norm": 0.3291457453192748, + "learning_rate": 9.995035662788039e-06, + "loss": 0.2687, + "step": 1095 + }, + { + "epoch": 0.0876782464350713, + "grad_norm": 0.3379320080823688, + "learning_rate": 9.995006763223028e-06, + "loss": 0.2913, + "step": 1096 + }, + { + "epoch": 0.08775824483510329, + "grad_norm": 0.33494001523055583, + "learning_rate": 9.99497777982553e-06, + "loss": 0.2875, + "step": 1097 + }, + { + "epoch": 0.0878382432351353, + "grad_norm": 0.28030322892451337, + "learning_rate": 9.994948712596033e-06, + "loss": 0.3402, + "step": 1098 + }, + { + "epoch": 0.0879182416351673, + "grad_norm": 0.2547753550649061, + "learning_rate": 9.994919561535026e-06, + "loss": 0.3399, + "step": 1099 + }, + { + "epoch": 0.0879982400351993, + "grad_norm": 0.2417990846917735, + "learning_rate": 9.994890326642998e-06, + "loss": 0.323, + "step": 1100 + }, + { + "epoch": 0.0880782384352313, + "grad_norm": 0.40394887803809776, + "learning_rate": 9.99486100792044e-06, + "loss": 0.2921, + "step": 1101 + }, + { + "epoch": 0.08815823683526329, + "grad_norm": 0.389572500490799, + "learning_rate": 9.994831605367842e-06, + "loss": 0.2709, + "step": 1102 + }, + { + "epoch": 0.0882382352352953, + "grad_norm": 0.3693626832339756, + "learning_rate": 9.9948021189857e-06, + "loss": 0.2883, + "step": 1103 + }, + { + "epoch": 0.08831823363532729, + "grad_norm": 0.5292367315887432, + "learning_rate": 9.994772548774506e-06, + "loss": 0.31, + "step": 1104 + }, + { + "epoch": 0.08839823203535929, + "grad_norm": 0.3752113934340967, + "learning_rate": 9.994742894734759e-06, + "loss": 0.3006, + "step": 1105 + }, + { + "epoch": 0.0884782304353913, + "grad_norm": 0.3264503928718569, + "learning_rate": 9.994713156866956e-06, + "loss": 0.2709, + "step": 1106 + }, + { + "epoch": 0.08855822883542329, + "grad_norm": 0.3292072846562249, + "learning_rate": 9.994683335171594e-06, + "loss": 0.2846, + "step": 1107 + }, + { + "epoch": 0.0886382272354553, + "grad_norm": 0.2944920506216355, + "learning_rate": 9.994653429649178e-06, + "loss": 0.3069, + "step": 1108 + }, + { + "epoch": 0.08871822563548729, + "grad_norm": 0.33142288438776357, + "learning_rate": 9.994623440300205e-06, + "loss": 0.3041, + "step": 1109 + }, + { + "epoch": 0.08879822403551929, + "grad_norm": 0.2695811068140248, + "learning_rate": 9.99459336712518e-06, + "loss": 0.316, + "step": 1110 + }, + { + "epoch": 0.08887822243555128, + "grad_norm": 0.2716862425216491, + "learning_rate": 9.99456321012461e-06, + "loss": 0.3092, + "step": 1111 + }, + { + "epoch": 0.08895822083558329, + "grad_norm": 0.309055003074196, + "learning_rate": 9.994532969298999e-06, + "loss": 0.2731, + "step": 1112 + }, + { + "epoch": 0.08903821923561529, + "grad_norm": 0.28227866826893694, + "learning_rate": 9.994502644648854e-06, + "loss": 0.3395, + "step": 1113 + }, + { + "epoch": 0.08911821763564728, + "grad_norm": 0.30176394973207316, + "learning_rate": 9.994472236174686e-06, + "loss": 0.3181, + "step": 1114 + }, + { + "epoch": 0.08919821603567929, + "grad_norm": 0.31529772068413553, + "learning_rate": 9.994441743877003e-06, + "loss": 0.2785, + "step": 1115 + }, + { + "epoch": 0.08927821443571128, + "grad_norm": 0.29342782703419024, + "learning_rate": 9.994411167756319e-06, + "loss": 0.3352, + "step": 1116 + }, + { + "epoch": 0.08935821283574329, + "grad_norm": 0.35663002922122833, + "learning_rate": 9.994380507813146e-06, + "loss": 0.2602, + "step": 1117 + }, + { + "epoch": 0.08943821123577528, + "grad_norm": 0.3090479557779549, + "learning_rate": 9.994349764047999e-06, + "loss": 0.3173, + "step": 1118 + }, + { + "epoch": 0.08951820963580728, + "grad_norm": 0.26867713477482036, + "learning_rate": 9.994318936461393e-06, + "loss": 0.334, + "step": 1119 + }, + { + "epoch": 0.08959820803583929, + "grad_norm": 0.2663020096977348, + "learning_rate": 9.994288025053846e-06, + "loss": 0.2921, + "step": 1120 + }, + { + "epoch": 0.08967820643587128, + "grad_norm": 0.34647757135967905, + "learning_rate": 9.994257029825876e-06, + "loss": 0.3168, + "step": 1121 + }, + { + "epoch": 0.08975820483590329, + "grad_norm": 0.3084340285336873, + "learning_rate": 9.994225950778005e-06, + "loss": 0.2928, + "step": 1122 + }, + { + "epoch": 0.08983820323593528, + "grad_norm": 0.3350039376404601, + "learning_rate": 9.994194787910754e-06, + "loss": 0.2856, + "step": 1123 + }, + { + "epoch": 0.08991820163596728, + "grad_norm": 0.27082485458385747, + "learning_rate": 9.994163541224645e-06, + "loss": 0.3464, + "step": 1124 + }, + { + "epoch": 0.08999820003599927, + "grad_norm": 0.2630252580391773, + "learning_rate": 9.994132210720204e-06, + "loss": 0.3253, + "step": 1125 + }, + { + "epoch": 0.09007819843603128, + "grad_norm": 0.2641326608970051, + "learning_rate": 9.994100796397954e-06, + "loss": 0.3379, + "step": 1126 + }, + { + "epoch": 0.09015819683606328, + "grad_norm": 0.2650701043258218, + "learning_rate": 9.994069298258427e-06, + "loss": 0.3345, + "step": 1127 + }, + { + "epoch": 0.09023819523609528, + "grad_norm": 0.3128592679954027, + "learning_rate": 9.994037716302146e-06, + "loss": 0.326, + "step": 1128 + }, + { + "epoch": 0.09031819363612728, + "grad_norm": 0.296270761468446, + "learning_rate": 9.994006050529645e-06, + "loss": 0.3256, + "step": 1129 + }, + { + "epoch": 0.09039819203615927, + "grad_norm": 0.2828148736583494, + "learning_rate": 9.993974300941455e-06, + "loss": 0.2952, + "step": 1130 + }, + { + "epoch": 0.09047819043619128, + "grad_norm": 0.2796649650954519, + "learning_rate": 9.993942467538107e-06, + "loss": 0.3381, + "step": 1131 + }, + { + "epoch": 0.09055818883622327, + "grad_norm": 0.3151368856315861, + "learning_rate": 9.993910550320137e-06, + "loss": 0.3371, + "step": 1132 + }, + { + "epoch": 0.09063818723625527, + "grad_norm": 0.28245747975267826, + "learning_rate": 9.99387854928808e-06, + "loss": 0.3082, + "step": 1133 + }, + { + "epoch": 0.09071818563628728, + "grad_norm": 0.32361889119371196, + "learning_rate": 9.993846464442473e-06, + "loss": 0.2995, + "step": 1134 + }, + { + "epoch": 0.09079818403631927, + "grad_norm": 0.2813358383018441, + "learning_rate": 9.993814295783855e-06, + "loss": 0.3077, + "step": 1135 + }, + { + "epoch": 0.09087818243635128, + "grad_norm": 0.3026912738633303, + "learning_rate": 9.993782043312765e-06, + "loss": 0.3102, + "step": 1136 + }, + { + "epoch": 0.09095818083638327, + "grad_norm": 0.24039332572931846, + "learning_rate": 9.993749707029746e-06, + "loss": 0.3404, + "step": 1137 + }, + { + "epoch": 0.09103817923641527, + "grad_norm": 0.3171604716413527, + "learning_rate": 9.993717286935339e-06, + "loss": 0.3046, + "step": 1138 + }, + { + "epoch": 0.09111817763644726, + "grad_norm": 0.35600585478704544, + "learning_rate": 9.99368478303009e-06, + "loss": 0.3196, + "step": 1139 + }, + { + "epoch": 0.09119817603647927, + "grad_norm": 0.3188123205527489, + "learning_rate": 9.99365219531454e-06, + "loss": 0.3164, + "step": 1140 + }, + { + "epoch": 0.09127817443651128, + "grad_norm": 0.30752893837296913, + "learning_rate": 9.993619523789241e-06, + "loss": 0.3034, + "step": 1141 + }, + { + "epoch": 0.09135817283654327, + "grad_norm": 0.3545662479986179, + "learning_rate": 9.99358676845474e-06, + "loss": 0.3311, + "step": 1142 + }, + { + "epoch": 0.09143817123657527, + "grad_norm": 0.27140060442224245, + "learning_rate": 9.993553929311587e-06, + "loss": 0.3334, + "step": 1143 + }, + { + "epoch": 0.09151816963660726, + "grad_norm": 0.3530343564342775, + "learning_rate": 9.993521006360329e-06, + "loss": 0.3019, + "step": 1144 + }, + { + "epoch": 0.09159816803663927, + "grad_norm": 0.25501891872163307, + "learning_rate": 9.993487999601522e-06, + "loss": 0.3078, + "step": 1145 + }, + { + "epoch": 0.09167816643667126, + "grad_norm": 0.24185740548790127, + "learning_rate": 9.993454909035724e-06, + "loss": 0.3441, + "step": 1146 + }, + { + "epoch": 0.09175816483670327, + "grad_norm": 0.302461767338097, + "learning_rate": 9.993421734663484e-06, + "loss": 0.3076, + "step": 1147 + }, + { + "epoch": 0.09183816323673527, + "grad_norm": 0.31997533045008764, + "learning_rate": 9.993388476485361e-06, + "loss": 0.2858, + "step": 1148 + }, + { + "epoch": 0.09191816163676726, + "grad_norm": 0.33352700801025476, + "learning_rate": 9.993355134501914e-06, + "loss": 0.3036, + "step": 1149 + }, + { + "epoch": 0.09199816003679927, + "grad_norm": 0.23477152550410624, + "learning_rate": 9.9933217087137e-06, + "loss": 0.3307, + "step": 1150 + }, + { + "epoch": 0.09207815843683126, + "grad_norm": 0.3241193696016506, + "learning_rate": 9.993288199121283e-06, + "loss": 0.2866, + "step": 1151 + }, + { + "epoch": 0.09215815683686326, + "grad_norm": 0.29862810839684006, + "learning_rate": 9.993254605725225e-06, + "loss": 0.3027, + "step": 1152 + }, + { + "epoch": 0.09223815523689526, + "grad_norm": 0.31052349229189347, + "learning_rate": 9.993220928526086e-06, + "loss": 0.3052, + "step": 1153 + }, + { + "epoch": 0.09231815363692726, + "grad_norm": 0.32688402190217625, + "learning_rate": 9.993187167524437e-06, + "loss": 0.2826, + "step": 1154 + }, + { + "epoch": 0.09239815203695927, + "grad_norm": 0.43221238048685584, + "learning_rate": 9.993153322720841e-06, + "loss": 0.306, + "step": 1155 + }, + { + "epoch": 0.09247815043699126, + "grad_norm": 0.31361974540620574, + "learning_rate": 9.993119394115866e-06, + "loss": 0.2957, + "step": 1156 + }, + { + "epoch": 0.09255814883702326, + "grad_norm": 0.2372527917951173, + "learning_rate": 9.993085381710083e-06, + "loss": 0.3334, + "step": 1157 + }, + { + "epoch": 0.09263814723705525, + "grad_norm": 0.3787659030960463, + "learning_rate": 9.993051285504063e-06, + "loss": 0.3053, + "step": 1158 + }, + { + "epoch": 0.09271814563708726, + "grad_norm": 0.2514564379744673, + "learning_rate": 9.993017105498378e-06, + "loss": 0.333, + "step": 1159 + }, + { + "epoch": 0.09279814403711925, + "grad_norm": 0.28744698194014523, + "learning_rate": 9.992982841693599e-06, + "loss": 0.3222, + "step": 1160 + }, + { + "epoch": 0.09287814243715126, + "grad_norm": 0.20346121831885777, + "learning_rate": 9.992948494090303e-06, + "loss": 0.3555, + "step": 1161 + }, + { + "epoch": 0.09295814083718326, + "grad_norm": 0.22740467560663488, + "learning_rate": 9.992914062689068e-06, + "loss": 0.377, + "step": 1162 + }, + { + "epoch": 0.09303813923721525, + "grad_norm": 0.20088268737007853, + "learning_rate": 9.992879547490469e-06, + "loss": 0.3767, + "step": 1163 + }, + { + "epoch": 0.09311813763724726, + "grad_norm": 0.3302587245313672, + "learning_rate": 9.992844948495088e-06, + "loss": 0.2773, + "step": 1164 + }, + { + "epoch": 0.09319813603727925, + "grad_norm": 0.3190881697599193, + "learning_rate": 9.992810265703503e-06, + "loss": 0.3487, + "step": 1165 + }, + { + "epoch": 0.09327813443731126, + "grad_norm": 0.2843003259587228, + "learning_rate": 9.992775499116299e-06, + "loss": 0.323, + "step": 1166 + }, + { + "epoch": 0.09335813283734325, + "grad_norm": 0.28923702185263334, + "learning_rate": 9.992740648734057e-06, + "loss": 0.3134, + "step": 1167 + }, + { + "epoch": 0.09343813123737525, + "grad_norm": 0.6165482291257397, + "learning_rate": 9.992705714557362e-06, + "loss": 0.2937, + "step": 1168 + }, + { + "epoch": 0.09351812963740726, + "grad_norm": 0.362392729699564, + "learning_rate": 9.992670696586802e-06, + "loss": 0.2989, + "step": 1169 + }, + { + "epoch": 0.09359812803743925, + "grad_norm": 0.32207836775413334, + "learning_rate": 9.992635594822965e-06, + "loss": 0.2955, + "step": 1170 + }, + { + "epoch": 0.09367812643747125, + "grad_norm": 0.2647476605454954, + "learning_rate": 9.992600409266437e-06, + "loss": 0.3182, + "step": 1171 + }, + { + "epoch": 0.09375812483750325, + "grad_norm": 0.2677071427789346, + "learning_rate": 9.992565139917812e-06, + "loss": 0.3568, + "step": 1172 + }, + { + "epoch": 0.09383812323753525, + "grad_norm": 0.265695309870558, + "learning_rate": 9.99252978677768e-06, + "loss": 0.3582, + "step": 1173 + }, + { + "epoch": 0.09391812163756724, + "grad_norm": 0.28933844261169883, + "learning_rate": 9.992494349846635e-06, + "loss": 0.3034, + "step": 1174 + }, + { + "epoch": 0.09399812003759925, + "grad_norm": 0.24966436690867466, + "learning_rate": 9.992458829125271e-06, + "loss": 0.321, + "step": 1175 + }, + { + "epoch": 0.09407811843763125, + "grad_norm": 0.2979458823286954, + "learning_rate": 9.992423224614185e-06, + "loss": 0.322, + "step": 1176 + }, + { + "epoch": 0.09415811683766324, + "grad_norm": 0.37666835239644997, + "learning_rate": 9.992387536313975e-06, + "loss": 0.3137, + "step": 1177 + }, + { + "epoch": 0.09423811523769525, + "grad_norm": 0.24910144848028087, + "learning_rate": 9.992351764225238e-06, + "loss": 0.3252, + "step": 1178 + }, + { + "epoch": 0.09431811363772724, + "grad_norm": 0.45518140789557215, + "learning_rate": 9.992315908348578e-06, + "loss": 0.3145, + "step": 1179 + }, + { + "epoch": 0.09439811203775925, + "grad_norm": 0.3574587389265085, + "learning_rate": 9.992279968684592e-06, + "loss": 0.3062, + "step": 1180 + }, + { + "epoch": 0.09447811043779124, + "grad_norm": 0.31609703853016824, + "learning_rate": 9.992243945233886e-06, + "loss": 0.2903, + "step": 1181 + }, + { + "epoch": 0.09455810883782324, + "grad_norm": 0.2698606232678801, + "learning_rate": 9.992207837997064e-06, + "loss": 0.3315, + "step": 1182 + }, + { + "epoch": 0.09463810723785525, + "grad_norm": 0.25917329643829207, + "learning_rate": 9.992171646974734e-06, + "loss": 0.3661, + "step": 1183 + }, + { + "epoch": 0.09471810563788724, + "grad_norm": 0.3263059841584088, + "learning_rate": 9.9921353721675e-06, + "loss": 0.3093, + "step": 1184 + }, + { + "epoch": 0.09479810403791925, + "grad_norm": 0.2956940930046495, + "learning_rate": 9.99209901357597e-06, + "loss": 0.3079, + "step": 1185 + }, + { + "epoch": 0.09487810243795124, + "grad_norm": 0.2685558090337002, + "learning_rate": 9.99206257120076e-06, + "loss": 0.3446, + "step": 1186 + }, + { + "epoch": 0.09495810083798324, + "grad_norm": 0.3131040853626263, + "learning_rate": 9.992026045042478e-06, + "loss": 0.3114, + "step": 1187 + }, + { + "epoch": 0.09503809923801523, + "grad_norm": 0.30931169369990325, + "learning_rate": 9.991989435101736e-06, + "loss": 0.3113, + "step": 1188 + }, + { + "epoch": 0.09511809763804724, + "grad_norm": 0.3140938456843547, + "learning_rate": 9.99195274137915e-06, + "loss": 0.2967, + "step": 1189 + }, + { + "epoch": 0.09519809603807924, + "grad_norm": 0.3021078159539041, + "learning_rate": 9.991915963875336e-06, + "loss": 0.3053, + "step": 1190 + }, + { + "epoch": 0.09527809443811124, + "grad_norm": 0.35104514548535, + "learning_rate": 9.991879102590912e-06, + "loss": 0.2844, + "step": 1191 + }, + { + "epoch": 0.09535809283814324, + "grad_norm": 0.28970511773908714, + "learning_rate": 9.991842157526493e-06, + "loss": 0.3172, + "step": 1192 + }, + { + "epoch": 0.09543809123817523, + "grad_norm": 0.31035348927258927, + "learning_rate": 9.9918051286827e-06, + "loss": 0.3229, + "step": 1193 + }, + { + "epoch": 0.09551808963820724, + "grad_norm": 0.3094756803925865, + "learning_rate": 9.991768016060159e-06, + "loss": 0.3181, + "step": 1194 + }, + { + "epoch": 0.09559808803823923, + "grad_norm": 0.2945036503778562, + "learning_rate": 9.99173081965949e-06, + "loss": 0.3161, + "step": 1195 + }, + { + "epoch": 0.09567808643827123, + "grad_norm": 0.3036940900954199, + "learning_rate": 9.991693539481317e-06, + "loss": 0.3022, + "step": 1196 + }, + { + "epoch": 0.09575808483830324, + "grad_norm": 0.23338442756401814, + "learning_rate": 9.991656175526264e-06, + "loss": 0.3233, + "step": 1197 + }, + { + "epoch": 0.09583808323833523, + "grad_norm": 0.3356766740445785, + "learning_rate": 9.99161872779496e-06, + "loss": 0.3084, + "step": 1198 + }, + { + "epoch": 0.09591808163836724, + "grad_norm": 0.32150153531273873, + "learning_rate": 9.991581196288035e-06, + "loss": 0.2711, + "step": 1199 + }, + { + "epoch": 0.09599808003839923, + "grad_norm": 0.3377743039809436, + "learning_rate": 9.991543581006116e-06, + "loss": 0.3043, + "step": 1200 + }, + { + "epoch": 0.09607807843843123, + "grad_norm": 0.39570810617129387, + "learning_rate": 9.991505881949837e-06, + "loss": 0.286, + "step": 1201 + }, + { + "epoch": 0.09615807683846322, + "grad_norm": 0.32451725203786386, + "learning_rate": 9.991468099119828e-06, + "loss": 0.2799, + "step": 1202 + }, + { + "epoch": 0.09623807523849523, + "grad_norm": 0.3635693970246387, + "learning_rate": 9.991430232516725e-06, + "loss": 0.2935, + "step": 1203 + }, + { + "epoch": 0.09631807363852724, + "grad_norm": 0.30611953569173006, + "learning_rate": 9.991392282141161e-06, + "loss": 0.3244, + "step": 1204 + }, + { + "epoch": 0.09639807203855923, + "grad_norm": 0.34507660532781204, + "learning_rate": 9.991354247993776e-06, + "loss": 0.2896, + "step": 1205 + }, + { + "epoch": 0.09647807043859123, + "grad_norm": 0.3074957507688657, + "learning_rate": 9.991316130075208e-06, + "loss": 0.303, + "step": 1206 + }, + { + "epoch": 0.09655806883862322, + "grad_norm": 0.32739494400790675, + "learning_rate": 9.991277928386095e-06, + "loss": 0.3596, + "step": 1207 + }, + { + "epoch": 0.09663806723865523, + "grad_norm": 0.2667528346494223, + "learning_rate": 9.99123964292708e-06, + "loss": 0.343, + "step": 1208 + }, + { + "epoch": 0.09671806563868722, + "grad_norm": 0.2935436880471149, + "learning_rate": 9.991201273698805e-06, + "loss": 0.3225, + "step": 1209 + }, + { + "epoch": 0.09679806403871923, + "grad_norm": 0.3462450731645947, + "learning_rate": 9.991162820701911e-06, + "loss": 0.3161, + "step": 1210 + }, + { + "epoch": 0.09687806243875123, + "grad_norm": 0.31331579533043286, + "learning_rate": 9.991124283937049e-06, + "loss": 0.327, + "step": 1211 + }, + { + "epoch": 0.09695806083878322, + "grad_norm": 0.2926329699694107, + "learning_rate": 9.991085663404862e-06, + "loss": 0.3073, + "step": 1212 + }, + { + "epoch": 0.09703805923881523, + "grad_norm": 0.2549894897391792, + "learning_rate": 9.991046959105998e-06, + "loss": 0.3559, + "step": 1213 + }, + { + "epoch": 0.09711805763884722, + "grad_norm": 0.3672521042995224, + "learning_rate": 9.991008171041107e-06, + "loss": 0.2749, + "step": 1214 + }, + { + "epoch": 0.09719805603887922, + "grad_norm": 0.330753046382212, + "learning_rate": 9.990969299210843e-06, + "loss": 0.3052, + "step": 1215 + }, + { + "epoch": 0.09727805443891122, + "grad_norm": 0.33852157550638545, + "learning_rate": 9.990930343615854e-06, + "loss": 0.2843, + "step": 1216 + }, + { + "epoch": 0.09735805283894322, + "grad_norm": 0.3071865110051899, + "learning_rate": 9.990891304256796e-06, + "loss": 0.3034, + "step": 1217 + }, + { + "epoch": 0.09743805123897523, + "grad_norm": 0.3543877929036405, + "learning_rate": 9.990852181134323e-06, + "loss": 0.3042, + "step": 1218 + }, + { + "epoch": 0.09751804963900722, + "grad_norm": 0.3296751124860639, + "learning_rate": 9.990812974249094e-06, + "loss": 0.2936, + "step": 1219 + }, + { + "epoch": 0.09759804803903922, + "grad_norm": 0.3319169403443758, + "learning_rate": 9.990773683601764e-06, + "loss": 0.281, + "step": 1220 + }, + { + "epoch": 0.09767804643907121, + "grad_norm": 0.29435761612760664, + "learning_rate": 9.990734309192995e-06, + "loss": 0.3026, + "step": 1221 + }, + { + "epoch": 0.09775804483910322, + "grad_norm": 0.3508696198780259, + "learning_rate": 9.990694851023446e-06, + "loss": 0.2991, + "step": 1222 + }, + { + "epoch": 0.09783804323913521, + "grad_norm": 0.27849028589025154, + "learning_rate": 9.99065530909378e-06, + "loss": 0.3651, + "step": 1223 + }, + { + "epoch": 0.09791804163916722, + "grad_norm": 0.24643777819752472, + "learning_rate": 9.99061568340466e-06, + "loss": 0.3446, + "step": 1224 + }, + { + "epoch": 0.09799804003919922, + "grad_norm": 0.36466697956223487, + "learning_rate": 9.990575973956754e-06, + "loss": 0.2957, + "step": 1225 + }, + { + "epoch": 0.09807803843923121, + "grad_norm": 0.28234719611897835, + "learning_rate": 9.990536180750724e-06, + "loss": 0.3128, + "step": 1226 + }, + { + "epoch": 0.09815803683926322, + "grad_norm": 0.26571628888189464, + "learning_rate": 9.990496303787243e-06, + "loss": 0.3374, + "step": 1227 + }, + { + "epoch": 0.09823803523929521, + "grad_norm": 0.3165685919933477, + "learning_rate": 9.990456343066975e-06, + "loss": 0.2846, + "step": 1228 + }, + { + "epoch": 0.09831803363932722, + "grad_norm": 0.3037004282280437, + "learning_rate": 9.990416298590593e-06, + "loss": 0.297, + "step": 1229 + }, + { + "epoch": 0.0983980320393592, + "grad_norm": 0.22884022513625024, + "learning_rate": 9.990376170358769e-06, + "loss": 0.3537, + "step": 1230 + }, + { + "epoch": 0.09847803043939121, + "grad_norm": 0.3393243856563129, + "learning_rate": 9.990335958372178e-06, + "loss": 0.2936, + "step": 1231 + }, + { + "epoch": 0.09855802883942322, + "grad_norm": 0.33740733112670124, + "learning_rate": 9.99029566263149e-06, + "loss": 0.3046, + "step": 1232 + }, + { + "epoch": 0.09863802723945521, + "grad_norm": 0.39086042470060434, + "learning_rate": 9.990255283137388e-06, + "loss": 0.2935, + "step": 1233 + }, + { + "epoch": 0.09871802563948721, + "grad_norm": 0.2825160437529858, + "learning_rate": 9.990214819890545e-06, + "loss": 0.3135, + "step": 1234 + }, + { + "epoch": 0.0987980240395192, + "grad_norm": 0.3024014630499565, + "learning_rate": 9.990174272891642e-06, + "loss": 0.2969, + "step": 1235 + }, + { + "epoch": 0.09887802243955121, + "grad_norm": 0.26399402582762344, + "learning_rate": 9.990133642141359e-06, + "loss": 0.3313, + "step": 1236 + }, + { + "epoch": 0.0989580208395832, + "grad_norm": 0.2928909917195688, + "learning_rate": 9.990092927640378e-06, + "loss": 0.3032, + "step": 1237 + }, + { + "epoch": 0.09903801923961521, + "grad_norm": 0.2875495531790397, + "learning_rate": 9.99005212938938e-06, + "loss": 0.3308, + "step": 1238 + }, + { + "epoch": 0.09911801763964721, + "grad_norm": 0.3599689676409383, + "learning_rate": 9.990011247389055e-06, + "loss": 0.2725, + "step": 1239 + }, + { + "epoch": 0.0991980160396792, + "grad_norm": 0.3040189589717307, + "learning_rate": 9.989970281640085e-06, + "loss": 0.2988, + "step": 1240 + }, + { + "epoch": 0.09927801443971121, + "grad_norm": 0.3301827038386289, + "learning_rate": 9.989929232143159e-06, + "loss": 0.3195, + "step": 1241 + }, + { + "epoch": 0.0993580128397432, + "grad_norm": 0.3317958817552823, + "learning_rate": 9.989888098898965e-06, + "loss": 0.2705, + "step": 1242 + }, + { + "epoch": 0.0994380112397752, + "grad_norm": 0.293612853487226, + "learning_rate": 9.989846881908194e-06, + "loss": 0.2994, + "step": 1243 + }, + { + "epoch": 0.0995180096398072, + "grad_norm": 0.31166233169067775, + "learning_rate": 9.989805581171537e-06, + "loss": 0.3215, + "step": 1244 + }, + { + "epoch": 0.0995980080398392, + "grad_norm": 0.3352203043464999, + "learning_rate": 9.98976419668969e-06, + "loss": 0.3226, + "step": 1245 + }, + { + "epoch": 0.09967800643987121, + "grad_norm": 0.313630090794967, + "learning_rate": 9.989722728463345e-06, + "loss": 0.3171, + "step": 1246 + }, + { + "epoch": 0.0997580048399032, + "grad_norm": 0.2695007737475229, + "learning_rate": 9.989681176493197e-06, + "loss": 0.3466, + "step": 1247 + }, + { + "epoch": 0.0998380032399352, + "grad_norm": 0.38569026874778967, + "learning_rate": 9.989639540779945e-06, + "loss": 0.2876, + "step": 1248 + }, + { + "epoch": 0.0999180016399672, + "grad_norm": 0.35687023715211613, + "learning_rate": 9.989597821324288e-06, + "loss": 0.2975, + "step": 1249 + }, + { + "epoch": 0.0999980000399992, + "grad_norm": 0.20107465899013952, + "learning_rate": 9.989556018126925e-06, + "loss": 0.3793, + "step": 1250 + }, + { + "epoch": 0.1000779984400312, + "grad_norm": 0.3046596668292926, + "learning_rate": 9.98951413118856e-06, + "loss": 0.3289, + "step": 1251 + }, + { + "epoch": 0.1001579968400632, + "grad_norm": 0.2611889258583932, + "learning_rate": 9.989472160509892e-06, + "loss": 0.3395, + "step": 1252 + }, + { + "epoch": 0.1002379952400952, + "grad_norm": 0.3639602629968415, + "learning_rate": 9.989430106091629e-06, + "loss": 0.2988, + "step": 1253 + }, + { + "epoch": 0.1003179936401272, + "grad_norm": 0.27403630401752826, + "learning_rate": 9.989387967934477e-06, + "loss": 0.305, + "step": 1254 + }, + { + "epoch": 0.1003979920401592, + "grad_norm": 0.31924905918529256, + "learning_rate": 9.989345746039138e-06, + "loss": 0.3015, + "step": 1255 + }, + { + "epoch": 0.10047799044019119, + "grad_norm": 0.32996067929703016, + "learning_rate": 9.989303440406328e-06, + "loss": 0.3147, + "step": 1256 + }, + { + "epoch": 0.1005579888402232, + "grad_norm": 0.34996604942401544, + "learning_rate": 9.989261051036752e-06, + "loss": 0.3006, + "step": 1257 + }, + { + "epoch": 0.10063798724025519, + "grad_norm": 0.2878509099106884, + "learning_rate": 9.989218577931124e-06, + "loss": 0.3324, + "step": 1258 + }, + { + "epoch": 0.1007179856402872, + "grad_norm": 0.25956996621191714, + "learning_rate": 9.989176021090155e-06, + "loss": 0.3512, + "step": 1259 + }, + { + "epoch": 0.1007979840403192, + "grad_norm": 0.5301139208275449, + "learning_rate": 9.989133380514558e-06, + "loss": 0.2907, + "step": 1260 + }, + { + "epoch": 0.10087798244035119, + "grad_norm": 0.5293445331642519, + "learning_rate": 9.989090656205052e-06, + "loss": 0.281, + "step": 1261 + }, + { + "epoch": 0.1009579808403832, + "grad_norm": 0.38544550742549455, + "learning_rate": 9.989047848162353e-06, + "loss": 0.3259, + "step": 1262 + }, + { + "epoch": 0.10103797924041519, + "grad_norm": 0.2808493567214432, + "learning_rate": 9.989004956387179e-06, + "loss": 0.3577, + "step": 1263 + }, + { + "epoch": 0.10111797764044719, + "grad_norm": 0.3167349638535576, + "learning_rate": 9.98896198088025e-06, + "loss": 0.3187, + "step": 1264 + }, + { + "epoch": 0.10119797604047918, + "grad_norm": 0.20477675587648825, + "learning_rate": 9.988918921642287e-06, + "loss": 0.3642, + "step": 1265 + }, + { + "epoch": 0.10127797444051119, + "grad_norm": 0.24129749910625356, + "learning_rate": 9.988875778674014e-06, + "loss": 0.3235, + "step": 1266 + }, + { + "epoch": 0.1013579728405432, + "grad_norm": 0.3406247640971144, + "learning_rate": 9.988832551976151e-06, + "loss": 0.2857, + "step": 1267 + }, + { + "epoch": 0.10143797124057519, + "grad_norm": 0.2583420632215183, + "learning_rate": 9.988789241549429e-06, + "loss": 0.3466, + "step": 1268 + }, + { + "epoch": 0.10151796964060719, + "grad_norm": 0.39769910501089345, + "learning_rate": 9.988745847394572e-06, + "loss": 0.2969, + "step": 1269 + }, + { + "epoch": 0.10159796804063918, + "grad_norm": 0.23638316246652688, + "learning_rate": 9.98870236951231e-06, + "loss": 0.3511, + "step": 1270 + }, + { + "epoch": 0.10167796644067119, + "grad_norm": 0.2961384507963859, + "learning_rate": 9.988658807903369e-06, + "loss": 0.3383, + "step": 1271 + }, + { + "epoch": 0.10175796484070318, + "grad_norm": 0.34816728036944666, + "learning_rate": 9.988615162568483e-06, + "loss": 0.3226, + "step": 1272 + }, + { + "epoch": 0.10183796324073519, + "grad_norm": 0.6701901856513401, + "learning_rate": 9.988571433508383e-06, + "loss": 0.3058, + "step": 1273 + }, + { + "epoch": 0.10191796164076719, + "grad_norm": 0.35191364392506774, + "learning_rate": 9.988527620723804e-06, + "loss": 0.3205, + "step": 1274 + }, + { + "epoch": 0.10199796004079918, + "grad_norm": 0.4927961845676967, + "learning_rate": 9.988483724215483e-06, + "loss": 0.3149, + "step": 1275 + }, + { + "epoch": 0.10207795844083119, + "grad_norm": 0.3857376980686955, + "learning_rate": 9.988439743984155e-06, + "loss": 0.2871, + "step": 1276 + }, + { + "epoch": 0.10215795684086318, + "grad_norm": 0.30136918542905744, + "learning_rate": 9.988395680030556e-06, + "loss": 0.3063, + "step": 1277 + }, + { + "epoch": 0.10223795524089518, + "grad_norm": 0.31694495102436554, + "learning_rate": 9.988351532355428e-06, + "loss": 0.3365, + "step": 1278 + }, + { + "epoch": 0.10231795364092718, + "grad_norm": 0.3050659818145193, + "learning_rate": 9.988307300959513e-06, + "loss": 0.3163, + "step": 1279 + }, + { + "epoch": 0.10239795204095918, + "grad_norm": 0.2868988167797236, + "learning_rate": 9.988262985843551e-06, + "loss": 0.312, + "step": 1280 + }, + { + "epoch": 0.10247795044099119, + "grad_norm": 0.2936633728806369, + "learning_rate": 9.988218587008287e-06, + "loss": 0.3031, + "step": 1281 + }, + { + "epoch": 0.10255794884102318, + "grad_norm": 0.30562312034708167, + "learning_rate": 9.988174104454466e-06, + "loss": 0.2953, + "step": 1282 + }, + { + "epoch": 0.10263794724105518, + "grad_norm": 0.30098613252972745, + "learning_rate": 9.988129538182833e-06, + "loss": 0.3101, + "step": 1283 + }, + { + "epoch": 0.10271794564108717, + "grad_norm": 0.3439752151035775, + "learning_rate": 9.988084888194139e-06, + "loss": 0.3316, + "step": 1284 + }, + { + "epoch": 0.10279794404111918, + "grad_norm": 0.2290340858711675, + "learning_rate": 9.98804015448913e-06, + "loss": 0.3739, + "step": 1285 + }, + { + "epoch": 0.10287794244115117, + "grad_norm": 0.30234721270267423, + "learning_rate": 9.98799533706856e-06, + "loss": 0.3108, + "step": 1286 + }, + { + "epoch": 0.10295794084118318, + "grad_norm": 0.3167429964534761, + "learning_rate": 9.987950435933179e-06, + "loss": 0.3055, + "step": 1287 + }, + { + "epoch": 0.10303793924121518, + "grad_norm": 0.35936923207854876, + "learning_rate": 9.987905451083742e-06, + "loss": 0.2913, + "step": 1288 + }, + { + "epoch": 0.10311793764124717, + "grad_norm": 0.33185189738148346, + "learning_rate": 9.987860382521003e-06, + "loss": 0.2946, + "step": 1289 + }, + { + "epoch": 0.10319793604127918, + "grad_norm": 0.317909989613041, + "learning_rate": 9.987815230245717e-06, + "loss": 0.3149, + "step": 1290 + }, + { + "epoch": 0.10327793444131117, + "grad_norm": 0.3385313019442595, + "learning_rate": 9.987769994258645e-06, + "loss": 0.2729, + "step": 1291 + }, + { + "epoch": 0.10335793284134318, + "grad_norm": 0.2854576039475464, + "learning_rate": 9.987724674560544e-06, + "loss": 0.3226, + "step": 1292 + }, + { + "epoch": 0.10343793124137517, + "grad_norm": 0.3442677702237156, + "learning_rate": 9.987679271152175e-06, + "loss": 0.2935, + "step": 1293 + }, + { + "epoch": 0.10351792964140717, + "grad_norm": 0.34197836726400965, + "learning_rate": 9.9876337840343e-06, + "loss": 0.3077, + "step": 1294 + }, + { + "epoch": 0.10359792804143918, + "grad_norm": 0.29507960965647795, + "learning_rate": 9.987588213207684e-06, + "loss": 0.3307, + "step": 1295 + }, + { + "epoch": 0.10367792644147117, + "grad_norm": 0.25240740382005794, + "learning_rate": 9.98754255867309e-06, + "loss": 0.3734, + "step": 1296 + }, + { + "epoch": 0.10375792484150317, + "grad_norm": 0.3974352571139531, + "learning_rate": 9.987496820431284e-06, + "loss": 0.2913, + "step": 1297 + }, + { + "epoch": 0.10383792324153517, + "grad_norm": 0.2937148592539245, + "learning_rate": 9.987450998483035e-06, + "loss": 0.3165, + "step": 1298 + }, + { + "epoch": 0.10391792164156717, + "grad_norm": 0.35243267867548045, + "learning_rate": 9.987405092829113e-06, + "loss": 0.352, + "step": 1299 + }, + { + "epoch": 0.10399792004159916, + "grad_norm": 0.3370132406639814, + "learning_rate": 9.987359103470284e-06, + "loss": 0.2804, + "step": 1300 + }, + { + "epoch": 0.10407791844163117, + "grad_norm": 0.3370863623867093, + "learning_rate": 9.987313030407325e-06, + "loss": 0.315, + "step": 1301 + }, + { + "epoch": 0.10415791684166317, + "grad_norm": 0.29846961511591147, + "learning_rate": 9.987266873641005e-06, + "loss": 0.3393, + "step": 1302 + }, + { + "epoch": 0.10423791524169516, + "grad_norm": 0.2655617079971374, + "learning_rate": 9.987220633172101e-06, + "loss": 0.3483, + "step": 1303 + }, + { + "epoch": 0.10431791364172717, + "grad_norm": 0.3039496905169994, + "learning_rate": 9.987174309001389e-06, + "loss": 0.3167, + "step": 1304 + }, + { + "epoch": 0.10439791204175916, + "grad_norm": 0.8820324191915728, + "learning_rate": 9.987127901129647e-06, + "loss": 0.324, + "step": 1305 + }, + { + "epoch": 0.10447791044179117, + "grad_norm": 0.2690644999703079, + "learning_rate": 9.987081409557653e-06, + "loss": 0.3436, + "step": 1306 + }, + { + "epoch": 0.10455790884182316, + "grad_norm": 0.42625475439059285, + "learning_rate": 9.987034834286186e-06, + "loss": 0.2909, + "step": 1307 + }, + { + "epoch": 0.10463790724185516, + "grad_norm": 0.30301684141354085, + "learning_rate": 9.98698817531603e-06, + "loss": 0.3169, + "step": 1308 + }, + { + "epoch": 0.10471790564188717, + "grad_norm": 0.35746521313791585, + "learning_rate": 9.986941432647968e-06, + "loss": 0.353, + "step": 1309 + }, + { + "epoch": 0.10479790404191916, + "grad_norm": 0.3444488962456452, + "learning_rate": 9.986894606282781e-06, + "loss": 0.3015, + "step": 1310 + }, + { + "epoch": 0.10487790244195117, + "grad_norm": 0.2599942598559724, + "learning_rate": 9.98684769622126e-06, + "loss": 0.3412, + "step": 1311 + }, + { + "epoch": 0.10495790084198316, + "grad_norm": 0.45244421192262807, + "learning_rate": 9.986800702464188e-06, + "loss": 0.3179, + "step": 1312 + }, + { + "epoch": 0.10503789924201516, + "grad_norm": 0.28006695860917663, + "learning_rate": 9.986753625012358e-06, + "loss": 0.2973, + "step": 1313 + }, + { + "epoch": 0.10511789764204715, + "grad_norm": 0.3071198583963096, + "learning_rate": 9.986706463866555e-06, + "loss": 0.2926, + "step": 1314 + }, + { + "epoch": 0.10519789604207916, + "grad_norm": 0.3165090994527639, + "learning_rate": 9.986659219027575e-06, + "loss": 0.2932, + "step": 1315 + }, + { + "epoch": 0.10527789444211116, + "grad_norm": 0.26672877344909224, + "learning_rate": 9.986611890496207e-06, + "loss": 0.3505, + "step": 1316 + }, + { + "epoch": 0.10535789284214316, + "grad_norm": 0.2895156775431847, + "learning_rate": 9.986564478273249e-06, + "loss": 0.3298, + "step": 1317 + }, + { + "epoch": 0.10543789124217516, + "grad_norm": 0.2799604794561796, + "learning_rate": 9.986516982359495e-06, + "loss": 0.3032, + "step": 1318 + }, + { + "epoch": 0.10551788964220715, + "grad_norm": 0.37240292853299173, + "learning_rate": 9.986469402755742e-06, + "loss": 0.3039, + "step": 1319 + }, + { + "epoch": 0.10559788804223916, + "grad_norm": 0.272665775076525, + "learning_rate": 9.986421739462787e-06, + "loss": 0.365, + "step": 1320 + }, + { + "epoch": 0.10567788644227115, + "grad_norm": 0.29670359182274086, + "learning_rate": 9.986373992481434e-06, + "loss": 0.3136, + "step": 1321 + }, + { + "epoch": 0.10575788484230315, + "grad_norm": 0.21156399629307512, + "learning_rate": 9.986326161812482e-06, + "loss": 0.364, + "step": 1322 + }, + { + "epoch": 0.10583788324233516, + "grad_norm": 0.2635861252657959, + "learning_rate": 9.986278247456735e-06, + "loss": 0.319, + "step": 1323 + }, + { + "epoch": 0.10591788164236715, + "grad_norm": 0.31265765705063625, + "learning_rate": 9.986230249414994e-06, + "loss": 0.3545, + "step": 1324 + }, + { + "epoch": 0.10599788004239916, + "grad_norm": 0.25935565466178173, + "learning_rate": 9.986182167688066e-06, + "loss": 0.3392, + "step": 1325 + }, + { + "epoch": 0.10607787844243115, + "grad_norm": 0.2652829491049044, + "learning_rate": 9.98613400227676e-06, + "loss": 0.3271, + "step": 1326 + }, + { + "epoch": 0.10615787684246315, + "grad_norm": 0.42698749029613575, + "learning_rate": 9.986085753181883e-06, + "loss": 0.3037, + "step": 1327 + }, + { + "epoch": 0.10623787524249514, + "grad_norm": 0.29384866440506857, + "learning_rate": 9.986037420404244e-06, + "loss": 0.3019, + "step": 1328 + }, + { + "epoch": 0.10631787364252715, + "grad_norm": 0.5075400258226809, + "learning_rate": 9.985989003944655e-06, + "loss": 0.2607, + "step": 1329 + }, + { + "epoch": 0.10639787204255916, + "grad_norm": 0.3039461273386364, + "learning_rate": 9.985940503803928e-06, + "loss": 0.3111, + "step": 1330 + }, + { + "epoch": 0.10647787044259115, + "grad_norm": 0.33458028149626645, + "learning_rate": 9.985891919982878e-06, + "loss": 0.2939, + "step": 1331 + }, + { + "epoch": 0.10655786884262315, + "grad_norm": 0.4022390686572386, + "learning_rate": 9.98584325248232e-06, + "loss": 0.2909, + "step": 1332 + }, + { + "epoch": 0.10663786724265514, + "grad_norm": 0.2877003106087541, + "learning_rate": 9.98579450130307e-06, + "loss": 0.2898, + "step": 1333 + }, + { + "epoch": 0.10671786564268715, + "grad_norm": 0.34474511312778466, + "learning_rate": 9.985745666445948e-06, + "loss": 0.2765, + "step": 1334 + }, + { + "epoch": 0.10679786404271914, + "grad_norm": 0.32283272866703494, + "learning_rate": 9.98569674791177e-06, + "loss": 0.2855, + "step": 1335 + }, + { + "epoch": 0.10687786244275115, + "grad_norm": 1.1229365030885068, + "learning_rate": 9.985647745701362e-06, + "loss": 0.3144, + "step": 1336 + }, + { + "epoch": 0.10695786084278315, + "grad_norm": 0.3347897688339787, + "learning_rate": 9.985598659815543e-06, + "loss": 0.272, + "step": 1337 + }, + { + "epoch": 0.10703785924281514, + "grad_norm": 0.3254276245874741, + "learning_rate": 9.985549490255138e-06, + "loss": 0.2892, + "step": 1338 + }, + { + "epoch": 0.10711785764284715, + "grad_norm": 0.33438608112543416, + "learning_rate": 9.985500237020972e-06, + "loss": 0.2889, + "step": 1339 + }, + { + "epoch": 0.10719785604287914, + "grad_norm": 0.2872012281666103, + "learning_rate": 9.98545090011387e-06, + "loss": 0.3263, + "step": 1340 + }, + { + "epoch": 0.10727785444291114, + "grad_norm": 0.31516981903686203, + "learning_rate": 9.985401479534664e-06, + "loss": 0.3241, + "step": 1341 + }, + { + "epoch": 0.10735785284294314, + "grad_norm": 0.3979255205966222, + "learning_rate": 9.98535197528418e-06, + "loss": 0.3172, + "step": 1342 + }, + { + "epoch": 0.10743785124297514, + "grad_norm": 0.3311000065107302, + "learning_rate": 9.985302387363249e-06, + "loss": 0.2856, + "step": 1343 + }, + { + "epoch": 0.10751784964300715, + "grad_norm": 0.33735574769190096, + "learning_rate": 9.985252715772705e-06, + "loss": 0.2927, + "step": 1344 + }, + { + "epoch": 0.10759784804303914, + "grad_norm": 0.2836524534655475, + "learning_rate": 9.985202960513381e-06, + "loss": 0.3041, + "step": 1345 + }, + { + "epoch": 0.10767784644307114, + "grad_norm": 0.418141294242237, + "learning_rate": 9.985153121586111e-06, + "loss": 0.2725, + "step": 1346 + }, + { + "epoch": 0.10775784484310313, + "grad_norm": 0.4360924370905652, + "learning_rate": 9.985103198991733e-06, + "loss": 0.3061, + "step": 1347 + }, + { + "epoch": 0.10783784324313514, + "grad_norm": 0.31317808664606367, + "learning_rate": 9.985053192731085e-06, + "loss": 0.3226, + "step": 1348 + }, + { + "epoch": 0.10791784164316713, + "grad_norm": 0.35114821662992646, + "learning_rate": 9.985003102805004e-06, + "loss": 0.3003, + "step": 1349 + }, + { + "epoch": 0.10799784004319914, + "grad_norm": 0.34346522816141983, + "learning_rate": 9.98495292921433e-06, + "loss": 0.2976, + "step": 1350 + }, + { + "epoch": 0.10807783844323114, + "grad_norm": 0.2841215362063792, + "learning_rate": 9.984902671959911e-06, + "loss": 0.3243, + "step": 1351 + }, + { + "epoch": 0.10815783684326313, + "grad_norm": 0.34871208125747283, + "learning_rate": 9.984852331042585e-06, + "loss": 0.2795, + "step": 1352 + }, + { + "epoch": 0.10823783524329514, + "grad_norm": 0.3335832276308041, + "learning_rate": 9.984801906463199e-06, + "loss": 0.2786, + "step": 1353 + }, + { + "epoch": 0.10831783364332713, + "grad_norm": 0.2263235448444128, + "learning_rate": 9.984751398222598e-06, + "loss": 0.369, + "step": 1354 + }, + { + "epoch": 0.10839783204335914, + "grad_norm": 0.3579014521990015, + "learning_rate": 9.984700806321631e-06, + "loss": 0.3142, + "step": 1355 + }, + { + "epoch": 0.10847783044339113, + "grad_norm": 0.3524676693619381, + "learning_rate": 9.984650130761146e-06, + "loss": 0.2998, + "step": 1356 + }, + { + "epoch": 0.10855782884342313, + "grad_norm": 0.3354042591208965, + "learning_rate": 9.984599371541995e-06, + "loss": 0.294, + "step": 1357 + }, + { + "epoch": 0.10863782724345514, + "grad_norm": 0.358227469252063, + "learning_rate": 9.984548528665028e-06, + "loss": 0.2949, + "step": 1358 + }, + { + "epoch": 0.10871782564348713, + "grad_norm": 0.3380515846468206, + "learning_rate": 9.984497602131101e-06, + "loss": 0.2934, + "step": 1359 + }, + { + "epoch": 0.10879782404351913, + "grad_norm": 0.3470165351129875, + "learning_rate": 9.984446591941065e-06, + "loss": 0.2825, + "step": 1360 + }, + { + "epoch": 0.10887782244355113, + "grad_norm": 0.30430799180903056, + "learning_rate": 9.98439549809578e-06, + "loss": 0.3155, + "step": 1361 + }, + { + "epoch": 0.10895782084358313, + "grad_norm": 0.2786739661600301, + "learning_rate": 9.984344320596103e-06, + "loss": 0.3539, + "step": 1362 + }, + { + "epoch": 0.10903781924361512, + "grad_norm": 0.2798544229523909, + "learning_rate": 9.984293059442888e-06, + "loss": 0.3413, + "step": 1363 + }, + { + "epoch": 0.10911781764364713, + "grad_norm": 0.3882108173734194, + "learning_rate": 9.984241714636999e-06, + "loss": 0.3238, + "step": 1364 + }, + { + "epoch": 0.10919781604367913, + "grad_norm": 0.35321414537182727, + "learning_rate": 9.9841902861793e-06, + "loss": 0.2991, + "step": 1365 + }, + { + "epoch": 0.10927781444371112, + "grad_norm": 0.2834593596321515, + "learning_rate": 9.984138774070651e-06, + "loss": 0.2955, + "step": 1366 + }, + { + "epoch": 0.10935781284374313, + "grad_norm": 0.3071572663719183, + "learning_rate": 9.984087178311917e-06, + "loss": 0.314, + "step": 1367 + }, + { + "epoch": 0.10943781124377512, + "grad_norm": 0.3451213092158988, + "learning_rate": 9.984035498903965e-06, + "loss": 0.2773, + "step": 1368 + }, + { + "epoch": 0.10951780964380713, + "grad_norm": 0.3384903614165167, + "learning_rate": 9.98398373584766e-06, + "loss": 0.2929, + "step": 1369 + }, + { + "epoch": 0.10959780804383912, + "grad_norm": 0.3389986485063915, + "learning_rate": 9.983931889143874e-06, + "loss": 0.28, + "step": 1370 + }, + { + "epoch": 0.10967780644387112, + "grad_norm": 0.2926365930347929, + "learning_rate": 9.983879958793476e-06, + "loss": 0.3217, + "step": 1371 + }, + { + "epoch": 0.10975780484390313, + "grad_norm": 0.3350498945875562, + "learning_rate": 9.983827944797336e-06, + "loss": 0.3051, + "step": 1372 + }, + { + "epoch": 0.10983780324393512, + "grad_norm": 0.2577098138025434, + "learning_rate": 9.983775847156327e-06, + "loss": 0.3572, + "step": 1373 + }, + { + "epoch": 0.10991780164396713, + "grad_norm": 0.30659277855840583, + "learning_rate": 9.983723665871326e-06, + "loss": 0.3033, + "step": 1374 + }, + { + "epoch": 0.10999780004399912, + "grad_norm": 0.33947810651758475, + "learning_rate": 9.983671400943206e-06, + "loss": 0.2825, + "step": 1375 + }, + { + "epoch": 0.11007779844403112, + "grad_norm": 0.2923985072662539, + "learning_rate": 9.983619052372847e-06, + "loss": 0.3018, + "step": 1376 + }, + { + "epoch": 0.11015779684406311, + "grad_norm": 0.3760544257107967, + "learning_rate": 9.983566620161126e-06, + "loss": 0.2711, + "step": 1377 + }, + { + "epoch": 0.11023779524409512, + "grad_norm": 0.30763794397167393, + "learning_rate": 9.983514104308923e-06, + "loss": 0.3189, + "step": 1378 + }, + { + "epoch": 0.11031779364412712, + "grad_norm": 0.2945578848786041, + "learning_rate": 9.983461504817119e-06, + "loss": 0.3359, + "step": 1379 + }, + { + "epoch": 0.11039779204415912, + "grad_norm": 0.34270587721575296, + "learning_rate": 9.983408821686596e-06, + "loss": 0.3107, + "step": 1380 + }, + { + "epoch": 0.11047779044419112, + "grad_norm": 0.26706071138871557, + "learning_rate": 9.983356054918238e-06, + "loss": 0.35, + "step": 1381 + }, + { + "epoch": 0.11055778884422311, + "grad_norm": 0.22567891958630917, + "learning_rate": 9.983303204512935e-06, + "loss": 0.3599, + "step": 1382 + }, + { + "epoch": 0.11063778724425512, + "grad_norm": 0.33980870107461, + "learning_rate": 9.983250270471569e-06, + "loss": 0.2793, + "step": 1383 + }, + { + "epoch": 0.11071778564428711, + "grad_norm": 0.2911898495649174, + "learning_rate": 9.983197252795031e-06, + "loss": 0.3477, + "step": 1384 + }, + { + "epoch": 0.11079778404431911, + "grad_norm": 0.2972147659998989, + "learning_rate": 9.98314415148421e-06, + "loss": 0.3156, + "step": 1385 + }, + { + "epoch": 0.11087778244435112, + "grad_norm": 0.33817898180635353, + "learning_rate": 9.98309096654e-06, + "loss": 0.2728, + "step": 1386 + }, + { + "epoch": 0.11095778084438311, + "grad_norm": 0.4474019938067382, + "learning_rate": 9.983037697963287e-06, + "loss": 0.3085, + "step": 1387 + }, + { + "epoch": 0.11103777924441512, + "grad_norm": 0.28548989785634743, + "learning_rate": 9.982984345754972e-06, + "loss": 0.3134, + "step": 1388 + }, + { + "epoch": 0.11111777764444711, + "grad_norm": 0.3399075028099732, + "learning_rate": 9.982930909915944e-06, + "loss": 0.3351, + "step": 1389 + }, + { + "epoch": 0.11119777604447911, + "grad_norm": 0.29761108577632617, + "learning_rate": 9.982877390447106e-06, + "loss": 0.2727, + "step": 1390 + }, + { + "epoch": 0.1112777744445111, + "grad_norm": 0.26276528135687366, + "learning_rate": 9.982823787349352e-06, + "loss": 0.33, + "step": 1391 + }, + { + "epoch": 0.11135777284454311, + "grad_norm": 0.279946862441137, + "learning_rate": 9.982770100623584e-06, + "loss": 0.3112, + "step": 1392 + }, + { + "epoch": 0.11143777124457512, + "grad_norm": 0.2718546689068045, + "learning_rate": 9.982716330270701e-06, + "loss": 0.2948, + "step": 1393 + }, + { + "epoch": 0.1115177696446071, + "grad_norm": 0.30181062053838537, + "learning_rate": 9.98266247629161e-06, + "loss": 0.2862, + "step": 1394 + }, + { + "epoch": 0.11159776804463911, + "grad_norm": 0.24667720148422817, + "learning_rate": 9.982608538687208e-06, + "loss": 0.3263, + "step": 1395 + }, + { + "epoch": 0.1116777664446711, + "grad_norm": 0.327123223374991, + "learning_rate": 9.982554517458403e-06, + "loss": 0.3201, + "step": 1396 + }, + { + "epoch": 0.11175776484470311, + "grad_norm": 0.43777605250179946, + "learning_rate": 9.982500412606105e-06, + "loss": 0.2834, + "step": 1397 + }, + { + "epoch": 0.1118377632447351, + "grad_norm": 0.26571596143566883, + "learning_rate": 9.982446224131217e-06, + "loss": 0.3378, + "step": 1398 + }, + { + "epoch": 0.1119177616447671, + "grad_norm": 0.2568340781964374, + "learning_rate": 9.982391952034653e-06, + "loss": 0.3358, + "step": 1399 + }, + { + "epoch": 0.11199776004479911, + "grad_norm": 0.24132138161118302, + "learning_rate": 9.982337596317321e-06, + "loss": 0.3411, + "step": 1400 + }, + { + "epoch": 0.1120777584448311, + "grad_norm": 0.2643760362831179, + "learning_rate": 9.982283156980133e-06, + "loss": 0.3351, + "step": 1401 + }, + { + "epoch": 0.11215775684486311, + "grad_norm": 0.26961492781031765, + "learning_rate": 9.982228634024004e-06, + "loss": 0.3578, + "step": 1402 + }, + { + "epoch": 0.1122377552448951, + "grad_norm": 0.3113643028930828, + "learning_rate": 9.982174027449849e-06, + "loss": 0.2781, + "step": 1403 + }, + { + "epoch": 0.1123177536449271, + "grad_norm": 0.3152292197825255, + "learning_rate": 9.982119337258585e-06, + "loss": 0.3184, + "step": 1404 + }, + { + "epoch": 0.1123977520449591, + "grad_norm": 0.2578151671721826, + "learning_rate": 9.982064563451128e-06, + "loss": 0.3374, + "step": 1405 + }, + { + "epoch": 0.1124777504449911, + "grad_norm": 0.33190529764117943, + "learning_rate": 9.982009706028399e-06, + "loss": 0.2964, + "step": 1406 + }, + { + "epoch": 0.1125577488450231, + "grad_norm": 0.3266734397325813, + "learning_rate": 9.981954764991318e-06, + "loss": 0.2819, + "step": 1407 + }, + { + "epoch": 0.1126377472450551, + "grad_norm": 0.2968811814987577, + "learning_rate": 9.981899740340807e-06, + "loss": 0.315, + "step": 1408 + }, + { + "epoch": 0.1127177456450871, + "grad_norm": 0.31673607049427976, + "learning_rate": 9.981844632077788e-06, + "loss": 0.2664, + "step": 1409 + }, + { + "epoch": 0.1127977440451191, + "grad_norm": 0.27705925859633873, + "learning_rate": 9.98178944020319e-06, + "loss": 0.3423, + "step": 1410 + }, + { + "epoch": 0.1128777424451511, + "grad_norm": 0.32536013581500256, + "learning_rate": 9.981734164717936e-06, + "loss": 0.303, + "step": 1411 + }, + { + "epoch": 0.11295774084518309, + "grad_norm": 0.2706952881579151, + "learning_rate": 9.981678805622954e-06, + "loss": 0.2986, + "step": 1412 + }, + { + "epoch": 0.1130377392452151, + "grad_norm": 0.3247750360500546, + "learning_rate": 9.981623362919173e-06, + "loss": 0.3074, + "step": 1413 + }, + { + "epoch": 0.1131177376452471, + "grad_norm": 0.3245698134320572, + "learning_rate": 9.981567836607526e-06, + "loss": 0.2818, + "step": 1414 + }, + { + "epoch": 0.1131977360452791, + "grad_norm": 0.31380831107247337, + "learning_rate": 9.981512226688943e-06, + "loss": 0.2638, + "step": 1415 + }, + { + "epoch": 0.1132777344453111, + "grad_norm": 0.3302184240510599, + "learning_rate": 9.981456533164356e-06, + "loss": 0.2896, + "step": 1416 + }, + { + "epoch": 0.11335773284534309, + "grad_norm": 0.2703846217199556, + "learning_rate": 9.981400756034701e-06, + "loss": 0.3108, + "step": 1417 + }, + { + "epoch": 0.1134377312453751, + "grad_norm": 0.29058278083226297, + "learning_rate": 9.981344895300916e-06, + "loss": 0.3085, + "step": 1418 + }, + { + "epoch": 0.11351772964540709, + "grad_norm": 0.3204833693640965, + "learning_rate": 9.981288950963935e-06, + "loss": 0.2898, + "step": 1419 + }, + { + "epoch": 0.11359772804543909, + "grad_norm": 0.29668528463685284, + "learning_rate": 9.981232923024699e-06, + "loss": 0.2979, + "step": 1420 + }, + { + "epoch": 0.1136777264454711, + "grad_norm": 0.24285224028759425, + "learning_rate": 9.981176811484148e-06, + "loss": 0.3691, + "step": 1421 + }, + { + "epoch": 0.11375772484550309, + "grad_norm": 0.3158911302976063, + "learning_rate": 9.981120616343222e-06, + "loss": 0.2951, + "step": 1422 + }, + { + "epoch": 0.1138377232455351, + "grad_norm": 0.3637552199024051, + "learning_rate": 9.981064337602869e-06, + "loss": 0.2922, + "step": 1423 + }, + { + "epoch": 0.11391772164556709, + "grad_norm": 0.32651378336051123, + "learning_rate": 9.981007975264029e-06, + "loss": 0.3062, + "step": 1424 + }, + { + "epoch": 0.11399772004559909, + "grad_norm": 0.32003476616525206, + "learning_rate": 9.980951529327649e-06, + "loss": 0.2997, + "step": 1425 + }, + { + "epoch": 0.11407771844563108, + "grad_norm": 0.29555774476124724, + "learning_rate": 9.980894999794678e-06, + "loss": 0.3205, + "step": 1426 + }, + { + "epoch": 0.11415771684566309, + "grad_norm": 0.2890746054574132, + "learning_rate": 9.980838386666063e-06, + "loss": 0.2948, + "step": 1427 + }, + { + "epoch": 0.11423771524569509, + "grad_norm": 0.3458946919176639, + "learning_rate": 9.980781689942753e-06, + "loss": 0.3197, + "step": 1428 + }, + { + "epoch": 0.11431771364572708, + "grad_norm": 0.2990655353141574, + "learning_rate": 9.980724909625704e-06, + "loss": 0.2949, + "step": 1429 + }, + { + "epoch": 0.11439771204575909, + "grad_norm": 0.3337600702981667, + "learning_rate": 9.980668045715864e-06, + "loss": 0.2937, + "step": 1430 + }, + { + "epoch": 0.11447771044579108, + "grad_norm": 0.3486863418960651, + "learning_rate": 9.98061109821419e-06, + "loss": 0.2714, + "step": 1431 + }, + { + "epoch": 0.11455770884582309, + "grad_norm": 0.3015139990742092, + "learning_rate": 9.980554067121637e-06, + "loss": 0.3126, + "step": 1432 + }, + { + "epoch": 0.11463770724585508, + "grad_norm": 0.5036819319033887, + "learning_rate": 9.980496952439162e-06, + "loss": 0.3147, + "step": 1433 + }, + { + "epoch": 0.11471770564588708, + "grad_norm": 0.2854939014908416, + "learning_rate": 9.980439754167723e-06, + "loss": 0.3046, + "step": 1434 + }, + { + "epoch": 0.11479770404591909, + "grad_norm": 0.3386204937853727, + "learning_rate": 9.980382472308283e-06, + "loss": 0.2793, + "step": 1435 + }, + { + "epoch": 0.11487770244595108, + "grad_norm": 0.45164165461291966, + "learning_rate": 9.980325106861802e-06, + "loss": 0.281, + "step": 1436 + }, + { + "epoch": 0.11495770084598309, + "grad_norm": 0.3501993025413012, + "learning_rate": 9.980267657829241e-06, + "loss": 0.301, + "step": 1437 + }, + { + "epoch": 0.11503769924601508, + "grad_norm": 0.3087839880243543, + "learning_rate": 9.980210125211565e-06, + "loss": 0.3071, + "step": 1438 + }, + { + "epoch": 0.11511769764604708, + "grad_norm": 0.24451092987643727, + "learning_rate": 9.98015250900974e-06, + "loss": 0.3576, + "step": 1439 + }, + { + "epoch": 0.11519769604607907, + "grad_norm": 0.33312836101539495, + "learning_rate": 9.980094809224732e-06, + "loss": 0.286, + "step": 1440 + }, + { + "epoch": 0.11527769444611108, + "grad_norm": 0.2901214304728016, + "learning_rate": 9.980037025857511e-06, + "loss": 0.2946, + "step": 1441 + }, + { + "epoch": 0.11535769284614308, + "grad_norm": 0.28985762073093546, + "learning_rate": 9.979979158909046e-06, + "loss": 0.3042, + "step": 1442 + }, + { + "epoch": 0.11543769124617508, + "grad_norm": 0.36106296513149383, + "learning_rate": 9.979921208380308e-06, + "loss": 0.307, + "step": 1443 + }, + { + "epoch": 0.11551768964620708, + "grad_norm": 0.28809511390562903, + "learning_rate": 9.979863174272272e-06, + "loss": 0.326, + "step": 1444 + }, + { + "epoch": 0.11559768804623907, + "grad_norm": 0.32190326401354225, + "learning_rate": 9.979805056585907e-06, + "loss": 0.2967, + "step": 1445 + }, + { + "epoch": 0.11567768644627108, + "grad_norm": 0.3008408052431682, + "learning_rate": 9.979746855322192e-06, + "loss": 0.3279, + "step": 1446 + }, + { + "epoch": 0.11575768484630307, + "grad_norm": 0.2499216574047745, + "learning_rate": 9.979688570482102e-06, + "loss": 0.3503, + "step": 1447 + }, + { + "epoch": 0.11583768324633507, + "grad_norm": 0.28669117049920684, + "learning_rate": 9.979630202066619e-06, + "loss": 0.3148, + "step": 1448 + }, + { + "epoch": 0.11591768164636708, + "grad_norm": 0.4187697294160275, + "learning_rate": 9.979571750076717e-06, + "loss": 0.28, + "step": 1449 + }, + { + "epoch": 0.11599768004639907, + "grad_norm": 0.3572209385508511, + "learning_rate": 9.979513214513381e-06, + "loss": 0.3283, + "step": 1450 + }, + { + "epoch": 0.11607767844643108, + "grad_norm": 0.30147665738404217, + "learning_rate": 9.979454595377594e-06, + "loss": 0.3227, + "step": 1451 + }, + { + "epoch": 0.11615767684646307, + "grad_norm": 0.36225540456146976, + "learning_rate": 9.979395892670336e-06, + "loss": 0.319, + "step": 1452 + }, + { + "epoch": 0.11623767524649507, + "grad_norm": 0.3070999353893005, + "learning_rate": 9.979337106392596e-06, + "loss": 0.2935, + "step": 1453 + }, + { + "epoch": 0.11631767364652706, + "grad_norm": 0.5279769431102683, + "learning_rate": 9.979278236545356e-06, + "loss": 0.317, + "step": 1454 + }, + { + "epoch": 0.11639767204655907, + "grad_norm": 0.3450699672279452, + "learning_rate": 9.97921928312961e-06, + "loss": 0.2873, + "step": 1455 + }, + { + "epoch": 0.11647767044659108, + "grad_norm": 0.34255939265265894, + "learning_rate": 9.979160246146343e-06, + "loss": 0.323, + "step": 1456 + }, + { + "epoch": 0.11655766884662307, + "grad_norm": 0.2902131464997649, + "learning_rate": 9.979101125596548e-06, + "loss": 0.3152, + "step": 1457 + }, + { + "epoch": 0.11663766724665507, + "grad_norm": 0.37785094994138085, + "learning_rate": 9.979041921481217e-06, + "loss": 0.3129, + "step": 1458 + }, + { + "epoch": 0.11671766564668706, + "grad_norm": 0.3302957376558734, + "learning_rate": 9.978982633801342e-06, + "loss": 0.3085, + "step": 1459 + }, + { + "epoch": 0.11679766404671907, + "grad_norm": 0.3112910081678639, + "learning_rate": 9.978923262557918e-06, + "loss": 0.3209, + "step": 1460 + }, + { + "epoch": 0.11687766244675106, + "grad_norm": 0.34660039058612774, + "learning_rate": 9.978863807751944e-06, + "loss": 0.3399, + "step": 1461 + }, + { + "epoch": 0.11695766084678307, + "grad_norm": 0.32229968929426706, + "learning_rate": 9.978804269384417e-06, + "loss": 0.3208, + "step": 1462 + }, + { + "epoch": 0.11703765924681507, + "grad_norm": 0.34257960233020696, + "learning_rate": 9.978744647456335e-06, + "loss": 0.279, + "step": 1463 + }, + { + "epoch": 0.11711765764684706, + "grad_norm": 0.34136657588100766, + "learning_rate": 9.9786849419687e-06, + "loss": 0.2811, + "step": 1464 + }, + { + "epoch": 0.11719765604687907, + "grad_norm": 0.3211573022383478, + "learning_rate": 9.978625152922511e-06, + "loss": 0.2812, + "step": 1465 + }, + { + "epoch": 0.11727765444691106, + "grad_norm": 0.3301161313973079, + "learning_rate": 9.978565280318777e-06, + "loss": 0.2868, + "step": 1466 + }, + { + "epoch": 0.11735765284694306, + "grad_norm": 0.38159317546514854, + "learning_rate": 9.978505324158499e-06, + "loss": 0.2893, + "step": 1467 + }, + { + "epoch": 0.11743765124697506, + "grad_norm": 0.30024634169715086, + "learning_rate": 9.978445284442684e-06, + "loss": 0.3197, + "step": 1468 + }, + { + "epoch": 0.11751764964700706, + "grad_norm": 0.26133989016953835, + "learning_rate": 9.97838516117234e-06, + "loss": 0.3684, + "step": 1469 + }, + { + "epoch": 0.11759764804703907, + "grad_norm": 0.35906397718844013, + "learning_rate": 9.978324954348472e-06, + "loss": 0.2825, + "step": 1470 + }, + { + "epoch": 0.11767764644707106, + "grad_norm": 0.3286317533720119, + "learning_rate": 9.978264663972099e-06, + "loss": 0.2907, + "step": 1471 + }, + { + "epoch": 0.11775764484710306, + "grad_norm": 0.26986448589140516, + "learning_rate": 9.978204290044225e-06, + "loss": 0.3237, + "step": 1472 + }, + { + "epoch": 0.11783764324713505, + "grad_norm": 0.3232349921602911, + "learning_rate": 9.978143832565868e-06, + "loss": 0.295, + "step": 1473 + }, + { + "epoch": 0.11791764164716706, + "grad_norm": 0.3657146918647761, + "learning_rate": 9.978083291538041e-06, + "loss": 0.2783, + "step": 1474 + }, + { + "epoch": 0.11799764004719905, + "grad_norm": 0.2771884573992658, + "learning_rate": 9.97802266696176e-06, + "loss": 0.2884, + "step": 1475 + }, + { + "epoch": 0.11807763844723106, + "grad_norm": 0.34310011874585267, + "learning_rate": 9.97796195883804e-06, + "loss": 0.2897, + "step": 1476 + }, + { + "epoch": 0.11815763684726305, + "grad_norm": 0.2435888730935232, + "learning_rate": 9.977901167167904e-06, + "loss": 0.3544, + "step": 1477 + }, + { + "epoch": 0.11823763524729505, + "grad_norm": 0.3103673532728682, + "learning_rate": 9.977840291952373e-06, + "loss": 0.3291, + "step": 1478 + }, + { + "epoch": 0.11831763364732706, + "grad_norm": 0.317614922138713, + "learning_rate": 9.977779333192464e-06, + "loss": 0.311, + "step": 1479 + }, + { + "epoch": 0.11839763204735905, + "grad_norm": 0.3271110334014837, + "learning_rate": 9.977718290889202e-06, + "loss": 0.2923, + "step": 1480 + }, + { + "epoch": 0.11847763044739106, + "grad_norm": 0.31018444377527504, + "learning_rate": 9.977657165043613e-06, + "loss": 0.3029, + "step": 1481 + }, + { + "epoch": 0.11855762884742305, + "grad_norm": 0.32276842556863816, + "learning_rate": 9.977595955656722e-06, + "loss": 0.2833, + "step": 1482 + }, + { + "epoch": 0.11863762724745505, + "grad_norm": 0.2929831885525505, + "learning_rate": 9.977534662729556e-06, + "loss": 0.3103, + "step": 1483 + }, + { + "epoch": 0.11871762564748704, + "grad_norm": 0.2793754796056116, + "learning_rate": 9.977473286263145e-06, + "loss": 0.2848, + "step": 1484 + }, + { + "epoch": 0.11879762404751905, + "grad_norm": 0.28825136415120856, + "learning_rate": 9.977411826258516e-06, + "loss": 0.3111, + "step": 1485 + }, + { + "epoch": 0.11887762244755105, + "grad_norm": 0.2800849463181504, + "learning_rate": 9.977350282716703e-06, + "loss": 0.317, + "step": 1486 + }, + { + "epoch": 0.11895762084758305, + "grad_norm": 0.2854853040851749, + "learning_rate": 9.977288655638737e-06, + "loss": 0.311, + "step": 1487 + }, + { + "epoch": 0.11903761924761505, + "grad_norm": 0.3188425101986074, + "learning_rate": 9.977226945025655e-06, + "loss": 0.3041, + "step": 1488 + }, + { + "epoch": 0.11911761764764704, + "grad_norm": 0.2872613664932559, + "learning_rate": 9.977165150878492e-06, + "loss": 0.3128, + "step": 1489 + }, + { + "epoch": 0.11919761604767905, + "grad_norm": 0.3240081271199343, + "learning_rate": 9.977103273198285e-06, + "loss": 0.321, + "step": 1490 + }, + { + "epoch": 0.11927761444771104, + "grad_norm": 0.32873356293552747, + "learning_rate": 9.977041311986072e-06, + "loss": 0.3208, + "step": 1491 + }, + { + "epoch": 0.11935761284774304, + "grad_norm": 0.3203868320710008, + "learning_rate": 9.97697926724289e-06, + "loss": 0.283, + "step": 1492 + }, + { + "epoch": 0.11943761124777505, + "grad_norm": 0.3054043730128793, + "learning_rate": 9.976917138969784e-06, + "loss": 0.3523, + "step": 1493 + }, + { + "epoch": 0.11951760964780704, + "grad_norm": 0.3674858121403118, + "learning_rate": 9.976854927167799e-06, + "loss": 0.2866, + "step": 1494 + }, + { + "epoch": 0.11959760804783905, + "grad_norm": 0.32809462029957726, + "learning_rate": 9.976792631837973e-06, + "loss": 0.2959, + "step": 1495 + }, + { + "epoch": 0.11967760644787104, + "grad_norm": 0.3021737904215764, + "learning_rate": 9.976730252981354e-06, + "loss": 0.307, + "step": 1496 + }, + { + "epoch": 0.11975760484790304, + "grad_norm": 0.29884948751944573, + "learning_rate": 9.976667790598991e-06, + "loss": 0.3114, + "step": 1497 + }, + { + "epoch": 0.11983760324793503, + "grad_norm": 0.323232091189394, + "learning_rate": 9.97660524469193e-06, + "loss": 0.2891, + "step": 1498 + }, + { + "epoch": 0.11991760164796704, + "grad_norm": 0.31988370657863524, + "learning_rate": 9.976542615261223e-06, + "loss": 0.2866, + "step": 1499 + }, + { + "epoch": 0.11999760004799905, + "grad_norm": 0.3492068403565844, + "learning_rate": 9.976479902307918e-06, + "loss": 0.2926, + "step": 1500 + }, + { + "epoch": 0.12007759844803104, + "grad_norm": 0.31893173931742136, + "learning_rate": 9.97641710583307e-06, + "loss": 0.3222, + "step": 1501 + }, + { + "epoch": 0.12015759684806304, + "grad_norm": 0.3080721393753546, + "learning_rate": 9.976354225837733e-06, + "loss": 0.3097, + "step": 1502 + }, + { + "epoch": 0.12023759524809503, + "grad_norm": 0.2654106512289159, + "learning_rate": 9.97629126232296e-06, + "loss": 0.3371, + "step": 1503 + }, + { + "epoch": 0.12031759364812704, + "grad_norm": 0.3175950676091756, + "learning_rate": 9.97622821528981e-06, + "loss": 0.3081, + "step": 1504 + }, + { + "epoch": 0.12039759204815903, + "grad_norm": 0.27579554520545546, + "learning_rate": 9.97616508473934e-06, + "loss": 0.3414, + "step": 1505 + }, + { + "epoch": 0.12047759044819104, + "grad_norm": 0.24773422292196634, + "learning_rate": 9.97610187067261e-06, + "loss": 0.3402, + "step": 1506 + }, + { + "epoch": 0.12055758884822304, + "grad_norm": 0.3269540778817153, + "learning_rate": 9.976038573090679e-06, + "loss": 0.2687, + "step": 1507 + }, + { + "epoch": 0.12063758724825503, + "grad_norm": 0.29435912748136317, + "learning_rate": 9.975975191994614e-06, + "loss": 0.3114, + "step": 1508 + }, + { + "epoch": 0.12071758564828704, + "grad_norm": 0.31180608889896605, + "learning_rate": 9.975911727385473e-06, + "loss": 0.3141, + "step": 1509 + }, + { + "epoch": 0.12079758404831903, + "grad_norm": 0.3741440907311983, + "learning_rate": 9.975848179264325e-06, + "loss": 0.2712, + "step": 1510 + }, + { + "epoch": 0.12087758244835103, + "grad_norm": 0.3080836248256556, + "learning_rate": 9.975784547632237e-06, + "loss": 0.3299, + "step": 1511 + }, + { + "epoch": 0.12095758084838303, + "grad_norm": 0.5226236387917746, + "learning_rate": 9.975720832490274e-06, + "loss": 0.2799, + "step": 1512 + }, + { + "epoch": 0.12103757924841503, + "grad_norm": 0.4286751809891568, + "learning_rate": 9.975657033839506e-06, + "loss": 0.2932, + "step": 1513 + }, + { + "epoch": 0.12111757764844704, + "grad_norm": 0.36005550131069336, + "learning_rate": 9.975593151681006e-06, + "loss": 0.2867, + "step": 1514 + }, + { + "epoch": 0.12119757604847903, + "grad_norm": 0.400946489366523, + "learning_rate": 9.975529186015844e-06, + "loss": 0.2831, + "step": 1515 + }, + { + "epoch": 0.12127757444851103, + "grad_norm": 0.33439622278390013, + "learning_rate": 9.975465136845095e-06, + "loss": 0.284, + "step": 1516 + }, + { + "epoch": 0.12135757284854302, + "grad_norm": 0.28467649066049355, + "learning_rate": 9.975401004169834e-06, + "loss": 0.3313, + "step": 1517 + }, + { + "epoch": 0.12143757124857503, + "grad_norm": 0.38430441032851415, + "learning_rate": 9.975336787991135e-06, + "loss": 0.3158, + "step": 1518 + }, + { + "epoch": 0.12151756964860702, + "grad_norm": 0.3461471552793967, + "learning_rate": 9.975272488310077e-06, + "loss": 0.2703, + "step": 1519 + }, + { + "epoch": 0.12159756804863903, + "grad_norm": 0.25635041378940554, + "learning_rate": 9.97520810512774e-06, + "loss": 0.3407, + "step": 1520 + }, + { + "epoch": 0.12167756644867103, + "grad_norm": 0.36860805507721267, + "learning_rate": 9.975143638445205e-06, + "loss": 0.312, + "step": 1521 + }, + { + "epoch": 0.12175756484870302, + "grad_norm": 0.33025431942268124, + "learning_rate": 9.975079088263553e-06, + "loss": 0.2711, + "step": 1522 + }, + { + "epoch": 0.12183756324873503, + "grad_norm": 0.2685661278626135, + "learning_rate": 9.975014454583867e-06, + "loss": 0.3011, + "step": 1523 + }, + { + "epoch": 0.12191756164876702, + "grad_norm": 0.27592491432763816, + "learning_rate": 9.974949737407232e-06, + "loss": 0.3361, + "step": 1524 + }, + { + "epoch": 0.12199756004879903, + "grad_norm": 0.2810609031775785, + "learning_rate": 9.974884936734734e-06, + "loss": 0.303, + "step": 1525 + }, + { + "epoch": 0.12207755844883102, + "grad_norm": 0.2959647019456412, + "learning_rate": 9.97482005256746e-06, + "loss": 0.3277, + "step": 1526 + }, + { + "epoch": 0.12215755684886302, + "grad_norm": 0.31007940798028527, + "learning_rate": 9.974755084906503e-06, + "loss": 0.3325, + "step": 1527 + }, + { + "epoch": 0.12223755524889503, + "grad_norm": 0.294643580488709, + "learning_rate": 9.974690033752947e-06, + "loss": 0.3513, + "step": 1528 + }, + { + "epoch": 0.12231755364892702, + "grad_norm": 0.23361512903103868, + "learning_rate": 9.974624899107887e-06, + "loss": 0.3576, + "step": 1529 + }, + { + "epoch": 0.12239755204895902, + "grad_norm": 0.3488065907248092, + "learning_rate": 9.974559680972418e-06, + "loss": 0.3302, + "step": 1530 + }, + { + "epoch": 0.12247755044899102, + "grad_norm": 0.2759016892422713, + "learning_rate": 9.974494379347632e-06, + "loss": 0.356, + "step": 1531 + }, + { + "epoch": 0.12255754884902302, + "grad_norm": 0.4088228894844026, + "learning_rate": 9.974428994234626e-06, + "loss": 0.3301, + "step": 1532 + }, + { + "epoch": 0.12263754724905501, + "grad_norm": 0.306047966666119, + "learning_rate": 9.974363525634496e-06, + "loss": 0.3246, + "step": 1533 + }, + { + "epoch": 0.12271754564908702, + "grad_norm": 0.2440230840992797, + "learning_rate": 9.974297973548343e-06, + "loss": 0.3225, + "step": 1534 + }, + { + "epoch": 0.12279754404911902, + "grad_norm": 0.24050105891261542, + "learning_rate": 9.974232337977265e-06, + "loss": 0.336, + "step": 1535 + }, + { + "epoch": 0.12287754244915101, + "grad_norm": 0.3342336181750534, + "learning_rate": 9.974166618922365e-06, + "loss": 0.2853, + "step": 1536 + }, + { + "epoch": 0.12295754084918302, + "grad_norm": 0.3599751638622429, + "learning_rate": 9.974100816384746e-06, + "loss": 0.2842, + "step": 1537 + }, + { + "epoch": 0.12303753924921501, + "grad_norm": 0.3255886392617455, + "learning_rate": 9.974034930365513e-06, + "loss": 0.3004, + "step": 1538 + }, + { + "epoch": 0.12311753764924702, + "grad_norm": 0.35342236698654944, + "learning_rate": 9.97396896086577e-06, + "loss": 0.2981, + "step": 1539 + }, + { + "epoch": 0.12319753604927901, + "grad_norm": 0.3356546294397884, + "learning_rate": 9.973902907886623e-06, + "loss": 0.2734, + "step": 1540 + }, + { + "epoch": 0.12327753444931101, + "grad_norm": 0.3013855894876544, + "learning_rate": 9.973836771429185e-06, + "loss": 0.3076, + "step": 1541 + }, + { + "epoch": 0.12335753284934302, + "grad_norm": 0.27718568684371786, + "learning_rate": 9.973770551494562e-06, + "loss": 0.3192, + "step": 1542 + }, + { + "epoch": 0.12343753124937501, + "grad_norm": 0.36203360444852395, + "learning_rate": 9.973704248083868e-06, + "loss": 0.2889, + "step": 1543 + }, + { + "epoch": 0.12351752964940702, + "grad_norm": 0.33284406458829935, + "learning_rate": 9.973637861198213e-06, + "loss": 0.2897, + "step": 1544 + }, + { + "epoch": 0.123597528049439, + "grad_norm": 0.322474791867277, + "learning_rate": 9.973571390838715e-06, + "loss": 0.2821, + "step": 1545 + }, + { + "epoch": 0.12367752644947101, + "grad_norm": 0.2586611371768474, + "learning_rate": 9.973504837006487e-06, + "loss": 0.3333, + "step": 1546 + }, + { + "epoch": 0.123757524849503, + "grad_norm": 0.4434038262028073, + "learning_rate": 9.973438199702645e-06, + "loss": 0.3183, + "step": 1547 + }, + { + "epoch": 0.12383752324953501, + "grad_norm": 0.30057080920877394, + "learning_rate": 9.97337147892831e-06, + "loss": 0.3156, + "step": 1548 + }, + { + "epoch": 0.12391752164956701, + "grad_norm": 0.2817218688979036, + "learning_rate": 9.9733046746846e-06, + "loss": 0.3197, + "step": 1549 + }, + { + "epoch": 0.123997520049599, + "grad_norm": 0.3364460012975601, + "learning_rate": 9.973237786972637e-06, + "loss": 0.2755, + "step": 1550 + }, + { + "epoch": 0.12407751844963101, + "grad_norm": 0.25995755145271016, + "learning_rate": 9.973170815793543e-06, + "loss": 0.322, + "step": 1551 + }, + { + "epoch": 0.124157516849663, + "grad_norm": 0.32065157844305897, + "learning_rate": 9.973103761148444e-06, + "loss": 0.3188, + "step": 1552 + }, + { + "epoch": 0.12423751524969501, + "grad_norm": 0.3131628021305152, + "learning_rate": 9.973036623038462e-06, + "loss": 0.2938, + "step": 1553 + }, + { + "epoch": 0.124317513649727, + "grad_norm": 0.33124132803205336, + "learning_rate": 9.972969401464728e-06, + "loss": 0.3336, + "step": 1554 + }, + { + "epoch": 0.124397512049759, + "grad_norm": 0.347177685732667, + "learning_rate": 9.972902096428365e-06, + "loss": 0.2671, + "step": 1555 + }, + { + "epoch": 0.12447751044979101, + "grad_norm": 0.2862238918086368, + "learning_rate": 9.972834707930505e-06, + "loss": 0.3058, + "step": 1556 + }, + { + "epoch": 0.124557508849823, + "grad_norm": 0.2812246291021389, + "learning_rate": 9.972767235972283e-06, + "loss": 0.355, + "step": 1557 + }, + { + "epoch": 0.124637507249855, + "grad_norm": 0.3508556622891445, + "learning_rate": 9.972699680554824e-06, + "loss": 0.2853, + "step": 1558 + }, + { + "epoch": 0.124717505649887, + "grad_norm": 0.31934129236613185, + "learning_rate": 9.972632041679268e-06, + "loss": 0.3181, + "step": 1559 + }, + { + "epoch": 0.124797504049919, + "grad_norm": 0.4008315118730672, + "learning_rate": 9.972564319346747e-06, + "loss": 0.267, + "step": 1560 + }, + { + "epoch": 0.124877502449951, + "grad_norm": 0.3867394289324721, + "learning_rate": 9.972496513558399e-06, + "loss": 0.2942, + "step": 1561 + }, + { + "epoch": 0.124957500849983, + "grad_norm": 0.2655012617834147, + "learning_rate": 9.97242862431536e-06, + "loss": 0.3325, + "step": 1562 + }, + { + "epoch": 0.125037499250015, + "grad_norm": 0.30430963968798325, + "learning_rate": 9.972360651618772e-06, + "loss": 0.3227, + "step": 1563 + }, + { + "epoch": 0.125117497650047, + "grad_norm": 0.32976349762845253, + "learning_rate": 9.972292595469775e-06, + "loss": 0.2949, + "step": 1564 + }, + { + "epoch": 0.125197496050079, + "grad_norm": 0.2642429426489294, + "learning_rate": 9.972224455869508e-06, + "loss": 0.3433, + "step": 1565 + }, + { + "epoch": 0.125277494450111, + "grad_norm": 0.34800812853193486, + "learning_rate": 9.972156232819122e-06, + "loss": 0.2636, + "step": 1566 + }, + { + "epoch": 0.125357492850143, + "grad_norm": 0.24551195984941754, + "learning_rate": 9.972087926319753e-06, + "loss": 0.3203, + "step": 1567 + }, + { + "epoch": 0.125437491250175, + "grad_norm": 0.3514690991332886, + "learning_rate": 9.972019536372554e-06, + "loss": 0.2853, + "step": 1568 + }, + { + "epoch": 0.125517489650207, + "grad_norm": 0.3376762173730531, + "learning_rate": 9.971951062978671e-06, + "loss": 0.288, + "step": 1569 + }, + { + "epoch": 0.125597488050239, + "grad_norm": 0.2743899761263726, + "learning_rate": 9.971882506139251e-06, + "loss": 0.316, + "step": 1570 + }, + { + "epoch": 0.125677486450271, + "grad_norm": 0.3273720877629741, + "learning_rate": 9.971813865855448e-06, + "loss": 0.2802, + "step": 1571 + }, + { + "epoch": 0.125757484850303, + "grad_norm": 0.27218182959151843, + "learning_rate": 9.971745142128413e-06, + "loss": 0.319, + "step": 1572 + }, + { + "epoch": 0.125837483250335, + "grad_norm": 0.33635903810220347, + "learning_rate": 9.971676334959297e-06, + "loss": 0.2896, + "step": 1573 + }, + { + "epoch": 0.12591748165036698, + "grad_norm": 0.36089753673816033, + "learning_rate": 9.971607444349258e-06, + "loss": 0.2737, + "step": 1574 + }, + { + "epoch": 0.12599748005039899, + "grad_norm": 0.29738625056333357, + "learning_rate": 9.971538470299452e-06, + "loss": 0.3319, + "step": 1575 + }, + { + "epoch": 0.126077478450431, + "grad_norm": 0.33751500201620127, + "learning_rate": 9.971469412811032e-06, + "loss": 0.2994, + "step": 1576 + }, + { + "epoch": 0.126157476850463, + "grad_norm": 0.333507026377891, + "learning_rate": 9.971400271885163e-06, + "loss": 0.276, + "step": 1577 + }, + { + "epoch": 0.126237475250495, + "grad_norm": 0.3417766187453086, + "learning_rate": 9.971331047523002e-06, + "loss": 0.2931, + "step": 1578 + }, + { + "epoch": 0.12631747365052698, + "grad_norm": 0.20737597012634765, + "learning_rate": 9.971261739725713e-06, + "loss": 0.389, + "step": 1579 + }, + { + "epoch": 0.12639747205055898, + "grad_norm": 0.19969070918404824, + "learning_rate": 9.971192348494456e-06, + "loss": 0.3746, + "step": 1580 + }, + { + "epoch": 0.126477470450591, + "grad_norm": 0.343190067986367, + "learning_rate": 9.971122873830398e-06, + "loss": 0.2911, + "step": 1581 + }, + { + "epoch": 0.126557468850623, + "grad_norm": 0.3241189976081756, + "learning_rate": 9.971053315734706e-06, + "loss": 0.2901, + "step": 1582 + }, + { + "epoch": 0.126637467250655, + "grad_norm": 0.3651618005218322, + "learning_rate": 9.970983674208546e-06, + "loss": 0.2814, + "step": 1583 + }, + { + "epoch": 0.12671746565068698, + "grad_norm": 0.3054901278587988, + "learning_rate": 9.970913949253085e-06, + "loss": 0.3148, + "step": 1584 + }, + { + "epoch": 0.12679746405071898, + "grad_norm": 0.37956813633003, + "learning_rate": 9.970844140869495e-06, + "loss": 0.2912, + "step": 1585 + }, + { + "epoch": 0.126877462450751, + "grad_norm": 0.3087361815326322, + "learning_rate": 9.970774249058947e-06, + "loss": 0.3268, + "step": 1586 + }, + { + "epoch": 0.126957460850783, + "grad_norm": 0.2947287798107453, + "learning_rate": 9.970704273822618e-06, + "loss": 0.3157, + "step": 1587 + }, + { + "epoch": 0.12703745925081497, + "grad_norm": 0.35760370725899226, + "learning_rate": 9.970634215161677e-06, + "loss": 0.3033, + "step": 1588 + }, + { + "epoch": 0.12711745765084698, + "grad_norm": 0.2705000717013212, + "learning_rate": 9.9705640730773e-06, + "loss": 0.326, + "step": 1589 + }, + { + "epoch": 0.12719745605087898, + "grad_norm": 0.28820740181724164, + "learning_rate": 9.970493847570669e-06, + "loss": 0.332, + "step": 1590 + }, + { + "epoch": 0.127277454450911, + "grad_norm": 0.33870280790828833, + "learning_rate": 9.970423538642959e-06, + "loss": 0.2995, + "step": 1591 + }, + { + "epoch": 0.127357452850943, + "grad_norm": 0.3073152895623792, + "learning_rate": 9.97035314629535e-06, + "loss": 0.3027, + "step": 1592 + }, + { + "epoch": 0.12743745125097497, + "grad_norm": 0.24343305844285854, + "learning_rate": 9.970282670529024e-06, + "loss": 0.3209, + "step": 1593 + }, + { + "epoch": 0.12751744965100698, + "grad_norm": 0.2817249692395849, + "learning_rate": 9.970212111345164e-06, + "loss": 0.2954, + "step": 1594 + }, + { + "epoch": 0.12759744805103898, + "grad_norm": 0.19507604619833765, + "learning_rate": 9.970141468744953e-06, + "loss": 0.3399, + "step": 1595 + }, + { + "epoch": 0.12767744645107099, + "grad_norm": 0.33644849661089504, + "learning_rate": 9.97007074272958e-06, + "loss": 0.3003, + "step": 1596 + }, + { + "epoch": 0.127757444851103, + "grad_norm": 0.28551690329400675, + "learning_rate": 9.969999933300229e-06, + "loss": 0.2963, + "step": 1597 + }, + { + "epoch": 0.12783744325113497, + "grad_norm": 0.24111935984584498, + "learning_rate": 9.969929040458088e-06, + "loss": 0.333, + "step": 1598 + }, + { + "epoch": 0.12791744165116697, + "grad_norm": 0.2440483033012155, + "learning_rate": 9.96985806420435e-06, + "loss": 0.3258, + "step": 1599 + }, + { + "epoch": 0.12799744005119898, + "grad_norm": 0.37418989712799433, + "learning_rate": 9.969787004540202e-06, + "loss": 0.3256, + "step": 1600 + }, + { + "epoch": 0.12807743845123098, + "grad_norm": 0.30313124375864736, + "learning_rate": 9.969715861466839e-06, + "loss": 0.3146, + "step": 1601 + }, + { + "epoch": 0.12815743685126296, + "grad_norm": 0.28303531578204344, + "learning_rate": 9.969644634985456e-06, + "loss": 0.2844, + "step": 1602 + }, + { + "epoch": 0.12823743525129497, + "grad_norm": 0.31588089694356136, + "learning_rate": 9.969573325097247e-06, + "loss": 0.2564, + "step": 1603 + }, + { + "epoch": 0.12831743365132697, + "grad_norm": 0.33485316524707903, + "learning_rate": 9.96950193180341e-06, + "loss": 0.2972, + "step": 1604 + }, + { + "epoch": 0.12839743205135898, + "grad_norm": 0.27738539766765796, + "learning_rate": 9.96943045510514e-06, + "loss": 0.2948, + "step": 1605 + }, + { + "epoch": 0.12847743045139098, + "grad_norm": 0.28739144946239215, + "learning_rate": 9.96935889500364e-06, + "loss": 0.3398, + "step": 1606 + }, + { + "epoch": 0.12855742885142296, + "grad_norm": 0.3780784558149297, + "learning_rate": 9.969287251500109e-06, + "loss": 0.3029, + "step": 1607 + }, + { + "epoch": 0.12863742725145497, + "grad_norm": 0.39436280254310757, + "learning_rate": 9.969215524595751e-06, + "loss": 0.2871, + "step": 1608 + }, + { + "epoch": 0.12871742565148697, + "grad_norm": 0.2781032973117167, + "learning_rate": 9.96914371429177e-06, + "loss": 0.3417, + "step": 1609 + }, + { + "epoch": 0.12879742405151898, + "grad_norm": 0.3091777596402673, + "learning_rate": 9.96907182058937e-06, + "loss": 0.3156, + "step": 1610 + }, + { + "epoch": 0.12887742245155098, + "grad_norm": 0.34476330496554697, + "learning_rate": 9.968999843489755e-06, + "loss": 0.2797, + "step": 1611 + }, + { + "epoch": 0.12895742085158296, + "grad_norm": 0.29137705953747706, + "learning_rate": 9.968927782994139e-06, + "loss": 0.3471, + "step": 1612 + }, + { + "epoch": 0.12903741925161497, + "grad_norm": 0.22494499800610587, + "learning_rate": 9.968855639103727e-06, + "loss": 0.3429, + "step": 1613 + }, + { + "epoch": 0.12911741765164697, + "grad_norm": 0.3384786965693821, + "learning_rate": 9.968783411819732e-06, + "loss": 0.3129, + "step": 1614 + }, + { + "epoch": 0.12919741605167898, + "grad_norm": 0.37245070269968894, + "learning_rate": 9.968711101143364e-06, + "loss": 0.3024, + "step": 1615 + }, + { + "epoch": 0.12927741445171095, + "grad_norm": 0.33261383673094785, + "learning_rate": 9.968638707075839e-06, + "loss": 0.2809, + "step": 1616 + }, + { + "epoch": 0.12935741285174296, + "grad_norm": 0.3538913564212979, + "learning_rate": 9.96856622961837e-06, + "loss": 0.3058, + "step": 1617 + }, + { + "epoch": 0.12943741125177496, + "grad_norm": 0.33543894188521006, + "learning_rate": 9.968493668772177e-06, + "loss": 0.2722, + "step": 1618 + }, + { + "epoch": 0.12951740965180697, + "grad_norm": 0.3673480373341944, + "learning_rate": 9.968421024538473e-06, + "loss": 0.311, + "step": 1619 + }, + { + "epoch": 0.12959740805183897, + "grad_norm": 0.32641897734890835, + "learning_rate": 9.968348296918479e-06, + "loss": 0.2983, + "step": 1620 + }, + { + "epoch": 0.12967740645187095, + "grad_norm": 0.3145459493680963, + "learning_rate": 9.968275485913417e-06, + "loss": 0.3104, + "step": 1621 + }, + { + "epoch": 0.12975740485190296, + "grad_norm": 0.34435664527218157, + "learning_rate": 9.968202591524508e-06, + "loss": 0.3085, + "step": 1622 + }, + { + "epoch": 0.12983740325193496, + "grad_norm": 0.29708028988303964, + "learning_rate": 9.968129613752975e-06, + "loss": 0.3048, + "step": 1623 + }, + { + "epoch": 0.12991740165196697, + "grad_norm": 0.3933300902453698, + "learning_rate": 9.968056552600043e-06, + "loss": 0.2908, + "step": 1624 + }, + { + "epoch": 0.12999740005199897, + "grad_norm": 0.2657896128021916, + "learning_rate": 9.967983408066939e-06, + "loss": 0.3433, + "step": 1625 + }, + { + "epoch": 0.13007739845203095, + "grad_norm": 0.2649772146562545, + "learning_rate": 9.96791018015489e-06, + "loss": 0.3518, + "step": 1626 + }, + { + "epoch": 0.13015739685206296, + "grad_norm": 0.3384894253270735, + "learning_rate": 9.967836868865125e-06, + "loss": 0.2699, + "step": 1627 + }, + { + "epoch": 0.13023739525209496, + "grad_norm": 0.3053194370367204, + "learning_rate": 9.967763474198873e-06, + "loss": 0.3118, + "step": 1628 + }, + { + "epoch": 0.13031739365212697, + "grad_norm": 0.29400780305374524, + "learning_rate": 9.967689996157368e-06, + "loss": 0.3054, + "step": 1629 + }, + { + "epoch": 0.13039739205215894, + "grad_norm": 0.2543343038972484, + "learning_rate": 9.967616434741842e-06, + "loss": 0.3273, + "step": 1630 + }, + { + "epoch": 0.13047739045219095, + "grad_norm": 0.3950385774304625, + "learning_rate": 9.967542789953532e-06, + "loss": 0.2573, + "step": 1631 + }, + { + "epoch": 0.13055738885222296, + "grad_norm": 0.3166082015848201, + "learning_rate": 9.967469061793672e-06, + "loss": 0.2924, + "step": 1632 + }, + { + "epoch": 0.13063738725225496, + "grad_norm": 0.23891815135943617, + "learning_rate": 9.967395250263496e-06, + "loss": 0.3427, + "step": 1633 + }, + { + "epoch": 0.13071738565228697, + "grad_norm": 0.2653217347899633, + "learning_rate": 9.96732135536425e-06, + "loss": 0.2931, + "step": 1634 + }, + { + "epoch": 0.13079738405231894, + "grad_norm": 0.2660758979103026, + "learning_rate": 9.967247377097168e-06, + "loss": 0.3439, + "step": 1635 + }, + { + "epoch": 0.13087738245235095, + "grad_norm": 0.31902449798630683, + "learning_rate": 9.967173315463494e-06, + "loss": 0.2798, + "step": 1636 + }, + { + "epoch": 0.13095738085238295, + "grad_norm": 0.32309647957953375, + "learning_rate": 9.967099170464473e-06, + "loss": 0.2801, + "step": 1637 + }, + { + "epoch": 0.13103737925241496, + "grad_norm": 0.3630500586472057, + "learning_rate": 9.967024942101345e-06, + "loss": 0.3, + "step": 1638 + }, + { + "epoch": 0.13111737765244696, + "grad_norm": 0.453720988216507, + "learning_rate": 9.966950630375361e-06, + "loss": 0.2773, + "step": 1639 + }, + { + "epoch": 0.13119737605247894, + "grad_norm": 0.30711283945170126, + "learning_rate": 9.966876235287762e-06, + "loss": 0.3018, + "step": 1640 + }, + { + "epoch": 0.13127737445251095, + "grad_norm": 0.33838315886780596, + "learning_rate": 9.966801756839802e-06, + "loss": 0.2963, + "step": 1641 + }, + { + "epoch": 0.13135737285254295, + "grad_norm": 0.39474138784436325, + "learning_rate": 9.966727195032729e-06, + "loss": 0.2941, + "step": 1642 + }, + { + "epoch": 0.13143737125257496, + "grad_norm": 0.38753809544863926, + "learning_rate": 9.966652549867795e-06, + "loss": 0.2865, + "step": 1643 + }, + { + "epoch": 0.13151736965260694, + "grad_norm": 0.5787026827731592, + "learning_rate": 9.96657782134625e-06, + "loss": 0.2697, + "step": 1644 + }, + { + "epoch": 0.13159736805263894, + "grad_norm": 0.3197434565585619, + "learning_rate": 9.966503009469352e-06, + "loss": 0.3239, + "step": 1645 + }, + { + "epoch": 0.13167736645267095, + "grad_norm": 0.3935251810016136, + "learning_rate": 9.966428114238353e-06, + "loss": 0.2811, + "step": 1646 + }, + { + "epoch": 0.13175736485270295, + "grad_norm": 0.32167362638562774, + "learning_rate": 9.966353135654513e-06, + "loss": 0.3045, + "step": 1647 + }, + { + "epoch": 0.13183736325273496, + "grad_norm": 0.22817981950295207, + "learning_rate": 9.966278073719091e-06, + "loss": 0.3479, + "step": 1648 + }, + { + "epoch": 0.13191736165276693, + "grad_norm": 0.32914152365333305, + "learning_rate": 9.966202928433344e-06, + "loss": 0.287, + "step": 1649 + }, + { + "epoch": 0.13199736005279894, + "grad_norm": 0.2565248530649696, + "learning_rate": 9.966127699798533e-06, + "loss": 0.3216, + "step": 1650 + }, + { + "epoch": 0.13207735845283095, + "grad_norm": 0.2604344735331237, + "learning_rate": 9.966052387815923e-06, + "loss": 0.3377, + "step": 1651 + }, + { + "epoch": 0.13215735685286295, + "grad_norm": 0.2657973200421256, + "learning_rate": 9.965976992486777e-06, + "loss": 0.337, + "step": 1652 + }, + { + "epoch": 0.13223735525289496, + "grad_norm": 0.44261816520682506, + "learning_rate": 9.96590151381236e-06, + "loss": 0.2821, + "step": 1653 + }, + { + "epoch": 0.13231735365292693, + "grad_norm": 0.3499848148337784, + "learning_rate": 9.965825951793939e-06, + "loss": 0.313, + "step": 1654 + }, + { + "epoch": 0.13239735205295894, + "grad_norm": 0.3377789495785721, + "learning_rate": 9.965750306432782e-06, + "loss": 0.2759, + "step": 1655 + }, + { + "epoch": 0.13247735045299094, + "grad_norm": 0.3084300248662882, + "learning_rate": 9.965674577730157e-06, + "loss": 0.3087, + "step": 1656 + }, + { + "epoch": 0.13255734885302295, + "grad_norm": 0.32778165414405397, + "learning_rate": 9.965598765687338e-06, + "loss": 0.3075, + "step": 1657 + }, + { + "epoch": 0.13263734725305493, + "grad_norm": 0.2905680233037324, + "learning_rate": 9.965522870305598e-06, + "loss": 0.3195, + "step": 1658 + }, + { + "epoch": 0.13271734565308693, + "grad_norm": 0.3294272355686789, + "learning_rate": 9.965446891586208e-06, + "loss": 0.2893, + "step": 1659 + }, + { + "epoch": 0.13279734405311894, + "grad_norm": 0.33005569419541053, + "learning_rate": 9.965370829530444e-06, + "loss": 0.308, + "step": 1660 + }, + { + "epoch": 0.13287734245315094, + "grad_norm": 0.30360017842174963, + "learning_rate": 9.96529468413958e-06, + "loss": 0.3015, + "step": 1661 + }, + { + "epoch": 0.13295734085318295, + "grad_norm": 0.3234667842233614, + "learning_rate": 9.9652184554149e-06, + "loss": 0.2988, + "step": 1662 + }, + { + "epoch": 0.13303733925321493, + "grad_norm": 0.2837535920210326, + "learning_rate": 9.965142143357677e-06, + "loss": 0.3403, + "step": 1663 + }, + { + "epoch": 0.13311733765324693, + "grad_norm": 0.26632172850544816, + "learning_rate": 9.965065747969196e-06, + "loss": 0.3682, + "step": 1664 + }, + { + "epoch": 0.13319733605327894, + "grad_norm": 0.3041415048672628, + "learning_rate": 9.964989269250737e-06, + "loss": 0.3044, + "step": 1665 + }, + { + "epoch": 0.13327733445331094, + "grad_norm": 0.32970071912713045, + "learning_rate": 9.964912707203587e-06, + "loss": 0.2879, + "step": 1666 + }, + { + "epoch": 0.13335733285334295, + "grad_norm": 0.3187449002069163, + "learning_rate": 9.964836061829026e-06, + "loss": 0.283, + "step": 1667 + }, + { + "epoch": 0.13343733125337492, + "grad_norm": 0.3381036568929224, + "learning_rate": 9.964759333128344e-06, + "loss": 0.3026, + "step": 1668 + }, + { + "epoch": 0.13351732965340693, + "grad_norm": 0.34993053866836077, + "learning_rate": 9.964682521102827e-06, + "loss": 0.3305, + "step": 1669 + }, + { + "epoch": 0.13359732805343894, + "grad_norm": 0.3869382941791446, + "learning_rate": 9.964605625753763e-06, + "loss": 0.2971, + "step": 1670 + }, + { + "epoch": 0.13367732645347094, + "grad_norm": 0.31727011304715647, + "learning_rate": 9.964528647082447e-06, + "loss": 0.324, + "step": 1671 + }, + { + "epoch": 0.13375732485350292, + "grad_norm": 0.27064077673825926, + "learning_rate": 9.964451585090167e-06, + "loss": 0.3373, + "step": 1672 + }, + { + "epoch": 0.13383732325353492, + "grad_norm": 0.3010738763319749, + "learning_rate": 9.964374439778217e-06, + "loss": 0.3097, + "step": 1673 + }, + { + "epoch": 0.13391732165356693, + "grad_norm": 0.3106080288510451, + "learning_rate": 9.964297211147893e-06, + "loss": 0.3044, + "step": 1674 + }, + { + "epoch": 0.13399732005359893, + "grad_norm": 0.35253993125968475, + "learning_rate": 9.964219899200489e-06, + "loss": 0.3073, + "step": 1675 + }, + { + "epoch": 0.13407731845363094, + "grad_norm": 0.26899506633024683, + "learning_rate": 9.964142503937305e-06, + "loss": 0.3529, + "step": 1676 + }, + { + "epoch": 0.13415731685366292, + "grad_norm": 0.34071389061918994, + "learning_rate": 9.964065025359639e-06, + "loss": 0.2923, + "step": 1677 + }, + { + "epoch": 0.13423731525369492, + "grad_norm": 0.3020345058078009, + "learning_rate": 9.963987463468791e-06, + "loss": 0.3235, + "step": 1678 + }, + { + "epoch": 0.13431731365372693, + "grad_norm": 0.3291600823809039, + "learning_rate": 9.963909818266063e-06, + "loss": 0.294, + "step": 1679 + }, + { + "epoch": 0.13439731205375893, + "grad_norm": 0.2917055224651526, + "learning_rate": 9.963832089752758e-06, + "loss": 0.3115, + "step": 1680 + }, + { + "epoch": 0.1344773104537909, + "grad_norm": 0.27063817168409676, + "learning_rate": 9.96375427793018e-06, + "loss": 0.3429, + "step": 1681 + }, + { + "epoch": 0.13455730885382292, + "grad_norm": 0.29569271171031547, + "learning_rate": 9.963676382799637e-06, + "loss": 0.3103, + "step": 1682 + }, + { + "epoch": 0.13463730725385492, + "grad_norm": 0.3416539883702798, + "learning_rate": 9.963598404362435e-06, + "loss": 0.2869, + "step": 1683 + }, + { + "epoch": 0.13471730565388693, + "grad_norm": 0.317345440697365, + "learning_rate": 9.96352034261988e-06, + "loss": 0.2802, + "step": 1684 + }, + { + "epoch": 0.13479730405391893, + "grad_norm": 0.3798252914000913, + "learning_rate": 9.963442197573288e-06, + "loss": 0.3015, + "step": 1685 + }, + { + "epoch": 0.1348773024539509, + "grad_norm": 0.38545882238974566, + "learning_rate": 9.963363969223968e-06, + "loss": 0.2691, + "step": 1686 + }, + { + "epoch": 0.13495730085398291, + "grad_norm": 0.4489555407948747, + "learning_rate": 9.96328565757323e-06, + "loss": 0.3135, + "step": 1687 + }, + { + "epoch": 0.13503729925401492, + "grad_norm": 0.30985761749304114, + "learning_rate": 9.96320726262239e-06, + "loss": 0.3127, + "step": 1688 + }, + { + "epoch": 0.13511729765404693, + "grad_norm": 0.34437822862994905, + "learning_rate": 9.963128784372765e-06, + "loss": 0.2817, + "step": 1689 + }, + { + "epoch": 0.13519729605407893, + "grad_norm": 0.32296207524824966, + "learning_rate": 9.963050222825672e-06, + "loss": 0.2981, + "step": 1690 + }, + { + "epoch": 0.1352772944541109, + "grad_norm": 0.30673484253410654, + "learning_rate": 9.962971577982428e-06, + "loss": 0.3147, + "step": 1691 + }, + { + "epoch": 0.1353572928541429, + "grad_norm": 0.2965287340778612, + "learning_rate": 9.962892849844355e-06, + "loss": 0.2996, + "step": 1692 + }, + { + "epoch": 0.13543729125417492, + "grad_norm": 0.3686616630824169, + "learning_rate": 9.962814038412772e-06, + "loss": 0.3024, + "step": 1693 + }, + { + "epoch": 0.13551728965420692, + "grad_norm": 0.36857623307427717, + "learning_rate": 9.962735143689003e-06, + "loss": 0.2994, + "step": 1694 + }, + { + "epoch": 0.1355972880542389, + "grad_norm": 0.26964413945230015, + "learning_rate": 9.96265616567437e-06, + "loss": 0.3283, + "step": 1695 + }, + { + "epoch": 0.1356772864542709, + "grad_norm": 0.30960092724603844, + "learning_rate": 9.962577104370206e-06, + "loss": 0.3208, + "step": 1696 + }, + { + "epoch": 0.1357572848543029, + "grad_norm": 0.3930353754480697, + "learning_rate": 9.962497959777828e-06, + "loss": 0.2865, + "step": 1697 + }, + { + "epoch": 0.13583728325433492, + "grad_norm": 0.32100007809044945, + "learning_rate": 9.962418731898571e-06, + "loss": 0.2746, + "step": 1698 + }, + { + "epoch": 0.13591728165436692, + "grad_norm": 0.34289121610315537, + "learning_rate": 9.96233942073376e-06, + "loss": 0.296, + "step": 1699 + }, + { + "epoch": 0.1359972800543989, + "grad_norm": 0.25172116748252965, + "learning_rate": 9.96226002628473e-06, + "loss": 0.3662, + "step": 1700 + }, + { + "epoch": 0.1360772784544309, + "grad_norm": 0.32448629756567077, + "learning_rate": 9.962180548552812e-06, + "loss": 0.2831, + "step": 1701 + }, + { + "epoch": 0.1361572768544629, + "grad_norm": 0.33516748900371574, + "learning_rate": 9.96210098753934e-06, + "loss": 0.2974, + "step": 1702 + }, + { + "epoch": 0.13623727525449492, + "grad_norm": 1.0576067079658895, + "learning_rate": 9.96202134324565e-06, + "loss": 0.2794, + "step": 1703 + }, + { + "epoch": 0.13631727365452692, + "grad_norm": 0.3391188007108373, + "learning_rate": 9.961941615673075e-06, + "loss": 0.2821, + "step": 1704 + }, + { + "epoch": 0.1363972720545589, + "grad_norm": 0.3148933314226744, + "learning_rate": 9.961861804822958e-06, + "loss": 0.2716, + "step": 1705 + }, + { + "epoch": 0.1364772704545909, + "grad_norm": 0.31931580022594086, + "learning_rate": 9.961781910696636e-06, + "loss": 0.3235, + "step": 1706 + }, + { + "epoch": 0.1365572688546229, + "grad_norm": 0.29937630478445426, + "learning_rate": 9.961701933295451e-06, + "loss": 0.3282, + "step": 1707 + }, + { + "epoch": 0.13663726725465491, + "grad_norm": 0.28782150017118074, + "learning_rate": 9.961621872620744e-06, + "loss": 0.2973, + "step": 1708 + }, + { + "epoch": 0.1367172656546869, + "grad_norm": 0.2742400772084222, + "learning_rate": 9.961541728673859e-06, + "loss": 0.3294, + "step": 1709 + }, + { + "epoch": 0.1367972640547189, + "grad_norm": 0.45880200381966524, + "learning_rate": 9.961461501456142e-06, + "loss": 0.2775, + "step": 1710 + }, + { + "epoch": 0.1368772624547509, + "grad_norm": 0.31562064636694503, + "learning_rate": 9.96138119096894e-06, + "loss": 0.3602, + "step": 1711 + }, + { + "epoch": 0.1369572608547829, + "grad_norm": 0.2903401516378683, + "learning_rate": 9.961300797213597e-06, + "loss": 0.3042, + "step": 1712 + }, + { + "epoch": 0.1370372592548149, + "grad_norm": 0.3294868849179385, + "learning_rate": 9.961220320191466e-06, + "loss": 0.3185, + "step": 1713 + }, + { + "epoch": 0.1371172576548469, + "grad_norm": 0.3224538803028663, + "learning_rate": 9.961139759903898e-06, + "loss": 0.2654, + "step": 1714 + }, + { + "epoch": 0.1371972560548789, + "grad_norm": 0.32407631488149774, + "learning_rate": 9.961059116352242e-06, + "loss": 0.2885, + "step": 1715 + }, + { + "epoch": 0.1372772544549109, + "grad_norm": 0.35838211973323353, + "learning_rate": 9.960978389537853e-06, + "loss": 0.2871, + "step": 1716 + }, + { + "epoch": 0.1373572528549429, + "grad_norm": 0.27286313005129115, + "learning_rate": 9.960897579462088e-06, + "loss": 0.3507, + "step": 1717 + }, + { + "epoch": 0.1374372512549749, + "grad_norm": 0.24887063569142082, + "learning_rate": 9.9608166861263e-06, + "loss": 0.335, + "step": 1718 + }, + { + "epoch": 0.1375172496550069, + "grad_norm": 0.2755737102753344, + "learning_rate": 9.960735709531848e-06, + "loss": 0.3083, + "step": 1719 + }, + { + "epoch": 0.1375972480550389, + "grad_norm": 0.2939927270526707, + "learning_rate": 9.960654649680092e-06, + "loss": 0.3053, + "step": 1720 + }, + { + "epoch": 0.1376772464550709, + "grad_norm": 0.3114105543719996, + "learning_rate": 9.960573506572391e-06, + "loss": 0.285, + "step": 1721 + }, + { + "epoch": 0.1377572448551029, + "grad_norm": 0.2611624811881945, + "learning_rate": 9.960492280210105e-06, + "loss": 0.3274, + "step": 1722 + }, + { + "epoch": 0.13783724325513488, + "grad_norm": 0.2864737101510064, + "learning_rate": 9.960410970594603e-06, + "loss": 0.3145, + "step": 1723 + }, + { + "epoch": 0.1379172416551669, + "grad_norm": 0.3063205908763903, + "learning_rate": 9.960329577727244e-06, + "loss": 0.301, + "step": 1724 + }, + { + "epoch": 0.1379972400551989, + "grad_norm": 0.2791774800395622, + "learning_rate": 9.960248101609396e-06, + "loss": 0.3277, + "step": 1725 + }, + { + "epoch": 0.1380772384552309, + "grad_norm": 0.4676364919085364, + "learning_rate": 9.96016654224243e-06, + "loss": 0.296, + "step": 1726 + }, + { + "epoch": 0.1381572368552629, + "grad_norm": 0.31381224432593324, + "learning_rate": 9.960084899627707e-06, + "loss": 0.3222, + "step": 1727 + }, + { + "epoch": 0.13823723525529488, + "grad_norm": 0.29165147182370316, + "learning_rate": 9.960003173766603e-06, + "loss": 0.3154, + "step": 1728 + }, + { + "epoch": 0.1383172336553269, + "grad_norm": 0.5581375819852616, + "learning_rate": 9.95992136466049e-06, + "loss": 0.3212, + "step": 1729 + }, + { + "epoch": 0.1383972320553589, + "grad_norm": 0.27612364071745177, + "learning_rate": 9.959839472310737e-06, + "loss": 0.3421, + "step": 1730 + }, + { + "epoch": 0.1384772304553909, + "grad_norm": 0.3552131712701409, + "learning_rate": 9.959757496718723e-06, + "loss": 0.2728, + "step": 1731 + }, + { + "epoch": 0.1385572288554229, + "grad_norm": 0.30504713248927423, + "learning_rate": 9.95967543788582e-06, + "loss": 0.2973, + "step": 1732 + }, + { + "epoch": 0.13863722725545488, + "grad_norm": 0.3117372839299666, + "learning_rate": 9.959593295813409e-06, + "loss": 0.3144, + "step": 1733 + }, + { + "epoch": 0.1387172256554869, + "grad_norm": 0.3661145015161783, + "learning_rate": 9.959511070502864e-06, + "loss": 0.2853, + "step": 1734 + }, + { + "epoch": 0.1387972240555189, + "grad_norm": 0.2986015314313455, + "learning_rate": 9.959428761955569e-06, + "loss": 0.3089, + "step": 1735 + }, + { + "epoch": 0.1388772224555509, + "grad_norm": 0.322042310593675, + "learning_rate": 9.959346370172902e-06, + "loss": 0.3096, + "step": 1736 + }, + { + "epoch": 0.13895722085558287, + "grad_norm": 1.0597179866444584, + "learning_rate": 9.95926389515625e-06, + "loss": 0.3024, + "step": 1737 + }, + { + "epoch": 0.13903721925561488, + "grad_norm": 0.3332901498427141, + "learning_rate": 9.959181336906993e-06, + "loss": 0.3218, + "step": 1738 + }, + { + "epoch": 0.13911721765564689, + "grad_norm": 0.30180490543673855, + "learning_rate": 9.959098695426518e-06, + "loss": 0.3039, + "step": 1739 + }, + { + "epoch": 0.1391972160556789, + "grad_norm": 0.2875304219816326, + "learning_rate": 9.959015970716215e-06, + "loss": 0.3123, + "step": 1740 + }, + { + "epoch": 0.1392772144557109, + "grad_norm": 0.32021173107962153, + "learning_rate": 9.958933162777468e-06, + "loss": 0.2849, + "step": 1741 + }, + { + "epoch": 0.13935721285574287, + "grad_norm": 0.27952604512851636, + "learning_rate": 9.958850271611669e-06, + "loss": 0.2933, + "step": 1742 + }, + { + "epoch": 0.13943721125577488, + "grad_norm": 0.2586265294576407, + "learning_rate": 9.958767297220209e-06, + "loss": 0.3285, + "step": 1743 + }, + { + "epoch": 0.13951720965580688, + "grad_norm": 0.28383422193166746, + "learning_rate": 9.95868423960448e-06, + "loss": 0.3074, + "step": 1744 + }, + { + "epoch": 0.1395972080558389, + "grad_norm": 0.380703359676753, + "learning_rate": 9.958601098765877e-06, + "loss": 0.2644, + "step": 1745 + }, + { + "epoch": 0.1396772064558709, + "grad_norm": 0.31990860162341744, + "learning_rate": 9.958517874705793e-06, + "loss": 0.3046, + "step": 1746 + }, + { + "epoch": 0.13975720485590287, + "grad_norm": 0.3136246366776693, + "learning_rate": 9.958434567425627e-06, + "loss": 0.2742, + "step": 1747 + }, + { + "epoch": 0.13983720325593488, + "grad_norm": 0.4744114241245162, + "learning_rate": 9.958351176926779e-06, + "loss": 0.2807, + "step": 1748 + }, + { + "epoch": 0.13991720165596688, + "grad_norm": 0.35019852055583867, + "learning_rate": 9.958267703210645e-06, + "loss": 0.2684, + "step": 1749 + }, + { + "epoch": 0.1399972000559989, + "grad_norm": 0.3219857074865576, + "learning_rate": 9.958184146278626e-06, + "loss": 0.2843, + "step": 1750 + }, + { + "epoch": 0.14007719845603087, + "grad_norm": 0.2741862901592053, + "learning_rate": 9.958100506132127e-06, + "loss": 0.2995, + "step": 1751 + }, + { + "epoch": 0.14015719685606287, + "grad_norm": 0.29855136100499374, + "learning_rate": 9.958016782772548e-06, + "loss": 0.3164, + "step": 1752 + }, + { + "epoch": 0.14023719525609488, + "grad_norm": 0.31182241558337687, + "learning_rate": 9.957932976201298e-06, + "loss": 0.3218, + "step": 1753 + }, + { + "epoch": 0.14031719365612688, + "grad_norm": 0.3017526580799603, + "learning_rate": 9.957849086419784e-06, + "loss": 0.2808, + "step": 1754 + }, + { + "epoch": 0.1403971920561589, + "grad_norm": 0.29259984357276614, + "learning_rate": 9.95776511342941e-06, + "loss": 0.3188, + "step": 1755 + }, + { + "epoch": 0.14047719045619086, + "grad_norm": 0.2586671324325039, + "learning_rate": 9.957681057231586e-06, + "loss": 0.3414, + "step": 1756 + }, + { + "epoch": 0.14055718885622287, + "grad_norm": 0.3422309319001371, + "learning_rate": 9.957596917827726e-06, + "loss": 0.2914, + "step": 1757 + }, + { + "epoch": 0.14063718725625488, + "grad_norm": 0.30411665437657764, + "learning_rate": 9.957512695219237e-06, + "loss": 0.3257, + "step": 1758 + }, + { + "epoch": 0.14071718565628688, + "grad_norm": 0.3148256689329255, + "learning_rate": 9.95742838940754e-06, + "loss": 0.2595, + "step": 1759 + }, + { + "epoch": 0.14079718405631889, + "grad_norm": 0.3235320161935557, + "learning_rate": 9.957344000394044e-06, + "loss": 0.2817, + "step": 1760 + }, + { + "epoch": 0.14087718245635086, + "grad_norm": 0.2346349466963199, + "learning_rate": 9.957259528180166e-06, + "loss": 0.3598, + "step": 1761 + }, + { + "epoch": 0.14095718085638287, + "grad_norm": 0.32775946235867615, + "learning_rate": 9.957174972767325e-06, + "loss": 0.2561, + "step": 1762 + }, + { + "epoch": 0.14103717925641487, + "grad_norm": 0.29272448983335647, + "learning_rate": 9.95709033415694e-06, + "loss": 0.2984, + "step": 1763 + }, + { + "epoch": 0.14111717765644688, + "grad_norm": 0.3381410463768757, + "learning_rate": 9.957005612350433e-06, + "loss": 0.2756, + "step": 1764 + }, + { + "epoch": 0.14119717605647886, + "grad_norm": 0.37881724019186336, + "learning_rate": 9.956920807349222e-06, + "loss": 0.2972, + "step": 1765 + }, + { + "epoch": 0.14127717445651086, + "grad_norm": 0.2615131462346856, + "learning_rate": 9.956835919154733e-06, + "loss": 0.3333, + "step": 1766 + }, + { + "epoch": 0.14135717285654287, + "grad_norm": 0.25063255370545584, + "learning_rate": 9.95675094776839e-06, + "loss": 0.3151, + "step": 1767 + }, + { + "epoch": 0.14143717125657487, + "grad_norm": 0.3013728292075032, + "learning_rate": 9.95666589319162e-06, + "loss": 0.3241, + "step": 1768 + }, + { + "epoch": 0.14151716965660688, + "grad_norm": 0.3012317428421379, + "learning_rate": 9.956580755425847e-06, + "loss": 0.3201, + "step": 1769 + }, + { + "epoch": 0.14159716805663886, + "grad_norm": 0.5038706896382061, + "learning_rate": 9.956495534472506e-06, + "loss": 0.2782, + "step": 1770 + }, + { + "epoch": 0.14167716645667086, + "grad_norm": 0.24984419727694965, + "learning_rate": 9.956410230333023e-06, + "loss": 0.3574, + "step": 1771 + }, + { + "epoch": 0.14175716485670287, + "grad_norm": 0.4299887767701957, + "learning_rate": 9.95632484300883e-06, + "loss": 0.3258, + "step": 1772 + }, + { + "epoch": 0.14183716325673487, + "grad_norm": 0.3560279802668401, + "learning_rate": 9.956239372501361e-06, + "loss": 0.295, + "step": 1773 + }, + { + "epoch": 0.14191716165676688, + "grad_norm": 0.24932007515283539, + "learning_rate": 9.95615381881205e-06, + "loss": 0.3191, + "step": 1774 + }, + { + "epoch": 0.14199716005679885, + "grad_norm": 0.3028728743262735, + "learning_rate": 9.956068181942333e-06, + "loss": 0.311, + "step": 1775 + }, + { + "epoch": 0.14207715845683086, + "grad_norm": 0.259154033880076, + "learning_rate": 9.955982461893648e-06, + "loss": 0.3354, + "step": 1776 + }, + { + "epoch": 0.14215715685686287, + "grad_norm": 0.2897432537846509, + "learning_rate": 9.955896658667433e-06, + "loss": 0.3272, + "step": 1777 + }, + { + "epoch": 0.14223715525689487, + "grad_norm": 0.33038176949553716, + "learning_rate": 9.955810772265128e-06, + "loss": 0.3015, + "step": 1778 + }, + { + "epoch": 0.14231715365692685, + "grad_norm": 0.3516805659520599, + "learning_rate": 9.955724802688173e-06, + "loss": 0.2816, + "step": 1779 + }, + { + "epoch": 0.14239715205695885, + "grad_norm": 0.2804767726724162, + "learning_rate": 9.955638749938015e-06, + "loss": 0.3108, + "step": 1780 + }, + { + "epoch": 0.14247715045699086, + "grad_norm": 0.26314051644794173, + "learning_rate": 9.955552614016093e-06, + "loss": 0.3426, + "step": 1781 + }, + { + "epoch": 0.14255714885702286, + "grad_norm": 0.2947321905415446, + "learning_rate": 9.955466394923857e-06, + "loss": 0.3145, + "step": 1782 + }, + { + "epoch": 0.14263714725705487, + "grad_norm": 0.3241423092435149, + "learning_rate": 9.955380092662751e-06, + "loss": 0.3086, + "step": 1783 + }, + { + "epoch": 0.14271714565708685, + "grad_norm": 0.32229750720492134, + "learning_rate": 9.955293707234225e-06, + "loss": 0.3166, + "step": 1784 + }, + { + "epoch": 0.14279714405711885, + "grad_norm": 0.3235463306806733, + "learning_rate": 9.955207238639729e-06, + "loss": 0.2799, + "step": 1785 + }, + { + "epoch": 0.14287714245715086, + "grad_norm": 0.28474506326534615, + "learning_rate": 9.955120686880713e-06, + "loss": 0.3157, + "step": 1786 + }, + { + "epoch": 0.14295714085718286, + "grad_norm": 0.30609190171772627, + "learning_rate": 9.955034051958632e-06, + "loss": 0.3106, + "step": 1787 + }, + { + "epoch": 0.14303713925721487, + "grad_norm": 0.3092621200982988, + "learning_rate": 9.954947333874937e-06, + "loss": 0.316, + "step": 1788 + }, + { + "epoch": 0.14311713765724685, + "grad_norm": 0.31639271963250565, + "learning_rate": 9.954860532631086e-06, + "loss": 0.2757, + "step": 1789 + }, + { + "epoch": 0.14319713605727885, + "grad_norm": 0.3204171762520193, + "learning_rate": 9.954773648228532e-06, + "loss": 0.2879, + "step": 1790 + }, + { + "epoch": 0.14327713445731086, + "grad_norm": 0.31469817598687255, + "learning_rate": 9.954686680668737e-06, + "loss": 0.3093, + "step": 1791 + }, + { + "epoch": 0.14335713285734286, + "grad_norm": 0.3113543190689672, + "learning_rate": 9.954599629953162e-06, + "loss": 0.3275, + "step": 1792 + }, + { + "epoch": 0.14343713125737484, + "grad_norm": 0.29493445256883705, + "learning_rate": 9.954512496083262e-06, + "loss": 0.3189, + "step": 1793 + }, + { + "epoch": 0.14351712965740684, + "grad_norm": 0.22707340882769847, + "learning_rate": 9.954425279060504e-06, + "loss": 0.317, + "step": 1794 + }, + { + "epoch": 0.14359712805743885, + "grad_norm": 0.2946919884516097, + "learning_rate": 9.95433797888635e-06, + "loss": 0.272, + "step": 1795 + }, + { + "epoch": 0.14367712645747086, + "grad_norm": 0.17511818261201095, + "learning_rate": 9.954250595562267e-06, + "loss": 0.394, + "step": 1796 + }, + { + "epoch": 0.14375712485750286, + "grad_norm": 0.2707159674605878, + "learning_rate": 9.95416312908972e-06, + "loss": 0.2902, + "step": 1797 + }, + { + "epoch": 0.14383712325753484, + "grad_norm": 0.20682608381513143, + "learning_rate": 9.954075579470178e-06, + "loss": 0.3629, + "step": 1798 + }, + { + "epoch": 0.14391712165756684, + "grad_norm": 0.2890110393980868, + "learning_rate": 9.953987946705108e-06, + "loss": 0.2858, + "step": 1799 + }, + { + "epoch": 0.14399712005759885, + "grad_norm": 0.30537360603386277, + "learning_rate": 9.953900230795983e-06, + "loss": 0.3256, + "step": 1800 + }, + { + "epoch": 0.14407711845763085, + "grad_norm": 0.26136906917458186, + "learning_rate": 9.953812431744274e-06, + "loss": 0.3354, + "step": 1801 + }, + { + "epoch": 0.14415711685766286, + "grad_norm": 0.32368559541867314, + "learning_rate": 9.953724549551458e-06, + "loss": 0.2883, + "step": 1802 + }, + { + "epoch": 0.14423711525769484, + "grad_norm": 0.31932677549930444, + "learning_rate": 9.953636584219004e-06, + "loss": 0.2861, + "step": 1803 + }, + { + "epoch": 0.14431711365772684, + "grad_norm": 0.38308612066912484, + "learning_rate": 9.953548535748392e-06, + "loss": 0.304, + "step": 1804 + }, + { + "epoch": 0.14439711205775885, + "grad_norm": 0.24594211358594562, + "learning_rate": 9.9534604041411e-06, + "loss": 0.3508, + "step": 1805 + }, + { + "epoch": 0.14447711045779085, + "grad_norm": 0.284554801293597, + "learning_rate": 9.953372189398607e-06, + "loss": 0.3167, + "step": 1806 + }, + { + "epoch": 0.14455710885782283, + "grad_norm": 0.28293831246303786, + "learning_rate": 9.953283891522393e-06, + "loss": 0.3166, + "step": 1807 + }, + { + "epoch": 0.14463710725785484, + "grad_norm": 0.3123143336106142, + "learning_rate": 9.953195510513936e-06, + "loss": 0.2617, + "step": 1808 + }, + { + "epoch": 0.14471710565788684, + "grad_norm": 0.3444926159483871, + "learning_rate": 9.953107046374726e-06, + "loss": 0.2705, + "step": 1809 + }, + { + "epoch": 0.14479710405791885, + "grad_norm": 0.2619360553088575, + "learning_rate": 9.953018499106245e-06, + "loss": 0.3351, + "step": 1810 + }, + { + "epoch": 0.14487710245795085, + "grad_norm": 0.26946106436617817, + "learning_rate": 9.95292986870998e-06, + "loss": 0.3355, + "step": 1811 + }, + { + "epoch": 0.14495710085798283, + "grad_norm": 0.3154463376474094, + "learning_rate": 9.952841155187413e-06, + "loss": 0.3016, + "step": 1812 + }, + { + "epoch": 0.14503709925801483, + "grad_norm": 0.28944627024388114, + "learning_rate": 9.95275235854004e-06, + "loss": 0.3136, + "step": 1813 + }, + { + "epoch": 0.14511709765804684, + "grad_norm": 0.30575120888023094, + "learning_rate": 9.952663478769352e-06, + "loss": 0.3015, + "step": 1814 + }, + { + "epoch": 0.14519709605807885, + "grad_norm": 0.2744983369056166, + "learning_rate": 9.952574515876833e-06, + "loss": 0.2981, + "step": 1815 + }, + { + "epoch": 0.14527709445811085, + "grad_norm": 0.25090335956737053, + "learning_rate": 9.952485469863981e-06, + "loss": 0.3315, + "step": 1816 + }, + { + "epoch": 0.14535709285814283, + "grad_norm": 0.34582112367748274, + "learning_rate": 9.952396340732292e-06, + "loss": 0.3081, + "step": 1817 + }, + { + "epoch": 0.14543709125817483, + "grad_norm": 0.2613232653280772, + "learning_rate": 9.952307128483257e-06, + "loss": 0.3307, + "step": 1818 + }, + { + "epoch": 0.14551708965820684, + "grad_norm": 0.4210963978330547, + "learning_rate": 9.952217833118377e-06, + "loss": 0.2973, + "step": 1819 + }, + { + "epoch": 0.14559708805823884, + "grad_norm": 0.3118596303508891, + "learning_rate": 9.95212845463915e-06, + "loss": 0.2901, + "step": 1820 + }, + { + "epoch": 0.14567708645827082, + "grad_norm": 0.4399119912972604, + "learning_rate": 9.952038993047076e-06, + "loss": 0.2918, + "step": 1821 + }, + { + "epoch": 0.14575708485830283, + "grad_norm": 0.287987278639579, + "learning_rate": 9.951949448343656e-06, + "loss": 0.2916, + "step": 1822 + }, + { + "epoch": 0.14583708325833483, + "grad_norm": 0.564479166839949, + "learning_rate": 9.951859820530394e-06, + "loss": 0.3385, + "step": 1823 + }, + { + "epoch": 0.14591708165836684, + "grad_norm": 0.3428624229558714, + "learning_rate": 9.951770109608792e-06, + "loss": 0.3038, + "step": 1824 + }, + { + "epoch": 0.14599708005839884, + "grad_norm": 0.29459868837725184, + "learning_rate": 9.951680315580356e-06, + "loss": 0.2992, + "step": 1825 + }, + { + "epoch": 0.14607707845843082, + "grad_norm": 0.2871018425543814, + "learning_rate": 9.951590438446597e-06, + "loss": 0.3225, + "step": 1826 + }, + { + "epoch": 0.14615707685846283, + "grad_norm": 0.33733151647126747, + "learning_rate": 9.951500478209018e-06, + "loss": 0.2872, + "step": 1827 + }, + { + "epoch": 0.14623707525849483, + "grad_norm": 0.27339140850401356, + "learning_rate": 9.951410434869133e-06, + "loss": 0.3075, + "step": 1828 + }, + { + "epoch": 0.14631707365852684, + "grad_norm": 0.3335200226637057, + "learning_rate": 9.951320308428449e-06, + "loss": 0.282, + "step": 1829 + }, + { + "epoch": 0.14639707205855884, + "grad_norm": 0.26738975982876784, + "learning_rate": 9.951230098888484e-06, + "loss": 0.3389, + "step": 1830 + }, + { + "epoch": 0.14647707045859082, + "grad_norm": 0.3079110352221004, + "learning_rate": 9.951139806250747e-06, + "loss": 0.3202, + "step": 1831 + }, + { + "epoch": 0.14655706885862282, + "grad_norm": 0.2702218338207828, + "learning_rate": 9.951049430516758e-06, + "loss": 0.3402, + "step": 1832 + }, + { + "epoch": 0.14663706725865483, + "grad_norm": 0.27317461904279994, + "learning_rate": 9.950958971688028e-06, + "loss": 0.3096, + "step": 1833 + }, + { + "epoch": 0.14671706565868683, + "grad_norm": 0.28137355012401005, + "learning_rate": 9.95086842976608e-06, + "loss": 0.2917, + "step": 1834 + }, + { + "epoch": 0.1467970640587188, + "grad_norm": 0.28761896666717185, + "learning_rate": 9.950777804752432e-06, + "loss": 0.2997, + "step": 1835 + }, + { + "epoch": 0.14687706245875082, + "grad_norm": 0.3415591719291004, + "learning_rate": 9.950687096648606e-06, + "loss": 0.3124, + "step": 1836 + }, + { + "epoch": 0.14695706085878282, + "grad_norm": 0.2286831298212902, + "learning_rate": 9.950596305456124e-06, + "loss": 0.324, + "step": 1837 + }, + { + "epoch": 0.14703705925881483, + "grad_norm": 0.32869636574507294, + "learning_rate": 9.950505431176507e-06, + "loss": 0.2921, + "step": 1838 + }, + { + "epoch": 0.14711705765884683, + "grad_norm": 0.16772669505942764, + "learning_rate": 9.950414473811283e-06, + "loss": 0.3936, + "step": 1839 + }, + { + "epoch": 0.1471970560588788, + "grad_norm": 0.28649100670019073, + "learning_rate": 9.95032343336198e-06, + "loss": 0.3099, + "step": 1840 + }, + { + "epoch": 0.14727705445891082, + "grad_norm": 0.27133145364991984, + "learning_rate": 9.950232309830121e-06, + "loss": 0.2822, + "step": 1841 + }, + { + "epoch": 0.14735705285894282, + "grad_norm": 0.355369594295927, + "learning_rate": 9.95014110321724e-06, + "loss": 0.3131, + "step": 1842 + }, + { + "epoch": 0.14743705125897483, + "grad_norm": 0.3355537274064852, + "learning_rate": 9.950049813524865e-06, + "loss": 0.3021, + "step": 1843 + }, + { + "epoch": 0.14751704965900683, + "grad_norm": 0.32254511745643205, + "learning_rate": 9.94995844075453e-06, + "loss": 0.2645, + "step": 1844 + }, + { + "epoch": 0.1475970480590388, + "grad_norm": 0.3204618145122401, + "learning_rate": 9.949866984907768e-06, + "loss": 0.2793, + "step": 1845 + }, + { + "epoch": 0.14767704645907082, + "grad_norm": 0.2628702424701707, + "learning_rate": 9.949775445986112e-06, + "loss": 0.3296, + "step": 1846 + }, + { + "epoch": 0.14775704485910282, + "grad_norm": 0.28343470231694806, + "learning_rate": 9.9496838239911e-06, + "loss": 0.3012, + "step": 1847 + }, + { + "epoch": 0.14783704325913483, + "grad_norm": 0.2037717652319075, + "learning_rate": 9.949592118924271e-06, + "loss": 0.3574, + "step": 1848 + }, + { + "epoch": 0.1479170416591668, + "grad_norm": 0.42539111489672166, + "learning_rate": 9.949500330787162e-06, + "loss": 0.3811, + "step": 1849 + }, + { + "epoch": 0.1479970400591988, + "grad_norm": 0.27700887234414806, + "learning_rate": 9.949408459581316e-06, + "loss": 0.3216, + "step": 1850 + }, + { + "epoch": 0.14807703845923081, + "grad_norm": 0.2779837158618126, + "learning_rate": 9.94931650530827e-06, + "loss": 0.3433, + "step": 1851 + }, + { + "epoch": 0.14815703685926282, + "grad_norm": 0.2899179613048597, + "learning_rate": 9.949224467969574e-06, + "loss": 0.3186, + "step": 1852 + }, + { + "epoch": 0.14823703525929482, + "grad_norm": 0.5197571788806743, + "learning_rate": 9.949132347566765e-06, + "loss": 0.2746, + "step": 1853 + }, + { + "epoch": 0.1483170336593268, + "grad_norm": 0.286463152698303, + "learning_rate": 9.949040144101396e-06, + "loss": 0.3122, + "step": 1854 + }, + { + "epoch": 0.1483970320593588, + "grad_norm": 0.32277761143261857, + "learning_rate": 9.948947857575012e-06, + "loss": 0.2943, + "step": 1855 + }, + { + "epoch": 0.1484770304593908, + "grad_norm": 0.31974179742061093, + "learning_rate": 9.948855487989161e-06, + "loss": 0.3237, + "step": 1856 + }, + { + "epoch": 0.14855702885942282, + "grad_norm": 0.3466111990865634, + "learning_rate": 9.948763035345393e-06, + "loss": 0.2699, + "step": 1857 + }, + { + "epoch": 0.14863702725945482, + "grad_norm": 0.3292729549858262, + "learning_rate": 9.94867049964526e-06, + "loss": 0.2734, + "step": 1858 + }, + { + "epoch": 0.1487170256594868, + "grad_norm": 0.3306165631879451, + "learning_rate": 9.948577880890318e-06, + "loss": 0.2743, + "step": 1859 + }, + { + "epoch": 0.1487970240595188, + "grad_norm": 0.45019858639406835, + "learning_rate": 9.948485179082117e-06, + "loss": 0.3, + "step": 1860 + }, + { + "epoch": 0.1488770224595508, + "grad_norm": 0.2831601948633049, + "learning_rate": 9.948392394222214e-06, + "loss": 0.3263, + "step": 1861 + }, + { + "epoch": 0.14895702085958282, + "grad_norm": 0.2884494626802798, + "learning_rate": 9.94829952631217e-06, + "loss": 0.3183, + "step": 1862 + }, + { + "epoch": 0.1490370192596148, + "grad_norm": 0.3142569100184952, + "learning_rate": 9.948206575353539e-06, + "loss": 0.2871, + "step": 1863 + }, + { + "epoch": 0.1491170176596468, + "grad_norm": 0.3493683457326879, + "learning_rate": 9.948113541347882e-06, + "loss": 0.3434, + "step": 1864 + }, + { + "epoch": 0.1491970160596788, + "grad_norm": 0.3004402718031714, + "learning_rate": 9.948020424296762e-06, + "loss": 0.3347, + "step": 1865 + }, + { + "epoch": 0.1492770144597108, + "grad_norm": 0.33573487157449655, + "learning_rate": 9.947927224201741e-06, + "loss": 0.2749, + "step": 1866 + }, + { + "epoch": 0.14935701285974282, + "grad_norm": 0.29497337677065777, + "learning_rate": 9.947833941064382e-06, + "loss": 0.3171, + "step": 1867 + }, + { + "epoch": 0.1494370112597748, + "grad_norm": 0.31216686853788317, + "learning_rate": 9.947740574886253e-06, + "loss": 0.2656, + "step": 1868 + }, + { + "epoch": 0.1495170096598068, + "grad_norm": 0.299370916220887, + "learning_rate": 9.94764712566892e-06, + "loss": 0.2896, + "step": 1869 + }, + { + "epoch": 0.1495970080598388, + "grad_norm": 0.35034504582011555, + "learning_rate": 9.947553593413948e-06, + "loss": 0.3045, + "step": 1870 + }, + { + "epoch": 0.1496770064598708, + "grad_norm": 0.3915856739342979, + "learning_rate": 9.947459978122912e-06, + "loss": 0.2939, + "step": 1871 + }, + { + "epoch": 0.14975700485990281, + "grad_norm": 0.27324450111796184, + "learning_rate": 9.947366279797382e-06, + "loss": 0.3196, + "step": 1872 + }, + { + "epoch": 0.1498370032599348, + "grad_norm": 0.23974963319293438, + "learning_rate": 9.947272498438929e-06, + "loss": 0.3388, + "step": 1873 + }, + { + "epoch": 0.1499170016599668, + "grad_norm": 0.32731745439760157, + "learning_rate": 9.947178634049127e-06, + "loss": 0.2865, + "step": 1874 + }, + { + "epoch": 0.1499970000599988, + "grad_norm": 0.3132702973741263, + "learning_rate": 9.947084686629552e-06, + "loss": 0.2696, + "step": 1875 + }, + { + "epoch": 0.1500769984600308, + "grad_norm": 0.30088133474177303, + "learning_rate": 9.946990656181782e-06, + "loss": 0.3076, + "step": 1876 + }, + { + "epoch": 0.15015699686006279, + "grad_norm": 0.3420583506596322, + "learning_rate": 9.946896542707391e-06, + "loss": 0.2923, + "step": 1877 + }, + { + "epoch": 0.1502369952600948, + "grad_norm": 0.46027287232057934, + "learning_rate": 9.946802346207963e-06, + "loss": 0.2765, + "step": 1878 + }, + { + "epoch": 0.1503169936601268, + "grad_norm": 0.3151837933860946, + "learning_rate": 9.946708066685077e-06, + "loss": 0.2527, + "step": 1879 + }, + { + "epoch": 0.1503969920601588, + "grad_norm": 0.28301358959449174, + "learning_rate": 9.946613704140315e-06, + "loss": 0.3462, + "step": 1880 + }, + { + "epoch": 0.1504769904601908, + "grad_norm": 0.2611324915074516, + "learning_rate": 9.946519258575263e-06, + "loss": 0.3231, + "step": 1881 + }, + { + "epoch": 0.15055698886022278, + "grad_norm": 0.2830739805365615, + "learning_rate": 9.946424729991502e-06, + "loss": 0.338, + "step": 1882 + }, + { + "epoch": 0.1506369872602548, + "grad_norm": 0.3253565459614321, + "learning_rate": 9.946330118390622e-06, + "loss": 0.2956, + "step": 1883 + }, + { + "epoch": 0.1507169856602868, + "grad_norm": 0.41220679415380274, + "learning_rate": 9.946235423774211e-06, + "loss": 0.2889, + "step": 1884 + }, + { + "epoch": 0.1507969840603188, + "grad_norm": 0.3109660103563562, + "learning_rate": 9.946140646143856e-06, + "loss": 0.3315, + "step": 1885 + }, + { + "epoch": 0.1508769824603508, + "grad_norm": 0.38596949211608217, + "learning_rate": 9.946045785501148e-06, + "loss": 0.3225, + "step": 1886 + }, + { + "epoch": 0.15095698086038278, + "grad_norm": 0.3339679133978762, + "learning_rate": 9.94595084184768e-06, + "loss": 0.3095, + "step": 1887 + }, + { + "epoch": 0.1510369792604148, + "grad_norm": 0.2894816453898282, + "learning_rate": 9.945855815185046e-06, + "loss": 0.298, + "step": 1888 + }, + { + "epoch": 0.1511169776604468, + "grad_norm": 0.360803512625732, + "learning_rate": 9.945760705514839e-06, + "loss": 0.2944, + "step": 1889 + }, + { + "epoch": 0.1511969760604788, + "grad_norm": 0.3628019256451538, + "learning_rate": 9.945665512838657e-06, + "loss": 0.2772, + "step": 1890 + }, + { + "epoch": 0.15127697446051078, + "grad_norm": 0.20035706495116146, + "learning_rate": 9.945570237158098e-06, + "loss": 0.3463, + "step": 1891 + }, + { + "epoch": 0.15135697286054278, + "grad_norm": 0.31489336439107407, + "learning_rate": 9.945474878474758e-06, + "loss": 0.31, + "step": 1892 + }, + { + "epoch": 0.1514369712605748, + "grad_norm": 0.3371685502009679, + "learning_rate": 9.94537943679024e-06, + "loss": 0.2784, + "step": 1893 + }, + { + "epoch": 0.1515169696606068, + "grad_norm": 0.3123657942543008, + "learning_rate": 9.945283912106145e-06, + "loss": 0.2899, + "step": 1894 + }, + { + "epoch": 0.1515969680606388, + "grad_norm": 0.31830182623763087, + "learning_rate": 9.945188304424078e-06, + "loss": 0.2804, + "step": 1895 + }, + { + "epoch": 0.15167696646067078, + "grad_norm": 0.35986647307557873, + "learning_rate": 9.945092613745642e-06, + "loss": 0.2773, + "step": 1896 + }, + { + "epoch": 0.15175696486070278, + "grad_norm": 0.31601971500211273, + "learning_rate": 9.944996840072442e-06, + "loss": 0.2946, + "step": 1897 + }, + { + "epoch": 0.1518369632607348, + "grad_norm": 0.3275030722606867, + "learning_rate": 9.944900983406087e-06, + "loss": 0.2762, + "step": 1898 + }, + { + "epoch": 0.1519169616607668, + "grad_norm": 0.2883875538357041, + "learning_rate": 9.944805043748185e-06, + "loss": 0.2919, + "step": 1899 + }, + { + "epoch": 0.1519969600607988, + "grad_norm": 0.31496523144795685, + "learning_rate": 9.944709021100347e-06, + "loss": 0.2816, + "step": 1900 + }, + { + "epoch": 0.15207695846083077, + "grad_norm": 0.35493257580438625, + "learning_rate": 9.944612915464183e-06, + "loss": 0.2945, + "step": 1901 + }, + { + "epoch": 0.15215695686086278, + "grad_norm": 0.31007967641599943, + "learning_rate": 9.944516726841308e-06, + "loss": 0.2771, + "step": 1902 + }, + { + "epoch": 0.15223695526089479, + "grad_norm": 0.3407784653582902, + "learning_rate": 9.944420455233335e-06, + "loss": 0.3248, + "step": 1903 + }, + { + "epoch": 0.1523169536609268, + "grad_norm": 0.3277126303965842, + "learning_rate": 9.94432410064188e-06, + "loss": 0.2749, + "step": 1904 + }, + { + "epoch": 0.15239695206095877, + "grad_norm": 0.3243543805856334, + "learning_rate": 9.94422766306856e-06, + "loss": 0.2992, + "step": 1905 + }, + { + "epoch": 0.15247695046099077, + "grad_norm": 0.26179412250906714, + "learning_rate": 9.944131142514994e-06, + "loss": 0.3056, + "step": 1906 + }, + { + "epoch": 0.15255694886102278, + "grad_norm": 0.2785204697187977, + "learning_rate": 9.944034538982804e-06, + "loss": 0.3112, + "step": 1907 + }, + { + "epoch": 0.15263694726105478, + "grad_norm": 0.316223583149207, + "learning_rate": 9.943937852473605e-06, + "loss": 0.3181, + "step": 1908 + }, + { + "epoch": 0.1527169456610868, + "grad_norm": 0.2658340177863202, + "learning_rate": 9.943841082989027e-06, + "loss": 0.2993, + "step": 1909 + }, + { + "epoch": 0.15279694406111877, + "grad_norm": 0.34191815866569764, + "learning_rate": 9.943744230530689e-06, + "loss": 0.2711, + "step": 1910 + }, + { + "epoch": 0.15287694246115077, + "grad_norm": 0.32841962974101413, + "learning_rate": 9.943647295100219e-06, + "loss": 0.3133, + "step": 1911 + }, + { + "epoch": 0.15295694086118278, + "grad_norm": 0.35883213639904044, + "learning_rate": 9.943550276699244e-06, + "loss": 0.3199, + "step": 1912 + }, + { + "epoch": 0.15303693926121478, + "grad_norm": 0.30156919934819676, + "learning_rate": 9.94345317532939e-06, + "loss": 0.2832, + "step": 1913 + }, + { + "epoch": 0.1531169376612468, + "grad_norm": 0.2868351210873058, + "learning_rate": 9.943355990992289e-06, + "loss": 0.3013, + "step": 1914 + }, + { + "epoch": 0.15319693606127877, + "grad_norm": 0.2946587459800661, + "learning_rate": 9.94325872368957e-06, + "loss": 0.2973, + "step": 1915 + }, + { + "epoch": 0.15327693446131077, + "grad_norm": 0.29831093465693553, + "learning_rate": 9.943161373422869e-06, + "loss": 0.3222, + "step": 1916 + }, + { + "epoch": 0.15335693286134278, + "grad_norm": 0.3386638956830772, + "learning_rate": 9.943063940193817e-06, + "loss": 0.2833, + "step": 1917 + }, + { + "epoch": 0.15343693126137478, + "grad_norm": 0.2868698428970782, + "learning_rate": 9.94296642400405e-06, + "loss": 0.2917, + "step": 1918 + }, + { + "epoch": 0.15351692966140676, + "grad_norm": 0.3791255728042178, + "learning_rate": 9.942868824855202e-06, + "loss": 0.2764, + "step": 1919 + }, + { + "epoch": 0.15359692806143876, + "grad_norm": 0.28572584237585447, + "learning_rate": 9.942771142748917e-06, + "loss": 0.3086, + "step": 1920 + }, + { + "epoch": 0.15367692646147077, + "grad_norm": 0.2536290066873153, + "learning_rate": 9.94267337768683e-06, + "loss": 0.3354, + "step": 1921 + }, + { + "epoch": 0.15375692486150278, + "grad_norm": 0.3231069430599438, + "learning_rate": 9.942575529670582e-06, + "loss": 0.2594, + "step": 1922 + }, + { + "epoch": 0.15383692326153478, + "grad_norm": 0.3130327942294197, + "learning_rate": 9.942477598701815e-06, + "loss": 0.2779, + "step": 1923 + }, + { + "epoch": 0.15391692166156676, + "grad_norm": 0.32538530277231054, + "learning_rate": 9.942379584782176e-06, + "loss": 0.3002, + "step": 1924 + }, + { + "epoch": 0.15399692006159876, + "grad_norm": 0.2706298250783191, + "learning_rate": 9.942281487913306e-06, + "loss": 0.3416, + "step": 1925 + }, + { + "epoch": 0.15407691846163077, + "grad_norm": 0.31959008849859943, + "learning_rate": 9.942183308096853e-06, + "loss": 0.2696, + "step": 1926 + }, + { + "epoch": 0.15415691686166277, + "grad_norm": 0.26577896553433417, + "learning_rate": 9.942085045334464e-06, + "loss": 0.3213, + "step": 1927 + }, + { + "epoch": 0.15423691526169478, + "grad_norm": 0.27254963288123485, + "learning_rate": 9.94198669962779e-06, + "loss": 0.3184, + "step": 1928 + }, + { + "epoch": 0.15431691366172676, + "grad_norm": 0.3016045978165824, + "learning_rate": 9.941888270978482e-06, + "loss": 0.3046, + "step": 1929 + }, + { + "epoch": 0.15439691206175876, + "grad_norm": 0.2874779313449009, + "learning_rate": 9.941789759388187e-06, + "loss": 0.3014, + "step": 1930 + }, + { + "epoch": 0.15447691046179077, + "grad_norm": 0.36056563442619455, + "learning_rate": 9.941691164858565e-06, + "loss": 0.2631, + "step": 1931 + }, + { + "epoch": 0.15455690886182277, + "grad_norm": 0.3927924331129919, + "learning_rate": 9.941592487391265e-06, + "loss": 0.2994, + "step": 1932 + }, + { + "epoch": 0.15463690726185475, + "grad_norm": 0.3150792463892834, + "learning_rate": 9.941493726987947e-06, + "loss": 0.2685, + "step": 1933 + }, + { + "epoch": 0.15471690566188676, + "grad_norm": 0.27724982112327506, + "learning_rate": 9.941394883650266e-06, + "loss": 0.2994, + "step": 1934 + }, + { + "epoch": 0.15479690406191876, + "grad_norm": 0.26740898269289404, + "learning_rate": 9.941295957379884e-06, + "loss": 0.2849, + "step": 1935 + }, + { + "epoch": 0.15487690246195077, + "grad_norm": 0.3234473105276574, + "learning_rate": 9.941196948178457e-06, + "loss": 0.282, + "step": 1936 + }, + { + "epoch": 0.15495690086198277, + "grad_norm": 0.3014168749387208, + "learning_rate": 9.941097856047652e-06, + "loss": 0.2991, + "step": 1937 + }, + { + "epoch": 0.15503689926201475, + "grad_norm": 0.2921167973124924, + "learning_rate": 9.940998680989127e-06, + "loss": 0.3008, + "step": 1938 + }, + { + "epoch": 0.15511689766204675, + "grad_norm": 0.2994366993365175, + "learning_rate": 9.940899423004548e-06, + "loss": 0.3037, + "step": 1939 + }, + { + "epoch": 0.15519689606207876, + "grad_norm": 0.3226771448289312, + "learning_rate": 9.940800082095583e-06, + "loss": 0.3053, + "step": 1940 + }, + { + "epoch": 0.15527689446211076, + "grad_norm": 0.2434821420973053, + "learning_rate": 9.940700658263897e-06, + "loss": 0.3271, + "step": 1941 + }, + { + "epoch": 0.15535689286214277, + "grad_norm": 0.3177729962718052, + "learning_rate": 9.94060115151116e-06, + "loss": 0.2605, + "step": 1942 + }, + { + "epoch": 0.15543689126217475, + "grad_norm": 0.291119337301666, + "learning_rate": 9.940501561839043e-06, + "loss": 0.3179, + "step": 1943 + }, + { + "epoch": 0.15551688966220675, + "grad_norm": 0.35606126665376925, + "learning_rate": 9.940401889249213e-06, + "loss": 0.304, + "step": 1944 + }, + { + "epoch": 0.15559688806223876, + "grad_norm": 0.3384219456754074, + "learning_rate": 9.940302133743347e-06, + "loss": 0.2803, + "step": 1945 + }, + { + "epoch": 0.15567688646227076, + "grad_norm": 0.8944823592957772, + "learning_rate": 9.940202295323116e-06, + "loss": 0.2958, + "step": 1946 + }, + { + "epoch": 0.15575688486230274, + "grad_norm": 0.2866956059110046, + "learning_rate": 9.940102373990202e-06, + "loss": 0.3089, + "step": 1947 + }, + { + "epoch": 0.15583688326233475, + "grad_norm": 0.31768839474460686, + "learning_rate": 9.940002369746273e-06, + "loss": 0.3028, + "step": 1948 + }, + { + "epoch": 0.15591688166236675, + "grad_norm": 0.33033292503840583, + "learning_rate": 9.939902282593015e-06, + "loss": 0.2736, + "step": 1949 + }, + { + "epoch": 0.15599688006239876, + "grad_norm": 2.123138540393024, + "learning_rate": 9.939802112532103e-06, + "loss": 0.2974, + "step": 1950 + }, + { + "epoch": 0.15607687846243076, + "grad_norm": 0.28928403017680615, + "learning_rate": 9.93970185956522e-06, + "loss": 0.2997, + "step": 1951 + }, + { + "epoch": 0.15615687686246274, + "grad_norm": 0.3331706244042608, + "learning_rate": 9.93960152369405e-06, + "loss": 0.2855, + "step": 1952 + }, + { + "epoch": 0.15623687526249475, + "grad_norm": 0.26783340952157336, + "learning_rate": 9.939501104920275e-06, + "loss": 0.3432, + "step": 1953 + }, + { + "epoch": 0.15631687366252675, + "grad_norm": 0.26650864377933553, + "learning_rate": 9.939400603245581e-06, + "loss": 0.318, + "step": 1954 + }, + { + "epoch": 0.15639687206255876, + "grad_norm": 0.3629216180233225, + "learning_rate": 9.939300018671654e-06, + "loss": 0.2961, + "step": 1955 + }, + { + "epoch": 0.15647687046259076, + "grad_norm": 0.5947336439646187, + "learning_rate": 9.939199351200182e-06, + "loss": 0.2979, + "step": 1956 + }, + { + "epoch": 0.15655686886262274, + "grad_norm": 0.24725058002732525, + "learning_rate": 9.939098600832857e-06, + "loss": 0.3237, + "step": 1957 + }, + { + "epoch": 0.15663686726265474, + "grad_norm": 0.353539628846901, + "learning_rate": 9.938997767571366e-06, + "loss": 0.3079, + "step": 1958 + }, + { + "epoch": 0.15671686566268675, + "grad_norm": 0.256775710924978, + "learning_rate": 9.938896851417406e-06, + "loss": 0.3205, + "step": 1959 + }, + { + "epoch": 0.15679686406271875, + "grad_norm": 0.3425475794246176, + "learning_rate": 9.938795852372667e-06, + "loss": 0.2812, + "step": 1960 + }, + { + "epoch": 0.15687686246275073, + "grad_norm": 0.23593916471198498, + "learning_rate": 9.938694770438843e-06, + "loss": 0.3338, + "step": 1961 + }, + { + "epoch": 0.15695686086278274, + "grad_norm": 0.20874566330252095, + "learning_rate": 9.938593605617637e-06, + "loss": 0.3488, + "step": 1962 + }, + { + "epoch": 0.15703685926281474, + "grad_norm": 0.3210869159591659, + "learning_rate": 9.93849235791074e-06, + "loss": 0.2834, + "step": 1963 + }, + { + "epoch": 0.15711685766284675, + "grad_norm": 0.32824481394583327, + "learning_rate": 9.938391027319853e-06, + "loss": 0.2964, + "step": 1964 + }, + { + "epoch": 0.15719685606287875, + "grad_norm": 0.29161702233315906, + "learning_rate": 9.93828961384668e-06, + "loss": 0.3166, + "step": 1965 + }, + { + "epoch": 0.15727685446291073, + "grad_norm": 0.2588679863957179, + "learning_rate": 9.938188117492919e-06, + "loss": 0.3197, + "step": 1966 + }, + { + "epoch": 0.15735685286294274, + "grad_norm": 0.2991901550583303, + "learning_rate": 9.938086538260277e-06, + "loss": 0.3141, + "step": 1967 + }, + { + "epoch": 0.15743685126297474, + "grad_norm": 0.2373571460420263, + "learning_rate": 9.937984876150455e-06, + "loss": 0.3403, + "step": 1968 + }, + { + "epoch": 0.15751684966300675, + "grad_norm": 0.27797961478436123, + "learning_rate": 9.937883131165163e-06, + "loss": 0.2942, + "step": 1969 + }, + { + "epoch": 0.15759684806303875, + "grad_norm": 0.313459888523142, + "learning_rate": 9.937781303306105e-06, + "loss": 0.2905, + "step": 1970 + }, + { + "epoch": 0.15767684646307073, + "grad_norm": 0.18233826081065072, + "learning_rate": 9.937679392574991e-06, + "loss": 0.3995, + "step": 1971 + }, + { + "epoch": 0.15775684486310274, + "grad_norm": 0.2956874431100799, + "learning_rate": 9.937577398973533e-06, + "loss": 0.3298, + "step": 1972 + }, + { + "epoch": 0.15783684326313474, + "grad_norm": 0.3560142279117845, + "learning_rate": 9.937475322503442e-06, + "loss": 0.2801, + "step": 1973 + }, + { + "epoch": 0.15791684166316675, + "grad_norm": 0.3102471995347657, + "learning_rate": 9.93737316316643e-06, + "loss": 0.3104, + "step": 1974 + }, + { + "epoch": 0.15799684006319872, + "grad_norm": 0.2936088942445335, + "learning_rate": 9.937270920964214e-06, + "loss": 0.3097, + "step": 1975 + }, + { + "epoch": 0.15807683846323073, + "grad_norm": 0.26910560048961457, + "learning_rate": 9.93716859589851e-06, + "loss": 0.3433, + "step": 1976 + }, + { + "epoch": 0.15815683686326273, + "grad_norm": 0.2919644913402779, + "learning_rate": 9.937066187971031e-06, + "loss": 0.3404, + "step": 1977 + }, + { + "epoch": 0.15823683526329474, + "grad_norm": 0.2719191882954073, + "learning_rate": 9.9369636971835e-06, + "loss": 0.332, + "step": 1978 + }, + { + "epoch": 0.15831683366332674, + "grad_norm": 0.28901665532367127, + "learning_rate": 9.936861123537636e-06, + "loss": 0.3204, + "step": 1979 + }, + { + "epoch": 0.15839683206335872, + "grad_norm": 0.32271995431330563, + "learning_rate": 9.936758467035161e-06, + "loss": 0.3525, + "step": 1980 + }, + { + "epoch": 0.15847683046339073, + "grad_norm": 0.4475416968890067, + "learning_rate": 9.936655727677795e-06, + "loss": 0.3335, + "step": 1981 + }, + { + "epoch": 0.15855682886342273, + "grad_norm": 0.34832908968617937, + "learning_rate": 9.936552905467266e-06, + "loss": 0.2595, + "step": 1982 + }, + { + "epoch": 0.15863682726345474, + "grad_norm": 0.3371639518254655, + "learning_rate": 9.936450000405297e-06, + "loss": 0.2859, + "step": 1983 + }, + { + "epoch": 0.15871682566348674, + "grad_norm": 0.32434960199141255, + "learning_rate": 9.93634701249362e-06, + "loss": 0.3092, + "step": 1984 + }, + { + "epoch": 0.15879682406351872, + "grad_norm": 0.3108385525904896, + "learning_rate": 9.936243941733956e-06, + "loss": 0.2879, + "step": 1985 + }, + { + "epoch": 0.15887682246355073, + "grad_norm": 0.29828337033894703, + "learning_rate": 9.936140788128039e-06, + "loss": 0.3099, + "step": 1986 + }, + { + "epoch": 0.15895682086358273, + "grad_norm": 0.2905310288292043, + "learning_rate": 9.9360375516776e-06, + "loss": 0.3059, + "step": 1987 + }, + { + "epoch": 0.15903681926361474, + "grad_norm": 0.2824472375123829, + "learning_rate": 9.935934232384374e-06, + "loss": 0.2969, + "step": 1988 + }, + { + "epoch": 0.15911681766364671, + "grad_norm": 0.2855051453237634, + "learning_rate": 9.93583083025009e-06, + "loss": 0.3027, + "step": 1989 + }, + { + "epoch": 0.15919681606367872, + "grad_norm": 1.2438407842811185, + "learning_rate": 9.935727345276487e-06, + "loss": 0.2918, + "step": 1990 + }, + { + "epoch": 0.15927681446371073, + "grad_norm": 0.2883612405465744, + "learning_rate": 9.9356237774653e-06, + "loss": 0.3004, + "step": 1991 + }, + { + "epoch": 0.15935681286374273, + "grad_norm": 0.28781977965373595, + "learning_rate": 9.935520126818269e-06, + "loss": 0.3096, + "step": 1992 + }, + { + "epoch": 0.15943681126377474, + "grad_norm": 0.4968861461530532, + "learning_rate": 9.935416393337132e-06, + "loss": 0.3046, + "step": 1993 + }, + { + "epoch": 0.1595168096638067, + "grad_norm": 0.35610699215577285, + "learning_rate": 9.93531257702363e-06, + "loss": 0.2914, + "step": 1994 + }, + { + "epoch": 0.15959680806383872, + "grad_norm": 0.24555119841806142, + "learning_rate": 9.935208677879508e-06, + "loss": 0.3247, + "step": 1995 + }, + { + "epoch": 0.15967680646387072, + "grad_norm": 0.3445156274523415, + "learning_rate": 9.935104695906506e-06, + "loss": 0.2972, + "step": 1996 + }, + { + "epoch": 0.15975680486390273, + "grad_norm": 0.38663321560641845, + "learning_rate": 9.935000631106372e-06, + "loss": 0.2902, + "step": 1997 + }, + { + "epoch": 0.15983680326393473, + "grad_norm": 0.2981423721350435, + "learning_rate": 9.934896483480851e-06, + "loss": 0.3021, + "step": 1998 + }, + { + "epoch": 0.1599168016639667, + "grad_norm": 0.3223091364872938, + "learning_rate": 9.93479225303169e-06, + "loss": 0.2676, + "step": 1999 + }, + { + "epoch": 0.15999680006399872, + "grad_norm": 0.3389354411333486, + "learning_rate": 9.934687939760642e-06, + "loss": 0.3015, + "step": 2000 + }, + { + "epoch": 0.16007679846403072, + "grad_norm": 0.2968751407080333, + "learning_rate": 9.934583543669454e-06, + "loss": 0.3209, + "step": 2001 + }, + { + "epoch": 0.16015679686406273, + "grad_norm": 0.36261609121872346, + "learning_rate": 9.93447906475988e-06, + "loss": 0.2895, + "step": 2002 + }, + { + "epoch": 0.1602367952640947, + "grad_norm": 0.28266869991408483, + "learning_rate": 9.934374503033672e-06, + "loss": 0.3129, + "step": 2003 + }, + { + "epoch": 0.1603167936641267, + "grad_norm": 0.328105884068455, + "learning_rate": 9.934269858492587e-06, + "loss": 0.2818, + "step": 2004 + }, + { + "epoch": 0.16039679206415872, + "grad_norm": 0.4531626802863882, + "learning_rate": 9.934165131138381e-06, + "loss": 0.2631, + "step": 2005 + }, + { + "epoch": 0.16047679046419072, + "grad_norm": 0.31175207748463274, + "learning_rate": 9.934060320972811e-06, + "loss": 0.3416, + "step": 2006 + }, + { + "epoch": 0.16055678886422273, + "grad_norm": 0.27819934788155004, + "learning_rate": 9.933955427997634e-06, + "loss": 0.3179, + "step": 2007 + }, + { + "epoch": 0.1606367872642547, + "grad_norm": 0.42046561569885477, + "learning_rate": 9.933850452214612e-06, + "loss": 0.2747, + "step": 2008 + }, + { + "epoch": 0.1607167856642867, + "grad_norm": 0.3374576150570054, + "learning_rate": 9.933745393625509e-06, + "loss": 0.2969, + "step": 2009 + }, + { + "epoch": 0.16079678406431872, + "grad_norm": 0.2785923874961948, + "learning_rate": 9.933640252232087e-06, + "loss": 0.2939, + "step": 2010 + }, + { + "epoch": 0.16087678246435072, + "grad_norm": 0.2647996212822832, + "learning_rate": 9.933535028036108e-06, + "loss": 0.3029, + "step": 2011 + }, + { + "epoch": 0.16095678086438273, + "grad_norm": 0.28150666968321303, + "learning_rate": 9.93342972103934e-06, + "loss": 0.3099, + "step": 2012 + }, + { + "epoch": 0.1610367792644147, + "grad_norm": 0.3540097063560117, + "learning_rate": 9.933324331243553e-06, + "loss": 0.2804, + "step": 2013 + }, + { + "epoch": 0.1611167776644467, + "grad_norm": 0.3226258433038972, + "learning_rate": 9.93321885865051e-06, + "loss": 0.3082, + "step": 2014 + }, + { + "epoch": 0.16119677606447871, + "grad_norm": 0.3495634632864703, + "learning_rate": 9.933113303261987e-06, + "loss": 0.2927, + "step": 2015 + }, + { + "epoch": 0.16127677446451072, + "grad_norm": 0.2924312313285416, + "learning_rate": 9.933007665079752e-06, + "loss": 0.3158, + "step": 2016 + }, + { + "epoch": 0.1613567728645427, + "grad_norm": 0.29460187709681573, + "learning_rate": 9.932901944105578e-06, + "loss": 0.2937, + "step": 2017 + }, + { + "epoch": 0.1614367712645747, + "grad_norm": 0.3145263865114884, + "learning_rate": 9.932796140341242e-06, + "loss": 0.2795, + "step": 2018 + }, + { + "epoch": 0.1615167696646067, + "grad_norm": 0.3109208515777231, + "learning_rate": 9.932690253788516e-06, + "loss": 0.2728, + "step": 2019 + }, + { + "epoch": 0.1615967680646387, + "grad_norm": 0.2827757943067093, + "learning_rate": 9.93258428444918e-06, + "loss": 0.3068, + "step": 2020 + }, + { + "epoch": 0.16167676646467072, + "grad_norm": 0.29801789061824474, + "learning_rate": 9.932478232325013e-06, + "loss": 0.3131, + "step": 2021 + }, + { + "epoch": 0.1617567648647027, + "grad_norm": 0.300066131654582, + "learning_rate": 9.932372097417793e-06, + "loss": 0.2897, + "step": 2022 + }, + { + "epoch": 0.1618367632647347, + "grad_norm": 0.43030235696806723, + "learning_rate": 9.9322658797293e-06, + "loss": 0.2822, + "step": 2023 + }, + { + "epoch": 0.1619167616647667, + "grad_norm": 0.2917046724238359, + "learning_rate": 9.93215957926132e-06, + "loss": 0.3382, + "step": 2024 + }, + { + "epoch": 0.1619967600647987, + "grad_norm": 0.3103294520178998, + "learning_rate": 9.932053196015634e-06, + "loss": 0.282, + "step": 2025 + }, + { + "epoch": 0.16207675846483072, + "grad_norm": 0.3227811910781441, + "learning_rate": 9.93194672999403e-06, + "loss": 0.285, + "step": 2026 + }, + { + "epoch": 0.1621567568648627, + "grad_norm": 0.28412302400350264, + "learning_rate": 9.931840181198296e-06, + "loss": 0.3248, + "step": 2027 + }, + { + "epoch": 0.1622367552648947, + "grad_norm": 0.29180415940981985, + "learning_rate": 9.931733549630215e-06, + "loss": 0.3129, + "step": 2028 + }, + { + "epoch": 0.1623167536649267, + "grad_norm": 0.2772177411273436, + "learning_rate": 9.931626835291581e-06, + "loss": 0.3113, + "step": 2029 + }, + { + "epoch": 0.1623967520649587, + "grad_norm": 0.3102598246651718, + "learning_rate": 9.931520038184184e-06, + "loss": 0.2698, + "step": 2030 + }, + { + "epoch": 0.1624767504649907, + "grad_norm": 0.3749355037097101, + "learning_rate": 9.931413158309816e-06, + "loss": 0.2758, + "step": 2031 + }, + { + "epoch": 0.1625567488650227, + "grad_norm": 0.2673871235658931, + "learning_rate": 9.93130619567027e-06, + "loss": 0.3305, + "step": 2032 + }, + { + "epoch": 0.1626367472650547, + "grad_norm": 0.2993632822591623, + "learning_rate": 9.931199150267343e-06, + "loss": 0.3179, + "step": 2033 + }, + { + "epoch": 0.1627167456650867, + "grad_norm": 0.46468678674961444, + "learning_rate": 9.931092022102829e-06, + "loss": 0.3463, + "step": 2034 + }, + { + "epoch": 0.1627967440651187, + "grad_norm": 0.32546445539864916, + "learning_rate": 9.93098481117853e-06, + "loss": 0.2949, + "step": 2035 + }, + { + "epoch": 0.1628767424651507, + "grad_norm": 0.24537549794232927, + "learning_rate": 9.930877517496242e-06, + "loss": 0.3388, + "step": 2036 + }, + { + "epoch": 0.1629567408651827, + "grad_norm": 0.2896727046965226, + "learning_rate": 9.930770141057767e-06, + "loss": 0.3267, + "step": 2037 + }, + { + "epoch": 0.1630367392652147, + "grad_norm": 0.3646729676007411, + "learning_rate": 9.930662681864906e-06, + "loss": 0.3145, + "step": 2038 + }, + { + "epoch": 0.1631167376652467, + "grad_norm": 0.3246249494533878, + "learning_rate": 9.930555139919465e-06, + "loss": 0.2555, + "step": 2039 + }, + { + "epoch": 0.1631967360652787, + "grad_norm": 0.31047771059898943, + "learning_rate": 9.930447515223244e-06, + "loss": 0.2823, + "step": 2040 + }, + { + "epoch": 0.16327673446531069, + "grad_norm": 0.30489563137121306, + "learning_rate": 9.930339807778056e-06, + "loss": 0.3164, + "step": 2041 + }, + { + "epoch": 0.1633567328653427, + "grad_norm": 0.2708505017555708, + "learning_rate": 9.930232017585704e-06, + "loss": 0.3409, + "step": 2042 + }, + { + "epoch": 0.1634367312653747, + "grad_norm": 0.27613585284837033, + "learning_rate": 9.930124144647998e-06, + "loss": 0.3151, + "step": 2043 + }, + { + "epoch": 0.1635167296654067, + "grad_norm": 0.2573171694407268, + "learning_rate": 9.930016188966749e-06, + "loss": 0.2984, + "step": 2044 + }, + { + "epoch": 0.16359672806543868, + "grad_norm": 0.32483054169675796, + "learning_rate": 9.929908150543769e-06, + "loss": 0.301, + "step": 2045 + }, + { + "epoch": 0.16367672646547068, + "grad_norm": 0.3282723065043363, + "learning_rate": 9.92980002938087e-06, + "loss": 0.3002, + "step": 2046 + }, + { + "epoch": 0.1637567248655027, + "grad_norm": 0.2942875595467553, + "learning_rate": 9.929691825479868e-06, + "loss": 0.3104, + "step": 2047 + }, + { + "epoch": 0.1638367232655347, + "grad_norm": 0.29648902607067845, + "learning_rate": 9.92958353884258e-06, + "loss": 0.3007, + "step": 2048 + }, + { + "epoch": 0.1639167216655667, + "grad_norm": 0.3323779109255999, + "learning_rate": 9.929475169470819e-06, + "loss": 0.3191, + "step": 2049 + }, + { + "epoch": 0.16399672006559868, + "grad_norm": 0.24900330227006712, + "learning_rate": 9.929366717366408e-06, + "loss": 0.3208, + "step": 2050 + }, + { + "epoch": 0.16407671846563068, + "grad_norm": 0.2771006938988489, + "learning_rate": 9.929258182531167e-06, + "loss": 0.3414, + "step": 2051 + }, + { + "epoch": 0.1641567168656627, + "grad_norm": 0.318919987120989, + "learning_rate": 9.929149564966915e-06, + "loss": 0.2812, + "step": 2052 + }, + { + "epoch": 0.1642367152656947, + "grad_norm": 0.37899398351306995, + "learning_rate": 9.929040864675477e-06, + "loss": 0.3083, + "step": 2053 + }, + { + "epoch": 0.1643167136657267, + "grad_norm": 0.27473698429141086, + "learning_rate": 9.928932081658677e-06, + "loss": 0.2927, + "step": 2054 + }, + { + "epoch": 0.16439671206575868, + "grad_norm": 0.29387493528289566, + "learning_rate": 9.92882321591834e-06, + "loss": 0.3092, + "step": 2055 + }, + { + "epoch": 0.16447671046579068, + "grad_norm": 0.25145998054783447, + "learning_rate": 9.928714267456295e-06, + "loss": 0.34, + "step": 2056 + }, + { + "epoch": 0.1645567088658227, + "grad_norm": 0.29103391565460074, + "learning_rate": 9.928605236274368e-06, + "loss": 0.3198, + "step": 2057 + }, + { + "epoch": 0.1646367072658547, + "grad_norm": 0.2948898569870395, + "learning_rate": 9.928496122374388e-06, + "loss": 0.3236, + "step": 2058 + }, + { + "epoch": 0.16471670566588667, + "grad_norm": 0.22882413645760086, + "learning_rate": 9.928386925758191e-06, + "loss": 0.3508, + "step": 2059 + }, + { + "epoch": 0.16479670406591868, + "grad_norm": 0.2870428290504838, + "learning_rate": 9.928277646427605e-06, + "loss": 0.3059, + "step": 2060 + }, + { + "epoch": 0.16487670246595068, + "grad_norm": 0.30218266748330275, + "learning_rate": 9.928168284384468e-06, + "loss": 0.2944, + "step": 2061 + }, + { + "epoch": 0.16495670086598269, + "grad_norm": 0.3237261709981678, + "learning_rate": 9.928058839630612e-06, + "loss": 0.292, + "step": 2062 + }, + { + "epoch": 0.1650366992660147, + "grad_norm": 0.28878113343686035, + "learning_rate": 9.927949312167876e-06, + "loss": 0.2905, + "step": 2063 + }, + { + "epoch": 0.16511669766604667, + "grad_norm": 0.27915257720569336, + "learning_rate": 9.927839701998098e-06, + "loss": 0.3126, + "step": 2064 + }, + { + "epoch": 0.16519669606607867, + "grad_norm": 0.2443014418745416, + "learning_rate": 9.927730009123116e-06, + "loss": 0.326, + "step": 2065 + }, + { + "epoch": 0.16527669446611068, + "grad_norm": 0.34505327992132423, + "learning_rate": 9.927620233544772e-06, + "loss": 0.2791, + "step": 2066 + }, + { + "epoch": 0.16535669286614268, + "grad_norm": 0.29445914239095056, + "learning_rate": 9.92751037526491e-06, + "loss": 0.2935, + "step": 2067 + }, + { + "epoch": 0.1654366912661747, + "grad_norm": 0.350564409055299, + "learning_rate": 9.927400434285372e-06, + "loss": 0.3205, + "step": 2068 + }, + { + "epoch": 0.16551668966620667, + "grad_norm": 0.35258949036162895, + "learning_rate": 9.927290410608003e-06, + "loss": 0.2981, + "step": 2069 + }, + { + "epoch": 0.16559668806623867, + "grad_norm": 0.28547370883708645, + "learning_rate": 9.92718030423465e-06, + "loss": 0.3202, + "step": 2070 + }, + { + "epoch": 0.16567668646627068, + "grad_norm": 0.28679915795589933, + "learning_rate": 9.927070115167161e-06, + "loss": 0.296, + "step": 2071 + }, + { + "epoch": 0.16575668486630268, + "grad_norm": 0.2820621298380895, + "learning_rate": 9.926959843407387e-06, + "loss": 0.3156, + "step": 2072 + }, + { + "epoch": 0.16583668326633466, + "grad_norm": 0.2996160639064728, + "learning_rate": 9.926849488957176e-06, + "loss": 0.2934, + "step": 2073 + }, + { + "epoch": 0.16591668166636667, + "grad_norm": 0.3050953030562137, + "learning_rate": 9.92673905181838e-06, + "loss": 0.3301, + "step": 2074 + }, + { + "epoch": 0.16599668006639867, + "grad_norm": 0.42569906281833886, + "learning_rate": 9.926628531992855e-06, + "loss": 0.2854, + "step": 2075 + }, + { + "epoch": 0.16607667846643068, + "grad_norm": 0.284517162889428, + "learning_rate": 9.926517929482454e-06, + "loss": 0.3087, + "step": 2076 + }, + { + "epoch": 0.16615667686646268, + "grad_norm": 0.3188657173590444, + "learning_rate": 9.926407244289033e-06, + "loss": 0.2878, + "step": 2077 + }, + { + "epoch": 0.16623667526649466, + "grad_norm": 0.2957576480672817, + "learning_rate": 9.926296476414451e-06, + "loss": 0.2865, + "step": 2078 + }, + { + "epoch": 0.16631667366652667, + "grad_norm": 0.24215609358489673, + "learning_rate": 9.926185625860567e-06, + "loss": 0.3469, + "step": 2079 + }, + { + "epoch": 0.16639667206655867, + "grad_norm": 0.24149424115276494, + "learning_rate": 9.926074692629241e-06, + "loss": 0.3212, + "step": 2080 + }, + { + "epoch": 0.16647667046659068, + "grad_norm": 0.2864256359063075, + "learning_rate": 9.925963676722335e-06, + "loss": 0.2901, + "step": 2081 + }, + { + "epoch": 0.16655666886662268, + "grad_norm": 0.2564242849503747, + "learning_rate": 9.925852578141711e-06, + "loss": 0.3534, + "step": 2082 + }, + { + "epoch": 0.16663666726665466, + "grad_norm": 0.3133293218070104, + "learning_rate": 9.925741396889235e-06, + "loss": 0.2562, + "step": 2083 + }, + { + "epoch": 0.16671666566668666, + "grad_norm": 0.32229732284041823, + "learning_rate": 9.925630132966772e-06, + "loss": 0.2729, + "step": 2084 + }, + { + "epoch": 0.16679666406671867, + "grad_norm": 0.3300973761485826, + "learning_rate": 9.925518786376192e-06, + "loss": 0.2879, + "step": 2085 + }, + { + "epoch": 0.16687666246675067, + "grad_norm": 0.32016089799442193, + "learning_rate": 9.925407357119359e-06, + "loss": 0.3115, + "step": 2086 + }, + { + "epoch": 0.16695666086678265, + "grad_norm": 0.305268395533445, + "learning_rate": 9.925295845198148e-06, + "loss": 0.246, + "step": 2087 + }, + { + "epoch": 0.16703665926681466, + "grad_norm": 0.3030992904594177, + "learning_rate": 9.925184250614427e-06, + "loss": 0.286, + "step": 2088 + }, + { + "epoch": 0.16711665766684666, + "grad_norm": 0.30851975303268475, + "learning_rate": 9.92507257337007e-06, + "loss": 0.2833, + "step": 2089 + }, + { + "epoch": 0.16719665606687867, + "grad_norm": 0.24864383713819052, + "learning_rate": 9.924960813466952e-06, + "loss": 0.3259, + "step": 2090 + }, + { + "epoch": 0.16727665446691067, + "grad_norm": 0.2975092364269963, + "learning_rate": 9.92484897090695e-06, + "loss": 0.3067, + "step": 2091 + }, + { + "epoch": 0.16735665286694265, + "grad_norm": 0.29192769337469865, + "learning_rate": 9.924737045691938e-06, + "loss": 0.2971, + "step": 2092 + }, + { + "epoch": 0.16743665126697466, + "grad_norm": 0.2320288562788732, + "learning_rate": 9.924625037823797e-06, + "loss": 0.3326, + "step": 2093 + }, + { + "epoch": 0.16751664966700666, + "grad_norm": 0.29136998768279093, + "learning_rate": 9.924512947304403e-06, + "loss": 0.3005, + "step": 2094 + }, + { + "epoch": 0.16759664806703867, + "grad_norm": 0.2959481140531302, + "learning_rate": 9.924400774135641e-06, + "loss": 0.3094, + "step": 2095 + }, + { + "epoch": 0.16767664646707067, + "grad_norm": 0.34360539502211024, + "learning_rate": 9.924288518319394e-06, + "loss": 0.2826, + "step": 2096 + }, + { + "epoch": 0.16775664486710265, + "grad_norm": 0.29357643997439486, + "learning_rate": 9.924176179857543e-06, + "loss": 0.3075, + "step": 2097 + }, + { + "epoch": 0.16783664326713466, + "grad_norm": 0.24684161805198337, + "learning_rate": 9.924063758751976e-06, + "loss": 0.3365, + "step": 2098 + }, + { + "epoch": 0.16791664166716666, + "grad_norm": 0.2948633502995829, + "learning_rate": 9.923951255004577e-06, + "loss": 0.3083, + "step": 2099 + }, + { + "epoch": 0.16799664006719867, + "grad_norm": 0.34358517584085074, + "learning_rate": 9.923838668617238e-06, + "loss": 0.2702, + "step": 2100 + }, + { + "epoch": 0.16807663846723064, + "grad_norm": 0.33105465344399826, + "learning_rate": 9.923725999591846e-06, + "loss": 0.288, + "step": 2101 + }, + { + "epoch": 0.16815663686726265, + "grad_norm": 0.3309024643305375, + "learning_rate": 9.923613247930293e-06, + "loss": 0.2726, + "step": 2102 + }, + { + "epoch": 0.16823663526729465, + "grad_norm": 0.3249873474552501, + "learning_rate": 9.92350041363447e-06, + "loss": 0.3193, + "step": 2103 + }, + { + "epoch": 0.16831663366732666, + "grad_norm": 0.2694635218051305, + "learning_rate": 9.923387496706273e-06, + "loss": 0.3221, + "step": 2104 + }, + { + "epoch": 0.16839663206735866, + "grad_norm": 0.3185256235992889, + "learning_rate": 9.923274497147595e-06, + "loss": 0.2784, + "step": 2105 + }, + { + "epoch": 0.16847663046739064, + "grad_norm": 0.34825592715875764, + "learning_rate": 9.923161414960331e-06, + "loss": 0.2789, + "step": 2106 + }, + { + "epoch": 0.16855662886742265, + "grad_norm": 0.2930165114204921, + "learning_rate": 9.923048250146383e-06, + "loss": 0.3036, + "step": 2107 + }, + { + "epoch": 0.16863662726745465, + "grad_norm": 0.25457050039476486, + "learning_rate": 9.92293500270765e-06, + "loss": 0.3279, + "step": 2108 + }, + { + "epoch": 0.16871662566748666, + "grad_norm": 0.30317993746597344, + "learning_rate": 9.922821672646028e-06, + "loss": 0.2668, + "step": 2109 + }, + { + "epoch": 0.16879662406751864, + "grad_norm": 0.29354054918647643, + "learning_rate": 9.922708259963423e-06, + "loss": 0.2891, + "step": 2110 + }, + { + "epoch": 0.16887662246755064, + "grad_norm": 0.2307833362333732, + "learning_rate": 9.922594764661737e-06, + "loss": 0.3437, + "step": 2111 + }, + { + "epoch": 0.16895662086758265, + "grad_norm": 0.2893513886646398, + "learning_rate": 9.922481186742875e-06, + "loss": 0.289, + "step": 2112 + }, + { + "epoch": 0.16903661926761465, + "grad_norm": 0.31843158846122066, + "learning_rate": 9.922367526208746e-06, + "loss": 0.285, + "step": 2113 + }, + { + "epoch": 0.16911661766764666, + "grad_norm": 0.36210982886282705, + "learning_rate": 9.922253783061253e-06, + "loss": 0.2893, + "step": 2114 + }, + { + "epoch": 0.16919661606767863, + "grad_norm": 0.313219105341854, + "learning_rate": 9.922139957302308e-06, + "loss": 0.2751, + "step": 2115 + }, + { + "epoch": 0.16927661446771064, + "grad_norm": 0.3003082046460499, + "learning_rate": 9.922026048933819e-06, + "loss": 0.2952, + "step": 2116 + }, + { + "epoch": 0.16935661286774265, + "grad_norm": 0.2514870859616452, + "learning_rate": 9.921912057957701e-06, + "loss": 0.3307, + "step": 2117 + }, + { + "epoch": 0.16943661126777465, + "grad_norm": 0.28624968869974593, + "learning_rate": 9.921797984375866e-06, + "loss": 0.3017, + "step": 2118 + }, + { + "epoch": 0.16951660966780666, + "grad_norm": 0.29064446688182977, + "learning_rate": 9.921683828190225e-06, + "loss": 0.2957, + "step": 2119 + }, + { + "epoch": 0.16959660806783863, + "grad_norm": 0.28869322741215836, + "learning_rate": 9.9215695894027e-06, + "loss": 0.3143, + "step": 2120 + }, + { + "epoch": 0.16967660646787064, + "grad_norm": 0.2387779742533509, + "learning_rate": 9.9214552680152e-06, + "loss": 0.3612, + "step": 2121 + }, + { + "epoch": 0.16975660486790264, + "grad_norm": 0.3197505704089971, + "learning_rate": 9.921340864029653e-06, + "loss": 0.2959, + "step": 2122 + }, + { + "epoch": 0.16983660326793465, + "grad_norm": 0.36126638193109434, + "learning_rate": 9.921226377447975e-06, + "loss": 0.2942, + "step": 2123 + }, + { + "epoch": 0.16991660166796663, + "grad_norm": 0.5321695368531306, + "learning_rate": 9.921111808272087e-06, + "loss": 0.2913, + "step": 2124 + }, + { + "epoch": 0.16999660006799863, + "grad_norm": 0.3193250433671711, + "learning_rate": 9.920997156503912e-06, + "loss": 0.2823, + "step": 2125 + }, + { + "epoch": 0.17007659846803064, + "grad_norm": 0.3023044382349287, + "learning_rate": 9.920882422145372e-06, + "loss": 0.3213, + "step": 2126 + }, + { + "epoch": 0.17015659686806264, + "grad_norm": 0.3199438722910674, + "learning_rate": 9.920767605198396e-06, + "loss": 0.2755, + "step": 2127 + }, + { + "epoch": 0.17023659526809465, + "grad_norm": 0.34971011123388573, + "learning_rate": 9.920652705664912e-06, + "loss": 0.2809, + "step": 2128 + }, + { + "epoch": 0.17031659366812663, + "grad_norm": 0.2462277837906863, + "learning_rate": 9.920537723546843e-06, + "loss": 0.332, + "step": 2129 + }, + { + "epoch": 0.17039659206815863, + "grad_norm": 0.3247660198411913, + "learning_rate": 9.920422658846126e-06, + "loss": 0.2644, + "step": 2130 + }, + { + "epoch": 0.17047659046819064, + "grad_norm": 0.3044339050533969, + "learning_rate": 9.920307511564686e-06, + "loss": 0.3046, + "step": 2131 + }, + { + "epoch": 0.17055658886822264, + "grad_norm": 0.26308903269882067, + "learning_rate": 9.92019228170446e-06, + "loss": 0.3528, + "step": 2132 + }, + { + "epoch": 0.17063658726825465, + "grad_norm": 0.2771894539067521, + "learning_rate": 9.920076969267375e-06, + "loss": 0.2973, + "step": 2133 + }, + { + "epoch": 0.17071658566828662, + "grad_norm": 0.6356136314750569, + "learning_rate": 9.919961574255377e-06, + "loss": 0.3051, + "step": 2134 + }, + { + "epoch": 0.17079658406831863, + "grad_norm": 0.44399253202524924, + "learning_rate": 9.919846096670393e-06, + "loss": 0.3121, + "step": 2135 + }, + { + "epoch": 0.17087658246835064, + "grad_norm": 0.30981264347813436, + "learning_rate": 9.919730536514367e-06, + "loss": 0.2629, + "step": 2136 + }, + { + "epoch": 0.17095658086838264, + "grad_norm": 0.24137235564041146, + "learning_rate": 9.919614893789234e-06, + "loss": 0.3213, + "step": 2137 + }, + { + "epoch": 0.17103657926841462, + "grad_norm": 0.25430767273304306, + "learning_rate": 9.919499168496938e-06, + "loss": 0.342, + "step": 2138 + }, + { + "epoch": 0.17111657766844662, + "grad_norm": 0.27598393231255003, + "learning_rate": 9.919383360639423e-06, + "loss": 0.2938, + "step": 2139 + }, + { + "epoch": 0.17119657606847863, + "grad_norm": 0.2756439053478478, + "learning_rate": 9.919267470218628e-06, + "loss": 0.3301, + "step": 2140 + }, + { + "epoch": 0.17127657446851063, + "grad_norm": 0.23035634134485747, + "learning_rate": 9.9191514972365e-06, + "loss": 0.3553, + "step": 2141 + }, + { + "epoch": 0.17135657286854264, + "grad_norm": 0.37963777035644597, + "learning_rate": 9.919035441694985e-06, + "loss": 0.2998, + "step": 2142 + }, + { + "epoch": 0.17143657126857462, + "grad_norm": 0.28215279339902427, + "learning_rate": 9.918919303596034e-06, + "loss": 0.3223, + "step": 2143 + }, + { + "epoch": 0.17151656966860662, + "grad_norm": 0.5246272151869219, + "learning_rate": 9.91880308294159e-06, + "loss": 0.3085, + "step": 2144 + }, + { + "epoch": 0.17159656806863863, + "grad_norm": 0.3491492115950614, + "learning_rate": 9.918686779733608e-06, + "loss": 0.3138, + "step": 2145 + }, + { + "epoch": 0.17167656646867063, + "grad_norm": 0.21142573446764898, + "learning_rate": 9.918570393974041e-06, + "loss": 0.3555, + "step": 2146 + }, + { + "epoch": 0.17175656486870264, + "grad_norm": 0.3082936233998234, + "learning_rate": 9.91845392566484e-06, + "loss": 0.2988, + "step": 2147 + }, + { + "epoch": 0.17183656326873462, + "grad_norm": 0.3405303974408421, + "learning_rate": 9.918337374807958e-06, + "loss": 0.2972, + "step": 2148 + }, + { + "epoch": 0.17191656166876662, + "grad_norm": 0.3715830651706167, + "learning_rate": 9.918220741405356e-06, + "loss": 0.2823, + "step": 2149 + }, + { + "epoch": 0.17199656006879863, + "grad_norm": 0.29513003022903345, + "learning_rate": 9.918104025458985e-06, + "loss": 0.2964, + "step": 2150 + }, + { + "epoch": 0.17207655846883063, + "grad_norm": 0.41357158020023543, + "learning_rate": 9.917987226970811e-06, + "loss": 0.2993, + "step": 2151 + }, + { + "epoch": 0.1721565568688626, + "grad_norm": 0.29572195974882093, + "learning_rate": 9.917870345942789e-06, + "loss": 0.3297, + "step": 2152 + }, + { + "epoch": 0.17223655526889461, + "grad_norm": 0.34651737400198596, + "learning_rate": 9.917753382376883e-06, + "loss": 0.2663, + "step": 2153 + }, + { + "epoch": 0.17231655366892662, + "grad_norm": 0.29111979679515226, + "learning_rate": 9.917636336275055e-06, + "loss": 0.2925, + "step": 2154 + }, + { + "epoch": 0.17239655206895863, + "grad_norm": 0.32741385639702947, + "learning_rate": 9.91751920763927e-06, + "loss": 0.2921, + "step": 2155 + }, + { + "epoch": 0.17247655046899063, + "grad_norm": 0.33394487197858524, + "learning_rate": 9.917401996471494e-06, + "loss": 0.2663, + "step": 2156 + }, + { + "epoch": 0.1725565488690226, + "grad_norm": 0.2814006594577355, + "learning_rate": 9.917284702773692e-06, + "loss": 0.289, + "step": 2157 + }, + { + "epoch": 0.1726365472690546, + "grad_norm": 0.2802053100578063, + "learning_rate": 9.917167326547837e-06, + "loss": 0.3108, + "step": 2158 + }, + { + "epoch": 0.17271654566908662, + "grad_norm": 0.2714124651698787, + "learning_rate": 9.917049867795896e-06, + "loss": 0.2769, + "step": 2159 + }, + { + "epoch": 0.17279654406911862, + "grad_norm": 1.5360669815385892, + "learning_rate": 9.91693232651984e-06, + "loss": 0.3267, + "step": 2160 + }, + { + "epoch": 0.17287654246915063, + "grad_norm": 0.32758573693625803, + "learning_rate": 9.916814702721641e-06, + "loss": 0.2832, + "step": 2161 + }, + { + "epoch": 0.1729565408691826, + "grad_norm": 0.28747517663749467, + "learning_rate": 9.916696996403279e-06, + "loss": 0.3042, + "step": 2162 + }, + { + "epoch": 0.1730365392692146, + "grad_norm": 0.3223179020947917, + "learning_rate": 9.916579207566721e-06, + "loss": 0.2953, + "step": 2163 + }, + { + "epoch": 0.17311653766924662, + "grad_norm": 0.3243979792635454, + "learning_rate": 9.916461336213949e-06, + "loss": 0.2689, + "step": 2164 + }, + { + "epoch": 0.17319653606927862, + "grad_norm": 0.28971419130451553, + "learning_rate": 9.916343382346942e-06, + "loss": 0.3334, + "step": 2165 + }, + { + "epoch": 0.1732765344693106, + "grad_norm": 0.25956534006906684, + "learning_rate": 9.916225345967677e-06, + "loss": 0.324, + "step": 2166 + }, + { + "epoch": 0.1733565328693426, + "grad_norm": 0.2740661587980621, + "learning_rate": 9.916107227078133e-06, + "loss": 0.3338, + "step": 2167 + }, + { + "epoch": 0.1734365312693746, + "grad_norm": 0.21451879926146697, + "learning_rate": 9.915989025680299e-06, + "loss": 0.3592, + "step": 2168 + }, + { + "epoch": 0.17351652966940662, + "grad_norm": 0.31288754023635434, + "learning_rate": 9.915870741776153e-06, + "loss": 0.2874, + "step": 2169 + }, + { + "epoch": 0.17359652806943862, + "grad_norm": 0.29586617821913547, + "learning_rate": 9.915752375367681e-06, + "loss": 0.3099, + "step": 2170 + }, + { + "epoch": 0.1736765264694706, + "grad_norm": 0.3282065165377944, + "learning_rate": 9.915633926456874e-06, + "loss": 0.3029, + "step": 2171 + }, + { + "epoch": 0.1737565248695026, + "grad_norm": 0.4684352255002045, + "learning_rate": 9.915515395045715e-06, + "loss": 0.2777, + "step": 2172 + }, + { + "epoch": 0.1738365232695346, + "grad_norm": 0.32252223255515816, + "learning_rate": 9.915396781136197e-06, + "loss": 0.2889, + "step": 2173 + }, + { + "epoch": 0.17391652166956661, + "grad_norm": 0.25721943591637264, + "learning_rate": 9.91527808473031e-06, + "loss": 0.3229, + "step": 2174 + }, + { + "epoch": 0.17399652006959862, + "grad_norm": 0.32343137010678946, + "learning_rate": 9.91515930583004e-06, + "loss": 0.3073, + "step": 2175 + }, + { + "epoch": 0.1740765184696306, + "grad_norm": 0.5722792524137117, + "learning_rate": 9.91504044443739e-06, + "loss": 0.279, + "step": 2176 + }, + { + "epoch": 0.1741565168696626, + "grad_norm": 0.19970854531086682, + "learning_rate": 9.914921500554347e-06, + "loss": 0.3548, + "step": 2177 + }, + { + "epoch": 0.1742365152696946, + "grad_norm": 0.3349355759935075, + "learning_rate": 9.914802474182912e-06, + "loss": 0.295, + "step": 2178 + }, + { + "epoch": 0.1743165136697266, + "grad_norm": 0.24411384223195468, + "learning_rate": 9.914683365325083e-06, + "loss": 0.3389, + "step": 2179 + }, + { + "epoch": 0.1743965120697586, + "grad_norm": 0.3269832203187113, + "learning_rate": 9.914564173982856e-06, + "loss": 0.2615, + "step": 2180 + }, + { + "epoch": 0.1744765104697906, + "grad_norm": 0.3180765032347907, + "learning_rate": 9.914444900158234e-06, + "loss": 0.2748, + "step": 2181 + }, + { + "epoch": 0.1745565088698226, + "grad_norm": 0.3444980005956865, + "learning_rate": 9.914325543853216e-06, + "loss": 0.2573, + "step": 2182 + }, + { + "epoch": 0.1746365072698546, + "grad_norm": 0.3414980646435654, + "learning_rate": 9.914206105069806e-06, + "loss": 0.299, + "step": 2183 + }, + { + "epoch": 0.1747165056698866, + "grad_norm": 0.2821911036885186, + "learning_rate": 9.91408658381001e-06, + "loss": 0.3067, + "step": 2184 + }, + { + "epoch": 0.1747965040699186, + "grad_norm": 0.2773653704557142, + "learning_rate": 9.913966980075834e-06, + "loss": 0.3194, + "step": 2185 + }, + { + "epoch": 0.1748765024699506, + "grad_norm": 0.2585870690064825, + "learning_rate": 9.913847293869286e-06, + "loss": 0.3259, + "step": 2186 + }, + { + "epoch": 0.1749565008699826, + "grad_norm": 0.3259601159768096, + "learning_rate": 9.91372752519237e-06, + "loss": 0.3211, + "step": 2187 + }, + { + "epoch": 0.1750364992700146, + "grad_norm": 0.34908121210768117, + "learning_rate": 9.913607674047102e-06, + "loss": 0.2756, + "step": 2188 + }, + { + "epoch": 0.1751164976700466, + "grad_norm": 0.33014016279699515, + "learning_rate": 9.91348774043549e-06, + "loss": 0.287, + "step": 2189 + }, + { + "epoch": 0.1751964960700786, + "grad_norm": 0.32849560365908476, + "learning_rate": 9.913367724359548e-06, + "loss": 0.2913, + "step": 2190 + }, + { + "epoch": 0.1752764944701106, + "grad_norm": 0.3837399282874312, + "learning_rate": 9.91324762582129e-06, + "loss": 0.3003, + "step": 2191 + }, + { + "epoch": 0.1753564928701426, + "grad_norm": 0.3108473247678813, + "learning_rate": 9.913127444822732e-06, + "loss": 0.2785, + "step": 2192 + }, + { + "epoch": 0.1754364912701746, + "grad_norm": 0.30946056425886553, + "learning_rate": 9.91300718136589e-06, + "loss": 0.3177, + "step": 2193 + }, + { + "epoch": 0.17551648967020658, + "grad_norm": 0.2122239140868899, + "learning_rate": 9.912886835452783e-06, + "loss": 0.3556, + "step": 2194 + }, + { + "epoch": 0.1755964880702386, + "grad_norm": 0.3349836023569884, + "learning_rate": 9.912766407085432e-06, + "loss": 0.2818, + "step": 2195 + }, + { + "epoch": 0.1756764864702706, + "grad_norm": 0.3308669166261636, + "learning_rate": 9.912645896265858e-06, + "loss": 0.2892, + "step": 2196 + }, + { + "epoch": 0.1757564848703026, + "grad_norm": 0.2930554454074347, + "learning_rate": 9.912525302996081e-06, + "loss": 0.2817, + "step": 2197 + }, + { + "epoch": 0.1758364832703346, + "grad_norm": 0.32796863130746, + "learning_rate": 9.912404627278128e-06, + "loss": 0.2665, + "step": 2198 + }, + { + "epoch": 0.17591648167036658, + "grad_norm": 0.3623872542284336, + "learning_rate": 9.91228386911402e-06, + "loss": 0.2783, + "step": 2199 + }, + { + "epoch": 0.1759964800703986, + "grad_norm": 0.28351562454059315, + "learning_rate": 9.91216302850579e-06, + "loss": 0.3174, + "step": 2200 + }, + { + "epoch": 0.1760764784704306, + "grad_norm": 0.28471547802592956, + "learning_rate": 9.912042105455462e-06, + "loss": 0.3494, + "step": 2201 + }, + { + "epoch": 0.1761564768704626, + "grad_norm": 0.3050931235469551, + "learning_rate": 9.911921099965066e-06, + "loss": 0.2967, + "step": 2202 + }, + { + "epoch": 0.1762364752704946, + "grad_norm": 0.331954750263589, + "learning_rate": 9.911800012036633e-06, + "loss": 0.2696, + "step": 2203 + }, + { + "epoch": 0.17631647367052658, + "grad_norm": 0.2633551953189909, + "learning_rate": 9.911678841672196e-06, + "loss": 0.3294, + "step": 2204 + }, + { + "epoch": 0.17639647207055859, + "grad_norm": 0.3647429766984875, + "learning_rate": 9.911557588873787e-06, + "loss": 0.2876, + "step": 2205 + }, + { + "epoch": 0.1764764704705906, + "grad_norm": 0.252049420944994, + "learning_rate": 9.911436253643445e-06, + "loss": 0.3428, + "step": 2206 + }, + { + "epoch": 0.1765564688706226, + "grad_norm": 0.3112973492211981, + "learning_rate": 9.911314835983202e-06, + "loss": 0.2766, + "step": 2207 + }, + { + "epoch": 0.17663646727065457, + "grad_norm": 0.2940265450023907, + "learning_rate": 9.911193335895095e-06, + "loss": 0.3167, + "step": 2208 + }, + { + "epoch": 0.17671646567068658, + "grad_norm": 0.2620645115030399, + "learning_rate": 9.911071753381168e-06, + "loss": 0.3302, + "step": 2209 + }, + { + "epoch": 0.17679646407071858, + "grad_norm": 0.3055631349597738, + "learning_rate": 9.910950088443455e-06, + "loss": 0.2985, + "step": 2210 + }, + { + "epoch": 0.1768764624707506, + "grad_norm": 0.33723422007601406, + "learning_rate": 9.910828341084006e-06, + "loss": 0.2738, + "step": 2211 + }, + { + "epoch": 0.1769564608707826, + "grad_norm": 0.255736892567754, + "learning_rate": 9.91070651130486e-06, + "loss": 0.351, + "step": 2212 + }, + { + "epoch": 0.17703645927081457, + "grad_norm": 0.2823110740658937, + "learning_rate": 9.91058459910806e-06, + "loss": 0.3351, + "step": 2213 + }, + { + "epoch": 0.17711645767084658, + "grad_norm": 0.3605496428016245, + "learning_rate": 9.910462604495655e-06, + "loss": 0.2656, + "step": 2214 + }, + { + "epoch": 0.17719645607087858, + "grad_norm": 0.22269199338176454, + "learning_rate": 9.910340527469692e-06, + "loss": 0.3578, + "step": 2215 + }, + { + "epoch": 0.1772764544709106, + "grad_norm": 0.36646975328893466, + "learning_rate": 9.910218368032219e-06, + "loss": 0.2822, + "step": 2216 + }, + { + "epoch": 0.1773564528709426, + "grad_norm": 0.3654899343414024, + "learning_rate": 9.910096126185286e-06, + "loss": 0.3041, + "step": 2217 + }, + { + "epoch": 0.17743645127097457, + "grad_norm": 0.30592231878782594, + "learning_rate": 9.909973801930946e-06, + "loss": 0.2759, + "step": 2218 + }, + { + "epoch": 0.17751644967100658, + "grad_norm": 0.33404706369006754, + "learning_rate": 9.90985139527125e-06, + "loss": 0.3037, + "step": 2219 + }, + { + "epoch": 0.17759644807103858, + "grad_norm": 0.32606663944582154, + "learning_rate": 9.909728906208254e-06, + "loss": 0.3073, + "step": 2220 + }, + { + "epoch": 0.1776764464710706, + "grad_norm": 0.3051769727405732, + "learning_rate": 9.909606334744013e-06, + "loss": 0.2745, + "step": 2221 + }, + { + "epoch": 0.17775644487110256, + "grad_norm": 0.24122210105408814, + "learning_rate": 9.909483680880585e-06, + "loss": 0.3296, + "step": 2222 + }, + { + "epoch": 0.17783644327113457, + "grad_norm": 0.394209021182731, + "learning_rate": 9.909360944620027e-06, + "loss": 0.3157, + "step": 2223 + }, + { + "epoch": 0.17791644167116658, + "grad_norm": 0.4478452126833092, + "learning_rate": 9.909238125964403e-06, + "loss": 0.2772, + "step": 2224 + }, + { + "epoch": 0.17799644007119858, + "grad_norm": 0.24278381621843823, + "learning_rate": 9.909115224915768e-06, + "loss": 0.3057, + "step": 2225 + }, + { + "epoch": 0.17807643847123059, + "grad_norm": 0.3201935677327144, + "learning_rate": 9.908992241476189e-06, + "loss": 0.2961, + "step": 2226 + }, + { + "epoch": 0.17815643687126256, + "grad_norm": 0.2982477515771541, + "learning_rate": 9.90886917564773e-06, + "loss": 0.3123, + "step": 2227 + }, + { + "epoch": 0.17823643527129457, + "grad_norm": 0.2977213542669658, + "learning_rate": 9.908746027432453e-06, + "loss": 0.2883, + "step": 2228 + }, + { + "epoch": 0.17831643367132657, + "grad_norm": 0.33105000203470697, + "learning_rate": 9.908622796832427e-06, + "loss": 0.2722, + "step": 2229 + }, + { + "epoch": 0.17839643207135858, + "grad_norm": 0.2887265699872081, + "learning_rate": 9.908499483849723e-06, + "loss": 0.3036, + "step": 2230 + }, + { + "epoch": 0.17847643047139058, + "grad_norm": 0.31224244102902876, + "learning_rate": 9.908376088486407e-06, + "loss": 0.3219, + "step": 2231 + }, + { + "epoch": 0.17855642887142256, + "grad_norm": 0.321798343767336, + "learning_rate": 9.908252610744552e-06, + "loss": 0.2768, + "step": 2232 + }, + { + "epoch": 0.17863642727145457, + "grad_norm": 0.27482416388292685, + "learning_rate": 9.908129050626228e-06, + "loss": 0.3041, + "step": 2233 + }, + { + "epoch": 0.17871642567148657, + "grad_norm": 0.2904326813932616, + "learning_rate": 9.90800540813351e-06, + "loss": 0.3088, + "step": 2234 + }, + { + "epoch": 0.17879642407151858, + "grad_norm": 0.335847530744689, + "learning_rate": 9.907881683268472e-06, + "loss": 0.2777, + "step": 2235 + }, + { + "epoch": 0.17887642247155056, + "grad_norm": 0.2821625580121489, + "learning_rate": 9.907757876033193e-06, + "loss": 0.3041, + "step": 2236 + }, + { + "epoch": 0.17895642087158256, + "grad_norm": 0.2846221076025696, + "learning_rate": 9.90763398642975e-06, + "loss": 0.2897, + "step": 2237 + }, + { + "epoch": 0.17903641927161457, + "grad_norm": 0.29549203407902536, + "learning_rate": 9.907510014460222e-06, + "loss": 0.3196, + "step": 2238 + }, + { + "epoch": 0.17911641767164657, + "grad_norm": 0.27495954725428484, + "learning_rate": 9.907385960126689e-06, + "loss": 0.3042, + "step": 2239 + }, + { + "epoch": 0.17919641607167858, + "grad_norm": 0.3105245894150985, + "learning_rate": 9.907261823431236e-06, + "loss": 0.2705, + "step": 2240 + }, + { + "epoch": 0.17927641447171055, + "grad_norm": 0.3228019357195032, + "learning_rate": 9.907137604375941e-06, + "loss": 0.3013, + "step": 2241 + }, + { + "epoch": 0.17935641287174256, + "grad_norm": 0.3006033171754618, + "learning_rate": 9.907013302962893e-06, + "loss": 0.312, + "step": 2242 + }, + { + "epoch": 0.17943641127177457, + "grad_norm": 0.30520327028996524, + "learning_rate": 9.906888919194178e-06, + "loss": 0.3038, + "step": 2243 + }, + { + "epoch": 0.17951640967180657, + "grad_norm": 0.3354768103624835, + "learning_rate": 9.90676445307188e-06, + "loss": 0.2937, + "step": 2244 + }, + { + "epoch": 0.17959640807183858, + "grad_norm": 0.2764242095226642, + "learning_rate": 9.906639904598092e-06, + "loss": 0.3007, + "step": 2245 + }, + { + "epoch": 0.17967640647187055, + "grad_norm": 0.40074569474046123, + "learning_rate": 9.906515273774903e-06, + "loss": 0.3466, + "step": 2246 + }, + { + "epoch": 0.17975640487190256, + "grad_norm": 0.30842910390571976, + "learning_rate": 9.906390560604404e-06, + "loss": 0.277, + "step": 2247 + }, + { + "epoch": 0.17983640327193456, + "grad_norm": 0.24426128626942076, + "learning_rate": 9.906265765088689e-06, + "loss": 0.3228, + "step": 2248 + }, + { + "epoch": 0.17991640167196657, + "grad_norm": 0.28849825786981287, + "learning_rate": 9.906140887229852e-06, + "loss": 0.3513, + "step": 2249 + }, + { + "epoch": 0.17999640007199855, + "grad_norm": 0.3029006063459155, + "learning_rate": 9.906015927029989e-06, + "loss": 0.3012, + "step": 2250 + }, + { + "epoch": 0.18007639847203055, + "grad_norm": 0.2655859289907593, + "learning_rate": 9.905890884491196e-06, + "loss": 0.3121, + "step": 2251 + }, + { + "epoch": 0.18015639687206256, + "grad_norm": 0.26261808571779227, + "learning_rate": 9.905765759615573e-06, + "loss": 0.3164, + "step": 2252 + }, + { + "epoch": 0.18023639527209456, + "grad_norm": 0.28710893334250004, + "learning_rate": 9.905640552405222e-06, + "loss": 0.3226, + "step": 2253 + }, + { + "epoch": 0.18031639367212657, + "grad_norm": 0.3055622488945309, + "learning_rate": 9.90551526286224e-06, + "loss": 0.3134, + "step": 2254 + }, + { + "epoch": 0.18039639207215855, + "grad_norm": 0.38955677908043257, + "learning_rate": 9.905389890988734e-06, + "loss": 0.2805, + "step": 2255 + }, + { + "epoch": 0.18047639047219055, + "grad_norm": 0.3218900597039972, + "learning_rate": 9.905264436786805e-06, + "loss": 0.3014, + "step": 2256 + }, + { + "epoch": 0.18055638887222256, + "grad_norm": 0.3083594158007672, + "learning_rate": 9.90513890025856e-06, + "loss": 0.3019, + "step": 2257 + }, + { + "epoch": 0.18063638727225456, + "grad_norm": 0.21367763820039123, + "learning_rate": 9.905013281406103e-06, + "loss": 0.3614, + "step": 2258 + }, + { + "epoch": 0.18071638567228657, + "grad_norm": 0.294859138635244, + "learning_rate": 9.904887580231548e-06, + "loss": 0.3055, + "step": 2259 + }, + { + "epoch": 0.18079638407231854, + "grad_norm": 0.30072604780617934, + "learning_rate": 9.904761796737002e-06, + "loss": 0.3064, + "step": 2260 + }, + { + "epoch": 0.18087638247235055, + "grad_norm": 0.34484316610442517, + "learning_rate": 9.904635930924573e-06, + "loss": 0.2805, + "step": 2261 + }, + { + "epoch": 0.18095638087238256, + "grad_norm": 0.31854529372785434, + "learning_rate": 9.904509982796377e-06, + "loss": 0.2563, + "step": 2262 + }, + { + "epoch": 0.18103637927241456, + "grad_norm": 0.29687984292796754, + "learning_rate": 9.904383952354528e-06, + "loss": 0.2956, + "step": 2263 + }, + { + "epoch": 0.18111637767244654, + "grad_norm": 0.3098839172490386, + "learning_rate": 9.90425783960114e-06, + "loss": 0.3195, + "step": 2264 + }, + { + "epoch": 0.18119637607247854, + "grad_norm": 0.32092088354044235, + "learning_rate": 9.904131644538327e-06, + "loss": 0.3098, + "step": 2265 + }, + { + "epoch": 0.18127637447251055, + "grad_norm": 0.3516159853967498, + "learning_rate": 9.904005367168212e-06, + "loss": 0.2808, + "step": 2266 + }, + { + "epoch": 0.18135637287254255, + "grad_norm": 0.36404985927140604, + "learning_rate": 9.903879007492912e-06, + "loss": 0.2952, + "step": 2267 + }, + { + "epoch": 0.18143637127257456, + "grad_norm": 0.37202064436635934, + "learning_rate": 9.903752565514546e-06, + "loss": 0.2979, + "step": 2268 + }, + { + "epoch": 0.18151636967260654, + "grad_norm": 0.3206488979925511, + "learning_rate": 9.90362604123524e-06, + "loss": 0.2947, + "step": 2269 + }, + { + "epoch": 0.18159636807263854, + "grad_norm": 0.3734315180807726, + "learning_rate": 9.903499434657113e-06, + "loss": 0.2745, + "step": 2270 + }, + { + "epoch": 0.18167636647267055, + "grad_norm": 0.3032924354676774, + "learning_rate": 9.903372745782294e-06, + "loss": 0.3062, + "step": 2271 + }, + { + "epoch": 0.18175636487270255, + "grad_norm": 0.29645708121221287, + "learning_rate": 9.903245974612906e-06, + "loss": 0.25, + "step": 2272 + }, + { + "epoch": 0.18183636327273456, + "grad_norm": 0.28676001952958413, + "learning_rate": 9.903119121151079e-06, + "loss": 0.3331, + "step": 2273 + }, + { + "epoch": 0.18191636167276654, + "grad_norm": 0.2536170025660417, + "learning_rate": 9.90299218539894e-06, + "loss": 0.3305, + "step": 2274 + }, + { + "epoch": 0.18199636007279854, + "grad_norm": 0.3522000072916745, + "learning_rate": 9.90286516735862e-06, + "loss": 0.2591, + "step": 2275 + }, + { + "epoch": 0.18207635847283055, + "grad_norm": 0.24633740218420444, + "learning_rate": 9.902738067032254e-06, + "loss": 0.3333, + "step": 2276 + }, + { + "epoch": 0.18215635687286255, + "grad_norm": 0.30346312674543047, + "learning_rate": 9.90261088442197e-06, + "loss": 0.312, + "step": 2277 + }, + { + "epoch": 0.18223635527289453, + "grad_norm": 0.3196980307588306, + "learning_rate": 9.902483619529905e-06, + "loss": 0.3131, + "step": 2278 + }, + { + "epoch": 0.18231635367292653, + "grad_norm": 0.27860081939721126, + "learning_rate": 9.902356272358196e-06, + "loss": 0.3342, + "step": 2279 + }, + { + "epoch": 0.18239635207295854, + "grad_norm": 0.3130070210387328, + "learning_rate": 9.902228842908979e-06, + "loss": 0.2538, + "step": 2280 + }, + { + "epoch": 0.18247635047299055, + "grad_norm": 0.3347523996311443, + "learning_rate": 9.902101331184391e-06, + "loss": 0.2607, + "step": 2281 + }, + { + "epoch": 0.18255634887302255, + "grad_norm": 0.27729371241521844, + "learning_rate": 9.901973737186576e-06, + "loss": 0.3363, + "step": 2282 + }, + { + "epoch": 0.18263634727305453, + "grad_norm": 0.3325465729213715, + "learning_rate": 9.901846060917673e-06, + "loss": 0.3153, + "step": 2283 + }, + { + "epoch": 0.18271634567308653, + "grad_norm": 0.3145602390673542, + "learning_rate": 9.901718302379823e-06, + "loss": 0.3193, + "step": 2284 + }, + { + "epoch": 0.18279634407311854, + "grad_norm": 0.39470343625608306, + "learning_rate": 9.901590461575175e-06, + "loss": 0.2666, + "step": 2285 + }, + { + "epoch": 0.18287634247315054, + "grad_norm": 0.29525973471194455, + "learning_rate": 9.901462538505871e-06, + "loss": 0.3126, + "step": 2286 + }, + { + "epoch": 0.18295634087318255, + "grad_norm": 0.3072257293867776, + "learning_rate": 9.901334533174058e-06, + "loss": 0.2806, + "step": 2287 + }, + { + "epoch": 0.18303633927321453, + "grad_norm": 0.2731369512213435, + "learning_rate": 9.901206445581886e-06, + "loss": 0.2875, + "step": 2288 + }, + { + "epoch": 0.18311633767324653, + "grad_norm": 0.36400054882727756, + "learning_rate": 9.901078275731504e-06, + "loss": 0.274, + "step": 2289 + }, + { + "epoch": 0.18319633607327854, + "grad_norm": 0.28652766435324106, + "learning_rate": 9.900950023625064e-06, + "loss": 0.3068, + "step": 2290 + }, + { + "epoch": 0.18327633447331054, + "grad_norm": 0.34934296758984823, + "learning_rate": 9.900821689264715e-06, + "loss": 0.2897, + "step": 2291 + }, + { + "epoch": 0.18335633287334252, + "grad_norm": 0.4846501409415694, + "learning_rate": 9.900693272652617e-06, + "loss": 0.2756, + "step": 2292 + }, + { + "epoch": 0.18343633127337453, + "grad_norm": 0.32872762804784045, + "learning_rate": 9.90056477379092e-06, + "loss": 0.2766, + "step": 2293 + }, + { + "epoch": 0.18351632967340653, + "grad_norm": 0.33164670451439615, + "learning_rate": 9.900436192681782e-06, + "loss": 0.3018, + "step": 2294 + }, + { + "epoch": 0.18359632807343854, + "grad_norm": 0.273337330998633, + "learning_rate": 9.90030752932736e-06, + "loss": 0.2867, + "step": 2295 + }, + { + "epoch": 0.18367632647347054, + "grad_norm": 0.3455921633556097, + "learning_rate": 9.900178783729817e-06, + "loss": 0.2884, + "step": 2296 + }, + { + "epoch": 0.18375632487350252, + "grad_norm": 0.36310582356318455, + "learning_rate": 9.90004995589131e-06, + "loss": 0.2635, + "step": 2297 + }, + { + "epoch": 0.18383632327353452, + "grad_norm": 0.2489700439381916, + "learning_rate": 9.899921045814002e-06, + "loss": 0.331, + "step": 2298 + }, + { + "epoch": 0.18391632167356653, + "grad_norm": 0.30137752427878106, + "learning_rate": 9.899792053500059e-06, + "loss": 0.2877, + "step": 2299 + }, + { + "epoch": 0.18399632007359853, + "grad_norm": 0.2933254651740342, + "learning_rate": 9.899662978951643e-06, + "loss": 0.2926, + "step": 2300 + }, + { + "epoch": 0.18407631847363054, + "grad_norm": 0.34122590759341387, + "learning_rate": 9.899533822170922e-06, + "loss": 0.2709, + "step": 2301 + }, + { + "epoch": 0.18415631687366252, + "grad_norm": 0.28604807662561615, + "learning_rate": 9.899404583160064e-06, + "loss": 0.3031, + "step": 2302 + }, + { + "epoch": 0.18423631527369452, + "grad_norm": 0.2814917599428059, + "learning_rate": 9.899275261921236e-06, + "loss": 0.298, + "step": 2303 + }, + { + "epoch": 0.18431631367372653, + "grad_norm": 0.29310475781460477, + "learning_rate": 9.899145858456609e-06, + "loss": 0.3053, + "step": 2304 + }, + { + "epoch": 0.18439631207375853, + "grad_norm": 0.3081457235605028, + "learning_rate": 9.899016372768355e-06, + "loss": 0.2816, + "step": 2305 + }, + { + "epoch": 0.1844763104737905, + "grad_norm": 0.3082793240269065, + "learning_rate": 9.898886804858648e-06, + "loss": 0.2713, + "step": 2306 + }, + { + "epoch": 0.18455630887382252, + "grad_norm": 0.3321396278641944, + "learning_rate": 9.898757154729663e-06, + "loss": 0.2787, + "step": 2307 + }, + { + "epoch": 0.18463630727385452, + "grad_norm": 0.3466185324405436, + "learning_rate": 9.898627422383575e-06, + "loss": 0.2975, + "step": 2308 + }, + { + "epoch": 0.18471630567388653, + "grad_norm": 0.3422615722618602, + "learning_rate": 9.898497607822561e-06, + "loss": 0.281, + "step": 2309 + }, + { + "epoch": 0.18479630407391853, + "grad_norm": 0.2614109809738245, + "learning_rate": 9.8983677110488e-06, + "loss": 0.3467, + "step": 2310 + }, + { + "epoch": 0.1848763024739505, + "grad_norm": 0.3204206366719577, + "learning_rate": 9.898237732064472e-06, + "loss": 0.2933, + "step": 2311 + }, + { + "epoch": 0.18495630087398252, + "grad_norm": 0.5003514894176285, + "learning_rate": 9.898107670871756e-06, + "loss": 0.2933, + "step": 2312 + }, + { + "epoch": 0.18503629927401452, + "grad_norm": 0.2785755067958378, + "learning_rate": 9.897977527472842e-06, + "loss": 0.2984, + "step": 2313 + }, + { + "epoch": 0.18511629767404653, + "grad_norm": 0.3972639137161272, + "learning_rate": 9.897847301869907e-06, + "loss": 0.2993, + "step": 2314 + }, + { + "epoch": 0.18519629607407853, + "grad_norm": 0.2606780564175842, + "learning_rate": 9.89771699406514e-06, + "loss": 0.3337, + "step": 2315 + }, + { + "epoch": 0.1852762944741105, + "grad_norm": 0.2498395521344486, + "learning_rate": 9.89758660406073e-06, + "loss": 0.3398, + "step": 2316 + }, + { + "epoch": 0.18535629287414251, + "grad_norm": 0.2730345002102742, + "learning_rate": 9.89745613185886e-06, + "loss": 0.3155, + "step": 2317 + }, + { + "epoch": 0.18543629127417452, + "grad_norm": 0.3744686115512236, + "learning_rate": 9.897325577461721e-06, + "loss": 0.2704, + "step": 2318 + }, + { + "epoch": 0.18551628967420652, + "grad_norm": 0.2984615541065681, + "learning_rate": 9.897194940871509e-06, + "loss": 0.3177, + "step": 2319 + }, + { + "epoch": 0.1855962880742385, + "grad_norm": 0.3209764472706581, + "learning_rate": 9.897064222090411e-06, + "loss": 0.2944, + "step": 2320 + }, + { + "epoch": 0.1856762864742705, + "grad_norm": 0.2726405223376503, + "learning_rate": 9.896933421120623e-06, + "loss": 0.3089, + "step": 2321 + }, + { + "epoch": 0.1857562848743025, + "grad_norm": 0.3121191102482848, + "learning_rate": 9.89680253796434e-06, + "loss": 0.2605, + "step": 2322 + }, + { + "epoch": 0.18583628327433452, + "grad_norm": 0.25571568023466, + "learning_rate": 9.89667157262376e-06, + "loss": 0.3052, + "step": 2323 + }, + { + "epoch": 0.18591628167436652, + "grad_norm": 0.25807789166266315, + "learning_rate": 9.89654052510108e-06, + "loss": 0.3427, + "step": 2324 + }, + { + "epoch": 0.1859962800743985, + "grad_norm": 0.7118819727249713, + "learning_rate": 9.896409395398499e-06, + "loss": 0.3064, + "step": 2325 + }, + { + "epoch": 0.1860762784744305, + "grad_norm": 0.3534721975326911, + "learning_rate": 9.896278183518216e-06, + "loss": 0.2977, + "step": 2326 + }, + { + "epoch": 0.1861562768744625, + "grad_norm": 0.30787784279364555, + "learning_rate": 9.896146889462438e-06, + "loss": 0.2865, + "step": 2327 + }, + { + "epoch": 0.18623627527449452, + "grad_norm": 0.33335676465309655, + "learning_rate": 9.896015513233364e-06, + "loss": 0.2865, + "step": 2328 + }, + { + "epoch": 0.18631627367452652, + "grad_norm": 0.26473899251373073, + "learning_rate": 9.895884054833202e-06, + "loss": 0.2899, + "step": 2329 + }, + { + "epoch": 0.1863962720745585, + "grad_norm": 0.25715972314468105, + "learning_rate": 9.895752514264156e-06, + "loss": 0.3261, + "step": 2330 + }, + { + "epoch": 0.1864762704745905, + "grad_norm": 0.2672529629664939, + "learning_rate": 9.895620891528437e-06, + "loss": 0.3356, + "step": 2331 + }, + { + "epoch": 0.1865562688746225, + "grad_norm": 0.3391599346840918, + "learning_rate": 9.895489186628248e-06, + "loss": 0.2819, + "step": 2332 + }, + { + "epoch": 0.18663626727465452, + "grad_norm": 0.27913093583156906, + "learning_rate": 9.895357399565806e-06, + "loss": 0.3144, + "step": 2333 + }, + { + "epoch": 0.1867162656746865, + "grad_norm": 0.2544007551203407, + "learning_rate": 9.89522553034332e-06, + "loss": 0.3348, + "step": 2334 + }, + { + "epoch": 0.1867962640747185, + "grad_norm": 0.4697885513289864, + "learning_rate": 9.895093578963002e-06, + "loss": 0.289, + "step": 2335 + }, + { + "epoch": 0.1868762624747505, + "grad_norm": 0.22722016232959383, + "learning_rate": 9.894961545427069e-06, + "loss": 0.362, + "step": 2336 + }, + { + "epoch": 0.1869562608747825, + "grad_norm": 0.3023447793126911, + "learning_rate": 9.894829429737734e-06, + "loss": 0.2609, + "step": 2337 + }, + { + "epoch": 0.18703625927481451, + "grad_norm": 0.29337839797537213, + "learning_rate": 9.894697231897216e-06, + "loss": 0.3222, + "step": 2338 + }, + { + "epoch": 0.1871162576748465, + "grad_norm": 0.34616709657520256, + "learning_rate": 9.894564951907737e-06, + "loss": 0.2818, + "step": 2339 + }, + { + "epoch": 0.1871962560748785, + "grad_norm": 0.35697138727337746, + "learning_rate": 9.894432589771512e-06, + "loss": 0.2914, + "step": 2340 + }, + { + "epoch": 0.1872762544749105, + "grad_norm": 0.2999035465249017, + "learning_rate": 9.894300145490763e-06, + "loss": 0.3108, + "step": 2341 + }, + { + "epoch": 0.1873562528749425, + "grad_norm": 0.30889385836091476, + "learning_rate": 9.894167619067715e-06, + "loss": 0.3056, + "step": 2342 + }, + { + "epoch": 0.1874362512749745, + "grad_norm": 0.4227377197747017, + "learning_rate": 9.894035010504592e-06, + "loss": 0.2708, + "step": 2343 + }, + { + "epoch": 0.1875162496750065, + "grad_norm": 0.3051002526055171, + "learning_rate": 9.893902319803619e-06, + "loss": 0.334, + "step": 2344 + }, + { + "epoch": 0.1875962480750385, + "grad_norm": 0.3437984879597763, + "learning_rate": 9.893769546967023e-06, + "loss": 0.2677, + "step": 2345 + }, + { + "epoch": 0.1876762464750705, + "grad_norm": 0.30737032156345057, + "learning_rate": 9.89363669199703e-06, + "loss": 0.2619, + "step": 2346 + }, + { + "epoch": 0.1877562448751025, + "grad_norm": 0.29070417097587087, + "learning_rate": 9.893503754895874e-06, + "loss": 0.3198, + "step": 2347 + }, + { + "epoch": 0.18783624327513448, + "grad_norm": 0.3386429972378958, + "learning_rate": 9.893370735665784e-06, + "loss": 0.2841, + "step": 2348 + }, + { + "epoch": 0.1879162416751665, + "grad_norm": 0.323953695603506, + "learning_rate": 9.893237634308995e-06, + "loss": 0.2705, + "step": 2349 + }, + { + "epoch": 0.1879962400751985, + "grad_norm": 0.30627914516461774, + "learning_rate": 9.893104450827736e-06, + "loss": 0.2691, + "step": 2350 + }, + { + "epoch": 0.1880762384752305, + "grad_norm": 0.3227794432786858, + "learning_rate": 9.892971185224244e-06, + "loss": 0.2842, + "step": 2351 + }, + { + "epoch": 0.1881562368752625, + "grad_norm": 0.3123943226079266, + "learning_rate": 9.892837837500758e-06, + "loss": 0.3249, + "step": 2352 + }, + { + "epoch": 0.18823623527529448, + "grad_norm": 0.3730810029571364, + "learning_rate": 9.892704407659514e-06, + "loss": 0.3192, + "step": 2353 + }, + { + "epoch": 0.1883162336753265, + "grad_norm": 0.29071588837833817, + "learning_rate": 9.892570895702753e-06, + "loss": 0.2947, + "step": 2354 + }, + { + "epoch": 0.1883962320753585, + "grad_norm": 0.2526784826509826, + "learning_rate": 9.892437301632713e-06, + "loss": 0.3296, + "step": 2355 + }, + { + "epoch": 0.1884762304753905, + "grad_norm": 0.2949362515746845, + "learning_rate": 9.892303625451639e-06, + "loss": 0.3101, + "step": 2356 + }, + { + "epoch": 0.1885562288754225, + "grad_norm": 0.31716934229551896, + "learning_rate": 9.892169867161774e-06, + "loss": 0.3018, + "step": 2357 + }, + { + "epoch": 0.18863622727545448, + "grad_norm": 0.3822951576004735, + "learning_rate": 9.89203602676536e-06, + "loss": 0.2916, + "step": 2358 + }, + { + "epoch": 0.1887162256754865, + "grad_norm": 0.31989735533302277, + "learning_rate": 9.891902104264646e-06, + "loss": 0.2584, + "step": 2359 + }, + { + "epoch": 0.1887962240755185, + "grad_norm": 0.3060474495814552, + "learning_rate": 9.891768099661881e-06, + "loss": 0.2645, + "step": 2360 + }, + { + "epoch": 0.1888762224755505, + "grad_norm": 0.24913457952373075, + "learning_rate": 9.891634012959311e-06, + "loss": 0.3386, + "step": 2361 + }, + { + "epoch": 0.18895622087558248, + "grad_norm": 0.27129711263923684, + "learning_rate": 9.891499844159187e-06, + "loss": 0.3079, + "step": 2362 + }, + { + "epoch": 0.18903621927561448, + "grad_norm": 0.2905646614786227, + "learning_rate": 9.891365593263761e-06, + "loss": 0.3122, + "step": 2363 + }, + { + "epoch": 0.1891162176756465, + "grad_norm": 0.2894199628349873, + "learning_rate": 9.891231260275287e-06, + "loss": 0.2942, + "step": 2364 + }, + { + "epoch": 0.1891962160756785, + "grad_norm": 0.32209857413200826, + "learning_rate": 9.891096845196019e-06, + "loss": 0.3086, + "step": 2365 + }, + { + "epoch": 0.1892762144757105, + "grad_norm": 0.32710686043861265, + "learning_rate": 9.890962348028213e-06, + "loss": 0.2875, + "step": 2366 + }, + { + "epoch": 0.18935621287574247, + "grad_norm": 0.3645381502356643, + "learning_rate": 9.890827768774127e-06, + "loss": 0.2717, + "step": 2367 + }, + { + "epoch": 0.18943621127577448, + "grad_norm": 0.2655446107158337, + "learning_rate": 9.890693107436018e-06, + "loss": 0.3437, + "step": 2368 + }, + { + "epoch": 0.18951620967580649, + "grad_norm": 0.2859137374141619, + "learning_rate": 9.890558364016148e-06, + "loss": 0.309, + "step": 2369 + }, + { + "epoch": 0.1895962080758385, + "grad_norm": 0.2762738840085754, + "learning_rate": 9.890423538516777e-06, + "loss": 0.3335, + "step": 2370 + }, + { + "epoch": 0.1896762064758705, + "grad_norm": 0.29211964401080076, + "learning_rate": 9.890288630940168e-06, + "loss": 0.2923, + "step": 2371 + }, + { + "epoch": 0.18975620487590247, + "grad_norm": 0.34957818682927105, + "learning_rate": 9.890153641288587e-06, + "loss": 0.2969, + "step": 2372 + }, + { + "epoch": 0.18983620327593448, + "grad_norm": 0.33519457500326516, + "learning_rate": 9.890018569564298e-06, + "loss": 0.2675, + "step": 2373 + }, + { + "epoch": 0.18991620167596648, + "grad_norm": 0.27867395211981894, + "learning_rate": 9.889883415769566e-06, + "loss": 0.3519, + "step": 2374 + }, + { + "epoch": 0.1899962000759985, + "grad_norm": 0.23001185492002293, + "learning_rate": 9.889748179906661e-06, + "loss": 0.3623, + "step": 2375 + }, + { + "epoch": 0.19007619847603047, + "grad_norm": 0.2895844829859647, + "learning_rate": 9.889612861977855e-06, + "loss": 0.3085, + "step": 2376 + }, + { + "epoch": 0.19015619687606247, + "grad_norm": 0.31458321005565115, + "learning_rate": 9.889477461985415e-06, + "loss": 0.2803, + "step": 2377 + }, + { + "epoch": 0.19023619527609448, + "grad_norm": 0.2879545089244242, + "learning_rate": 9.889341979931616e-06, + "loss": 0.2932, + "step": 2378 + }, + { + "epoch": 0.19031619367612648, + "grad_norm": 0.36676278845548094, + "learning_rate": 9.889206415818733e-06, + "loss": 0.2885, + "step": 2379 + }, + { + "epoch": 0.1903961920761585, + "grad_norm": 0.3879702752753962, + "learning_rate": 9.889070769649038e-06, + "loss": 0.2852, + "step": 2380 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.3546940590573756, + "learning_rate": 9.88893504142481e-06, + "loss": 0.2782, + "step": 2381 + }, + { + "epoch": 0.19055618887622247, + "grad_norm": 0.29038723636169717, + "learning_rate": 9.888799231148325e-06, + "loss": 0.3129, + "step": 2382 + }, + { + "epoch": 0.19063618727625448, + "grad_norm": 0.2791022606069941, + "learning_rate": 9.888663338821864e-06, + "loss": 0.3053, + "step": 2383 + }, + { + "epoch": 0.19071618567628648, + "grad_norm": 0.22099306627951712, + "learning_rate": 9.888527364447709e-06, + "loss": 0.3685, + "step": 2384 + }, + { + "epoch": 0.1907961840763185, + "grad_norm": 0.3378135439204717, + "learning_rate": 9.888391308028138e-06, + "loss": 0.2916, + "step": 2385 + }, + { + "epoch": 0.19087618247635046, + "grad_norm": 0.2769700503993682, + "learning_rate": 9.888255169565437e-06, + "loss": 0.3282, + "step": 2386 + }, + { + "epoch": 0.19095618087638247, + "grad_norm": 0.25621880233623057, + "learning_rate": 9.888118949061891e-06, + "loss": 0.317, + "step": 2387 + }, + { + "epoch": 0.19103617927641448, + "grad_norm": 0.2865056182900127, + "learning_rate": 9.887982646519784e-06, + "loss": 0.3013, + "step": 2388 + }, + { + "epoch": 0.19111617767644648, + "grad_norm": 0.3292030614921192, + "learning_rate": 9.887846261941408e-06, + "loss": 0.2976, + "step": 2389 + }, + { + "epoch": 0.19119617607647846, + "grad_norm": 0.3504298429537061, + "learning_rate": 9.88770979532905e-06, + "loss": 0.2674, + "step": 2390 + }, + { + "epoch": 0.19127617447651046, + "grad_norm": 0.3238198466391161, + "learning_rate": 9.887573246684998e-06, + "loss": 0.2853, + "step": 2391 + }, + { + "epoch": 0.19135617287654247, + "grad_norm": 0.2987093593053379, + "learning_rate": 9.887436616011546e-06, + "loss": 0.2836, + "step": 2392 + }, + { + "epoch": 0.19143617127657447, + "grad_norm": 0.34897537691218683, + "learning_rate": 9.887299903310985e-06, + "loss": 0.2515, + "step": 2393 + }, + { + "epoch": 0.19151616967660648, + "grad_norm": 0.2598213466687978, + "learning_rate": 9.887163108585612e-06, + "loss": 0.3266, + "step": 2394 + }, + { + "epoch": 0.19159616807663846, + "grad_norm": 0.2940560350387327, + "learning_rate": 9.887026231837722e-06, + "loss": 0.324, + "step": 2395 + }, + { + "epoch": 0.19167616647667046, + "grad_norm": 0.2699714343161921, + "learning_rate": 9.886889273069614e-06, + "loss": 0.3323, + "step": 2396 + }, + { + "epoch": 0.19175616487670247, + "grad_norm": 0.25873492320067437, + "learning_rate": 9.886752232283582e-06, + "loss": 0.3419, + "step": 2397 + }, + { + "epoch": 0.19183616327673447, + "grad_norm": 0.31991465440170747, + "learning_rate": 9.88661510948193e-06, + "loss": 0.2953, + "step": 2398 + }, + { + "epoch": 0.19191616167676648, + "grad_norm": 0.3249208478104891, + "learning_rate": 9.886477904666958e-06, + "loss": 0.3157, + "step": 2399 + }, + { + "epoch": 0.19199616007679846, + "grad_norm": 0.3162527981849793, + "learning_rate": 9.886340617840968e-06, + "loss": 0.299, + "step": 2400 + }, + { + "epoch": 0.19207615847683046, + "grad_norm": 0.2925548542448652, + "learning_rate": 9.886203249006265e-06, + "loss": 0.3166, + "step": 2401 + }, + { + "epoch": 0.19215615687686247, + "grad_norm": 0.3108995328204289, + "learning_rate": 9.886065798165156e-06, + "loss": 0.2566, + "step": 2402 + }, + { + "epoch": 0.19223615527689447, + "grad_norm": 0.35593089009064205, + "learning_rate": 9.885928265319946e-06, + "loss": 0.2697, + "step": 2403 + }, + { + "epoch": 0.19231615367692645, + "grad_norm": 0.343831927440169, + "learning_rate": 9.885790650472943e-06, + "loss": 0.2809, + "step": 2404 + }, + { + "epoch": 0.19239615207695845, + "grad_norm": 0.35112878965920985, + "learning_rate": 9.885652953626456e-06, + "loss": 0.2873, + "step": 2405 + }, + { + "epoch": 0.19247615047699046, + "grad_norm": 0.32342997325176764, + "learning_rate": 9.8855151747828e-06, + "loss": 0.2982, + "step": 2406 + }, + { + "epoch": 0.19255614887702246, + "grad_norm": 0.2912375937111901, + "learning_rate": 9.885377313944284e-06, + "loss": 0.3128, + "step": 2407 + }, + { + "epoch": 0.19263614727705447, + "grad_norm": 0.32923896214245174, + "learning_rate": 9.88523937111322e-06, + "loss": 0.2906, + "step": 2408 + }, + { + "epoch": 0.19271614567708645, + "grad_norm": 0.24363921099042216, + "learning_rate": 9.885101346291928e-06, + "loss": 0.3215, + "step": 2409 + }, + { + "epoch": 0.19279614407711845, + "grad_norm": 0.287582024156925, + "learning_rate": 9.884963239482721e-06, + "loss": 0.2866, + "step": 2410 + }, + { + "epoch": 0.19287614247715046, + "grad_norm": 0.28753461949843867, + "learning_rate": 9.884825050687918e-06, + "loss": 0.3066, + "step": 2411 + }, + { + "epoch": 0.19295614087718246, + "grad_norm": 0.30170143136504596, + "learning_rate": 9.884686779909837e-06, + "loss": 0.3175, + "step": 2412 + }, + { + "epoch": 0.19303613927721447, + "grad_norm": 0.3230072366447336, + "learning_rate": 9.884548427150802e-06, + "loss": 0.2757, + "step": 2413 + }, + { + "epoch": 0.19311613767724645, + "grad_norm": 0.28449466880156626, + "learning_rate": 9.884409992413131e-06, + "loss": 0.3079, + "step": 2414 + }, + { + "epoch": 0.19319613607727845, + "grad_norm": 0.3575550479600838, + "learning_rate": 9.88427147569915e-06, + "loss": 0.3009, + "step": 2415 + }, + { + "epoch": 0.19327613447731046, + "grad_norm": 0.29136119323630466, + "learning_rate": 9.884132877011183e-06, + "loss": 0.2901, + "step": 2416 + }, + { + "epoch": 0.19335613287734246, + "grad_norm": 0.2614443579218222, + "learning_rate": 9.883994196351555e-06, + "loss": 0.3291, + "step": 2417 + }, + { + "epoch": 0.19343613127737444, + "grad_norm": 0.31207099013636763, + "learning_rate": 9.883855433722596e-06, + "loss": 0.3323, + "step": 2418 + }, + { + "epoch": 0.19351612967740645, + "grad_norm": 0.32276161020074934, + "learning_rate": 9.883716589126633e-06, + "loss": 0.2771, + "step": 2419 + }, + { + "epoch": 0.19359612807743845, + "grad_norm": 0.380370496908728, + "learning_rate": 9.883577662565997e-06, + "loss": 0.321, + "step": 2420 + }, + { + "epoch": 0.19367612647747046, + "grad_norm": 0.29253021048072253, + "learning_rate": 9.883438654043019e-06, + "loss": 0.3281, + "step": 2421 + }, + { + "epoch": 0.19375612487750246, + "grad_norm": 0.42989168636421377, + "learning_rate": 9.883299563560032e-06, + "loss": 0.2878, + "step": 2422 + }, + { + "epoch": 0.19383612327753444, + "grad_norm": 0.33322170148868346, + "learning_rate": 9.88316039111937e-06, + "loss": 0.2946, + "step": 2423 + }, + { + "epoch": 0.19391612167756644, + "grad_norm": 0.2712831929606459, + "learning_rate": 9.883021136723372e-06, + "loss": 0.3382, + "step": 2424 + }, + { + "epoch": 0.19399612007759845, + "grad_norm": 0.350749527602192, + "learning_rate": 9.88288180037437e-06, + "loss": 0.3, + "step": 2425 + }, + { + "epoch": 0.19407611847763045, + "grad_norm": 0.2893780578063207, + "learning_rate": 9.882742382074707e-06, + "loss": 0.3217, + "step": 2426 + }, + { + "epoch": 0.19415611687766246, + "grad_norm": 0.3125377321530427, + "learning_rate": 9.882602881826721e-06, + "loss": 0.2906, + "step": 2427 + }, + { + "epoch": 0.19423611527769444, + "grad_norm": 0.3458472019489112, + "learning_rate": 9.882463299632753e-06, + "loss": 0.2768, + "step": 2428 + }, + { + "epoch": 0.19431611367772644, + "grad_norm": 0.2860308493709997, + "learning_rate": 9.882323635495145e-06, + "loss": 0.321, + "step": 2429 + }, + { + "epoch": 0.19439611207775845, + "grad_norm": 0.3274497822709407, + "learning_rate": 9.882183889416243e-06, + "loss": 0.2694, + "step": 2430 + }, + { + "epoch": 0.19447611047779045, + "grad_norm": 0.2867208036091842, + "learning_rate": 9.882044061398393e-06, + "loss": 0.3238, + "step": 2431 + }, + { + "epoch": 0.19455610887782243, + "grad_norm": 0.24925430957835096, + "learning_rate": 9.881904151443938e-06, + "loss": 0.3083, + "step": 2432 + }, + { + "epoch": 0.19463610727785444, + "grad_norm": 0.295593610447912, + "learning_rate": 9.88176415955523e-06, + "loss": 0.3291, + "step": 2433 + }, + { + "epoch": 0.19471610567788644, + "grad_norm": 0.27584048332662, + "learning_rate": 9.881624085734614e-06, + "loss": 0.3289, + "step": 2434 + }, + { + "epoch": 0.19479610407791845, + "grad_norm": 0.39569936141902073, + "learning_rate": 9.881483929984446e-06, + "loss": 0.279, + "step": 2435 + }, + { + "epoch": 0.19487610247795045, + "grad_norm": 0.3996419460572606, + "learning_rate": 9.881343692307076e-06, + "loss": 0.2771, + "step": 2436 + }, + { + "epoch": 0.19495610087798243, + "grad_norm": 0.3519500919181083, + "learning_rate": 9.881203372704857e-06, + "loss": 0.2942, + "step": 2437 + }, + { + "epoch": 0.19503609927801444, + "grad_norm": 0.2560451991341518, + "learning_rate": 9.881062971180146e-06, + "loss": 0.3403, + "step": 2438 + }, + { + "epoch": 0.19511609767804644, + "grad_norm": 0.32695682401607506, + "learning_rate": 9.880922487735295e-06, + "loss": 0.2877, + "step": 2439 + }, + { + "epoch": 0.19519609607807845, + "grad_norm": 0.33161277507138803, + "learning_rate": 9.880781922372669e-06, + "loss": 0.2735, + "step": 2440 + }, + { + "epoch": 0.19527609447811045, + "grad_norm": 0.24206560368889032, + "learning_rate": 9.88064127509462e-06, + "loss": 0.3205, + "step": 2441 + }, + { + "epoch": 0.19535609287814243, + "grad_norm": 0.2982483040386578, + "learning_rate": 9.880500545903513e-06, + "loss": 0.312, + "step": 2442 + }, + { + "epoch": 0.19543609127817443, + "grad_norm": 0.2862445243547295, + "learning_rate": 9.880359734801708e-06, + "loss": 0.3135, + "step": 2443 + }, + { + "epoch": 0.19551608967820644, + "grad_norm": 0.29525374985522085, + "learning_rate": 9.880218841791568e-06, + "loss": 0.2946, + "step": 2444 + }, + { + "epoch": 0.19559608807823844, + "grad_norm": 0.2849388938750511, + "learning_rate": 9.880077866875459e-06, + "loss": 0.2966, + "step": 2445 + }, + { + "epoch": 0.19567608647827042, + "grad_norm": 0.33532488534431437, + "learning_rate": 9.879936810055746e-06, + "loss": 0.2776, + "step": 2446 + }, + { + "epoch": 0.19575608487830243, + "grad_norm": 0.3047728567057308, + "learning_rate": 9.879795671334798e-06, + "loss": 0.2652, + "step": 2447 + }, + { + "epoch": 0.19583608327833443, + "grad_norm": 0.31141759485667686, + "learning_rate": 9.87965445071498e-06, + "loss": 0.2548, + "step": 2448 + }, + { + "epoch": 0.19591608167836644, + "grad_norm": 0.31076067514272154, + "learning_rate": 9.879513148198668e-06, + "loss": 0.2816, + "step": 2449 + }, + { + "epoch": 0.19599608007839844, + "grad_norm": 0.25037213905934613, + "learning_rate": 9.879371763788228e-06, + "loss": 0.3161, + "step": 2450 + }, + { + "epoch": 0.19607607847843042, + "grad_norm": 0.24937138050144456, + "learning_rate": 9.879230297486034e-06, + "loss": 0.3222, + "step": 2451 + }, + { + "epoch": 0.19615607687846243, + "grad_norm": 0.2802995243055443, + "learning_rate": 9.879088749294463e-06, + "loss": 0.2934, + "step": 2452 + }, + { + "epoch": 0.19623607527849443, + "grad_norm": 0.3506298073580569, + "learning_rate": 9.878947119215889e-06, + "loss": 0.2867, + "step": 2453 + }, + { + "epoch": 0.19631607367852644, + "grad_norm": 0.3255369697273748, + "learning_rate": 9.878805407252689e-06, + "loss": 0.2857, + "step": 2454 + }, + { + "epoch": 0.19639607207855844, + "grad_norm": 0.2893414875906338, + "learning_rate": 9.87866361340724e-06, + "loss": 0.2947, + "step": 2455 + }, + { + "epoch": 0.19647607047859042, + "grad_norm": 0.39805723841441526, + "learning_rate": 9.878521737681925e-06, + "loss": 0.2862, + "step": 2456 + }, + { + "epoch": 0.19655606887862243, + "grad_norm": 0.41990772557645983, + "learning_rate": 9.878379780079122e-06, + "loss": 0.2775, + "step": 2457 + }, + { + "epoch": 0.19663606727865443, + "grad_norm": 0.3020374725086929, + "learning_rate": 9.878237740601215e-06, + "loss": 0.2982, + "step": 2458 + }, + { + "epoch": 0.19671606567868644, + "grad_norm": 0.3162887489576106, + "learning_rate": 9.878095619250588e-06, + "loss": 0.2849, + "step": 2459 + }, + { + "epoch": 0.1967960640787184, + "grad_norm": 0.2780042268219262, + "learning_rate": 9.877953416029625e-06, + "loss": 0.3062, + "step": 2460 + }, + { + "epoch": 0.19687606247875042, + "grad_norm": 0.2793847414690196, + "learning_rate": 9.877811130940715e-06, + "loss": 0.2988, + "step": 2461 + }, + { + "epoch": 0.19695606087878242, + "grad_norm": 0.26876140322926706, + "learning_rate": 9.877668763986243e-06, + "loss": 0.2997, + "step": 2462 + }, + { + "epoch": 0.19703605927881443, + "grad_norm": 0.24684775297924227, + "learning_rate": 9.8775263151686e-06, + "loss": 0.318, + "step": 2463 + }, + { + "epoch": 0.19711605767884643, + "grad_norm": 0.2789719048627102, + "learning_rate": 9.877383784490177e-06, + "loss": 0.2848, + "step": 2464 + }, + { + "epoch": 0.1971960560788784, + "grad_norm": 0.3209649709546764, + "learning_rate": 9.877241171953367e-06, + "loss": 0.2563, + "step": 2465 + }, + { + "epoch": 0.19727605447891042, + "grad_norm": 0.27072595422558393, + "learning_rate": 9.877098477560562e-06, + "loss": 0.3263, + "step": 2466 + }, + { + "epoch": 0.19735605287894242, + "grad_norm": 0.37282841305137376, + "learning_rate": 9.876955701314157e-06, + "loss": 0.2967, + "step": 2467 + }, + { + "epoch": 0.19743605127897443, + "grad_norm": 0.3608877831699875, + "learning_rate": 9.876812843216548e-06, + "loss": 0.2648, + "step": 2468 + }, + { + "epoch": 0.19751604967900643, + "grad_norm": 0.32434821277736336, + "learning_rate": 9.876669903270133e-06, + "loss": 0.2734, + "step": 2469 + }, + { + "epoch": 0.1975960480790384, + "grad_norm": 0.43411774309647644, + "learning_rate": 9.87652688147731e-06, + "loss": 0.2882, + "step": 2470 + }, + { + "epoch": 0.19767604647907042, + "grad_norm": 0.30999849698856097, + "learning_rate": 9.876383777840484e-06, + "loss": 0.3376, + "step": 2471 + }, + { + "epoch": 0.19775604487910242, + "grad_norm": 0.2744638471044857, + "learning_rate": 9.87624059236205e-06, + "loss": 0.3308, + "step": 2472 + }, + { + "epoch": 0.19783604327913443, + "grad_norm": 0.271948012042772, + "learning_rate": 9.876097325044416e-06, + "loss": 0.3316, + "step": 2473 + }, + { + "epoch": 0.1979160416791664, + "grad_norm": 0.31925002149734755, + "learning_rate": 9.875953975889984e-06, + "loss": 0.2757, + "step": 2474 + }, + { + "epoch": 0.1979960400791984, + "grad_norm": 0.2740866818677422, + "learning_rate": 9.87581054490116e-06, + "loss": 0.3358, + "step": 2475 + }, + { + "epoch": 0.19807603847923042, + "grad_norm": 0.29377435594612916, + "learning_rate": 9.875667032080354e-06, + "loss": 0.3011, + "step": 2476 + }, + { + "epoch": 0.19815603687926242, + "grad_norm": 0.31095485228516095, + "learning_rate": 9.87552343742997e-06, + "loss": 0.2858, + "step": 2477 + }, + { + "epoch": 0.19823603527929443, + "grad_norm": 0.3111704161791113, + "learning_rate": 9.875379760952422e-06, + "loss": 0.269, + "step": 2478 + }, + { + "epoch": 0.1983160336793264, + "grad_norm": 0.33746539806254855, + "learning_rate": 9.875236002650119e-06, + "loss": 0.2881, + "step": 2479 + }, + { + "epoch": 0.1983960320793584, + "grad_norm": 0.35346244368460855, + "learning_rate": 9.875092162525476e-06, + "loss": 0.2817, + "step": 2480 + }, + { + "epoch": 0.19847603047939041, + "grad_norm": 0.2915984188407071, + "learning_rate": 9.874948240580903e-06, + "loss": 0.3346, + "step": 2481 + }, + { + "epoch": 0.19855602887942242, + "grad_norm": 0.30760062473496075, + "learning_rate": 9.874804236818823e-06, + "loss": 0.2676, + "step": 2482 + }, + { + "epoch": 0.19863602727945442, + "grad_norm": 0.34001360797097285, + "learning_rate": 9.874660151241644e-06, + "loss": 0.3072, + "step": 2483 + }, + { + "epoch": 0.1987160256794864, + "grad_norm": 0.4703492988539531, + "learning_rate": 9.874515983851788e-06, + "loss": 0.2802, + "step": 2484 + }, + { + "epoch": 0.1987960240795184, + "grad_norm": 0.31191250767120043, + "learning_rate": 9.874371734651678e-06, + "loss": 0.3126, + "step": 2485 + }, + { + "epoch": 0.1988760224795504, + "grad_norm": 0.33120765273751773, + "learning_rate": 9.874227403643729e-06, + "loss": 0.2631, + "step": 2486 + }, + { + "epoch": 0.19895602087958242, + "grad_norm": 0.29535075578252534, + "learning_rate": 9.874082990830366e-06, + "loss": 0.3287, + "step": 2487 + }, + { + "epoch": 0.1990360192796144, + "grad_norm": 0.2807656621855734, + "learning_rate": 9.873938496214014e-06, + "loss": 0.3153, + "step": 2488 + }, + { + "epoch": 0.1991160176796464, + "grad_norm": 0.24900389638186068, + "learning_rate": 9.873793919797099e-06, + "loss": 0.3429, + "step": 2489 + }, + { + "epoch": 0.1991960160796784, + "grad_norm": 0.3343860035900998, + "learning_rate": 9.873649261582043e-06, + "loss": 0.2902, + "step": 2490 + }, + { + "epoch": 0.1992760144797104, + "grad_norm": 0.26278213219263286, + "learning_rate": 9.873504521571278e-06, + "loss": 0.3481, + "step": 2491 + }, + { + "epoch": 0.19935601287974242, + "grad_norm": 0.3009982918031528, + "learning_rate": 9.873359699767229e-06, + "loss": 0.3101, + "step": 2492 + }, + { + "epoch": 0.1994360112797744, + "grad_norm": 0.3233752344444209, + "learning_rate": 9.87321479617233e-06, + "loss": 0.2629, + "step": 2493 + }, + { + "epoch": 0.1995160096798064, + "grad_norm": 0.2930155626945073, + "learning_rate": 9.873069810789013e-06, + "loss": 0.3096, + "step": 2494 + }, + { + "epoch": 0.1995960080798384, + "grad_norm": 0.29402489128825876, + "learning_rate": 9.87292474361971e-06, + "loss": 0.2736, + "step": 2495 + }, + { + "epoch": 0.1996760064798704, + "grad_norm": 0.28086199719298965, + "learning_rate": 9.872779594666856e-06, + "loss": 0.2896, + "step": 2496 + }, + { + "epoch": 0.19975600487990242, + "grad_norm": 0.3720197991308306, + "learning_rate": 9.872634363932887e-06, + "loss": 0.2761, + "step": 2497 + }, + { + "epoch": 0.1998360032799344, + "grad_norm": 0.3041056253799466, + "learning_rate": 9.87248905142024e-06, + "loss": 0.2884, + "step": 2498 + }, + { + "epoch": 0.1999160016799664, + "grad_norm": 0.3147776737370676, + "learning_rate": 9.872343657131355e-06, + "loss": 0.2813, + "step": 2499 + }, + { + "epoch": 0.1999960000799984, + "grad_norm": 0.36771054697241135, + "learning_rate": 9.87219818106867e-06, + "loss": 0.2809, + "step": 2500 + }, + { + "epoch": 0.2000759984800304, + "grad_norm": 0.31214376381122216, + "learning_rate": 9.872052623234632e-06, + "loss": 0.2678, + "step": 2501 + }, + { + "epoch": 0.2001559968800624, + "grad_norm": 0.3706037130845664, + "learning_rate": 9.871906983631676e-06, + "loss": 0.2901, + "step": 2502 + }, + { + "epoch": 0.2002359952800944, + "grad_norm": 0.25635475077629827, + "learning_rate": 9.871761262262252e-06, + "loss": 0.3324, + "step": 2503 + }, + { + "epoch": 0.2003159936801264, + "grad_norm": 0.36663918442864113, + "learning_rate": 9.871615459128805e-06, + "loss": 0.2691, + "step": 2504 + }, + { + "epoch": 0.2003959920801584, + "grad_norm": 0.3880893797444578, + "learning_rate": 9.871469574233781e-06, + "loss": 0.2696, + "step": 2505 + }, + { + "epoch": 0.2004759904801904, + "grad_norm": 0.3502549702036501, + "learning_rate": 9.871323607579628e-06, + "loss": 0.2682, + "step": 2506 + }, + { + "epoch": 0.20055598888022239, + "grad_norm": 0.3035134431422913, + "learning_rate": 9.871177559168795e-06, + "loss": 0.322, + "step": 2507 + }, + { + "epoch": 0.2006359872802544, + "grad_norm": 0.3000155201642723, + "learning_rate": 9.871031429003735e-06, + "loss": 0.2945, + "step": 2508 + }, + { + "epoch": 0.2007159856802864, + "grad_norm": 0.3458682613510783, + "learning_rate": 9.8708852170869e-06, + "loss": 0.3107, + "step": 2509 + }, + { + "epoch": 0.2007959840803184, + "grad_norm": 0.28556943385431643, + "learning_rate": 9.870738923420746e-06, + "loss": 0.3288, + "step": 2510 + }, + { + "epoch": 0.2008759824803504, + "grad_norm": 0.30743684386888975, + "learning_rate": 9.870592548007725e-06, + "loss": 0.3107, + "step": 2511 + }, + { + "epoch": 0.20095598088038238, + "grad_norm": 0.41174604803246856, + "learning_rate": 9.870446090850295e-06, + "loss": 0.2527, + "step": 2512 + }, + { + "epoch": 0.2010359792804144, + "grad_norm": 0.2127036740695035, + "learning_rate": 9.870299551950912e-06, + "loss": 0.3448, + "step": 2513 + }, + { + "epoch": 0.2011159776804464, + "grad_norm": 0.4114119181636995, + "learning_rate": 9.87015293131204e-06, + "loss": 0.2947, + "step": 2514 + }, + { + "epoch": 0.2011959760804784, + "grad_norm": 0.336565649624392, + "learning_rate": 9.870006228936135e-06, + "loss": 0.3127, + "step": 2515 + }, + { + "epoch": 0.20127597448051038, + "grad_norm": 0.27243877892233254, + "learning_rate": 9.869859444825664e-06, + "loss": 0.2959, + "step": 2516 + }, + { + "epoch": 0.20135597288054238, + "grad_norm": 0.30387059662020083, + "learning_rate": 9.869712578983085e-06, + "loss": 0.2984, + "step": 2517 + }, + { + "epoch": 0.2014359712805744, + "grad_norm": 0.3121277870256797, + "learning_rate": 9.869565631410867e-06, + "loss": 0.3122, + "step": 2518 + }, + { + "epoch": 0.2015159696806064, + "grad_norm": 0.2953190454284815, + "learning_rate": 9.869418602111475e-06, + "loss": 0.2573, + "step": 2519 + }, + { + "epoch": 0.2015959680806384, + "grad_norm": 0.29153610250715895, + "learning_rate": 9.869271491087376e-06, + "loss": 0.3092, + "step": 2520 + }, + { + "epoch": 0.20167596648067038, + "grad_norm": 0.25673396585990116, + "learning_rate": 9.869124298341039e-06, + "loss": 0.3277, + "step": 2521 + }, + { + "epoch": 0.20175596488070238, + "grad_norm": 0.38451946628164757, + "learning_rate": 9.868977023874937e-06, + "loss": 0.2614, + "step": 2522 + }, + { + "epoch": 0.2018359632807344, + "grad_norm": 0.32469348468808834, + "learning_rate": 9.868829667691538e-06, + "loss": 0.3064, + "step": 2523 + }, + { + "epoch": 0.2019159616807664, + "grad_norm": 0.3739748175455632, + "learning_rate": 9.868682229793317e-06, + "loss": 0.2727, + "step": 2524 + }, + { + "epoch": 0.20199596008079837, + "grad_norm": 0.27917317953680815, + "learning_rate": 9.868534710182747e-06, + "loss": 0.2943, + "step": 2525 + }, + { + "epoch": 0.20207595848083038, + "grad_norm": 0.34585163227207605, + "learning_rate": 9.868387108862307e-06, + "loss": 0.2578, + "step": 2526 + }, + { + "epoch": 0.20215595688086238, + "grad_norm": 0.2494883622840502, + "learning_rate": 9.868239425834472e-06, + "loss": 0.3114, + "step": 2527 + }, + { + "epoch": 0.20223595528089439, + "grad_norm": 0.3124776255954141, + "learning_rate": 9.868091661101719e-06, + "loss": 0.2362, + "step": 2528 + }, + { + "epoch": 0.2023159536809264, + "grad_norm": 0.28844148601111425, + "learning_rate": 9.867943814666533e-06, + "loss": 0.3066, + "step": 2529 + }, + { + "epoch": 0.20239595208095837, + "grad_norm": 0.32739632459084717, + "learning_rate": 9.86779588653139e-06, + "loss": 0.2906, + "step": 2530 + }, + { + "epoch": 0.20247595048099037, + "grad_norm": 0.30203661640886614, + "learning_rate": 9.867647876698776e-06, + "loss": 0.318, + "step": 2531 + }, + { + "epoch": 0.20255594888102238, + "grad_norm": 0.3005049714844136, + "learning_rate": 9.867499785171174e-06, + "loss": 0.3092, + "step": 2532 + }, + { + "epoch": 0.20263594728105438, + "grad_norm": 0.33151061009463134, + "learning_rate": 9.867351611951071e-06, + "loss": 0.2592, + "step": 2533 + }, + { + "epoch": 0.2027159456810864, + "grad_norm": 0.3463336664243194, + "learning_rate": 9.86720335704095e-06, + "loss": 0.277, + "step": 2534 + }, + { + "epoch": 0.20279594408111837, + "grad_norm": 0.2482258481366194, + "learning_rate": 9.867055020443302e-06, + "loss": 0.3411, + "step": 2535 + }, + { + "epoch": 0.20287594248115037, + "grad_norm": 0.36597878379474863, + "learning_rate": 9.866906602160616e-06, + "loss": 0.2718, + "step": 2536 + }, + { + "epoch": 0.20295594088118238, + "grad_norm": 0.3147076332952683, + "learning_rate": 9.866758102195384e-06, + "loss": 0.2599, + "step": 2537 + }, + { + "epoch": 0.20303593928121438, + "grad_norm": 0.301839912448097, + "learning_rate": 9.866609520550097e-06, + "loss": 0.2994, + "step": 2538 + }, + { + "epoch": 0.20311593768124636, + "grad_norm": 0.31115526389640036, + "learning_rate": 9.86646085722725e-06, + "loss": 0.2958, + "step": 2539 + }, + { + "epoch": 0.20319593608127837, + "grad_norm": 0.31253599647178126, + "learning_rate": 9.866312112229335e-06, + "loss": 0.2912, + "step": 2540 + }, + { + "epoch": 0.20327593448131037, + "grad_norm": 0.2804238986866396, + "learning_rate": 9.866163285558851e-06, + "loss": 0.3055, + "step": 2541 + }, + { + "epoch": 0.20335593288134238, + "grad_norm": 0.32638630035232546, + "learning_rate": 9.866014377218297e-06, + "loss": 0.2917, + "step": 2542 + }, + { + "epoch": 0.20343593128137438, + "grad_norm": 0.28911941105504807, + "learning_rate": 9.865865387210169e-06, + "loss": 0.3104, + "step": 2543 + }, + { + "epoch": 0.20351592968140636, + "grad_norm": 0.2954711260065343, + "learning_rate": 9.86571631553697e-06, + "loss": 0.3023, + "step": 2544 + }, + { + "epoch": 0.20359592808143837, + "grad_norm": 0.27580838740040864, + "learning_rate": 9.8655671622012e-06, + "loss": 0.2992, + "step": 2545 + }, + { + "epoch": 0.20367592648147037, + "grad_norm": 0.22830238930840105, + "learning_rate": 9.865417927205363e-06, + "loss": 0.3247, + "step": 2546 + }, + { + "epoch": 0.20375592488150238, + "grad_norm": 0.3347282680138978, + "learning_rate": 9.865268610551966e-06, + "loss": 0.307, + "step": 2547 + }, + { + "epoch": 0.20383592328153438, + "grad_norm": 0.33840686872862186, + "learning_rate": 9.86511921224351e-06, + "loss": 0.2502, + "step": 2548 + }, + { + "epoch": 0.20391592168156636, + "grad_norm": 0.29619123584674023, + "learning_rate": 9.864969732282507e-06, + "loss": 0.2916, + "step": 2549 + }, + { + "epoch": 0.20399592008159836, + "grad_norm": 0.31171500612831754, + "learning_rate": 9.864820170671466e-06, + "loss": 0.2785, + "step": 2550 + }, + { + "epoch": 0.20407591848163037, + "grad_norm": 0.3270667075498327, + "learning_rate": 9.864670527412891e-06, + "loss": 0.269, + "step": 2551 + }, + { + "epoch": 0.20415591688166237, + "grad_norm": 0.2496264533236665, + "learning_rate": 9.8645208025093e-06, + "loss": 0.3458, + "step": 2552 + }, + { + "epoch": 0.20423591528169435, + "grad_norm": 0.28151360059638797, + "learning_rate": 9.864370995963204e-06, + "loss": 0.291, + "step": 2553 + }, + { + "epoch": 0.20431591368172636, + "grad_norm": 0.2793813007934314, + "learning_rate": 9.864221107777116e-06, + "loss": 0.2938, + "step": 2554 + }, + { + "epoch": 0.20439591208175836, + "grad_norm": 0.3013546586306375, + "learning_rate": 9.864071137953552e-06, + "loss": 0.3102, + "step": 2555 + }, + { + "epoch": 0.20447591048179037, + "grad_norm": 0.3238735392023272, + "learning_rate": 9.86392108649503e-06, + "loss": 0.2733, + "step": 2556 + }, + { + "epoch": 0.20455590888182237, + "grad_norm": 0.2528648475838113, + "learning_rate": 9.863770953404068e-06, + "loss": 0.3599, + "step": 2557 + }, + { + "epoch": 0.20463590728185435, + "grad_norm": 0.2958940495704302, + "learning_rate": 9.863620738683186e-06, + "loss": 0.3012, + "step": 2558 + }, + { + "epoch": 0.20471590568188636, + "grad_norm": 0.27027703977830325, + "learning_rate": 9.863470442334904e-06, + "loss": 0.289, + "step": 2559 + }, + { + "epoch": 0.20479590408191836, + "grad_norm": 0.24131474615738915, + "learning_rate": 9.863320064361743e-06, + "loss": 0.3239, + "step": 2560 + }, + { + "epoch": 0.20487590248195037, + "grad_norm": 0.3292687121316407, + "learning_rate": 9.863169604766231e-06, + "loss": 0.2658, + "step": 2561 + }, + { + "epoch": 0.20495590088198237, + "grad_norm": 0.35704150610215796, + "learning_rate": 9.863019063550892e-06, + "loss": 0.2738, + "step": 2562 + }, + { + "epoch": 0.20503589928201435, + "grad_norm": 0.27468754467581213, + "learning_rate": 9.86286844071825e-06, + "loss": 0.2987, + "step": 2563 + }, + { + "epoch": 0.20511589768204636, + "grad_norm": 0.283725797229102, + "learning_rate": 9.862717736270834e-06, + "loss": 0.3267, + "step": 2564 + }, + { + "epoch": 0.20519589608207836, + "grad_norm": 0.26799631941171415, + "learning_rate": 9.862566950211175e-06, + "loss": 0.3306, + "step": 2565 + }, + { + "epoch": 0.20527589448211037, + "grad_norm": 0.3349905029087938, + "learning_rate": 9.862416082541803e-06, + "loss": 0.3778, + "step": 2566 + }, + { + "epoch": 0.20535589288214234, + "grad_norm": 0.2651739749496281, + "learning_rate": 9.862265133265248e-06, + "loss": 0.3521, + "step": 2567 + }, + { + "epoch": 0.20543589128217435, + "grad_norm": 0.33430554324665285, + "learning_rate": 9.862114102384045e-06, + "loss": 0.2847, + "step": 2568 + }, + { + "epoch": 0.20551588968220635, + "grad_norm": 0.31793197732536266, + "learning_rate": 9.861962989900732e-06, + "loss": 0.285, + "step": 2569 + }, + { + "epoch": 0.20559588808223836, + "grad_norm": 0.3048678447015555, + "learning_rate": 9.86181179581784e-06, + "loss": 0.3134, + "step": 2570 + }, + { + "epoch": 0.20567588648227036, + "grad_norm": 0.25872361853646114, + "learning_rate": 9.861660520137908e-06, + "loss": 0.3295, + "step": 2571 + }, + { + "epoch": 0.20575588488230234, + "grad_norm": 0.3152439438992059, + "learning_rate": 9.861509162863475e-06, + "loss": 0.2994, + "step": 2572 + }, + { + "epoch": 0.20583588328233435, + "grad_norm": 0.3180730359281916, + "learning_rate": 9.861357723997082e-06, + "loss": 0.2696, + "step": 2573 + }, + { + "epoch": 0.20591588168236635, + "grad_norm": 0.25642345422382495, + "learning_rate": 9.861206203541271e-06, + "loss": 0.3093, + "step": 2574 + }, + { + "epoch": 0.20599588008239836, + "grad_norm": 0.3066590860851833, + "learning_rate": 9.861054601498586e-06, + "loss": 0.279, + "step": 2575 + }, + { + "epoch": 0.20607587848243036, + "grad_norm": 0.2878753419464467, + "learning_rate": 9.860902917871566e-06, + "loss": 0.3188, + "step": 2576 + }, + { + "epoch": 0.20615587688246234, + "grad_norm": 0.2593035731459659, + "learning_rate": 9.860751152662762e-06, + "loss": 0.2917, + "step": 2577 + }, + { + "epoch": 0.20623587528249435, + "grad_norm": 0.3577923093024356, + "learning_rate": 9.860599305874721e-06, + "loss": 0.3451, + "step": 2578 + }, + { + "epoch": 0.20631587368252635, + "grad_norm": 0.2283692151709258, + "learning_rate": 9.860447377509989e-06, + "loss": 0.3336, + "step": 2579 + }, + { + "epoch": 0.20639587208255836, + "grad_norm": 0.2788654231907945, + "learning_rate": 9.860295367571115e-06, + "loss": 0.2968, + "step": 2580 + }, + { + "epoch": 0.20647587048259033, + "grad_norm": 0.27789653311159646, + "learning_rate": 9.860143276060655e-06, + "loss": 0.3022, + "step": 2581 + }, + { + "epoch": 0.20655586888262234, + "grad_norm": 0.2684105659566808, + "learning_rate": 9.859991102981159e-06, + "loss": 0.3194, + "step": 2582 + }, + { + "epoch": 0.20663586728265435, + "grad_norm": 0.24175737729596883, + "learning_rate": 9.859838848335178e-06, + "loss": 0.3166, + "step": 2583 + }, + { + "epoch": 0.20671586568268635, + "grad_norm": 0.22743378125783334, + "learning_rate": 9.859686512125271e-06, + "loss": 0.3576, + "step": 2584 + }, + { + "epoch": 0.20679586408271836, + "grad_norm": 0.28867984116195733, + "learning_rate": 9.859534094353994e-06, + "loss": 0.3021, + "step": 2585 + }, + { + "epoch": 0.20687586248275033, + "grad_norm": 0.310601753771648, + "learning_rate": 9.859381595023905e-06, + "loss": 0.2729, + "step": 2586 + }, + { + "epoch": 0.20695586088278234, + "grad_norm": 0.307048033907565, + "learning_rate": 9.859229014137564e-06, + "loss": 0.2583, + "step": 2587 + }, + { + "epoch": 0.20703585928281434, + "grad_norm": 0.4888119069224936, + "learning_rate": 9.85907635169753e-06, + "loss": 0.3714, + "step": 2588 + }, + { + "epoch": 0.20711585768284635, + "grad_norm": 0.3153847734855788, + "learning_rate": 9.858923607706366e-06, + "loss": 0.2576, + "step": 2589 + }, + { + "epoch": 0.20719585608287835, + "grad_norm": 0.2218734860720186, + "learning_rate": 9.858770782166636e-06, + "loss": 0.3715, + "step": 2590 + }, + { + "epoch": 0.20727585448291033, + "grad_norm": 0.2834264116325163, + "learning_rate": 9.858617875080904e-06, + "loss": 0.317, + "step": 2591 + }, + { + "epoch": 0.20735585288294234, + "grad_norm": 0.32784744837547514, + "learning_rate": 9.858464886451737e-06, + "loss": 0.2731, + "step": 2592 + }, + { + "epoch": 0.20743585128297434, + "grad_norm": 0.35870952179236076, + "learning_rate": 9.858311816281703e-06, + "loss": 0.2497, + "step": 2593 + }, + { + "epoch": 0.20751584968300635, + "grad_norm": 0.29527538196502084, + "learning_rate": 9.85815866457337e-06, + "loss": 0.2998, + "step": 2594 + }, + { + "epoch": 0.20759584808303833, + "grad_norm": 0.297987564398225, + "learning_rate": 9.858005431329309e-06, + "loss": 0.2902, + "step": 2595 + }, + { + "epoch": 0.20767584648307033, + "grad_norm": 0.32385108690487874, + "learning_rate": 9.857852116552094e-06, + "loss": 0.2652, + "step": 2596 + }, + { + "epoch": 0.20775584488310234, + "grad_norm": 0.25062474573476295, + "learning_rate": 9.857698720244294e-06, + "loss": 0.3187, + "step": 2597 + }, + { + "epoch": 0.20783584328313434, + "grad_norm": 0.31147193334369294, + "learning_rate": 9.857545242408485e-06, + "loss": 0.2814, + "step": 2598 + }, + { + "epoch": 0.20791584168316635, + "grad_norm": 0.31595150721051446, + "learning_rate": 9.857391683047244e-06, + "loss": 0.2742, + "step": 2599 + }, + { + "epoch": 0.20799584008319832, + "grad_norm": 0.43452394381523424, + "learning_rate": 9.857238042163147e-06, + "loss": 0.2829, + "step": 2600 + }, + { + "epoch": 0.20807583848323033, + "grad_norm": 0.31876869018677917, + "learning_rate": 9.857084319758772e-06, + "loss": 0.2677, + "step": 2601 + }, + { + "epoch": 0.20815583688326234, + "grad_norm": 0.3001456554338103, + "learning_rate": 9.856930515836701e-06, + "loss": 0.2962, + "step": 2602 + }, + { + "epoch": 0.20823583528329434, + "grad_norm": 0.32170999788480525, + "learning_rate": 9.856776630399514e-06, + "loss": 0.2757, + "step": 2603 + }, + { + "epoch": 0.20831583368332635, + "grad_norm": 0.33767545123662196, + "learning_rate": 9.856622663449796e-06, + "loss": 0.2782, + "step": 2604 + }, + { + "epoch": 0.20839583208335832, + "grad_norm": 0.2866285089547352, + "learning_rate": 9.856468614990127e-06, + "loss": 0.2908, + "step": 2605 + }, + { + "epoch": 0.20847583048339033, + "grad_norm": 0.3399330209114603, + "learning_rate": 9.856314485023094e-06, + "loss": 0.3188, + "step": 2606 + }, + { + "epoch": 0.20855582888342233, + "grad_norm": 0.27410844614186114, + "learning_rate": 9.856160273551285e-06, + "loss": 0.3375, + "step": 2607 + }, + { + "epoch": 0.20863582728345434, + "grad_norm": 0.4022746721880151, + "learning_rate": 9.856005980577287e-06, + "loss": 0.2875, + "step": 2608 + }, + { + "epoch": 0.20871582568348632, + "grad_norm": 0.24003499723735117, + "learning_rate": 9.855851606103691e-06, + "loss": 0.3559, + "step": 2609 + }, + { + "epoch": 0.20879582408351832, + "grad_norm": 0.253744530744458, + "learning_rate": 9.855697150133086e-06, + "loss": 0.3283, + "step": 2610 + }, + { + "epoch": 0.20887582248355033, + "grad_norm": 0.3025758528064443, + "learning_rate": 9.855542612668066e-06, + "loss": 0.2692, + "step": 2611 + }, + { + "epoch": 0.20895582088358233, + "grad_norm": 0.2837250700054285, + "learning_rate": 9.855387993711224e-06, + "loss": 0.3056, + "step": 2612 + }, + { + "epoch": 0.20903581928361434, + "grad_norm": 0.318261309439039, + "learning_rate": 9.855233293265153e-06, + "loss": 0.2707, + "step": 2613 + }, + { + "epoch": 0.20911581768364632, + "grad_norm": 0.3004698880274868, + "learning_rate": 9.855078511332455e-06, + "loss": 0.2937, + "step": 2614 + }, + { + "epoch": 0.20919581608367832, + "grad_norm": 0.8035849371079382, + "learning_rate": 9.85492364791572e-06, + "loss": 0.2819, + "step": 2615 + }, + { + "epoch": 0.20927581448371033, + "grad_norm": 0.5977262378878152, + "learning_rate": 9.854768703017553e-06, + "loss": 0.2764, + "step": 2616 + }, + { + "epoch": 0.20935581288374233, + "grad_norm": 0.3787727099325343, + "learning_rate": 9.854613676640551e-06, + "loss": 0.3052, + "step": 2617 + }, + { + "epoch": 0.20943581128377434, + "grad_norm": 0.2708423938342487, + "learning_rate": 9.854458568787319e-06, + "loss": 0.2947, + "step": 2618 + }, + { + "epoch": 0.20951580968380631, + "grad_norm": 0.26247177006934125, + "learning_rate": 9.854303379460458e-06, + "loss": 0.3177, + "step": 2619 + }, + { + "epoch": 0.20959580808383832, + "grad_norm": 0.3005402015428693, + "learning_rate": 9.854148108662574e-06, + "loss": 0.297, + "step": 2620 + }, + { + "epoch": 0.20967580648387033, + "grad_norm": 0.31853259802685274, + "learning_rate": 9.853992756396272e-06, + "loss": 0.266, + "step": 2621 + }, + { + "epoch": 0.20975580488390233, + "grad_norm": 0.2808401081517454, + "learning_rate": 9.853837322664159e-06, + "loss": 0.3335, + "step": 2622 + }, + { + "epoch": 0.2098358032839343, + "grad_norm": 0.32120698607564574, + "learning_rate": 9.853681807468845e-06, + "loss": 0.3193, + "step": 2623 + }, + { + "epoch": 0.2099158016839663, + "grad_norm": 0.2954754884644285, + "learning_rate": 9.853526210812939e-06, + "loss": 0.276, + "step": 2624 + }, + { + "epoch": 0.20999580008399832, + "grad_norm": 0.23618435555819356, + "learning_rate": 9.853370532699052e-06, + "loss": 0.3529, + "step": 2625 + }, + { + "epoch": 0.21007579848403032, + "grad_norm": 0.33736958083233115, + "learning_rate": 9.853214773129796e-06, + "loss": 0.3109, + "step": 2626 + }, + { + "epoch": 0.21015579688406233, + "grad_norm": 0.3100424290521666, + "learning_rate": 9.853058932107789e-06, + "loss": 0.2984, + "step": 2627 + }, + { + "epoch": 0.2102357952840943, + "grad_norm": 0.24733352365560216, + "learning_rate": 9.852903009635642e-06, + "loss": 0.3289, + "step": 2628 + }, + { + "epoch": 0.2103157936841263, + "grad_norm": 0.36558602648613103, + "learning_rate": 9.852747005715976e-06, + "loss": 0.3159, + "step": 2629 + }, + { + "epoch": 0.21039579208415832, + "grad_norm": 0.3392219252115747, + "learning_rate": 9.852590920351406e-06, + "loss": 0.3059, + "step": 2630 + }, + { + "epoch": 0.21047579048419032, + "grad_norm": 0.2739304012487695, + "learning_rate": 9.852434753544552e-06, + "loss": 0.2976, + "step": 2631 + }, + { + "epoch": 0.21055578888422233, + "grad_norm": 0.2661625745298221, + "learning_rate": 9.852278505298039e-06, + "loss": 0.3204, + "step": 2632 + }, + { + "epoch": 0.2106357872842543, + "grad_norm": 0.28307330441230977, + "learning_rate": 9.852122175614484e-06, + "loss": 0.3194, + "step": 2633 + }, + { + "epoch": 0.2107157856842863, + "grad_norm": 0.3325161548955037, + "learning_rate": 9.851965764496513e-06, + "loss": 0.2768, + "step": 2634 + }, + { + "epoch": 0.21079578408431832, + "grad_norm": 0.2466102774768095, + "learning_rate": 9.85180927194675e-06, + "loss": 0.3232, + "step": 2635 + }, + { + "epoch": 0.21087578248435032, + "grad_norm": 0.25269394461887223, + "learning_rate": 9.851652697967825e-06, + "loss": 0.3501, + "step": 2636 + }, + { + "epoch": 0.2109557808843823, + "grad_norm": 0.38021228774154636, + "learning_rate": 9.85149604256236e-06, + "loss": 0.2765, + "step": 2637 + }, + { + "epoch": 0.2110357792844143, + "grad_norm": 0.41340643597588905, + "learning_rate": 9.85133930573299e-06, + "loss": 0.3104, + "step": 2638 + }, + { + "epoch": 0.2111157776844463, + "grad_norm": 0.35090828512811995, + "learning_rate": 9.851182487482342e-06, + "loss": 0.2985, + "step": 2639 + }, + { + "epoch": 0.21119577608447831, + "grad_norm": 0.29919557305630723, + "learning_rate": 9.85102558781305e-06, + "loss": 0.314, + "step": 2640 + }, + { + "epoch": 0.21127577448451032, + "grad_norm": 0.32767191415306157, + "learning_rate": 9.850868606727745e-06, + "loss": 0.283, + "step": 2641 + }, + { + "epoch": 0.2113557728845423, + "grad_norm": 0.3609568296619404, + "learning_rate": 9.850711544229064e-06, + "loss": 0.3284, + "step": 2642 + }, + { + "epoch": 0.2114357712845743, + "grad_norm": 0.2804793339197424, + "learning_rate": 9.85055440031964e-06, + "loss": 0.3137, + "step": 2643 + }, + { + "epoch": 0.2115157696846063, + "grad_norm": 0.31692955604348066, + "learning_rate": 9.850397175002114e-06, + "loss": 0.2571, + "step": 2644 + }, + { + "epoch": 0.2115957680846383, + "grad_norm": 0.31280534854586156, + "learning_rate": 9.850239868279123e-06, + "loss": 0.2978, + "step": 2645 + }, + { + "epoch": 0.21167576648467032, + "grad_norm": 0.2461719714667789, + "learning_rate": 9.850082480153306e-06, + "loss": 0.3073, + "step": 2646 + }, + { + "epoch": 0.2117557648847023, + "grad_norm": 0.2438606334097585, + "learning_rate": 9.849925010627308e-06, + "loss": 0.3389, + "step": 2647 + }, + { + "epoch": 0.2118357632847343, + "grad_norm": 0.250981202875402, + "learning_rate": 9.849767459703767e-06, + "loss": 0.3223, + "step": 2648 + }, + { + "epoch": 0.2119157616847663, + "grad_norm": 0.27319398114539656, + "learning_rate": 9.84960982738533e-06, + "loss": 0.2959, + "step": 2649 + }, + { + "epoch": 0.2119957600847983, + "grad_norm": 0.2588723060701358, + "learning_rate": 9.849452113674644e-06, + "loss": 0.3395, + "step": 2650 + }, + { + "epoch": 0.2120757584848303, + "grad_norm": 0.3113830424072568, + "learning_rate": 9.849294318574353e-06, + "loss": 0.2764, + "step": 2651 + }, + { + "epoch": 0.2121557568848623, + "grad_norm": 0.3255334493629717, + "learning_rate": 9.849136442087106e-06, + "loss": 0.2656, + "step": 2652 + }, + { + "epoch": 0.2122357552848943, + "grad_norm": 0.2672229038025903, + "learning_rate": 9.848978484215554e-06, + "loss": 0.2985, + "step": 2653 + }, + { + "epoch": 0.2123157536849263, + "grad_norm": 0.3228294562911482, + "learning_rate": 9.848820444962348e-06, + "loss": 0.2608, + "step": 2654 + }, + { + "epoch": 0.2123957520849583, + "grad_norm": 0.27731408432430354, + "learning_rate": 9.848662324330139e-06, + "loss": 0.2955, + "step": 2655 + }, + { + "epoch": 0.2124757504849903, + "grad_norm": 0.2557945280358065, + "learning_rate": 9.848504122321581e-06, + "loss": 0.3484, + "step": 2656 + }, + { + "epoch": 0.2125557488850223, + "grad_norm": 0.2996916312690073, + "learning_rate": 9.848345838939329e-06, + "loss": 0.2703, + "step": 2657 + }, + { + "epoch": 0.2126357472850543, + "grad_norm": 0.309715122700965, + "learning_rate": 9.848187474186042e-06, + "loss": 0.2927, + "step": 2658 + }, + { + "epoch": 0.2127157456850863, + "grad_norm": 0.2992297600551249, + "learning_rate": 9.848029028064374e-06, + "loss": 0.2612, + "step": 2659 + }, + { + "epoch": 0.2127957440851183, + "grad_norm": 0.27926854080156416, + "learning_rate": 9.847870500576987e-06, + "loss": 0.2968, + "step": 2660 + }, + { + "epoch": 0.2128757424851503, + "grad_norm": 0.3346592535363179, + "learning_rate": 9.847711891726543e-06, + "loss": 0.2819, + "step": 2661 + }, + { + "epoch": 0.2129557408851823, + "grad_norm": 0.27941362800859104, + "learning_rate": 9.847553201515701e-06, + "loss": 0.2962, + "step": 2662 + }, + { + "epoch": 0.2130357392852143, + "grad_norm": 0.32169481724417076, + "learning_rate": 9.847394429947124e-06, + "loss": 0.2822, + "step": 2663 + }, + { + "epoch": 0.2131157376852463, + "grad_norm": 0.28639363064721884, + "learning_rate": 9.847235577023477e-06, + "loss": 0.3015, + "step": 2664 + }, + { + "epoch": 0.21319573608527828, + "grad_norm": 0.39301790129794173, + "learning_rate": 9.847076642747429e-06, + "loss": 0.2971, + "step": 2665 + }, + { + "epoch": 0.2132757344853103, + "grad_norm": 0.275527562346172, + "learning_rate": 9.846917627121644e-06, + "loss": 0.2941, + "step": 2666 + }, + { + "epoch": 0.2133557328853423, + "grad_norm": 0.26514752441994255, + "learning_rate": 9.846758530148793e-06, + "loss": 0.3269, + "step": 2667 + }, + { + "epoch": 0.2134357312853743, + "grad_norm": 0.27964971645020964, + "learning_rate": 9.846599351831546e-06, + "loss": 0.3041, + "step": 2668 + }, + { + "epoch": 0.2135157296854063, + "grad_norm": 0.3391421443429832, + "learning_rate": 9.84644009217257e-06, + "loss": 0.3131, + "step": 2669 + }, + { + "epoch": 0.21359572808543828, + "grad_norm": 0.3218926405260842, + "learning_rate": 9.846280751174547e-06, + "loss": 0.274, + "step": 2670 + }, + { + "epoch": 0.21367572648547029, + "grad_norm": 0.3003566018120908, + "learning_rate": 9.846121328840143e-06, + "loss": 0.3007, + "step": 2671 + }, + { + "epoch": 0.2137557248855023, + "grad_norm": 0.5875141239115877, + "learning_rate": 9.845961825172038e-06, + "loss": 0.2909, + "step": 2672 + }, + { + "epoch": 0.2138357232855343, + "grad_norm": 0.32763270391057214, + "learning_rate": 9.845802240172908e-06, + "loss": 0.3179, + "step": 2673 + }, + { + "epoch": 0.2139157216855663, + "grad_norm": 0.29805498012725107, + "learning_rate": 9.845642573845429e-06, + "loss": 0.2981, + "step": 2674 + }, + { + "epoch": 0.21399572008559828, + "grad_norm": 0.2894932349216363, + "learning_rate": 9.845482826192284e-06, + "loss": 0.3048, + "step": 2675 + }, + { + "epoch": 0.21407571848563028, + "grad_norm": 0.3526205180494495, + "learning_rate": 9.845322997216153e-06, + "loss": 0.2697, + "step": 2676 + }, + { + "epoch": 0.2141557168856623, + "grad_norm": 0.32036767967792773, + "learning_rate": 9.845163086919718e-06, + "loss": 0.2841, + "step": 2677 + }, + { + "epoch": 0.2142357152856943, + "grad_norm": 0.33478769480695575, + "learning_rate": 9.845003095305663e-06, + "loss": 0.2603, + "step": 2678 + }, + { + "epoch": 0.21431571368572627, + "grad_norm": 0.31702206193184534, + "learning_rate": 9.844843022376673e-06, + "loss": 0.3114, + "step": 2679 + }, + { + "epoch": 0.21439571208575828, + "grad_norm": 0.33810202678168527, + "learning_rate": 9.844682868135436e-06, + "loss": 0.2937, + "step": 2680 + }, + { + "epoch": 0.21447571048579028, + "grad_norm": 0.30718656185501, + "learning_rate": 9.844522632584636e-06, + "loss": 0.3157, + "step": 2681 + }, + { + "epoch": 0.2145557088858223, + "grad_norm": 0.23715494967858633, + "learning_rate": 9.844362315726967e-06, + "loss": 0.3422, + "step": 2682 + }, + { + "epoch": 0.2146357072858543, + "grad_norm": 0.34566301881893324, + "learning_rate": 9.844201917565119e-06, + "loss": 0.2728, + "step": 2683 + }, + { + "epoch": 0.21471570568588627, + "grad_norm": 0.3272919656944929, + "learning_rate": 9.844041438101781e-06, + "loss": 0.2648, + "step": 2684 + }, + { + "epoch": 0.21479570408591828, + "grad_norm": 0.28607274994312515, + "learning_rate": 9.843880877339648e-06, + "loss": 0.2856, + "step": 2685 + }, + { + "epoch": 0.21487570248595028, + "grad_norm": 0.31462507401100903, + "learning_rate": 9.843720235281416e-06, + "loss": 0.3092, + "step": 2686 + }, + { + "epoch": 0.2149557008859823, + "grad_norm": 0.2823580456228662, + "learning_rate": 9.843559511929777e-06, + "loss": 0.3329, + "step": 2687 + }, + { + "epoch": 0.2150356992860143, + "grad_norm": 0.26847056859120855, + "learning_rate": 9.843398707287433e-06, + "loss": 0.3682, + "step": 2688 + }, + { + "epoch": 0.21511569768604627, + "grad_norm": 0.33347020968981606, + "learning_rate": 9.843237821357082e-06, + "loss": 0.3108, + "step": 2689 + }, + { + "epoch": 0.21519569608607828, + "grad_norm": 0.33186024591765123, + "learning_rate": 9.843076854141422e-06, + "loss": 0.2978, + "step": 2690 + }, + { + "epoch": 0.21527569448611028, + "grad_norm": 0.3208605690650275, + "learning_rate": 9.842915805643156e-06, + "loss": 0.2797, + "step": 2691 + }, + { + "epoch": 0.21535569288614229, + "grad_norm": 0.29784265882873934, + "learning_rate": 9.842754675864988e-06, + "loss": 0.3138, + "step": 2692 + }, + { + "epoch": 0.21543569128617426, + "grad_norm": 0.29555145652749976, + "learning_rate": 9.84259346480962e-06, + "loss": 0.2786, + "step": 2693 + }, + { + "epoch": 0.21551568968620627, + "grad_norm": 0.27404166731387225, + "learning_rate": 9.842432172479759e-06, + "loss": 0.3245, + "step": 2694 + }, + { + "epoch": 0.21559568808623827, + "grad_norm": 0.31743892318865347, + "learning_rate": 9.842270798878111e-06, + "loss": 0.2671, + "step": 2695 + }, + { + "epoch": 0.21567568648627028, + "grad_norm": 0.2889267506297946, + "learning_rate": 9.842109344007386e-06, + "loss": 0.2865, + "step": 2696 + }, + { + "epoch": 0.21575568488630228, + "grad_norm": 0.31324974216615253, + "learning_rate": 9.841947807870293e-06, + "loss": 0.2966, + "step": 2697 + }, + { + "epoch": 0.21583568328633426, + "grad_norm": 0.32175102113058407, + "learning_rate": 9.841786190469542e-06, + "loss": 0.2806, + "step": 2698 + }, + { + "epoch": 0.21591568168636627, + "grad_norm": 0.3383437745680364, + "learning_rate": 9.841624491807846e-06, + "loss": 0.2849, + "step": 2699 + }, + { + "epoch": 0.21599568008639827, + "grad_norm": 0.3241243546944871, + "learning_rate": 9.84146271188792e-06, + "loss": 0.3025, + "step": 2700 + }, + { + "epoch": 0.21607567848643028, + "grad_norm": 0.2968437116223096, + "learning_rate": 9.841300850712479e-06, + "loss": 0.2936, + "step": 2701 + }, + { + "epoch": 0.21615567688646228, + "grad_norm": 0.2882845376907925, + "learning_rate": 9.84113890828424e-06, + "loss": 0.3052, + "step": 2702 + }, + { + "epoch": 0.21623567528649426, + "grad_norm": 0.33500568623570515, + "learning_rate": 9.840976884605916e-06, + "loss": 0.2784, + "step": 2703 + }, + { + "epoch": 0.21631567368652627, + "grad_norm": 0.3297713416465176, + "learning_rate": 9.840814779680234e-06, + "loss": 0.2649, + "step": 2704 + }, + { + "epoch": 0.21639567208655827, + "grad_norm": 0.3010535321913041, + "learning_rate": 9.840652593509909e-06, + "loss": 0.2516, + "step": 2705 + }, + { + "epoch": 0.21647567048659028, + "grad_norm": 0.24571714681986007, + "learning_rate": 9.840490326097667e-06, + "loss": 0.3246, + "step": 2706 + }, + { + "epoch": 0.21655566888662225, + "grad_norm": 0.2709688932949307, + "learning_rate": 9.840327977446226e-06, + "loss": 0.3261, + "step": 2707 + }, + { + "epoch": 0.21663566728665426, + "grad_norm": 0.36225667082066015, + "learning_rate": 9.840165547558317e-06, + "loss": 0.2843, + "step": 2708 + }, + { + "epoch": 0.21671566568668627, + "grad_norm": 0.30258306213515673, + "learning_rate": 9.840003036436661e-06, + "loss": 0.268, + "step": 2709 + }, + { + "epoch": 0.21679566408671827, + "grad_norm": 0.3250927940334589, + "learning_rate": 9.839840444083988e-06, + "loss": 0.2861, + "step": 2710 + }, + { + "epoch": 0.21687566248675028, + "grad_norm": 0.30794826838587813, + "learning_rate": 9.839677770503028e-06, + "loss": 0.2927, + "step": 2711 + }, + { + "epoch": 0.21695566088678225, + "grad_norm": 0.2920344666268955, + "learning_rate": 9.839515015696509e-06, + "loss": 0.2981, + "step": 2712 + }, + { + "epoch": 0.21703565928681426, + "grad_norm": 0.3433353209551213, + "learning_rate": 9.839352179667162e-06, + "loss": 0.2781, + "step": 2713 + }, + { + "epoch": 0.21711565768684626, + "grad_norm": 0.321036770205326, + "learning_rate": 9.839189262417721e-06, + "loss": 0.2556, + "step": 2714 + }, + { + "epoch": 0.21719565608687827, + "grad_norm": 0.33210105088529196, + "learning_rate": 9.83902626395092e-06, + "loss": 0.2669, + "step": 2715 + }, + { + "epoch": 0.21727565448691027, + "grad_norm": 0.3191826253855119, + "learning_rate": 9.838863184269496e-06, + "loss": 0.2693, + "step": 2716 + }, + { + "epoch": 0.21735565288694225, + "grad_norm": 0.27652427372262117, + "learning_rate": 9.838700023376184e-06, + "loss": 0.2868, + "step": 2717 + }, + { + "epoch": 0.21743565128697426, + "grad_norm": 0.31398565315749427, + "learning_rate": 9.838536781273725e-06, + "loss": 0.2857, + "step": 2718 + }, + { + "epoch": 0.21751564968700626, + "grad_norm": 0.27710338916230115, + "learning_rate": 9.838373457964856e-06, + "loss": 0.3442, + "step": 2719 + }, + { + "epoch": 0.21759564808703827, + "grad_norm": 0.26119808307859543, + "learning_rate": 9.838210053452318e-06, + "loss": 0.3374, + "step": 2720 + }, + { + "epoch": 0.21767564648707025, + "grad_norm": 0.364142839383261, + "learning_rate": 9.838046567738856e-06, + "loss": 0.276, + "step": 2721 + }, + { + "epoch": 0.21775564488710225, + "grad_norm": 0.23074399437991608, + "learning_rate": 9.837883000827214e-06, + "loss": 0.3596, + "step": 2722 + }, + { + "epoch": 0.21783564328713426, + "grad_norm": 0.4792957180415395, + "learning_rate": 9.837719352720133e-06, + "loss": 0.2765, + "step": 2723 + }, + { + "epoch": 0.21791564168716626, + "grad_norm": 0.30512474907492615, + "learning_rate": 9.837555623420363e-06, + "loss": 0.3101, + "step": 2724 + }, + { + "epoch": 0.21799564008719827, + "grad_norm": 0.4083352573008931, + "learning_rate": 9.83739181293065e-06, + "loss": 0.2663, + "step": 2725 + }, + { + "epoch": 0.21807563848723024, + "grad_norm": 0.347837399700398, + "learning_rate": 9.837227921253747e-06, + "loss": 0.2635, + "step": 2726 + }, + { + "epoch": 0.21815563688726225, + "grad_norm": 0.28583372002693014, + "learning_rate": 9.837063948392401e-06, + "loss": 0.3367, + "step": 2727 + }, + { + "epoch": 0.21823563528729426, + "grad_norm": 0.3239847965080378, + "learning_rate": 9.836899894349364e-06, + "loss": 0.2685, + "step": 2728 + }, + { + "epoch": 0.21831563368732626, + "grad_norm": 0.2655627145338744, + "learning_rate": 9.836735759127391e-06, + "loss": 0.3102, + "step": 2729 + }, + { + "epoch": 0.21839563208735827, + "grad_norm": 0.44753151826142484, + "learning_rate": 9.836571542729236e-06, + "loss": 0.269, + "step": 2730 + }, + { + "epoch": 0.21847563048739024, + "grad_norm": 0.3328863091845624, + "learning_rate": 9.836407245157656e-06, + "loss": 0.2742, + "step": 2731 + }, + { + "epoch": 0.21855562888742225, + "grad_norm": 0.3410880122098285, + "learning_rate": 9.836242866415406e-06, + "loss": 0.2599, + "step": 2732 + }, + { + "epoch": 0.21863562728745425, + "grad_norm": 0.24883249994566176, + "learning_rate": 9.836078406505249e-06, + "loss": 0.3308, + "step": 2733 + }, + { + "epoch": 0.21871562568748626, + "grad_norm": 0.2568036851508712, + "learning_rate": 9.83591386542994e-06, + "loss": 0.3312, + "step": 2734 + }, + { + "epoch": 0.21879562408751824, + "grad_norm": 0.3549816513595237, + "learning_rate": 9.835749243192245e-06, + "loss": 0.2959, + "step": 2735 + }, + { + "epoch": 0.21887562248755024, + "grad_norm": 0.20508016165095397, + "learning_rate": 9.835584539794925e-06, + "loss": 0.3618, + "step": 2736 + }, + { + "epoch": 0.21895562088758225, + "grad_norm": 0.2947832887422123, + "learning_rate": 9.835419755240743e-06, + "loss": 0.2985, + "step": 2737 + }, + { + "epoch": 0.21903561928761425, + "grad_norm": 0.23694130743523875, + "learning_rate": 9.835254889532466e-06, + "loss": 0.3296, + "step": 2738 + }, + { + "epoch": 0.21911561768764626, + "grad_norm": 0.30613778259311786, + "learning_rate": 9.835089942672862e-06, + "loss": 0.2953, + "step": 2739 + }, + { + "epoch": 0.21919561608767824, + "grad_norm": 0.2520373860967617, + "learning_rate": 9.834924914664696e-06, + "loss": 0.328, + "step": 2740 + }, + { + "epoch": 0.21927561448771024, + "grad_norm": 0.26820830544684077, + "learning_rate": 9.834759805510742e-06, + "loss": 0.2981, + "step": 2741 + }, + { + "epoch": 0.21935561288774225, + "grad_norm": 0.28310543265742477, + "learning_rate": 9.83459461521377e-06, + "loss": 0.3033, + "step": 2742 + }, + { + "epoch": 0.21943561128777425, + "grad_norm": 0.46558995996700625, + "learning_rate": 9.834429343776551e-06, + "loss": 0.3073, + "step": 2743 + }, + { + "epoch": 0.21951560968780626, + "grad_norm": 0.3419822007292441, + "learning_rate": 9.834263991201857e-06, + "loss": 0.277, + "step": 2744 + }, + { + "epoch": 0.21959560808783823, + "grad_norm": 0.33671400275896723, + "learning_rate": 9.834098557492467e-06, + "loss": 0.2817, + "step": 2745 + }, + { + "epoch": 0.21967560648787024, + "grad_norm": 0.3053576986719263, + "learning_rate": 9.833933042651156e-06, + "loss": 0.3021, + "step": 2746 + }, + { + "epoch": 0.21975560488790225, + "grad_norm": 0.278869592231678, + "learning_rate": 9.8337674466807e-06, + "loss": 0.317, + "step": 2747 + }, + { + "epoch": 0.21983560328793425, + "grad_norm": 0.2772025766159624, + "learning_rate": 9.833601769583883e-06, + "loss": 0.294, + "step": 2748 + }, + { + "epoch": 0.21991560168796623, + "grad_norm": 0.2762096890157505, + "learning_rate": 9.833436011363482e-06, + "loss": 0.3102, + "step": 2749 + }, + { + "epoch": 0.21999560008799823, + "grad_norm": 0.2962296325603438, + "learning_rate": 9.833270172022277e-06, + "loss": 0.2652, + "step": 2750 + }, + { + "epoch": 0.22007559848803024, + "grad_norm": 0.27892391636371006, + "learning_rate": 9.833104251563058e-06, + "loss": 0.3093, + "step": 2751 + }, + { + "epoch": 0.22015559688806224, + "grad_norm": 0.2826854057343321, + "learning_rate": 9.832938249988602e-06, + "loss": 0.2952, + "step": 2752 + }, + { + "epoch": 0.22023559528809425, + "grad_norm": 0.28984837723954787, + "learning_rate": 9.832772167301701e-06, + "loss": 0.2906, + "step": 2753 + }, + { + "epoch": 0.22031559368812623, + "grad_norm": 0.24474436996528706, + "learning_rate": 9.832606003505139e-06, + "loss": 0.3373, + "step": 2754 + }, + { + "epoch": 0.22039559208815823, + "grad_norm": 0.28531884139798375, + "learning_rate": 9.832439758601706e-06, + "loss": 0.2875, + "step": 2755 + }, + { + "epoch": 0.22047559048819024, + "grad_norm": 0.3889518278504408, + "learning_rate": 9.832273432594192e-06, + "loss": 0.2888, + "step": 2756 + }, + { + "epoch": 0.22055558888822224, + "grad_norm": 0.31358457851726484, + "learning_rate": 9.83210702548539e-06, + "loss": 0.2613, + "step": 2757 + }, + { + "epoch": 0.22063558728825425, + "grad_norm": 0.2818486488104054, + "learning_rate": 9.831940537278088e-06, + "loss": 0.2976, + "step": 2758 + }, + { + "epoch": 0.22071558568828623, + "grad_norm": 0.3041743534993681, + "learning_rate": 9.831773967975085e-06, + "loss": 0.3385, + "step": 2759 + }, + { + "epoch": 0.22079558408831823, + "grad_norm": 0.2657031713422216, + "learning_rate": 9.831607317579178e-06, + "loss": 0.2959, + "step": 2760 + }, + { + "epoch": 0.22087558248835024, + "grad_norm": 0.35126019099305517, + "learning_rate": 9.831440586093157e-06, + "loss": 0.2856, + "step": 2761 + }, + { + "epoch": 0.22095558088838224, + "grad_norm": 0.31539668613165267, + "learning_rate": 9.831273773519826e-06, + "loss": 0.2987, + "step": 2762 + }, + { + "epoch": 0.22103557928841422, + "grad_norm": 0.19301876229647127, + "learning_rate": 9.831106879861982e-06, + "loss": 0.3533, + "step": 2763 + }, + { + "epoch": 0.22111557768844622, + "grad_norm": 0.3597979432086649, + "learning_rate": 9.830939905122429e-06, + "loss": 0.2721, + "step": 2764 + }, + { + "epoch": 0.22119557608847823, + "grad_norm": 0.27028818646640096, + "learning_rate": 9.830772849303967e-06, + "loss": 0.3375, + "step": 2765 + }, + { + "epoch": 0.22127557448851023, + "grad_norm": 0.27609937566928244, + "learning_rate": 9.830605712409399e-06, + "loss": 0.2904, + "step": 2766 + }, + { + "epoch": 0.22135557288854224, + "grad_norm": 0.2708883982284736, + "learning_rate": 9.830438494441533e-06, + "loss": 0.2907, + "step": 2767 + }, + { + "epoch": 0.22143557128857422, + "grad_norm": 0.28664579065424456, + "learning_rate": 9.830271195403171e-06, + "loss": 0.2387, + "step": 2768 + }, + { + "epoch": 0.22151556968860622, + "grad_norm": 0.24268426947492366, + "learning_rate": 9.830103815297126e-06, + "loss": 0.3444, + "step": 2769 + }, + { + "epoch": 0.22159556808863823, + "grad_norm": 0.2584215309622741, + "learning_rate": 9.829936354126202e-06, + "loss": 0.3257, + "step": 2770 + }, + { + "epoch": 0.22167556648867023, + "grad_norm": 0.42756734149467834, + "learning_rate": 9.829768811893214e-06, + "loss": 0.2464, + "step": 2771 + }, + { + "epoch": 0.22175556488870224, + "grad_norm": 0.2956583654662576, + "learning_rate": 9.829601188600972e-06, + "loss": 0.2987, + "step": 2772 + }, + { + "epoch": 0.22183556328873422, + "grad_norm": 0.41135328763894985, + "learning_rate": 9.829433484252292e-06, + "loss": 0.2689, + "step": 2773 + }, + { + "epoch": 0.22191556168876622, + "grad_norm": 0.25479562674346473, + "learning_rate": 9.829265698849983e-06, + "loss": 0.3213, + "step": 2774 + }, + { + "epoch": 0.22199556008879823, + "grad_norm": 0.2619112180027783, + "learning_rate": 9.829097832396864e-06, + "loss": 0.3338, + "step": 2775 + }, + { + "epoch": 0.22207555848883023, + "grad_norm": 0.3181700521634343, + "learning_rate": 9.828929884895753e-06, + "loss": 0.2803, + "step": 2776 + }, + { + "epoch": 0.2221555568888622, + "grad_norm": 0.2425961994417845, + "learning_rate": 9.82876185634947e-06, + "loss": 0.3154, + "step": 2777 + }, + { + "epoch": 0.22223555528889422, + "grad_norm": 0.25666679399800707, + "learning_rate": 9.82859374676083e-06, + "loss": 0.3398, + "step": 2778 + }, + { + "epoch": 0.22231555368892622, + "grad_norm": 0.310089123725397, + "learning_rate": 9.828425556132659e-06, + "loss": 0.2835, + "step": 2779 + }, + { + "epoch": 0.22239555208895823, + "grad_norm": 0.23590066745262112, + "learning_rate": 9.828257284467778e-06, + "loss": 0.3345, + "step": 2780 + }, + { + "epoch": 0.22247555048899023, + "grad_norm": 0.3218966870135753, + "learning_rate": 9.828088931769012e-06, + "loss": 0.3018, + "step": 2781 + }, + { + "epoch": 0.2225555488890222, + "grad_norm": 0.3278771663091074, + "learning_rate": 9.827920498039185e-06, + "loss": 0.2778, + "step": 2782 + }, + { + "epoch": 0.22263554728905421, + "grad_norm": 0.3259407018457449, + "learning_rate": 9.827751983281126e-06, + "loss": 0.3144, + "step": 2783 + }, + { + "epoch": 0.22271554568908622, + "grad_norm": 0.21358686684524758, + "learning_rate": 9.827583387497664e-06, + "loss": 0.3469, + "step": 2784 + }, + { + "epoch": 0.22279554408911822, + "grad_norm": 0.2969374554508598, + "learning_rate": 9.827414710691624e-06, + "loss": 0.2954, + "step": 2785 + }, + { + "epoch": 0.22287554248915023, + "grad_norm": 0.2722321939915326, + "learning_rate": 9.82724595286584e-06, + "loss": 0.3288, + "step": 2786 + }, + { + "epoch": 0.2229555408891822, + "grad_norm": 0.3307719806588941, + "learning_rate": 9.827077114023145e-06, + "loss": 0.2831, + "step": 2787 + }, + { + "epoch": 0.2230355392892142, + "grad_norm": 0.32519824964350474, + "learning_rate": 9.82690819416637e-06, + "loss": 0.2936, + "step": 2788 + }, + { + "epoch": 0.22311553768924622, + "grad_norm": 0.33243564456585944, + "learning_rate": 9.826739193298353e-06, + "loss": 0.2761, + "step": 2789 + }, + { + "epoch": 0.22319553608927822, + "grad_norm": 0.3031233247548669, + "learning_rate": 9.826570111421929e-06, + "loss": 0.313, + "step": 2790 + }, + { + "epoch": 0.2232755344893102, + "grad_norm": 0.32889574985144054, + "learning_rate": 9.826400948539935e-06, + "loss": 0.28, + "step": 2791 + }, + { + "epoch": 0.2233555328893422, + "grad_norm": 0.29675962602250633, + "learning_rate": 9.826231704655212e-06, + "loss": 0.2894, + "step": 2792 + }, + { + "epoch": 0.2234355312893742, + "grad_norm": 0.3264323016476866, + "learning_rate": 9.826062379770598e-06, + "loss": 0.2668, + "step": 2793 + }, + { + "epoch": 0.22351552968940622, + "grad_norm": 0.3056468078206645, + "learning_rate": 9.825892973888937e-06, + "loss": 0.3313, + "step": 2794 + }, + { + "epoch": 0.22359552808943822, + "grad_norm": 0.33479481297415675, + "learning_rate": 9.82572348701307e-06, + "loss": 0.2812, + "step": 2795 + }, + { + "epoch": 0.2236755264894702, + "grad_norm": 0.24312384574232604, + "learning_rate": 9.825553919145845e-06, + "loss": 0.3256, + "step": 2796 + }, + { + "epoch": 0.2237555248895022, + "grad_norm": 0.33455726808038416, + "learning_rate": 9.825384270290104e-06, + "loss": 0.2688, + "step": 2797 + }, + { + "epoch": 0.2238355232895342, + "grad_norm": 0.20786618013182903, + "learning_rate": 9.825214540448698e-06, + "loss": 0.336, + "step": 2798 + }, + { + "epoch": 0.22391552168956622, + "grad_norm": 0.3902924261106954, + "learning_rate": 9.825044729624472e-06, + "loss": 0.2951, + "step": 2799 + }, + { + "epoch": 0.22399552008959822, + "grad_norm": 0.36141011328120415, + "learning_rate": 9.824874837820278e-06, + "loss": 0.2645, + "step": 2800 + }, + { + "epoch": 0.2240755184896302, + "grad_norm": 0.29397943433504675, + "learning_rate": 9.824704865038967e-06, + "loss": 0.3014, + "step": 2801 + }, + { + "epoch": 0.2241555168896622, + "grad_norm": 0.3088152417119995, + "learning_rate": 9.824534811283393e-06, + "loss": 0.2766, + "step": 2802 + }, + { + "epoch": 0.2242355152896942, + "grad_norm": 0.32536508113358453, + "learning_rate": 9.824364676556406e-06, + "loss": 0.264, + "step": 2803 + }, + { + "epoch": 0.22431551368972621, + "grad_norm": 0.3222333487637704, + "learning_rate": 9.824194460860867e-06, + "loss": 0.2622, + "step": 2804 + }, + { + "epoch": 0.2243955120897582, + "grad_norm": 0.28243423747395074, + "learning_rate": 9.824024164199627e-06, + "loss": 0.2957, + "step": 2805 + }, + { + "epoch": 0.2244755104897902, + "grad_norm": 0.3430101187383174, + "learning_rate": 9.82385378657555e-06, + "loss": 0.2794, + "step": 2806 + }, + { + "epoch": 0.2245555088898222, + "grad_norm": 0.30170529761601594, + "learning_rate": 9.823683327991492e-06, + "loss": 0.2998, + "step": 2807 + }, + { + "epoch": 0.2246355072898542, + "grad_norm": 0.2792551926949058, + "learning_rate": 9.823512788450313e-06, + "loss": 0.2887, + "step": 2808 + }, + { + "epoch": 0.2247155056898862, + "grad_norm": 0.3565739231763268, + "learning_rate": 9.82334216795488e-06, + "loss": 0.2981, + "step": 2809 + }, + { + "epoch": 0.2247955040899182, + "grad_norm": 0.29247348419244473, + "learning_rate": 9.82317146650805e-06, + "loss": 0.3111, + "step": 2810 + }, + { + "epoch": 0.2248755024899502, + "grad_norm": 0.35156310888539855, + "learning_rate": 9.823000684112691e-06, + "loss": 0.2757, + "step": 2811 + }, + { + "epoch": 0.2249555008899822, + "grad_norm": 0.6380107811587993, + "learning_rate": 9.822829820771671e-06, + "loss": 0.2677, + "step": 2812 + }, + { + "epoch": 0.2250354992900142, + "grad_norm": 0.302627695018539, + "learning_rate": 9.822658876487854e-06, + "loss": 0.3083, + "step": 2813 + }, + { + "epoch": 0.2251154976900462, + "grad_norm": 0.3031087832758983, + "learning_rate": 9.822487851264113e-06, + "loss": 0.3096, + "step": 2814 + }, + { + "epoch": 0.2251954960900782, + "grad_norm": 0.34693579892961685, + "learning_rate": 9.822316745103316e-06, + "loss": 0.2824, + "step": 2815 + }, + { + "epoch": 0.2252754944901102, + "grad_norm": 0.3302604864919298, + "learning_rate": 9.822145558008333e-06, + "loss": 0.2822, + "step": 2816 + }, + { + "epoch": 0.2253554928901422, + "grad_norm": 0.393393172020002, + "learning_rate": 9.821974289982042e-06, + "loss": 0.2569, + "step": 2817 + }, + { + "epoch": 0.2254354912901742, + "grad_norm": 0.28973125401530564, + "learning_rate": 9.821802941027314e-06, + "loss": 0.2965, + "step": 2818 + }, + { + "epoch": 0.22551548969020618, + "grad_norm": 0.36986866250518635, + "learning_rate": 9.821631511147025e-06, + "loss": 0.3046, + "step": 2819 + }, + { + "epoch": 0.2255954880902382, + "grad_norm": 0.31753481111825277, + "learning_rate": 9.821460000344053e-06, + "loss": 0.2985, + "step": 2820 + }, + { + "epoch": 0.2256754864902702, + "grad_norm": 0.29486798651425994, + "learning_rate": 9.821288408621276e-06, + "loss": 0.2836, + "step": 2821 + }, + { + "epoch": 0.2257554848903022, + "grad_norm": 0.3394435778688585, + "learning_rate": 9.821116735981573e-06, + "loss": 0.284, + "step": 2822 + }, + { + "epoch": 0.2258354832903342, + "grad_norm": 0.38080315536991344, + "learning_rate": 9.820944982427826e-06, + "loss": 0.3433, + "step": 2823 + }, + { + "epoch": 0.22591548169036618, + "grad_norm": 0.5712058756570679, + "learning_rate": 9.820773147962919e-06, + "loss": 0.2803, + "step": 2824 + }, + { + "epoch": 0.2259954800903982, + "grad_norm": 0.2730113783413178, + "learning_rate": 9.820601232589735e-06, + "loss": 0.3187, + "step": 2825 + }, + { + "epoch": 0.2260754784904302, + "grad_norm": 0.3066507803765661, + "learning_rate": 9.820429236311158e-06, + "loss": 0.2674, + "step": 2826 + }, + { + "epoch": 0.2261554768904622, + "grad_norm": 0.32097813625075844, + "learning_rate": 9.820257159130076e-06, + "loss": 0.303, + "step": 2827 + }, + { + "epoch": 0.2262354752904942, + "grad_norm": 0.25517862246193185, + "learning_rate": 9.820085001049377e-06, + "loss": 0.3375, + "step": 2828 + }, + { + "epoch": 0.22631547369052618, + "grad_norm": 0.33969974736772013, + "learning_rate": 9.81991276207195e-06, + "loss": 0.2863, + "step": 2829 + }, + { + "epoch": 0.2263954720905582, + "grad_norm": 0.3449054338402782, + "learning_rate": 9.819740442200685e-06, + "loss": 0.2887, + "step": 2830 + }, + { + "epoch": 0.2264754704905902, + "grad_norm": 0.26241390360242645, + "learning_rate": 9.819568041438477e-06, + "loss": 0.3215, + "step": 2831 + }, + { + "epoch": 0.2265554688906222, + "grad_norm": 0.28837138122269645, + "learning_rate": 9.819395559788216e-06, + "loss": 0.3085, + "step": 2832 + }, + { + "epoch": 0.22663546729065417, + "grad_norm": 0.24610936941864156, + "learning_rate": 9.819222997252798e-06, + "loss": 0.3277, + "step": 2833 + }, + { + "epoch": 0.22671546569068618, + "grad_norm": 0.2895707100625207, + "learning_rate": 9.819050353835117e-06, + "loss": 0.3119, + "step": 2834 + }, + { + "epoch": 0.22679546409071819, + "grad_norm": 0.2981039787348253, + "learning_rate": 9.818877629538077e-06, + "loss": 0.3098, + "step": 2835 + }, + { + "epoch": 0.2268754624907502, + "grad_norm": 0.34896706608987865, + "learning_rate": 9.818704824364571e-06, + "loss": 0.287, + "step": 2836 + }, + { + "epoch": 0.2269554608907822, + "grad_norm": 0.30931190767962996, + "learning_rate": 9.818531938317499e-06, + "loss": 0.2552, + "step": 2837 + }, + { + "epoch": 0.22703545929081417, + "grad_norm": 0.35493825927360395, + "learning_rate": 9.818358971399767e-06, + "loss": 0.2686, + "step": 2838 + }, + { + "epoch": 0.22711545769084618, + "grad_norm": 0.28195964122994477, + "learning_rate": 9.818185923614274e-06, + "loss": 0.2896, + "step": 2839 + }, + { + "epoch": 0.22719545609087818, + "grad_norm": 0.31771860145746555, + "learning_rate": 9.818012794963927e-06, + "loss": 0.2785, + "step": 2840 + }, + { + "epoch": 0.2272754544909102, + "grad_norm": 0.2379791786726089, + "learning_rate": 9.817839585451629e-06, + "loss": 0.3305, + "step": 2841 + }, + { + "epoch": 0.2273554528909422, + "grad_norm": 0.21446388280018136, + "learning_rate": 9.81766629508029e-06, + "loss": 0.3605, + "step": 2842 + }, + { + "epoch": 0.22743545129097417, + "grad_norm": 0.33536153488175674, + "learning_rate": 9.817492923852817e-06, + "loss": 0.2795, + "step": 2843 + }, + { + "epoch": 0.22751544969100618, + "grad_norm": 0.32965559388052224, + "learning_rate": 9.817319471772117e-06, + "loss": 0.3097, + "step": 2844 + }, + { + "epoch": 0.22759544809103818, + "grad_norm": 0.27789479640478076, + "learning_rate": 9.817145938841106e-06, + "loss": 0.3017, + "step": 2845 + }, + { + "epoch": 0.2276754464910702, + "grad_norm": 0.29116083155246303, + "learning_rate": 9.816972325062694e-06, + "loss": 0.2942, + "step": 2846 + }, + { + "epoch": 0.22775544489110217, + "grad_norm": 0.2852941848512344, + "learning_rate": 9.816798630439794e-06, + "loss": 0.2826, + "step": 2847 + }, + { + "epoch": 0.22783544329113417, + "grad_norm": 0.3390132545668271, + "learning_rate": 9.816624854975324e-06, + "loss": 0.2584, + "step": 2848 + }, + { + "epoch": 0.22791544169116618, + "grad_norm": 4.552299893741447, + "learning_rate": 9.816450998672195e-06, + "loss": 0.2518, + "step": 2849 + }, + { + "epoch": 0.22799544009119818, + "grad_norm": 0.5354968133124064, + "learning_rate": 9.816277061533332e-06, + "loss": 0.3024, + "step": 2850 + }, + { + "epoch": 0.2280754384912302, + "grad_norm": 0.2379432136536726, + "learning_rate": 9.816103043561648e-06, + "loss": 0.3383, + "step": 2851 + }, + { + "epoch": 0.22815543689126216, + "grad_norm": 0.2841401912723477, + "learning_rate": 9.815928944760068e-06, + "loss": 0.2888, + "step": 2852 + }, + { + "epoch": 0.22823543529129417, + "grad_norm": 0.350176396847407, + "learning_rate": 9.815754765131511e-06, + "loss": 0.3164, + "step": 2853 + }, + { + "epoch": 0.22831543369132618, + "grad_norm": 0.32359760961803513, + "learning_rate": 9.815580504678903e-06, + "loss": 0.2856, + "step": 2854 + }, + { + "epoch": 0.22839543209135818, + "grad_norm": 0.36137550041740946, + "learning_rate": 9.815406163405165e-06, + "loss": 0.2441, + "step": 2855 + }, + { + "epoch": 0.22847543049139019, + "grad_norm": 0.44823054821236286, + "learning_rate": 9.815231741313227e-06, + "loss": 0.3009, + "step": 2856 + }, + { + "epoch": 0.22855542889142216, + "grad_norm": 0.30634504078237407, + "learning_rate": 9.815057238406015e-06, + "loss": 0.2918, + "step": 2857 + }, + { + "epoch": 0.22863542729145417, + "grad_norm": 0.32786766115995913, + "learning_rate": 9.814882654686456e-06, + "loss": 0.3012, + "step": 2858 + }, + { + "epoch": 0.22871542569148617, + "grad_norm": 0.3548557466306779, + "learning_rate": 9.814707990157482e-06, + "loss": 0.3124, + "step": 2859 + }, + { + "epoch": 0.22879542409151818, + "grad_norm": 0.34656071151942575, + "learning_rate": 9.814533244822025e-06, + "loss": 0.2826, + "step": 2860 + }, + { + "epoch": 0.22887542249155016, + "grad_norm": 0.3028679335179603, + "learning_rate": 9.814358418683014e-06, + "loss": 0.2901, + "step": 2861 + }, + { + "epoch": 0.22895542089158216, + "grad_norm": 0.3006034706355257, + "learning_rate": 9.814183511743387e-06, + "loss": 0.2968, + "step": 2862 + }, + { + "epoch": 0.22903541929161417, + "grad_norm": 0.28232449889379063, + "learning_rate": 9.814008524006077e-06, + "loss": 0.3064, + "step": 2863 + }, + { + "epoch": 0.22911541769164617, + "grad_norm": 0.3575040397575339, + "learning_rate": 9.813833455474025e-06, + "loss": 0.2946, + "step": 2864 + }, + { + "epoch": 0.22919541609167818, + "grad_norm": 0.30024328142386997, + "learning_rate": 9.813658306150164e-06, + "loss": 0.3045, + "step": 2865 + }, + { + "epoch": 0.22927541449171016, + "grad_norm": 0.2678906401170984, + "learning_rate": 9.813483076037438e-06, + "loss": 0.3547, + "step": 2866 + }, + { + "epoch": 0.22935541289174216, + "grad_norm": 0.350971199324955, + "learning_rate": 9.813307765138784e-06, + "loss": 0.2745, + "step": 2867 + }, + { + "epoch": 0.22943541129177417, + "grad_norm": 0.27679514295409147, + "learning_rate": 9.813132373457147e-06, + "loss": 0.3323, + "step": 2868 + }, + { + "epoch": 0.22951540969180617, + "grad_norm": 0.4476153605934737, + "learning_rate": 9.81295690099547e-06, + "loss": 0.2921, + "step": 2869 + }, + { + "epoch": 0.22959540809183818, + "grad_norm": 0.32952290189488787, + "learning_rate": 9.812781347756697e-06, + "loss": 0.2911, + "step": 2870 + }, + { + "epoch": 0.22967540649187015, + "grad_norm": 0.37902207056552817, + "learning_rate": 9.812605713743775e-06, + "loss": 0.281, + "step": 2871 + }, + { + "epoch": 0.22975540489190216, + "grad_norm": 0.3627156164743356, + "learning_rate": 9.812429998959652e-06, + "loss": 0.2654, + "step": 2872 + }, + { + "epoch": 0.22983540329193416, + "grad_norm": 0.2777324422142774, + "learning_rate": 9.812254203407278e-06, + "loss": 0.3219, + "step": 2873 + }, + { + "epoch": 0.22991540169196617, + "grad_norm": 0.33421424571686753, + "learning_rate": 9.8120783270896e-06, + "loss": 0.291, + "step": 2874 + }, + { + "epoch": 0.22999540009199815, + "grad_norm": 0.25237272088786455, + "learning_rate": 9.811902370009576e-06, + "loss": 0.337, + "step": 2875 + }, + { + "epoch": 0.23007539849203015, + "grad_norm": 0.34807911249487616, + "learning_rate": 9.811726332170153e-06, + "loss": 0.2677, + "step": 2876 + }, + { + "epoch": 0.23015539689206216, + "grad_norm": 0.3813077855344696, + "learning_rate": 9.811550213574287e-06, + "loss": 0.2788, + "step": 2877 + }, + { + "epoch": 0.23023539529209416, + "grad_norm": 0.3430368653491586, + "learning_rate": 9.811374014224935e-06, + "loss": 0.2698, + "step": 2878 + }, + { + "epoch": 0.23031539369212617, + "grad_norm": 0.28040863769742647, + "learning_rate": 9.811197734125055e-06, + "loss": 0.326, + "step": 2879 + }, + { + "epoch": 0.23039539209215815, + "grad_norm": 0.2893105404958831, + "learning_rate": 9.811021373277603e-06, + "loss": 0.3094, + "step": 2880 + }, + { + "epoch": 0.23047539049219015, + "grad_norm": 0.3016796605357451, + "learning_rate": 9.810844931685542e-06, + "loss": 0.299, + "step": 2881 + }, + { + "epoch": 0.23055538889222216, + "grad_norm": 0.28737541412543294, + "learning_rate": 9.81066840935183e-06, + "loss": 0.3388, + "step": 2882 + }, + { + "epoch": 0.23063538729225416, + "grad_norm": 0.3213815460091929, + "learning_rate": 9.810491806279432e-06, + "loss": 0.2631, + "step": 2883 + }, + { + "epoch": 0.23071538569228617, + "grad_norm": 0.3202434828369608, + "learning_rate": 9.810315122471309e-06, + "loss": 0.296, + "step": 2884 + }, + { + "epoch": 0.23079538409231815, + "grad_norm": 0.2878981459651451, + "learning_rate": 9.81013835793043e-06, + "loss": 0.3132, + "step": 2885 + }, + { + "epoch": 0.23087538249235015, + "grad_norm": 0.2942354562993355, + "learning_rate": 9.80996151265976e-06, + "loss": 0.2945, + "step": 2886 + }, + { + "epoch": 0.23095538089238216, + "grad_norm": 0.3161415658395395, + "learning_rate": 9.809784586662268e-06, + "loss": 0.2702, + "step": 2887 + }, + { + "epoch": 0.23103537929241416, + "grad_norm": 0.33082684171376475, + "learning_rate": 9.809607579940922e-06, + "loss": 0.3352, + "step": 2888 + }, + { + "epoch": 0.23111537769244614, + "grad_norm": 0.3431429291501672, + "learning_rate": 9.809430492498693e-06, + "loss": 0.2654, + "step": 2889 + }, + { + "epoch": 0.23119537609247814, + "grad_norm": 0.30387802690260884, + "learning_rate": 9.809253324338554e-06, + "loss": 0.3197, + "step": 2890 + }, + { + "epoch": 0.23127537449251015, + "grad_norm": 0.28981388177797623, + "learning_rate": 9.809076075463476e-06, + "loss": 0.3071, + "step": 2891 + }, + { + "epoch": 0.23135537289254215, + "grad_norm": 0.3477562458310773, + "learning_rate": 9.808898745876439e-06, + "loss": 0.2761, + "step": 2892 + }, + { + "epoch": 0.23143537129257416, + "grad_norm": 0.3633355817237512, + "learning_rate": 9.808721335580414e-06, + "loss": 0.2866, + "step": 2893 + }, + { + "epoch": 0.23151536969260614, + "grad_norm": 0.32200710071978406, + "learning_rate": 9.80854384457838e-06, + "loss": 0.2797, + "step": 2894 + }, + { + "epoch": 0.23159536809263814, + "grad_norm": 0.2957026891113077, + "learning_rate": 9.808366272873317e-06, + "loss": 0.2943, + "step": 2895 + }, + { + "epoch": 0.23167536649267015, + "grad_norm": 0.1975566005310841, + "learning_rate": 9.808188620468204e-06, + "loss": 0.386, + "step": 2896 + }, + { + "epoch": 0.23175536489270215, + "grad_norm": 0.33904846839535463, + "learning_rate": 9.808010887366024e-06, + "loss": 0.2773, + "step": 2897 + }, + { + "epoch": 0.23183536329273416, + "grad_norm": 0.3248290532592434, + "learning_rate": 9.807833073569758e-06, + "loss": 0.2765, + "step": 2898 + }, + { + "epoch": 0.23191536169276614, + "grad_norm": 0.30368960953369173, + "learning_rate": 9.807655179082392e-06, + "loss": 0.2841, + "step": 2899 + }, + { + "epoch": 0.23199536009279814, + "grad_norm": 0.3201841405243278, + "learning_rate": 9.80747720390691e-06, + "loss": 0.2795, + "step": 2900 + }, + { + "epoch": 0.23207535849283015, + "grad_norm": 0.29593085068935, + "learning_rate": 9.807299148046301e-06, + "loss": 0.2971, + "step": 2901 + }, + { + "epoch": 0.23215535689286215, + "grad_norm": 0.33258594183800666, + "learning_rate": 9.807121011503552e-06, + "loss": 0.3167, + "step": 2902 + }, + { + "epoch": 0.23223535529289413, + "grad_norm": 0.2951744922122382, + "learning_rate": 9.806942794281654e-06, + "loss": 0.2816, + "step": 2903 + }, + { + "epoch": 0.23231535369292614, + "grad_norm": 0.2822058872214629, + "learning_rate": 9.806764496383595e-06, + "loss": 0.2869, + "step": 2904 + }, + { + "epoch": 0.23239535209295814, + "grad_norm": 0.30245379605556677, + "learning_rate": 9.80658611781237e-06, + "loss": 0.303, + "step": 2905 + }, + { + "epoch": 0.23247535049299015, + "grad_norm": 0.2613250120667393, + "learning_rate": 9.806407658570973e-06, + "loss": 0.3321, + "step": 2906 + }, + { + "epoch": 0.23255534889302215, + "grad_norm": 0.3833071678108477, + "learning_rate": 9.806229118662398e-06, + "loss": 0.259, + "step": 2907 + }, + { + "epoch": 0.23263534729305413, + "grad_norm": 0.3256194876302207, + "learning_rate": 9.806050498089643e-06, + "loss": 0.2662, + "step": 2908 + }, + { + "epoch": 0.23271534569308613, + "grad_norm": 0.2544751485067872, + "learning_rate": 9.805871796855704e-06, + "loss": 0.335, + "step": 2909 + }, + { + "epoch": 0.23279534409311814, + "grad_norm": 0.31840997600911863, + "learning_rate": 9.80569301496358e-06, + "loss": 0.2838, + "step": 2910 + }, + { + "epoch": 0.23287534249315014, + "grad_norm": 0.2839815610398486, + "learning_rate": 9.805514152416274e-06, + "loss": 0.2934, + "step": 2911 + }, + { + "epoch": 0.23295534089318215, + "grad_norm": 0.29603915239964707, + "learning_rate": 9.805335209216787e-06, + "loss": 0.2947, + "step": 2912 + }, + { + "epoch": 0.23303533929321413, + "grad_norm": 0.30802267809517675, + "learning_rate": 9.80515618536812e-06, + "loss": 0.3018, + "step": 2913 + }, + { + "epoch": 0.23311533769324613, + "grad_norm": 0.2875083640744617, + "learning_rate": 9.80497708087328e-06, + "loss": 0.2879, + "step": 2914 + }, + { + "epoch": 0.23319533609327814, + "grad_norm": 0.286485238851757, + "learning_rate": 9.80479789573527e-06, + "loss": 0.2981, + "step": 2915 + }, + { + "epoch": 0.23327533449331014, + "grad_norm": 0.31931103348272644, + "learning_rate": 9.8046186299571e-06, + "loss": 0.2853, + "step": 2916 + }, + { + "epoch": 0.23335533289334212, + "grad_norm": 0.2809465059431971, + "learning_rate": 9.804439283541781e-06, + "loss": 0.2938, + "step": 2917 + }, + { + "epoch": 0.23343533129337413, + "grad_norm": 0.371183069899213, + "learning_rate": 9.804259856492318e-06, + "loss": 0.3192, + "step": 2918 + }, + { + "epoch": 0.23351532969340613, + "grad_norm": 0.3095768129349247, + "learning_rate": 9.804080348811725e-06, + "loss": 0.2683, + "step": 2919 + }, + { + "epoch": 0.23359532809343814, + "grad_norm": 0.2884060080492279, + "learning_rate": 9.803900760503015e-06, + "loss": 0.3113, + "step": 2920 + }, + { + "epoch": 0.23367532649347014, + "grad_norm": 0.18420338116983706, + "learning_rate": 9.803721091569201e-06, + "loss": 0.3569, + "step": 2921 + }, + { + "epoch": 0.23375532489350212, + "grad_norm": 0.3866165227233019, + "learning_rate": 9.803541342013299e-06, + "loss": 0.2789, + "step": 2922 + }, + { + "epoch": 0.23383532329353413, + "grad_norm": 0.3136911516691616, + "learning_rate": 9.803361511838324e-06, + "loss": 0.292, + "step": 2923 + }, + { + "epoch": 0.23391532169356613, + "grad_norm": 0.3139841046105295, + "learning_rate": 9.803181601047296e-06, + "loss": 0.2864, + "step": 2924 + }, + { + "epoch": 0.23399532009359814, + "grad_norm": 0.7156320508377609, + "learning_rate": 9.803001609643234e-06, + "loss": 0.2872, + "step": 2925 + }, + { + "epoch": 0.23407531849363014, + "grad_norm": 0.3093972189497827, + "learning_rate": 9.802821537629162e-06, + "loss": 0.2689, + "step": 2926 + }, + { + "epoch": 0.23415531689366212, + "grad_norm": 0.34576995308752667, + "learning_rate": 9.802641385008096e-06, + "loss": 0.2718, + "step": 2927 + }, + { + "epoch": 0.23423531529369412, + "grad_norm": 0.2669261479024423, + "learning_rate": 9.802461151783064e-06, + "loss": 0.2953, + "step": 2928 + }, + { + "epoch": 0.23431531369372613, + "grad_norm": 0.2778605474588341, + "learning_rate": 9.80228083795709e-06, + "loss": 0.2938, + "step": 2929 + }, + { + "epoch": 0.23439531209375813, + "grad_norm": 0.3153459998291426, + "learning_rate": 9.8021004435332e-06, + "loss": 0.274, + "step": 2930 + }, + { + "epoch": 0.2344753104937901, + "grad_norm": 0.3042643397962928, + "learning_rate": 9.80191996851442e-06, + "loss": 0.3049, + "step": 2931 + }, + { + "epoch": 0.23455530889382212, + "grad_norm": 0.2810640375570699, + "learning_rate": 9.80173941290378e-06, + "loss": 0.2752, + "step": 2932 + }, + { + "epoch": 0.23463530729385412, + "grad_norm": 0.35029874005477063, + "learning_rate": 9.801558776704315e-06, + "loss": 0.2891, + "step": 2933 + }, + { + "epoch": 0.23471530569388613, + "grad_norm": 0.3302681592387155, + "learning_rate": 9.801378059919049e-06, + "loss": 0.3143, + "step": 2934 + }, + { + "epoch": 0.23479530409391813, + "grad_norm": 0.32902754013508334, + "learning_rate": 9.801197262551019e-06, + "loss": 0.2649, + "step": 2935 + }, + { + "epoch": 0.2348753024939501, + "grad_norm": 0.30143525366193513, + "learning_rate": 9.801016384603259e-06, + "loss": 0.3076, + "step": 2936 + }, + { + "epoch": 0.23495530089398212, + "grad_norm": 0.2950286435659639, + "learning_rate": 9.800835426078804e-06, + "loss": 0.2864, + "step": 2937 + }, + { + "epoch": 0.23503529929401412, + "grad_norm": 0.2886197604190246, + "learning_rate": 9.800654386980692e-06, + "loss": 0.3057, + "step": 2938 + }, + { + "epoch": 0.23511529769404613, + "grad_norm": 0.3349307097080484, + "learning_rate": 9.800473267311962e-06, + "loss": 0.3086, + "step": 2939 + }, + { + "epoch": 0.23519529609407813, + "grad_norm": 0.30458485290954795, + "learning_rate": 9.800292067075651e-06, + "loss": 0.2875, + "step": 2940 + }, + { + "epoch": 0.2352752944941101, + "grad_norm": 0.2657456449217982, + "learning_rate": 9.800110786274803e-06, + "loss": 0.2761, + "step": 2941 + }, + { + "epoch": 0.23535529289414212, + "grad_norm": 0.2844425767048757, + "learning_rate": 9.79992942491246e-06, + "loss": 0.297, + "step": 2942 + }, + { + "epoch": 0.23543529129417412, + "grad_norm": 0.29706764419014725, + "learning_rate": 9.799747982991665e-06, + "loss": 0.2966, + "step": 2943 + }, + { + "epoch": 0.23551528969420613, + "grad_norm": 0.2981523676796156, + "learning_rate": 9.799566460515464e-06, + "loss": 0.294, + "step": 2944 + }, + { + "epoch": 0.2355952880942381, + "grad_norm": 0.31984706794791723, + "learning_rate": 9.799384857486902e-06, + "loss": 0.2666, + "step": 2945 + }, + { + "epoch": 0.2356752864942701, + "grad_norm": 0.28757138061950765, + "learning_rate": 9.799203173909028e-06, + "loss": 0.3494, + "step": 2946 + }, + { + "epoch": 0.23575528489430211, + "grad_norm": 0.3463662811425225, + "learning_rate": 9.799021409784892e-06, + "loss": 0.2529, + "step": 2947 + }, + { + "epoch": 0.23583528329433412, + "grad_norm": 0.30171384676569757, + "learning_rate": 9.798839565117541e-06, + "loss": 0.3091, + "step": 2948 + }, + { + "epoch": 0.23591528169436612, + "grad_norm": 0.2497302586569846, + "learning_rate": 9.798657639910033e-06, + "loss": 0.3062, + "step": 2949 + }, + { + "epoch": 0.2359952800943981, + "grad_norm": 0.3388699331241011, + "learning_rate": 9.798475634165417e-06, + "loss": 0.2662, + "step": 2950 + }, + { + "epoch": 0.2360752784944301, + "grad_norm": 0.35006879274503844, + "learning_rate": 9.798293547886748e-06, + "loss": 0.2703, + "step": 2951 + }, + { + "epoch": 0.2361552768944621, + "grad_norm": 0.22004554728366202, + "learning_rate": 9.798111381077082e-06, + "loss": 0.3565, + "step": 2952 + }, + { + "epoch": 0.23623527529449412, + "grad_norm": 0.3110676567945162, + "learning_rate": 9.79792913373948e-06, + "loss": 0.2705, + "step": 2953 + }, + { + "epoch": 0.2363152736945261, + "grad_norm": 0.30986703687485045, + "learning_rate": 9.797746805876996e-06, + "loss": 0.2872, + "step": 2954 + }, + { + "epoch": 0.2363952720945581, + "grad_norm": 1.1966868998010027, + "learning_rate": 9.79756439749269e-06, + "loss": 0.2791, + "step": 2955 + }, + { + "epoch": 0.2364752704945901, + "grad_norm": 0.30969822109107553, + "learning_rate": 9.797381908589627e-06, + "loss": 0.2872, + "step": 2956 + }, + { + "epoch": 0.2365552688946221, + "grad_norm": 0.25544114438781207, + "learning_rate": 9.797199339170866e-06, + "loss": 0.285, + "step": 2957 + }, + { + "epoch": 0.23663526729465412, + "grad_norm": 0.3038233583629293, + "learning_rate": 9.797016689239476e-06, + "loss": 0.2793, + "step": 2958 + }, + { + "epoch": 0.2367152656946861, + "grad_norm": 0.25458814791769385, + "learning_rate": 9.796833958798517e-06, + "loss": 0.3116, + "step": 2959 + }, + { + "epoch": 0.2367952640947181, + "grad_norm": 0.35899247760306424, + "learning_rate": 9.79665114785106e-06, + "loss": 0.2738, + "step": 2960 + }, + { + "epoch": 0.2368752624947501, + "grad_norm": 0.28606400914018576, + "learning_rate": 9.79646825640017e-06, + "loss": 0.2915, + "step": 2961 + }, + { + "epoch": 0.2369552608947821, + "grad_norm": 0.31552405571307507, + "learning_rate": 9.796285284448919e-06, + "loss": 0.2617, + "step": 2962 + }, + { + "epoch": 0.23703525929481412, + "grad_norm": 0.32663266804221264, + "learning_rate": 9.796102232000378e-06, + "loss": 0.2849, + "step": 2963 + }, + { + "epoch": 0.2371152576948461, + "grad_norm": 0.21896067443946426, + "learning_rate": 9.795919099057616e-06, + "loss": 0.3361, + "step": 2964 + }, + { + "epoch": 0.2371952560948781, + "grad_norm": 0.3017545556149471, + "learning_rate": 9.795735885623708e-06, + "loss": 0.2697, + "step": 2965 + }, + { + "epoch": 0.2372752544949101, + "grad_norm": 0.3049294047843202, + "learning_rate": 9.795552591701732e-06, + "loss": 0.3069, + "step": 2966 + }, + { + "epoch": 0.2373552528949421, + "grad_norm": 0.31199509519568996, + "learning_rate": 9.795369217294759e-06, + "loss": 0.2783, + "step": 2967 + }, + { + "epoch": 0.2374352512949741, + "grad_norm": 0.31991986375481757, + "learning_rate": 9.795185762405872e-06, + "loss": 0.2711, + "step": 2968 + }, + { + "epoch": 0.2375152496950061, + "grad_norm": 0.2548193904334765, + "learning_rate": 9.795002227038146e-06, + "loss": 0.3432, + "step": 2969 + }, + { + "epoch": 0.2375952480950381, + "grad_norm": 0.31783220837296405, + "learning_rate": 9.794818611194662e-06, + "loss": 0.2831, + "step": 2970 + }, + { + "epoch": 0.2376752464950701, + "grad_norm": 0.30272512027014703, + "learning_rate": 9.794634914878505e-06, + "loss": 0.2848, + "step": 2971 + }, + { + "epoch": 0.2377552448951021, + "grad_norm": 0.30395645540534616, + "learning_rate": 9.794451138092754e-06, + "loss": 0.3036, + "step": 2972 + }, + { + "epoch": 0.23783524329513409, + "grad_norm": 0.23673877372734747, + "learning_rate": 9.794267280840494e-06, + "loss": 0.3336, + "step": 2973 + }, + { + "epoch": 0.2379152416951661, + "grad_norm": 0.2708686071736194, + "learning_rate": 9.794083343124812e-06, + "loss": 0.2826, + "step": 2974 + }, + { + "epoch": 0.2379952400951981, + "grad_norm": 0.31852771532366536, + "learning_rate": 9.793899324948795e-06, + "loss": 0.2695, + "step": 2975 + }, + { + "epoch": 0.2380752384952301, + "grad_norm": 0.3140648590913954, + "learning_rate": 9.79371522631553e-06, + "loss": 0.2666, + "step": 2976 + }, + { + "epoch": 0.2381552368952621, + "grad_norm": 0.29611521793134804, + "learning_rate": 9.79353104722811e-06, + "loss": 0.304, + "step": 2977 + }, + { + "epoch": 0.23823523529529408, + "grad_norm": 0.24080792863610254, + "learning_rate": 9.793346787689622e-06, + "loss": 0.3249, + "step": 2978 + }, + { + "epoch": 0.2383152336953261, + "grad_norm": 0.33059480454093304, + "learning_rate": 9.793162447703161e-06, + "loss": 0.2835, + "step": 2979 + }, + { + "epoch": 0.2383952320953581, + "grad_norm": 0.3051598330739392, + "learning_rate": 9.79297802727182e-06, + "loss": 0.2961, + "step": 2980 + }, + { + "epoch": 0.2384752304953901, + "grad_norm": 0.2836506287854809, + "learning_rate": 9.792793526398694e-06, + "loss": 0.3039, + "step": 2981 + }, + { + "epoch": 0.23855522889542208, + "grad_norm": 0.3107079789153107, + "learning_rate": 9.79260894508688e-06, + "loss": 0.2543, + "step": 2982 + }, + { + "epoch": 0.23863522729545408, + "grad_norm": 0.20184273802156774, + "learning_rate": 9.792424283339477e-06, + "loss": 0.4019, + "step": 2983 + }, + { + "epoch": 0.2387152256954861, + "grad_norm": 0.28231648378272167, + "learning_rate": 9.792239541159581e-06, + "loss": 0.2985, + "step": 2984 + }, + { + "epoch": 0.2387952240955181, + "grad_norm": 0.2307841533136294, + "learning_rate": 9.792054718550297e-06, + "loss": 0.3023, + "step": 2985 + }, + { + "epoch": 0.2388752224955501, + "grad_norm": 0.3357385871036078, + "learning_rate": 9.791869815514723e-06, + "loss": 0.2655, + "step": 2986 + }, + { + "epoch": 0.23895522089558208, + "grad_norm": 0.3064048846323717, + "learning_rate": 9.791684832055962e-06, + "loss": 0.2584, + "step": 2987 + }, + { + "epoch": 0.23903521929561408, + "grad_norm": 0.32205256850572594, + "learning_rate": 9.791499768177124e-06, + "loss": 0.2625, + "step": 2988 + }, + { + "epoch": 0.2391152176956461, + "grad_norm": 0.33410193594859283, + "learning_rate": 9.79131462388131e-06, + "loss": 0.271, + "step": 2989 + }, + { + "epoch": 0.2391952160956781, + "grad_norm": 0.3341786081080264, + "learning_rate": 9.791129399171628e-06, + "loss": 0.2865, + "step": 2990 + }, + { + "epoch": 0.2392752144957101, + "grad_norm": 0.2749110056215002, + "learning_rate": 9.790944094051188e-06, + "loss": 0.3228, + "step": 2991 + }, + { + "epoch": 0.23935521289574208, + "grad_norm": 0.2951214848887231, + "learning_rate": 9.790758708523099e-06, + "loss": 0.2837, + "step": 2992 + }, + { + "epoch": 0.23943521129577408, + "grad_norm": 0.3018272512432192, + "learning_rate": 9.790573242590473e-06, + "loss": 0.2504, + "step": 2993 + }, + { + "epoch": 0.23951520969580609, + "grad_norm": 0.32430565204467254, + "learning_rate": 9.790387696256422e-06, + "loss": 0.2769, + "step": 2994 + }, + { + "epoch": 0.2395952080958381, + "grad_norm": 0.2864657586127316, + "learning_rate": 9.790202069524061e-06, + "loss": 0.2542, + "step": 2995 + }, + { + "epoch": 0.23967520649587007, + "grad_norm": 0.34528215843029125, + "learning_rate": 9.790016362396506e-06, + "loss": 0.2747, + "step": 2996 + }, + { + "epoch": 0.23975520489590207, + "grad_norm": 0.25159441423707923, + "learning_rate": 9.789830574876873e-06, + "loss": 0.325, + "step": 2997 + }, + { + "epoch": 0.23983520329593408, + "grad_norm": 0.3315768244227891, + "learning_rate": 9.789644706968278e-06, + "loss": 0.3028, + "step": 2998 + }, + { + "epoch": 0.23991520169596608, + "grad_norm": 0.24223165137641967, + "learning_rate": 9.789458758673843e-06, + "loss": 0.3371, + "step": 2999 + }, + { + "epoch": 0.2399952000959981, + "grad_norm": 0.3172154648504064, + "learning_rate": 9.789272729996689e-06, + "loss": 0.2871, + "step": 3000 + }, + { + "epoch": 0.24007519849603007, + "grad_norm": 0.3403037358604376, + "learning_rate": 9.789086620939936e-06, + "loss": 0.2482, + "step": 3001 + }, + { + "epoch": 0.24015519689606207, + "grad_norm": 0.30351299694534395, + "learning_rate": 9.788900431506709e-06, + "loss": 0.3034, + "step": 3002 + }, + { + "epoch": 0.24023519529609408, + "grad_norm": 0.3458478226329017, + "learning_rate": 9.788714161700135e-06, + "loss": 0.2635, + "step": 3003 + }, + { + "epoch": 0.24031519369612608, + "grad_norm": 0.2923109492954135, + "learning_rate": 9.788527811523336e-06, + "loss": 0.3079, + "step": 3004 + }, + { + "epoch": 0.2403951920961581, + "grad_norm": 0.27359555796958407, + "learning_rate": 9.78834138097944e-06, + "loss": 0.3044, + "step": 3005 + }, + { + "epoch": 0.24047519049619007, + "grad_norm": 0.2690690877949422, + "learning_rate": 9.78815487007158e-06, + "loss": 0.2843, + "step": 3006 + }, + { + "epoch": 0.24055518889622207, + "grad_norm": 0.33983420453492813, + "learning_rate": 9.787968278802883e-06, + "loss": 0.2785, + "step": 3007 + }, + { + "epoch": 0.24063518729625408, + "grad_norm": 0.3268249113703334, + "learning_rate": 9.78778160717648e-06, + "loss": 0.266, + "step": 3008 + }, + { + "epoch": 0.24071518569628608, + "grad_norm": 0.2693428819884109, + "learning_rate": 9.787594855195509e-06, + "loss": 0.2864, + "step": 3009 + }, + { + "epoch": 0.24079518409631806, + "grad_norm": 0.26663551116209877, + "learning_rate": 9.787408022863097e-06, + "loss": 0.3052, + "step": 3010 + }, + { + "epoch": 0.24087518249635007, + "grad_norm": 0.266101868160398, + "learning_rate": 9.787221110182384e-06, + "loss": 0.3409, + "step": 3011 + }, + { + "epoch": 0.24095518089638207, + "grad_norm": 0.3181706829361852, + "learning_rate": 9.787034117156506e-06, + "loss": 0.2856, + "step": 3012 + }, + { + "epoch": 0.24103517929641408, + "grad_norm": 0.499112122181375, + "learning_rate": 9.786847043788601e-06, + "loss": 0.2661, + "step": 3013 + }, + { + "epoch": 0.24111517769644608, + "grad_norm": 0.29313108650088443, + "learning_rate": 9.786659890081811e-06, + "loss": 0.2891, + "step": 3014 + }, + { + "epoch": 0.24119517609647806, + "grad_norm": 0.31731566370819686, + "learning_rate": 9.786472656039275e-06, + "loss": 0.2645, + "step": 3015 + }, + { + "epoch": 0.24127517449651006, + "grad_norm": 0.2880820944356039, + "learning_rate": 9.786285341664135e-06, + "loss": 0.295, + "step": 3016 + }, + { + "epoch": 0.24135517289654207, + "grad_norm": 0.3167003258936168, + "learning_rate": 9.786097946959534e-06, + "loss": 0.2834, + "step": 3017 + }, + { + "epoch": 0.24143517129657407, + "grad_norm": 0.2528755654853122, + "learning_rate": 9.785910471928621e-06, + "loss": 0.3217, + "step": 3018 + }, + { + "epoch": 0.24151516969660608, + "grad_norm": 0.3332560152387603, + "learning_rate": 9.785722916574539e-06, + "loss": 0.2646, + "step": 3019 + }, + { + "epoch": 0.24159516809663806, + "grad_norm": 0.25156173656243175, + "learning_rate": 9.785535280900437e-06, + "loss": 0.296, + "step": 3020 + }, + { + "epoch": 0.24167516649667006, + "grad_norm": 0.31428653694755915, + "learning_rate": 9.785347564909464e-06, + "loss": 0.2635, + "step": 3021 + }, + { + "epoch": 0.24175516489670207, + "grad_norm": 0.4887764792099205, + "learning_rate": 9.78515976860477e-06, + "loss": 0.2902, + "step": 3022 + }, + { + "epoch": 0.24183516329673407, + "grad_norm": 0.2825174452197186, + "learning_rate": 9.784971891989508e-06, + "loss": 0.3103, + "step": 3023 + }, + { + "epoch": 0.24191516169676605, + "grad_norm": 0.23495952913436058, + "learning_rate": 9.784783935066828e-06, + "loss": 0.316, + "step": 3024 + }, + { + "epoch": 0.24199516009679806, + "grad_norm": 0.3302554543255552, + "learning_rate": 9.78459589783989e-06, + "loss": 0.2579, + "step": 3025 + }, + { + "epoch": 0.24207515849683006, + "grad_norm": 0.36530593477874856, + "learning_rate": 9.784407780311845e-06, + "loss": 0.286, + "step": 3026 + }, + { + "epoch": 0.24215515689686207, + "grad_norm": 0.31424262672399134, + "learning_rate": 9.784219582485853e-06, + "loss": 0.3096, + "step": 3027 + }, + { + "epoch": 0.24223515529689407, + "grad_norm": 0.30604335877420313, + "learning_rate": 9.784031304365072e-06, + "loss": 0.2733, + "step": 3028 + }, + { + "epoch": 0.24231515369692605, + "grad_norm": 0.29165470388813486, + "learning_rate": 9.78384294595266e-06, + "loss": 0.311, + "step": 3029 + }, + { + "epoch": 0.24239515209695806, + "grad_norm": 0.3141428916212566, + "learning_rate": 9.78365450725178e-06, + "loss": 0.2791, + "step": 3030 + }, + { + "epoch": 0.24247515049699006, + "grad_norm": 0.3513446508226203, + "learning_rate": 9.783465988265594e-06, + "loss": 0.305, + "step": 3031 + }, + { + "epoch": 0.24255514889702207, + "grad_norm": 0.3128965968597101, + "learning_rate": 9.78327738899727e-06, + "loss": 0.2785, + "step": 3032 + }, + { + "epoch": 0.24263514729705407, + "grad_norm": 0.32659013609709037, + "learning_rate": 9.783088709449967e-06, + "loss": 0.2581, + "step": 3033 + }, + { + "epoch": 0.24271514569708605, + "grad_norm": 0.29780028953274873, + "learning_rate": 9.782899949626853e-06, + "loss": 0.2735, + "step": 3034 + }, + { + "epoch": 0.24279514409711805, + "grad_norm": 0.32911214260937544, + "learning_rate": 9.7827111095311e-06, + "loss": 0.2645, + "step": 3035 + }, + { + "epoch": 0.24287514249715006, + "grad_norm": 0.3058683177461703, + "learning_rate": 9.782522189165873e-06, + "loss": 0.2599, + "step": 3036 + }, + { + "epoch": 0.24295514089718206, + "grad_norm": 0.3018559923800518, + "learning_rate": 9.782333188534345e-06, + "loss": 0.3163, + "step": 3037 + }, + { + "epoch": 0.24303513929721404, + "grad_norm": 0.35130398851790273, + "learning_rate": 9.782144107639686e-06, + "loss": 0.275, + "step": 3038 + }, + { + "epoch": 0.24311513769724605, + "grad_norm": 0.2753139169580173, + "learning_rate": 9.781954946485072e-06, + "loss": 0.3022, + "step": 3039 + }, + { + "epoch": 0.24319513609727805, + "grad_norm": 0.28902516909497084, + "learning_rate": 9.781765705073679e-06, + "loss": 0.3018, + "step": 3040 + }, + { + "epoch": 0.24327513449731006, + "grad_norm": 0.2702473997635275, + "learning_rate": 9.781576383408678e-06, + "loss": 0.3008, + "step": 3041 + }, + { + "epoch": 0.24335513289734206, + "grad_norm": 0.31976478220407334, + "learning_rate": 9.781386981493249e-06, + "loss": 0.263, + "step": 3042 + }, + { + "epoch": 0.24343513129737404, + "grad_norm": 0.2918703750343495, + "learning_rate": 9.781197499330572e-06, + "loss": 0.2855, + "step": 3043 + }, + { + "epoch": 0.24351512969740605, + "grad_norm": 0.31182659920799544, + "learning_rate": 9.781007936923825e-06, + "loss": 0.2518, + "step": 3044 + }, + { + "epoch": 0.24359512809743805, + "grad_norm": 0.30301870677231585, + "learning_rate": 9.78081829427619e-06, + "loss": 0.2803, + "step": 3045 + }, + { + "epoch": 0.24367512649747006, + "grad_norm": 0.3206395881246978, + "learning_rate": 9.780628571390853e-06, + "loss": 0.2854, + "step": 3046 + }, + { + "epoch": 0.24375512489750206, + "grad_norm": 0.23894183492628182, + "learning_rate": 9.780438768270992e-06, + "loss": 0.3213, + "step": 3047 + }, + { + "epoch": 0.24383512329753404, + "grad_norm": 0.31123147441317217, + "learning_rate": 9.780248884919799e-06, + "loss": 0.2691, + "step": 3048 + }, + { + "epoch": 0.24391512169756605, + "grad_norm": 0.3258184100803723, + "learning_rate": 9.780058921340456e-06, + "loss": 0.266, + "step": 3049 + }, + { + "epoch": 0.24399512009759805, + "grad_norm": 0.2490481856550722, + "learning_rate": 9.779868877536154e-06, + "loss": 0.3271, + "step": 3050 + }, + { + "epoch": 0.24407511849763006, + "grad_norm": 0.29501399334033884, + "learning_rate": 9.779678753510082e-06, + "loss": 0.2978, + "step": 3051 + }, + { + "epoch": 0.24415511689766203, + "grad_norm": 0.3293627338212833, + "learning_rate": 9.779488549265429e-06, + "loss": 0.2536, + "step": 3052 + }, + { + "epoch": 0.24423511529769404, + "grad_norm": 0.3163076790876164, + "learning_rate": 9.77929826480539e-06, + "loss": 0.2977, + "step": 3053 + }, + { + "epoch": 0.24431511369772604, + "grad_norm": 0.5043741272360033, + "learning_rate": 9.779107900133157e-06, + "loss": 0.2996, + "step": 3054 + }, + { + "epoch": 0.24439511209775805, + "grad_norm": 0.3679864982254292, + "learning_rate": 9.778917455251924e-06, + "loss": 0.2782, + "step": 3055 + }, + { + "epoch": 0.24447511049779005, + "grad_norm": 0.28931736767332183, + "learning_rate": 9.77872693016489e-06, + "loss": 0.3284, + "step": 3056 + }, + { + "epoch": 0.24455510889782203, + "grad_norm": 0.23200463723555667, + "learning_rate": 9.778536324875252e-06, + "loss": 0.2971, + "step": 3057 + }, + { + "epoch": 0.24463510729785404, + "grad_norm": 0.2729819359106997, + "learning_rate": 9.778345639386206e-06, + "loss": 0.3195, + "step": 3058 + }, + { + "epoch": 0.24471510569788604, + "grad_norm": 0.33404891863436337, + "learning_rate": 9.778154873700956e-06, + "loss": 0.2621, + "step": 3059 + }, + { + "epoch": 0.24479510409791805, + "grad_norm": 0.32870271104589577, + "learning_rate": 9.777964027822701e-06, + "loss": 0.3116, + "step": 3060 + }, + { + "epoch": 0.24487510249795005, + "grad_norm": 0.22519185142925494, + "learning_rate": 9.777773101754648e-06, + "loss": 0.3703, + "step": 3061 + }, + { + "epoch": 0.24495510089798203, + "grad_norm": 0.3365647932265385, + "learning_rate": 9.777582095499995e-06, + "loss": 0.2793, + "step": 3062 + }, + { + "epoch": 0.24503509929801404, + "grad_norm": 0.30441162904786606, + "learning_rate": 9.777391009061954e-06, + "loss": 0.2828, + "step": 3063 + }, + { + "epoch": 0.24511509769804604, + "grad_norm": 0.28644365705133507, + "learning_rate": 9.777199842443729e-06, + "loss": 0.2879, + "step": 3064 + }, + { + "epoch": 0.24519509609807805, + "grad_norm": 0.2966228841810259, + "learning_rate": 9.777008595648527e-06, + "loss": 0.3145, + "step": 3065 + }, + { + "epoch": 0.24527509449811002, + "grad_norm": 0.27027226974388907, + "learning_rate": 9.776817268679562e-06, + "loss": 0.335, + "step": 3066 + }, + { + "epoch": 0.24535509289814203, + "grad_norm": 0.2296889240172845, + "learning_rate": 9.77662586154004e-06, + "loss": 0.3139, + "step": 3067 + }, + { + "epoch": 0.24543509129817404, + "grad_norm": 0.255096222810185, + "learning_rate": 9.776434374233178e-06, + "loss": 0.331, + "step": 3068 + }, + { + "epoch": 0.24551508969820604, + "grad_norm": 0.4319542245186691, + "learning_rate": 9.776242806762187e-06, + "loss": 0.3438, + "step": 3069 + }, + { + "epoch": 0.24559508809823805, + "grad_norm": 0.3413211685378461, + "learning_rate": 9.776051159130283e-06, + "loss": 0.2741, + "step": 3070 + }, + { + "epoch": 0.24567508649827002, + "grad_norm": 0.27080491474753127, + "learning_rate": 9.775859431340681e-06, + "loss": 0.293, + "step": 3071 + }, + { + "epoch": 0.24575508489830203, + "grad_norm": 0.33970203133520444, + "learning_rate": 9.775667623396601e-06, + "loss": 0.2721, + "step": 3072 + }, + { + "epoch": 0.24583508329833403, + "grad_norm": 0.2724470607668596, + "learning_rate": 9.775475735301261e-06, + "loss": 0.3463, + "step": 3073 + }, + { + "epoch": 0.24591508169836604, + "grad_norm": 0.30747640971273876, + "learning_rate": 9.775283767057883e-06, + "loss": 0.2898, + "step": 3074 + }, + { + "epoch": 0.24599508009839804, + "grad_norm": 0.29194169430773564, + "learning_rate": 9.775091718669688e-06, + "loss": 0.2555, + "step": 3075 + }, + { + "epoch": 0.24607507849843002, + "grad_norm": 0.316860011792518, + "learning_rate": 9.774899590139897e-06, + "loss": 0.2879, + "step": 3076 + }, + { + "epoch": 0.24615507689846203, + "grad_norm": 0.2995507920827911, + "learning_rate": 9.774707381471737e-06, + "loss": 0.2816, + "step": 3077 + }, + { + "epoch": 0.24623507529849403, + "grad_norm": 0.25499531005472453, + "learning_rate": 9.774515092668435e-06, + "loss": 0.3451, + "step": 3078 + }, + { + "epoch": 0.24631507369852604, + "grad_norm": 0.29942815615249313, + "learning_rate": 9.774322723733216e-06, + "loss": 0.2998, + "step": 3079 + }, + { + "epoch": 0.24639507209855802, + "grad_norm": 0.27342280409800107, + "learning_rate": 9.774130274669309e-06, + "loss": 0.3112, + "step": 3080 + }, + { + "epoch": 0.24647507049859002, + "grad_norm": 0.31731505339244853, + "learning_rate": 9.773937745479942e-06, + "loss": 0.2805, + "step": 3081 + }, + { + "epoch": 0.24655506889862203, + "grad_norm": 0.3123969939483814, + "learning_rate": 9.773745136168352e-06, + "loss": 0.3003, + "step": 3082 + }, + { + "epoch": 0.24663506729865403, + "grad_norm": 0.26952883835143066, + "learning_rate": 9.773552446737764e-06, + "loss": 0.2724, + "step": 3083 + }, + { + "epoch": 0.24671506569868604, + "grad_norm": 0.26326345472443974, + "learning_rate": 9.773359677191418e-06, + "loss": 0.3075, + "step": 3084 + }, + { + "epoch": 0.24679506409871801, + "grad_norm": 0.34024679826568205, + "learning_rate": 9.773166827532548e-06, + "loss": 0.308, + "step": 3085 + }, + { + "epoch": 0.24687506249875002, + "grad_norm": 0.32871923333498027, + "learning_rate": 9.772973897764389e-06, + "loss": 0.2628, + "step": 3086 + }, + { + "epoch": 0.24695506089878203, + "grad_norm": 0.26587052116698695, + "learning_rate": 9.77278088789018e-06, + "loss": 0.3303, + "step": 3087 + }, + { + "epoch": 0.24703505929881403, + "grad_norm": 0.3144201460443083, + "learning_rate": 9.77258779791316e-06, + "loss": 0.2939, + "step": 3088 + }, + { + "epoch": 0.24711505769884604, + "grad_norm": 0.23891789720261453, + "learning_rate": 9.77239462783657e-06, + "loss": 0.3162, + "step": 3089 + }, + { + "epoch": 0.247195056098878, + "grad_norm": 0.34417679590389627, + "learning_rate": 9.77220137766365e-06, + "loss": 0.2922, + "step": 3090 + }, + { + "epoch": 0.24727505449891002, + "grad_norm": 0.29328897436891405, + "learning_rate": 9.772008047397647e-06, + "loss": 0.3097, + "step": 3091 + }, + { + "epoch": 0.24735505289894202, + "grad_norm": 0.2446005038709564, + "learning_rate": 9.771814637041806e-06, + "loss": 0.3411, + "step": 3092 + }, + { + "epoch": 0.24743505129897403, + "grad_norm": 0.3517788268670302, + "learning_rate": 9.771621146599369e-06, + "loss": 0.2545, + "step": 3093 + }, + { + "epoch": 0.247515049699006, + "grad_norm": 0.31954605551797144, + "learning_rate": 9.771427576073584e-06, + "loss": 0.2497, + "step": 3094 + }, + { + "epoch": 0.247595048099038, + "grad_norm": 0.38167982554778185, + "learning_rate": 9.771233925467703e-06, + "loss": 0.2772, + "step": 3095 + }, + { + "epoch": 0.24767504649907002, + "grad_norm": 0.3255399169571891, + "learning_rate": 9.771040194784973e-06, + "loss": 0.2562, + "step": 3096 + }, + { + "epoch": 0.24775504489910202, + "grad_norm": 0.28286986479732285, + "learning_rate": 9.770846384028647e-06, + "loss": 0.3102, + "step": 3097 + }, + { + "epoch": 0.24783504329913403, + "grad_norm": 0.3612863164605747, + "learning_rate": 9.770652493201977e-06, + "loss": 0.2943, + "step": 3098 + }, + { + "epoch": 0.247915041699166, + "grad_norm": 0.3168649848502705, + "learning_rate": 9.77045852230822e-06, + "loss": 0.2698, + "step": 3099 + }, + { + "epoch": 0.247995040099198, + "grad_norm": 0.32849858658128883, + "learning_rate": 9.770264471350628e-06, + "loss": 0.2953, + "step": 3100 + }, + { + "epoch": 0.24807503849923002, + "grad_norm": 0.31385057119707854, + "learning_rate": 9.770070340332457e-06, + "loss": 0.298, + "step": 3101 + }, + { + "epoch": 0.24815503689926202, + "grad_norm": 0.34516009215340687, + "learning_rate": 9.769876129256969e-06, + "loss": 0.263, + "step": 3102 + }, + { + "epoch": 0.24823503529929403, + "grad_norm": 0.44322728229939024, + "learning_rate": 9.769681838127421e-06, + "loss": 0.2522, + "step": 3103 + }, + { + "epoch": 0.248315033699326, + "grad_norm": 0.334640302637303, + "learning_rate": 9.769487466947075e-06, + "loss": 0.2378, + "step": 3104 + }, + { + "epoch": 0.248395032099358, + "grad_norm": 0.2375613673412739, + "learning_rate": 9.76929301571919e-06, + "loss": 0.3302, + "step": 3105 + }, + { + "epoch": 0.24847503049939001, + "grad_norm": 0.3218906643486621, + "learning_rate": 9.769098484447034e-06, + "loss": 0.3145, + "step": 3106 + }, + { + "epoch": 0.24855502889942202, + "grad_norm": 0.23786766950075333, + "learning_rate": 9.76890387313387e-06, + "loss": 0.3452, + "step": 3107 + }, + { + "epoch": 0.248635027299454, + "grad_norm": 0.24578248063222669, + "learning_rate": 9.768709181782962e-06, + "loss": 0.3301, + "step": 3108 + }, + { + "epoch": 0.248715025699486, + "grad_norm": 0.3247073109242354, + "learning_rate": 9.768514410397583e-06, + "loss": 0.276, + "step": 3109 + }, + { + "epoch": 0.248795024099518, + "grad_norm": 0.3091310869026541, + "learning_rate": 9.768319558980997e-06, + "loss": 0.2529, + "step": 3110 + }, + { + "epoch": 0.24887502249955, + "grad_norm": 0.7231704557010377, + "learning_rate": 9.768124627536474e-06, + "loss": 0.2712, + "step": 3111 + }, + { + "epoch": 0.24895502089958202, + "grad_norm": 0.2757231559350152, + "learning_rate": 9.767929616067289e-06, + "loss": 0.2942, + "step": 3112 + }, + { + "epoch": 0.249035019299614, + "grad_norm": 0.2502254882635002, + "learning_rate": 9.767734524576714e-06, + "loss": 0.322, + "step": 3113 + }, + { + "epoch": 0.249115017699646, + "grad_norm": 0.30291739176643107, + "learning_rate": 9.767539353068021e-06, + "loss": 0.2513, + "step": 3114 + }, + { + "epoch": 0.249195016099678, + "grad_norm": 0.32701353695295515, + "learning_rate": 9.767344101544489e-06, + "loss": 0.2941, + "step": 3115 + }, + { + "epoch": 0.24927501449971, + "grad_norm": 0.2791713759134054, + "learning_rate": 9.767148770009393e-06, + "loss": 0.2808, + "step": 3116 + }, + { + "epoch": 0.24935501289974202, + "grad_norm": 0.5971855118611495, + "learning_rate": 9.76695335846601e-06, + "loss": 0.2939, + "step": 3117 + }, + { + "epoch": 0.249435011299774, + "grad_norm": 0.2754846734676285, + "learning_rate": 9.766757866917622e-06, + "loss": 0.3219, + "step": 3118 + }, + { + "epoch": 0.249515009699806, + "grad_norm": 0.29331806050667314, + "learning_rate": 9.76656229536751e-06, + "loss": 0.2953, + "step": 3119 + }, + { + "epoch": 0.249595008099838, + "grad_norm": 0.26598381966268686, + "learning_rate": 9.766366643818954e-06, + "loss": 0.3029, + "step": 3120 + }, + { + "epoch": 0.24967500649987, + "grad_norm": 0.2325438730593487, + "learning_rate": 9.76617091227524e-06, + "loss": 0.3282, + "step": 3121 + }, + { + "epoch": 0.249755004899902, + "grad_norm": 0.3201023287744438, + "learning_rate": 9.76597510073965e-06, + "loss": 0.2714, + "step": 3122 + }, + { + "epoch": 0.249835003299934, + "grad_norm": 0.31257830277634696, + "learning_rate": 9.765779209215474e-06, + "loss": 0.2732, + "step": 3123 + }, + { + "epoch": 0.249915001699966, + "grad_norm": 0.48566379791154374, + "learning_rate": 9.765583237705999e-06, + "loss": 0.2648, + "step": 3124 + }, + { + "epoch": 0.249995000099998, + "grad_norm": 0.32847257438351457, + "learning_rate": 9.765387186214512e-06, + "loss": 0.3163, + "step": 3125 + }, + { + "epoch": 0.25007499850003, + "grad_norm": 0.35912689079795534, + "learning_rate": 9.765191054744305e-06, + "loss": 0.3195, + "step": 3126 + }, + { + "epoch": 0.250154996900062, + "grad_norm": 0.30305233523238356, + "learning_rate": 9.76499484329867e-06, + "loss": 0.3107, + "step": 3127 + }, + { + "epoch": 0.250234995300094, + "grad_norm": 0.29303062304337557, + "learning_rate": 9.764798551880898e-06, + "loss": 0.2854, + "step": 3128 + }, + { + "epoch": 0.250314993700126, + "grad_norm": 0.2791212878065816, + "learning_rate": 9.764602180494285e-06, + "loss": 0.3055, + "step": 3129 + }, + { + "epoch": 0.250394992100158, + "grad_norm": 0.2685813521171975, + "learning_rate": 9.764405729142129e-06, + "loss": 0.3099, + "step": 3130 + }, + { + "epoch": 0.25047499050019, + "grad_norm": 0.31286079404996653, + "learning_rate": 9.764209197827721e-06, + "loss": 0.2606, + "step": 3131 + }, + { + "epoch": 0.250554988900222, + "grad_norm": 0.2919168005108195, + "learning_rate": 9.764012586554364e-06, + "loss": 0.2934, + "step": 3132 + }, + { + "epoch": 0.250634987300254, + "grad_norm": 0.31864235475792807, + "learning_rate": 9.763815895325357e-06, + "loss": 0.2723, + "step": 3133 + }, + { + "epoch": 0.250714985700286, + "grad_norm": 0.251421311433494, + "learning_rate": 9.763619124144003e-06, + "loss": 0.3258, + "step": 3134 + }, + { + "epoch": 0.250794984100318, + "grad_norm": 0.3651064910573383, + "learning_rate": 9.7634222730136e-06, + "loss": 0.2575, + "step": 3135 + }, + { + "epoch": 0.25087498250035, + "grad_norm": 0.25729249414434596, + "learning_rate": 9.763225341937455e-06, + "loss": 0.3264, + "step": 3136 + }, + { + "epoch": 0.250954980900382, + "grad_norm": 0.30144316623451045, + "learning_rate": 9.763028330918874e-06, + "loss": 0.2436, + "step": 3137 + }, + { + "epoch": 0.251034979300414, + "grad_norm": 0.32593775625968424, + "learning_rate": 9.76283123996116e-06, + "loss": 0.266, + "step": 3138 + }, + { + "epoch": 0.251114977700446, + "grad_norm": 0.2938718733586929, + "learning_rate": 9.762634069067622e-06, + "loss": 0.3136, + "step": 3139 + }, + { + "epoch": 0.251194976100478, + "grad_norm": 0.31234139817401346, + "learning_rate": 9.76243681824157e-06, + "loss": 0.274, + "step": 3140 + }, + { + "epoch": 0.25127497450051, + "grad_norm": 0.323618036090237, + "learning_rate": 9.762239487486316e-06, + "loss": 0.2798, + "step": 3141 + }, + { + "epoch": 0.251354972900542, + "grad_norm": 0.23281698233229056, + "learning_rate": 9.762042076805169e-06, + "loss": 0.3586, + "step": 3142 + }, + { + "epoch": 0.25143497130057396, + "grad_norm": 0.2875508610252089, + "learning_rate": 9.761844586201444e-06, + "loss": 0.303, + "step": 3143 + }, + { + "epoch": 0.251514969700606, + "grad_norm": 0.32343200371070885, + "learning_rate": 9.761647015678455e-06, + "loss": 0.2952, + "step": 3144 + }, + { + "epoch": 0.251594968100638, + "grad_norm": 0.3070580428762798, + "learning_rate": 9.761449365239518e-06, + "loss": 0.2932, + "step": 3145 + }, + { + "epoch": 0.25167496650067, + "grad_norm": 0.3188035438070415, + "learning_rate": 9.761251634887949e-06, + "loss": 0.2941, + "step": 3146 + }, + { + "epoch": 0.251754964900702, + "grad_norm": 0.30375534864295345, + "learning_rate": 9.761053824627068e-06, + "loss": 0.3033, + "step": 3147 + }, + { + "epoch": 0.25183496330073396, + "grad_norm": 0.32955671063662617, + "learning_rate": 9.760855934460193e-06, + "loss": 0.2585, + "step": 3148 + }, + { + "epoch": 0.251914961700766, + "grad_norm": 0.2321404191599689, + "learning_rate": 9.76065796439065e-06, + "loss": 0.3327, + "step": 3149 + }, + { + "epoch": 0.25199496010079797, + "grad_norm": 0.29660382532939567, + "learning_rate": 9.760459914421756e-06, + "loss": 0.2441, + "step": 3150 + }, + { + "epoch": 0.25207495850083, + "grad_norm": 0.2989727475467132, + "learning_rate": 9.76026178455684e-06, + "loss": 0.297, + "step": 3151 + }, + { + "epoch": 0.252154956900862, + "grad_norm": 0.3331313336036164, + "learning_rate": 9.760063574799221e-06, + "loss": 0.2783, + "step": 3152 + }, + { + "epoch": 0.25223495530089396, + "grad_norm": 0.3543229019862621, + "learning_rate": 9.759865285152231e-06, + "loss": 0.2771, + "step": 3153 + }, + { + "epoch": 0.252314953700926, + "grad_norm": 0.2862505184288917, + "learning_rate": 9.759666915619195e-06, + "loss": 0.2839, + "step": 3154 + }, + { + "epoch": 0.25239495210095797, + "grad_norm": 0.3137654879459138, + "learning_rate": 9.759468466203444e-06, + "loss": 0.3133, + "step": 3155 + }, + { + "epoch": 0.25247495050099, + "grad_norm": 0.27556620221835976, + "learning_rate": 9.759269936908308e-06, + "loss": 0.2817, + "step": 3156 + }, + { + "epoch": 0.252554948901022, + "grad_norm": 0.2882192169145278, + "learning_rate": 9.75907132773712e-06, + "loss": 0.3015, + "step": 3157 + }, + { + "epoch": 0.25263494730105396, + "grad_norm": 0.32162320697352065, + "learning_rate": 9.75887263869321e-06, + "loss": 0.2822, + "step": 3158 + }, + { + "epoch": 0.252714945701086, + "grad_norm": 0.2931425106674063, + "learning_rate": 9.758673869779915e-06, + "loss": 0.3031, + "step": 3159 + }, + { + "epoch": 0.25279494410111797, + "grad_norm": 0.31050957134911716, + "learning_rate": 9.758475021000572e-06, + "loss": 0.2668, + "step": 3160 + }, + { + "epoch": 0.25287494250115, + "grad_norm": 0.3468256199857036, + "learning_rate": 9.758276092358518e-06, + "loss": 0.2716, + "step": 3161 + }, + { + "epoch": 0.252954940901182, + "grad_norm": 0.22159482162827437, + "learning_rate": 9.758077083857091e-06, + "loss": 0.3687, + "step": 3162 + }, + { + "epoch": 0.25303493930121396, + "grad_norm": 0.19353265301436734, + "learning_rate": 9.75787799549963e-06, + "loss": 0.3327, + "step": 3163 + }, + { + "epoch": 0.253114937701246, + "grad_norm": 0.3287110143423559, + "learning_rate": 9.757678827289476e-06, + "loss": 0.2856, + "step": 3164 + }, + { + "epoch": 0.25319493610127797, + "grad_norm": 0.37082405106310107, + "learning_rate": 9.757479579229974e-06, + "loss": 0.2537, + "step": 3165 + }, + { + "epoch": 0.25327493450131, + "grad_norm": 0.3073932256828583, + "learning_rate": 9.757280251324468e-06, + "loss": 0.2532, + "step": 3166 + }, + { + "epoch": 0.253354932901342, + "grad_norm": 0.2959037483714209, + "learning_rate": 9.757080843576301e-06, + "loss": 0.2572, + "step": 3167 + }, + { + "epoch": 0.25343493130137396, + "grad_norm": 0.35160350708915283, + "learning_rate": 9.756881355988823e-06, + "loss": 0.2728, + "step": 3168 + }, + { + "epoch": 0.253514929701406, + "grad_norm": 0.3016602684599165, + "learning_rate": 9.756681788565379e-06, + "loss": 0.2867, + "step": 3169 + }, + { + "epoch": 0.25359492810143797, + "grad_norm": 0.2884132604018904, + "learning_rate": 9.756482141309319e-06, + "loss": 0.2867, + "step": 3170 + }, + { + "epoch": 0.25367492650146994, + "grad_norm": 0.27790658732169743, + "learning_rate": 9.756282414223995e-06, + "loss": 0.3007, + "step": 3171 + }, + { + "epoch": 0.253754924901502, + "grad_norm": 0.3320424066135094, + "learning_rate": 9.756082607312756e-06, + "loss": 0.2822, + "step": 3172 + }, + { + "epoch": 0.25383492330153395, + "grad_norm": 0.28418536740923783, + "learning_rate": 9.75588272057896e-06, + "loss": 0.2905, + "step": 3173 + }, + { + "epoch": 0.253914921701566, + "grad_norm": 0.4074420406588612, + "learning_rate": 9.75568275402596e-06, + "loss": 0.2992, + "step": 3174 + }, + { + "epoch": 0.25399492010159797, + "grad_norm": 0.3231161694624928, + "learning_rate": 9.755482707657109e-06, + "loss": 0.2618, + "step": 3175 + }, + { + "epoch": 0.25407491850162994, + "grad_norm": 0.27744247795812155, + "learning_rate": 9.755282581475769e-06, + "loss": 0.2821, + "step": 3176 + }, + { + "epoch": 0.254154916901662, + "grad_norm": 0.33259579697216696, + "learning_rate": 9.755082375485296e-06, + "loss": 0.2837, + "step": 3177 + }, + { + "epoch": 0.25423491530169395, + "grad_norm": 0.3084921934587112, + "learning_rate": 9.75488208968905e-06, + "loss": 0.2683, + "step": 3178 + }, + { + "epoch": 0.254314913701726, + "grad_norm": 0.32846868113232724, + "learning_rate": 9.754681724090396e-06, + "loss": 0.2514, + "step": 3179 + }, + { + "epoch": 0.25439491210175796, + "grad_norm": 0.28290460456135014, + "learning_rate": 9.75448127869269e-06, + "loss": 0.3106, + "step": 3180 + }, + { + "epoch": 0.25447491050178994, + "grad_norm": 0.27222462079886184, + "learning_rate": 9.754280753499306e-06, + "loss": 0.3334, + "step": 3181 + }, + { + "epoch": 0.254554908901822, + "grad_norm": 0.2910273087368221, + "learning_rate": 9.7540801485136e-06, + "loss": 0.2987, + "step": 3182 + }, + { + "epoch": 0.25463490730185395, + "grad_norm": 0.28369410318268795, + "learning_rate": 9.753879463738942e-06, + "loss": 0.2972, + "step": 3183 + }, + { + "epoch": 0.254714905701886, + "grad_norm": 0.301861897644405, + "learning_rate": 9.753678699178702e-06, + "loss": 0.2635, + "step": 3184 + }, + { + "epoch": 0.25479490410191796, + "grad_norm": 0.3084697951275763, + "learning_rate": 9.753477854836248e-06, + "loss": 0.2574, + "step": 3185 + }, + { + "epoch": 0.25487490250194994, + "grad_norm": 0.2873646860488063, + "learning_rate": 9.75327693071495e-06, + "loss": 0.2897, + "step": 3186 + }, + { + "epoch": 0.254954900901982, + "grad_norm": 0.2729266506001137, + "learning_rate": 9.753075926818182e-06, + "loss": 0.2986, + "step": 3187 + }, + { + "epoch": 0.25503489930201395, + "grad_norm": 0.34359885026253295, + "learning_rate": 9.752874843149316e-06, + "loss": 0.2704, + "step": 3188 + }, + { + "epoch": 0.255114897702046, + "grad_norm": 0.29824452263794987, + "learning_rate": 9.752673679711728e-06, + "loss": 0.3029, + "step": 3189 + }, + { + "epoch": 0.25519489610207796, + "grad_norm": 0.28658816453112573, + "learning_rate": 9.752472436508794e-06, + "loss": 0.3078, + "step": 3190 + }, + { + "epoch": 0.25527489450210994, + "grad_norm": 0.2932443757340073, + "learning_rate": 9.75227111354389e-06, + "loss": 0.293, + "step": 3191 + }, + { + "epoch": 0.25535489290214197, + "grad_norm": 0.26676321530325825, + "learning_rate": 9.752069710820398e-06, + "loss": 0.3405, + "step": 3192 + }, + { + "epoch": 0.25543489130217395, + "grad_norm": 0.30716866592177, + "learning_rate": 9.751868228341695e-06, + "loss": 0.2971, + "step": 3193 + }, + { + "epoch": 0.255514889702206, + "grad_norm": 0.2667353096275026, + "learning_rate": 9.751666666111162e-06, + "loss": 0.3178, + "step": 3194 + }, + { + "epoch": 0.25559488810223796, + "grad_norm": 0.24691711895703883, + "learning_rate": 9.751465024132184e-06, + "loss": 0.322, + "step": 3195 + }, + { + "epoch": 0.25567488650226994, + "grad_norm": 0.3277837754072894, + "learning_rate": 9.751263302408146e-06, + "loss": 0.2694, + "step": 3196 + }, + { + "epoch": 0.25575488490230197, + "grad_norm": 0.28454348245061956, + "learning_rate": 9.751061500942434e-06, + "loss": 0.323, + "step": 3197 + }, + { + "epoch": 0.25583488330233395, + "grad_norm": 0.28680987886515613, + "learning_rate": 9.75085961973843e-06, + "loss": 0.2886, + "step": 3198 + }, + { + "epoch": 0.2559148817023659, + "grad_norm": 0.3176873727220522, + "learning_rate": 9.750657658799528e-06, + "loss": 0.2793, + "step": 3199 + }, + { + "epoch": 0.25599488010239796, + "grad_norm": 0.30954563210196295, + "learning_rate": 9.750455618129115e-06, + "loss": 0.2824, + "step": 3200 + }, + { + "epoch": 0.25607487850242994, + "grad_norm": 0.3185704757330435, + "learning_rate": 9.75025349773058e-06, + "loss": 0.2666, + "step": 3201 + }, + { + "epoch": 0.25615487690246197, + "grad_norm": 0.308420890862119, + "learning_rate": 9.750051297607317e-06, + "loss": 0.265, + "step": 3202 + }, + { + "epoch": 0.25623487530249395, + "grad_norm": 0.367922928666844, + "learning_rate": 9.749849017762723e-06, + "loss": 0.2583, + "step": 3203 + }, + { + "epoch": 0.2563148737025259, + "grad_norm": 0.3619544134758433, + "learning_rate": 9.749646658200187e-06, + "loss": 0.2704, + "step": 3204 + }, + { + "epoch": 0.25639487210255796, + "grad_norm": 0.29600735509099035, + "learning_rate": 9.749444218923108e-06, + "loss": 0.3101, + "step": 3205 + }, + { + "epoch": 0.25647487050258994, + "grad_norm": 0.23244678078243874, + "learning_rate": 9.749241699934883e-06, + "loss": 0.3079, + "step": 3206 + }, + { + "epoch": 0.25655486890262197, + "grad_norm": 0.2331057033800822, + "learning_rate": 9.749039101238914e-06, + "loss": 0.3264, + "step": 3207 + }, + { + "epoch": 0.25663486730265395, + "grad_norm": 0.2663008358159601, + "learning_rate": 9.748836422838597e-06, + "loss": 0.3294, + "step": 3208 + }, + { + "epoch": 0.2567148657026859, + "grad_norm": 0.27988414306715015, + "learning_rate": 9.748633664737334e-06, + "loss": 0.2956, + "step": 3209 + }, + { + "epoch": 0.25679486410271796, + "grad_norm": 0.4228685523315738, + "learning_rate": 9.74843082693853e-06, + "loss": 0.2795, + "step": 3210 + }, + { + "epoch": 0.25687486250274993, + "grad_norm": 0.34759790146086567, + "learning_rate": 9.74822790944559e-06, + "loss": 0.2736, + "step": 3211 + }, + { + "epoch": 0.25695486090278197, + "grad_norm": 0.2464487087484887, + "learning_rate": 9.748024912261917e-06, + "loss": 0.3283, + "step": 3212 + }, + { + "epoch": 0.25703485930281395, + "grad_norm": 0.28081185935381325, + "learning_rate": 9.74782183539092e-06, + "loss": 0.2894, + "step": 3213 + }, + { + "epoch": 0.2571148577028459, + "grad_norm": 0.30395882478133074, + "learning_rate": 9.747618678836006e-06, + "loss": 0.2646, + "step": 3214 + }, + { + "epoch": 0.25719485610287796, + "grad_norm": 0.31355911123205715, + "learning_rate": 9.747415442600585e-06, + "loss": 0.2707, + "step": 3215 + }, + { + "epoch": 0.25727485450290993, + "grad_norm": 0.2918532539368113, + "learning_rate": 9.747212126688067e-06, + "loss": 0.3107, + "step": 3216 + }, + { + "epoch": 0.25735485290294197, + "grad_norm": 0.28087800356704284, + "learning_rate": 9.747008731101865e-06, + "loss": 0.3038, + "step": 3217 + }, + { + "epoch": 0.25743485130297394, + "grad_norm": 0.28380640312623645, + "learning_rate": 9.746805255845395e-06, + "loss": 0.2921, + "step": 3218 + }, + { + "epoch": 0.2575148497030059, + "grad_norm": 0.3140372124811116, + "learning_rate": 9.74660170092207e-06, + "loss": 0.2707, + "step": 3219 + }, + { + "epoch": 0.25759484810303795, + "grad_norm": 0.27691319017455657, + "learning_rate": 9.746398066335304e-06, + "loss": 0.3023, + "step": 3220 + }, + { + "epoch": 0.25767484650306993, + "grad_norm": 0.3603708862689575, + "learning_rate": 9.746194352088518e-06, + "loss": 0.3079, + "step": 3221 + }, + { + "epoch": 0.25775484490310197, + "grad_norm": 0.21567589896663422, + "learning_rate": 9.74599055818513e-06, + "loss": 0.3618, + "step": 3222 + }, + { + "epoch": 0.25783484330313394, + "grad_norm": 0.34908780889054475, + "learning_rate": 9.74578668462856e-06, + "loss": 0.2725, + "step": 3223 + }, + { + "epoch": 0.2579148417031659, + "grad_norm": 0.30762254538976935, + "learning_rate": 9.74558273142223e-06, + "loss": 0.312, + "step": 3224 + }, + { + "epoch": 0.25799484010319795, + "grad_norm": 0.28903001684581536, + "learning_rate": 9.745378698569562e-06, + "loss": 0.3015, + "step": 3225 + }, + { + "epoch": 0.25807483850322993, + "grad_norm": 0.32875194600273516, + "learning_rate": 9.745174586073982e-06, + "loss": 0.2676, + "step": 3226 + }, + { + "epoch": 0.2581548369032619, + "grad_norm": 0.29455268459193984, + "learning_rate": 9.744970393938915e-06, + "loss": 0.3179, + "step": 3227 + }, + { + "epoch": 0.25823483530329394, + "grad_norm": 0.30325676975412563, + "learning_rate": 9.744766122167786e-06, + "loss": 0.2761, + "step": 3228 + }, + { + "epoch": 0.2583148337033259, + "grad_norm": 0.2948242738377378, + "learning_rate": 9.744561770764027e-06, + "loss": 0.2992, + "step": 3229 + }, + { + "epoch": 0.25839483210335795, + "grad_norm": 0.27051516574762985, + "learning_rate": 9.744357339731065e-06, + "loss": 0.265, + "step": 3230 + }, + { + "epoch": 0.25847483050338993, + "grad_norm": 0.2621034616887968, + "learning_rate": 9.744152829072333e-06, + "loss": 0.3029, + "step": 3231 + }, + { + "epoch": 0.2585548289034219, + "grad_norm": 0.2910382533903895, + "learning_rate": 9.743948238791262e-06, + "loss": 0.2938, + "step": 3232 + }, + { + "epoch": 0.25863482730345394, + "grad_norm": 0.39652395647044114, + "learning_rate": 9.743743568891287e-06, + "loss": 0.287, + "step": 3233 + }, + { + "epoch": 0.2587148257034859, + "grad_norm": 0.29744086540449843, + "learning_rate": 9.743538819375839e-06, + "loss": 0.2595, + "step": 3234 + }, + { + "epoch": 0.25879482410351795, + "grad_norm": 0.3508410950734714, + "learning_rate": 9.743333990248359e-06, + "loss": 0.2828, + "step": 3235 + }, + { + "epoch": 0.25887482250354993, + "grad_norm": 0.2920741821379883, + "learning_rate": 9.743129081512284e-06, + "loss": 0.291, + "step": 3236 + }, + { + "epoch": 0.2589548209035819, + "grad_norm": 0.2735367332359823, + "learning_rate": 9.742924093171051e-06, + "loss": 0.3086, + "step": 3237 + }, + { + "epoch": 0.25903481930361394, + "grad_norm": 0.3114733871551781, + "learning_rate": 9.742719025228102e-06, + "loss": 0.2513, + "step": 3238 + }, + { + "epoch": 0.2591148177036459, + "grad_norm": 0.31151903013245424, + "learning_rate": 9.742513877686877e-06, + "loss": 0.2538, + "step": 3239 + }, + { + "epoch": 0.25919481610367795, + "grad_norm": 0.3405091854579703, + "learning_rate": 9.742308650550821e-06, + "loss": 0.2658, + "step": 3240 + }, + { + "epoch": 0.2592748145037099, + "grad_norm": 0.2648100413471504, + "learning_rate": 9.742103343823376e-06, + "loss": 0.3161, + "step": 3241 + }, + { + "epoch": 0.2593548129037419, + "grad_norm": 0.33849745698521233, + "learning_rate": 9.741897957507993e-06, + "loss": 0.2983, + "step": 3242 + }, + { + "epoch": 0.25943481130377394, + "grad_norm": 0.3024443579502387, + "learning_rate": 9.741692491608112e-06, + "loss": 0.2665, + "step": 3243 + }, + { + "epoch": 0.2595148097038059, + "grad_norm": 0.2672136156028046, + "learning_rate": 9.741486946127186e-06, + "loss": 0.3054, + "step": 3244 + }, + { + "epoch": 0.25959480810383795, + "grad_norm": 0.2670619187863482, + "learning_rate": 9.741281321068663e-06, + "loss": 0.2994, + "step": 3245 + }, + { + "epoch": 0.2596748065038699, + "grad_norm": 0.24086180092342724, + "learning_rate": 9.741075616435995e-06, + "loss": 0.3194, + "step": 3246 + }, + { + "epoch": 0.2597548049039019, + "grad_norm": 0.31699460478142966, + "learning_rate": 9.740869832232634e-06, + "loss": 0.3209, + "step": 3247 + }, + { + "epoch": 0.25983480330393394, + "grad_norm": 0.26048216398761814, + "learning_rate": 9.740663968462034e-06, + "loss": 0.3144, + "step": 3248 + }, + { + "epoch": 0.2599148017039659, + "grad_norm": 0.27773802069981673, + "learning_rate": 9.740458025127649e-06, + "loss": 0.3074, + "step": 3249 + }, + { + "epoch": 0.25999480010399795, + "grad_norm": 0.3552467262478054, + "learning_rate": 9.740252002232936e-06, + "loss": 0.2972, + "step": 3250 + }, + { + "epoch": 0.2600747985040299, + "grad_norm": 0.3045304557309494, + "learning_rate": 9.740045899781353e-06, + "loss": 0.2403, + "step": 3251 + }, + { + "epoch": 0.2601547969040619, + "grad_norm": 0.27604615190718945, + "learning_rate": 9.73983971777636e-06, + "loss": 0.3216, + "step": 3252 + }, + { + "epoch": 0.26023479530409394, + "grad_norm": 0.32356406830005247, + "learning_rate": 9.739633456221415e-06, + "loss": 0.2939, + "step": 3253 + }, + { + "epoch": 0.2603147937041259, + "grad_norm": 1.487630158580523, + "learning_rate": 9.739427115119981e-06, + "loss": 0.2982, + "step": 3254 + }, + { + "epoch": 0.2603947921041579, + "grad_norm": 0.2962492920471454, + "learning_rate": 9.739220694475522e-06, + "loss": 0.2591, + "step": 3255 + }, + { + "epoch": 0.2604747905041899, + "grad_norm": 0.27195383092622316, + "learning_rate": 9.7390141942915e-06, + "loss": 0.2951, + "step": 3256 + }, + { + "epoch": 0.2605547889042219, + "grad_norm": 0.32019956156401685, + "learning_rate": 9.738807614571384e-06, + "loss": 0.2961, + "step": 3257 + }, + { + "epoch": 0.26063478730425393, + "grad_norm": 0.29118752375192614, + "learning_rate": 9.738600955318637e-06, + "loss": 0.2946, + "step": 3258 + }, + { + "epoch": 0.2607147857042859, + "grad_norm": 0.231787575535212, + "learning_rate": 9.738394216536733e-06, + "loss": 0.3174, + "step": 3259 + }, + { + "epoch": 0.2607947841043179, + "grad_norm": 0.2836800528797434, + "learning_rate": 9.738187398229137e-06, + "loss": 0.2879, + "step": 3260 + }, + { + "epoch": 0.2608747825043499, + "grad_norm": 0.3182819570698695, + "learning_rate": 9.737980500399322e-06, + "loss": 0.2618, + "step": 3261 + }, + { + "epoch": 0.2609547809043819, + "grad_norm": 0.2864719108999295, + "learning_rate": 9.73777352305076e-06, + "loss": 0.2906, + "step": 3262 + }, + { + "epoch": 0.26103477930441393, + "grad_norm": 0.3500651926917599, + "learning_rate": 9.737566466186922e-06, + "loss": 0.2821, + "step": 3263 + }, + { + "epoch": 0.2611147777044459, + "grad_norm": 0.2793646048901002, + "learning_rate": 9.73735932981129e-06, + "loss": 0.2991, + "step": 3264 + }, + { + "epoch": 0.2611947761044779, + "grad_norm": 0.2931298048840424, + "learning_rate": 9.737152113927335e-06, + "loss": 0.295, + "step": 3265 + }, + { + "epoch": 0.2612747745045099, + "grad_norm": 0.3073007769412102, + "learning_rate": 9.736944818538536e-06, + "loss": 0.2789, + "step": 3266 + }, + { + "epoch": 0.2613547729045419, + "grad_norm": 0.3039940650108085, + "learning_rate": 9.736737443648372e-06, + "loss": 0.2951, + "step": 3267 + }, + { + "epoch": 0.26143477130457393, + "grad_norm": 0.31881615996196216, + "learning_rate": 9.736529989260323e-06, + "loss": 0.269, + "step": 3268 + }, + { + "epoch": 0.2615147697046059, + "grad_norm": 0.21991980248983584, + "learning_rate": 9.73632245537787e-06, + "loss": 0.3677, + "step": 3269 + }, + { + "epoch": 0.2615947681046379, + "grad_norm": 0.3240602565368453, + "learning_rate": 9.7361148420045e-06, + "loss": 0.2777, + "step": 3270 + }, + { + "epoch": 0.2616747665046699, + "grad_norm": 0.31730705942922377, + "learning_rate": 9.735907149143695e-06, + "loss": 0.2688, + "step": 3271 + }, + { + "epoch": 0.2617547649047019, + "grad_norm": 0.2941215203614757, + "learning_rate": 9.73569937679894e-06, + "loss": 0.3002, + "step": 3272 + }, + { + "epoch": 0.26183476330473393, + "grad_norm": 0.31247364950108886, + "learning_rate": 9.735491524973723e-06, + "loss": 0.2729, + "step": 3273 + }, + { + "epoch": 0.2619147617047659, + "grad_norm": 0.2862502786293765, + "learning_rate": 9.73528359367153e-06, + "loss": 0.298, + "step": 3274 + }, + { + "epoch": 0.2619947601047979, + "grad_norm": 0.29113964497483635, + "learning_rate": 9.735075582895856e-06, + "loss": 0.3006, + "step": 3275 + }, + { + "epoch": 0.2620747585048299, + "grad_norm": 0.3021742418006258, + "learning_rate": 9.734867492650187e-06, + "loss": 0.2571, + "step": 3276 + }, + { + "epoch": 0.2621547569048619, + "grad_norm": 0.3244099191976601, + "learning_rate": 9.734659322938018e-06, + "loss": 0.3033, + "step": 3277 + }, + { + "epoch": 0.26223475530489393, + "grad_norm": 0.2394255828914602, + "learning_rate": 9.734451073762843e-06, + "loss": 0.3787, + "step": 3278 + }, + { + "epoch": 0.2623147537049259, + "grad_norm": 0.3038020855181389, + "learning_rate": 9.734242745128156e-06, + "loss": 0.2949, + "step": 3279 + }, + { + "epoch": 0.2623947521049579, + "grad_norm": 0.25954194895489024, + "learning_rate": 9.734034337037452e-06, + "loss": 0.3314, + "step": 3280 + }, + { + "epoch": 0.2624747505049899, + "grad_norm": 0.26239388442958606, + "learning_rate": 9.733825849494232e-06, + "loss": 0.3346, + "step": 3281 + }, + { + "epoch": 0.2625547489050219, + "grad_norm": 0.3672997475674531, + "learning_rate": 9.733617282501994e-06, + "loss": 0.2895, + "step": 3282 + }, + { + "epoch": 0.2626347473050539, + "grad_norm": 0.3249400349568183, + "learning_rate": 9.733408636064236e-06, + "loss": 0.2603, + "step": 3283 + }, + { + "epoch": 0.2627147457050859, + "grad_norm": 0.3275285268875061, + "learning_rate": 9.733199910184464e-06, + "loss": 0.2615, + "step": 3284 + }, + { + "epoch": 0.2627947441051179, + "grad_norm": 0.35728076425360167, + "learning_rate": 9.732991104866179e-06, + "loss": 0.2599, + "step": 3285 + }, + { + "epoch": 0.2628747425051499, + "grad_norm": 0.3243932245599651, + "learning_rate": 9.732782220112884e-06, + "loss": 0.3011, + "step": 3286 + }, + { + "epoch": 0.2629547409051819, + "grad_norm": 0.25779972397750855, + "learning_rate": 9.732573255928086e-06, + "loss": 0.3318, + "step": 3287 + }, + { + "epoch": 0.26303473930521387, + "grad_norm": 0.39904677150893636, + "learning_rate": 9.732364212315293e-06, + "loss": 0.2593, + "step": 3288 + }, + { + "epoch": 0.2631147377052459, + "grad_norm": 0.2852369885953849, + "learning_rate": 9.732155089278013e-06, + "loss": 0.3126, + "step": 3289 + }, + { + "epoch": 0.2631947361052779, + "grad_norm": 0.31633298247727804, + "learning_rate": 9.731945886819756e-06, + "loss": 0.2659, + "step": 3290 + }, + { + "epoch": 0.2632747345053099, + "grad_norm": 0.32495253968216375, + "learning_rate": 9.731736604944031e-06, + "loss": 0.2863, + "step": 3291 + }, + { + "epoch": 0.2633547329053419, + "grad_norm": 0.2887882074764957, + "learning_rate": 9.731527243654352e-06, + "loss": 0.2771, + "step": 3292 + }, + { + "epoch": 0.26343473130537387, + "grad_norm": 0.2987400485853798, + "learning_rate": 9.731317802954233e-06, + "loss": 0.29, + "step": 3293 + }, + { + "epoch": 0.2635147297054059, + "grad_norm": 0.19954308000688015, + "learning_rate": 9.731108282847189e-06, + "loss": 0.3381, + "step": 3294 + }, + { + "epoch": 0.2635947281054379, + "grad_norm": 0.2874608593695783, + "learning_rate": 9.730898683336735e-06, + "loss": 0.2999, + "step": 3295 + }, + { + "epoch": 0.2636747265054699, + "grad_norm": 0.3511455185501061, + "learning_rate": 9.730689004426392e-06, + "loss": 0.249, + "step": 3296 + }, + { + "epoch": 0.2637547249055019, + "grad_norm": 0.29820350159924747, + "learning_rate": 9.730479246119677e-06, + "loss": 0.2955, + "step": 3297 + }, + { + "epoch": 0.26383472330553387, + "grad_norm": 0.3158625618793012, + "learning_rate": 9.73026940842011e-06, + "loss": 0.2737, + "step": 3298 + }, + { + "epoch": 0.2639147217055659, + "grad_norm": 0.37134953695820383, + "learning_rate": 9.730059491331214e-06, + "loss": 0.2896, + "step": 3299 + }, + { + "epoch": 0.2639947201055979, + "grad_norm": 0.3056288199349463, + "learning_rate": 9.72984949485651e-06, + "loss": 0.2691, + "step": 3300 + }, + { + "epoch": 0.2640747185056299, + "grad_norm": 0.27795776233097025, + "learning_rate": 9.729639418999524e-06, + "loss": 0.3042, + "step": 3301 + }, + { + "epoch": 0.2641547169056619, + "grad_norm": 0.3495656334955742, + "learning_rate": 9.729429263763781e-06, + "loss": 0.2512, + "step": 3302 + }, + { + "epoch": 0.26423471530569387, + "grad_norm": 0.3145251999706438, + "learning_rate": 9.72921902915281e-06, + "loss": 0.2573, + "step": 3303 + }, + { + "epoch": 0.2643147137057259, + "grad_norm": 0.23899468241463667, + "learning_rate": 9.729008715170137e-06, + "loss": 0.3523, + "step": 3304 + }, + { + "epoch": 0.2643947121057579, + "grad_norm": 0.3450568362481362, + "learning_rate": 9.728798321819294e-06, + "loss": 0.2934, + "step": 3305 + }, + { + "epoch": 0.2644747105057899, + "grad_norm": 0.32223327205109004, + "learning_rate": 9.72858784910381e-06, + "loss": 0.2886, + "step": 3306 + }, + { + "epoch": 0.2645547089058219, + "grad_norm": 0.3223086249263875, + "learning_rate": 9.728377297027218e-06, + "loss": 0.2516, + "step": 3307 + }, + { + "epoch": 0.26463470730585387, + "grad_norm": 0.2866324183886891, + "learning_rate": 9.728166665593053e-06, + "loss": 0.3006, + "step": 3308 + }, + { + "epoch": 0.2647147057058859, + "grad_norm": 0.24527495178358014, + "learning_rate": 9.727955954804848e-06, + "loss": 0.3525, + "step": 3309 + }, + { + "epoch": 0.2647947041059179, + "grad_norm": 0.20786093537727424, + "learning_rate": 9.727745164666142e-06, + "loss": 0.3362, + "step": 3310 + }, + { + "epoch": 0.26487470250594985, + "grad_norm": 0.3472425808348966, + "learning_rate": 9.727534295180471e-06, + "loss": 0.2604, + "step": 3311 + }, + { + "epoch": 0.2649547009059819, + "grad_norm": 0.3270131390312293, + "learning_rate": 9.727323346351374e-06, + "loss": 0.2616, + "step": 3312 + }, + { + "epoch": 0.26503469930601387, + "grad_norm": 0.2725110248153651, + "learning_rate": 9.727112318182392e-06, + "loss": 0.3448, + "step": 3313 + }, + { + "epoch": 0.2651146977060459, + "grad_norm": 0.4115174831327253, + "learning_rate": 9.726901210677066e-06, + "loss": 0.2647, + "step": 3314 + }, + { + "epoch": 0.2651946961060779, + "grad_norm": 0.2826592282726971, + "learning_rate": 9.72669002383894e-06, + "loss": 0.3028, + "step": 3315 + }, + { + "epoch": 0.26527469450610985, + "grad_norm": 0.3365942294289492, + "learning_rate": 9.72647875767156e-06, + "loss": 0.2884, + "step": 3316 + }, + { + "epoch": 0.2653546929061419, + "grad_norm": 0.28974386117515627, + "learning_rate": 9.726267412178467e-06, + "loss": 0.2839, + "step": 3317 + }, + { + "epoch": 0.26543469130617386, + "grad_norm": 0.2969030564574937, + "learning_rate": 9.726055987363212e-06, + "loss": 0.2713, + "step": 3318 + }, + { + "epoch": 0.2655146897062059, + "grad_norm": 0.31580232381748147, + "learning_rate": 9.725844483229342e-06, + "loss": 0.2748, + "step": 3319 + }, + { + "epoch": 0.2655946881062379, + "grad_norm": 0.28374228199959434, + "learning_rate": 9.725632899780406e-06, + "loss": 0.2951, + "step": 3320 + }, + { + "epoch": 0.26567468650626985, + "grad_norm": 0.32877307915009857, + "learning_rate": 9.725421237019957e-06, + "loss": 0.3004, + "step": 3321 + }, + { + "epoch": 0.2657546849063019, + "grad_norm": 0.25142999068398847, + "learning_rate": 9.725209494951547e-06, + "loss": 0.3092, + "step": 3322 + }, + { + "epoch": 0.26583468330633386, + "grad_norm": 0.3464101290305094, + "learning_rate": 9.724997673578727e-06, + "loss": 0.2976, + "step": 3323 + }, + { + "epoch": 0.2659146817063659, + "grad_norm": 0.24850765856760965, + "learning_rate": 9.724785772905057e-06, + "loss": 0.3281, + "step": 3324 + }, + { + "epoch": 0.2659946801063979, + "grad_norm": 0.2970398200846788, + "learning_rate": 9.724573792934089e-06, + "loss": 0.2882, + "step": 3325 + }, + { + "epoch": 0.26607467850642985, + "grad_norm": 0.21853837980171653, + "learning_rate": 9.724361733669383e-06, + "loss": 0.357, + "step": 3326 + }, + { + "epoch": 0.2661546769064619, + "grad_norm": 0.44497950736512093, + "learning_rate": 9.724149595114496e-06, + "loss": 0.249, + "step": 3327 + }, + { + "epoch": 0.26623467530649386, + "grad_norm": 0.27895155376721537, + "learning_rate": 9.723937377272989e-06, + "loss": 0.3032, + "step": 3328 + }, + { + "epoch": 0.2663146737065259, + "grad_norm": 0.2739129974646797, + "learning_rate": 9.723725080148426e-06, + "loss": 0.2957, + "step": 3329 + }, + { + "epoch": 0.2663946721065579, + "grad_norm": 0.297392650414487, + "learning_rate": 9.723512703744369e-06, + "loss": 0.2627, + "step": 3330 + }, + { + "epoch": 0.26647467050658985, + "grad_norm": 0.41813483304230814, + "learning_rate": 9.72330024806438e-06, + "loss": 0.2982, + "step": 3331 + }, + { + "epoch": 0.2665546689066219, + "grad_norm": 0.28240711448204286, + "learning_rate": 9.723087713112027e-06, + "loss": 0.3125, + "step": 3332 + }, + { + "epoch": 0.26663466730665386, + "grad_norm": 0.278401573935145, + "learning_rate": 9.722875098890878e-06, + "loss": 0.3001, + "step": 3333 + }, + { + "epoch": 0.2667146657066859, + "grad_norm": 0.30625688373713295, + "learning_rate": 9.722662405404499e-06, + "loss": 0.2993, + "step": 3334 + }, + { + "epoch": 0.26679466410671787, + "grad_norm": 0.29653441867906993, + "learning_rate": 9.72244963265646e-06, + "loss": 0.276, + "step": 3335 + }, + { + "epoch": 0.26687466250674985, + "grad_norm": 0.30711738345171774, + "learning_rate": 9.722236780650333e-06, + "loss": 0.2703, + "step": 3336 + }, + { + "epoch": 0.2669546609067819, + "grad_norm": 0.3051518138513531, + "learning_rate": 9.72202384938969e-06, + "loss": 0.2599, + "step": 3337 + }, + { + "epoch": 0.26703465930681386, + "grad_norm": 0.26865016629155475, + "learning_rate": 9.721810838878105e-06, + "loss": 0.288, + "step": 3338 + }, + { + "epoch": 0.26711465770684584, + "grad_norm": 0.32100437801558435, + "learning_rate": 9.721597749119151e-06, + "loss": 0.2618, + "step": 3339 + }, + { + "epoch": 0.26719465610687787, + "grad_norm": 0.2742990030426426, + "learning_rate": 9.721384580116409e-06, + "loss": 0.3327, + "step": 3340 + }, + { + "epoch": 0.26727465450690985, + "grad_norm": 0.250498245779253, + "learning_rate": 9.721171331873452e-06, + "loss": 0.3222, + "step": 3341 + }, + { + "epoch": 0.2673546529069419, + "grad_norm": 0.290789182304481, + "learning_rate": 9.720958004393859e-06, + "loss": 0.3059, + "step": 3342 + }, + { + "epoch": 0.26743465130697386, + "grad_norm": 0.25820050014333923, + "learning_rate": 9.720744597681213e-06, + "loss": 0.2884, + "step": 3343 + }, + { + "epoch": 0.26751464970700584, + "grad_norm": 0.33825639371484695, + "learning_rate": 9.720531111739095e-06, + "loss": 0.2566, + "step": 3344 + }, + { + "epoch": 0.26759464810703787, + "grad_norm": 1.2888987200921869, + "learning_rate": 9.720317546571088e-06, + "loss": 0.25, + "step": 3345 + }, + { + "epoch": 0.26767464650706985, + "grad_norm": 0.28241611393234, + "learning_rate": 9.720103902180776e-06, + "loss": 0.3097, + "step": 3346 + }, + { + "epoch": 0.2677546449071019, + "grad_norm": 0.2824841646686313, + "learning_rate": 9.719890178571744e-06, + "loss": 0.3105, + "step": 3347 + }, + { + "epoch": 0.26783464330713386, + "grad_norm": 0.29996986732106473, + "learning_rate": 9.71967637574758e-06, + "loss": 0.2492, + "step": 3348 + }, + { + "epoch": 0.26791464170716583, + "grad_norm": 0.3177365184043487, + "learning_rate": 9.719462493711873e-06, + "loss": 0.2867, + "step": 3349 + }, + { + "epoch": 0.26799464010719787, + "grad_norm": 0.24767417615348608, + "learning_rate": 9.719248532468209e-06, + "loss": 0.313, + "step": 3350 + }, + { + "epoch": 0.26807463850722985, + "grad_norm": 0.3259646254900526, + "learning_rate": 9.719034492020183e-06, + "loss": 0.2646, + "step": 3351 + }, + { + "epoch": 0.2681546369072619, + "grad_norm": 0.2727055541190775, + "learning_rate": 9.718820372371385e-06, + "loss": 0.3003, + "step": 3352 + }, + { + "epoch": 0.26823463530729386, + "grad_norm": 0.3111205801927099, + "learning_rate": 9.718606173525411e-06, + "loss": 0.2741, + "step": 3353 + }, + { + "epoch": 0.26831463370732583, + "grad_norm": 0.3043032031903069, + "learning_rate": 9.718391895485853e-06, + "loss": 0.2728, + "step": 3354 + }, + { + "epoch": 0.26839463210735787, + "grad_norm": 0.2486021372953984, + "learning_rate": 9.718177538256309e-06, + "loss": 0.3073, + "step": 3355 + }, + { + "epoch": 0.26847463050738984, + "grad_norm": 0.24096639141712556, + "learning_rate": 9.717963101840375e-06, + "loss": 0.3225, + "step": 3356 + }, + { + "epoch": 0.2685546289074219, + "grad_norm": 0.3120002122875939, + "learning_rate": 9.717748586241653e-06, + "loss": 0.2758, + "step": 3357 + }, + { + "epoch": 0.26863462730745385, + "grad_norm": 0.28756894606994393, + "learning_rate": 9.717533991463742e-06, + "loss": 0.3028, + "step": 3358 + }, + { + "epoch": 0.26871462570748583, + "grad_norm": 0.32677744244938034, + "learning_rate": 9.71731931751024e-06, + "loss": 0.2708, + "step": 3359 + }, + { + "epoch": 0.26879462410751787, + "grad_norm": 0.2940324280410806, + "learning_rate": 9.717104564384756e-06, + "loss": 0.2724, + "step": 3360 + }, + { + "epoch": 0.26887462250754984, + "grad_norm": 0.3815781685218439, + "learning_rate": 9.71688973209089e-06, + "loss": 0.2579, + "step": 3361 + }, + { + "epoch": 0.2689546209075818, + "grad_norm": 0.3052222603650581, + "learning_rate": 9.716674820632248e-06, + "loss": 0.2879, + "step": 3362 + }, + { + "epoch": 0.26903461930761385, + "grad_norm": 0.2701807211767095, + "learning_rate": 9.716459830012439e-06, + "loss": 0.2969, + "step": 3363 + }, + { + "epoch": 0.26911461770764583, + "grad_norm": 0.2934183183848906, + "learning_rate": 9.71624476023507e-06, + "loss": 0.3023, + "step": 3364 + }, + { + "epoch": 0.26919461610767786, + "grad_norm": 0.3043936713373222, + "learning_rate": 9.71602961130375e-06, + "loss": 0.2504, + "step": 3365 + }, + { + "epoch": 0.26927461450770984, + "grad_norm": 0.2836284709611108, + "learning_rate": 9.71581438322209e-06, + "loss": 0.2908, + "step": 3366 + }, + { + "epoch": 0.2693546129077418, + "grad_norm": 0.29046907736885885, + "learning_rate": 9.715599075993705e-06, + "loss": 0.285, + "step": 3367 + }, + { + "epoch": 0.26943461130777385, + "grad_norm": 0.3377959964629928, + "learning_rate": 9.715383689622205e-06, + "loss": 0.2779, + "step": 3368 + }, + { + "epoch": 0.26951460970780583, + "grad_norm": 0.3293983847601966, + "learning_rate": 9.715168224111205e-06, + "loss": 0.274, + "step": 3369 + }, + { + "epoch": 0.26959460810783786, + "grad_norm": 0.28717511577037863, + "learning_rate": 9.714952679464324e-06, + "loss": 0.2818, + "step": 3370 + }, + { + "epoch": 0.26967460650786984, + "grad_norm": 0.27764524530597867, + "learning_rate": 9.714737055685176e-06, + "loss": 0.292, + "step": 3371 + }, + { + "epoch": 0.2697546049079018, + "grad_norm": 0.3114071421627263, + "learning_rate": 9.714521352777383e-06, + "loss": 0.2627, + "step": 3372 + }, + { + "epoch": 0.26983460330793385, + "grad_norm": 0.2723969094675828, + "learning_rate": 9.714305570744564e-06, + "loss": 0.2963, + "step": 3373 + }, + { + "epoch": 0.26991460170796583, + "grad_norm": 0.31989046544946853, + "learning_rate": 9.71408970959034e-06, + "loss": 0.2511, + "step": 3374 + }, + { + "epoch": 0.26999460010799786, + "grad_norm": 0.1954346841357914, + "learning_rate": 9.713873769318333e-06, + "loss": 0.3505, + "step": 3375 + }, + { + "epoch": 0.27007459850802984, + "grad_norm": 0.2672554019239106, + "learning_rate": 9.713657749932172e-06, + "loss": 0.3058, + "step": 3376 + }, + { + "epoch": 0.2701545969080618, + "grad_norm": 0.263092771566001, + "learning_rate": 9.713441651435477e-06, + "loss": 0.3082, + "step": 3377 + }, + { + "epoch": 0.27023459530809385, + "grad_norm": 0.2806704501647788, + "learning_rate": 9.713225473831878e-06, + "loss": 0.2928, + "step": 3378 + }, + { + "epoch": 0.2703145937081258, + "grad_norm": 0.33940064480119825, + "learning_rate": 9.713009217125e-06, + "loss": 0.3054, + "step": 3379 + }, + { + "epoch": 0.27039459210815786, + "grad_norm": 0.2856751494725099, + "learning_rate": 9.712792881318475e-06, + "loss": 0.3352, + "step": 3380 + }, + { + "epoch": 0.27047459050818984, + "grad_norm": 0.24291882245532978, + "learning_rate": 9.712576466415935e-06, + "loss": 0.369, + "step": 3381 + }, + { + "epoch": 0.2705545889082218, + "grad_norm": 0.31407908794926154, + "learning_rate": 9.712359972421008e-06, + "loss": 0.3012, + "step": 3382 + }, + { + "epoch": 0.27063458730825385, + "grad_norm": 0.3380004327703376, + "learning_rate": 9.712143399337333e-06, + "loss": 0.2626, + "step": 3383 + }, + { + "epoch": 0.2707145857082858, + "grad_norm": 0.3013170073120938, + "learning_rate": 9.711926747168539e-06, + "loss": 0.2743, + "step": 3384 + }, + { + "epoch": 0.27079458410831786, + "grad_norm": 0.3300293994939693, + "learning_rate": 9.711710015918266e-06, + "loss": 0.2603, + "step": 3385 + }, + { + "epoch": 0.27087458250834984, + "grad_norm": 0.3258324076830785, + "learning_rate": 9.71149320559015e-06, + "loss": 0.2594, + "step": 3386 + }, + { + "epoch": 0.2709545809083818, + "grad_norm": 0.32567848907553143, + "learning_rate": 9.71127631618783e-06, + "loss": 0.2612, + "step": 3387 + }, + { + "epoch": 0.27103457930841385, + "grad_norm": 0.3040966604903849, + "learning_rate": 9.711059347714947e-06, + "loss": 0.274, + "step": 3388 + }, + { + "epoch": 0.2711145777084458, + "grad_norm": 0.3542670554711023, + "learning_rate": 9.710842300175141e-06, + "loss": 0.276, + "step": 3389 + }, + { + "epoch": 0.2711945761084778, + "grad_norm": 0.29618113692179054, + "learning_rate": 9.710625173572057e-06, + "loss": 0.2925, + "step": 3390 + }, + { + "epoch": 0.27127457450850984, + "grad_norm": 0.2795797016294275, + "learning_rate": 9.710407967909336e-06, + "loss": 0.2974, + "step": 3391 + }, + { + "epoch": 0.2713545729085418, + "grad_norm": 0.2865469162148655, + "learning_rate": 9.710190683190626e-06, + "loss": 0.286, + "step": 3392 + }, + { + "epoch": 0.27143457130857385, + "grad_norm": 0.3150370508779848, + "learning_rate": 9.709973319419572e-06, + "loss": 0.2857, + "step": 3393 + }, + { + "epoch": 0.2715145697086058, + "grad_norm": 0.3224244835097185, + "learning_rate": 9.709755876599822e-06, + "loss": 0.2887, + "step": 3394 + }, + { + "epoch": 0.2715945681086378, + "grad_norm": 0.2931722640292034, + "learning_rate": 9.709538354735026e-06, + "loss": 0.2779, + "step": 3395 + }, + { + "epoch": 0.27167456650866983, + "grad_norm": 0.3091348559939679, + "learning_rate": 9.709320753828837e-06, + "loss": 0.2879, + "step": 3396 + }, + { + "epoch": 0.2717545649087018, + "grad_norm": 0.33430000686222283, + "learning_rate": 9.709103073884905e-06, + "loss": 0.2657, + "step": 3397 + }, + { + "epoch": 0.27183456330873385, + "grad_norm": 0.26577951180156834, + "learning_rate": 9.708885314906882e-06, + "loss": 0.3432, + "step": 3398 + }, + { + "epoch": 0.2719145617087658, + "grad_norm": 0.3290026370917659, + "learning_rate": 9.708667476898423e-06, + "loss": 0.2842, + "step": 3399 + }, + { + "epoch": 0.2719945601087978, + "grad_norm": 0.2644554855859131, + "learning_rate": 9.708449559863187e-06, + "loss": 0.3232, + "step": 3400 + }, + { + "epoch": 0.27207455850882983, + "grad_norm": 0.27777301703914026, + "learning_rate": 9.708231563804828e-06, + "loss": 0.3071, + "step": 3401 + }, + { + "epoch": 0.2721545569088618, + "grad_norm": 0.3547900861605703, + "learning_rate": 9.708013488727006e-06, + "loss": 0.2784, + "step": 3402 + }, + { + "epoch": 0.27223455530889384, + "grad_norm": 0.2618491764283487, + "learning_rate": 9.707795334633383e-06, + "loss": 0.2938, + "step": 3403 + }, + { + "epoch": 0.2723145537089258, + "grad_norm": 0.3366615249883749, + "learning_rate": 9.707577101527616e-06, + "loss": 0.2787, + "step": 3404 + }, + { + "epoch": 0.2723945521089578, + "grad_norm": 0.32633388071026886, + "learning_rate": 9.707358789413373e-06, + "loss": 0.2755, + "step": 3405 + }, + { + "epoch": 0.27247455050898983, + "grad_norm": 0.2714848638949016, + "learning_rate": 9.707140398294313e-06, + "loss": 0.2816, + "step": 3406 + }, + { + "epoch": 0.2725545489090218, + "grad_norm": 0.2762041495719878, + "learning_rate": 9.706921928174105e-06, + "loss": 0.2721, + "step": 3407 + }, + { + "epoch": 0.27263454730905384, + "grad_norm": 0.26482309361463785, + "learning_rate": 9.706703379056412e-06, + "loss": 0.3166, + "step": 3408 + }, + { + "epoch": 0.2727145457090858, + "grad_norm": 0.32858530451554446, + "learning_rate": 9.706484750944905e-06, + "loss": 0.2631, + "step": 3409 + }, + { + "epoch": 0.2727945441091178, + "grad_norm": 0.31745761189708305, + "learning_rate": 9.706266043843253e-06, + "loss": 0.2863, + "step": 3410 + }, + { + "epoch": 0.27287454250914983, + "grad_norm": 0.30593428584074167, + "learning_rate": 9.706047257755124e-06, + "loss": 0.2885, + "step": 3411 + }, + { + "epoch": 0.2729545409091818, + "grad_norm": 0.2735380321845854, + "learning_rate": 9.705828392684194e-06, + "loss": 0.3356, + "step": 3412 + }, + { + "epoch": 0.27303453930921384, + "grad_norm": 0.2615612160715219, + "learning_rate": 9.705609448634133e-06, + "loss": 0.323, + "step": 3413 + }, + { + "epoch": 0.2731145377092458, + "grad_norm": 0.2982344390512166, + "learning_rate": 9.705390425608617e-06, + "loss": 0.2881, + "step": 3414 + }, + { + "epoch": 0.2731945361092778, + "grad_norm": 0.3071638640440055, + "learning_rate": 9.705171323611322e-06, + "loss": 0.2992, + "step": 3415 + }, + { + "epoch": 0.27327453450930983, + "grad_norm": 0.3447159883085105, + "learning_rate": 9.704952142645925e-06, + "loss": 0.309, + "step": 3416 + }, + { + "epoch": 0.2733545329093418, + "grad_norm": 0.2774190670158889, + "learning_rate": 9.704732882716104e-06, + "loss": 0.3167, + "step": 3417 + }, + { + "epoch": 0.2734345313093738, + "grad_norm": 0.3022004072712756, + "learning_rate": 9.70451354382554e-06, + "loss": 0.2895, + "step": 3418 + }, + { + "epoch": 0.2735145297094058, + "grad_norm": 0.27718848587746664, + "learning_rate": 9.704294125977912e-06, + "loss": 0.3116, + "step": 3419 + }, + { + "epoch": 0.2735945281094378, + "grad_norm": 0.2865912302171865, + "learning_rate": 9.704074629176905e-06, + "loss": 0.3117, + "step": 3420 + }, + { + "epoch": 0.27367452650946983, + "grad_norm": 0.28617222537538883, + "learning_rate": 9.703855053426202e-06, + "loss": 0.3015, + "step": 3421 + }, + { + "epoch": 0.2737545249095018, + "grad_norm": 0.2477933144543708, + "learning_rate": 9.70363539872949e-06, + "loss": 0.3238, + "step": 3422 + }, + { + "epoch": 0.2738345233095338, + "grad_norm": 0.2652507433983257, + "learning_rate": 9.703415665090452e-06, + "loss": 0.3303, + "step": 3423 + }, + { + "epoch": 0.2739145217095658, + "grad_norm": 0.2037894228535896, + "learning_rate": 9.703195852512776e-06, + "loss": 0.3504, + "step": 3424 + }, + { + "epoch": 0.2739945201095978, + "grad_norm": 0.30790354644645107, + "learning_rate": 9.702975961000155e-06, + "loss": 0.2921, + "step": 3425 + }, + { + "epoch": 0.2740745185096298, + "grad_norm": 0.3165170959258859, + "learning_rate": 9.702755990556277e-06, + "loss": 0.2727, + "step": 3426 + }, + { + "epoch": 0.2741545169096618, + "grad_norm": 0.2970940593119044, + "learning_rate": 9.702535941184833e-06, + "loss": 0.3008, + "step": 3427 + }, + { + "epoch": 0.2742345153096938, + "grad_norm": 0.30707267989733694, + "learning_rate": 9.702315812889518e-06, + "loss": 0.266, + "step": 3428 + }, + { + "epoch": 0.2743145137097258, + "grad_norm": 0.32388109721849806, + "learning_rate": 9.702095605674027e-06, + "loss": 0.2575, + "step": 3429 + }, + { + "epoch": 0.2743945121097578, + "grad_norm": 0.25905050208092584, + "learning_rate": 9.701875319542052e-06, + "loss": 0.3192, + "step": 3430 + }, + { + "epoch": 0.2744745105097898, + "grad_norm": 0.31540322714569075, + "learning_rate": 9.701654954497294e-06, + "loss": 0.2647, + "step": 3431 + }, + { + "epoch": 0.2745545089098218, + "grad_norm": 0.26679652232924855, + "learning_rate": 9.70143451054345e-06, + "loss": 0.3259, + "step": 3432 + }, + { + "epoch": 0.2746345073098538, + "grad_norm": 0.31837083310889264, + "learning_rate": 9.70121398768422e-06, + "loss": 0.294, + "step": 3433 + }, + { + "epoch": 0.2747145057098858, + "grad_norm": 0.3538398625734197, + "learning_rate": 9.700993385923303e-06, + "loss": 0.2652, + "step": 3434 + }, + { + "epoch": 0.2747945041099178, + "grad_norm": 0.32443731258814784, + "learning_rate": 9.700772705264405e-06, + "loss": 0.2759, + "step": 3435 + }, + { + "epoch": 0.2748745025099498, + "grad_norm": 0.3274663079846949, + "learning_rate": 9.700551945711228e-06, + "loss": 0.2682, + "step": 3436 + }, + { + "epoch": 0.2749545009099818, + "grad_norm": 0.3332624411008144, + "learning_rate": 9.700331107267477e-06, + "loss": 0.2738, + "step": 3437 + }, + { + "epoch": 0.2750344993100138, + "grad_norm": 0.3469697981360618, + "learning_rate": 9.700110189936858e-06, + "loss": 0.2525, + "step": 3438 + }, + { + "epoch": 0.2751144977100458, + "grad_norm": 0.3185481723066586, + "learning_rate": 9.69988919372308e-06, + "loss": 0.2637, + "step": 3439 + }, + { + "epoch": 0.2751944961100778, + "grad_norm": 0.3255951733404259, + "learning_rate": 9.69966811862985e-06, + "loss": 0.2465, + "step": 3440 + }, + { + "epoch": 0.2752744945101098, + "grad_norm": 0.3139263980423136, + "learning_rate": 9.699446964660882e-06, + "loss": 0.2565, + "step": 3441 + }, + { + "epoch": 0.2753544929101418, + "grad_norm": 0.2916452937014567, + "learning_rate": 9.699225731819884e-06, + "loss": 0.3051, + "step": 3442 + }, + { + "epoch": 0.2754344913101738, + "grad_norm": 0.2924444536331352, + "learning_rate": 9.69900442011057e-06, + "loss": 0.3008, + "step": 3443 + }, + { + "epoch": 0.2755144897102058, + "grad_norm": 0.2970619984898718, + "learning_rate": 9.698783029536653e-06, + "loss": 0.2891, + "step": 3444 + }, + { + "epoch": 0.2755944881102378, + "grad_norm": 0.25925161108317984, + "learning_rate": 9.698561560101853e-06, + "loss": 0.29, + "step": 3445 + }, + { + "epoch": 0.27567448651026977, + "grad_norm": 0.3522522008529718, + "learning_rate": 9.698340011809883e-06, + "loss": 0.2787, + "step": 3446 + }, + { + "epoch": 0.2757544849103018, + "grad_norm": 0.25942200608700616, + "learning_rate": 9.698118384664464e-06, + "loss": 0.3372, + "step": 3447 + }, + { + "epoch": 0.2758344833103338, + "grad_norm": 0.3236105055403877, + "learning_rate": 9.697896678669313e-06, + "loss": 0.2847, + "step": 3448 + }, + { + "epoch": 0.2759144817103658, + "grad_norm": 0.28259322706556816, + "learning_rate": 9.69767489382815e-06, + "loss": 0.3081, + "step": 3449 + }, + { + "epoch": 0.2759944801103978, + "grad_norm": 0.320447191303649, + "learning_rate": 9.697453030144703e-06, + "loss": 0.2741, + "step": 3450 + }, + { + "epoch": 0.27607447851042977, + "grad_norm": 0.31942562013903714, + "learning_rate": 9.697231087622691e-06, + "loss": 0.2648, + "step": 3451 + }, + { + "epoch": 0.2761544769104618, + "grad_norm": 0.3069206549259995, + "learning_rate": 9.697009066265839e-06, + "loss": 0.258, + "step": 3452 + }, + { + "epoch": 0.2762344753104938, + "grad_norm": 0.25827490507190504, + "learning_rate": 9.696786966077875e-06, + "loss": 0.3135, + "step": 3453 + }, + { + "epoch": 0.2763144737105258, + "grad_norm": 0.3388246551684192, + "learning_rate": 9.696564787062526e-06, + "loss": 0.2701, + "step": 3454 + }, + { + "epoch": 0.2763944721105578, + "grad_norm": 0.3485375777713787, + "learning_rate": 9.69634252922352e-06, + "loss": 0.2746, + "step": 3455 + }, + { + "epoch": 0.27647447051058976, + "grad_norm": 0.3038528240215262, + "learning_rate": 9.696120192564587e-06, + "loss": 0.281, + "step": 3456 + }, + { + "epoch": 0.2765544689106218, + "grad_norm": 0.4416445832446767, + "learning_rate": 9.69589777708946e-06, + "loss": 0.3302, + "step": 3457 + }, + { + "epoch": 0.2766344673106538, + "grad_norm": 0.32307712399478633, + "learning_rate": 9.695675282801873e-06, + "loss": 0.3014, + "step": 3458 + }, + { + "epoch": 0.2767144657106858, + "grad_norm": 0.27799900034853686, + "learning_rate": 9.695452709705555e-06, + "loss": 0.2947, + "step": 3459 + }, + { + "epoch": 0.2767944641107178, + "grad_norm": 0.35006191671828635, + "learning_rate": 9.695230057804248e-06, + "loss": 0.2663, + "step": 3460 + }, + { + "epoch": 0.27687446251074976, + "grad_norm": 0.3115638032397091, + "learning_rate": 9.695007327101685e-06, + "loss": 0.2789, + "step": 3461 + }, + { + "epoch": 0.2769544609107818, + "grad_norm": 0.29179097980121665, + "learning_rate": 9.694784517601604e-06, + "loss": 0.2689, + "step": 3462 + }, + { + "epoch": 0.2770344593108138, + "grad_norm": 0.2953682128747273, + "learning_rate": 9.694561629307745e-06, + "loss": 0.2828, + "step": 3463 + }, + { + "epoch": 0.2771144577108458, + "grad_norm": 0.282683785376063, + "learning_rate": 9.69433866222385e-06, + "loss": 0.2923, + "step": 3464 + }, + { + "epoch": 0.2771944561108778, + "grad_norm": 0.30253245521849464, + "learning_rate": 9.694115616353662e-06, + "loss": 0.3065, + "step": 3465 + }, + { + "epoch": 0.27727445451090976, + "grad_norm": 0.2842160775515158, + "learning_rate": 9.693892491700919e-06, + "loss": 0.3005, + "step": 3466 + }, + { + "epoch": 0.2773544529109418, + "grad_norm": 0.31437851175070225, + "learning_rate": 9.693669288269371e-06, + "loss": 0.2484, + "step": 3467 + }, + { + "epoch": 0.2774344513109738, + "grad_norm": 0.23104963252187732, + "learning_rate": 9.693446006062764e-06, + "loss": 0.3149, + "step": 3468 + }, + { + "epoch": 0.2775144497110058, + "grad_norm": 0.31049297068487686, + "learning_rate": 9.69322264508484e-06, + "loss": 0.2766, + "step": 3469 + }, + { + "epoch": 0.2775944481110378, + "grad_norm": 0.2690023524368739, + "learning_rate": 9.692999205339356e-06, + "loss": 0.2919, + "step": 3470 + }, + { + "epoch": 0.27767444651106976, + "grad_norm": 0.3029907628210159, + "learning_rate": 9.692775686830057e-06, + "loss": 0.2853, + "step": 3471 + }, + { + "epoch": 0.2777544449111018, + "grad_norm": 0.5684404932906133, + "learning_rate": 9.692552089560695e-06, + "loss": 0.2563, + "step": 3472 + }, + { + "epoch": 0.27783444331113377, + "grad_norm": 0.3215899309324039, + "learning_rate": 9.69232841353502e-06, + "loss": 0.2471, + "step": 3473 + }, + { + "epoch": 0.27791444171116575, + "grad_norm": 0.30719762658838706, + "learning_rate": 9.69210465875679e-06, + "loss": 0.2554, + "step": 3474 + }, + { + "epoch": 0.2779944401111978, + "grad_norm": 0.34701367591986115, + "learning_rate": 9.69188082522976e-06, + "loss": 0.2797, + "step": 3475 + }, + { + "epoch": 0.27807443851122976, + "grad_norm": 0.3094979214597019, + "learning_rate": 9.691656912957686e-06, + "loss": 0.2661, + "step": 3476 + }, + { + "epoch": 0.2781544369112618, + "grad_norm": 0.32863185105908344, + "learning_rate": 9.691432921944325e-06, + "loss": 0.2876, + "step": 3477 + }, + { + "epoch": 0.27823443531129377, + "grad_norm": 0.2918550380099812, + "learning_rate": 9.691208852193438e-06, + "loss": 0.2952, + "step": 3478 + }, + { + "epoch": 0.27831443371132575, + "grad_norm": 0.28682632021408966, + "learning_rate": 9.690984703708783e-06, + "loss": 0.3103, + "step": 3479 + }, + { + "epoch": 0.2783944321113578, + "grad_norm": 0.2876496445255653, + "learning_rate": 9.690760476494125e-06, + "loss": 0.2878, + "step": 3480 + }, + { + "epoch": 0.27847443051138976, + "grad_norm": 0.3135698285542876, + "learning_rate": 9.690536170553226e-06, + "loss": 0.2735, + "step": 3481 + }, + { + "epoch": 0.2785544289114218, + "grad_norm": 0.2818132190689427, + "learning_rate": 9.69031178588985e-06, + "loss": 0.2998, + "step": 3482 + }, + { + "epoch": 0.27863442731145377, + "grad_norm": 0.3712455840706995, + "learning_rate": 9.690087322507763e-06, + "loss": 0.2601, + "step": 3483 + }, + { + "epoch": 0.27871442571148575, + "grad_norm": 0.31238838748874187, + "learning_rate": 9.689862780410732e-06, + "loss": 0.2653, + "step": 3484 + }, + { + "epoch": 0.2787944241115178, + "grad_norm": 0.38769530488490955, + "learning_rate": 9.689638159602527e-06, + "loss": 0.2687, + "step": 3485 + }, + { + "epoch": 0.27887442251154976, + "grad_norm": 0.29039237619758346, + "learning_rate": 9.689413460086917e-06, + "loss": 0.3193, + "step": 3486 + }, + { + "epoch": 0.2789544209115818, + "grad_norm": 0.25073839412898447, + "learning_rate": 9.689188681867675e-06, + "loss": 0.3188, + "step": 3487 + }, + { + "epoch": 0.27903441931161377, + "grad_norm": 0.335596696242481, + "learning_rate": 9.68896382494857e-06, + "loss": 0.2699, + "step": 3488 + }, + { + "epoch": 0.27911441771164575, + "grad_norm": 0.25055872779079974, + "learning_rate": 9.688738889333376e-06, + "loss": 0.3463, + "step": 3489 + }, + { + "epoch": 0.2791944161116778, + "grad_norm": 0.3127114679308454, + "learning_rate": 9.688513875025871e-06, + "loss": 0.2711, + "step": 3490 + }, + { + "epoch": 0.27927441451170976, + "grad_norm": 0.2880476741957524, + "learning_rate": 9.68828878202983e-06, + "loss": 0.2955, + "step": 3491 + }, + { + "epoch": 0.2793544129117418, + "grad_norm": 0.30691679753455314, + "learning_rate": 9.688063610349033e-06, + "loss": 0.31, + "step": 3492 + }, + { + "epoch": 0.27943441131177377, + "grad_norm": 0.32814671488186115, + "learning_rate": 9.687838359987254e-06, + "loss": 0.2386, + "step": 3493 + }, + { + "epoch": 0.27951440971180574, + "grad_norm": 0.3155816575952546, + "learning_rate": 9.687613030948277e-06, + "loss": 0.2594, + "step": 3494 + }, + { + "epoch": 0.2795944081118378, + "grad_norm": 0.32365509684370347, + "learning_rate": 9.687387623235885e-06, + "loss": 0.2897, + "step": 3495 + }, + { + "epoch": 0.27967440651186976, + "grad_norm": 0.2854343279497684, + "learning_rate": 9.687162136853858e-06, + "loss": 0.2776, + "step": 3496 + }, + { + "epoch": 0.2797544049119018, + "grad_norm": 0.3192117243365602, + "learning_rate": 9.686936571805982e-06, + "loss": 0.2715, + "step": 3497 + }, + { + "epoch": 0.27983440331193377, + "grad_norm": 0.3090694960718188, + "learning_rate": 9.68671092809604e-06, + "loss": 0.2648, + "step": 3498 + }, + { + "epoch": 0.27991440171196574, + "grad_norm": 0.29365525155090805, + "learning_rate": 9.686485205727827e-06, + "loss": 0.3172, + "step": 3499 + }, + { + "epoch": 0.2799944001119978, + "grad_norm": 0.24379337465862244, + "learning_rate": 9.686259404705122e-06, + "loss": 0.3136, + "step": 3500 + }, + { + "epoch": 0.28007439851202975, + "grad_norm": 0.26877434204596284, + "learning_rate": 9.68603352503172e-06, + "loss": 0.3034, + "step": 3501 + }, + { + "epoch": 0.28015439691206173, + "grad_norm": 0.2727502723213497, + "learning_rate": 9.685807566711409e-06, + "loss": 0.2848, + "step": 3502 + }, + { + "epoch": 0.28023439531209376, + "grad_norm": 0.2701070517944016, + "learning_rate": 9.685581529747982e-06, + "loss": 0.3111, + "step": 3503 + }, + { + "epoch": 0.28031439371212574, + "grad_norm": 0.37025514028448614, + "learning_rate": 9.685355414145237e-06, + "loss": 0.2724, + "step": 3504 + }, + { + "epoch": 0.2803943921121578, + "grad_norm": 0.303195262015166, + "learning_rate": 9.685129219906964e-06, + "loss": 0.3027, + "step": 3505 + }, + { + "epoch": 0.28047439051218975, + "grad_norm": 0.28952981722471194, + "learning_rate": 9.684902947036959e-06, + "loss": 0.2783, + "step": 3506 + }, + { + "epoch": 0.28055438891222173, + "grad_norm": 0.2885010529664571, + "learning_rate": 9.684676595539023e-06, + "loss": 0.2969, + "step": 3507 + }, + { + "epoch": 0.28063438731225376, + "grad_norm": 0.3128208179187677, + "learning_rate": 9.684450165416953e-06, + "loss": 0.2836, + "step": 3508 + }, + { + "epoch": 0.28071438571228574, + "grad_norm": 0.27154605730218084, + "learning_rate": 9.684223656674548e-06, + "loss": 0.2945, + "step": 3509 + }, + { + "epoch": 0.2807943841123178, + "grad_norm": 0.2615764046320604, + "learning_rate": 9.683997069315612e-06, + "loss": 0.2998, + "step": 3510 + }, + { + "epoch": 0.28087438251234975, + "grad_norm": 0.3156322050827389, + "learning_rate": 9.683770403343947e-06, + "loss": 0.2587, + "step": 3511 + }, + { + "epoch": 0.28095438091238173, + "grad_norm": 0.3148725487243505, + "learning_rate": 9.683543658763357e-06, + "loss": 0.2829, + "step": 3512 + }, + { + "epoch": 0.28103437931241376, + "grad_norm": 0.24727285575114463, + "learning_rate": 9.683316835577648e-06, + "loss": 0.3314, + "step": 3513 + }, + { + "epoch": 0.28111437771244574, + "grad_norm": 0.24754777489049318, + "learning_rate": 9.683089933790626e-06, + "loss": 0.2999, + "step": 3514 + }, + { + "epoch": 0.2811943761124778, + "grad_norm": 0.24461125398433178, + "learning_rate": 9.6828629534061e-06, + "loss": 0.3122, + "step": 3515 + }, + { + "epoch": 0.28127437451250975, + "grad_norm": 0.2716940807741596, + "learning_rate": 9.682635894427878e-06, + "loss": 0.2864, + "step": 3516 + }, + { + "epoch": 0.28135437291254173, + "grad_norm": 0.22107850125775544, + "learning_rate": 9.682408756859772e-06, + "loss": 0.3047, + "step": 3517 + }, + { + "epoch": 0.28143437131257376, + "grad_norm": 0.2720880829421066, + "learning_rate": 9.682181540705596e-06, + "loss": 0.2953, + "step": 3518 + }, + { + "epoch": 0.28151436971260574, + "grad_norm": 0.28302441241508147, + "learning_rate": 9.681954245969158e-06, + "loss": 0.2865, + "step": 3519 + }, + { + "epoch": 0.28159436811263777, + "grad_norm": 0.31699394921124563, + "learning_rate": 9.681726872654278e-06, + "loss": 0.2793, + "step": 3520 + }, + { + "epoch": 0.28167436651266975, + "grad_norm": 0.311326859843477, + "learning_rate": 9.681499420764771e-06, + "loss": 0.2939, + "step": 3521 + }, + { + "epoch": 0.2817543649127017, + "grad_norm": 0.3126531677953092, + "learning_rate": 9.681271890304451e-06, + "loss": 0.2696, + "step": 3522 + }, + { + "epoch": 0.28183436331273376, + "grad_norm": 0.28627298391850425, + "learning_rate": 9.681044281277141e-06, + "loss": 0.3104, + "step": 3523 + }, + { + "epoch": 0.28191436171276574, + "grad_norm": 0.2869154481880232, + "learning_rate": 9.68081659368666e-06, + "loss": 0.3117, + "step": 3524 + }, + { + "epoch": 0.28199436011279777, + "grad_norm": 0.3224264963164287, + "learning_rate": 9.680588827536828e-06, + "loss": 0.2584, + "step": 3525 + }, + { + "epoch": 0.28207435851282975, + "grad_norm": 0.32270759340202915, + "learning_rate": 9.680360982831467e-06, + "loss": 0.2619, + "step": 3526 + }, + { + "epoch": 0.2821543569128617, + "grad_norm": 0.24948287534558383, + "learning_rate": 9.680133059574403e-06, + "loss": 0.3379, + "step": 3527 + }, + { + "epoch": 0.28223435531289376, + "grad_norm": 0.3097735572798732, + "learning_rate": 9.67990505776946e-06, + "loss": 0.2765, + "step": 3528 + }, + { + "epoch": 0.28231435371292574, + "grad_norm": 0.28730750520419135, + "learning_rate": 9.679676977420467e-06, + "loss": 0.3304, + "step": 3529 + }, + { + "epoch": 0.2823943521129577, + "grad_norm": 0.33353603419496103, + "learning_rate": 9.679448818531248e-06, + "loss": 0.2599, + "step": 3530 + }, + { + "epoch": 0.28247435051298975, + "grad_norm": 0.3351463484192178, + "learning_rate": 9.679220581105636e-06, + "loss": 0.2768, + "step": 3531 + }, + { + "epoch": 0.2825543489130217, + "grad_norm": 0.26887352570595346, + "learning_rate": 9.678992265147458e-06, + "loss": 0.2887, + "step": 3532 + }, + { + "epoch": 0.28263434731305376, + "grad_norm": 0.28827037007858636, + "learning_rate": 9.67876387066055e-06, + "loss": 0.3126, + "step": 3533 + }, + { + "epoch": 0.28271434571308574, + "grad_norm": 0.31643090337016383, + "learning_rate": 9.678535397648741e-06, + "loss": 0.255, + "step": 3534 + }, + { + "epoch": 0.2827943441131177, + "grad_norm": 0.24295894839082222, + "learning_rate": 9.67830684611587e-06, + "loss": 0.3208, + "step": 3535 + }, + { + "epoch": 0.28287434251314975, + "grad_norm": 0.2630339611720179, + "learning_rate": 9.678078216065766e-06, + "loss": 0.3218, + "step": 3536 + }, + { + "epoch": 0.2829543409131817, + "grad_norm": 0.24506561646479622, + "learning_rate": 9.677849507502275e-06, + "loss": 0.3143, + "step": 3537 + }, + { + "epoch": 0.28303433931321376, + "grad_norm": 0.28856380505982, + "learning_rate": 9.67762072042923e-06, + "loss": 0.307, + "step": 3538 + }, + { + "epoch": 0.28311433771324573, + "grad_norm": 0.25562677580581983, + "learning_rate": 9.67739185485047e-06, + "loss": 0.291, + "step": 3539 + }, + { + "epoch": 0.2831943361132777, + "grad_norm": 0.26830088274597796, + "learning_rate": 9.67716291076984e-06, + "loss": 0.294, + "step": 3540 + }, + { + "epoch": 0.28327433451330974, + "grad_norm": 0.24875984164012233, + "learning_rate": 9.676933888191178e-06, + "loss": 0.3325, + "step": 3541 + }, + { + "epoch": 0.2833543329133417, + "grad_norm": 0.3443339217621843, + "learning_rate": 9.676704787118332e-06, + "loss": 0.2966, + "step": 3542 + }, + { + "epoch": 0.28343433131337376, + "grad_norm": 0.31932565675285657, + "learning_rate": 9.676475607555145e-06, + "loss": 0.2684, + "step": 3543 + }, + { + "epoch": 0.28351432971340573, + "grad_norm": 0.31487996172819716, + "learning_rate": 9.676246349505462e-06, + "loss": 0.2553, + "step": 3544 + }, + { + "epoch": 0.2835943281134377, + "grad_norm": 0.2842247572322971, + "learning_rate": 9.676017012973133e-06, + "loss": 0.2895, + "step": 3545 + }, + { + "epoch": 0.28367432651346974, + "grad_norm": 0.22752549253876772, + "learning_rate": 9.675787597962007e-06, + "loss": 0.3043, + "step": 3546 + }, + { + "epoch": 0.2837543249135017, + "grad_norm": 0.22142071462468546, + "learning_rate": 9.675558104475933e-06, + "loss": 0.3076, + "step": 3547 + }, + { + "epoch": 0.28383432331353375, + "grad_norm": 0.26692368007808837, + "learning_rate": 9.675328532518762e-06, + "loss": 0.2825, + "step": 3548 + }, + { + "epoch": 0.28391432171356573, + "grad_norm": 0.27711547576078527, + "learning_rate": 9.67509888209435e-06, + "loss": 0.3059, + "step": 3549 + }, + { + "epoch": 0.2839943201135977, + "grad_norm": 0.3127072257642832, + "learning_rate": 9.674869153206547e-06, + "loss": 0.2873, + "step": 3550 + }, + { + "epoch": 0.28407431851362974, + "grad_norm": 0.25228187746615616, + "learning_rate": 9.674639345859213e-06, + "loss": 0.3232, + "step": 3551 + }, + { + "epoch": 0.2841543169136617, + "grad_norm": 0.37585151941760914, + "learning_rate": 9.674409460056204e-06, + "loss": 0.2865, + "step": 3552 + }, + { + "epoch": 0.28423431531369375, + "grad_norm": 0.27624787579470245, + "learning_rate": 9.674179495801375e-06, + "loss": 0.3154, + "step": 3553 + }, + { + "epoch": 0.28431431371372573, + "grad_norm": 0.3891993012699676, + "learning_rate": 9.673949453098587e-06, + "loss": 0.256, + "step": 3554 + }, + { + "epoch": 0.2843943121137577, + "grad_norm": 0.30425647695869706, + "learning_rate": 9.673719331951706e-06, + "loss": 0.2573, + "step": 3555 + }, + { + "epoch": 0.28447431051378974, + "grad_norm": 0.6599545792969306, + "learning_rate": 9.673489132364586e-06, + "loss": 0.3142, + "step": 3556 + }, + { + "epoch": 0.2845543089138217, + "grad_norm": 0.2843458805046887, + "learning_rate": 9.673258854341094e-06, + "loss": 0.3049, + "step": 3557 + }, + { + "epoch": 0.2846343073138537, + "grad_norm": 0.3040213883530118, + "learning_rate": 9.673028497885098e-06, + "loss": 0.2661, + "step": 3558 + }, + { + "epoch": 0.28471430571388573, + "grad_norm": 0.23763033673227574, + "learning_rate": 9.672798063000458e-06, + "loss": 0.3155, + "step": 3559 + }, + { + "epoch": 0.2847943041139177, + "grad_norm": 0.278766534900064, + "learning_rate": 9.672567549691046e-06, + "loss": 0.2967, + "step": 3560 + }, + { + "epoch": 0.28487430251394974, + "grad_norm": 0.32771764937400233, + "learning_rate": 9.67233695796073e-06, + "loss": 0.267, + "step": 3561 + }, + { + "epoch": 0.2849543009139817, + "grad_norm": 0.23054792919045614, + "learning_rate": 9.67210628781338e-06, + "loss": 0.3232, + "step": 3562 + }, + { + "epoch": 0.2850342993140137, + "grad_norm": 0.2189192766962145, + "learning_rate": 9.671875539252865e-06, + "loss": 0.3567, + "step": 3563 + }, + { + "epoch": 0.28511429771404573, + "grad_norm": 0.28407415174754974, + "learning_rate": 9.671644712283061e-06, + "loss": 0.2898, + "step": 3564 + }, + { + "epoch": 0.2851942961140777, + "grad_norm": 0.31229030550406567, + "learning_rate": 9.67141380690784e-06, + "loss": 0.3007, + "step": 3565 + }, + { + "epoch": 0.28527429451410974, + "grad_norm": 0.2962903372689868, + "learning_rate": 9.671182823131079e-06, + "loss": 0.2663, + "step": 3566 + }, + { + "epoch": 0.2853542929141417, + "grad_norm": 0.3134596181137637, + "learning_rate": 9.670951760956653e-06, + "loss": 0.2969, + "step": 3567 + }, + { + "epoch": 0.2854342913141737, + "grad_norm": 0.23928713432477816, + "learning_rate": 9.67072062038844e-06, + "loss": 0.3334, + "step": 3568 + }, + { + "epoch": 0.2855142897142057, + "grad_norm": 1.252922272598145, + "learning_rate": 9.670489401430322e-06, + "loss": 0.2838, + "step": 3569 + }, + { + "epoch": 0.2855942881142377, + "grad_norm": 0.332364862607897, + "learning_rate": 9.670258104086175e-06, + "loss": 0.2872, + "step": 3570 + }, + { + "epoch": 0.28567428651426974, + "grad_norm": 0.29779300854561425, + "learning_rate": 9.670026728359884e-06, + "loss": 0.2423, + "step": 3571 + }, + { + "epoch": 0.2857542849143017, + "grad_norm": 0.33394153390700454, + "learning_rate": 9.669795274255334e-06, + "loss": 0.2568, + "step": 3572 + }, + { + "epoch": 0.2858342833143337, + "grad_norm": 0.27766390163841304, + "learning_rate": 9.669563741776405e-06, + "loss": 0.3134, + "step": 3573 + }, + { + "epoch": 0.2859142817143657, + "grad_norm": 0.40186118190687187, + "learning_rate": 9.669332130926985e-06, + "loss": 0.2887, + "step": 3574 + }, + { + "epoch": 0.2859942801143977, + "grad_norm": 0.30936017999011756, + "learning_rate": 9.669100441710962e-06, + "loss": 0.2738, + "step": 3575 + }, + { + "epoch": 0.28607427851442974, + "grad_norm": 0.26752992823786176, + "learning_rate": 9.668868674132224e-06, + "loss": 0.2763, + "step": 3576 + }, + { + "epoch": 0.2861542769144617, + "grad_norm": 0.2571555542828409, + "learning_rate": 9.66863682819466e-06, + "loss": 0.3362, + "step": 3577 + }, + { + "epoch": 0.2862342753144937, + "grad_norm": 0.32160783997009484, + "learning_rate": 9.668404903902161e-06, + "loss": 0.26, + "step": 3578 + }, + { + "epoch": 0.2863142737145257, + "grad_norm": 0.32752325193888654, + "learning_rate": 9.668172901258623e-06, + "loss": 0.2695, + "step": 3579 + }, + { + "epoch": 0.2863942721145577, + "grad_norm": 0.2777100720642354, + "learning_rate": 9.667940820267935e-06, + "loss": 0.3039, + "step": 3580 + }, + { + "epoch": 0.28647427051458974, + "grad_norm": 0.29056125100124036, + "learning_rate": 9.667708660933994e-06, + "loss": 0.2915, + "step": 3581 + }, + { + "epoch": 0.2865542689146217, + "grad_norm": 0.29013375715373413, + "learning_rate": 9.667476423260696e-06, + "loss": 0.2933, + "step": 3582 + }, + { + "epoch": 0.2866342673146537, + "grad_norm": 0.2755326473346131, + "learning_rate": 9.66724410725194e-06, + "loss": 0.2812, + "step": 3583 + }, + { + "epoch": 0.2867142657146857, + "grad_norm": 0.30203473419123206, + "learning_rate": 9.667011712911625e-06, + "loss": 0.3021, + "step": 3584 + }, + { + "epoch": 0.2867942641147177, + "grad_norm": 0.264519233834704, + "learning_rate": 9.66677924024365e-06, + "loss": 0.3277, + "step": 3585 + }, + { + "epoch": 0.2868742625147497, + "grad_norm": 0.2524821117628419, + "learning_rate": 9.666546689251916e-06, + "loss": 0.3091, + "step": 3586 + }, + { + "epoch": 0.2869542609147817, + "grad_norm": 0.26361569298958415, + "learning_rate": 9.666314059940326e-06, + "loss": 0.2921, + "step": 3587 + }, + { + "epoch": 0.2870342593148137, + "grad_norm": 0.2709067721793426, + "learning_rate": 9.666081352312789e-06, + "loss": 0.3068, + "step": 3588 + }, + { + "epoch": 0.2871142577148457, + "grad_norm": 0.2079724231244705, + "learning_rate": 9.665848566373204e-06, + "loss": 0.349, + "step": 3589 + }, + { + "epoch": 0.2871942561148777, + "grad_norm": 0.29602360914847914, + "learning_rate": 9.665615702125482e-06, + "loss": 0.2944, + "step": 3590 + }, + { + "epoch": 0.2872742545149097, + "grad_norm": 0.31818458419094364, + "learning_rate": 9.665382759573529e-06, + "loss": 0.2531, + "step": 3591 + }, + { + "epoch": 0.2873542529149417, + "grad_norm": 0.31006998694065263, + "learning_rate": 9.665149738721258e-06, + "loss": 0.3182, + "step": 3592 + }, + { + "epoch": 0.2874342513149737, + "grad_norm": 0.32636730740493136, + "learning_rate": 9.664916639572574e-06, + "loss": 0.2575, + "step": 3593 + }, + { + "epoch": 0.2875142497150057, + "grad_norm": 0.2920081928252954, + "learning_rate": 9.664683462131397e-06, + "loss": 0.3063, + "step": 3594 + }, + { + "epoch": 0.2875942481150377, + "grad_norm": 0.2410949850330817, + "learning_rate": 9.664450206401633e-06, + "loss": 0.3257, + "step": 3595 + }, + { + "epoch": 0.2876742465150697, + "grad_norm": 0.3174193011491744, + "learning_rate": 9.664216872387202e-06, + "loss": 0.2481, + "step": 3596 + }, + { + "epoch": 0.2877542449151017, + "grad_norm": 0.31090327268265827, + "learning_rate": 9.663983460092015e-06, + "loss": 0.2735, + "step": 3597 + }, + { + "epoch": 0.2878342433151337, + "grad_norm": 0.2439558762274201, + "learning_rate": 9.663749969519994e-06, + "loss": 0.3264, + "step": 3598 + }, + { + "epoch": 0.2879142417151657, + "grad_norm": 0.2514240283033212, + "learning_rate": 9.663516400675057e-06, + "loss": 0.3212, + "step": 3599 + }, + { + "epoch": 0.2879942401151977, + "grad_norm": 0.2835896842328783, + "learning_rate": 9.663282753561124e-06, + "loss": 0.2974, + "step": 3600 + }, + { + "epoch": 0.2880742385152297, + "grad_norm": 0.33940816915779104, + "learning_rate": 9.663049028182112e-06, + "loss": 0.2654, + "step": 3601 + }, + { + "epoch": 0.2881542369152617, + "grad_norm": 0.20482419056241102, + "learning_rate": 9.662815224541949e-06, + "loss": 0.3449, + "step": 3602 + }, + { + "epoch": 0.2882342353152937, + "grad_norm": 0.30337850902951247, + "learning_rate": 9.662581342644557e-06, + "loss": 0.2593, + "step": 3603 + }, + { + "epoch": 0.2883142337153257, + "grad_norm": 0.20427567936580654, + "learning_rate": 9.662347382493863e-06, + "loss": 0.3571, + "step": 3604 + }, + { + "epoch": 0.2883942321153577, + "grad_norm": 0.24484493972005475, + "learning_rate": 9.662113344093791e-06, + "loss": 0.3632, + "step": 3605 + }, + { + "epoch": 0.2884742305153897, + "grad_norm": 0.326462097588612, + "learning_rate": 9.66187922744827e-06, + "loss": 0.3186, + "step": 3606 + }, + { + "epoch": 0.2885542289154217, + "grad_norm": 0.30081808241951774, + "learning_rate": 9.66164503256123e-06, + "loss": 0.2966, + "step": 3607 + }, + { + "epoch": 0.2886342273154537, + "grad_norm": 0.621189559489383, + "learning_rate": 9.6614107594366e-06, + "loss": 0.3041, + "step": 3608 + }, + { + "epoch": 0.2887142257154857, + "grad_norm": 0.31585116718843326, + "learning_rate": 9.661176408078315e-06, + "loss": 0.2667, + "step": 3609 + }, + { + "epoch": 0.2887942241155177, + "grad_norm": 0.2680006914897545, + "learning_rate": 9.660941978490302e-06, + "loss": 0.2991, + "step": 3610 + }, + { + "epoch": 0.2888742225155497, + "grad_norm": 0.36748299932967704, + "learning_rate": 9.660707470676503e-06, + "loss": 0.2821, + "step": 3611 + }, + { + "epoch": 0.2889542209155817, + "grad_norm": 0.28997781194531913, + "learning_rate": 9.660472884640848e-06, + "loss": 0.3005, + "step": 3612 + }, + { + "epoch": 0.2890342193156137, + "grad_norm": 0.2873148591015218, + "learning_rate": 9.660238220387277e-06, + "loss": 0.295, + "step": 3613 + }, + { + "epoch": 0.28911421771564566, + "grad_norm": 0.28616598892234374, + "learning_rate": 9.660003477919727e-06, + "loss": 0.3054, + "step": 3614 + }, + { + "epoch": 0.2891942161156777, + "grad_norm": 0.21877521125323413, + "learning_rate": 9.659768657242138e-06, + "loss": 0.3195, + "step": 3615 + }, + { + "epoch": 0.28927421451570967, + "grad_norm": 0.31213642762197963, + "learning_rate": 9.659533758358455e-06, + "loss": 0.2652, + "step": 3616 + }, + { + "epoch": 0.2893542129157417, + "grad_norm": 0.2527272388147323, + "learning_rate": 9.659298781272615e-06, + "loss": 0.3004, + "step": 3617 + }, + { + "epoch": 0.2894342113157737, + "grad_norm": 0.2814768470351549, + "learning_rate": 9.659063725988562e-06, + "loss": 0.285, + "step": 3618 + }, + { + "epoch": 0.28951420971580566, + "grad_norm": 0.2251439533656403, + "learning_rate": 9.658828592510243e-06, + "loss": 0.3665, + "step": 3619 + }, + { + "epoch": 0.2895942081158377, + "grad_norm": 0.2505596122145966, + "learning_rate": 9.658593380841605e-06, + "loss": 0.3168, + "step": 3620 + }, + { + "epoch": 0.28967420651586967, + "grad_norm": 0.25385916125848074, + "learning_rate": 9.658358090986594e-06, + "loss": 0.3149, + "step": 3621 + }, + { + "epoch": 0.2897542049159017, + "grad_norm": 0.3014876867758723, + "learning_rate": 9.658122722949161e-06, + "loss": 0.2886, + "step": 3622 + }, + { + "epoch": 0.2898342033159337, + "grad_norm": 0.3070731922875036, + "learning_rate": 9.657887276733254e-06, + "loss": 0.2604, + "step": 3623 + }, + { + "epoch": 0.28991420171596566, + "grad_norm": 0.26123416921451614, + "learning_rate": 9.657651752342824e-06, + "loss": 0.3166, + "step": 3624 + }, + { + "epoch": 0.2899942001159977, + "grad_norm": 0.26274565773351494, + "learning_rate": 9.657416149781826e-06, + "loss": 0.3297, + "step": 3625 + }, + { + "epoch": 0.29007419851602967, + "grad_norm": 0.28286791612407075, + "learning_rate": 9.657180469054213e-06, + "loss": 0.3042, + "step": 3626 + }, + { + "epoch": 0.2901541969160617, + "grad_norm": 0.2887230967073155, + "learning_rate": 9.65694471016394e-06, + "loss": 0.2975, + "step": 3627 + }, + { + "epoch": 0.2902341953160937, + "grad_norm": 0.30982160227213973, + "learning_rate": 9.656708873114966e-06, + "loss": 0.2551, + "step": 3628 + }, + { + "epoch": 0.29031419371612566, + "grad_norm": 0.27219807766382315, + "learning_rate": 9.656472957911247e-06, + "loss": 0.2931, + "step": 3629 + }, + { + "epoch": 0.2903941921161577, + "grad_norm": 0.27086082481773255, + "learning_rate": 9.656236964556742e-06, + "loss": 0.3023, + "step": 3630 + }, + { + "epoch": 0.29047419051618967, + "grad_norm": 0.2618741052120626, + "learning_rate": 9.656000893055416e-06, + "loss": 0.3144, + "step": 3631 + }, + { + "epoch": 0.2905541889162217, + "grad_norm": 0.3053854648384801, + "learning_rate": 9.655764743411224e-06, + "loss": 0.2463, + "step": 3632 + }, + { + "epoch": 0.2906341873162537, + "grad_norm": 0.2880725367070506, + "learning_rate": 9.655528515628136e-06, + "loss": 0.3144, + "step": 3633 + }, + { + "epoch": 0.29071418571628566, + "grad_norm": 0.31600063153583674, + "learning_rate": 9.655292209710111e-06, + "loss": 0.238, + "step": 3634 + }, + { + "epoch": 0.2907941841163177, + "grad_norm": 0.3208901107665132, + "learning_rate": 9.655055825661122e-06, + "loss": 0.2851, + "step": 3635 + }, + { + "epoch": 0.29087418251634967, + "grad_norm": 0.2997957886432289, + "learning_rate": 9.65481936348513e-06, + "loss": 0.2484, + "step": 3636 + }, + { + "epoch": 0.2909541809163817, + "grad_norm": 0.40280525710843185, + "learning_rate": 9.654582823186107e-06, + "loss": 0.2942, + "step": 3637 + }, + { + "epoch": 0.2910341793164137, + "grad_norm": 0.2598381322881362, + "learning_rate": 9.654346204768019e-06, + "loss": 0.3249, + "step": 3638 + }, + { + "epoch": 0.29111417771644565, + "grad_norm": 0.28273157136828847, + "learning_rate": 9.654109508234843e-06, + "loss": 0.3037, + "step": 3639 + }, + { + "epoch": 0.2911941761164777, + "grad_norm": 0.24193152083222313, + "learning_rate": 9.653872733590547e-06, + "loss": 0.3275, + "step": 3640 + }, + { + "epoch": 0.29127417451650967, + "grad_norm": 0.2785288485829046, + "learning_rate": 9.653635880839107e-06, + "loss": 0.2933, + "step": 3641 + }, + { + "epoch": 0.29135417291654164, + "grad_norm": 0.26234933785429176, + "learning_rate": 9.653398949984497e-06, + "loss": 0.3037, + "step": 3642 + }, + { + "epoch": 0.2914341713165737, + "grad_norm": 0.2409530985579634, + "learning_rate": 9.653161941030695e-06, + "loss": 0.3253, + "step": 3643 + }, + { + "epoch": 0.29151416971660565, + "grad_norm": 0.3130850214610556, + "learning_rate": 9.652924853981676e-06, + "loss": 0.2739, + "step": 3644 + }, + { + "epoch": 0.2915941681166377, + "grad_norm": 0.28051620437122887, + "learning_rate": 9.652687688841422e-06, + "loss": 0.3178, + "step": 3645 + }, + { + "epoch": 0.29167416651666966, + "grad_norm": 0.3221847246746687, + "learning_rate": 9.652450445613913e-06, + "loss": 0.2544, + "step": 3646 + }, + { + "epoch": 0.29175416491670164, + "grad_norm": 0.3076556519718147, + "learning_rate": 9.652213124303126e-06, + "loss": 0.2693, + "step": 3647 + }, + { + "epoch": 0.2918341633167337, + "grad_norm": 0.29864406862355425, + "learning_rate": 9.651975724913051e-06, + "loss": 0.2941, + "step": 3648 + }, + { + "epoch": 0.29191416171676565, + "grad_norm": 0.30740397326881647, + "learning_rate": 9.65173824744767e-06, + "loss": 0.2609, + "step": 3649 + }, + { + "epoch": 0.2919941601167977, + "grad_norm": 0.2848300563932702, + "learning_rate": 9.65150069191097e-06, + "loss": 0.2872, + "step": 3650 + }, + { + "epoch": 0.29207415851682966, + "grad_norm": 0.2909451909650978, + "learning_rate": 9.651263058306932e-06, + "loss": 0.2878, + "step": 3651 + }, + { + "epoch": 0.29215415691686164, + "grad_norm": 0.28404880480673894, + "learning_rate": 9.65102534663955e-06, + "loss": 0.2977, + "step": 3652 + }, + { + "epoch": 0.2922341553168937, + "grad_norm": 0.2545819949316656, + "learning_rate": 9.650787556912811e-06, + "loss": 0.3249, + "step": 3653 + }, + { + "epoch": 0.29231415371692565, + "grad_norm": 0.34201736190712306, + "learning_rate": 9.650549689130706e-06, + "loss": 0.2861, + "step": 3654 + }, + { + "epoch": 0.2923941521169577, + "grad_norm": 0.27140677441089944, + "learning_rate": 9.650311743297229e-06, + "loss": 0.3028, + "step": 3655 + }, + { + "epoch": 0.29247415051698966, + "grad_norm": 0.31290119711359193, + "learning_rate": 9.650073719416374e-06, + "loss": 0.2451, + "step": 3656 + }, + { + "epoch": 0.29255414891702164, + "grad_norm": 0.28758686258621446, + "learning_rate": 9.64983561749213e-06, + "loss": 0.2871, + "step": 3657 + }, + { + "epoch": 0.29263414731705367, + "grad_norm": 0.3263971096916948, + "learning_rate": 9.6495974375285e-06, + "loss": 0.2724, + "step": 3658 + }, + { + "epoch": 0.29271414571708565, + "grad_norm": 0.31800109214298666, + "learning_rate": 9.649359179529477e-06, + "loss": 0.256, + "step": 3659 + }, + { + "epoch": 0.2927941441171177, + "grad_norm": 0.31727300964075655, + "learning_rate": 9.649120843499065e-06, + "loss": 0.2603, + "step": 3660 + }, + { + "epoch": 0.29287414251714966, + "grad_norm": 0.3475772808426779, + "learning_rate": 9.648882429441258e-06, + "loss": 0.2765, + "step": 3661 + }, + { + "epoch": 0.29295414091718164, + "grad_norm": 0.3154059517955945, + "learning_rate": 9.64864393736006e-06, + "loss": 0.2514, + "step": 3662 + }, + { + "epoch": 0.29303413931721367, + "grad_norm": 0.27476871996524005, + "learning_rate": 9.648405367259475e-06, + "loss": 0.3094, + "step": 3663 + }, + { + "epoch": 0.29311413771724565, + "grad_norm": 0.328053225591602, + "learning_rate": 9.648166719143504e-06, + "loss": 0.2433, + "step": 3664 + }, + { + "epoch": 0.2931941361172777, + "grad_norm": 0.2772424880219433, + "learning_rate": 9.647927993016154e-06, + "loss": 0.3289, + "step": 3665 + }, + { + "epoch": 0.29327413451730966, + "grad_norm": 0.2860111505037963, + "learning_rate": 9.647689188881431e-06, + "loss": 0.2871, + "step": 3666 + }, + { + "epoch": 0.29335413291734164, + "grad_norm": 0.23119815102629518, + "learning_rate": 9.647450306743345e-06, + "loss": 0.316, + "step": 3667 + }, + { + "epoch": 0.29343413131737367, + "grad_norm": 0.25103273019291483, + "learning_rate": 9.647211346605902e-06, + "loss": 0.3095, + "step": 3668 + }, + { + "epoch": 0.29351412971740565, + "grad_norm": 0.27787133399955677, + "learning_rate": 9.646972308473115e-06, + "loss": 0.2917, + "step": 3669 + }, + { + "epoch": 0.2935941281174376, + "grad_norm": 0.3204560639969028, + "learning_rate": 9.646733192348996e-06, + "loss": 0.25, + "step": 3670 + }, + { + "epoch": 0.29367412651746966, + "grad_norm": 0.2891861766751723, + "learning_rate": 9.646493998237557e-06, + "loss": 0.2865, + "step": 3671 + }, + { + "epoch": 0.29375412491750164, + "grad_norm": 0.23214187634365005, + "learning_rate": 9.64625472614281e-06, + "loss": 0.3255, + "step": 3672 + }, + { + "epoch": 0.29383412331753367, + "grad_norm": 0.24368769748316976, + "learning_rate": 9.646015376068776e-06, + "loss": 0.312, + "step": 3673 + }, + { + "epoch": 0.29391412171756565, + "grad_norm": 0.31150511655916724, + "learning_rate": 9.645775948019466e-06, + "loss": 0.2851, + "step": 3674 + }, + { + "epoch": 0.2939941201175976, + "grad_norm": 0.2705216689498565, + "learning_rate": 9.645536441998907e-06, + "loss": 0.2709, + "step": 3675 + }, + { + "epoch": 0.29407411851762966, + "grad_norm": 0.32738656870048694, + "learning_rate": 9.645296858011109e-06, + "loss": 0.265, + "step": 3676 + }, + { + "epoch": 0.29415411691766163, + "grad_norm": 0.2743633712930602, + "learning_rate": 9.6450571960601e-06, + "loss": 0.3053, + "step": 3677 + }, + { + "epoch": 0.29423411531769367, + "grad_norm": 0.24948198917658698, + "learning_rate": 9.644817456149898e-06, + "loss": 0.2878, + "step": 3678 + }, + { + "epoch": 0.29431411371772565, + "grad_norm": 0.21507038159488337, + "learning_rate": 9.64457763828453e-06, + "loss": 0.345, + "step": 3679 + }, + { + "epoch": 0.2943941121177576, + "grad_norm": 0.4280201945340013, + "learning_rate": 9.644337742468017e-06, + "loss": 0.2747, + "step": 3680 + }, + { + "epoch": 0.29447411051778966, + "grad_norm": 0.27187052490272967, + "learning_rate": 9.64409776870439e-06, + "loss": 0.3133, + "step": 3681 + }, + { + "epoch": 0.29455410891782163, + "grad_norm": 0.29810668702114657, + "learning_rate": 9.643857716997674e-06, + "loss": 0.318, + "step": 3682 + }, + { + "epoch": 0.29463410731785367, + "grad_norm": 0.2396709265443274, + "learning_rate": 9.643617587351897e-06, + "loss": 0.3183, + "step": 3683 + }, + { + "epoch": 0.29471410571788564, + "grad_norm": 0.2504255980305616, + "learning_rate": 9.64337737977109e-06, + "loss": 0.3188, + "step": 3684 + }, + { + "epoch": 0.2947941041179176, + "grad_norm": 0.25570491083131913, + "learning_rate": 9.643137094259285e-06, + "loss": 0.3132, + "step": 3685 + }, + { + "epoch": 0.29487410251794965, + "grad_norm": 0.27973499448631417, + "learning_rate": 9.642896730820514e-06, + "loss": 0.2907, + "step": 3686 + }, + { + "epoch": 0.29495410091798163, + "grad_norm": 0.19842977661186897, + "learning_rate": 9.642656289458812e-06, + "loss": 0.3608, + "step": 3687 + }, + { + "epoch": 0.29503409931801367, + "grad_norm": 0.27323309896896375, + "learning_rate": 9.642415770178213e-06, + "loss": 0.3149, + "step": 3688 + }, + { + "epoch": 0.29511409771804564, + "grad_norm": 0.38977248536656156, + "learning_rate": 9.642175172982755e-06, + "loss": 0.294, + "step": 3689 + }, + { + "epoch": 0.2951940961180776, + "grad_norm": 0.26292251450084825, + "learning_rate": 9.641934497876476e-06, + "loss": 0.304, + "step": 3690 + }, + { + "epoch": 0.29527409451810965, + "grad_norm": 0.34320477281012707, + "learning_rate": 9.641693744863413e-06, + "loss": 0.2819, + "step": 3691 + }, + { + "epoch": 0.29535409291814163, + "grad_norm": 0.3222693957793193, + "learning_rate": 9.641452913947611e-06, + "loss": 0.2835, + "step": 3692 + }, + { + "epoch": 0.29543409131817366, + "grad_norm": 0.3008470277823178, + "learning_rate": 9.641212005133107e-06, + "loss": 0.2646, + "step": 3693 + }, + { + "epoch": 0.29551408971820564, + "grad_norm": 0.24515306598083095, + "learning_rate": 9.64097101842395e-06, + "loss": 0.3197, + "step": 3694 + }, + { + "epoch": 0.2955940881182376, + "grad_norm": 0.38829884202271975, + "learning_rate": 9.640729953824178e-06, + "loss": 0.3236, + "step": 3695 + }, + { + "epoch": 0.29567408651826965, + "grad_norm": 0.29560961059019253, + "learning_rate": 9.64048881133784e-06, + "loss": 0.2889, + "step": 3696 + }, + { + "epoch": 0.29575408491830163, + "grad_norm": 0.2532809208065725, + "learning_rate": 9.640247590968985e-06, + "loss": 0.3081, + "step": 3697 + }, + { + "epoch": 0.2958340833183336, + "grad_norm": 0.2864046071706207, + "learning_rate": 9.64000629272166e-06, + "loss": 0.2834, + "step": 3698 + }, + { + "epoch": 0.29591408171836564, + "grad_norm": 0.2928897796205158, + "learning_rate": 9.639764916599913e-06, + "loss": 0.2786, + "step": 3699 + }, + { + "epoch": 0.2959940801183976, + "grad_norm": 0.27145428331465776, + "learning_rate": 9.639523462607796e-06, + "loss": 0.3217, + "step": 3700 + }, + { + "epoch": 0.29607407851842965, + "grad_norm": 0.28536457158623474, + "learning_rate": 9.639281930749363e-06, + "loss": 0.2921, + "step": 3701 + }, + { + "epoch": 0.29615407691846163, + "grad_norm": 0.2748436166073774, + "learning_rate": 9.639040321028665e-06, + "loss": 0.2955, + "step": 3702 + }, + { + "epoch": 0.2962340753184936, + "grad_norm": 0.7792886666737343, + "learning_rate": 9.63879863344976e-06, + "loss": 0.2798, + "step": 3703 + }, + { + "epoch": 0.29631407371852564, + "grad_norm": 0.319197613409756, + "learning_rate": 9.638556868016704e-06, + "loss": 0.3174, + "step": 3704 + }, + { + "epoch": 0.2963940721185576, + "grad_norm": 0.3516996553239381, + "learning_rate": 9.638315024733552e-06, + "loss": 0.3018, + "step": 3705 + }, + { + "epoch": 0.29647407051858965, + "grad_norm": 0.3012396617555724, + "learning_rate": 9.638073103604364e-06, + "loss": 0.273, + "step": 3706 + }, + { + "epoch": 0.2965540689186216, + "grad_norm": 0.30481295833388966, + "learning_rate": 9.6378311046332e-06, + "loss": 0.2808, + "step": 3707 + }, + { + "epoch": 0.2966340673186536, + "grad_norm": 0.31370002630011007, + "learning_rate": 9.637589027824123e-06, + "loss": 0.2639, + "step": 3708 + }, + { + "epoch": 0.29671406571868564, + "grad_norm": 0.27322122593818465, + "learning_rate": 9.637346873181194e-06, + "loss": 0.258, + "step": 3709 + }, + { + "epoch": 0.2967940641187176, + "grad_norm": 0.27250613028642545, + "learning_rate": 9.637104640708482e-06, + "loss": 0.3341, + "step": 3710 + }, + { + "epoch": 0.29687406251874965, + "grad_norm": 0.3150893975119145, + "learning_rate": 9.636862330410043e-06, + "loss": 0.2692, + "step": 3711 + }, + { + "epoch": 0.2969540609187816, + "grad_norm": 0.31024142453826253, + "learning_rate": 9.636619942289953e-06, + "loss": 0.2827, + "step": 3712 + }, + { + "epoch": 0.2970340593188136, + "grad_norm": 0.2950546739832863, + "learning_rate": 9.636377476352277e-06, + "loss": 0.2457, + "step": 3713 + }, + { + "epoch": 0.29711405771884564, + "grad_norm": 0.3596318405935052, + "learning_rate": 9.636134932601082e-06, + "loss": 0.2705, + "step": 3714 + }, + { + "epoch": 0.2971940561188776, + "grad_norm": 0.3217345323118577, + "learning_rate": 9.63589231104044e-06, + "loss": 0.2809, + "step": 3715 + }, + { + "epoch": 0.29727405451890965, + "grad_norm": 0.23982361333263838, + "learning_rate": 9.635649611674425e-06, + "loss": 0.3222, + "step": 3716 + }, + { + "epoch": 0.2973540529189416, + "grad_norm": 0.39246221825566546, + "learning_rate": 9.635406834507108e-06, + "loss": 0.2791, + "step": 3717 + }, + { + "epoch": 0.2974340513189736, + "grad_norm": 0.2919341227049066, + "learning_rate": 9.635163979542564e-06, + "loss": 0.2918, + "step": 3718 + }, + { + "epoch": 0.29751404971900564, + "grad_norm": 0.2210234633242241, + "learning_rate": 9.63492104678487e-06, + "loss": 0.3532, + "step": 3719 + }, + { + "epoch": 0.2975940481190376, + "grad_norm": 0.3522288793019361, + "learning_rate": 9.634678036238102e-06, + "loss": 0.3004, + "step": 3720 + }, + { + "epoch": 0.29767404651906965, + "grad_norm": 0.2814137892248608, + "learning_rate": 9.634434947906337e-06, + "loss": 0.2881, + "step": 3721 + }, + { + "epoch": 0.2977540449191016, + "grad_norm": 0.31428072205296853, + "learning_rate": 9.634191781793659e-06, + "loss": 0.2693, + "step": 3722 + }, + { + "epoch": 0.2978340433191336, + "grad_norm": 0.268707894804558, + "learning_rate": 9.633948537904145e-06, + "loss": 0.269, + "step": 3723 + }, + { + "epoch": 0.29791404171916563, + "grad_norm": 0.3703272103027423, + "learning_rate": 9.633705216241881e-06, + "loss": 0.2482, + "step": 3724 + }, + { + "epoch": 0.2979940401191976, + "grad_norm": 0.2799431544590881, + "learning_rate": 9.633461816810949e-06, + "loss": 0.3093, + "step": 3725 + }, + { + "epoch": 0.2980740385192296, + "grad_norm": 0.29215297072072555, + "learning_rate": 9.633218339615433e-06, + "loss": 0.3111, + "step": 3726 + }, + { + "epoch": 0.2981540369192616, + "grad_norm": 0.28201874271282845, + "learning_rate": 9.632974784659421e-06, + "loss": 0.2873, + "step": 3727 + }, + { + "epoch": 0.2982340353192936, + "grad_norm": 0.2789291061611354, + "learning_rate": 9.632731151946999e-06, + "loss": 0.2683, + "step": 3728 + }, + { + "epoch": 0.29831403371932563, + "grad_norm": 0.3382906313118446, + "learning_rate": 9.632487441482258e-06, + "loss": 0.2624, + "step": 3729 + }, + { + "epoch": 0.2983940321193576, + "grad_norm": 0.24943013622185053, + "learning_rate": 9.632243653269287e-06, + "loss": 0.3272, + "step": 3730 + }, + { + "epoch": 0.2984740305193896, + "grad_norm": 0.255167024913473, + "learning_rate": 9.631999787312179e-06, + "loss": 0.3321, + "step": 3731 + }, + { + "epoch": 0.2985540289194216, + "grad_norm": 0.29736051306144934, + "learning_rate": 9.631755843615024e-06, + "loss": 0.2855, + "step": 3732 + }, + { + "epoch": 0.2986340273194536, + "grad_norm": 0.2786282049815428, + "learning_rate": 9.631511822181918e-06, + "loss": 0.2836, + "step": 3733 + }, + { + "epoch": 0.29871402571948563, + "grad_norm": 0.2529832848333478, + "learning_rate": 9.631267723016956e-06, + "loss": 0.3141, + "step": 3734 + }, + { + "epoch": 0.2987940241195176, + "grad_norm": 0.6475168815239636, + "learning_rate": 9.631023546124236e-06, + "loss": 0.2877, + "step": 3735 + }, + { + "epoch": 0.2988740225195496, + "grad_norm": 0.32859645512767577, + "learning_rate": 9.630779291507854e-06, + "loss": 0.2645, + "step": 3736 + }, + { + "epoch": 0.2989540209195816, + "grad_norm": 0.28785597464177126, + "learning_rate": 9.630534959171912e-06, + "loss": 0.2897, + "step": 3737 + }, + { + "epoch": 0.2990340193196136, + "grad_norm": 0.3181455689232381, + "learning_rate": 9.630290549120508e-06, + "loss": 0.2866, + "step": 3738 + }, + { + "epoch": 0.29911401771964563, + "grad_norm": 0.2425466268635054, + "learning_rate": 9.630046061357745e-06, + "loss": 0.3304, + "step": 3739 + }, + { + "epoch": 0.2991940161196776, + "grad_norm": 0.25700648831491657, + "learning_rate": 9.62980149588773e-06, + "loss": 0.3197, + "step": 3740 + }, + { + "epoch": 0.2992740145197096, + "grad_norm": 0.35519267931075654, + "learning_rate": 9.62955685271456e-06, + "loss": 0.2966, + "step": 3741 + }, + { + "epoch": 0.2993540129197416, + "grad_norm": 0.24091482272976955, + "learning_rate": 9.629312131842346e-06, + "loss": 0.3269, + "step": 3742 + }, + { + "epoch": 0.2994340113197736, + "grad_norm": 0.3656330195048561, + "learning_rate": 9.629067333275195e-06, + "loss": 0.2554, + "step": 3743 + }, + { + "epoch": 0.29951400971980563, + "grad_norm": 0.2852262908138894, + "learning_rate": 9.628822457017215e-06, + "loss": 0.2608, + "step": 3744 + }, + { + "epoch": 0.2995940081198376, + "grad_norm": 0.2965723208472894, + "learning_rate": 9.628577503072513e-06, + "loss": 0.2918, + "step": 3745 + }, + { + "epoch": 0.2996740065198696, + "grad_norm": 0.3193581453606131, + "learning_rate": 9.628332471445206e-06, + "loss": 0.2432, + "step": 3746 + }, + { + "epoch": 0.2997540049199016, + "grad_norm": 0.3383474939622251, + "learning_rate": 9.628087362139402e-06, + "loss": 0.2877, + "step": 3747 + }, + { + "epoch": 0.2998340033199336, + "grad_norm": 0.3128038202416646, + "learning_rate": 9.627842175159217e-06, + "loss": 0.2921, + "step": 3748 + }, + { + "epoch": 0.29991400171996563, + "grad_norm": 0.3458796821020789, + "learning_rate": 9.627596910508763e-06, + "loss": 0.3278, + "step": 3749 + }, + { + "epoch": 0.2999940001199976, + "grad_norm": 0.6562247094933588, + "learning_rate": 9.627351568192159e-06, + "loss": 0.2622, + "step": 3750 + }, + { + "epoch": 0.3000739985200296, + "grad_norm": 0.3345316510721369, + "learning_rate": 9.627106148213521e-06, + "loss": 0.2441, + "step": 3751 + }, + { + "epoch": 0.3001539969200616, + "grad_norm": 0.39309551517960806, + "learning_rate": 9.62686065057697e-06, + "loss": 0.2771, + "step": 3752 + }, + { + "epoch": 0.3002339953200936, + "grad_norm": 0.308974290541272, + "learning_rate": 9.626615075286626e-06, + "loss": 0.2526, + "step": 3753 + }, + { + "epoch": 0.30031399372012557, + "grad_norm": 0.33054394464959064, + "learning_rate": 9.62636942234661e-06, + "loss": 0.2507, + "step": 3754 + }, + { + "epoch": 0.3003939921201576, + "grad_norm": 0.29144442848464214, + "learning_rate": 9.62612369176104e-06, + "loss": 0.2787, + "step": 3755 + }, + { + "epoch": 0.3004739905201896, + "grad_norm": 0.2879023395833271, + "learning_rate": 9.625877883534047e-06, + "loss": 0.2977, + "step": 3756 + }, + { + "epoch": 0.3005539889202216, + "grad_norm": 0.3157858332161786, + "learning_rate": 9.625631997669757e-06, + "loss": 0.274, + "step": 3757 + }, + { + "epoch": 0.3006339873202536, + "grad_norm": 0.3152540208439719, + "learning_rate": 9.62538603417229e-06, + "loss": 0.2591, + "step": 3758 + }, + { + "epoch": 0.30071398572028557, + "grad_norm": 0.28168736361190055, + "learning_rate": 9.62513999304578e-06, + "loss": 0.3034, + "step": 3759 + }, + { + "epoch": 0.3007939841203176, + "grad_norm": 0.2869715337774613, + "learning_rate": 9.624893874294355e-06, + "loss": 0.3126, + "step": 3760 + }, + { + "epoch": 0.3008739825203496, + "grad_norm": 0.31983042898724656, + "learning_rate": 9.624647677922143e-06, + "loss": 0.2569, + "step": 3761 + }, + { + "epoch": 0.3009539809203816, + "grad_norm": 0.36315234007826497, + "learning_rate": 9.624401403933279e-06, + "loss": 0.2564, + "step": 3762 + }, + { + "epoch": 0.3010339793204136, + "grad_norm": 0.3159159509095322, + "learning_rate": 9.624155052331896e-06, + "loss": 0.2604, + "step": 3763 + }, + { + "epoch": 0.30111397772044557, + "grad_norm": 0.25063653941001807, + "learning_rate": 9.623908623122127e-06, + "loss": 0.3002, + "step": 3764 + }, + { + "epoch": 0.3011939761204776, + "grad_norm": 0.2746297749014126, + "learning_rate": 9.623662116308108e-06, + "loss": 0.2911, + "step": 3765 + }, + { + "epoch": 0.3012739745205096, + "grad_norm": 0.25971659915489653, + "learning_rate": 9.623415531893978e-06, + "loss": 0.3258, + "step": 3766 + }, + { + "epoch": 0.3013539729205416, + "grad_norm": 0.22848702320610767, + "learning_rate": 9.623168869883874e-06, + "loss": 0.3485, + "step": 3767 + }, + { + "epoch": 0.3014339713205736, + "grad_norm": 0.3276802975195463, + "learning_rate": 9.622922130281937e-06, + "loss": 0.2719, + "step": 3768 + }, + { + "epoch": 0.30151396972060557, + "grad_norm": 0.3147418831064497, + "learning_rate": 9.622675313092307e-06, + "loss": 0.2681, + "step": 3769 + }, + { + "epoch": 0.3015939681206376, + "grad_norm": 0.27471357000394214, + "learning_rate": 9.622428418319126e-06, + "loss": 0.2903, + "step": 3770 + }, + { + "epoch": 0.3016739665206696, + "grad_norm": 0.2960204708052019, + "learning_rate": 9.622181445966539e-06, + "loss": 0.2891, + "step": 3771 + }, + { + "epoch": 0.3017539649207016, + "grad_norm": 0.3081278432304237, + "learning_rate": 9.62193439603869e-06, + "loss": 0.2623, + "step": 3772 + }, + { + "epoch": 0.3018339633207336, + "grad_norm": 0.29959819998855064, + "learning_rate": 9.621687268539725e-06, + "loss": 0.3144, + "step": 3773 + }, + { + "epoch": 0.30191396172076557, + "grad_norm": 0.2822632179700892, + "learning_rate": 9.621440063473795e-06, + "loss": 0.2721, + "step": 3774 + }, + { + "epoch": 0.3019939601207976, + "grad_norm": 0.21427632045311756, + "learning_rate": 9.621192780845044e-06, + "loss": 0.3472, + "step": 3775 + }, + { + "epoch": 0.3020739585208296, + "grad_norm": 0.2662552990541531, + "learning_rate": 9.620945420657625e-06, + "loss": 0.2842, + "step": 3776 + }, + { + "epoch": 0.3021539569208616, + "grad_norm": 0.29116465745618164, + "learning_rate": 9.620697982915688e-06, + "loss": 0.2992, + "step": 3777 + }, + { + "epoch": 0.3022339553208936, + "grad_norm": 0.3348695161420912, + "learning_rate": 9.620450467623387e-06, + "loss": 0.2465, + "step": 3778 + }, + { + "epoch": 0.30231395372092557, + "grad_norm": 0.2358412188553948, + "learning_rate": 9.620202874784878e-06, + "loss": 0.3083, + "step": 3779 + }, + { + "epoch": 0.3023939521209576, + "grad_norm": 0.2771555959617794, + "learning_rate": 9.619955204404312e-06, + "loss": 0.3242, + "step": 3780 + }, + { + "epoch": 0.3024739505209896, + "grad_norm": 0.20610996669403264, + "learning_rate": 9.619707456485848e-06, + "loss": 0.361, + "step": 3781 + }, + { + "epoch": 0.30255394892102155, + "grad_norm": 0.24100582838326473, + "learning_rate": 9.619459631033645e-06, + "loss": 0.3117, + "step": 3782 + }, + { + "epoch": 0.3026339473210536, + "grad_norm": 0.3317388276072845, + "learning_rate": 9.61921172805186e-06, + "loss": 0.2673, + "step": 3783 + }, + { + "epoch": 0.30271394572108556, + "grad_norm": 0.3107686184126036, + "learning_rate": 9.618963747544656e-06, + "loss": 0.2453, + "step": 3784 + }, + { + "epoch": 0.3027939441211176, + "grad_norm": 0.29591603280460227, + "learning_rate": 9.618715689516194e-06, + "loss": 0.2467, + "step": 3785 + }, + { + "epoch": 0.3028739425211496, + "grad_norm": 0.5291170084959257, + "learning_rate": 9.618467553970636e-06, + "loss": 0.2752, + "step": 3786 + }, + { + "epoch": 0.30295394092118155, + "grad_norm": 0.3390655942547282, + "learning_rate": 9.61821934091215e-06, + "loss": 0.2856, + "step": 3787 + }, + { + "epoch": 0.3030339393212136, + "grad_norm": 0.30433072623351004, + "learning_rate": 9.617971050344896e-06, + "loss": 0.3276, + "step": 3788 + }, + { + "epoch": 0.30311393772124556, + "grad_norm": 0.32704553463861796, + "learning_rate": 9.617722682273048e-06, + "loss": 0.2422, + "step": 3789 + }, + { + "epoch": 0.3031939361212776, + "grad_norm": 0.2926224734726999, + "learning_rate": 9.61747423670077e-06, + "loss": 0.2892, + "step": 3790 + }, + { + "epoch": 0.3032739345213096, + "grad_norm": 0.28390835478640686, + "learning_rate": 9.61722571363223e-06, + "loss": 0.3149, + "step": 3791 + }, + { + "epoch": 0.30335393292134155, + "grad_norm": 0.27449134217943877, + "learning_rate": 9.616977113071604e-06, + "loss": 0.2846, + "step": 3792 + }, + { + "epoch": 0.3034339313213736, + "grad_norm": 0.28896656002923693, + "learning_rate": 9.616728435023061e-06, + "loss": 0.2837, + "step": 3793 + }, + { + "epoch": 0.30351392972140556, + "grad_norm": 0.35680666227656566, + "learning_rate": 9.616479679490778e-06, + "loss": 0.2817, + "step": 3794 + }, + { + "epoch": 0.3035939281214376, + "grad_norm": 0.29092818640389756, + "learning_rate": 9.616230846478925e-06, + "loss": 0.3038, + "step": 3795 + }, + { + "epoch": 0.3036739265214696, + "grad_norm": 0.3809271499375508, + "learning_rate": 9.615981935991683e-06, + "loss": 0.2574, + "step": 3796 + }, + { + "epoch": 0.30375392492150155, + "grad_norm": 0.2742500134416911, + "learning_rate": 9.615732948033225e-06, + "loss": 0.2906, + "step": 3797 + }, + { + "epoch": 0.3038339233215336, + "grad_norm": 0.7198177136901439, + "learning_rate": 9.615483882607735e-06, + "loss": 0.3125, + "step": 3798 + }, + { + "epoch": 0.30391392172156556, + "grad_norm": 0.3322955738597489, + "learning_rate": 9.615234739719387e-06, + "loss": 0.2604, + "step": 3799 + }, + { + "epoch": 0.3039939201215976, + "grad_norm": 0.23480709401223898, + "learning_rate": 9.61498551937237e-06, + "loss": 0.3291, + "step": 3800 + }, + { + "epoch": 0.30407391852162957, + "grad_norm": 0.3289741368819474, + "learning_rate": 9.61473622157086e-06, + "loss": 0.2534, + "step": 3801 + }, + { + "epoch": 0.30415391692166155, + "grad_norm": 0.2441793423429152, + "learning_rate": 9.614486846319042e-06, + "loss": 0.3085, + "step": 3802 + }, + { + "epoch": 0.3042339153216936, + "grad_norm": 0.268559343801519, + "learning_rate": 9.614237393621104e-06, + "loss": 0.2991, + "step": 3803 + }, + { + "epoch": 0.30431391372172556, + "grad_norm": 0.31691605607341283, + "learning_rate": 9.613987863481232e-06, + "loss": 0.284, + "step": 3804 + }, + { + "epoch": 0.30439391212175754, + "grad_norm": 0.34193819608321324, + "learning_rate": 9.613738255903613e-06, + "loss": 0.2819, + "step": 3805 + }, + { + "epoch": 0.30447391052178957, + "grad_norm": 0.29450632015261274, + "learning_rate": 9.613488570892437e-06, + "loss": 0.2981, + "step": 3806 + }, + { + "epoch": 0.30455390892182155, + "grad_norm": 0.2821893541055286, + "learning_rate": 9.613238808451894e-06, + "loss": 0.2814, + "step": 3807 + }, + { + "epoch": 0.3046339073218536, + "grad_norm": 0.2973239793777625, + "learning_rate": 9.612988968586176e-06, + "loss": 0.3008, + "step": 3808 + }, + { + "epoch": 0.30471390572188556, + "grad_norm": 0.32501317446584055, + "learning_rate": 9.612739051299477e-06, + "loss": 0.2647, + "step": 3809 + }, + { + "epoch": 0.30479390412191754, + "grad_norm": 0.30708287310983845, + "learning_rate": 9.61248905659599e-06, + "loss": 0.2494, + "step": 3810 + }, + { + "epoch": 0.30487390252194957, + "grad_norm": 0.3665100338740675, + "learning_rate": 9.61223898447991e-06, + "loss": 0.268, + "step": 3811 + }, + { + "epoch": 0.30495390092198155, + "grad_norm": 0.3177027509716792, + "learning_rate": 9.611988834955437e-06, + "loss": 0.2501, + "step": 3812 + }, + { + "epoch": 0.3050338993220136, + "grad_norm": 0.26673527287010673, + "learning_rate": 9.611738608026765e-06, + "loss": 0.3025, + "step": 3813 + }, + { + "epoch": 0.30511389772204556, + "grad_norm": 0.34420837537026416, + "learning_rate": 9.6114883036981e-06, + "loss": 0.2492, + "step": 3814 + }, + { + "epoch": 0.30519389612207753, + "grad_norm": 0.28538947787050495, + "learning_rate": 9.611237921973637e-06, + "loss": 0.2961, + "step": 3815 + }, + { + "epoch": 0.30527389452210957, + "grad_norm": 0.23390040594024505, + "learning_rate": 9.61098746285758e-06, + "loss": 0.3395, + "step": 3816 + }, + { + "epoch": 0.30535389292214155, + "grad_norm": 0.2777698528163031, + "learning_rate": 9.610736926354133e-06, + "loss": 0.3041, + "step": 3817 + }, + { + "epoch": 0.3054338913221736, + "grad_norm": 0.31299250673242407, + "learning_rate": 9.610486312467502e-06, + "loss": 0.2714, + "step": 3818 + }, + { + "epoch": 0.30551388972220556, + "grad_norm": 0.33049703076523484, + "learning_rate": 9.61023562120189e-06, + "loss": 0.239, + "step": 3819 + }, + { + "epoch": 0.30559388812223753, + "grad_norm": 0.2779233114273755, + "learning_rate": 9.609984852561508e-06, + "loss": 0.2915, + "step": 3820 + }, + { + "epoch": 0.30567388652226957, + "grad_norm": 0.3960042605009564, + "learning_rate": 9.609734006550562e-06, + "loss": 0.3103, + "step": 3821 + }, + { + "epoch": 0.30575388492230154, + "grad_norm": 0.2933981078228004, + "learning_rate": 9.609483083173264e-06, + "loss": 0.314, + "step": 3822 + }, + { + "epoch": 0.3058338833223336, + "grad_norm": 0.23975670407659508, + "learning_rate": 9.609232082433824e-06, + "loss": 0.3117, + "step": 3823 + }, + { + "epoch": 0.30591388172236555, + "grad_norm": 0.33693168443581706, + "learning_rate": 9.608981004336453e-06, + "loss": 0.2575, + "step": 3824 + }, + { + "epoch": 0.30599388012239753, + "grad_norm": 0.2584737640054285, + "learning_rate": 9.608729848885369e-06, + "loss": 0.2945, + "step": 3825 + }, + { + "epoch": 0.30607387852242957, + "grad_norm": 0.3427539956567722, + "learning_rate": 9.608478616084784e-06, + "loss": 0.262, + "step": 3826 + }, + { + "epoch": 0.30615387692246154, + "grad_norm": 0.25580698875503427, + "learning_rate": 9.608227305938915e-06, + "loss": 0.3093, + "step": 3827 + }, + { + "epoch": 0.3062338753224936, + "grad_norm": 0.4642928980650298, + "learning_rate": 9.607975918451982e-06, + "loss": 0.266, + "step": 3828 + }, + { + "epoch": 0.30631387372252555, + "grad_norm": 0.27491582515521595, + "learning_rate": 9.6077244536282e-06, + "loss": 0.2894, + "step": 3829 + }, + { + "epoch": 0.30639387212255753, + "grad_norm": 0.3048648575886629, + "learning_rate": 9.607472911471794e-06, + "loss": 0.2922, + "step": 3830 + }, + { + "epoch": 0.30647387052258956, + "grad_norm": 0.22465292368897324, + "learning_rate": 9.607221291986983e-06, + "loss": 0.3476, + "step": 3831 + }, + { + "epoch": 0.30655386892262154, + "grad_norm": 0.30550927674254147, + "learning_rate": 9.60696959517799e-06, + "loss": 0.2768, + "step": 3832 + }, + { + "epoch": 0.3066338673226535, + "grad_norm": 0.3116252639666084, + "learning_rate": 9.606717821049042e-06, + "loss": 0.2726, + "step": 3833 + }, + { + "epoch": 0.30671386572268555, + "grad_norm": 0.2867152731669325, + "learning_rate": 9.606465969604359e-06, + "loss": 0.2802, + "step": 3834 + }, + { + "epoch": 0.30679386412271753, + "grad_norm": 0.27970437215769656, + "learning_rate": 9.606214040848174e-06, + "loss": 0.304, + "step": 3835 + }, + { + "epoch": 0.30687386252274956, + "grad_norm": 0.32136627180269983, + "learning_rate": 9.605962034784711e-06, + "loss": 0.3007, + "step": 3836 + }, + { + "epoch": 0.30695386092278154, + "grad_norm": 0.3137107034506244, + "learning_rate": 9.605709951418201e-06, + "loss": 0.2473, + "step": 3837 + }, + { + "epoch": 0.3070338593228135, + "grad_norm": 0.32728744709941154, + "learning_rate": 9.605457790752875e-06, + "loss": 0.2938, + "step": 3838 + }, + { + "epoch": 0.30711385772284555, + "grad_norm": 0.3236730307097795, + "learning_rate": 9.605205552792964e-06, + "loss": 0.2583, + "step": 3839 + }, + { + "epoch": 0.30719385612287753, + "grad_norm": 0.26884877121274947, + "learning_rate": 9.604953237542703e-06, + "loss": 0.2915, + "step": 3840 + }, + { + "epoch": 0.30727385452290956, + "grad_norm": 0.2765280117174748, + "learning_rate": 9.604700845006326e-06, + "loss": 0.2876, + "step": 3841 + }, + { + "epoch": 0.30735385292294154, + "grad_norm": 0.2686954180081356, + "learning_rate": 9.604448375188069e-06, + "loss": 0.2638, + "step": 3842 + }, + { + "epoch": 0.3074338513229735, + "grad_norm": 0.4993603649475995, + "learning_rate": 9.604195828092169e-06, + "loss": 0.2676, + "step": 3843 + }, + { + "epoch": 0.30751384972300555, + "grad_norm": 0.2874197221424311, + "learning_rate": 9.603943203722863e-06, + "loss": 0.282, + "step": 3844 + }, + { + "epoch": 0.3075938481230375, + "grad_norm": 0.7773256645907821, + "learning_rate": 9.603690502084396e-06, + "loss": 0.3196, + "step": 3845 + }, + { + "epoch": 0.30767384652306956, + "grad_norm": 0.3238968881475202, + "learning_rate": 9.603437723181002e-06, + "loss": 0.2654, + "step": 3846 + }, + { + "epoch": 0.30775384492310154, + "grad_norm": 0.2585152884011422, + "learning_rate": 9.603184867016929e-06, + "loss": 0.2903, + "step": 3847 + }, + { + "epoch": 0.3078338433231335, + "grad_norm": 0.27691320438776407, + "learning_rate": 9.602931933596418e-06, + "loss": 0.2857, + "step": 3848 + }, + { + "epoch": 0.30791384172316555, + "grad_norm": 0.35775798367894546, + "learning_rate": 9.602678922923716e-06, + "loss": 0.2723, + "step": 3849 + }, + { + "epoch": 0.3079938401231975, + "grad_norm": 1.2863576070193736, + "learning_rate": 9.602425835003067e-06, + "loss": 0.2891, + "step": 3850 + }, + { + "epoch": 0.30807383852322956, + "grad_norm": 0.54277316129236, + "learning_rate": 9.602172669838721e-06, + "loss": 0.2418, + "step": 3851 + }, + { + "epoch": 0.30815383692326154, + "grad_norm": 0.2711723891039418, + "learning_rate": 9.601919427434925e-06, + "loss": 0.3267, + "step": 3852 + }, + { + "epoch": 0.3082338353232935, + "grad_norm": 0.27337109813122557, + "learning_rate": 9.60166610779593e-06, + "loss": 0.3048, + "step": 3853 + }, + { + "epoch": 0.30831383372332555, + "grad_norm": 0.3268866093802923, + "learning_rate": 9.60141271092599e-06, + "loss": 0.258, + "step": 3854 + }, + { + "epoch": 0.3083938321233575, + "grad_norm": 0.29545279189992013, + "learning_rate": 9.601159236829353e-06, + "loss": 0.2537, + "step": 3855 + }, + { + "epoch": 0.30847383052338956, + "grad_norm": 0.28963730549731564, + "learning_rate": 9.600905685510276e-06, + "loss": 0.2837, + "step": 3856 + }, + { + "epoch": 0.30855382892342154, + "grad_norm": 0.2763666959912324, + "learning_rate": 9.600652056973013e-06, + "loss": 0.3039, + "step": 3857 + }, + { + "epoch": 0.3086338273234535, + "grad_norm": 0.3437588748629757, + "learning_rate": 9.600398351221824e-06, + "loss": 0.2965, + "step": 3858 + }, + { + "epoch": 0.30871382572348555, + "grad_norm": 0.28654084811095687, + "learning_rate": 9.600144568260962e-06, + "loss": 0.3021, + "step": 3859 + }, + { + "epoch": 0.3087938241235175, + "grad_norm": 0.4350037837738027, + "learning_rate": 9.59989070809469e-06, + "loss": 0.2472, + "step": 3860 + }, + { + "epoch": 0.3088738225235495, + "grad_norm": 0.2946843504154889, + "learning_rate": 9.59963677072727e-06, + "loss": 0.2956, + "step": 3861 + }, + { + "epoch": 0.30895382092358153, + "grad_norm": 0.31017296850475085, + "learning_rate": 9.599382756162959e-06, + "loss": 0.2619, + "step": 3862 + }, + { + "epoch": 0.3090338193236135, + "grad_norm": 0.3214027070700401, + "learning_rate": 9.599128664406023e-06, + "loss": 0.2741, + "step": 3863 + }, + { + "epoch": 0.30911381772364555, + "grad_norm": 0.30879458191331927, + "learning_rate": 9.598874495460726e-06, + "loss": 0.3043, + "step": 3864 + }, + { + "epoch": 0.3091938161236775, + "grad_norm": 0.35835214310783076, + "learning_rate": 9.598620249331334e-06, + "loss": 0.2661, + "step": 3865 + }, + { + "epoch": 0.3092738145237095, + "grad_norm": 0.26802161431422383, + "learning_rate": 9.598365926022116e-06, + "loss": 0.2984, + "step": 3866 + }, + { + "epoch": 0.30935381292374153, + "grad_norm": 0.33221556146718784, + "learning_rate": 9.598111525537336e-06, + "loss": 0.2698, + "step": 3867 + }, + { + "epoch": 0.3094338113237735, + "grad_norm": 0.30883473634301856, + "learning_rate": 9.597857047881266e-06, + "loss": 0.3095, + "step": 3868 + }, + { + "epoch": 0.30951380972380554, + "grad_norm": 0.2627994989928834, + "learning_rate": 9.597602493058178e-06, + "loss": 0.3251, + "step": 3869 + }, + { + "epoch": 0.3095938081238375, + "grad_norm": 0.3063941750846015, + "learning_rate": 9.597347861072343e-06, + "loss": 0.288, + "step": 3870 + }, + { + "epoch": 0.3096738065238695, + "grad_norm": 0.3110071516095005, + "learning_rate": 9.597093151928035e-06, + "loss": 0.26, + "step": 3871 + }, + { + "epoch": 0.30975380492390153, + "grad_norm": 0.2758509596028826, + "learning_rate": 9.596838365629529e-06, + "loss": 0.2954, + "step": 3872 + }, + { + "epoch": 0.3098338033239335, + "grad_norm": 0.23917969159544608, + "learning_rate": 9.5965835021811e-06, + "loss": 0.3194, + "step": 3873 + }, + { + "epoch": 0.30991380172396554, + "grad_norm": 0.32816977876996856, + "learning_rate": 9.596328561587027e-06, + "loss": 0.2916, + "step": 3874 + }, + { + "epoch": 0.3099938001239975, + "grad_norm": 0.4589889577704082, + "learning_rate": 9.596073543851587e-06, + "loss": 0.2476, + "step": 3875 + }, + { + "epoch": 0.3100737985240295, + "grad_norm": 0.33961492288498996, + "learning_rate": 9.595818448979061e-06, + "loss": 0.2923, + "step": 3876 + }, + { + "epoch": 0.31015379692406153, + "grad_norm": 0.27888592423637126, + "learning_rate": 9.595563276973732e-06, + "loss": 0.2826, + "step": 3877 + }, + { + "epoch": 0.3102337953240935, + "grad_norm": 0.2973125113353187, + "learning_rate": 9.59530802783988e-06, + "loss": 0.281, + "step": 3878 + }, + { + "epoch": 0.31031379372412554, + "grad_norm": 0.5074649206210077, + "learning_rate": 9.59505270158179e-06, + "loss": 0.262, + "step": 3879 + }, + { + "epoch": 0.3103937921241575, + "grad_norm": 0.2646974829774116, + "learning_rate": 9.594797298203748e-06, + "loss": 0.276, + "step": 3880 + }, + { + "epoch": 0.3104737905241895, + "grad_norm": 0.3177078713149871, + "learning_rate": 9.594541817710039e-06, + "loss": 0.2816, + "step": 3881 + }, + { + "epoch": 0.31055378892422153, + "grad_norm": 0.24065522476019588, + "learning_rate": 9.59428626010495e-06, + "loss": 0.3231, + "step": 3882 + }, + { + "epoch": 0.3106337873242535, + "grad_norm": 0.27672865647737155, + "learning_rate": 9.594030625392772e-06, + "loss": 0.2938, + "step": 3883 + }, + { + "epoch": 0.31071378572428554, + "grad_norm": 0.31334329137748806, + "learning_rate": 9.593774913577795e-06, + "loss": 0.2926, + "step": 3884 + }, + { + "epoch": 0.3107937841243175, + "grad_norm": 0.37626839744819696, + "learning_rate": 9.593519124664313e-06, + "loss": 0.2593, + "step": 3885 + }, + { + "epoch": 0.3108737825243495, + "grad_norm": 0.3019103638508728, + "learning_rate": 9.593263258656614e-06, + "loss": 0.2515, + "step": 3886 + }, + { + "epoch": 0.31095378092438153, + "grad_norm": 0.31050540398668647, + "learning_rate": 9.593007315558996e-06, + "loss": 0.2989, + "step": 3887 + }, + { + "epoch": 0.3110337793244135, + "grad_norm": 0.3307990733125002, + "learning_rate": 9.59275129537575e-06, + "loss": 0.2716, + "step": 3888 + }, + { + "epoch": 0.3111137777244455, + "grad_norm": 0.30914697713595596, + "learning_rate": 9.59249519811118e-06, + "loss": 0.2681, + "step": 3889 + }, + { + "epoch": 0.3111937761244775, + "grad_norm": 0.2869537809107287, + "learning_rate": 9.59223902376958e-06, + "loss": 0.2586, + "step": 3890 + }, + { + "epoch": 0.3112737745245095, + "grad_norm": 0.30720149721069334, + "learning_rate": 9.591982772355248e-06, + "loss": 0.2933, + "step": 3891 + }, + { + "epoch": 0.3113537729245415, + "grad_norm": 0.29997372052972104, + "learning_rate": 9.591726443872487e-06, + "loss": 0.2962, + "step": 3892 + }, + { + "epoch": 0.3114337713245735, + "grad_norm": 0.27278348171440836, + "learning_rate": 9.591470038325599e-06, + "loss": 0.3052, + "step": 3893 + }, + { + "epoch": 0.3115137697246055, + "grad_norm": 0.3094339715810796, + "learning_rate": 9.591213555718889e-06, + "loss": 0.2674, + "step": 3894 + }, + { + "epoch": 0.3115937681246375, + "grad_norm": 0.28486644794215404, + "learning_rate": 9.590956996056656e-06, + "loss": 0.287, + "step": 3895 + }, + { + "epoch": 0.3116737665246695, + "grad_norm": 0.31743902430208826, + "learning_rate": 9.59070035934321e-06, + "loss": 0.2551, + "step": 3896 + }, + { + "epoch": 0.3117537649247015, + "grad_norm": 0.32340802124708706, + "learning_rate": 9.590443645582859e-06, + "loss": 0.2599, + "step": 3897 + }, + { + "epoch": 0.3118337633247335, + "grad_norm": 0.26358580265571574, + "learning_rate": 9.590186854779909e-06, + "loss": 0.2823, + "step": 3898 + }, + { + "epoch": 0.3119137617247655, + "grad_norm": 0.33439674203012215, + "learning_rate": 9.58992998693867e-06, + "loss": 0.322, + "step": 3899 + }, + { + "epoch": 0.3119937601247975, + "grad_norm": 0.2831863941837388, + "learning_rate": 9.589673042063456e-06, + "loss": 0.3015, + "step": 3900 + }, + { + "epoch": 0.3120737585248295, + "grad_norm": 0.28417321523487726, + "learning_rate": 9.589416020158577e-06, + "loss": 0.2911, + "step": 3901 + }, + { + "epoch": 0.3121537569248615, + "grad_norm": 0.29292873122271634, + "learning_rate": 9.589158921228346e-06, + "loss": 0.298, + "step": 3902 + }, + { + "epoch": 0.3122337553248935, + "grad_norm": 0.2750471891616348, + "learning_rate": 9.58890174527708e-06, + "loss": 0.2899, + "step": 3903 + }, + { + "epoch": 0.3123137537249255, + "grad_norm": 0.2894306006665051, + "learning_rate": 9.588644492309093e-06, + "loss": 0.2735, + "step": 3904 + }, + { + "epoch": 0.3123937521249575, + "grad_norm": 0.2753159652953076, + "learning_rate": 9.588387162328705e-06, + "loss": 0.2735, + "step": 3905 + }, + { + "epoch": 0.3124737505249895, + "grad_norm": 0.27729447813458397, + "learning_rate": 9.588129755340231e-06, + "loss": 0.2838, + "step": 3906 + }, + { + "epoch": 0.3125537489250215, + "grad_norm": 0.24987290914441593, + "learning_rate": 9.587872271347996e-06, + "loss": 0.3347, + "step": 3907 + }, + { + "epoch": 0.3126337473250535, + "grad_norm": 0.25821226362023975, + "learning_rate": 9.587614710356318e-06, + "loss": 0.2846, + "step": 3908 + }, + { + "epoch": 0.3127137457250855, + "grad_norm": 0.33149551831859797, + "learning_rate": 9.587357072369522e-06, + "loss": 0.2747, + "step": 3909 + }, + { + "epoch": 0.3127937441251175, + "grad_norm": 0.3659492192793033, + "learning_rate": 9.58709935739193e-06, + "loss": 0.2731, + "step": 3910 + }, + { + "epoch": 0.3128737425251495, + "grad_norm": 0.2720949021289437, + "learning_rate": 9.586841565427869e-06, + "loss": 0.295, + "step": 3911 + }, + { + "epoch": 0.3129537409251815, + "grad_norm": 0.318861928957452, + "learning_rate": 9.586583696481664e-06, + "loss": 0.3334, + "step": 3912 + }, + { + "epoch": 0.3130337393252135, + "grad_norm": 0.25467458864021475, + "learning_rate": 9.586325750557643e-06, + "loss": 0.3091, + "step": 3913 + }, + { + "epoch": 0.3131137377252455, + "grad_norm": 0.2877708954949333, + "learning_rate": 9.586067727660138e-06, + "loss": 0.2818, + "step": 3914 + }, + { + "epoch": 0.3131937361252775, + "grad_norm": 0.3102993471880673, + "learning_rate": 9.585809627793475e-06, + "loss": 0.2695, + "step": 3915 + }, + { + "epoch": 0.3132737345253095, + "grad_norm": 0.27001233567184524, + "learning_rate": 9.585551450961989e-06, + "loss": 0.265, + "step": 3916 + }, + { + "epoch": 0.31335373292534147, + "grad_norm": 1.4311188802097372, + "learning_rate": 9.58529319717001e-06, + "loss": 0.2364, + "step": 3917 + }, + { + "epoch": 0.3134337313253735, + "grad_norm": 0.2978597808997237, + "learning_rate": 9.585034866421877e-06, + "loss": 0.2705, + "step": 3918 + }, + { + "epoch": 0.3135137297254055, + "grad_norm": 0.3247709718007176, + "learning_rate": 9.584776458721922e-06, + "loss": 0.2652, + "step": 3919 + }, + { + "epoch": 0.3135937281254375, + "grad_norm": 0.3996160135425736, + "learning_rate": 9.584517974074483e-06, + "loss": 0.3403, + "step": 3920 + }, + { + "epoch": 0.3136737265254695, + "grad_norm": 0.25965274545549266, + "learning_rate": 9.584259412483899e-06, + "loss": 0.2866, + "step": 3921 + }, + { + "epoch": 0.31375372492550146, + "grad_norm": 0.3503608886302911, + "learning_rate": 9.584000773954507e-06, + "loss": 0.2495, + "step": 3922 + }, + { + "epoch": 0.3138337233255335, + "grad_norm": 0.24925995168079726, + "learning_rate": 9.58374205849065e-06, + "loss": 0.324, + "step": 3923 + }, + { + "epoch": 0.3139137217255655, + "grad_norm": 0.29258512902412276, + "learning_rate": 9.58348326609667e-06, + "loss": 0.2751, + "step": 3924 + }, + { + "epoch": 0.3139937201255975, + "grad_norm": 0.4528150694793641, + "learning_rate": 9.58322439677691e-06, + "loss": 0.3089, + "step": 3925 + }, + { + "epoch": 0.3140737185256295, + "grad_norm": 0.22970094107638747, + "learning_rate": 9.582965450535716e-06, + "loss": 0.342, + "step": 3926 + }, + { + "epoch": 0.31415371692566146, + "grad_norm": 0.28154352716643033, + "learning_rate": 9.58270642737743e-06, + "loss": 0.2764, + "step": 3927 + }, + { + "epoch": 0.3142337153256935, + "grad_norm": 0.28228610686934, + "learning_rate": 9.5824473273064e-06, + "loss": 0.2824, + "step": 3928 + }, + { + "epoch": 0.3143137137257255, + "grad_norm": 0.2781717330394963, + "learning_rate": 9.582188150326981e-06, + "loss": 0.3191, + "step": 3929 + }, + { + "epoch": 0.3143937121257575, + "grad_norm": 0.32532496465351285, + "learning_rate": 9.581928896443517e-06, + "loss": 0.2909, + "step": 3930 + }, + { + "epoch": 0.3144737105257895, + "grad_norm": 0.27785996581839767, + "learning_rate": 9.58166956566036e-06, + "loss": 0.3263, + "step": 3931 + }, + { + "epoch": 0.31455370892582146, + "grad_norm": 0.4067201266905533, + "learning_rate": 9.58141015798186e-06, + "loss": 0.2775, + "step": 3932 + }, + { + "epoch": 0.3146337073258535, + "grad_norm": 0.31383028910350796, + "learning_rate": 9.581150673412376e-06, + "loss": 0.2958, + "step": 3933 + }, + { + "epoch": 0.3147137057258855, + "grad_norm": 0.34201621859043685, + "learning_rate": 9.58089111195626e-06, + "loss": 0.2625, + "step": 3934 + }, + { + "epoch": 0.3147937041259175, + "grad_norm": 0.35328825049775214, + "learning_rate": 9.58063147361787e-06, + "loss": 0.2761, + "step": 3935 + }, + { + "epoch": 0.3148737025259495, + "grad_norm": 0.29986101037815416, + "learning_rate": 9.58037175840156e-06, + "loss": 0.2935, + "step": 3936 + }, + { + "epoch": 0.31495370092598146, + "grad_norm": 0.30730592029876386, + "learning_rate": 9.580111966311692e-06, + "loss": 0.2689, + "step": 3937 + }, + { + "epoch": 0.3150336993260135, + "grad_norm": 0.316582518561074, + "learning_rate": 9.579852097352625e-06, + "loss": 0.3097, + "step": 3938 + }, + { + "epoch": 0.31511369772604547, + "grad_norm": 0.33691115655362097, + "learning_rate": 9.579592151528721e-06, + "loss": 0.2633, + "step": 3939 + }, + { + "epoch": 0.3151936961260775, + "grad_norm": 0.2965737922613462, + "learning_rate": 9.579332128844342e-06, + "loss": 0.3, + "step": 3940 + }, + { + "epoch": 0.3152736945261095, + "grad_norm": 0.248686376204246, + "learning_rate": 9.579072029303855e-06, + "loss": 0.3372, + "step": 3941 + }, + { + "epoch": 0.31535369292614146, + "grad_norm": 0.25850999340759045, + "learning_rate": 9.57881185291162e-06, + "loss": 0.3062, + "step": 3942 + }, + { + "epoch": 0.3154336913261735, + "grad_norm": 0.3367296420055412, + "learning_rate": 9.578551599672008e-06, + "loss": 0.2826, + "step": 3943 + }, + { + "epoch": 0.31551368972620547, + "grad_norm": 0.2807093376051179, + "learning_rate": 9.578291269589384e-06, + "loss": 0.3022, + "step": 3944 + }, + { + "epoch": 0.31559368812623745, + "grad_norm": 0.33089238719470726, + "learning_rate": 9.57803086266812e-06, + "loss": 0.2829, + "step": 3945 + }, + { + "epoch": 0.3156736865262695, + "grad_norm": 0.30323563722632235, + "learning_rate": 9.577770378912584e-06, + "loss": 0.2933, + "step": 3946 + }, + { + "epoch": 0.31575368492630146, + "grad_norm": 0.32487367631960307, + "learning_rate": 9.57750981832715e-06, + "loss": 0.2614, + "step": 3947 + }, + { + "epoch": 0.3158336833263335, + "grad_norm": 0.29811040897680635, + "learning_rate": 9.577249180916188e-06, + "loss": 0.2764, + "step": 3948 + }, + { + "epoch": 0.31591368172636547, + "grad_norm": 0.3235825274760498, + "learning_rate": 9.576988466684077e-06, + "loss": 0.2815, + "step": 3949 + }, + { + "epoch": 0.31599368012639745, + "grad_norm": 0.33389418759453765, + "learning_rate": 9.576727675635186e-06, + "loss": 0.2555, + "step": 3950 + }, + { + "epoch": 0.3160736785264295, + "grad_norm": 0.3301049227912578, + "learning_rate": 9.5764668077739e-06, + "loss": 0.2592, + "step": 3951 + }, + { + "epoch": 0.31615367692646146, + "grad_norm": 0.28729958209903517, + "learning_rate": 9.576205863104588e-06, + "loss": 0.2718, + "step": 3952 + }, + { + "epoch": 0.3162336753264935, + "grad_norm": 0.30330438684038347, + "learning_rate": 9.575944841631636e-06, + "loss": 0.2804, + "step": 3953 + }, + { + "epoch": 0.31631367372652547, + "grad_norm": 0.31763471828758494, + "learning_rate": 9.575683743359425e-06, + "loss": 0.2919, + "step": 3954 + }, + { + "epoch": 0.31639367212655745, + "grad_norm": 0.3133490432540109, + "learning_rate": 9.575422568292336e-06, + "loss": 0.266, + "step": 3955 + }, + { + "epoch": 0.3164736705265895, + "grad_norm": 0.28413874320701693, + "learning_rate": 9.575161316434749e-06, + "loss": 0.2896, + "step": 3956 + }, + { + "epoch": 0.31655366892662146, + "grad_norm": 0.31282787605427237, + "learning_rate": 9.574899987791054e-06, + "loss": 0.3269, + "step": 3957 + }, + { + "epoch": 0.3166336673266535, + "grad_norm": 0.33526664499800835, + "learning_rate": 9.574638582365631e-06, + "loss": 0.3121, + "step": 3958 + }, + { + "epoch": 0.31671366572668547, + "grad_norm": 0.29833517183570424, + "learning_rate": 9.574377100162874e-06, + "loss": 0.3313, + "step": 3959 + }, + { + "epoch": 0.31679366412671744, + "grad_norm": 0.2841155937846932, + "learning_rate": 9.574115541187166e-06, + "loss": 0.2975, + "step": 3960 + }, + { + "epoch": 0.3168736625267495, + "grad_norm": 0.3006996187651303, + "learning_rate": 9.573853905442899e-06, + "loss": 0.2867, + "step": 3961 + }, + { + "epoch": 0.31695366092678146, + "grad_norm": 0.364433452757692, + "learning_rate": 9.573592192934465e-06, + "loss": 0.2778, + "step": 3962 + }, + { + "epoch": 0.3170336593268135, + "grad_norm": 0.2834108108811054, + "learning_rate": 9.573330403666254e-06, + "loss": 0.3072, + "step": 3963 + }, + { + "epoch": 0.31711365772684547, + "grad_norm": 0.2874029200475847, + "learning_rate": 9.573068537642663e-06, + "loss": 0.3009, + "step": 3964 + }, + { + "epoch": 0.31719365612687744, + "grad_norm": 0.2861945034143621, + "learning_rate": 9.572806594868082e-06, + "loss": 0.3409, + "step": 3965 + }, + { + "epoch": 0.3172736545269095, + "grad_norm": 0.3693568298861793, + "learning_rate": 9.572544575346912e-06, + "loss": 0.2641, + "step": 3966 + }, + { + "epoch": 0.31735365292694145, + "grad_norm": 0.318077472910601, + "learning_rate": 9.572282479083548e-06, + "loss": 0.3056, + "step": 3967 + }, + { + "epoch": 0.3174336513269735, + "grad_norm": 0.3404735319657658, + "learning_rate": 9.57202030608239e-06, + "loss": 0.2746, + "step": 3968 + }, + { + "epoch": 0.31751364972700546, + "grad_norm": 0.32492418596836403, + "learning_rate": 9.571758056347839e-06, + "loss": 0.2999, + "step": 3969 + }, + { + "epoch": 0.31759364812703744, + "grad_norm": 0.30668738174884774, + "learning_rate": 9.571495729884294e-06, + "loss": 0.301, + "step": 3970 + }, + { + "epoch": 0.3176736465270695, + "grad_norm": 0.28502096572478286, + "learning_rate": 9.571233326696159e-06, + "loss": 0.3051, + "step": 3971 + }, + { + "epoch": 0.31775364492710145, + "grad_norm": 0.2275487732423886, + "learning_rate": 9.570970846787838e-06, + "loss": 0.3181, + "step": 3972 + }, + { + "epoch": 0.31783364332713343, + "grad_norm": 0.25242228439122755, + "learning_rate": 9.570708290163735e-06, + "loss": 0.3196, + "step": 3973 + }, + { + "epoch": 0.31791364172716546, + "grad_norm": 0.27244667542189976, + "learning_rate": 9.570445656828257e-06, + "loss": 0.3248, + "step": 3974 + }, + { + "epoch": 0.31799364012719744, + "grad_norm": 0.2976689546729177, + "learning_rate": 9.570182946785816e-06, + "loss": 0.2956, + "step": 3975 + }, + { + "epoch": 0.3180736385272295, + "grad_norm": 0.2982129500863135, + "learning_rate": 9.569920160040815e-06, + "loss": 0.2614, + "step": 3976 + }, + { + "epoch": 0.31815363692726145, + "grad_norm": 0.32407168727586233, + "learning_rate": 9.569657296597668e-06, + "loss": 0.295, + "step": 3977 + }, + { + "epoch": 0.31823363532729343, + "grad_norm": 0.2919827562703247, + "learning_rate": 9.569394356460784e-06, + "loss": 0.2863, + "step": 3978 + }, + { + "epoch": 0.31831363372732546, + "grad_norm": 0.2865151336624842, + "learning_rate": 9.569131339634578e-06, + "loss": 0.3268, + "step": 3979 + }, + { + "epoch": 0.31839363212735744, + "grad_norm": 0.31558925569229895, + "learning_rate": 9.568868246123466e-06, + "loss": 0.2664, + "step": 3980 + }, + { + "epoch": 0.3184736305273895, + "grad_norm": 2.0248862754846995, + "learning_rate": 9.56860507593186e-06, + "loss": 0.2821, + "step": 3981 + }, + { + "epoch": 0.31855362892742145, + "grad_norm": 0.8687422236056815, + "learning_rate": 9.56834182906418e-06, + "loss": 0.3073, + "step": 3982 + }, + { + "epoch": 0.31863362732745343, + "grad_norm": 0.2842177648517353, + "learning_rate": 9.56807850552484e-06, + "loss": 0.282, + "step": 3983 + }, + { + "epoch": 0.31871362572748546, + "grad_norm": 0.31636699140195035, + "learning_rate": 9.567815105318263e-06, + "loss": 0.2593, + "step": 3984 + }, + { + "epoch": 0.31879362412751744, + "grad_norm": 0.2467758243710989, + "learning_rate": 9.56755162844887e-06, + "loss": 0.3194, + "step": 3985 + }, + { + "epoch": 0.31887362252754947, + "grad_norm": 0.341488002663993, + "learning_rate": 9.567288074921082e-06, + "loss": 0.2929, + "step": 3986 + }, + { + "epoch": 0.31895362092758145, + "grad_norm": 0.2876899845169144, + "learning_rate": 9.567024444739319e-06, + "loss": 0.2863, + "step": 3987 + }, + { + "epoch": 0.3190336193276134, + "grad_norm": 0.2859067348665681, + "learning_rate": 9.56676073790801e-06, + "loss": 0.3072, + "step": 3988 + }, + { + "epoch": 0.31911361772764546, + "grad_norm": 0.35174386983488926, + "learning_rate": 9.566496954431581e-06, + "loss": 0.3362, + "step": 3989 + }, + { + "epoch": 0.31919361612767744, + "grad_norm": 0.26984492023546414, + "learning_rate": 9.566233094314456e-06, + "loss": 0.2718, + "step": 3990 + }, + { + "epoch": 0.31927361452770947, + "grad_norm": 0.2704083748539496, + "learning_rate": 9.565969157561066e-06, + "loss": 0.2921, + "step": 3991 + }, + { + "epoch": 0.31935361292774145, + "grad_norm": 0.3364570928005177, + "learning_rate": 9.565705144175839e-06, + "loss": 0.2656, + "step": 3992 + }, + { + "epoch": 0.3194336113277734, + "grad_norm": 0.26851676493540766, + "learning_rate": 9.565441054163205e-06, + "loss": 0.2948, + "step": 3993 + }, + { + "epoch": 0.31951360972780546, + "grad_norm": 0.1697263810751049, + "learning_rate": 9.5651768875276e-06, + "loss": 0.3719, + "step": 3994 + }, + { + "epoch": 0.31959360812783744, + "grad_norm": 0.3212125770010283, + "learning_rate": 9.564912644273456e-06, + "loss": 0.2851, + "step": 3995 + }, + { + "epoch": 0.31967360652786947, + "grad_norm": 0.3324070803908523, + "learning_rate": 9.564648324405206e-06, + "loss": 0.2575, + "step": 3996 + }, + { + "epoch": 0.31975360492790145, + "grad_norm": 0.24694722371162814, + "learning_rate": 9.564383927927289e-06, + "loss": 0.3078, + "step": 3997 + }, + { + "epoch": 0.3198336033279334, + "grad_norm": 0.29554648044051524, + "learning_rate": 9.56411945484414e-06, + "loss": 0.3057, + "step": 3998 + }, + { + "epoch": 0.31991360172796546, + "grad_norm": 0.31988315139831774, + "learning_rate": 9.5638549051602e-06, + "loss": 0.2778, + "step": 3999 + }, + { + "epoch": 0.31999360012799744, + "grad_norm": 0.32641897086496025, + "learning_rate": 9.563590278879906e-06, + "loss": 0.2608, + "step": 4000 + }, + { + "epoch": 0.3200735985280294, + "grad_norm": 0.5075193181280644, + "learning_rate": 9.563325576007702e-06, + "loss": 0.2611, + "step": 4001 + }, + { + "epoch": 0.32015359692806145, + "grad_norm": 0.2896470124600797, + "learning_rate": 9.563060796548029e-06, + "loss": 0.2715, + "step": 4002 + }, + { + "epoch": 0.3202335953280934, + "grad_norm": 0.2712153722754573, + "learning_rate": 9.562795940505332e-06, + "loss": 0.3055, + "step": 4003 + }, + { + "epoch": 0.32031359372812546, + "grad_norm": 0.4166443406309326, + "learning_rate": 9.562531007884056e-06, + "loss": 0.2645, + "step": 4004 + }, + { + "epoch": 0.32039359212815743, + "grad_norm": 0.3062796891372946, + "learning_rate": 9.562265998688648e-06, + "loss": 0.2577, + "step": 4005 + }, + { + "epoch": 0.3204735905281894, + "grad_norm": 0.46964732719571156, + "learning_rate": 9.562000912923551e-06, + "loss": 0.3008, + "step": 4006 + }, + { + "epoch": 0.32055358892822144, + "grad_norm": 0.3100781206552371, + "learning_rate": 9.561735750593221e-06, + "loss": 0.3066, + "step": 4007 + }, + { + "epoch": 0.3206335873282534, + "grad_norm": 0.296049214094488, + "learning_rate": 9.561470511702105e-06, + "loss": 0.2972, + "step": 4008 + }, + { + "epoch": 0.32071358572828546, + "grad_norm": 0.301133650172872, + "learning_rate": 9.561205196254652e-06, + "loss": 0.2894, + "step": 4009 + }, + { + "epoch": 0.32079358412831743, + "grad_norm": 0.25227236249208285, + "learning_rate": 9.56093980425532e-06, + "loss": 0.318, + "step": 4010 + }, + { + "epoch": 0.3208735825283494, + "grad_norm": 0.24041520518074755, + "learning_rate": 9.56067433570856e-06, + "loss": 0.3223, + "step": 4011 + }, + { + "epoch": 0.32095358092838144, + "grad_norm": 0.36256696568392754, + "learning_rate": 9.560408790618828e-06, + "loss": 0.2879, + "step": 4012 + }, + { + "epoch": 0.3210335793284134, + "grad_norm": 0.3693646197492591, + "learning_rate": 9.56014316899058e-06, + "loss": 0.2813, + "step": 4013 + }, + { + "epoch": 0.32111357772844545, + "grad_norm": 0.2791271227882677, + "learning_rate": 9.559877470828277e-06, + "loss": 0.2982, + "step": 4014 + }, + { + "epoch": 0.32119357612847743, + "grad_norm": 0.29407645959942963, + "learning_rate": 9.559611696136375e-06, + "loss": 0.2719, + "step": 4015 + }, + { + "epoch": 0.3212735745285094, + "grad_norm": 0.3475739604922169, + "learning_rate": 9.559345844919334e-06, + "loss": 0.254, + "step": 4016 + }, + { + "epoch": 0.32135357292854144, + "grad_norm": 0.29903318309828236, + "learning_rate": 9.55907991718162e-06, + "loss": 0.3073, + "step": 4017 + }, + { + "epoch": 0.3214335713285734, + "grad_norm": 0.32768885424418126, + "learning_rate": 9.558813912927692e-06, + "loss": 0.2688, + "step": 4018 + }, + { + "epoch": 0.32151356972860545, + "grad_norm": 0.34219063500364366, + "learning_rate": 9.558547832162017e-06, + "loss": 0.2499, + "step": 4019 + }, + { + "epoch": 0.32159356812863743, + "grad_norm": 0.28114818437122263, + "learning_rate": 9.558281674889058e-06, + "loss": 0.2848, + "step": 4020 + }, + { + "epoch": 0.3216735665286694, + "grad_norm": 0.2765459146319245, + "learning_rate": 9.558015441113285e-06, + "loss": 0.3288, + "step": 4021 + }, + { + "epoch": 0.32175356492870144, + "grad_norm": 0.3698007664091243, + "learning_rate": 9.557749130839164e-06, + "loss": 0.2984, + "step": 4022 + }, + { + "epoch": 0.3218335633287334, + "grad_norm": 0.32412915405056036, + "learning_rate": 9.557482744071166e-06, + "loss": 0.2531, + "step": 4023 + }, + { + "epoch": 0.32191356172876545, + "grad_norm": 0.29606563834974864, + "learning_rate": 9.557216280813764e-06, + "loss": 0.3128, + "step": 4024 + }, + { + "epoch": 0.32199356012879743, + "grad_norm": 0.3054948339671617, + "learning_rate": 9.556949741071423e-06, + "loss": 0.2959, + "step": 4025 + }, + { + "epoch": 0.3220735585288294, + "grad_norm": 0.2553380885639412, + "learning_rate": 9.556683124848624e-06, + "loss": 0.3212, + "step": 4026 + }, + { + "epoch": 0.32215355692886144, + "grad_norm": 0.264916518683339, + "learning_rate": 9.556416432149838e-06, + "loss": 0.3202, + "step": 4027 + }, + { + "epoch": 0.3222335553288934, + "grad_norm": 0.2557577568551388, + "learning_rate": 9.556149662979541e-06, + "loss": 0.3353, + "step": 4028 + }, + { + "epoch": 0.3223135537289254, + "grad_norm": 0.3052677610730034, + "learning_rate": 9.555882817342212e-06, + "loss": 0.2578, + "step": 4029 + }, + { + "epoch": 0.32239355212895743, + "grad_norm": 0.32713931659198986, + "learning_rate": 9.555615895242327e-06, + "loss": 0.2799, + "step": 4030 + }, + { + "epoch": 0.3224735505289894, + "grad_norm": 0.2900546404195957, + "learning_rate": 9.555348896684366e-06, + "loss": 0.2914, + "step": 4031 + }, + { + "epoch": 0.32255354892902144, + "grad_norm": 0.3197630355105231, + "learning_rate": 9.555081821672814e-06, + "loss": 0.2684, + "step": 4032 + }, + { + "epoch": 0.3226335473290534, + "grad_norm": 0.33345881717726855, + "learning_rate": 9.55481467021215e-06, + "loss": 0.3117, + "step": 4033 + }, + { + "epoch": 0.3227135457290854, + "grad_norm": 0.3384081464782839, + "learning_rate": 9.554547442306858e-06, + "loss": 0.2861, + "step": 4034 + }, + { + "epoch": 0.3227935441291174, + "grad_norm": 0.25492748306734875, + "learning_rate": 9.554280137961423e-06, + "loss": 0.324, + "step": 4035 + }, + { + "epoch": 0.3228735425291494, + "grad_norm": 0.29424950055754046, + "learning_rate": 9.55401275718033e-06, + "loss": 0.2669, + "step": 4036 + }, + { + "epoch": 0.32295354092918144, + "grad_norm": 0.2971300509386836, + "learning_rate": 9.553745299968071e-06, + "loss": 0.3262, + "step": 4037 + }, + { + "epoch": 0.3230335393292134, + "grad_norm": 0.2861000213198556, + "learning_rate": 9.553477766329132e-06, + "loss": 0.2954, + "step": 4038 + }, + { + "epoch": 0.3231135377292454, + "grad_norm": 0.268185361380556, + "learning_rate": 9.553210156268e-06, + "loss": 0.2684, + "step": 4039 + }, + { + "epoch": 0.3231935361292774, + "grad_norm": 0.31192653631494927, + "learning_rate": 9.552942469789172e-06, + "loss": 0.2562, + "step": 4040 + }, + { + "epoch": 0.3232735345293094, + "grad_norm": 0.30551885330142187, + "learning_rate": 9.552674706897136e-06, + "loss": 0.3008, + "step": 4041 + }, + { + "epoch": 0.32335353292934144, + "grad_norm": 0.23476156166918968, + "learning_rate": 9.552406867596388e-06, + "loss": 0.296, + "step": 4042 + }, + { + "epoch": 0.3234335313293734, + "grad_norm": 0.2775178012807703, + "learning_rate": 9.552138951891425e-06, + "loss": 0.2799, + "step": 4043 + }, + { + "epoch": 0.3235135297294054, + "grad_norm": 0.33839652542386145, + "learning_rate": 9.55187095978674e-06, + "loss": 0.2684, + "step": 4044 + }, + { + "epoch": 0.3235935281294374, + "grad_norm": 0.30491407323811603, + "learning_rate": 9.551602891286833e-06, + "loss": 0.2693, + "step": 4045 + }, + { + "epoch": 0.3236735265294694, + "grad_norm": 0.3055082154736578, + "learning_rate": 9.551334746396203e-06, + "loss": 0.2857, + "step": 4046 + }, + { + "epoch": 0.32375352492950144, + "grad_norm": 0.2710121364574758, + "learning_rate": 9.551066525119349e-06, + "loss": 0.2888, + "step": 4047 + }, + { + "epoch": 0.3238335233295334, + "grad_norm": 0.2757055268244437, + "learning_rate": 9.550798227460774e-06, + "loss": 0.2923, + "step": 4048 + }, + { + "epoch": 0.3239135217295654, + "grad_norm": 0.2542502174010691, + "learning_rate": 9.550529853424979e-06, + "loss": 0.2837, + "step": 4049 + }, + { + "epoch": 0.3239935201295974, + "grad_norm": 0.31078202408045963, + "learning_rate": 9.55026140301647e-06, + "loss": 0.2565, + "step": 4050 + }, + { + "epoch": 0.3240735185296294, + "grad_norm": 0.30190144243575673, + "learning_rate": 9.549992876239753e-06, + "loss": 0.2638, + "step": 4051 + }, + { + "epoch": 0.32415351692966143, + "grad_norm": 0.3538883893493791, + "learning_rate": 9.549724273099333e-06, + "loss": 0.2682, + "step": 4052 + }, + { + "epoch": 0.3242335153296934, + "grad_norm": 1.2109482459833467, + "learning_rate": 9.54945559359972e-06, + "loss": 0.2558, + "step": 4053 + }, + { + "epoch": 0.3243135137297254, + "grad_norm": 0.2912686806043731, + "learning_rate": 9.54918683774542e-06, + "loss": 0.2554, + "step": 4054 + }, + { + "epoch": 0.3243935121297574, + "grad_norm": 0.3146192167625131, + "learning_rate": 9.548918005540948e-06, + "loss": 0.2487, + "step": 4055 + }, + { + "epoch": 0.3244735105297894, + "grad_norm": 0.21844768163006562, + "learning_rate": 9.548649096990811e-06, + "loss": 0.3398, + "step": 4056 + }, + { + "epoch": 0.3245535089298214, + "grad_norm": 0.6801558014844721, + "learning_rate": 9.548380112099527e-06, + "loss": 0.2867, + "step": 4057 + }, + { + "epoch": 0.3246335073298534, + "grad_norm": 0.8054304579422645, + "learning_rate": 9.548111050871607e-06, + "loss": 0.2575, + "step": 4058 + }, + { + "epoch": 0.3247135057298854, + "grad_norm": 0.33813185412173413, + "learning_rate": 9.547841913311567e-06, + "loss": 0.2887, + "step": 4059 + }, + { + "epoch": 0.3247935041299174, + "grad_norm": 0.2836560635347328, + "learning_rate": 9.547572699423927e-06, + "loss": 0.29, + "step": 4060 + }, + { + "epoch": 0.3248735025299494, + "grad_norm": 0.2884220151799123, + "learning_rate": 9.547303409213202e-06, + "loss": 0.3039, + "step": 4061 + }, + { + "epoch": 0.3249535009299814, + "grad_norm": 0.2698166056217857, + "learning_rate": 9.547034042683913e-06, + "loss": 0.283, + "step": 4062 + }, + { + "epoch": 0.3250334993300134, + "grad_norm": 0.2960669444474324, + "learning_rate": 9.546764599840581e-06, + "loss": 0.2904, + "step": 4063 + }, + { + "epoch": 0.3251134977300454, + "grad_norm": 0.3221641557980718, + "learning_rate": 9.546495080687727e-06, + "loss": 0.2635, + "step": 4064 + }, + { + "epoch": 0.3251934961300774, + "grad_norm": 0.3134634361148778, + "learning_rate": 9.546225485229876e-06, + "loss": 0.2549, + "step": 4065 + }, + { + "epoch": 0.3252734945301094, + "grad_norm": 0.24875528951106704, + "learning_rate": 9.545955813471552e-06, + "loss": 0.34, + "step": 4066 + }, + { + "epoch": 0.3253534929301414, + "grad_norm": 0.3319478340152881, + "learning_rate": 9.545686065417279e-06, + "loss": 0.2738, + "step": 4067 + }, + { + "epoch": 0.3254334913301734, + "grad_norm": 0.2507483284470259, + "learning_rate": 9.545416241071588e-06, + "loss": 0.3166, + "step": 4068 + }, + { + "epoch": 0.3255134897302054, + "grad_norm": 0.301063661839558, + "learning_rate": 9.545146340439005e-06, + "loss": 0.2766, + "step": 4069 + }, + { + "epoch": 0.3255934881302374, + "grad_norm": 0.3200789615898504, + "learning_rate": 9.54487636352406e-06, + "loss": 0.3085, + "step": 4070 + }, + { + "epoch": 0.3256734865302694, + "grad_norm": 0.3594942449972947, + "learning_rate": 9.544606310331284e-06, + "loss": 0.2719, + "step": 4071 + }, + { + "epoch": 0.3257534849303014, + "grad_norm": 0.2881992736432892, + "learning_rate": 9.544336180865212e-06, + "loss": 0.279, + "step": 4072 + }, + { + "epoch": 0.3258334833303334, + "grad_norm": 0.26913004486202013, + "learning_rate": 9.544065975130375e-06, + "loss": 0.3312, + "step": 4073 + }, + { + "epoch": 0.3259134817303654, + "grad_norm": 0.3441375528930245, + "learning_rate": 9.543795693131306e-06, + "loss": 0.2635, + "step": 4074 + }, + { + "epoch": 0.3259934801303974, + "grad_norm": 0.32673878317588556, + "learning_rate": 9.543525334872546e-06, + "loss": 0.2624, + "step": 4075 + }, + { + "epoch": 0.3260734785304294, + "grad_norm": 0.24107340829890755, + "learning_rate": 9.54325490035863e-06, + "loss": 0.339, + "step": 4076 + }, + { + "epoch": 0.3261534769304614, + "grad_norm": 0.2923915734715394, + "learning_rate": 9.542984389594096e-06, + "loss": 0.2954, + "step": 4077 + }, + { + "epoch": 0.3262334753304934, + "grad_norm": 0.3001303169206607, + "learning_rate": 9.542713802583485e-06, + "loss": 0.2827, + "step": 4078 + }, + { + "epoch": 0.3263134737305254, + "grad_norm": 0.27984853705181556, + "learning_rate": 9.54244313933134e-06, + "loss": 0.3207, + "step": 4079 + }, + { + "epoch": 0.3263934721305574, + "grad_norm": 0.28644662960675643, + "learning_rate": 9.5421723998422e-06, + "loss": 0.29, + "step": 4080 + }, + { + "epoch": 0.3264734705305894, + "grad_norm": 0.35188282002655014, + "learning_rate": 9.541901584120612e-06, + "loss": 0.2607, + "step": 4081 + }, + { + "epoch": 0.32655346893062137, + "grad_norm": 0.31466364220696796, + "learning_rate": 9.541630692171119e-06, + "loss": 0.2543, + "step": 4082 + }, + { + "epoch": 0.3266334673306534, + "grad_norm": 0.3156907364146312, + "learning_rate": 9.541359723998268e-06, + "loss": 0.306, + "step": 4083 + }, + { + "epoch": 0.3267134657306854, + "grad_norm": 0.26175058440313936, + "learning_rate": 9.541088679606609e-06, + "loss": 0.3197, + "step": 4084 + }, + { + "epoch": 0.32679346413071736, + "grad_norm": 0.3371694255451264, + "learning_rate": 9.540817559000688e-06, + "loss": 0.2635, + "step": 4085 + }, + { + "epoch": 0.3268734625307494, + "grad_norm": 0.3332781085233276, + "learning_rate": 9.540546362185056e-06, + "loss": 0.2957, + "step": 4086 + }, + { + "epoch": 0.32695346093078137, + "grad_norm": 0.2315063819203408, + "learning_rate": 9.540275089164266e-06, + "loss": 0.3227, + "step": 4087 + }, + { + "epoch": 0.3270334593308134, + "grad_norm": 0.2521063676269978, + "learning_rate": 9.54000373994287e-06, + "loss": 0.3165, + "step": 4088 + }, + { + "epoch": 0.3271134577308454, + "grad_norm": 0.42509382236989884, + "learning_rate": 9.539732314525421e-06, + "loss": 0.2803, + "step": 4089 + }, + { + "epoch": 0.32719345613087736, + "grad_norm": 0.2743381982377584, + "learning_rate": 9.539460812916477e-06, + "loss": 0.279, + "step": 4090 + }, + { + "epoch": 0.3272734545309094, + "grad_norm": 0.2764559811333477, + "learning_rate": 9.539189235120591e-06, + "loss": 0.2919, + "step": 4091 + }, + { + "epoch": 0.32735345293094137, + "grad_norm": 0.31707134596405084, + "learning_rate": 9.538917581142325e-06, + "loss": 0.2854, + "step": 4092 + }, + { + "epoch": 0.3274334513309734, + "grad_norm": 0.23850551013460794, + "learning_rate": 9.538645850986235e-06, + "loss": 0.3225, + "step": 4093 + }, + { + "epoch": 0.3275134497310054, + "grad_norm": 0.31008110368471076, + "learning_rate": 9.538374044656884e-06, + "loss": 0.2935, + "step": 4094 + }, + { + "epoch": 0.32759344813103736, + "grad_norm": 0.18460543601194526, + "learning_rate": 9.538102162158832e-06, + "loss": 0.3357, + "step": 4095 + }, + { + "epoch": 0.3276734465310694, + "grad_norm": 0.2833355482826337, + "learning_rate": 9.537830203496642e-06, + "loss": 0.2967, + "step": 4096 + }, + { + "epoch": 0.32775344493110137, + "grad_norm": 0.20479504316967445, + "learning_rate": 9.53755816867488e-06, + "loss": 0.3417, + "step": 4097 + }, + { + "epoch": 0.3278334433311334, + "grad_norm": 0.2290054765598029, + "learning_rate": 9.53728605769811e-06, + "loss": 0.3108, + "step": 4098 + }, + { + "epoch": 0.3279134417311654, + "grad_norm": 0.36893586727068906, + "learning_rate": 9.5370138705709e-06, + "loss": 0.3165, + "step": 4099 + }, + { + "epoch": 0.32799344013119736, + "grad_norm": 0.3168477715819342, + "learning_rate": 9.536741607297817e-06, + "loss": 0.2582, + "step": 4100 + }, + { + "epoch": 0.3280734385312294, + "grad_norm": 0.31669431405959253, + "learning_rate": 9.536469267883432e-06, + "loss": 0.275, + "step": 4101 + }, + { + "epoch": 0.32815343693126137, + "grad_norm": 0.31764342164195486, + "learning_rate": 9.536196852332315e-06, + "loss": 0.2634, + "step": 4102 + }, + { + "epoch": 0.3282334353312934, + "grad_norm": 0.6425278933222287, + "learning_rate": 9.535924360649038e-06, + "loss": 0.2527, + "step": 4103 + }, + { + "epoch": 0.3283134337313254, + "grad_norm": 0.29581621755885257, + "learning_rate": 9.535651792838173e-06, + "loss": 0.2591, + "step": 4104 + }, + { + "epoch": 0.32839343213135735, + "grad_norm": 0.30118869282274835, + "learning_rate": 9.535379148904297e-06, + "loss": 0.3032, + "step": 4105 + }, + { + "epoch": 0.3284734305313894, + "grad_norm": 0.2830920529137497, + "learning_rate": 9.535106428851984e-06, + "loss": 0.2854, + "step": 4106 + }, + { + "epoch": 0.32855342893142137, + "grad_norm": 0.2996699154339364, + "learning_rate": 9.534833632685813e-06, + "loss": 0.2743, + "step": 4107 + }, + { + "epoch": 0.3286334273314534, + "grad_norm": 0.270431222823684, + "learning_rate": 9.534560760410361e-06, + "loss": 0.3319, + "step": 4108 + }, + { + "epoch": 0.3287134257314854, + "grad_norm": 0.3224130952289341, + "learning_rate": 9.534287812030207e-06, + "loss": 0.2665, + "step": 4109 + }, + { + "epoch": 0.32879342413151735, + "grad_norm": 0.335115943467358, + "learning_rate": 9.534014787549932e-06, + "loss": 0.2678, + "step": 4110 + }, + { + "epoch": 0.3288734225315494, + "grad_norm": 0.2500551084012514, + "learning_rate": 9.533741686974122e-06, + "loss": 0.3124, + "step": 4111 + }, + { + "epoch": 0.32895342093158136, + "grad_norm": 0.32179509747836377, + "learning_rate": 9.533468510307356e-06, + "loss": 0.2517, + "step": 4112 + }, + { + "epoch": 0.32903341933161334, + "grad_norm": 0.25092734120252885, + "learning_rate": 9.53319525755422e-06, + "loss": 0.3217, + "step": 4113 + }, + { + "epoch": 0.3291134177316454, + "grad_norm": 0.3137999338440026, + "learning_rate": 9.532921928719301e-06, + "loss": 0.2451, + "step": 4114 + }, + { + "epoch": 0.32919341613167735, + "grad_norm": 0.30176603738657304, + "learning_rate": 9.532648523807186e-06, + "loss": 0.2831, + "step": 4115 + }, + { + "epoch": 0.3292734145317094, + "grad_norm": 0.27007076204482405, + "learning_rate": 9.532375042822464e-06, + "loss": 0.2957, + "step": 4116 + }, + { + "epoch": 0.32935341293174136, + "grad_norm": 0.33451915805712173, + "learning_rate": 9.532101485769723e-06, + "loss": 0.2637, + "step": 4117 + }, + { + "epoch": 0.32943341133177334, + "grad_norm": 0.31077902981085714, + "learning_rate": 9.531827852653556e-06, + "loss": 0.296, + "step": 4118 + }, + { + "epoch": 0.3295134097318054, + "grad_norm": 0.32491363700130044, + "learning_rate": 9.531554143478556e-06, + "loss": 0.3041, + "step": 4119 + }, + { + "epoch": 0.32959340813183735, + "grad_norm": 0.28536976266890873, + "learning_rate": 9.531280358249315e-06, + "loss": 0.2966, + "step": 4120 + }, + { + "epoch": 0.3296734065318694, + "grad_norm": 0.32700786541367605, + "learning_rate": 9.53100649697043e-06, + "loss": 0.2725, + "step": 4121 + }, + { + "epoch": 0.32975340493190136, + "grad_norm": 0.25124382151309893, + "learning_rate": 9.530732559646494e-06, + "loss": 0.3409, + "step": 4122 + }, + { + "epoch": 0.32983340333193334, + "grad_norm": 0.32287087468751224, + "learning_rate": 9.530458546282108e-06, + "loss": 0.2676, + "step": 4123 + }, + { + "epoch": 0.32991340173196537, + "grad_norm": 0.2778814747653328, + "learning_rate": 9.530184456881869e-06, + "loss": 0.3063, + "step": 4124 + }, + { + "epoch": 0.32999340013199735, + "grad_norm": 0.32167136941954166, + "learning_rate": 9.529910291450377e-06, + "loss": 0.3212, + "step": 4125 + }, + { + "epoch": 0.3300733985320294, + "grad_norm": 0.2794985077192064, + "learning_rate": 9.529636049992235e-06, + "loss": 0.3329, + "step": 4126 + }, + { + "epoch": 0.33015339693206136, + "grad_norm": 0.28120960024259806, + "learning_rate": 9.529361732512044e-06, + "loss": 0.2799, + "step": 4127 + }, + { + "epoch": 0.33023339533209334, + "grad_norm": 0.363779403437616, + "learning_rate": 9.52908733901441e-06, + "loss": 0.2652, + "step": 4128 + }, + { + "epoch": 0.33031339373212537, + "grad_norm": 0.3151173668877767, + "learning_rate": 9.528812869503934e-06, + "loss": 0.2662, + "step": 4129 + }, + { + "epoch": 0.33039339213215735, + "grad_norm": 0.2827964309593129, + "learning_rate": 9.528538323985228e-06, + "loss": 0.2933, + "step": 4130 + }, + { + "epoch": 0.3304733905321894, + "grad_norm": 0.2799196849385178, + "learning_rate": 9.528263702462894e-06, + "loss": 0.2994, + "step": 4131 + }, + { + "epoch": 0.33055338893222136, + "grad_norm": 0.31865835119181024, + "learning_rate": 9.527989004941547e-06, + "loss": 0.2612, + "step": 4132 + }, + { + "epoch": 0.33063338733225334, + "grad_norm": 0.3236072857927237, + "learning_rate": 9.527714231425793e-06, + "loss": 0.2374, + "step": 4133 + }, + { + "epoch": 0.33071338573228537, + "grad_norm": 0.326599885644297, + "learning_rate": 9.527439381920245e-06, + "loss": 0.2634, + "step": 4134 + }, + { + "epoch": 0.33079338413231735, + "grad_norm": 0.33752876679304256, + "learning_rate": 9.527164456429517e-06, + "loss": 0.2763, + "step": 4135 + }, + { + "epoch": 0.3308733825323494, + "grad_norm": 0.3840046524221778, + "learning_rate": 9.52688945495822e-06, + "loss": 0.2552, + "step": 4136 + }, + { + "epoch": 0.33095338093238136, + "grad_norm": 0.3205375258323118, + "learning_rate": 9.526614377510971e-06, + "loss": 0.2438, + "step": 4137 + }, + { + "epoch": 0.33103337933241334, + "grad_norm": 0.3047620324022224, + "learning_rate": 9.526339224092389e-06, + "loss": 0.2609, + "step": 4138 + }, + { + "epoch": 0.33111337773244537, + "grad_norm": 0.27598933560230243, + "learning_rate": 9.526063994707091e-06, + "loss": 0.2764, + "step": 4139 + }, + { + "epoch": 0.33119337613247735, + "grad_norm": 0.28627632148415, + "learning_rate": 9.525788689359694e-06, + "loss": 0.2779, + "step": 4140 + }, + { + "epoch": 0.3312733745325093, + "grad_norm": 0.34462279105239907, + "learning_rate": 9.525513308054818e-06, + "loss": 0.2699, + "step": 4141 + }, + { + "epoch": 0.33135337293254136, + "grad_norm": 0.23289918828079942, + "learning_rate": 9.525237850797088e-06, + "loss": 0.3186, + "step": 4142 + }, + { + "epoch": 0.33143337133257333, + "grad_norm": 0.22948182726099486, + "learning_rate": 9.524962317591128e-06, + "loss": 0.3098, + "step": 4143 + }, + { + "epoch": 0.33151336973260537, + "grad_norm": 0.34862687906530354, + "learning_rate": 9.524686708441558e-06, + "loss": 0.2758, + "step": 4144 + }, + { + "epoch": 0.33159336813263735, + "grad_norm": 0.3352335778407664, + "learning_rate": 9.524411023353007e-06, + "loss": 0.2572, + "step": 4145 + }, + { + "epoch": 0.3316733665326693, + "grad_norm": 0.24622626533941536, + "learning_rate": 9.524135262330098e-06, + "loss": 0.299, + "step": 4146 + }, + { + "epoch": 0.33175336493270136, + "grad_norm": 0.35289577127501226, + "learning_rate": 9.523859425377464e-06, + "loss": 0.2965, + "step": 4147 + }, + { + "epoch": 0.33183336333273333, + "grad_norm": 0.3315888168707551, + "learning_rate": 9.523583512499731e-06, + "loss": 0.249, + "step": 4148 + }, + { + "epoch": 0.33191336173276537, + "grad_norm": 0.26650730921260624, + "learning_rate": 9.523307523701532e-06, + "loss": 0.2717, + "step": 4149 + }, + { + "epoch": 0.33199336013279734, + "grad_norm": 0.44258672156889756, + "learning_rate": 9.523031458987498e-06, + "loss": 0.2636, + "step": 4150 + }, + { + "epoch": 0.3320733585328293, + "grad_norm": 0.27286703746738733, + "learning_rate": 9.52275531836226e-06, + "loss": 0.2829, + "step": 4151 + }, + { + "epoch": 0.33215335693286135, + "grad_norm": 0.2936453029520197, + "learning_rate": 9.522479101830457e-06, + "loss": 0.271, + "step": 4152 + }, + { + "epoch": 0.33223335533289333, + "grad_norm": 0.2987200485579244, + "learning_rate": 9.522202809396721e-06, + "loss": 0.2928, + "step": 4153 + }, + { + "epoch": 0.33231335373292537, + "grad_norm": 0.26363995176236205, + "learning_rate": 9.52192644106569e-06, + "loss": 0.3121, + "step": 4154 + }, + { + "epoch": 0.33239335213295734, + "grad_norm": 0.2737718752816055, + "learning_rate": 9.521649996842006e-06, + "loss": 0.2779, + "step": 4155 + }, + { + "epoch": 0.3324733505329893, + "grad_norm": 0.3025786287236667, + "learning_rate": 9.521373476730303e-06, + "loss": 0.2857, + "step": 4156 + }, + { + "epoch": 0.33255334893302135, + "grad_norm": 0.2498331208313307, + "learning_rate": 9.521096880735226e-06, + "loss": 0.3341, + "step": 4157 + }, + { + "epoch": 0.33263334733305333, + "grad_norm": 0.3096483913169192, + "learning_rate": 9.520820208861415e-06, + "loss": 0.2429, + "step": 4158 + }, + { + "epoch": 0.33271334573308536, + "grad_norm": 0.34253369294869657, + "learning_rate": 9.520543461113514e-06, + "loss": 0.2591, + "step": 4159 + }, + { + "epoch": 0.33279334413311734, + "grad_norm": 0.30211663780516795, + "learning_rate": 9.520266637496167e-06, + "loss": 0.2826, + "step": 4160 + }, + { + "epoch": 0.3328733425331493, + "grad_norm": 0.2851274268699115, + "learning_rate": 9.519989738014022e-06, + "loss": 0.2868, + "step": 4161 + }, + { + "epoch": 0.33295334093318135, + "grad_norm": 0.26199754508458256, + "learning_rate": 9.519712762671724e-06, + "loss": 0.3134, + "step": 4162 + }, + { + "epoch": 0.33303333933321333, + "grad_norm": 0.32014163321540284, + "learning_rate": 9.519435711473926e-06, + "loss": 0.2939, + "step": 4163 + }, + { + "epoch": 0.33311333773324536, + "grad_norm": 0.2751728163060749, + "learning_rate": 9.519158584425271e-06, + "loss": 0.2912, + "step": 4164 + }, + { + "epoch": 0.33319333613327734, + "grad_norm": 0.2766209626063788, + "learning_rate": 9.518881381530415e-06, + "loss": 0.2821, + "step": 4165 + }, + { + "epoch": 0.3332733345333093, + "grad_norm": 0.33615046433294693, + "learning_rate": 9.51860410279401e-06, + "loss": 0.2642, + "step": 4166 + }, + { + "epoch": 0.33335333293334135, + "grad_norm": 0.38223601416577, + "learning_rate": 9.518326748220707e-06, + "loss": 0.3143, + "step": 4167 + }, + { + "epoch": 0.33343333133337333, + "grad_norm": 0.22769130761737674, + "learning_rate": 9.518049317815163e-06, + "loss": 0.3347, + "step": 4168 + }, + { + "epoch": 0.3335133297334053, + "grad_norm": 0.30242582541347013, + "learning_rate": 9.517771811582033e-06, + "loss": 0.2326, + "step": 4169 + }, + { + "epoch": 0.33359332813343734, + "grad_norm": 0.2828283456380088, + "learning_rate": 9.517494229525976e-06, + "loss": 0.2953, + "step": 4170 + }, + { + "epoch": 0.3336733265334693, + "grad_norm": 0.29973868634483586, + "learning_rate": 9.51721657165165e-06, + "loss": 0.2398, + "step": 4171 + }, + { + "epoch": 0.33375332493350135, + "grad_norm": 0.29402402286140816, + "learning_rate": 9.516938837963714e-06, + "loss": 0.2885, + "step": 4172 + }, + { + "epoch": 0.3338333233335333, + "grad_norm": 0.30846077669433797, + "learning_rate": 9.51666102846683e-06, + "loss": 0.2959, + "step": 4173 + }, + { + "epoch": 0.3339133217335653, + "grad_norm": 0.32748474666219946, + "learning_rate": 9.516383143165662e-06, + "loss": 0.2507, + "step": 4174 + }, + { + "epoch": 0.33399332013359734, + "grad_norm": 0.32312020400649577, + "learning_rate": 9.516105182064872e-06, + "loss": 0.2617, + "step": 4175 + }, + { + "epoch": 0.3340733185336293, + "grad_norm": 0.3067508970492394, + "learning_rate": 9.515827145169128e-06, + "loss": 0.2549, + "step": 4176 + }, + { + "epoch": 0.33415331693366135, + "grad_norm": 0.3599965355577617, + "learning_rate": 9.515549032483091e-06, + "loss": 0.2371, + "step": 4177 + }, + { + "epoch": 0.3342333153336933, + "grad_norm": 0.34105268263584393, + "learning_rate": 9.515270844011433e-06, + "loss": 0.2482, + "step": 4178 + }, + { + "epoch": 0.3343133137337253, + "grad_norm": 0.22538237070756553, + "learning_rate": 9.51499257975882e-06, + "loss": 0.3588, + "step": 4179 + }, + { + "epoch": 0.33439331213375734, + "grad_norm": 0.2735725628756655, + "learning_rate": 9.514714239729925e-06, + "loss": 0.2804, + "step": 4180 + }, + { + "epoch": 0.3344733105337893, + "grad_norm": 0.31888926636045983, + "learning_rate": 9.514435823929418e-06, + "loss": 0.2822, + "step": 4181 + }, + { + "epoch": 0.33455330893382135, + "grad_norm": 0.31874364811327177, + "learning_rate": 9.514157332361971e-06, + "loss": 0.2595, + "step": 4182 + }, + { + "epoch": 0.3346333073338533, + "grad_norm": 0.29127867918140804, + "learning_rate": 9.51387876503226e-06, + "loss": 0.2949, + "step": 4183 + }, + { + "epoch": 0.3347133057338853, + "grad_norm": 0.25089558064270673, + "learning_rate": 9.51360012194496e-06, + "loss": 0.3177, + "step": 4184 + }, + { + "epoch": 0.33479330413391734, + "grad_norm": 0.49268450682797754, + "learning_rate": 9.513321403104745e-06, + "loss": 0.3152, + "step": 4185 + }, + { + "epoch": 0.3348733025339493, + "grad_norm": 0.24502973088370322, + "learning_rate": 9.513042608516296e-06, + "loss": 0.3036, + "step": 4186 + }, + { + "epoch": 0.33495330093398135, + "grad_norm": 0.7535335180494861, + "learning_rate": 9.512763738184289e-06, + "loss": 0.2516, + "step": 4187 + }, + { + "epoch": 0.3350332993340133, + "grad_norm": 0.2661974220009502, + "learning_rate": 9.512484792113408e-06, + "loss": 0.2823, + "step": 4188 + }, + { + "epoch": 0.3351132977340453, + "grad_norm": 0.36540229322224166, + "learning_rate": 9.51220577030833e-06, + "loss": 0.2721, + "step": 4189 + }, + { + "epoch": 0.33519329613407733, + "grad_norm": 0.28545236376950817, + "learning_rate": 9.511926672773742e-06, + "loss": 0.2977, + "step": 4190 + }, + { + "epoch": 0.3352732945341093, + "grad_norm": 0.278134527846739, + "learning_rate": 9.511647499514327e-06, + "loss": 0.3206, + "step": 4191 + }, + { + "epoch": 0.33535329293414134, + "grad_norm": 0.27688185887636607, + "learning_rate": 9.511368250534769e-06, + "loss": 0.3029, + "step": 4192 + }, + { + "epoch": 0.3354332913341733, + "grad_norm": 0.2769087920514703, + "learning_rate": 9.511088925839758e-06, + "loss": 0.2971, + "step": 4193 + }, + { + "epoch": 0.3355132897342053, + "grad_norm": 0.35035219106707854, + "learning_rate": 9.510809525433977e-06, + "loss": 0.2861, + "step": 4194 + }, + { + "epoch": 0.33559328813423733, + "grad_norm": 0.2969801694667227, + "learning_rate": 9.51053004932212e-06, + "loss": 0.255, + "step": 4195 + }, + { + "epoch": 0.3356732865342693, + "grad_norm": 0.2948339751533439, + "learning_rate": 9.510250497508873e-06, + "loss": 0.2501, + "step": 4196 + }, + { + "epoch": 0.3357532849343013, + "grad_norm": 0.29281113827003824, + "learning_rate": 9.509970869998933e-06, + "loss": 0.2566, + "step": 4197 + }, + { + "epoch": 0.3358332833343333, + "grad_norm": 0.2465854749404492, + "learning_rate": 9.50969116679699e-06, + "loss": 0.3562, + "step": 4198 + }, + { + "epoch": 0.3359132817343653, + "grad_norm": 0.26811917766098803, + "learning_rate": 9.509411387907738e-06, + "loss": 0.3219, + "step": 4199 + }, + { + "epoch": 0.33599328013439733, + "grad_norm": 0.2905191712206117, + "learning_rate": 9.509131533335874e-06, + "loss": 0.271, + "step": 4200 + }, + { + "epoch": 0.3360732785344293, + "grad_norm": 0.2787838427873101, + "learning_rate": 9.508851603086094e-06, + "loss": 0.304, + "step": 4201 + }, + { + "epoch": 0.3361532769344613, + "grad_norm": 0.42892097190236117, + "learning_rate": 9.508571597163095e-06, + "loss": 0.2926, + "step": 4202 + }, + { + "epoch": 0.3362332753344933, + "grad_norm": 0.30485760616483865, + "learning_rate": 9.50829151557158e-06, + "loss": 0.2813, + "step": 4203 + }, + { + "epoch": 0.3363132737345253, + "grad_norm": 0.3181552363455147, + "learning_rate": 9.508011358316244e-06, + "loss": 0.2841, + "step": 4204 + }, + { + "epoch": 0.33639327213455733, + "grad_norm": 0.27926069763881844, + "learning_rate": 9.507731125401795e-06, + "loss": 0.2732, + "step": 4205 + }, + { + "epoch": 0.3364732705345893, + "grad_norm": 0.26992277033561796, + "learning_rate": 9.507450816832934e-06, + "loss": 0.2834, + "step": 4206 + }, + { + "epoch": 0.3365532689346213, + "grad_norm": 0.3651261141929914, + "learning_rate": 9.507170432614364e-06, + "loss": 0.2931, + "step": 4207 + }, + { + "epoch": 0.3366332673346533, + "grad_norm": 0.3161952853209081, + "learning_rate": 9.506889972750792e-06, + "loss": 0.2935, + "step": 4208 + }, + { + "epoch": 0.3367132657346853, + "grad_norm": 0.25868257943383854, + "learning_rate": 9.506609437246924e-06, + "loss": 0.263, + "step": 4209 + }, + { + "epoch": 0.33679326413471733, + "grad_norm": 0.29382827351934276, + "learning_rate": 9.506328826107472e-06, + "loss": 0.3044, + "step": 4210 + }, + { + "epoch": 0.3368732625347493, + "grad_norm": 0.29236024368946345, + "learning_rate": 9.506048139337142e-06, + "loss": 0.2643, + "step": 4211 + }, + { + "epoch": 0.3369532609347813, + "grad_norm": 0.2730083195624845, + "learning_rate": 9.505767376940642e-06, + "loss": 0.2759, + "step": 4212 + }, + { + "epoch": 0.3370332593348133, + "grad_norm": 0.27804821736529406, + "learning_rate": 9.50548653892269e-06, + "loss": 0.3055, + "step": 4213 + }, + { + "epoch": 0.3371132577348453, + "grad_norm": 0.5524963149748693, + "learning_rate": 9.505205625288e-06, + "loss": 0.2849, + "step": 4214 + }, + { + "epoch": 0.33719325613487733, + "grad_norm": 0.32688167113078737, + "learning_rate": 9.504924636041279e-06, + "loss": 0.261, + "step": 4215 + }, + { + "epoch": 0.3372732545349093, + "grad_norm": 0.3011645915518335, + "learning_rate": 9.50464357118725e-06, + "loss": 0.2342, + "step": 4216 + }, + { + "epoch": 0.3373532529349413, + "grad_norm": 0.32563717515424556, + "learning_rate": 9.504362430730627e-06, + "loss": 0.2729, + "step": 4217 + }, + { + "epoch": 0.3374332513349733, + "grad_norm": 0.3172746333595425, + "learning_rate": 9.50408121467613e-06, + "loss": 0.2612, + "step": 4218 + }, + { + "epoch": 0.3375132497350053, + "grad_norm": 0.2458830190890694, + "learning_rate": 9.503799923028478e-06, + "loss": 0.3062, + "step": 4219 + }, + { + "epoch": 0.33759324813503727, + "grad_norm": 0.26610409494043075, + "learning_rate": 9.503518555792392e-06, + "loss": 0.2856, + "step": 4220 + }, + { + "epoch": 0.3376732465350693, + "grad_norm": 0.2649896720313164, + "learning_rate": 9.503237112972594e-06, + "loss": 0.3057, + "step": 4221 + }, + { + "epoch": 0.3377532449351013, + "grad_norm": 0.30134189687860263, + "learning_rate": 9.502955594573807e-06, + "loss": 0.2603, + "step": 4222 + }, + { + "epoch": 0.3378332433351333, + "grad_norm": 0.26326425511418927, + "learning_rate": 9.502674000600758e-06, + "loss": 0.29, + "step": 4223 + }, + { + "epoch": 0.3379132417351653, + "grad_norm": 0.35501648588678963, + "learning_rate": 9.50239233105817e-06, + "loss": 0.2467, + "step": 4224 + }, + { + "epoch": 0.33799324013519727, + "grad_norm": 0.2760707274939826, + "learning_rate": 9.502110585950773e-06, + "loss": 0.2915, + "step": 4225 + }, + { + "epoch": 0.3380732385352293, + "grad_norm": 0.29424043954422985, + "learning_rate": 9.501828765283295e-06, + "loss": 0.285, + "step": 4226 + }, + { + "epoch": 0.3381532369352613, + "grad_norm": 0.278064592520865, + "learning_rate": 9.501546869060466e-06, + "loss": 0.3084, + "step": 4227 + }, + { + "epoch": 0.3382332353352933, + "grad_norm": 0.2843033673256028, + "learning_rate": 9.501264897287015e-06, + "loss": 0.3127, + "step": 4228 + }, + { + "epoch": 0.3383132337353253, + "grad_norm": 0.3052274360189814, + "learning_rate": 9.500982849967674e-06, + "loss": 0.3, + "step": 4229 + }, + { + "epoch": 0.33839323213535727, + "grad_norm": 0.3107926570627238, + "learning_rate": 9.500700727107182e-06, + "loss": 0.2561, + "step": 4230 + }, + { + "epoch": 0.3384732305353893, + "grad_norm": 0.3512836376530983, + "learning_rate": 9.50041852871027e-06, + "loss": 0.2458, + "step": 4231 + }, + { + "epoch": 0.3385532289354213, + "grad_norm": 0.2686511713811813, + "learning_rate": 9.500136254781674e-06, + "loss": 0.2898, + "step": 4232 + }, + { + "epoch": 0.3386332273354533, + "grad_norm": 0.2832750130606172, + "learning_rate": 9.499853905326133e-06, + "loss": 0.3042, + "step": 4233 + }, + { + "epoch": 0.3387132257354853, + "grad_norm": 0.36969808215827404, + "learning_rate": 9.499571480348385e-06, + "loss": 0.2454, + "step": 4234 + }, + { + "epoch": 0.33879322413551727, + "grad_norm": 0.33417453707713296, + "learning_rate": 9.49928897985317e-06, + "loss": 0.2564, + "step": 4235 + }, + { + "epoch": 0.3388732225355493, + "grad_norm": 0.300208253199366, + "learning_rate": 9.49900640384523e-06, + "loss": 0.2672, + "step": 4236 + }, + { + "epoch": 0.3389532209355813, + "grad_norm": 0.3058876665008324, + "learning_rate": 9.498723752329304e-06, + "loss": 0.259, + "step": 4237 + }, + { + "epoch": 0.3390332193356133, + "grad_norm": 0.29026773853039556, + "learning_rate": 9.498441025310142e-06, + "loss": 0.2791, + "step": 4238 + }, + { + "epoch": 0.3391132177356453, + "grad_norm": 0.3000904461508221, + "learning_rate": 9.498158222792484e-06, + "loss": 0.2962, + "step": 4239 + }, + { + "epoch": 0.33919321613567727, + "grad_norm": 0.34956776599279343, + "learning_rate": 9.497875344781078e-06, + "loss": 0.2625, + "step": 4240 + }, + { + "epoch": 0.3392732145357093, + "grad_norm": 0.3048818566172097, + "learning_rate": 9.497592391280672e-06, + "loss": 0.2476, + "step": 4241 + }, + { + "epoch": 0.3393532129357413, + "grad_norm": 5.469832629541838, + "learning_rate": 9.497309362296015e-06, + "loss": 0.29, + "step": 4242 + }, + { + "epoch": 0.3394332113357733, + "grad_norm": 0.43755541579944734, + "learning_rate": 9.497026257831856e-06, + "loss": 0.305, + "step": 4243 + }, + { + "epoch": 0.3395132097358053, + "grad_norm": 0.25907917934101155, + "learning_rate": 9.496743077892948e-06, + "loss": 0.2812, + "step": 4244 + }, + { + "epoch": 0.33959320813583727, + "grad_norm": 0.31474947307406675, + "learning_rate": 9.496459822484043e-06, + "loss": 0.2529, + "step": 4245 + }, + { + "epoch": 0.3396732065358693, + "grad_norm": 0.18692514924378942, + "learning_rate": 9.496176491609893e-06, + "loss": 0.3496, + "step": 4246 + }, + { + "epoch": 0.3397532049359013, + "grad_norm": 0.29102379236580905, + "learning_rate": 9.495893085275256e-06, + "loss": 0.2947, + "step": 4247 + }, + { + "epoch": 0.33983320333593325, + "grad_norm": 0.3039095028742169, + "learning_rate": 9.495609603484887e-06, + "loss": 0.2901, + "step": 4248 + }, + { + "epoch": 0.3399132017359653, + "grad_norm": 0.31852993243479677, + "learning_rate": 9.495326046243547e-06, + "loss": 0.2634, + "step": 4249 + }, + { + "epoch": 0.33999320013599726, + "grad_norm": 0.3305617694724181, + "learning_rate": 9.495042413555988e-06, + "loss": 0.2705, + "step": 4250 + }, + { + "epoch": 0.3400731985360293, + "grad_norm": 0.26614369319100134, + "learning_rate": 9.494758705426978e-06, + "loss": 0.3235, + "step": 4251 + }, + { + "epoch": 0.3401531969360613, + "grad_norm": 0.35185872689168907, + "learning_rate": 9.494474921861274e-06, + "loss": 0.2433, + "step": 4252 + }, + { + "epoch": 0.34023319533609325, + "grad_norm": 0.29908461661269914, + "learning_rate": 9.494191062863638e-06, + "loss": 0.3151, + "step": 4253 + }, + { + "epoch": 0.3403131937361253, + "grad_norm": 0.33742140893680217, + "learning_rate": 9.493907128438838e-06, + "loss": 0.2801, + "step": 4254 + }, + { + "epoch": 0.34039319213615726, + "grad_norm": 0.2950131610756389, + "learning_rate": 9.493623118591638e-06, + "loss": 0.2831, + "step": 4255 + }, + { + "epoch": 0.3404731905361893, + "grad_norm": 0.33164400544718636, + "learning_rate": 9.493339033326802e-06, + "loss": 0.2784, + "step": 4256 + }, + { + "epoch": 0.3405531889362213, + "grad_norm": 0.29714970690394904, + "learning_rate": 9.4930548726491e-06, + "loss": 0.2593, + "step": 4257 + }, + { + "epoch": 0.34063318733625325, + "grad_norm": 0.3158193348514893, + "learning_rate": 9.492770636563303e-06, + "loss": 0.2496, + "step": 4258 + }, + { + "epoch": 0.3407131857362853, + "grad_norm": 0.333498635453097, + "learning_rate": 9.492486325074177e-06, + "loss": 0.2493, + "step": 4259 + }, + { + "epoch": 0.34079318413631726, + "grad_norm": 0.28780795148769905, + "learning_rate": 9.492201938186496e-06, + "loss": 0.2911, + "step": 4260 + }, + { + "epoch": 0.3408731825363493, + "grad_norm": 0.3405918675704944, + "learning_rate": 9.491917475905034e-06, + "loss": 0.262, + "step": 4261 + }, + { + "epoch": 0.3409531809363813, + "grad_norm": 0.24380301349587288, + "learning_rate": 9.491632938234563e-06, + "loss": 0.3427, + "step": 4262 + }, + { + "epoch": 0.34103317933641325, + "grad_norm": 0.35064649592418756, + "learning_rate": 9.491348325179861e-06, + "loss": 0.2698, + "step": 4263 + }, + { + "epoch": 0.3411131777364453, + "grad_norm": 0.2855980789980333, + "learning_rate": 9.491063636745704e-06, + "loss": 0.2836, + "step": 4264 + }, + { + "epoch": 0.34119317613647726, + "grad_norm": 0.3542564638270448, + "learning_rate": 9.490778872936867e-06, + "loss": 0.2281, + "step": 4265 + }, + { + "epoch": 0.3412731745365093, + "grad_norm": 0.287689247054273, + "learning_rate": 9.490494033758132e-06, + "loss": 0.2922, + "step": 4266 + }, + { + "epoch": 0.34135317293654127, + "grad_norm": 0.3530141448578932, + "learning_rate": 9.490209119214282e-06, + "loss": 0.2512, + "step": 4267 + }, + { + "epoch": 0.34143317133657325, + "grad_norm": 0.3087465322483459, + "learning_rate": 9.489924129310094e-06, + "loss": 0.2511, + "step": 4268 + }, + { + "epoch": 0.3415131697366053, + "grad_norm": 0.25855183961396433, + "learning_rate": 9.489639064050353e-06, + "loss": 0.3258, + "step": 4269 + }, + { + "epoch": 0.34159316813663726, + "grad_norm": 0.2919818702032761, + "learning_rate": 9.489353923439843e-06, + "loss": 0.2988, + "step": 4270 + }, + { + "epoch": 0.3416731665366693, + "grad_norm": 0.290290736446847, + "learning_rate": 9.48906870748335e-06, + "loss": 0.3047, + "step": 4271 + }, + { + "epoch": 0.34175316493670127, + "grad_norm": 0.4815677362309613, + "learning_rate": 9.48878341618566e-06, + "loss": 0.2404, + "step": 4272 + }, + { + "epoch": 0.34183316333673325, + "grad_norm": 0.2856137811314598, + "learning_rate": 9.488498049551563e-06, + "loss": 0.2878, + "step": 4273 + }, + { + "epoch": 0.3419131617367653, + "grad_norm": 0.30138821353249046, + "learning_rate": 9.488212607585847e-06, + "loss": 0.2926, + "step": 4274 + }, + { + "epoch": 0.34199316013679726, + "grad_norm": 0.31495904239448713, + "learning_rate": 9.487927090293302e-06, + "loss": 0.2538, + "step": 4275 + }, + { + "epoch": 0.34207315853682924, + "grad_norm": 0.34267501041614296, + "learning_rate": 9.487641497678724e-06, + "loss": 0.2479, + "step": 4276 + }, + { + "epoch": 0.34215315693686127, + "grad_norm": 0.3490547029686455, + "learning_rate": 9.4873558297469e-06, + "loss": 0.2785, + "step": 4277 + }, + { + "epoch": 0.34223315533689325, + "grad_norm": 0.3197164288875752, + "learning_rate": 9.487070086502627e-06, + "loss": 0.2519, + "step": 4278 + }, + { + "epoch": 0.3423131537369253, + "grad_norm": 0.2946325375889917, + "learning_rate": 9.486784267950704e-06, + "loss": 0.2941, + "step": 4279 + }, + { + "epoch": 0.34239315213695726, + "grad_norm": 0.32373204956468515, + "learning_rate": 9.486498374095922e-06, + "loss": 0.2749, + "step": 4280 + }, + { + "epoch": 0.34247315053698923, + "grad_norm": 0.33761168843910017, + "learning_rate": 9.486212404943084e-06, + "loss": 0.241, + "step": 4281 + }, + { + "epoch": 0.34255314893702127, + "grad_norm": 0.33387600386572547, + "learning_rate": 9.485926360496988e-06, + "loss": 0.273, + "step": 4282 + }, + { + "epoch": 0.34263314733705325, + "grad_norm": 0.1984838660640497, + "learning_rate": 9.485640240762434e-06, + "loss": 0.3429, + "step": 4283 + }, + { + "epoch": 0.3427131457370853, + "grad_norm": 0.2987928069194664, + "learning_rate": 9.485354045744226e-06, + "loss": 0.2963, + "step": 4284 + }, + { + "epoch": 0.34279314413711726, + "grad_norm": 0.28167266109497824, + "learning_rate": 9.485067775447164e-06, + "loss": 0.2793, + "step": 4285 + }, + { + "epoch": 0.34287314253714923, + "grad_norm": 0.3031111122991727, + "learning_rate": 9.484781429876055e-06, + "loss": 0.237, + "step": 4286 + }, + { + "epoch": 0.34295314093718127, + "grad_norm": 0.25940845328958534, + "learning_rate": 9.484495009035705e-06, + "loss": 0.2803, + "step": 4287 + }, + { + "epoch": 0.34303313933721324, + "grad_norm": 0.27757859822975783, + "learning_rate": 9.48420851293092e-06, + "loss": 0.3137, + "step": 4288 + }, + { + "epoch": 0.3431131377372453, + "grad_norm": 0.30693997414111424, + "learning_rate": 9.483921941566508e-06, + "loss": 0.2383, + "step": 4289 + }, + { + "epoch": 0.34319313613727725, + "grad_norm": 0.29798973746055113, + "learning_rate": 9.48363529494728e-06, + "loss": 0.34, + "step": 4290 + }, + { + "epoch": 0.34327313453730923, + "grad_norm": 0.2754974623863915, + "learning_rate": 9.483348573078046e-06, + "loss": 0.3082, + "step": 4291 + }, + { + "epoch": 0.34335313293734127, + "grad_norm": 0.3553606773548199, + "learning_rate": 9.483061775963618e-06, + "loss": 0.2677, + "step": 4292 + }, + { + "epoch": 0.34343313133737324, + "grad_norm": 0.39162978462433223, + "learning_rate": 9.48277490360881e-06, + "loss": 0.2695, + "step": 4293 + }, + { + "epoch": 0.3435131297374053, + "grad_norm": 0.22946185310349518, + "learning_rate": 9.482487956018436e-06, + "loss": 0.3035, + "step": 4294 + }, + { + "epoch": 0.34359312813743725, + "grad_norm": 0.25586267455064826, + "learning_rate": 9.482200933197312e-06, + "loss": 0.3323, + "step": 4295 + }, + { + "epoch": 0.34367312653746923, + "grad_norm": 0.33422549808169644, + "learning_rate": 9.481913835150256e-06, + "loss": 0.2369, + "step": 4296 + }, + { + "epoch": 0.34375312493750126, + "grad_norm": 0.28977691416052875, + "learning_rate": 9.481626661882084e-06, + "loss": 0.269, + "step": 4297 + }, + { + "epoch": 0.34383312333753324, + "grad_norm": 0.2861293041064024, + "learning_rate": 9.48133941339762e-06, + "loss": 0.2703, + "step": 4298 + }, + { + "epoch": 0.3439131217375653, + "grad_norm": 0.314922485077229, + "learning_rate": 9.48105208970168e-06, + "loss": 0.2665, + "step": 4299 + }, + { + "epoch": 0.34399312013759725, + "grad_norm": 0.30315282636052626, + "learning_rate": 9.480764690799091e-06, + "loss": 0.2652, + "step": 4300 + }, + { + "epoch": 0.34407311853762923, + "grad_norm": 0.2602876280060404, + "learning_rate": 9.480477216694674e-06, + "loss": 0.2677, + "step": 4301 + }, + { + "epoch": 0.34415311693766126, + "grad_norm": 0.2948333413950905, + "learning_rate": 9.480189667393254e-06, + "loss": 0.2896, + "step": 4302 + }, + { + "epoch": 0.34423311533769324, + "grad_norm": 0.2963929806677376, + "learning_rate": 9.479902042899655e-06, + "loss": 0.2517, + "step": 4303 + }, + { + "epoch": 0.3443131137377252, + "grad_norm": 0.3009082100603607, + "learning_rate": 9.479614343218709e-06, + "loss": 0.2773, + "step": 4304 + }, + { + "epoch": 0.34439311213775725, + "grad_norm": 0.33546121869581863, + "learning_rate": 9.47932656835524e-06, + "loss": 0.2717, + "step": 4305 + }, + { + "epoch": 0.34447311053778923, + "grad_norm": 0.24583208448507946, + "learning_rate": 9.47903871831408e-06, + "loss": 0.306, + "step": 4306 + }, + { + "epoch": 0.34455310893782126, + "grad_norm": 0.30823771357260016, + "learning_rate": 9.47875079310006e-06, + "loss": 0.2604, + "step": 4307 + }, + { + "epoch": 0.34463310733785324, + "grad_norm": 0.9034362380820966, + "learning_rate": 9.47846279271801e-06, + "loss": 0.2681, + "step": 4308 + }, + { + "epoch": 0.3447131057378852, + "grad_norm": 0.34366504153780064, + "learning_rate": 9.478174717172768e-06, + "loss": 0.2415, + "step": 4309 + }, + { + "epoch": 0.34479310413791725, + "grad_norm": 0.3184543911648916, + "learning_rate": 9.477886566469165e-06, + "loss": 0.2698, + "step": 4310 + }, + { + "epoch": 0.3448731025379492, + "grad_norm": 0.29891552512302727, + "learning_rate": 9.47759834061204e-06, + "loss": 0.2465, + "step": 4311 + }, + { + "epoch": 0.34495310093798126, + "grad_norm": 0.2736964547255473, + "learning_rate": 9.477310039606228e-06, + "loss": 0.2657, + "step": 4312 + }, + { + "epoch": 0.34503309933801324, + "grad_norm": 0.26370295659622983, + "learning_rate": 9.477021663456569e-06, + "loss": 0.2864, + "step": 4313 + }, + { + "epoch": 0.3451130977380452, + "grad_norm": 0.25652185113185444, + "learning_rate": 9.476733212167901e-06, + "loss": 0.3393, + "step": 4314 + }, + { + "epoch": 0.34519309613807725, + "grad_norm": 0.30406997333538116, + "learning_rate": 9.476444685745067e-06, + "loss": 0.2891, + "step": 4315 + }, + { + "epoch": 0.3452730945381092, + "grad_norm": 0.30455027541346624, + "learning_rate": 9.47615608419291e-06, + "loss": 0.2524, + "step": 4316 + }, + { + "epoch": 0.34535309293814126, + "grad_norm": 0.3161252203212158, + "learning_rate": 9.475867407516272e-06, + "loss": 0.2639, + "step": 4317 + }, + { + "epoch": 0.34543309133817324, + "grad_norm": 0.2506104985563852, + "learning_rate": 9.47557865572e-06, + "loss": 0.3075, + "step": 4318 + }, + { + "epoch": 0.3455130897382052, + "grad_norm": 0.2999786871994182, + "learning_rate": 9.475289828808937e-06, + "loss": 0.263, + "step": 4319 + }, + { + "epoch": 0.34559308813823725, + "grad_norm": 0.43822098377154356, + "learning_rate": 9.475000926787931e-06, + "loss": 0.2666, + "step": 4320 + }, + { + "epoch": 0.3456730865382692, + "grad_norm": 0.33065879934591713, + "learning_rate": 9.474711949661835e-06, + "loss": 0.2693, + "step": 4321 + }, + { + "epoch": 0.34575308493830126, + "grad_norm": 0.5013963560284516, + "learning_rate": 9.474422897435496e-06, + "loss": 0.277, + "step": 4322 + }, + { + "epoch": 0.34583308333833324, + "grad_norm": 0.27212794145613994, + "learning_rate": 9.474133770113763e-06, + "loss": 0.2946, + "step": 4323 + }, + { + "epoch": 0.3459130817383652, + "grad_norm": 0.2454876242765903, + "learning_rate": 9.473844567701491e-06, + "loss": 0.2979, + "step": 4324 + }, + { + "epoch": 0.34599308013839725, + "grad_norm": 0.3152062613802087, + "learning_rate": 9.473555290203534e-06, + "loss": 0.2878, + "step": 4325 + }, + { + "epoch": 0.3460730785384292, + "grad_norm": 0.27202476234076606, + "learning_rate": 9.473265937624748e-06, + "loss": 0.2795, + "step": 4326 + }, + { + "epoch": 0.34615307693846126, + "grad_norm": 0.2326493819412296, + "learning_rate": 9.472976509969984e-06, + "loss": 0.3164, + "step": 4327 + }, + { + "epoch": 0.34623307533849323, + "grad_norm": 0.3625500642682127, + "learning_rate": 9.472687007244106e-06, + "loss": 0.3042, + "step": 4328 + }, + { + "epoch": 0.3463130737385252, + "grad_norm": 0.31876132348401975, + "learning_rate": 9.47239742945197e-06, + "loss": 0.2399, + "step": 4329 + }, + { + "epoch": 0.34639307213855725, + "grad_norm": 0.3122754610054898, + "learning_rate": 9.472107776598435e-06, + "loss": 0.2693, + "step": 4330 + }, + { + "epoch": 0.3464730705385892, + "grad_norm": 0.3308103457778237, + "learning_rate": 9.471818048688364e-06, + "loss": 0.2542, + "step": 4331 + }, + { + "epoch": 0.3465530689386212, + "grad_norm": 0.33380310292061505, + "learning_rate": 9.471528245726618e-06, + "loss": 0.2559, + "step": 4332 + }, + { + "epoch": 0.34663306733865323, + "grad_norm": 0.2768525847622787, + "learning_rate": 9.471238367718064e-06, + "loss": 0.2954, + "step": 4333 + }, + { + "epoch": 0.3467130657386852, + "grad_norm": 0.31696348642123356, + "learning_rate": 9.470948414667562e-06, + "loss": 0.2425, + "step": 4334 + }, + { + "epoch": 0.34679306413871724, + "grad_norm": 0.28561634923240226, + "learning_rate": 9.470658386579983e-06, + "loss": 0.2933, + "step": 4335 + }, + { + "epoch": 0.3468730625387492, + "grad_norm": 0.3302115402124963, + "learning_rate": 9.470368283460193e-06, + "loss": 0.2625, + "step": 4336 + }, + { + "epoch": 0.3469530609387812, + "grad_norm": 0.27056622924408813, + "learning_rate": 9.470078105313062e-06, + "loss": 0.3167, + "step": 4337 + }, + { + "epoch": 0.34703305933881323, + "grad_norm": 0.2507326906138062, + "learning_rate": 9.469787852143458e-06, + "loss": 0.3226, + "step": 4338 + }, + { + "epoch": 0.3471130577388452, + "grad_norm": 0.2985939255239289, + "learning_rate": 9.469497523956253e-06, + "loss": 0.2875, + "step": 4339 + }, + { + "epoch": 0.34719305613887724, + "grad_norm": 0.2762241211467178, + "learning_rate": 9.46920712075632e-06, + "loss": 0.3181, + "step": 4340 + }, + { + "epoch": 0.3472730545389092, + "grad_norm": 0.28967330201731833, + "learning_rate": 9.468916642548534e-06, + "loss": 0.239, + "step": 4341 + }, + { + "epoch": 0.3473530529389412, + "grad_norm": 0.38097360145867176, + "learning_rate": 9.468626089337767e-06, + "loss": 0.2573, + "step": 4342 + }, + { + "epoch": 0.34743305133897323, + "grad_norm": 0.2910523714060384, + "learning_rate": 9.468335461128898e-06, + "loss": 0.269, + "step": 4343 + }, + { + "epoch": 0.3475130497390052, + "grad_norm": 0.3187817349021624, + "learning_rate": 9.468044757926804e-06, + "loss": 0.2587, + "step": 4344 + }, + { + "epoch": 0.34759304813903724, + "grad_norm": 0.267475803607043, + "learning_rate": 9.467753979736365e-06, + "loss": 0.3244, + "step": 4345 + }, + { + "epoch": 0.3476730465390692, + "grad_norm": 0.32592848055603196, + "learning_rate": 9.467463126562461e-06, + "loss": 0.2634, + "step": 4346 + }, + { + "epoch": 0.3477530449391012, + "grad_norm": 0.3019395619439381, + "learning_rate": 9.467172198409971e-06, + "loss": 0.2719, + "step": 4347 + }, + { + "epoch": 0.34783304333913323, + "grad_norm": 0.26687158148998863, + "learning_rate": 9.466881195283782e-06, + "loss": 0.2845, + "step": 4348 + }, + { + "epoch": 0.3479130417391652, + "grad_norm": 0.2864044030207187, + "learning_rate": 9.466590117188773e-06, + "loss": 0.2918, + "step": 4349 + }, + { + "epoch": 0.34799304013919724, + "grad_norm": 0.20132515209834007, + "learning_rate": 9.466298964129832e-06, + "loss": 0.3669, + "step": 4350 + }, + { + "epoch": 0.3480730385392292, + "grad_norm": 0.29399505437396484, + "learning_rate": 9.466007736111846e-06, + "loss": 0.2914, + "step": 4351 + }, + { + "epoch": 0.3481530369392612, + "grad_norm": 0.3197337323327319, + "learning_rate": 9.465716433139702e-06, + "loss": 0.2461, + "step": 4352 + }, + { + "epoch": 0.34823303533929323, + "grad_norm": 0.30604925978737085, + "learning_rate": 9.465425055218289e-06, + "loss": 0.2511, + "step": 4353 + }, + { + "epoch": 0.3483130337393252, + "grad_norm": 0.31402366617290123, + "learning_rate": 9.465133602352497e-06, + "loss": 0.3246, + "step": 4354 + }, + { + "epoch": 0.34839303213935724, + "grad_norm": 0.27231286699225876, + "learning_rate": 9.464842074547218e-06, + "loss": 0.2834, + "step": 4355 + }, + { + "epoch": 0.3484730305393892, + "grad_norm": 0.3565449921353315, + "learning_rate": 9.464550471807346e-06, + "loss": 0.2548, + "step": 4356 + }, + { + "epoch": 0.3485530289394212, + "grad_norm": 0.28094402877701946, + "learning_rate": 9.464258794137771e-06, + "loss": 0.2872, + "step": 4357 + }, + { + "epoch": 0.3486330273394532, + "grad_norm": 0.2905392282394296, + "learning_rate": 9.46396704154339e-06, + "loss": 0.2863, + "step": 4358 + }, + { + "epoch": 0.3487130257394852, + "grad_norm": 0.3929194060155688, + "learning_rate": 9.463675214029104e-06, + "loss": 0.2753, + "step": 4359 + }, + { + "epoch": 0.3487930241395172, + "grad_norm": 0.31526305450255054, + "learning_rate": 9.463383311599806e-06, + "loss": 0.27, + "step": 4360 + }, + { + "epoch": 0.3488730225395492, + "grad_norm": 0.2699418931353672, + "learning_rate": 9.463091334260397e-06, + "loss": 0.3142, + "step": 4361 + }, + { + "epoch": 0.3489530209395812, + "grad_norm": 0.3339972773028592, + "learning_rate": 9.462799282015775e-06, + "loss": 0.295, + "step": 4362 + }, + { + "epoch": 0.3490330193396132, + "grad_norm": 0.3442178117022778, + "learning_rate": 9.462507154870846e-06, + "loss": 0.2554, + "step": 4363 + }, + { + "epoch": 0.3491130177396452, + "grad_norm": 0.2501113741724323, + "learning_rate": 9.462214952830507e-06, + "loss": 0.3223, + "step": 4364 + }, + { + "epoch": 0.3491930161396772, + "grad_norm": 0.2915605271924354, + "learning_rate": 9.461922675899668e-06, + "loss": 0.3114, + "step": 4365 + }, + { + "epoch": 0.3492730145397092, + "grad_norm": 0.3243893896502367, + "learning_rate": 9.461630324083228e-06, + "loss": 0.2645, + "step": 4366 + }, + { + "epoch": 0.3493530129397412, + "grad_norm": 0.2775562163203753, + "learning_rate": 9.4613378973861e-06, + "loss": 0.3197, + "step": 4367 + }, + { + "epoch": 0.3494330113397732, + "grad_norm": 0.30941489796468646, + "learning_rate": 9.461045395813188e-06, + "loss": 0.2636, + "step": 4368 + }, + { + "epoch": 0.3495130097398052, + "grad_norm": 0.2722517855704053, + "learning_rate": 9.460752819369405e-06, + "loss": 0.2917, + "step": 4369 + }, + { + "epoch": 0.3495930081398372, + "grad_norm": 0.3110908886141127, + "learning_rate": 9.460460168059655e-06, + "loss": 0.2773, + "step": 4370 + }, + { + "epoch": 0.3496730065398692, + "grad_norm": 0.3619610026116026, + "learning_rate": 9.460167441888855e-06, + "loss": 0.2633, + "step": 4371 + }, + { + "epoch": 0.3497530049399012, + "grad_norm": 0.321834807621303, + "learning_rate": 9.459874640861917e-06, + "loss": 0.2514, + "step": 4372 + }, + { + "epoch": 0.3498330033399332, + "grad_norm": 0.2877897690292016, + "learning_rate": 9.459581764983752e-06, + "loss": 0.3004, + "step": 4373 + }, + { + "epoch": 0.3499130017399652, + "grad_norm": 0.35739789819881634, + "learning_rate": 9.459288814259279e-06, + "loss": 0.2651, + "step": 4374 + }, + { + "epoch": 0.3499930001399972, + "grad_norm": 0.3155776403518716, + "learning_rate": 9.458995788693414e-06, + "loss": 0.2817, + "step": 4375 + }, + { + "epoch": 0.3500729985400292, + "grad_norm": 0.3410392656074438, + "learning_rate": 9.458702688291072e-06, + "loss": 0.3102, + "step": 4376 + }, + { + "epoch": 0.3501529969400612, + "grad_norm": 0.3107243714986064, + "learning_rate": 9.458409513057176e-06, + "loss": 0.2884, + "step": 4377 + }, + { + "epoch": 0.3502329953400932, + "grad_norm": 0.3289579373250139, + "learning_rate": 9.458116262996646e-06, + "loss": 0.2686, + "step": 4378 + }, + { + "epoch": 0.3503129937401252, + "grad_norm": 0.3362178198620883, + "learning_rate": 9.457822938114401e-06, + "loss": 0.2733, + "step": 4379 + }, + { + "epoch": 0.3503929921401572, + "grad_norm": 0.30852884699834876, + "learning_rate": 9.457529538415366e-06, + "loss": 0.252, + "step": 4380 + }, + { + "epoch": 0.3504729905401892, + "grad_norm": 0.37718518755574243, + "learning_rate": 9.457236063904465e-06, + "loss": 0.2872, + "step": 4381 + }, + { + "epoch": 0.3505529889402212, + "grad_norm": 0.31345469018931643, + "learning_rate": 9.456942514586623e-06, + "loss": 0.2547, + "step": 4382 + }, + { + "epoch": 0.3506329873402532, + "grad_norm": 0.3017408692498249, + "learning_rate": 9.456648890466767e-06, + "loss": 0.2679, + "step": 4383 + }, + { + "epoch": 0.3507129857402852, + "grad_norm": 0.3081577963014683, + "learning_rate": 9.456355191549826e-06, + "loss": 0.2648, + "step": 4384 + }, + { + "epoch": 0.3507929841403172, + "grad_norm": 0.3159367390089897, + "learning_rate": 9.456061417840727e-06, + "loss": 0.2676, + "step": 4385 + }, + { + "epoch": 0.3508729825403492, + "grad_norm": 0.32642381202243553, + "learning_rate": 9.4557675693444e-06, + "loss": 0.264, + "step": 4386 + }, + { + "epoch": 0.3509529809403812, + "grad_norm": 0.28334396989278965, + "learning_rate": 9.455473646065782e-06, + "loss": 0.2927, + "step": 4387 + }, + { + "epoch": 0.35103297934041316, + "grad_norm": 0.2502521089542464, + "learning_rate": 9.4551796480098e-06, + "loss": 0.2817, + "step": 4388 + }, + { + "epoch": 0.3511129777404452, + "grad_norm": 0.3064201932697271, + "learning_rate": 9.454885575181391e-06, + "loss": 0.3039, + "step": 4389 + }, + { + "epoch": 0.3511929761404772, + "grad_norm": 0.2243859729067831, + "learning_rate": 9.454591427585489e-06, + "loss": 0.3467, + "step": 4390 + }, + { + "epoch": 0.3512729745405092, + "grad_norm": 0.31457188423253873, + "learning_rate": 9.454297205227034e-06, + "loss": 0.2389, + "step": 4391 + }, + { + "epoch": 0.3513529729405412, + "grad_norm": 0.2740108651853356, + "learning_rate": 9.45400290811096e-06, + "loss": 0.2692, + "step": 4392 + }, + { + "epoch": 0.35143297134057316, + "grad_norm": 0.28215901269006904, + "learning_rate": 9.45370853624221e-06, + "loss": 0.29, + "step": 4393 + }, + { + "epoch": 0.3515129697406052, + "grad_norm": 0.2868176853006843, + "learning_rate": 9.453414089625722e-06, + "loss": 0.3175, + "step": 4394 + }, + { + "epoch": 0.3515929681406372, + "grad_norm": 0.5443891302331229, + "learning_rate": 9.453119568266435e-06, + "loss": 0.2914, + "step": 4395 + }, + { + "epoch": 0.3516729665406692, + "grad_norm": 0.3088524527530198, + "learning_rate": 9.452824972169298e-06, + "loss": 0.247, + "step": 4396 + }, + { + "epoch": 0.3517529649407012, + "grad_norm": 0.23854697460608962, + "learning_rate": 9.452530301339254e-06, + "loss": 0.3062, + "step": 4397 + }, + { + "epoch": 0.35183296334073316, + "grad_norm": 0.27294430814377757, + "learning_rate": 9.452235555781246e-06, + "loss": 0.2736, + "step": 4398 + }, + { + "epoch": 0.3519129617407652, + "grad_norm": 0.2906641160778461, + "learning_rate": 9.451940735500222e-06, + "loss": 0.2789, + "step": 4399 + }, + { + "epoch": 0.3519929601407972, + "grad_norm": 0.28468058217749204, + "learning_rate": 9.451645840501131e-06, + "loss": 0.2889, + "step": 4400 + }, + { + "epoch": 0.3520729585408292, + "grad_norm": 0.35288303542469, + "learning_rate": 9.451350870788922e-06, + "loss": 0.2879, + "step": 4401 + }, + { + "epoch": 0.3521529569408612, + "grad_norm": 0.29927280273963813, + "learning_rate": 9.451055826368542e-06, + "loss": 0.2524, + "step": 4402 + }, + { + "epoch": 0.35223295534089316, + "grad_norm": 0.2661774931387885, + "learning_rate": 9.450760707244948e-06, + "loss": 0.2898, + "step": 4403 + }, + { + "epoch": 0.3523129537409252, + "grad_norm": 0.3123631500564031, + "learning_rate": 9.45046551342309e-06, + "loss": 0.2758, + "step": 4404 + }, + { + "epoch": 0.35239295214095717, + "grad_norm": 0.23429733781655696, + "learning_rate": 9.450170244907924e-06, + "loss": 0.3317, + "step": 4405 + }, + { + "epoch": 0.3524729505409892, + "grad_norm": 0.22396360431766393, + "learning_rate": 9.449874901704404e-06, + "loss": 0.3228, + "step": 4406 + }, + { + "epoch": 0.3525529489410212, + "grad_norm": 0.3211433935513282, + "learning_rate": 9.449579483817487e-06, + "loss": 0.2655, + "step": 4407 + }, + { + "epoch": 0.35263294734105316, + "grad_norm": 0.25443751643639356, + "learning_rate": 9.449283991252132e-06, + "loss": 0.2863, + "step": 4408 + }, + { + "epoch": 0.3527129457410852, + "grad_norm": 0.2869633988283425, + "learning_rate": 9.448988424013298e-06, + "loss": 0.3119, + "step": 4409 + }, + { + "epoch": 0.35279294414111717, + "grad_norm": 0.3584701119246657, + "learning_rate": 9.448692782105945e-06, + "loss": 0.2582, + "step": 4410 + }, + { + "epoch": 0.3528729425411492, + "grad_norm": 0.3024010724888866, + "learning_rate": 9.448397065535037e-06, + "loss": 0.2743, + "step": 4411 + }, + { + "epoch": 0.3529529409411812, + "grad_norm": 0.38965927812243917, + "learning_rate": 9.448101274305533e-06, + "loss": 0.2832, + "step": 4412 + }, + { + "epoch": 0.35303293934121316, + "grad_norm": 0.2606347963680028, + "learning_rate": 9.4478054084224e-06, + "loss": 0.2953, + "step": 4413 + }, + { + "epoch": 0.3531129377412452, + "grad_norm": 0.30094340745243525, + "learning_rate": 9.447509467890605e-06, + "loss": 0.2878, + "step": 4414 + }, + { + "epoch": 0.35319293614127717, + "grad_norm": 0.2709932647076854, + "learning_rate": 9.447213452715114e-06, + "loss": 0.3005, + "step": 4415 + }, + { + "epoch": 0.35327293454130915, + "grad_norm": 0.24338161459454632, + "learning_rate": 9.446917362900891e-06, + "loss": 0.3201, + "step": 4416 + }, + { + "epoch": 0.3533529329413412, + "grad_norm": 0.2716207941014577, + "learning_rate": 9.446621198452912e-06, + "loss": 0.2866, + "step": 4417 + }, + { + "epoch": 0.35343293134137316, + "grad_norm": 0.3151838389922976, + "learning_rate": 9.446324959376142e-06, + "loss": 0.2566, + "step": 4418 + }, + { + "epoch": 0.3535129297414052, + "grad_norm": 0.31889775897723543, + "learning_rate": 9.446028645675556e-06, + "loss": 0.26, + "step": 4419 + }, + { + "epoch": 0.35359292814143717, + "grad_norm": 0.2942341704670075, + "learning_rate": 9.445732257356125e-06, + "loss": 0.2594, + "step": 4420 + }, + { + "epoch": 0.35367292654146915, + "grad_norm": 0.23628129050290167, + "learning_rate": 9.445435794422826e-06, + "loss": 0.3173, + "step": 4421 + }, + { + "epoch": 0.3537529249415012, + "grad_norm": 0.2922182737120497, + "learning_rate": 9.445139256880633e-06, + "loss": 0.2529, + "step": 4422 + }, + { + "epoch": 0.35383292334153316, + "grad_norm": 0.31180473630202155, + "learning_rate": 9.444842644734521e-06, + "loss": 0.2696, + "step": 4423 + }, + { + "epoch": 0.3539129217415652, + "grad_norm": 0.2664640219560601, + "learning_rate": 9.444545957989473e-06, + "loss": 0.297, + "step": 4424 + }, + { + "epoch": 0.35399292014159717, + "grad_norm": 0.23936614827356684, + "learning_rate": 9.444249196650465e-06, + "loss": 0.3205, + "step": 4425 + }, + { + "epoch": 0.35407291854162914, + "grad_norm": 0.31225580769074285, + "learning_rate": 9.443952360722477e-06, + "loss": 0.2897, + "step": 4426 + }, + { + "epoch": 0.3541529169416612, + "grad_norm": 0.2869200166494667, + "learning_rate": 9.443655450210494e-06, + "loss": 0.2934, + "step": 4427 + }, + { + "epoch": 0.35423291534169316, + "grad_norm": 0.20620858285576718, + "learning_rate": 9.443358465119495e-06, + "loss": 0.3449, + "step": 4428 + }, + { + "epoch": 0.3543129137417252, + "grad_norm": 0.3178511369967502, + "learning_rate": 9.443061405454468e-06, + "loss": 0.2609, + "step": 4429 + }, + { + "epoch": 0.35439291214175717, + "grad_norm": 0.3012788290057523, + "learning_rate": 9.442764271220396e-06, + "loss": 0.2955, + "step": 4430 + }, + { + "epoch": 0.35447291054178914, + "grad_norm": 0.2943264347935936, + "learning_rate": 9.442467062422267e-06, + "loss": 0.2814, + "step": 4431 + }, + { + "epoch": 0.3545529089418212, + "grad_norm": 0.32795716531023017, + "learning_rate": 9.44216977906507e-06, + "loss": 0.2624, + "step": 4432 + }, + { + "epoch": 0.35463290734185315, + "grad_norm": 0.26294382512046843, + "learning_rate": 9.441872421153792e-06, + "loss": 0.3171, + "step": 4433 + }, + { + "epoch": 0.3547129057418852, + "grad_norm": 0.23215261991553796, + "learning_rate": 9.441574988693428e-06, + "loss": 0.299, + "step": 4434 + }, + { + "epoch": 0.35479290414191716, + "grad_norm": 0.24753659625125082, + "learning_rate": 9.441277481688964e-06, + "loss": 0.2679, + "step": 4435 + }, + { + "epoch": 0.35487290254194914, + "grad_norm": 0.341472585630691, + "learning_rate": 9.4409799001454e-06, + "loss": 0.2477, + "step": 4436 + }, + { + "epoch": 0.3549529009419812, + "grad_norm": 0.2852737516161907, + "learning_rate": 9.440682244067724e-06, + "loss": 0.294, + "step": 4437 + }, + { + "epoch": 0.35503289934201315, + "grad_norm": 0.35447279907086576, + "learning_rate": 9.440384513460934e-06, + "loss": 0.2857, + "step": 4438 + }, + { + "epoch": 0.3551128977420452, + "grad_norm": 0.19525603906640657, + "learning_rate": 9.440086708330028e-06, + "loss": 0.3522, + "step": 4439 + }, + { + "epoch": 0.35519289614207716, + "grad_norm": 0.2956597073282292, + "learning_rate": 9.439788828680003e-06, + "loss": 0.3018, + "step": 4440 + }, + { + "epoch": 0.35527289454210914, + "grad_norm": 0.23992580976088543, + "learning_rate": 9.439490874515859e-06, + "loss": 0.3063, + "step": 4441 + }, + { + "epoch": 0.3553528929421412, + "grad_norm": 0.24484804798281992, + "learning_rate": 9.439192845842596e-06, + "loss": 0.3225, + "step": 4442 + }, + { + "epoch": 0.35543289134217315, + "grad_norm": 0.3215229986035043, + "learning_rate": 9.438894742665217e-06, + "loss": 0.2528, + "step": 4443 + }, + { + "epoch": 0.35551288974220513, + "grad_norm": 0.2631620165223143, + "learning_rate": 9.438596564988722e-06, + "loss": 0.2877, + "step": 4444 + }, + { + "epoch": 0.35559288814223716, + "grad_norm": 0.28650928547132853, + "learning_rate": 9.43829831281812e-06, + "loss": 0.2585, + "step": 4445 + }, + { + "epoch": 0.35567288654226914, + "grad_norm": 0.33739668125319294, + "learning_rate": 9.437999986158413e-06, + "loss": 0.2783, + "step": 4446 + }, + { + "epoch": 0.3557528849423012, + "grad_norm": 0.31955748708451004, + "learning_rate": 9.437701585014608e-06, + "loss": 0.2561, + "step": 4447 + }, + { + "epoch": 0.35583288334233315, + "grad_norm": 0.3090232694550856, + "learning_rate": 9.437403109391719e-06, + "loss": 0.2344, + "step": 4448 + }, + { + "epoch": 0.35591288174236513, + "grad_norm": 0.2828346847764824, + "learning_rate": 9.437104559294748e-06, + "loss": 0.2857, + "step": 4449 + }, + { + "epoch": 0.35599288014239716, + "grad_norm": 0.30289337480785694, + "learning_rate": 9.436805934728707e-06, + "loss": 0.2753, + "step": 4450 + }, + { + "epoch": 0.35607287854242914, + "grad_norm": 0.333190583700772, + "learning_rate": 9.436507235698613e-06, + "loss": 0.2634, + "step": 4451 + }, + { + "epoch": 0.35615287694246117, + "grad_norm": 0.31440311666104254, + "learning_rate": 9.436208462209474e-06, + "loss": 0.2709, + "step": 4452 + }, + { + "epoch": 0.35623287534249315, + "grad_norm": 0.23052313868837052, + "learning_rate": 9.435909614266303e-06, + "loss": 0.323, + "step": 4453 + }, + { + "epoch": 0.3563128737425251, + "grad_norm": 0.3033913263148454, + "learning_rate": 9.435610691874122e-06, + "loss": 0.2558, + "step": 4454 + }, + { + "epoch": 0.35639287214255716, + "grad_norm": 0.32675912202492474, + "learning_rate": 9.435311695037943e-06, + "loss": 0.2606, + "step": 4455 + }, + { + "epoch": 0.35647287054258914, + "grad_norm": 0.45341736283275125, + "learning_rate": 9.435012623762785e-06, + "loss": 0.2665, + "step": 4456 + }, + { + "epoch": 0.35655286894262117, + "grad_norm": 0.3270318327387806, + "learning_rate": 9.43471347805367e-06, + "loss": 0.281, + "step": 4457 + }, + { + "epoch": 0.35663286734265315, + "grad_norm": 0.2600079143244019, + "learning_rate": 9.434414257915614e-06, + "loss": 0.2922, + "step": 4458 + }, + { + "epoch": 0.3567128657426851, + "grad_norm": 0.2586940486877129, + "learning_rate": 9.434114963353644e-06, + "loss": 0.2807, + "step": 4459 + }, + { + "epoch": 0.35679286414271716, + "grad_norm": 0.3084230383712127, + "learning_rate": 9.433815594372779e-06, + "loss": 0.2801, + "step": 4460 + }, + { + "epoch": 0.35687286254274914, + "grad_norm": 0.2852648501726477, + "learning_rate": 9.433516150978045e-06, + "loss": 0.2204, + "step": 4461 + }, + { + "epoch": 0.35695286094278117, + "grad_norm": 0.3019917422443777, + "learning_rate": 9.433216633174469e-06, + "loss": 0.2573, + "step": 4462 + }, + { + "epoch": 0.35703285934281315, + "grad_norm": 0.2547033262592317, + "learning_rate": 9.432917040967074e-06, + "loss": 0.337, + "step": 4463 + }, + { + "epoch": 0.3571128577428451, + "grad_norm": 0.29461529433087563, + "learning_rate": 9.432617374360893e-06, + "loss": 0.2485, + "step": 4464 + }, + { + "epoch": 0.35719285614287716, + "grad_norm": 0.2983154924622732, + "learning_rate": 9.432317633360952e-06, + "loss": 0.2496, + "step": 4465 + }, + { + "epoch": 0.35727285454290914, + "grad_norm": 0.30651751004856975, + "learning_rate": 9.432017817972282e-06, + "loss": 0.3037, + "step": 4466 + }, + { + "epoch": 0.35735285294294117, + "grad_norm": 0.6587789290331908, + "learning_rate": 9.431717928199916e-06, + "loss": 0.2777, + "step": 4467 + }, + { + "epoch": 0.35743285134297315, + "grad_norm": 0.20217020049614517, + "learning_rate": 9.431417964048888e-06, + "loss": 0.3593, + "step": 4468 + }, + { + "epoch": 0.3575128497430051, + "grad_norm": 0.2883552702985981, + "learning_rate": 9.43111792552423e-06, + "loss": 0.2976, + "step": 4469 + }, + { + "epoch": 0.35759284814303716, + "grad_norm": 0.2844555145949519, + "learning_rate": 9.43081781263098e-06, + "loss": 0.281, + "step": 4470 + }, + { + "epoch": 0.35767284654306913, + "grad_norm": 0.31130871097456314, + "learning_rate": 9.430517625374171e-06, + "loss": 0.2879, + "step": 4471 + }, + { + "epoch": 0.3577528449431011, + "grad_norm": 0.2724069759505394, + "learning_rate": 9.430217363758844e-06, + "loss": 0.2945, + "step": 4472 + }, + { + "epoch": 0.35783284334313314, + "grad_norm": 0.3102826197917452, + "learning_rate": 9.42991702779004e-06, + "loss": 0.2674, + "step": 4473 + }, + { + "epoch": 0.3579128417431651, + "grad_norm": 0.3993731542439318, + "learning_rate": 9.429616617472796e-06, + "loss": 0.272, + "step": 4474 + }, + { + "epoch": 0.35799284014319716, + "grad_norm": 0.45669771274050597, + "learning_rate": 9.429316132812156e-06, + "loss": 0.2846, + "step": 4475 + }, + { + "epoch": 0.35807283854322913, + "grad_norm": 0.29587062075013965, + "learning_rate": 9.429015573813163e-06, + "loss": 0.2937, + "step": 4476 + }, + { + "epoch": 0.3581528369432611, + "grad_norm": 0.3071939435957104, + "learning_rate": 9.428714940480861e-06, + "loss": 0.2597, + "step": 4477 + }, + { + "epoch": 0.35823283534329314, + "grad_norm": 0.2594193840488259, + "learning_rate": 9.428414232820295e-06, + "loss": 0.3425, + "step": 4478 + }, + { + "epoch": 0.3583128337433251, + "grad_norm": 0.19993419728368275, + "learning_rate": 9.428113450836514e-06, + "loss": 0.3671, + "step": 4479 + }, + { + "epoch": 0.35839283214335715, + "grad_norm": 0.3150661416341772, + "learning_rate": 9.427812594534563e-06, + "loss": 0.2637, + "step": 4480 + }, + { + "epoch": 0.35847283054338913, + "grad_norm": 0.3144894066949601, + "learning_rate": 9.427511663919492e-06, + "loss": 0.2592, + "step": 4481 + }, + { + "epoch": 0.3585528289434211, + "grad_norm": 0.27538584832362045, + "learning_rate": 9.427210658996353e-06, + "loss": 0.2719, + "step": 4482 + }, + { + "epoch": 0.35863282734345314, + "grad_norm": 0.2531354479254568, + "learning_rate": 9.426909579770197e-06, + "loss": 0.3137, + "step": 4483 + }, + { + "epoch": 0.3587128257434851, + "grad_norm": 0.4618133265380159, + "learning_rate": 9.42660842624608e-06, + "loss": 0.3099, + "step": 4484 + }, + { + "epoch": 0.35879282414351715, + "grad_norm": 0.21860173599346713, + "learning_rate": 9.426307198429053e-06, + "loss": 0.347, + "step": 4485 + }, + { + "epoch": 0.35887282254354913, + "grad_norm": 0.20101630454402167, + "learning_rate": 9.42600589632417e-06, + "loss": 0.3333, + "step": 4486 + }, + { + "epoch": 0.3589528209435811, + "grad_norm": 0.2670409930918773, + "learning_rate": 9.425704519936492e-06, + "loss": 0.3177, + "step": 4487 + }, + { + "epoch": 0.35903281934361314, + "grad_norm": 0.33940161501015775, + "learning_rate": 9.425403069271076e-06, + "loss": 0.2541, + "step": 4488 + }, + { + "epoch": 0.3591128177436451, + "grad_norm": 0.2708896913299644, + "learning_rate": 9.425101544332979e-06, + "loss": 0.3299, + "step": 4489 + }, + { + "epoch": 0.35919281614367715, + "grad_norm": 0.2810315733789894, + "learning_rate": 9.424799945127263e-06, + "loss": 0.3058, + "step": 4490 + }, + { + "epoch": 0.35927281454370913, + "grad_norm": 0.36450701616855763, + "learning_rate": 9.424498271658991e-06, + "loss": 0.2752, + "step": 4491 + }, + { + "epoch": 0.3593528129437411, + "grad_norm": 0.33697939841434216, + "learning_rate": 9.424196523933225e-06, + "loss": 0.2806, + "step": 4492 + }, + { + "epoch": 0.35943281134377314, + "grad_norm": 0.31157900290943724, + "learning_rate": 9.423894701955028e-06, + "loss": 0.2694, + "step": 4493 + }, + { + "epoch": 0.3595128097438051, + "grad_norm": 0.3132242687415547, + "learning_rate": 9.423592805729466e-06, + "loss": 0.2606, + "step": 4494 + }, + { + "epoch": 0.35959280814383715, + "grad_norm": 0.35824916270380586, + "learning_rate": 9.42329083526161e-06, + "loss": 0.2596, + "step": 4495 + }, + { + "epoch": 0.35967280654386913, + "grad_norm": 0.3132700667570474, + "learning_rate": 9.422988790556524e-06, + "loss": 0.2958, + "step": 4496 + }, + { + "epoch": 0.3597528049439011, + "grad_norm": 0.2842753839713741, + "learning_rate": 9.422686671619277e-06, + "loss": 0.3272, + "step": 4497 + }, + { + "epoch": 0.35983280334393314, + "grad_norm": 0.35236423590464855, + "learning_rate": 9.42238447845494e-06, + "loss": 0.2668, + "step": 4498 + }, + { + "epoch": 0.3599128017439651, + "grad_norm": 0.3371229421962793, + "learning_rate": 9.422082211068586e-06, + "loss": 0.2743, + "step": 4499 + }, + { + "epoch": 0.3599928001439971, + "grad_norm": 0.30034875165333313, + "learning_rate": 9.421779869465288e-06, + "loss": 0.259, + "step": 4500 + }, + { + "epoch": 0.3600727985440291, + "grad_norm": 0.4286225481189003, + "learning_rate": 9.421477453650118e-06, + "loss": 0.2463, + "step": 4501 + }, + { + "epoch": 0.3601527969440611, + "grad_norm": 0.30272599198383504, + "learning_rate": 9.421174963628155e-06, + "loss": 0.2491, + "step": 4502 + }, + { + "epoch": 0.36023279534409314, + "grad_norm": 0.25768740278243524, + "learning_rate": 9.420872399404473e-06, + "loss": 0.3353, + "step": 4503 + }, + { + "epoch": 0.3603127937441251, + "grad_norm": 0.27321943339742494, + "learning_rate": 9.420569760984152e-06, + "loss": 0.2799, + "step": 4504 + }, + { + "epoch": 0.3603927921441571, + "grad_norm": 0.29506843650648334, + "learning_rate": 9.42026704837227e-06, + "loss": 0.2649, + "step": 4505 + }, + { + "epoch": 0.3604727905441891, + "grad_norm": 0.2733325479115751, + "learning_rate": 9.419964261573906e-06, + "loss": 0.2819, + "step": 4506 + }, + { + "epoch": 0.3605527889442211, + "grad_norm": 0.35840935516388817, + "learning_rate": 9.419661400594145e-06, + "loss": 0.2677, + "step": 4507 + }, + { + "epoch": 0.36063278734425314, + "grad_norm": 0.27087630156695036, + "learning_rate": 9.419358465438069e-06, + "loss": 0.3158, + "step": 4508 + }, + { + "epoch": 0.3607127857442851, + "grad_norm": 0.30604821293880397, + "learning_rate": 9.41905545611076e-06, + "loss": 0.29, + "step": 4509 + }, + { + "epoch": 0.3607927841443171, + "grad_norm": 0.29834984212125365, + "learning_rate": 9.418752372617306e-06, + "loss": 0.2724, + "step": 4510 + }, + { + "epoch": 0.3608727825443491, + "grad_norm": 0.3116680806420882, + "learning_rate": 9.418449214962793e-06, + "loss": 0.2661, + "step": 4511 + }, + { + "epoch": 0.3609527809443811, + "grad_norm": 0.20932305616553637, + "learning_rate": 9.41814598315231e-06, + "loss": 0.3535, + "step": 4512 + }, + { + "epoch": 0.36103277934441314, + "grad_norm": 0.25947676782948126, + "learning_rate": 9.417842677190944e-06, + "loss": 0.3026, + "step": 4513 + }, + { + "epoch": 0.3611127777444451, + "grad_norm": 0.43281973963755344, + "learning_rate": 9.417539297083787e-06, + "loss": 0.2813, + "step": 4514 + }, + { + "epoch": 0.3611927761444771, + "grad_norm": 0.26585731791138606, + "learning_rate": 9.417235842835929e-06, + "loss": 0.3186, + "step": 4515 + }, + { + "epoch": 0.3612727745445091, + "grad_norm": 0.3071571122215886, + "learning_rate": 9.416932314452464e-06, + "loss": 0.2814, + "step": 4516 + }, + { + "epoch": 0.3613527729445411, + "grad_norm": 0.29442914566337125, + "learning_rate": 9.416628711938489e-06, + "loss": 0.2667, + "step": 4517 + }, + { + "epoch": 0.36143277134457313, + "grad_norm": 0.2861889292642095, + "learning_rate": 9.416325035299094e-06, + "loss": 0.2914, + "step": 4518 + }, + { + "epoch": 0.3615127697446051, + "grad_norm": 0.2355753718154151, + "learning_rate": 9.41602128453938e-06, + "loss": 0.321, + "step": 4519 + }, + { + "epoch": 0.3615927681446371, + "grad_norm": 0.2651155794025791, + "learning_rate": 9.415717459664443e-06, + "loss": 0.2837, + "step": 4520 + }, + { + "epoch": 0.3616727665446691, + "grad_norm": 0.2514851115644703, + "learning_rate": 9.415413560679385e-06, + "loss": 0.3071, + "step": 4521 + }, + { + "epoch": 0.3617527649447011, + "grad_norm": 0.2940407645015243, + "learning_rate": 9.415109587589302e-06, + "loss": 0.2563, + "step": 4522 + }, + { + "epoch": 0.36183276334473313, + "grad_norm": 0.28114527438991405, + "learning_rate": 9.414805540399298e-06, + "loss": 0.294, + "step": 4523 + }, + { + "epoch": 0.3619127617447651, + "grad_norm": 0.2260838976330332, + "learning_rate": 9.414501419114474e-06, + "loss": 0.3201, + "step": 4524 + }, + { + "epoch": 0.3619927601447971, + "grad_norm": 0.59151798067773, + "learning_rate": 9.414197223739939e-06, + "loss": 0.3096, + "step": 4525 + }, + { + "epoch": 0.3620727585448291, + "grad_norm": 0.330730994837611, + "learning_rate": 9.413892954280793e-06, + "loss": 0.3479, + "step": 4526 + }, + { + "epoch": 0.3621527569448611, + "grad_norm": 0.3860622165168661, + "learning_rate": 9.413588610742146e-06, + "loss": 0.3143, + "step": 4527 + }, + { + "epoch": 0.3622327553448931, + "grad_norm": 0.25913422369943356, + "learning_rate": 9.413284193129104e-06, + "loss": 0.2835, + "step": 4528 + }, + { + "epoch": 0.3623127537449251, + "grad_norm": 0.36210213359391125, + "learning_rate": 9.412979701446776e-06, + "loss": 0.2827, + "step": 4529 + }, + { + "epoch": 0.3623927521449571, + "grad_norm": 0.2898838771167482, + "learning_rate": 9.412675135700272e-06, + "loss": 0.2877, + "step": 4530 + }, + { + "epoch": 0.3624727505449891, + "grad_norm": 0.2513596210310494, + "learning_rate": 9.412370495894708e-06, + "loss": 0.3024, + "step": 4531 + }, + { + "epoch": 0.3625527489450211, + "grad_norm": 0.3011111371280085, + "learning_rate": 9.412065782035193e-06, + "loss": 0.2518, + "step": 4532 + }, + { + "epoch": 0.3626327473450531, + "grad_norm": 0.29918973244841746, + "learning_rate": 9.41176099412684e-06, + "loss": 0.282, + "step": 4533 + }, + { + "epoch": 0.3627127457450851, + "grad_norm": 0.28883602464741204, + "learning_rate": 9.411456132174768e-06, + "loss": 0.3085, + "step": 4534 + }, + { + "epoch": 0.3627927441451171, + "grad_norm": 0.21863851386648173, + "learning_rate": 9.41115119618409e-06, + "loss": 0.3481, + "step": 4535 + }, + { + "epoch": 0.3628727425451491, + "grad_norm": 0.2822141222519014, + "learning_rate": 9.410846186159926e-06, + "loss": 0.2924, + "step": 4536 + }, + { + "epoch": 0.3629527409451811, + "grad_norm": 0.28235884195825345, + "learning_rate": 9.410541102107394e-06, + "loss": 0.2966, + "step": 4537 + }, + { + "epoch": 0.3630327393452131, + "grad_norm": 0.22863196301218053, + "learning_rate": 9.410235944031616e-06, + "loss": 0.3269, + "step": 4538 + }, + { + "epoch": 0.3631127377452451, + "grad_norm": 0.29079668195959973, + "learning_rate": 9.40993071193771e-06, + "loss": 0.2675, + "step": 4539 + }, + { + "epoch": 0.3631927361452771, + "grad_norm": 0.2524075925425891, + "learning_rate": 9.409625405830804e-06, + "loss": 0.3335, + "step": 4540 + }, + { + "epoch": 0.3632727345453091, + "grad_norm": 0.29576485047062606, + "learning_rate": 9.409320025716018e-06, + "loss": 0.3, + "step": 4541 + }, + { + "epoch": 0.3633527329453411, + "grad_norm": 0.26063967091046764, + "learning_rate": 9.409014571598478e-06, + "loss": 0.3039, + "step": 4542 + }, + { + "epoch": 0.3634327313453731, + "grad_norm": 0.23136406035153806, + "learning_rate": 9.40870904348331e-06, + "loss": 0.3416, + "step": 4543 + }, + { + "epoch": 0.3635127297454051, + "grad_norm": 0.23712501546434936, + "learning_rate": 9.408403441375644e-06, + "loss": 0.3068, + "step": 4544 + }, + { + "epoch": 0.3635927281454371, + "grad_norm": 0.30387628213477447, + "learning_rate": 9.408097765280608e-06, + "loss": 0.2657, + "step": 4545 + }, + { + "epoch": 0.3636727265454691, + "grad_norm": 0.28223824842109635, + "learning_rate": 9.407792015203331e-06, + "loss": 0.2734, + "step": 4546 + }, + { + "epoch": 0.3637527249455011, + "grad_norm": 0.207894299809762, + "learning_rate": 9.407486191148947e-06, + "loss": 0.3499, + "step": 4547 + }, + { + "epoch": 0.36383272334553307, + "grad_norm": 0.31848990341620637, + "learning_rate": 9.407180293122586e-06, + "loss": 0.2722, + "step": 4548 + }, + { + "epoch": 0.3639127217455651, + "grad_norm": 0.27432527947466057, + "learning_rate": 9.406874321129384e-06, + "loss": 0.2756, + "step": 4549 + }, + { + "epoch": 0.3639927201455971, + "grad_norm": 0.2508750191121811, + "learning_rate": 9.406568275174475e-06, + "loss": 0.2922, + "step": 4550 + }, + { + "epoch": 0.3640727185456291, + "grad_norm": 0.32027744875107417, + "learning_rate": 9.406262155262995e-06, + "loss": 0.2654, + "step": 4551 + }, + { + "epoch": 0.3641527169456611, + "grad_norm": 0.3031426095609422, + "learning_rate": 9.405955961400083e-06, + "loss": 0.2399, + "step": 4552 + }, + { + "epoch": 0.36423271534569307, + "grad_norm": 0.3403121690187805, + "learning_rate": 9.405649693590877e-06, + "loss": 0.2958, + "step": 4553 + }, + { + "epoch": 0.3643127137457251, + "grad_norm": 0.3078850273436866, + "learning_rate": 9.405343351840517e-06, + "loss": 0.2715, + "step": 4554 + }, + { + "epoch": 0.3643927121457571, + "grad_norm": 0.25218323571025897, + "learning_rate": 9.405036936154146e-06, + "loss": 0.3137, + "step": 4555 + }, + { + "epoch": 0.36447271054578906, + "grad_norm": 0.3568854564180672, + "learning_rate": 9.404730446536905e-06, + "loss": 0.3096, + "step": 4556 + }, + { + "epoch": 0.3645527089458211, + "grad_norm": 0.31619564823892804, + "learning_rate": 9.40442388299394e-06, + "loss": 0.2496, + "step": 4557 + }, + { + "epoch": 0.36463270734585307, + "grad_norm": 0.3010449890763942, + "learning_rate": 9.404117245530393e-06, + "loss": 0.2653, + "step": 4558 + }, + { + "epoch": 0.3647127057458851, + "grad_norm": 0.31741889594952866, + "learning_rate": 9.403810534151411e-06, + "loss": 0.2772, + "step": 4559 + }, + { + "epoch": 0.3647927041459171, + "grad_norm": 0.29957728347144463, + "learning_rate": 9.403503748862146e-06, + "loss": 0.2574, + "step": 4560 + }, + { + "epoch": 0.36487270254594906, + "grad_norm": 0.30048095987999945, + "learning_rate": 9.403196889667742e-06, + "loss": 0.2501, + "step": 4561 + }, + { + "epoch": 0.3649527009459811, + "grad_norm": 0.23580487420531285, + "learning_rate": 9.40288995657335e-06, + "loss": 0.3333, + "step": 4562 + }, + { + "epoch": 0.36503269934601307, + "grad_norm": 0.24036459139326682, + "learning_rate": 9.402582949584122e-06, + "loss": 0.3071, + "step": 4563 + }, + { + "epoch": 0.3651126977460451, + "grad_norm": 0.2834537840227493, + "learning_rate": 9.40227586870521e-06, + "loss": 0.2914, + "step": 4564 + }, + { + "epoch": 0.3651926961460771, + "grad_norm": 0.29027014858303357, + "learning_rate": 9.40196871394177e-06, + "loss": 0.2916, + "step": 4565 + }, + { + "epoch": 0.36527269454610906, + "grad_norm": 0.31099566410256013, + "learning_rate": 9.401661485298954e-06, + "loss": 0.2968, + "step": 4566 + }, + { + "epoch": 0.3653526929461411, + "grad_norm": 1.1118475152143055, + "learning_rate": 9.401354182781921e-06, + "loss": 0.287, + "step": 4567 + }, + { + "epoch": 0.36543269134617307, + "grad_norm": 0.2953870570671785, + "learning_rate": 9.401046806395826e-06, + "loss": 0.2497, + "step": 4568 + }, + { + "epoch": 0.3655126897462051, + "grad_norm": 0.35156429373183057, + "learning_rate": 9.400739356145829e-06, + "loss": 0.2973, + "step": 4569 + }, + { + "epoch": 0.3655926881462371, + "grad_norm": 0.26587914941390806, + "learning_rate": 9.400431832037092e-06, + "loss": 0.2926, + "step": 4570 + }, + { + "epoch": 0.36567268654626905, + "grad_norm": 0.328839933375101, + "learning_rate": 9.400124234074772e-06, + "loss": 0.2712, + "step": 4571 + }, + { + "epoch": 0.3657526849463011, + "grad_norm": 0.2725587201403524, + "learning_rate": 9.399816562264034e-06, + "loss": 0.2986, + "step": 4572 + }, + { + "epoch": 0.36583268334633307, + "grad_norm": 0.3207195480820183, + "learning_rate": 9.399508816610042e-06, + "loss": 0.2612, + "step": 4573 + }, + { + "epoch": 0.3659126817463651, + "grad_norm": 0.27002140934801894, + "learning_rate": 9.399200997117961e-06, + "loss": 0.2892, + "step": 4574 + }, + { + "epoch": 0.3659926801463971, + "grad_norm": 0.27648152829037176, + "learning_rate": 9.398893103792956e-06, + "loss": 0.3107, + "step": 4575 + }, + { + "epoch": 0.36607267854642905, + "grad_norm": 0.3205082329959992, + "learning_rate": 9.398585136640195e-06, + "loss": 0.2802, + "step": 4576 + }, + { + "epoch": 0.3661526769464611, + "grad_norm": 0.25280020449268564, + "learning_rate": 9.398277095664848e-06, + "loss": 0.308, + "step": 4577 + }, + { + "epoch": 0.36623267534649306, + "grad_norm": 0.5656160672890643, + "learning_rate": 9.397968980872082e-06, + "loss": 0.2627, + "step": 4578 + }, + { + "epoch": 0.3663126737465251, + "grad_norm": 0.2994260119529522, + "learning_rate": 9.397660792267072e-06, + "loss": 0.3002, + "step": 4579 + }, + { + "epoch": 0.3663926721465571, + "grad_norm": 0.2949646105190192, + "learning_rate": 9.397352529854987e-06, + "loss": 0.2621, + "step": 4580 + }, + { + "epoch": 0.36647267054658905, + "grad_norm": 0.2913781511531592, + "learning_rate": 9.397044193641e-06, + "loss": 0.2657, + "step": 4581 + }, + { + "epoch": 0.3665526689466211, + "grad_norm": 0.2866843539078499, + "learning_rate": 9.39673578363029e-06, + "loss": 0.2901, + "step": 4582 + }, + { + "epoch": 0.36663266734665306, + "grad_norm": 0.26608195849976635, + "learning_rate": 9.396427299828033e-06, + "loss": 0.2932, + "step": 4583 + }, + { + "epoch": 0.36671266574668504, + "grad_norm": 0.30598439570764546, + "learning_rate": 9.396118742239402e-06, + "loss": 0.2559, + "step": 4584 + }, + { + "epoch": 0.3667926641467171, + "grad_norm": 0.2269731161375952, + "learning_rate": 9.395810110869579e-06, + "loss": 0.3301, + "step": 4585 + }, + { + "epoch": 0.36687266254674905, + "grad_norm": 0.29815261575523905, + "learning_rate": 9.395501405723741e-06, + "loss": 0.2811, + "step": 4586 + }, + { + "epoch": 0.3669526609467811, + "grad_norm": 0.32769361339555636, + "learning_rate": 9.395192626807072e-06, + "loss": 0.2867, + "step": 4587 + }, + { + "epoch": 0.36703265934681306, + "grad_norm": 0.281090671303096, + "learning_rate": 9.394883774124755e-06, + "loss": 0.2941, + "step": 4588 + }, + { + "epoch": 0.36711265774684504, + "grad_norm": 0.31503907542869636, + "learning_rate": 9.39457484768197e-06, + "loss": 0.262, + "step": 4589 + }, + { + "epoch": 0.36719265614687707, + "grad_norm": 0.23001304698108144, + "learning_rate": 9.394265847483903e-06, + "loss": 0.3111, + "step": 4590 + }, + { + "epoch": 0.36727265454690905, + "grad_norm": 0.266132330707941, + "learning_rate": 9.393956773535742e-06, + "loss": 0.3088, + "step": 4591 + }, + { + "epoch": 0.3673526529469411, + "grad_norm": 0.3010013911452164, + "learning_rate": 9.393647625842671e-06, + "loss": 0.2875, + "step": 4592 + }, + { + "epoch": 0.36743265134697306, + "grad_norm": 0.26990632336033993, + "learning_rate": 9.393338404409881e-06, + "loss": 0.2904, + "step": 4593 + }, + { + "epoch": 0.36751264974700504, + "grad_norm": 0.2812717508214428, + "learning_rate": 9.393029109242562e-06, + "loss": 0.307, + "step": 4594 + }, + { + "epoch": 0.36759264814703707, + "grad_norm": 0.27825922686465965, + "learning_rate": 9.392719740345904e-06, + "loss": 0.2909, + "step": 4595 + }, + { + "epoch": 0.36767264654706905, + "grad_norm": 0.25781478149280096, + "learning_rate": 9.392410297725099e-06, + "loss": 0.2827, + "step": 4596 + }, + { + "epoch": 0.3677526449471011, + "grad_norm": 0.2722726354463833, + "learning_rate": 9.39210078138534e-06, + "loss": 0.2637, + "step": 4597 + }, + { + "epoch": 0.36783264334713306, + "grad_norm": 0.3038452940855688, + "learning_rate": 9.391791191331823e-06, + "loss": 0.2786, + "step": 4598 + }, + { + "epoch": 0.36791264174716504, + "grad_norm": 0.33953679487083954, + "learning_rate": 9.391481527569744e-06, + "loss": 0.2673, + "step": 4599 + }, + { + "epoch": 0.36799264014719707, + "grad_norm": 0.32923474393822805, + "learning_rate": 9.391171790104298e-06, + "loss": 0.2721, + "step": 4600 + }, + { + "epoch": 0.36807263854722905, + "grad_norm": 0.24579890453095807, + "learning_rate": 9.390861978940687e-06, + "loss": 0.3033, + "step": 4601 + }, + { + "epoch": 0.3681526369472611, + "grad_norm": 0.2965533628193658, + "learning_rate": 9.390552094084107e-06, + "loss": 0.2508, + "step": 4602 + }, + { + "epoch": 0.36823263534729306, + "grad_norm": 0.2858351805187073, + "learning_rate": 9.390242135539761e-06, + "loss": 0.2756, + "step": 4603 + }, + { + "epoch": 0.36831263374732504, + "grad_norm": 0.29294292261091875, + "learning_rate": 9.389932103312851e-06, + "loss": 0.2842, + "step": 4604 + }, + { + "epoch": 0.36839263214735707, + "grad_norm": 0.2958023675705473, + "learning_rate": 9.38962199740858e-06, + "loss": 0.2725, + "step": 4605 + }, + { + "epoch": 0.36847263054738905, + "grad_norm": 0.34806272913676906, + "learning_rate": 9.389311817832152e-06, + "loss": 0.2734, + "step": 4606 + }, + { + "epoch": 0.3685526289474211, + "grad_norm": 0.30086264500418197, + "learning_rate": 9.389001564588773e-06, + "loss": 0.2521, + "step": 4607 + }, + { + "epoch": 0.36863262734745306, + "grad_norm": 0.2983952640539182, + "learning_rate": 9.38869123768365e-06, + "loss": 0.3084, + "step": 4608 + }, + { + "epoch": 0.36871262574748503, + "grad_norm": 0.33458720362553473, + "learning_rate": 9.388380837121993e-06, + "loss": 0.2553, + "step": 4609 + }, + { + "epoch": 0.36879262414751707, + "grad_norm": 0.28435329925715064, + "learning_rate": 9.38807036290901e-06, + "loss": 0.2631, + "step": 4610 + }, + { + "epoch": 0.36887262254754905, + "grad_norm": 0.3053038317674436, + "learning_rate": 9.387759815049911e-06, + "loss": 0.2878, + "step": 4611 + }, + { + "epoch": 0.368952620947581, + "grad_norm": 0.31656285022079794, + "learning_rate": 9.38744919354991e-06, + "loss": 0.2767, + "step": 4612 + }, + { + "epoch": 0.36903261934761306, + "grad_norm": 0.24045599706988152, + "learning_rate": 9.38713849841422e-06, + "loss": 0.3293, + "step": 4613 + }, + { + "epoch": 0.36911261774764503, + "grad_norm": 0.23785717164441986, + "learning_rate": 9.386827729648052e-06, + "loss": 0.3306, + "step": 4614 + }, + { + "epoch": 0.36919261614767707, + "grad_norm": 0.4180508952900747, + "learning_rate": 9.386516887256627e-06, + "loss": 0.2738, + "step": 4615 + }, + { + "epoch": 0.36927261454770904, + "grad_norm": 0.28791609003000845, + "learning_rate": 9.386205971245157e-06, + "loss": 0.2713, + "step": 4616 + }, + { + "epoch": 0.369352612947741, + "grad_norm": 0.23270118379546492, + "learning_rate": 9.385894981618866e-06, + "loss": 0.3144, + "step": 4617 + }, + { + "epoch": 0.36943261134777305, + "grad_norm": 0.2871485491329268, + "learning_rate": 9.385583918382966e-06, + "loss": 0.2516, + "step": 4618 + }, + { + "epoch": 0.36951260974780503, + "grad_norm": 0.31839088302898083, + "learning_rate": 9.385272781542686e-06, + "loss": 0.2677, + "step": 4619 + }, + { + "epoch": 0.36959260814783707, + "grad_norm": 0.26656999981867596, + "learning_rate": 9.38496157110324e-06, + "loss": 0.2923, + "step": 4620 + }, + { + "epoch": 0.36967260654786904, + "grad_norm": 0.3536963266953804, + "learning_rate": 9.384650287069856e-06, + "loss": 0.2877, + "step": 4621 + }, + { + "epoch": 0.369752604947901, + "grad_norm": 0.3280761720243504, + "learning_rate": 9.384338929447755e-06, + "loss": 0.2517, + "step": 4622 + }, + { + "epoch": 0.36983260334793305, + "grad_norm": 0.3088768793405493, + "learning_rate": 9.384027498242168e-06, + "loss": 0.2774, + "step": 4623 + }, + { + "epoch": 0.36991260174796503, + "grad_norm": 0.2673392939817972, + "learning_rate": 9.383715993458315e-06, + "loss": 0.3176, + "step": 4624 + }, + { + "epoch": 0.36999260014799706, + "grad_norm": 0.32159499563859906, + "learning_rate": 9.38340441510143e-06, + "loss": 0.3043, + "step": 4625 + }, + { + "epoch": 0.37007259854802904, + "grad_norm": 0.27489130810490914, + "learning_rate": 9.38309276317674e-06, + "loss": 0.3059, + "step": 4626 + }, + { + "epoch": 0.370152596948061, + "grad_norm": 0.26908686544227184, + "learning_rate": 9.382781037689475e-06, + "loss": 0.3332, + "step": 4627 + }, + { + "epoch": 0.37023259534809305, + "grad_norm": 0.25768581013228414, + "learning_rate": 9.382469238644864e-06, + "loss": 0.3133, + "step": 4628 + }, + { + "epoch": 0.37031259374812503, + "grad_norm": 0.6438835085732397, + "learning_rate": 9.382157366048146e-06, + "loss": 0.2875, + "step": 4629 + }, + { + "epoch": 0.37039259214815706, + "grad_norm": 0.26815661234512994, + "learning_rate": 9.381845419904554e-06, + "loss": 0.2851, + "step": 4630 + }, + { + "epoch": 0.37047259054818904, + "grad_norm": 0.3022504065570971, + "learning_rate": 9.381533400219319e-06, + "loss": 0.2688, + "step": 4631 + }, + { + "epoch": 0.370552588948221, + "grad_norm": 0.2814119878840409, + "learning_rate": 9.381221306997681e-06, + "loss": 0.3021, + "step": 4632 + }, + { + "epoch": 0.37063258734825305, + "grad_norm": 0.31391623319388456, + "learning_rate": 9.380909140244878e-06, + "loss": 0.2896, + "step": 4633 + }, + { + "epoch": 0.37071258574828503, + "grad_norm": 0.30656268979820234, + "learning_rate": 9.380596899966147e-06, + "loss": 0.2909, + "step": 4634 + }, + { + "epoch": 0.370792584148317, + "grad_norm": 0.30795398449937783, + "learning_rate": 9.380284586166732e-06, + "loss": 0.2588, + "step": 4635 + }, + { + "epoch": 0.37087258254834904, + "grad_norm": 0.2849359022624659, + "learning_rate": 9.379972198851874e-06, + "loss": 0.2951, + "step": 4636 + }, + { + "epoch": 0.370952580948381, + "grad_norm": 0.30166104848763414, + "learning_rate": 9.379659738026812e-06, + "loss": 0.2399, + "step": 4637 + }, + { + "epoch": 0.37103257934841305, + "grad_norm": 0.28771611518273776, + "learning_rate": 9.379347203696794e-06, + "loss": 0.3038, + "step": 4638 + }, + { + "epoch": 0.371112577748445, + "grad_norm": 0.2763800709296993, + "learning_rate": 9.379034595867062e-06, + "loss": 0.2854, + "step": 4639 + }, + { + "epoch": 0.371192576148477, + "grad_norm": 0.3378184812728501, + "learning_rate": 9.378721914542867e-06, + "loss": 0.3205, + "step": 4640 + }, + { + "epoch": 0.37127257454850904, + "grad_norm": 0.2722964242389891, + "learning_rate": 9.378409159729454e-06, + "loss": 0.2933, + "step": 4641 + }, + { + "epoch": 0.371352572948541, + "grad_norm": 0.2568437551492094, + "learning_rate": 9.378096331432071e-06, + "loss": 0.2871, + "step": 4642 + }, + { + "epoch": 0.37143257134857305, + "grad_norm": 0.31141779802266084, + "learning_rate": 9.37778342965597e-06, + "loss": 0.2472, + "step": 4643 + }, + { + "epoch": 0.371512569748605, + "grad_norm": 0.30164303557939237, + "learning_rate": 9.377470454406404e-06, + "loss": 0.2997, + "step": 4644 + }, + { + "epoch": 0.371592568148637, + "grad_norm": 0.25955987991021784, + "learning_rate": 9.377157405688622e-06, + "loss": 0.2978, + "step": 4645 + }, + { + "epoch": 0.37167256654866904, + "grad_norm": 0.2859636698434837, + "learning_rate": 9.37684428350788e-06, + "loss": 0.2986, + "step": 4646 + }, + { + "epoch": 0.371752564948701, + "grad_norm": 0.31251807093040984, + "learning_rate": 9.376531087869435e-06, + "loss": 0.2628, + "step": 4647 + }, + { + "epoch": 0.37183256334873305, + "grad_norm": 0.27210950476140455, + "learning_rate": 9.37621781877854e-06, + "loss": 0.2627, + "step": 4648 + }, + { + "epoch": 0.371912561748765, + "grad_norm": 0.25472684066751045, + "learning_rate": 9.375904476240457e-06, + "loss": 0.3423, + "step": 4649 + }, + { + "epoch": 0.371992560148797, + "grad_norm": 0.3815297294720448, + "learning_rate": 9.375591060260439e-06, + "loss": 0.279, + "step": 4650 + }, + { + "epoch": 0.37207255854882904, + "grad_norm": 0.26981390166274777, + "learning_rate": 9.37527757084375e-06, + "loss": 0.2955, + "step": 4651 + }, + { + "epoch": 0.372152556948861, + "grad_norm": 0.2917857347964334, + "learning_rate": 9.374964007995651e-06, + "loss": 0.3084, + "step": 4652 + }, + { + "epoch": 0.37223255534889305, + "grad_norm": 0.3353738638730792, + "learning_rate": 9.374650371721405e-06, + "loss": 0.248, + "step": 4653 + }, + { + "epoch": 0.372312553748925, + "grad_norm": 0.37725142463909683, + "learning_rate": 9.374336662026274e-06, + "loss": 0.2767, + "step": 4654 + }, + { + "epoch": 0.372392552148957, + "grad_norm": 0.2800840724068355, + "learning_rate": 9.374022878915525e-06, + "loss": 0.2966, + "step": 4655 + }, + { + "epoch": 0.37247255054898903, + "grad_norm": 0.3134351148114533, + "learning_rate": 9.373709022394424e-06, + "loss": 0.2775, + "step": 4656 + }, + { + "epoch": 0.372552548949021, + "grad_norm": 0.2625896847014437, + "learning_rate": 9.373395092468238e-06, + "loss": 0.3059, + "step": 4657 + }, + { + "epoch": 0.37263254734905304, + "grad_norm": 0.28676629484884947, + "learning_rate": 9.373081089142235e-06, + "loss": 0.2651, + "step": 4658 + }, + { + "epoch": 0.372712545749085, + "grad_norm": 0.2782892596161019, + "learning_rate": 9.372767012421687e-06, + "loss": 0.2933, + "step": 4659 + }, + { + "epoch": 0.372792544149117, + "grad_norm": 0.28562559892708006, + "learning_rate": 9.372452862311862e-06, + "loss": 0.2725, + "step": 4660 + }, + { + "epoch": 0.37287254254914903, + "grad_norm": 0.26748997608943115, + "learning_rate": 9.372138638818036e-06, + "loss": 0.2871, + "step": 4661 + }, + { + "epoch": 0.372952540949181, + "grad_norm": 0.2796022726078067, + "learning_rate": 9.371824341945481e-06, + "loss": 0.3017, + "step": 4662 + }, + { + "epoch": 0.373032539349213, + "grad_norm": 0.29243598004845744, + "learning_rate": 9.371509971699471e-06, + "loss": 0.2608, + "step": 4663 + }, + { + "epoch": 0.373112537749245, + "grad_norm": 0.30200382632076933, + "learning_rate": 9.371195528085287e-06, + "loss": 0.2804, + "step": 4664 + }, + { + "epoch": 0.373192536149277, + "grad_norm": 0.28357839833790005, + "learning_rate": 9.370881011108198e-06, + "loss": 0.271, + "step": 4665 + }, + { + "epoch": 0.37327253454930903, + "grad_norm": 0.28153314449032313, + "learning_rate": 9.37056642077349e-06, + "loss": 0.281, + "step": 4666 + }, + { + "epoch": 0.373352532949341, + "grad_norm": 0.33634541104891535, + "learning_rate": 9.370251757086439e-06, + "loss": 0.2849, + "step": 4667 + }, + { + "epoch": 0.373432531349373, + "grad_norm": 0.3838292730602674, + "learning_rate": 9.369937020052329e-06, + "loss": 0.2898, + "step": 4668 + }, + { + "epoch": 0.373512529749405, + "grad_norm": 0.4765959671214792, + "learning_rate": 9.36962220967644e-06, + "loss": 0.2576, + "step": 4669 + }, + { + "epoch": 0.373592528149437, + "grad_norm": 0.30263956808660836, + "learning_rate": 9.369307325964054e-06, + "loss": 0.2495, + "step": 4670 + }, + { + "epoch": 0.37367252654946903, + "grad_norm": 0.3102388571932994, + "learning_rate": 9.36899236892046e-06, + "loss": 0.3129, + "step": 4671 + }, + { + "epoch": 0.373752524949501, + "grad_norm": 0.3326410603297145, + "learning_rate": 9.368677338550942e-06, + "loss": 0.2641, + "step": 4672 + }, + { + "epoch": 0.373832523349533, + "grad_norm": 0.23756099615171702, + "learning_rate": 9.368362234860785e-06, + "loss": 0.3134, + "step": 4673 + }, + { + "epoch": 0.373912521749565, + "grad_norm": 0.29886437655868503, + "learning_rate": 9.368047057855282e-06, + "loss": 0.2643, + "step": 4674 + }, + { + "epoch": 0.373992520149597, + "grad_norm": 0.3008738515193368, + "learning_rate": 9.36773180753972e-06, + "loss": 0.2577, + "step": 4675 + }, + { + "epoch": 0.37407251854962903, + "grad_norm": 0.2817795060185692, + "learning_rate": 9.367416483919387e-06, + "loss": 0.2957, + "step": 4676 + }, + { + "epoch": 0.374152516949661, + "grad_norm": 0.2834223276286296, + "learning_rate": 9.367101086999582e-06, + "loss": 0.2871, + "step": 4677 + }, + { + "epoch": 0.374232515349693, + "grad_norm": 0.2774136949114465, + "learning_rate": 9.366785616785594e-06, + "loss": 0.2872, + "step": 4678 + }, + { + "epoch": 0.374312513749725, + "grad_norm": 0.24100072942389075, + "learning_rate": 9.366470073282718e-06, + "loss": 0.3182, + "step": 4679 + }, + { + "epoch": 0.374392512149757, + "grad_norm": 0.26400384282979905, + "learning_rate": 9.36615445649625e-06, + "loss": 0.2941, + "step": 4680 + }, + { + "epoch": 0.37447251054978903, + "grad_norm": 0.29118389309736004, + "learning_rate": 9.365838766431487e-06, + "loss": 0.3117, + "step": 4681 + }, + { + "epoch": 0.374552508949821, + "grad_norm": 0.3264776656173028, + "learning_rate": 9.365523003093728e-06, + "loss": 0.3163, + "step": 4682 + }, + { + "epoch": 0.374632507349853, + "grad_norm": 0.32265954145840947, + "learning_rate": 9.365207166488273e-06, + "loss": 0.2599, + "step": 4683 + }, + { + "epoch": 0.374712505749885, + "grad_norm": 0.7269795164637523, + "learning_rate": 9.364891256620422e-06, + "loss": 0.2645, + "step": 4684 + }, + { + "epoch": 0.374792504149917, + "grad_norm": 0.258749341074798, + "learning_rate": 9.364575273495475e-06, + "loss": 0.2845, + "step": 4685 + }, + { + "epoch": 0.374872502549949, + "grad_norm": 0.22761247632207296, + "learning_rate": 9.364259217118738e-06, + "loss": 0.3267, + "step": 4686 + }, + { + "epoch": 0.374952500949981, + "grad_norm": 0.30801844316727534, + "learning_rate": 9.363943087495515e-06, + "loss": 0.2845, + "step": 4687 + }, + { + "epoch": 0.375032499350013, + "grad_norm": 0.277996690762652, + "learning_rate": 9.36362688463111e-06, + "loss": 0.2961, + "step": 4688 + }, + { + "epoch": 0.375112497750045, + "grad_norm": 0.2527342044511156, + "learning_rate": 9.363310608530834e-06, + "loss": 0.32, + "step": 4689 + }, + { + "epoch": 0.375192496150077, + "grad_norm": 0.33240444190574925, + "learning_rate": 9.362994259199988e-06, + "loss": 0.3094, + "step": 4690 + }, + { + "epoch": 0.37527249455010897, + "grad_norm": 0.2740657164953733, + "learning_rate": 9.36267783664389e-06, + "loss": 0.3145, + "step": 4691 + }, + { + "epoch": 0.375352492950141, + "grad_norm": 0.35567076802791736, + "learning_rate": 9.362361340867846e-06, + "loss": 0.2681, + "step": 4692 + }, + { + "epoch": 0.375432491350173, + "grad_norm": 0.24535327942781943, + "learning_rate": 9.362044771877164e-06, + "loss": 0.3252, + "step": 4693 + }, + { + "epoch": 0.375512489750205, + "grad_norm": 0.35322095468792813, + "learning_rate": 9.361728129677165e-06, + "loss": 0.2763, + "step": 4694 + }, + { + "epoch": 0.375592488150237, + "grad_norm": 0.24316242408252822, + "learning_rate": 9.361411414273159e-06, + "loss": 0.3189, + "step": 4695 + }, + { + "epoch": 0.37567248655026897, + "grad_norm": 0.26768463347269406, + "learning_rate": 9.36109462567046e-06, + "loss": 0.296, + "step": 4696 + }, + { + "epoch": 0.375752484950301, + "grad_norm": 0.31278961177299586, + "learning_rate": 9.360777763874389e-06, + "loss": 0.2562, + "step": 4697 + }, + { + "epoch": 0.375832483350333, + "grad_norm": 0.2817563764111623, + "learning_rate": 9.36046082889026e-06, + "loss": 0.2828, + "step": 4698 + }, + { + "epoch": 0.375912481750365, + "grad_norm": 0.263186110066437, + "learning_rate": 9.360143820723395e-06, + "loss": 0.3275, + "step": 4699 + }, + { + "epoch": 0.375992480150397, + "grad_norm": 0.2815778830397835, + "learning_rate": 9.359826739379113e-06, + "loss": 0.2858, + "step": 4700 + }, + { + "epoch": 0.37607247855042897, + "grad_norm": 0.2790346637194123, + "learning_rate": 9.359509584862735e-06, + "loss": 0.298, + "step": 4701 + }, + { + "epoch": 0.376152476950461, + "grad_norm": 0.32545118933124173, + "learning_rate": 9.359192357179587e-06, + "loss": 0.2691, + "step": 4702 + }, + { + "epoch": 0.376232475350493, + "grad_norm": 0.2637625876023099, + "learning_rate": 9.35887505633499e-06, + "loss": 0.2905, + "step": 4703 + }, + { + "epoch": 0.376312473750525, + "grad_norm": 0.2958015198850977, + "learning_rate": 9.35855768233427e-06, + "loss": 0.2717, + "step": 4704 + }, + { + "epoch": 0.376392472150557, + "grad_norm": 0.2905761974540743, + "learning_rate": 9.358240235182754e-06, + "loss": 0.2457, + "step": 4705 + }, + { + "epoch": 0.37647247055058897, + "grad_norm": 0.231397850701339, + "learning_rate": 9.35792271488577e-06, + "loss": 0.3329, + "step": 4706 + }, + { + "epoch": 0.376552468950621, + "grad_norm": 0.3078815372030194, + "learning_rate": 9.357605121448648e-06, + "loss": 0.2699, + "step": 4707 + }, + { + "epoch": 0.376632467350653, + "grad_norm": 0.3495728004871776, + "learning_rate": 9.357287454876715e-06, + "loss": 0.2713, + "step": 4708 + }, + { + "epoch": 0.376712465750685, + "grad_norm": 0.22970366538633463, + "learning_rate": 9.356969715175305e-06, + "loss": 0.3048, + "step": 4709 + }, + { + "epoch": 0.376792464150717, + "grad_norm": 0.27149828614080357, + "learning_rate": 9.35665190234975e-06, + "loss": 0.3096, + "step": 4710 + }, + { + "epoch": 0.37687246255074897, + "grad_norm": 0.2620521134406785, + "learning_rate": 9.356334016405383e-06, + "loss": 0.271, + "step": 4711 + }, + { + "epoch": 0.376952460950781, + "grad_norm": 0.29672659479994967, + "learning_rate": 9.356016057347543e-06, + "loss": 0.2541, + "step": 4712 + }, + { + "epoch": 0.377032459350813, + "grad_norm": 0.2845048269291872, + "learning_rate": 9.355698025181561e-06, + "loss": 0.2902, + "step": 4713 + }, + { + "epoch": 0.377112457750845, + "grad_norm": 0.22057444746997362, + "learning_rate": 9.35537991991278e-06, + "loss": 0.3522, + "step": 4714 + }, + { + "epoch": 0.377192456150877, + "grad_norm": 0.22117835595012575, + "learning_rate": 9.355061741546533e-06, + "loss": 0.3646, + "step": 4715 + }, + { + "epoch": 0.37727245455090896, + "grad_norm": 0.3257388823338327, + "learning_rate": 9.354743490088166e-06, + "loss": 0.2428, + "step": 4716 + }, + { + "epoch": 0.377352452950941, + "grad_norm": 0.3880830331739957, + "learning_rate": 9.354425165543018e-06, + "loss": 0.2936, + "step": 4717 + }, + { + "epoch": 0.377432451350973, + "grad_norm": 0.32389843297190934, + "learning_rate": 9.354106767916428e-06, + "loss": 0.2703, + "step": 4718 + }, + { + "epoch": 0.37751244975100495, + "grad_norm": 0.2698153318110809, + "learning_rate": 9.353788297213743e-06, + "loss": 0.3179, + "step": 4719 + }, + { + "epoch": 0.377592448151037, + "grad_norm": 0.21754816056131854, + "learning_rate": 9.353469753440309e-06, + "loss": 0.3271, + "step": 4720 + }, + { + "epoch": 0.37767244655106896, + "grad_norm": 0.31543811059093235, + "learning_rate": 9.35315113660147e-06, + "loss": 0.2854, + "step": 4721 + }, + { + "epoch": 0.377752444951101, + "grad_norm": 0.25162407199770276, + "learning_rate": 9.352832446702578e-06, + "loss": 0.2778, + "step": 4722 + }, + { + "epoch": 0.377832443351133, + "grad_norm": 0.29843650840868496, + "learning_rate": 9.352513683748974e-06, + "loss": 0.2866, + "step": 4723 + }, + { + "epoch": 0.37791244175116495, + "grad_norm": 0.596364596970844, + "learning_rate": 9.352194847746014e-06, + "loss": 0.2687, + "step": 4724 + }, + { + "epoch": 0.377992440151197, + "grad_norm": 0.25959831343278256, + "learning_rate": 9.351875938699045e-06, + "loss": 0.309, + "step": 4725 + }, + { + "epoch": 0.37807243855122896, + "grad_norm": 0.3029693451069674, + "learning_rate": 9.351556956613423e-06, + "loss": 0.2676, + "step": 4726 + }, + { + "epoch": 0.378152436951261, + "grad_norm": 0.25403835411972375, + "learning_rate": 9.351237901494498e-06, + "loss": 0.293, + "step": 4727 + }, + { + "epoch": 0.378232435351293, + "grad_norm": 0.26665258234736167, + "learning_rate": 9.35091877334763e-06, + "loss": 0.2974, + "step": 4728 + }, + { + "epoch": 0.37831243375132495, + "grad_norm": 0.41882408645407493, + "learning_rate": 9.35059957217817e-06, + "loss": 0.2567, + "step": 4729 + }, + { + "epoch": 0.378392432151357, + "grad_norm": 0.29538992489034543, + "learning_rate": 9.350280297991476e-06, + "loss": 0.2687, + "step": 4730 + }, + { + "epoch": 0.37847243055138896, + "grad_norm": 0.2546291437813796, + "learning_rate": 9.349960950792907e-06, + "loss": 0.2814, + "step": 4731 + }, + { + "epoch": 0.378552428951421, + "grad_norm": 0.25337431419742307, + "learning_rate": 9.349641530587825e-06, + "loss": 0.2892, + "step": 4732 + }, + { + "epoch": 0.37863242735145297, + "grad_norm": 0.30055212740347037, + "learning_rate": 9.349322037381587e-06, + "loss": 0.2719, + "step": 4733 + }, + { + "epoch": 0.37871242575148495, + "grad_norm": 0.4418382151695005, + "learning_rate": 9.349002471179558e-06, + "loss": 0.2846, + "step": 4734 + }, + { + "epoch": 0.378792424151517, + "grad_norm": 0.2603518449678616, + "learning_rate": 9.348682831987101e-06, + "loss": 0.2988, + "step": 4735 + }, + { + "epoch": 0.37887242255154896, + "grad_norm": 0.2824622437105871, + "learning_rate": 9.34836311980958e-06, + "loss": 0.3015, + "step": 4736 + }, + { + "epoch": 0.378952420951581, + "grad_norm": 0.300313889548431, + "learning_rate": 9.348043334652362e-06, + "loss": 0.2897, + "step": 4737 + }, + { + "epoch": 0.37903241935161297, + "grad_norm": 0.27703618777885997, + "learning_rate": 9.34772347652081e-06, + "loss": 0.2478, + "step": 4738 + }, + { + "epoch": 0.37911241775164495, + "grad_norm": 0.28473989189781346, + "learning_rate": 9.347403545420298e-06, + "loss": 0.2892, + "step": 4739 + }, + { + "epoch": 0.379192416151677, + "grad_norm": 0.269067828157466, + "learning_rate": 9.34708354135619e-06, + "loss": 0.2774, + "step": 4740 + }, + { + "epoch": 0.37927241455170896, + "grad_norm": 0.28976003123172994, + "learning_rate": 9.346763464333862e-06, + "loss": 0.3331, + "step": 4741 + }, + { + "epoch": 0.379352412951741, + "grad_norm": 0.25898029910543874, + "learning_rate": 9.346443314358682e-06, + "loss": 0.2785, + "step": 4742 + }, + { + "epoch": 0.37943241135177297, + "grad_norm": 0.3052295156775338, + "learning_rate": 9.346123091436024e-06, + "loss": 0.2651, + "step": 4743 + }, + { + "epoch": 0.37951240975180495, + "grad_norm": 0.2660681865859992, + "learning_rate": 9.345802795571262e-06, + "loss": 0.3058, + "step": 4744 + }, + { + "epoch": 0.379592408151837, + "grad_norm": 0.27871173540684635, + "learning_rate": 9.345482426769774e-06, + "loss": 0.2864, + "step": 4745 + }, + { + "epoch": 0.37967240655186896, + "grad_norm": 0.27726904412541825, + "learning_rate": 9.345161985036937e-06, + "loss": 0.3096, + "step": 4746 + }, + { + "epoch": 0.37975240495190093, + "grad_norm": 0.27014684741515416, + "learning_rate": 9.344841470378125e-06, + "loss": 0.301, + "step": 4747 + }, + { + "epoch": 0.37983240335193297, + "grad_norm": 0.25929576152687656, + "learning_rate": 9.34452088279872e-06, + "loss": 0.2981, + "step": 4748 + }, + { + "epoch": 0.37991240175196495, + "grad_norm": 0.2899825999096309, + "learning_rate": 9.344200222304103e-06, + "loss": 0.259, + "step": 4749 + }, + { + "epoch": 0.379992400151997, + "grad_norm": 0.3284273014537563, + "learning_rate": 9.343879488899653e-06, + "loss": 0.2802, + "step": 4750 + }, + { + "epoch": 0.38007239855202896, + "grad_norm": 0.2919617350679286, + "learning_rate": 9.343558682590757e-06, + "loss": 0.2661, + "step": 4751 + }, + { + "epoch": 0.38015239695206093, + "grad_norm": 0.34368748087266526, + "learning_rate": 9.343237803382793e-06, + "loss": 0.3162, + "step": 4752 + }, + { + "epoch": 0.38023239535209297, + "grad_norm": 0.2508286002948896, + "learning_rate": 9.342916851281155e-06, + "loss": 0.3335, + "step": 4753 + }, + { + "epoch": 0.38031239375212494, + "grad_norm": 0.2997957584435135, + "learning_rate": 9.342595826291224e-06, + "loss": 0.2657, + "step": 4754 + }, + { + "epoch": 0.380392392152157, + "grad_norm": 0.26815781727109483, + "learning_rate": 9.342274728418388e-06, + "loss": 0.2999, + "step": 4755 + }, + { + "epoch": 0.38047239055218895, + "grad_norm": 0.34466872943359234, + "learning_rate": 9.341953557668037e-06, + "loss": 0.3076, + "step": 4756 + }, + { + "epoch": 0.38055238895222093, + "grad_norm": 0.24191439992024608, + "learning_rate": 9.34163231404556e-06, + "loss": 0.3125, + "step": 4757 + }, + { + "epoch": 0.38063238735225297, + "grad_norm": 0.31746459168290464, + "learning_rate": 9.341310997556352e-06, + "loss": 0.2435, + "step": 4758 + }, + { + "epoch": 0.38071238575228494, + "grad_norm": 0.31418512955237227, + "learning_rate": 9.340989608205803e-06, + "loss": 0.2568, + "step": 4759 + }, + { + "epoch": 0.380792384152317, + "grad_norm": 0.31047703324609527, + "learning_rate": 9.340668145999308e-06, + "loss": 0.2545, + "step": 4760 + }, + { + "epoch": 0.38087238255234895, + "grad_norm": 0.24510803859178262, + "learning_rate": 9.34034661094226e-06, + "loss": 0.3151, + "step": 4761 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.3251268238716554, + "learning_rate": 9.340025003040056e-06, + "loss": 0.2543, + "step": 4762 + }, + { + "epoch": 0.38103237935241296, + "grad_norm": 0.2731619054969784, + "learning_rate": 9.339703322298098e-06, + "loss": 0.3051, + "step": 4763 + }, + { + "epoch": 0.38111237775244494, + "grad_norm": 0.2745346752430434, + "learning_rate": 9.339381568721779e-06, + "loss": 0.2787, + "step": 4764 + }, + { + "epoch": 0.381192376152477, + "grad_norm": 0.2863443760905175, + "learning_rate": 9.339059742316501e-06, + "loss": 0.2981, + "step": 4765 + }, + { + "epoch": 0.38127237455250895, + "grad_norm": 0.28992790021659914, + "learning_rate": 9.338737843087668e-06, + "loss": 0.2973, + "step": 4766 + }, + { + "epoch": 0.38135237295254093, + "grad_norm": 0.2609971415471059, + "learning_rate": 9.33841587104068e-06, + "loss": 0.3037, + "step": 4767 + }, + { + "epoch": 0.38143237135257296, + "grad_norm": 0.31522470261860935, + "learning_rate": 9.338093826180941e-06, + "loss": 0.2665, + "step": 4768 + }, + { + "epoch": 0.38151236975260494, + "grad_norm": 0.2380875622384156, + "learning_rate": 9.337771708513854e-06, + "loss": 0.3214, + "step": 4769 + }, + { + "epoch": 0.381592368152637, + "grad_norm": 0.28630067401138737, + "learning_rate": 9.33744951804483e-06, + "loss": 0.2597, + "step": 4770 + }, + { + "epoch": 0.38167236655266895, + "grad_norm": 0.3406852723299316, + "learning_rate": 9.337127254779272e-06, + "loss": 0.2905, + "step": 4771 + }, + { + "epoch": 0.38175236495270093, + "grad_norm": 0.3384121103458429, + "learning_rate": 9.336804918722591e-06, + "loss": 0.2524, + "step": 4772 + }, + { + "epoch": 0.38183236335273296, + "grad_norm": 0.3470554452790307, + "learning_rate": 9.336482509880195e-06, + "loss": 0.2645, + "step": 4773 + }, + { + "epoch": 0.38191236175276494, + "grad_norm": 0.3155539209132383, + "learning_rate": 9.3361600282575e-06, + "loss": 0.265, + "step": 4774 + }, + { + "epoch": 0.3819923601527969, + "grad_norm": 0.23843930232112603, + "learning_rate": 9.33583747385991e-06, + "loss": 0.3386, + "step": 4775 + }, + { + "epoch": 0.38207235855282895, + "grad_norm": 0.3075765198465915, + "learning_rate": 9.335514846692846e-06, + "loss": 0.2898, + "step": 4776 + }, + { + "epoch": 0.3821523569528609, + "grad_norm": 0.29309647006356254, + "learning_rate": 9.335192146761717e-06, + "loss": 0.3079, + "step": 4777 + }, + { + "epoch": 0.38223235535289296, + "grad_norm": 0.270291133434768, + "learning_rate": 9.334869374071945e-06, + "loss": 0.2936, + "step": 4778 + }, + { + "epoch": 0.38231235375292494, + "grad_norm": 0.3058053264694791, + "learning_rate": 9.334546528628942e-06, + "loss": 0.266, + "step": 4779 + }, + { + "epoch": 0.3823923521529569, + "grad_norm": 0.3229559882313754, + "learning_rate": 9.334223610438128e-06, + "loss": 0.303, + "step": 4780 + }, + { + "epoch": 0.38247235055298895, + "grad_norm": 0.29967475524978815, + "learning_rate": 9.333900619504923e-06, + "loss": 0.2605, + "step": 4781 + }, + { + "epoch": 0.3825523489530209, + "grad_norm": 0.3083874559674714, + "learning_rate": 9.333577555834748e-06, + "loss": 0.251, + "step": 4782 + }, + { + "epoch": 0.38263234735305296, + "grad_norm": 0.21337789665920695, + "learning_rate": 9.333254419433026e-06, + "loss": 0.3514, + "step": 4783 + }, + { + "epoch": 0.38271234575308494, + "grad_norm": 0.30435586005980725, + "learning_rate": 9.332931210305179e-06, + "loss": 0.304, + "step": 4784 + }, + { + "epoch": 0.3827923441531169, + "grad_norm": 0.2800480942725592, + "learning_rate": 9.332607928456629e-06, + "loss": 0.2981, + "step": 4785 + }, + { + "epoch": 0.38287234255314895, + "grad_norm": 0.30226079268804074, + "learning_rate": 9.332284573892808e-06, + "loss": 0.2709, + "step": 4786 + }, + { + "epoch": 0.3829523409531809, + "grad_norm": 0.2576230028155306, + "learning_rate": 9.331961146619135e-06, + "loss": 0.3389, + "step": 4787 + }, + { + "epoch": 0.38303233935321296, + "grad_norm": 0.30086300414631717, + "learning_rate": 9.331637646641046e-06, + "loss": 0.2564, + "step": 4788 + }, + { + "epoch": 0.38311233775324494, + "grad_norm": 0.37654417396874174, + "learning_rate": 9.331314073963964e-06, + "loss": 0.3092, + "step": 4789 + }, + { + "epoch": 0.3831923361532769, + "grad_norm": 0.2877783763365303, + "learning_rate": 9.330990428593325e-06, + "loss": 0.2921, + "step": 4790 + }, + { + "epoch": 0.38327233455330895, + "grad_norm": 0.3045063611953543, + "learning_rate": 9.330666710534556e-06, + "loss": 0.2637, + "step": 4791 + }, + { + "epoch": 0.3833523329533409, + "grad_norm": 0.2995294704173348, + "learning_rate": 9.330342919793094e-06, + "loss": 0.2926, + "step": 4792 + }, + { + "epoch": 0.38343233135337296, + "grad_norm": 0.29421488279132874, + "learning_rate": 9.33001905637437e-06, + "loss": 0.2806, + "step": 4793 + }, + { + "epoch": 0.38351232975340493, + "grad_norm": 0.28791206120151136, + "learning_rate": 9.329695120283823e-06, + "loss": 0.2886, + "step": 4794 + }, + { + "epoch": 0.3835923281534369, + "grad_norm": 0.3192830024315258, + "learning_rate": 9.329371111526887e-06, + "loss": 0.2452, + "step": 4795 + }, + { + "epoch": 0.38367232655346895, + "grad_norm": 0.2848780734565408, + "learning_rate": 9.329047030109e-06, + "loss": 0.3035, + "step": 4796 + }, + { + "epoch": 0.3837523249535009, + "grad_norm": 0.2534543278102248, + "learning_rate": 9.3287228760356e-06, + "loss": 0.3271, + "step": 4797 + }, + { + "epoch": 0.38383232335353296, + "grad_norm": 0.25729605805652633, + "learning_rate": 9.328398649312133e-06, + "loss": 0.3236, + "step": 4798 + }, + { + "epoch": 0.38391232175356493, + "grad_norm": 0.26936544930140294, + "learning_rate": 9.328074349944034e-06, + "loss": 0.3158, + "step": 4799 + }, + { + "epoch": 0.3839923201535969, + "grad_norm": 0.3200630203115815, + "learning_rate": 9.32774997793675e-06, + "loss": 0.2673, + "step": 4800 + }, + { + "epoch": 0.38407231855362894, + "grad_norm": 0.3333604174838229, + "learning_rate": 9.327425533295725e-06, + "loss": 0.2784, + "step": 4801 + }, + { + "epoch": 0.3841523169536609, + "grad_norm": 0.2988090436097877, + "learning_rate": 9.327101016026399e-06, + "loss": 0.2914, + "step": 4802 + }, + { + "epoch": 0.3842323153536929, + "grad_norm": 0.3118017503599925, + "learning_rate": 9.326776426134223e-06, + "loss": 0.2906, + "step": 4803 + }, + { + "epoch": 0.38431231375372493, + "grad_norm": 0.25897457823542364, + "learning_rate": 9.326451763624646e-06, + "loss": 0.3193, + "step": 4804 + }, + { + "epoch": 0.3843923121537569, + "grad_norm": 0.29288286463856394, + "learning_rate": 9.326127028503114e-06, + "loss": 0.2742, + "step": 4805 + }, + { + "epoch": 0.38447231055378894, + "grad_norm": 0.27391998850846144, + "learning_rate": 9.325802220775077e-06, + "loss": 0.2315, + "step": 4806 + }, + { + "epoch": 0.3845523089538209, + "grad_norm": 0.30615986242623516, + "learning_rate": 9.325477340445989e-06, + "loss": 0.265, + "step": 4807 + }, + { + "epoch": 0.3846323073538529, + "grad_norm": 0.28735743237106043, + "learning_rate": 9.325152387521299e-06, + "loss": 0.2947, + "step": 4808 + }, + { + "epoch": 0.38471230575388493, + "grad_norm": 0.2947071120597362, + "learning_rate": 9.324827362006464e-06, + "loss": 0.3043, + "step": 4809 + }, + { + "epoch": 0.3847923041539169, + "grad_norm": 0.27303823592032184, + "learning_rate": 9.324502263906937e-06, + "loss": 0.2983, + "step": 4810 + }, + { + "epoch": 0.38487230255394894, + "grad_norm": 0.3600443270894972, + "learning_rate": 9.324177093228175e-06, + "loss": 0.2928, + "step": 4811 + }, + { + "epoch": 0.3849523009539809, + "grad_norm": 0.6103823939963011, + "learning_rate": 9.323851849975634e-06, + "loss": 0.2615, + "step": 4812 + }, + { + "epoch": 0.3850322993540129, + "grad_norm": 0.3176669946313247, + "learning_rate": 9.323526534154775e-06, + "loss": 0.2535, + "step": 4813 + }, + { + "epoch": 0.38511229775404493, + "grad_norm": 0.27867126449888346, + "learning_rate": 9.323201145771057e-06, + "loss": 0.2875, + "step": 4814 + }, + { + "epoch": 0.3851922961540769, + "grad_norm": 0.2930399366937355, + "learning_rate": 9.32287568482994e-06, + "loss": 0.2871, + "step": 4815 + }, + { + "epoch": 0.38527229455410894, + "grad_norm": 0.26532325107896376, + "learning_rate": 9.322550151336887e-06, + "loss": 0.3029, + "step": 4816 + }, + { + "epoch": 0.3853522929541409, + "grad_norm": 0.30378630507854376, + "learning_rate": 9.322224545297363e-06, + "loss": 0.2593, + "step": 4817 + }, + { + "epoch": 0.3854322913541729, + "grad_norm": 0.35984160955497496, + "learning_rate": 9.32189886671683e-06, + "loss": 0.2667, + "step": 4818 + }, + { + "epoch": 0.38551228975420493, + "grad_norm": 0.27065720026061196, + "learning_rate": 9.321573115600755e-06, + "loss": 0.2823, + "step": 4819 + }, + { + "epoch": 0.3855922881542369, + "grad_norm": 0.29763623061560496, + "learning_rate": 9.321247291954606e-06, + "loss": 0.2514, + "step": 4820 + }, + { + "epoch": 0.38567228655426894, + "grad_norm": 0.27741306543951094, + "learning_rate": 9.32092139578385e-06, + "loss": 0.2846, + "step": 4821 + }, + { + "epoch": 0.3857522849543009, + "grad_norm": 0.2972512531668461, + "learning_rate": 9.32059542709396e-06, + "loss": 0.2672, + "step": 4822 + }, + { + "epoch": 0.3858322833543329, + "grad_norm": 0.3130207178114284, + "learning_rate": 9.3202693858904e-06, + "loss": 0.2742, + "step": 4823 + }, + { + "epoch": 0.3859122817543649, + "grad_norm": 0.3023261789103395, + "learning_rate": 9.319943272178648e-06, + "loss": 0.2625, + "step": 4824 + }, + { + "epoch": 0.3859922801543969, + "grad_norm": 0.2843282128055772, + "learning_rate": 9.319617085964177e-06, + "loss": 0.3175, + "step": 4825 + }, + { + "epoch": 0.38607227855442894, + "grad_norm": 0.8494740364278741, + "learning_rate": 9.31929082725246e-06, + "loss": 0.2673, + "step": 4826 + }, + { + "epoch": 0.3861522769544609, + "grad_norm": 0.30711964879198966, + "learning_rate": 9.31896449604897e-06, + "loss": 0.2746, + "step": 4827 + }, + { + "epoch": 0.3862322753544929, + "grad_norm": 0.273812140557507, + "learning_rate": 9.318638092359188e-06, + "loss": 0.3034, + "step": 4828 + }, + { + "epoch": 0.3863122737545249, + "grad_norm": 0.3394264031895906, + "learning_rate": 9.318311616188592e-06, + "loss": 0.283, + "step": 4829 + }, + { + "epoch": 0.3863922721545569, + "grad_norm": 0.3170360725935444, + "learning_rate": 9.317985067542658e-06, + "loss": 0.2969, + "step": 4830 + }, + { + "epoch": 0.3864722705545889, + "grad_norm": 0.32317898874187156, + "learning_rate": 9.31765844642687e-06, + "loss": 0.2656, + "step": 4831 + }, + { + "epoch": 0.3865522689546209, + "grad_norm": 0.2732705025172585, + "learning_rate": 9.317331752846708e-06, + "loss": 0.2729, + "step": 4832 + }, + { + "epoch": 0.3866322673546529, + "grad_norm": 0.2583479740476746, + "learning_rate": 9.317004986807656e-06, + "loss": 0.3221, + "step": 4833 + }, + { + "epoch": 0.3867122657546849, + "grad_norm": 0.2901595131570063, + "learning_rate": 9.316678148315196e-06, + "loss": 0.2803, + "step": 4834 + }, + { + "epoch": 0.3867922641547169, + "grad_norm": 0.23921873893848905, + "learning_rate": 9.316351237374816e-06, + "loss": 0.3305, + "step": 4835 + }, + { + "epoch": 0.3868722625547489, + "grad_norm": 0.2446780521797944, + "learning_rate": 9.316024253992e-06, + "loss": 0.3134, + "step": 4836 + }, + { + "epoch": 0.3869522609547809, + "grad_norm": 0.2382191770944095, + "learning_rate": 9.315697198172238e-06, + "loss": 0.3633, + "step": 4837 + }, + { + "epoch": 0.3870322593548129, + "grad_norm": 0.30551837299635143, + "learning_rate": 9.31537006992102e-06, + "loss": 0.2596, + "step": 4838 + }, + { + "epoch": 0.3871122577548449, + "grad_norm": 0.23338363387067115, + "learning_rate": 9.315042869243833e-06, + "loss": 0.3137, + "step": 4839 + }, + { + "epoch": 0.3871922561548769, + "grad_norm": 0.28340900564158766, + "learning_rate": 9.314715596146171e-06, + "loss": 0.2958, + "step": 4840 + }, + { + "epoch": 0.3872722545549089, + "grad_norm": 0.27844845401447654, + "learning_rate": 9.314388250633526e-06, + "loss": 0.2799, + "step": 4841 + }, + { + "epoch": 0.3873522529549409, + "grad_norm": 0.3095038597494353, + "learning_rate": 9.31406083271139e-06, + "loss": 0.269, + "step": 4842 + }, + { + "epoch": 0.3874322513549729, + "grad_norm": 0.3067432222748822, + "learning_rate": 9.313733342385263e-06, + "loss": 0.2772, + "step": 4843 + }, + { + "epoch": 0.3875122497550049, + "grad_norm": 0.2895046446944875, + "learning_rate": 9.313405779660638e-06, + "loss": 0.305, + "step": 4844 + }, + { + "epoch": 0.3875922481550369, + "grad_norm": 0.23593236008752308, + "learning_rate": 9.313078144543012e-06, + "loss": 0.3258, + "step": 4845 + }, + { + "epoch": 0.3876722465550689, + "grad_norm": 0.3088175389983806, + "learning_rate": 9.312750437037886e-06, + "loss": 0.2446, + "step": 4846 + }, + { + "epoch": 0.3877522449551009, + "grad_norm": 0.3136528412794775, + "learning_rate": 9.312422657150755e-06, + "loss": 0.2455, + "step": 4847 + }, + { + "epoch": 0.3878322433551329, + "grad_norm": 0.29577318289844057, + "learning_rate": 9.312094804887126e-06, + "loss": 0.2634, + "step": 4848 + }, + { + "epoch": 0.3879122417551649, + "grad_norm": 0.5062111558308404, + "learning_rate": 9.3117668802525e-06, + "loss": 0.2534, + "step": 4849 + }, + { + "epoch": 0.3879922401551969, + "grad_norm": 0.26534665828161746, + "learning_rate": 9.31143888325238e-06, + "loss": 0.2759, + "step": 4850 + }, + { + "epoch": 0.3880722385552289, + "grad_norm": 0.3174845989977014, + "learning_rate": 9.31111081389227e-06, + "loss": 0.2428, + "step": 4851 + }, + { + "epoch": 0.3881522369552609, + "grad_norm": 0.31290690083661776, + "learning_rate": 9.310782672177678e-06, + "loss": 0.2485, + "step": 4852 + }, + { + "epoch": 0.3882322353552929, + "grad_norm": 0.29791880078532285, + "learning_rate": 9.31045445811411e-06, + "loss": 0.3058, + "step": 4853 + }, + { + "epoch": 0.3883122337553249, + "grad_norm": 0.2912453067191081, + "learning_rate": 9.310126171707075e-06, + "loss": 0.2789, + "step": 4854 + }, + { + "epoch": 0.3883922321553569, + "grad_norm": 0.2989256247091368, + "learning_rate": 9.30979781296208e-06, + "loss": 0.2468, + "step": 4855 + }, + { + "epoch": 0.3884722305553889, + "grad_norm": 0.3038506475186876, + "learning_rate": 9.309469381884641e-06, + "loss": 0.2804, + "step": 4856 + }, + { + "epoch": 0.3885522289554209, + "grad_norm": 0.26373862741738086, + "learning_rate": 9.309140878480267e-06, + "loss": 0.3167, + "step": 4857 + }, + { + "epoch": 0.3886322273554529, + "grad_norm": 0.27605479311799785, + "learning_rate": 9.30881230275447e-06, + "loss": 0.2884, + "step": 4858 + }, + { + "epoch": 0.38871222575548486, + "grad_norm": 0.2971406902988554, + "learning_rate": 9.308483654712769e-06, + "loss": 0.2933, + "step": 4859 + }, + { + "epoch": 0.3887922241555169, + "grad_norm": 0.33034469346797674, + "learning_rate": 9.308154934360676e-06, + "loss": 0.2479, + "step": 4860 + }, + { + "epoch": 0.3888722225555489, + "grad_norm": 0.2430513829801032, + "learning_rate": 9.30782614170371e-06, + "loss": 0.3139, + "step": 4861 + }, + { + "epoch": 0.3889522209555809, + "grad_norm": 0.2797729810835908, + "learning_rate": 9.307497276747389e-06, + "loss": 0.2909, + "step": 4862 + }, + { + "epoch": 0.3890322193556129, + "grad_norm": 0.2889856263949378, + "learning_rate": 9.30716833949723e-06, + "loss": 0.2775, + "step": 4863 + }, + { + "epoch": 0.38911221775564486, + "grad_norm": 0.2670152119338059, + "learning_rate": 9.306839329958754e-06, + "loss": 0.2767, + "step": 4864 + }, + { + "epoch": 0.3891922161556769, + "grad_norm": 0.2933177046303458, + "learning_rate": 9.306510248137488e-06, + "loss": 0.3028, + "step": 4865 + }, + { + "epoch": 0.3892722145557089, + "grad_norm": 0.2858990208010733, + "learning_rate": 9.30618109403895e-06, + "loss": 0.3164, + "step": 4866 + }, + { + "epoch": 0.3893522129557409, + "grad_norm": 0.28461954063000644, + "learning_rate": 9.305851867668665e-06, + "loss": 0.2854, + "step": 4867 + }, + { + "epoch": 0.3894322113557729, + "grad_norm": 0.2813236239302838, + "learning_rate": 9.30552256903216e-06, + "loss": 0.3074, + "step": 4868 + }, + { + "epoch": 0.38951220975580486, + "grad_norm": 0.27473870450145954, + "learning_rate": 9.305193198134962e-06, + "loss": 0.305, + "step": 4869 + }, + { + "epoch": 0.3895922081558369, + "grad_norm": 0.5526305731148495, + "learning_rate": 9.304863754982596e-06, + "loss": 0.2508, + "step": 4870 + }, + { + "epoch": 0.38967220655586887, + "grad_norm": 0.2737252159714737, + "learning_rate": 9.304534239580591e-06, + "loss": 0.2793, + "step": 4871 + }, + { + "epoch": 0.3897522049559009, + "grad_norm": 0.334680525627947, + "learning_rate": 9.304204651934484e-06, + "loss": 0.2774, + "step": 4872 + }, + { + "epoch": 0.3898322033559329, + "grad_norm": 0.2781667571653855, + "learning_rate": 9.303874992049797e-06, + "loss": 0.2975, + "step": 4873 + }, + { + "epoch": 0.38991220175596486, + "grad_norm": 0.27324606288748143, + "learning_rate": 9.303545259932072e-06, + "loss": 0.2866, + "step": 4874 + }, + { + "epoch": 0.3899922001559969, + "grad_norm": 0.5961633731840792, + "learning_rate": 9.303215455586835e-06, + "loss": 0.27, + "step": 4875 + }, + { + "epoch": 0.39007219855602887, + "grad_norm": 0.32070531476653424, + "learning_rate": 9.302885579019626e-06, + "loss": 0.2509, + "step": 4876 + }, + { + "epoch": 0.3901521969560609, + "grad_norm": 0.285952337217137, + "learning_rate": 9.30255563023598e-06, + "loss": 0.2634, + "step": 4877 + }, + { + "epoch": 0.3902321953560929, + "grad_norm": 0.2778755048420881, + "learning_rate": 9.302225609241436e-06, + "loss": 0.3037, + "step": 4878 + }, + { + "epoch": 0.39031219375612486, + "grad_norm": 0.3479077575717999, + "learning_rate": 9.301895516041531e-06, + "loss": 0.2645, + "step": 4879 + }, + { + "epoch": 0.3903921921561569, + "grad_norm": 0.3165529235609275, + "learning_rate": 9.301565350641806e-06, + "loss": 0.2523, + "step": 4880 + }, + { + "epoch": 0.39047219055618887, + "grad_norm": 0.2967099086265595, + "learning_rate": 9.301235113047801e-06, + "loss": 0.2866, + "step": 4881 + }, + { + "epoch": 0.3905521889562209, + "grad_norm": 0.2836419264167962, + "learning_rate": 9.300904803265061e-06, + "loss": 0.2824, + "step": 4882 + }, + { + "epoch": 0.3906321873562529, + "grad_norm": 0.3009136868594308, + "learning_rate": 9.300574421299127e-06, + "loss": 0.2729, + "step": 4883 + }, + { + "epoch": 0.39071218575628486, + "grad_norm": 0.24900676681618725, + "learning_rate": 9.300243967155545e-06, + "loss": 0.3212, + "step": 4884 + }, + { + "epoch": 0.3907921841563169, + "grad_norm": 0.28562486784397295, + "learning_rate": 9.299913440839859e-06, + "loss": 0.2687, + "step": 4885 + }, + { + "epoch": 0.39087218255634887, + "grad_norm": 0.23408404401543617, + "learning_rate": 9.299582842357619e-06, + "loss": 0.327, + "step": 4886 + }, + { + "epoch": 0.39095218095638085, + "grad_norm": 0.34196709134874403, + "learning_rate": 9.299252171714374e-06, + "loss": 0.2349, + "step": 4887 + }, + { + "epoch": 0.3910321793564129, + "grad_norm": 0.21750651196099097, + "learning_rate": 9.298921428915674e-06, + "loss": 0.3517, + "step": 4888 + }, + { + "epoch": 0.39111217775644486, + "grad_norm": 0.3290300799951341, + "learning_rate": 9.298590613967067e-06, + "loss": 0.2926, + "step": 4889 + }, + { + "epoch": 0.3911921761564769, + "grad_norm": 0.30926738760470074, + "learning_rate": 9.298259726874105e-06, + "loss": 0.2467, + "step": 4890 + }, + { + "epoch": 0.39127217455650887, + "grad_norm": 0.29235639767949917, + "learning_rate": 9.297928767642346e-06, + "loss": 0.2529, + "step": 4891 + }, + { + "epoch": 0.39135217295654084, + "grad_norm": 0.21947920904478438, + "learning_rate": 9.297597736277339e-06, + "loss": 0.3621, + "step": 4892 + }, + { + "epoch": 0.3914321713565729, + "grad_norm": 0.27020595724711527, + "learning_rate": 9.297266632784646e-06, + "loss": 0.2811, + "step": 4893 + }, + { + "epoch": 0.39151216975660486, + "grad_norm": 0.26218227191969007, + "learning_rate": 9.296935457169817e-06, + "loss": 0.3147, + "step": 4894 + }, + { + "epoch": 0.3915921681566369, + "grad_norm": 0.29558364252541164, + "learning_rate": 9.296604209438414e-06, + "loss": 0.2671, + "step": 4895 + }, + { + "epoch": 0.39167216655666887, + "grad_norm": 0.3353527865799121, + "learning_rate": 9.296272889595997e-06, + "loss": 0.2717, + "step": 4896 + }, + { + "epoch": 0.39175216495670084, + "grad_norm": 0.2907789445184273, + "learning_rate": 9.295941497648125e-06, + "loss": 0.2457, + "step": 4897 + }, + { + "epoch": 0.3918321633567329, + "grad_norm": 0.40840004195439866, + "learning_rate": 9.295610033600362e-06, + "loss": 0.2691, + "step": 4898 + }, + { + "epoch": 0.39191216175676485, + "grad_norm": 0.3000432329400431, + "learning_rate": 9.295278497458266e-06, + "loss": 0.2679, + "step": 4899 + }, + { + "epoch": 0.3919921601567969, + "grad_norm": 0.28720880591716935, + "learning_rate": 9.294946889227408e-06, + "loss": 0.2802, + "step": 4900 + }, + { + "epoch": 0.39207215855682886, + "grad_norm": 0.3257263473546301, + "learning_rate": 9.29461520891335e-06, + "loss": 0.286, + "step": 4901 + }, + { + "epoch": 0.39215215695686084, + "grad_norm": 0.3046700558668821, + "learning_rate": 9.294283456521658e-06, + "loss": 0.3002, + "step": 4902 + }, + { + "epoch": 0.3922321553568929, + "grad_norm": 0.27670311340069864, + "learning_rate": 9.2939516320579e-06, + "loss": 0.2859, + "step": 4903 + }, + { + "epoch": 0.39231215375692485, + "grad_norm": 0.2551949968873366, + "learning_rate": 9.293619735527649e-06, + "loss": 0.3101, + "step": 4904 + }, + { + "epoch": 0.3923921521569569, + "grad_norm": 0.2939482951074272, + "learning_rate": 9.293287766936469e-06, + "loss": 0.3045, + "step": 4905 + }, + { + "epoch": 0.39247215055698886, + "grad_norm": 0.28532471828073486, + "learning_rate": 9.292955726289935e-06, + "loss": 0.3059, + "step": 4906 + }, + { + "epoch": 0.39255214895702084, + "grad_norm": 0.2549460574195678, + "learning_rate": 9.29262361359362e-06, + "loss": 0.321, + "step": 4907 + }, + { + "epoch": 0.3926321473570529, + "grad_norm": 0.265134346748614, + "learning_rate": 9.292291428853097e-06, + "loss": 0.3168, + "step": 4908 + }, + { + "epoch": 0.39271214575708485, + "grad_norm": 0.30934495830095277, + "learning_rate": 9.291959172073943e-06, + "loss": 0.2781, + "step": 4909 + }, + { + "epoch": 0.3927921441571169, + "grad_norm": 0.25704991891225626, + "learning_rate": 9.291626843261732e-06, + "loss": 0.2828, + "step": 4910 + }, + { + "epoch": 0.39287214255714886, + "grad_norm": 0.26879678043745536, + "learning_rate": 9.291294442422043e-06, + "loss": 0.2995, + "step": 4911 + }, + { + "epoch": 0.39295214095718084, + "grad_norm": 0.26598922910372863, + "learning_rate": 9.290961969560452e-06, + "loss": 0.2839, + "step": 4912 + }, + { + "epoch": 0.3930321393572129, + "grad_norm": 0.29662800310511234, + "learning_rate": 9.290629424682543e-06, + "loss": 0.291, + "step": 4913 + }, + { + "epoch": 0.39311213775724485, + "grad_norm": 0.23815048215809567, + "learning_rate": 9.290296807793894e-06, + "loss": 0.3158, + "step": 4914 + }, + { + "epoch": 0.39319213615727683, + "grad_norm": 0.2637924687959926, + "learning_rate": 9.289964118900092e-06, + "loss": 0.3122, + "step": 4915 + }, + { + "epoch": 0.39327213455730886, + "grad_norm": 0.256407275662942, + "learning_rate": 9.289631358006715e-06, + "loss": 0.305, + "step": 4916 + }, + { + "epoch": 0.39335213295734084, + "grad_norm": 0.30605527614189065, + "learning_rate": 9.28929852511935e-06, + "loss": 0.2404, + "step": 4917 + }, + { + "epoch": 0.39343213135737287, + "grad_norm": 0.296514600494335, + "learning_rate": 9.288965620243584e-06, + "loss": 0.2787, + "step": 4918 + }, + { + "epoch": 0.39351212975740485, + "grad_norm": 0.3098202512502373, + "learning_rate": 9.288632643385002e-06, + "loss": 0.2627, + "step": 4919 + }, + { + "epoch": 0.3935921281574368, + "grad_norm": 0.3163530054582972, + "learning_rate": 9.288299594549195e-06, + "loss": 0.2331, + "step": 4920 + }, + { + "epoch": 0.39367212655746886, + "grad_norm": 0.29614248193823495, + "learning_rate": 9.287966473741752e-06, + "loss": 0.2664, + "step": 4921 + }, + { + "epoch": 0.39375212495750084, + "grad_norm": 0.2711493578891143, + "learning_rate": 9.287633280968263e-06, + "loss": 0.293, + "step": 4922 + }, + { + "epoch": 0.39383212335753287, + "grad_norm": 0.31219455842559224, + "learning_rate": 9.28730001623432e-06, + "loss": 0.277, + "step": 4923 + }, + { + "epoch": 0.39391212175756485, + "grad_norm": 0.28966369424049765, + "learning_rate": 9.286966679545516e-06, + "loss": 0.2966, + "step": 4924 + }, + { + "epoch": 0.3939921201575968, + "grad_norm": 0.31777465376102604, + "learning_rate": 9.286633270907448e-06, + "loss": 0.2592, + "step": 4925 + }, + { + "epoch": 0.39407211855762886, + "grad_norm": 0.28584211046025093, + "learning_rate": 9.286299790325708e-06, + "loss": 0.2637, + "step": 4926 + }, + { + "epoch": 0.39415211695766084, + "grad_norm": 0.20535500953986718, + "learning_rate": 9.285966237805895e-06, + "loss": 0.3447, + "step": 4927 + }, + { + "epoch": 0.39423211535769287, + "grad_norm": 0.28768616416881776, + "learning_rate": 9.285632613353607e-06, + "loss": 0.283, + "step": 4928 + }, + { + "epoch": 0.39431211375772485, + "grad_norm": 0.2952998277188881, + "learning_rate": 9.285298916974443e-06, + "loss": 0.2786, + "step": 4929 + }, + { + "epoch": 0.3943921121577568, + "grad_norm": 0.26674459512069204, + "learning_rate": 9.284965148674004e-06, + "loss": 0.2865, + "step": 4930 + }, + { + "epoch": 0.39447211055778886, + "grad_norm": 0.2713987160330422, + "learning_rate": 9.284631308457892e-06, + "loss": 0.2823, + "step": 4931 + }, + { + "epoch": 0.39455210895782084, + "grad_norm": 0.27189608905177337, + "learning_rate": 9.284297396331709e-06, + "loss": 0.3061, + "step": 4932 + }, + { + "epoch": 0.39463210735785287, + "grad_norm": 0.28139145402442345, + "learning_rate": 9.283963412301058e-06, + "loss": 0.284, + "step": 4933 + }, + { + "epoch": 0.39471210575788485, + "grad_norm": 0.200147738129542, + "learning_rate": 9.283629356371548e-06, + "loss": 0.3352, + "step": 4934 + }, + { + "epoch": 0.3947921041579168, + "grad_norm": 0.2637830719066077, + "learning_rate": 9.28329522854878e-06, + "loss": 0.2798, + "step": 4935 + }, + { + "epoch": 0.39487210255794886, + "grad_norm": 0.24386134132415022, + "learning_rate": 9.282961028838369e-06, + "loss": 0.3116, + "step": 4936 + }, + { + "epoch": 0.39495210095798083, + "grad_norm": 0.2916238769998534, + "learning_rate": 9.282626757245918e-06, + "loss": 0.2865, + "step": 4937 + }, + { + "epoch": 0.39503209935801287, + "grad_norm": 0.310428523305582, + "learning_rate": 9.282292413777039e-06, + "loss": 0.2448, + "step": 4938 + }, + { + "epoch": 0.39511209775804484, + "grad_norm": 0.4393450085560604, + "learning_rate": 9.281957998437345e-06, + "loss": 0.2845, + "step": 4939 + }, + { + "epoch": 0.3951920961580768, + "grad_norm": 0.2288654384369801, + "learning_rate": 9.281623511232445e-06, + "loss": 0.3203, + "step": 4940 + }, + { + "epoch": 0.39527209455810886, + "grad_norm": 0.24376670709846104, + "learning_rate": 9.281288952167957e-06, + "loss": 0.3249, + "step": 4941 + }, + { + "epoch": 0.39535209295814083, + "grad_norm": 0.2743079915780085, + "learning_rate": 9.280954321249492e-06, + "loss": 0.2703, + "step": 4942 + }, + { + "epoch": 0.3954320913581728, + "grad_norm": 0.6256807932792317, + "learning_rate": 9.28061961848267e-06, + "loss": 0.2629, + "step": 4943 + }, + { + "epoch": 0.39551208975820484, + "grad_norm": 0.27249929039297366, + "learning_rate": 9.280284843873104e-06, + "loss": 0.3014, + "step": 4944 + }, + { + "epoch": 0.3955920881582368, + "grad_norm": 0.3027042764144979, + "learning_rate": 9.279949997426417e-06, + "loss": 0.2877, + "step": 4945 + }, + { + "epoch": 0.39567208655826885, + "grad_norm": 0.32336793433404165, + "learning_rate": 9.279615079148227e-06, + "loss": 0.2904, + "step": 4946 + }, + { + "epoch": 0.39575208495830083, + "grad_norm": 0.34576818216412203, + "learning_rate": 9.279280089044155e-06, + "loss": 0.2423, + "step": 4947 + }, + { + "epoch": 0.3958320833583328, + "grad_norm": 0.30287458691336006, + "learning_rate": 9.278945027119821e-06, + "loss": 0.3037, + "step": 4948 + }, + { + "epoch": 0.39591208175836484, + "grad_norm": 0.24735420503803085, + "learning_rate": 9.278609893380855e-06, + "loss": 0.3226, + "step": 4949 + }, + { + "epoch": 0.3959920801583968, + "grad_norm": 0.2616705547544419, + "learning_rate": 9.278274687832874e-06, + "loss": 0.2844, + "step": 4950 + }, + { + "epoch": 0.39607207855842885, + "grad_norm": 0.2550626812227144, + "learning_rate": 9.277939410481507e-06, + "loss": 0.3267, + "step": 4951 + }, + { + "epoch": 0.39615207695846083, + "grad_norm": 0.24097297595164116, + "learning_rate": 9.277604061332382e-06, + "loss": 0.2639, + "step": 4952 + }, + { + "epoch": 0.3962320753584928, + "grad_norm": 0.30141052522388384, + "learning_rate": 9.277268640391126e-06, + "loss": 0.2595, + "step": 4953 + }, + { + "epoch": 0.39631207375852484, + "grad_norm": 0.3266268961564636, + "learning_rate": 9.27693314766337e-06, + "loss": 0.292, + "step": 4954 + }, + { + "epoch": 0.3963920721585568, + "grad_norm": 0.33201987292222385, + "learning_rate": 9.27659758315474e-06, + "loss": 0.2418, + "step": 4955 + }, + { + "epoch": 0.39647207055858885, + "grad_norm": 0.32265864992593174, + "learning_rate": 9.276261946870875e-06, + "loss": 0.256, + "step": 4956 + }, + { + "epoch": 0.39655206895862083, + "grad_norm": 0.3364486534565527, + "learning_rate": 9.275926238817404e-06, + "loss": 0.2974, + "step": 4957 + }, + { + "epoch": 0.3966320673586528, + "grad_norm": 0.31837491247763366, + "learning_rate": 9.275590458999962e-06, + "loss": 0.3333, + "step": 4958 + }, + { + "epoch": 0.39671206575868484, + "grad_norm": 0.2953471841244303, + "learning_rate": 9.275254607424182e-06, + "loss": 0.2355, + "step": 4959 + }, + { + "epoch": 0.3967920641587168, + "grad_norm": 0.27829245783281253, + "learning_rate": 9.274918684095703e-06, + "loss": 0.2755, + "step": 4960 + }, + { + "epoch": 0.39687206255874885, + "grad_norm": 0.20255866690888125, + "learning_rate": 9.274582689020164e-06, + "loss": 0.3361, + "step": 4961 + }, + { + "epoch": 0.39695206095878083, + "grad_norm": 0.3661372714960386, + "learning_rate": 9.274246622203204e-06, + "loss": 0.3228, + "step": 4962 + }, + { + "epoch": 0.3970320593588128, + "grad_norm": 0.3807652867315839, + "learning_rate": 9.273910483650461e-06, + "loss": 0.3066, + "step": 4963 + }, + { + "epoch": 0.39711205775884484, + "grad_norm": 0.26442186656373057, + "learning_rate": 9.273574273367577e-06, + "loss": 0.2921, + "step": 4964 + }, + { + "epoch": 0.3971920561588768, + "grad_norm": 0.3356222186116281, + "learning_rate": 9.273237991360196e-06, + "loss": 0.2812, + "step": 4965 + }, + { + "epoch": 0.39727205455890885, + "grad_norm": 0.2951746129500423, + "learning_rate": 9.272901637633961e-06, + "loss": 0.2677, + "step": 4966 + }, + { + "epoch": 0.3973520529589408, + "grad_norm": 0.26310663834970627, + "learning_rate": 9.272565212194517e-06, + "loss": 0.3, + "step": 4967 + }, + { + "epoch": 0.3974320513589728, + "grad_norm": 0.3389389095705509, + "learning_rate": 9.272228715047512e-06, + "loss": 0.2681, + "step": 4968 + }, + { + "epoch": 0.39751204975900484, + "grad_norm": 0.29524617299712363, + "learning_rate": 9.271892146198591e-06, + "loss": 0.2625, + "step": 4969 + }, + { + "epoch": 0.3975920481590368, + "grad_norm": 0.30988479736107793, + "learning_rate": 9.271555505653404e-06, + "loss": 0.2645, + "step": 4970 + }, + { + "epoch": 0.3976720465590688, + "grad_norm": 0.23920328035494015, + "learning_rate": 9.271218793417601e-06, + "loss": 0.3122, + "step": 4971 + }, + { + "epoch": 0.3977520449591008, + "grad_norm": 0.25015261401070144, + "learning_rate": 9.270882009496834e-06, + "loss": 0.3049, + "step": 4972 + }, + { + "epoch": 0.3978320433591328, + "grad_norm": 0.23649340434488053, + "learning_rate": 9.270545153896752e-06, + "loss": 0.3305, + "step": 4973 + }, + { + "epoch": 0.39791204175916484, + "grad_norm": 0.24473822786817326, + "learning_rate": 9.270208226623013e-06, + "loss": 0.3134, + "step": 4974 + }, + { + "epoch": 0.3979920401591968, + "grad_norm": 0.3004297306808186, + "learning_rate": 9.269871227681268e-06, + "loss": 0.2633, + "step": 4975 + }, + { + "epoch": 0.3980720385592288, + "grad_norm": 0.2675037887902771, + "learning_rate": 9.269534157077177e-06, + "loss": 0.2917, + "step": 4976 + }, + { + "epoch": 0.3981520369592608, + "grad_norm": 0.27351496789423035, + "learning_rate": 9.269197014816393e-06, + "loss": 0.2952, + "step": 4977 + }, + { + "epoch": 0.3982320353592928, + "grad_norm": 0.30134302204225966, + "learning_rate": 9.268859800904575e-06, + "loss": 0.3044, + "step": 4978 + }, + { + "epoch": 0.39831203375932484, + "grad_norm": 0.317782581541904, + "learning_rate": 9.268522515347384e-06, + "loss": 0.2931, + "step": 4979 + }, + { + "epoch": 0.3983920321593568, + "grad_norm": 0.3144302736878511, + "learning_rate": 9.268185158150482e-06, + "loss": 0.2664, + "step": 4980 + }, + { + "epoch": 0.3984720305593888, + "grad_norm": 0.29890159728713495, + "learning_rate": 9.267847729319528e-06, + "loss": 0.2964, + "step": 4981 + }, + { + "epoch": 0.3985520289594208, + "grad_norm": 0.30612932473175336, + "learning_rate": 9.267510228860187e-06, + "loss": 0.2842, + "step": 4982 + }, + { + "epoch": 0.3986320273594528, + "grad_norm": 0.29716721633879417, + "learning_rate": 9.267172656778122e-06, + "loss": 0.2945, + "step": 4983 + }, + { + "epoch": 0.39871202575948483, + "grad_norm": 0.2783425684967864, + "learning_rate": 9.266835013079e-06, + "loss": 0.3015, + "step": 4984 + }, + { + "epoch": 0.3987920241595168, + "grad_norm": 0.23434914632947976, + "learning_rate": 9.266497297768487e-06, + "loss": 0.3171, + "step": 4985 + }, + { + "epoch": 0.3988720225595488, + "grad_norm": 0.23066008964775217, + "learning_rate": 9.266159510852251e-06, + "loss": 0.3205, + "step": 4986 + }, + { + "epoch": 0.3989520209595808, + "grad_norm": 0.3191400251677262, + "learning_rate": 9.265821652335961e-06, + "loss": 0.287, + "step": 4987 + }, + { + "epoch": 0.3990320193596128, + "grad_norm": 0.28569400921712595, + "learning_rate": 9.265483722225288e-06, + "loss": 0.2945, + "step": 4988 + }, + { + "epoch": 0.39911201775964483, + "grad_norm": 0.266379394590889, + "learning_rate": 9.265145720525902e-06, + "loss": 0.3067, + "step": 4989 + }, + { + "epoch": 0.3991920161596768, + "grad_norm": 0.30091685367434845, + "learning_rate": 9.26480764724348e-06, + "loss": 0.2496, + "step": 4990 + }, + { + "epoch": 0.3992720145597088, + "grad_norm": 0.3096213478614501, + "learning_rate": 9.264469502383689e-06, + "loss": 0.2611, + "step": 4991 + }, + { + "epoch": 0.3993520129597408, + "grad_norm": 0.3684877319274733, + "learning_rate": 9.264131285952209e-06, + "loss": 0.2777, + "step": 4992 + }, + { + "epoch": 0.3994320113597728, + "grad_norm": 0.2993231439950321, + "learning_rate": 9.263792997954717e-06, + "loss": 0.2742, + "step": 4993 + }, + { + "epoch": 0.39951200975980483, + "grad_norm": 0.21364371777268776, + "learning_rate": 9.263454638396889e-06, + "loss": 0.3264, + "step": 4994 + }, + { + "epoch": 0.3995920081598368, + "grad_norm": 0.31960204440287454, + "learning_rate": 9.263116207284402e-06, + "loss": 0.2634, + "step": 4995 + }, + { + "epoch": 0.3996720065598688, + "grad_norm": 0.3389121352967545, + "learning_rate": 9.262777704622939e-06, + "loss": 0.3062, + "step": 4996 + }, + { + "epoch": 0.3997520049599008, + "grad_norm": 0.3109395643649128, + "learning_rate": 9.26243913041818e-06, + "loss": 0.2727, + "step": 4997 + }, + { + "epoch": 0.3998320033599328, + "grad_norm": 0.29331121253792536, + "learning_rate": 9.262100484675807e-06, + "loss": 0.2893, + "step": 4998 + }, + { + "epoch": 0.3999120017599648, + "grad_norm": 0.2691149018747211, + "learning_rate": 9.261761767401503e-06, + "loss": 0.2711, + "step": 4999 + }, + { + "epoch": 0.3999920001599968, + "grad_norm": 0.296576034418958, + "learning_rate": 9.261422978600955e-06, + "loss": 0.2854, + "step": 5000 + }, + { + "epoch": 0.4000719985600288, + "grad_norm": 0.3531510028917982, + "learning_rate": 9.261084118279846e-06, + "loss": 0.2652, + "step": 5001 + }, + { + "epoch": 0.4001519969600608, + "grad_norm": 0.3580791990637992, + "learning_rate": 9.260745186443867e-06, + "loss": 0.2969, + "step": 5002 + }, + { + "epoch": 0.4002319953600928, + "grad_norm": 0.2732021189957499, + "learning_rate": 9.260406183098704e-06, + "loss": 0.3137, + "step": 5003 + }, + { + "epoch": 0.4003119937601248, + "grad_norm": 0.3651616943857174, + "learning_rate": 9.260067108250046e-06, + "loss": 0.246, + "step": 5004 + }, + { + "epoch": 0.4003919921601568, + "grad_norm": 0.2875771658907377, + "learning_rate": 9.259727961903584e-06, + "loss": 0.315, + "step": 5005 + }, + { + "epoch": 0.4004719905601888, + "grad_norm": 0.3118289710743714, + "learning_rate": 9.259388744065012e-06, + "loss": 0.2967, + "step": 5006 + }, + { + "epoch": 0.4005519889602208, + "grad_norm": 0.35068534037207505, + "learning_rate": 9.25904945474002e-06, + "loss": 0.2638, + "step": 5007 + }, + { + "epoch": 0.4006319873602528, + "grad_norm": 0.2899637522019673, + "learning_rate": 9.258710093934306e-06, + "loss": 0.2682, + "step": 5008 + }, + { + "epoch": 0.4007119857602848, + "grad_norm": 0.2906777883736232, + "learning_rate": 9.258370661653563e-06, + "loss": 0.3059, + "step": 5009 + }, + { + "epoch": 0.4007919841603168, + "grad_norm": 0.3041562548172392, + "learning_rate": 9.258031157903489e-06, + "loss": 0.2908, + "step": 5010 + }, + { + "epoch": 0.4008719825603488, + "grad_norm": 0.2640874593079455, + "learning_rate": 9.25769158268978e-06, + "loss": 0.3103, + "step": 5011 + }, + { + "epoch": 0.4009519809603808, + "grad_norm": 0.314693387470461, + "learning_rate": 9.257351936018137e-06, + "loss": 0.2792, + "step": 5012 + }, + { + "epoch": 0.4010319793604128, + "grad_norm": 1.6442972826920612, + "learning_rate": 9.257012217894261e-06, + "loss": 0.3086, + "step": 5013 + }, + { + "epoch": 0.40111197776044477, + "grad_norm": 0.33033963481614415, + "learning_rate": 9.256672428323852e-06, + "loss": 0.3042, + "step": 5014 + }, + { + "epoch": 0.4011919761604768, + "grad_norm": 1.094982486110196, + "learning_rate": 9.256332567312614e-06, + "loss": 0.2829, + "step": 5015 + }, + { + "epoch": 0.4012719745605088, + "grad_norm": 0.2880930155399843, + "learning_rate": 9.255992634866248e-06, + "loss": 0.3016, + "step": 5016 + }, + { + "epoch": 0.4013519729605408, + "grad_norm": 0.29015800984498824, + "learning_rate": 9.255652630990464e-06, + "loss": 0.3137, + "step": 5017 + }, + { + "epoch": 0.4014319713605728, + "grad_norm": 0.26500750844840604, + "learning_rate": 9.255312555690965e-06, + "loss": 0.2694, + "step": 5018 + }, + { + "epoch": 0.40151196976060477, + "grad_norm": 0.28241871317948614, + "learning_rate": 9.25497240897346e-06, + "loss": 0.3122, + "step": 5019 + }, + { + "epoch": 0.4015919681606368, + "grad_norm": 0.2885469735871929, + "learning_rate": 9.254632190843657e-06, + "loss": 0.3167, + "step": 5020 + }, + { + "epoch": 0.4016719665606688, + "grad_norm": 0.3234172073724753, + "learning_rate": 9.254291901307267e-06, + "loss": 0.2997, + "step": 5021 + }, + { + "epoch": 0.4017519649607008, + "grad_norm": 0.40371240460558705, + "learning_rate": 9.253951540369999e-06, + "loss": 0.2537, + "step": 5022 + }, + { + "epoch": 0.4018319633607328, + "grad_norm": 0.32854774756768734, + "learning_rate": 9.253611108037566e-06, + "loss": 0.2606, + "step": 5023 + }, + { + "epoch": 0.40191196176076477, + "grad_norm": 0.43486019387190494, + "learning_rate": 9.253270604315685e-06, + "loss": 0.2665, + "step": 5024 + }, + { + "epoch": 0.4019919601607968, + "grad_norm": 0.27070070193384177, + "learning_rate": 9.252930029210066e-06, + "loss": 0.3188, + "step": 5025 + }, + { + "epoch": 0.4020719585608288, + "grad_norm": 0.309841357449696, + "learning_rate": 9.252589382726426e-06, + "loss": 0.2546, + "step": 5026 + }, + { + "epoch": 0.40215195696086076, + "grad_norm": 0.32501942556157376, + "learning_rate": 9.252248664870486e-06, + "loss": 0.2851, + "step": 5027 + }, + { + "epoch": 0.4022319553608928, + "grad_norm": 0.32224442920577284, + "learning_rate": 9.251907875647961e-06, + "loss": 0.3012, + "step": 5028 + }, + { + "epoch": 0.40231195376092477, + "grad_norm": 0.33129666928433, + "learning_rate": 9.25156701506457e-06, + "loss": 0.2975, + "step": 5029 + }, + { + "epoch": 0.4023919521609568, + "grad_norm": 0.2848291646035959, + "learning_rate": 9.251226083126035e-06, + "loss": 0.2818, + "step": 5030 + }, + { + "epoch": 0.4024719505609888, + "grad_norm": 0.23517434792560457, + "learning_rate": 9.250885079838079e-06, + "loss": 0.3389, + "step": 5031 + }, + { + "epoch": 0.40255194896102076, + "grad_norm": 0.47806638346222996, + "learning_rate": 9.250544005206421e-06, + "loss": 0.2754, + "step": 5032 + }, + { + "epoch": 0.4026319473610528, + "grad_norm": 0.37640281800543207, + "learning_rate": 9.250202859236792e-06, + "loss": 0.2892, + "step": 5033 + }, + { + "epoch": 0.40271194576108477, + "grad_norm": 0.2564643697842794, + "learning_rate": 9.249861641934911e-06, + "loss": 0.2878, + "step": 5034 + }, + { + "epoch": 0.4027919441611168, + "grad_norm": 0.3430377354177823, + "learning_rate": 9.249520353306509e-06, + "loss": 0.2731, + "step": 5035 + }, + { + "epoch": 0.4028719425611488, + "grad_norm": 0.2683295531141864, + "learning_rate": 9.249178993357312e-06, + "loss": 0.2882, + "step": 5036 + }, + { + "epoch": 0.40295194096118075, + "grad_norm": 0.2933632400625278, + "learning_rate": 9.248837562093049e-06, + "loss": 0.2759, + "step": 5037 + }, + { + "epoch": 0.4030319393612128, + "grad_norm": 0.2622192298906411, + "learning_rate": 9.248496059519452e-06, + "loss": 0.3238, + "step": 5038 + }, + { + "epoch": 0.40311193776124477, + "grad_norm": 0.3428165002251814, + "learning_rate": 9.24815448564225e-06, + "loss": 0.2624, + "step": 5039 + }, + { + "epoch": 0.4031919361612768, + "grad_norm": 0.30873633820468505, + "learning_rate": 9.247812840467179e-06, + "loss": 0.2877, + "step": 5040 + }, + { + "epoch": 0.4032719345613088, + "grad_norm": 0.25738574384172697, + "learning_rate": 9.24747112399997e-06, + "loss": 0.29, + "step": 5041 + }, + { + "epoch": 0.40335193296134075, + "grad_norm": 0.25150575581941115, + "learning_rate": 9.24712933624636e-06, + "loss": 0.3067, + "step": 5042 + }, + { + "epoch": 0.4034319313613728, + "grad_norm": 0.3288895564004573, + "learning_rate": 9.246787477212085e-06, + "loss": 0.2632, + "step": 5043 + }, + { + "epoch": 0.40351192976140476, + "grad_norm": 0.3032031788991777, + "learning_rate": 9.24644554690288e-06, + "loss": 0.2519, + "step": 5044 + }, + { + "epoch": 0.4035919281614368, + "grad_norm": 0.2893762348697433, + "learning_rate": 9.246103545324488e-06, + "loss": 0.2927, + "step": 5045 + }, + { + "epoch": 0.4036719265614688, + "grad_norm": 0.3343603013673226, + "learning_rate": 9.245761472482646e-06, + "loss": 0.2618, + "step": 5046 + }, + { + "epoch": 0.40375192496150075, + "grad_norm": 0.3021940748433022, + "learning_rate": 9.245419328383095e-06, + "loss": 0.2753, + "step": 5047 + }, + { + "epoch": 0.4038319233615328, + "grad_norm": 0.3263627013624696, + "learning_rate": 9.245077113031577e-06, + "loss": 0.2546, + "step": 5048 + }, + { + "epoch": 0.40391192176156476, + "grad_norm": 0.2677818052763272, + "learning_rate": 9.244734826433839e-06, + "loss": 0.284, + "step": 5049 + }, + { + "epoch": 0.40399192016159674, + "grad_norm": 0.24340605910982918, + "learning_rate": 9.244392468595622e-06, + "loss": 0.3349, + "step": 5050 + }, + { + "epoch": 0.4040719185616288, + "grad_norm": 0.33123884193897074, + "learning_rate": 9.244050039522673e-06, + "loss": 0.2569, + "step": 5051 + }, + { + "epoch": 0.40415191696166075, + "grad_norm": 0.3058730002708665, + "learning_rate": 9.243707539220739e-06, + "loss": 0.3145, + "step": 5052 + }, + { + "epoch": 0.4042319153616928, + "grad_norm": 0.290328994784023, + "learning_rate": 9.243364967695569e-06, + "loss": 0.2809, + "step": 5053 + }, + { + "epoch": 0.40431191376172476, + "grad_norm": 0.23150043859870223, + "learning_rate": 9.243022324952912e-06, + "loss": 0.3066, + "step": 5054 + }, + { + "epoch": 0.40439191216175674, + "grad_norm": 0.3139180638078602, + "learning_rate": 9.242679610998519e-06, + "loss": 0.265, + "step": 5055 + }, + { + "epoch": 0.40447191056178877, + "grad_norm": 0.27440633355491206, + "learning_rate": 9.242336825838141e-06, + "loss": 0.2771, + "step": 5056 + }, + { + "epoch": 0.40455190896182075, + "grad_norm": 0.284952591423798, + "learning_rate": 9.241993969477531e-06, + "loss": 0.2954, + "step": 5057 + }, + { + "epoch": 0.4046319073618528, + "grad_norm": 0.2229839786110436, + "learning_rate": 9.241651041922443e-06, + "loss": 0.3165, + "step": 5058 + }, + { + "epoch": 0.40471190576188476, + "grad_norm": 0.19684151283244064, + "learning_rate": 9.241308043178635e-06, + "loss": 0.3547, + "step": 5059 + }, + { + "epoch": 0.40479190416191674, + "grad_norm": 0.2885099048285632, + "learning_rate": 9.240964973251861e-06, + "loss": 0.2911, + "step": 5060 + }, + { + "epoch": 0.40487190256194877, + "grad_norm": 0.2642295130386296, + "learning_rate": 9.24062183214788e-06, + "loss": 0.3111, + "step": 5061 + }, + { + "epoch": 0.40495190096198075, + "grad_norm": 0.2556353383347269, + "learning_rate": 9.24027861987245e-06, + "loss": 0.2887, + "step": 5062 + }, + { + "epoch": 0.4050318993620128, + "grad_norm": 0.2587158139104808, + "learning_rate": 9.239935336431331e-06, + "loss": 0.3196, + "step": 5063 + }, + { + "epoch": 0.40511189776204476, + "grad_norm": 0.3063833374611063, + "learning_rate": 9.239591981830286e-06, + "loss": 0.2663, + "step": 5064 + }, + { + "epoch": 0.40519189616207674, + "grad_norm": 0.30496393118043685, + "learning_rate": 9.23924855607508e-06, + "loss": 0.263, + "step": 5065 + }, + { + "epoch": 0.40527189456210877, + "grad_norm": 0.2937037553971845, + "learning_rate": 9.23890505917147e-06, + "loss": 0.2516, + "step": 5066 + }, + { + "epoch": 0.40535189296214075, + "grad_norm": 0.2887407581849976, + "learning_rate": 9.238561491125225e-06, + "loss": 0.2606, + "step": 5067 + }, + { + "epoch": 0.4054318913621728, + "grad_norm": 0.2926303705389336, + "learning_rate": 9.238217851942112e-06, + "loss": 0.291, + "step": 5068 + }, + { + "epoch": 0.40551188976220476, + "grad_norm": 0.3281407837920735, + "learning_rate": 9.237874141627896e-06, + "loss": 0.2839, + "step": 5069 + }, + { + "epoch": 0.40559188816223674, + "grad_norm": 0.33451802883583126, + "learning_rate": 9.23753036018835e-06, + "loss": 0.2615, + "step": 5070 + }, + { + "epoch": 0.40567188656226877, + "grad_norm": 0.21113763120602924, + "learning_rate": 9.237186507629236e-06, + "loss": 0.3359, + "step": 5071 + }, + { + "epoch": 0.40575188496230075, + "grad_norm": 0.27247347659026977, + "learning_rate": 9.236842583956334e-06, + "loss": 0.2968, + "step": 5072 + }, + { + "epoch": 0.4058318833623328, + "grad_norm": 0.8649339486613512, + "learning_rate": 9.236498589175408e-06, + "loss": 0.2563, + "step": 5073 + }, + { + "epoch": 0.40591188176236476, + "grad_norm": 0.2661454606332336, + "learning_rate": 9.236154523292237e-06, + "loss": 0.2756, + "step": 5074 + }, + { + "epoch": 0.40599188016239673, + "grad_norm": 0.26571725251363654, + "learning_rate": 9.235810386312594e-06, + "loss": 0.2963, + "step": 5075 + }, + { + "epoch": 0.40607187856242877, + "grad_norm": 0.33408613662105385, + "learning_rate": 9.235466178242255e-06, + "loss": 0.2958, + "step": 5076 + }, + { + "epoch": 0.40615187696246075, + "grad_norm": 0.3285741726513298, + "learning_rate": 9.235121899086994e-06, + "loss": 0.2597, + "step": 5077 + }, + { + "epoch": 0.4062318753624927, + "grad_norm": 0.3873371718337856, + "learning_rate": 9.234777548852593e-06, + "loss": 0.2793, + "step": 5078 + }, + { + "epoch": 0.40631187376252476, + "grad_norm": 0.2887667478203784, + "learning_rate": 9.23443312754483e-06, + "loss": 0.2507, + "step": 5079 + }, + { + "epoch": 0.40639187216255673, + "grad_norm": 0.31670435599624613, + "learning_rate": 9.234088635169484e-06, + "loss": 0.2538, + "step": 5080 + }, + { + "epoch": 0.40647187056258877, + "grad_norm": 0.2279322113398891, + "learning_rate": 9.23374407173234e-06, + "loss": 0.3336, + "step": 5081 + }, + { + "epoch": 0.40655186896262074, + "grad_norm": 0.31415458187132445, + "learning_rate": 9.233399437239177e-06, + "loss": 0.2602, + "step": 5082 + }, + { + "epoch": 0.4066318673626527, + "grad_norm": 0.3254314407867844, + "learning_rate": 9.233054731695782e-06, + "loss": 0.2966, + "step": 5083 + }, + { + "epoch": 0.40671186576268475, + "grad_norm": 0.29610324548549055, + "learning_rate": 9.23270995510794e-06, + "loss": 0.2888, + "step": 5084 + }, + { + "epoch": 0.40679186416271673, + "grad_norm": 0.2343661445058714, + "learning_rate": 9.232365107481433e-06, + "loss": 0.3489, + "step": 5085 + }, + { + "epoch": 0.40687186256274877, + "grad_norm": 0.4390519830109507, + "learning_rate": 9.232020188822055e-06, + "loss": 0.2648, + "step": 5086 + }, + { + "epoch": 0.40695186096278074, + "grad_norm": 0.33079487875766916, + "learning_rate": 9.231675199135593e-06, + "loss": 0.253, + "step": 5087 + }, + { + "epoch": 0.4070318593628127, + "grad_norm": 0.2780373302361011, + "learning_rate": 9.231330138427836e-06, + "loss": 0.2926, + "step": 5088 + }, + { + "epoch": 0.40711185776284475, + "grad_norm": 0.2955049763325041, + "learning_rate": 9.230985006704575e-06, + "loss": 0.2747, + "step": 5089 + }, + { + "epoch": 0.40719185616287673, + "grad_norm": 0.26656259266039384, + "learning_rate": 9.230639803971603e-06, + "loss": 0.2873, + "step": 5090 + }, + { + "epoch": 0.40727185456290876, + "grad_norm": 0.6105416018094711, + "learning_rate": 9.230294530234714e-06, + "loss": 0.2524, + "step": 5091 + }, + { + "epoch": 0.40735185296294074, + "grad_norm": 0.29108484932788886, + "learning_rate": 9.2299491854997e-06, + "loss": 0.2949, + "step": 5092 + }, + { + "epoch": 0.4074318513629727, + "grad_norm": 0.2708798756674137, + "learning_rate": 9.229603769772364e-06, + "loss": 0.286, + "step": 5093 + }, + { + "epoch": 0.40751184976300475, + "grad_norm": 0.2752122282132114, + "learning_rate": 9.229258283058496e-06, + "loss": 0.2978, + "step": 5094 + }, + { + "epoch": 0.40759184816303673, + "grad_norm": 0.31232880287125725, + "learning_rate": 9.228912725363897e-06, + "loss": 0.2472, + "step": 5095 + }, + { + "epoch": 0.40767184656306876, + "grad_norm": 0.24846062515251233, + "learning_rate": 9.228567096694366e-06, + "loss": 0.3284, + "step": 5096 + }, + { + "epoch": 0.40775184496310074, + "grad_norm": 0.35248465135271806, + "learning_rate": 9.228221397055705e-06, + "loss": 0.2807, + "step": 5097 + }, + { + "epoch": 0.4078318433631327, + "grad_norm": 0.29705505187076364, + "learning_rate": 9.227875626453718e-06, + "loss": 0.315, + "step": 5098 + }, + { + "epoch": 0.40791184176316475, + "grad_norm": 0.29119153022585825, + "learning_rate": 9.2275297848942e-06, + "loss": 0.2888, + "step": 5099 + }, + { + "epoch": 0.40799184016319673, + "grad_norm": 0.2845988395098363, + "learning_rate": 9.227183872382965e-06, + "loss": 0.3018, + "step": 5100 + }, + { + "epoch": 0.40807183856322876, + "grad_norm": 0.2169737723265164, + "learning_rate": 9.226837888925813e-06, + "loss": 0.3425, + "step": 5101 + }, + { + "epoch": 0.40815183696326074, + "grad_norm": 0.33806147505314693, + "learning_rate": 9.226491834528553e-06, + "loss": 0.3223, + "step": 5102 + }, + { + "epoch": 0.4082318353632927, + "grad_norm": 0.25515128511837964, + "learning_rate": 9.226145709196991e-06, + "loss": 0.2928, + "step": 5103 + }, + { + "epoch": 0.40831183376332475, + "grad_norm": 0.2943256295930612, + "learning_rate": 9.225799512936938e-06, + "loss": 0.2982, + "step": 5104 + }, + { + "epoch": 0.4083918321633567, + "grad_norm": 0.2754034913082835, + "learning_rate": 9.225453245754204e-06, + "loss": 0.2864, + "step": 5105 + }, + { + "epoch": 0.4084718305633887, + "grad_norm": 0.3080616989550143, + "learning_rate": 9.2251069076546e-06, + "loss": 0.3273, + "step": 5106 + }, + { + "epoch": 0.40855182896342074, + "grad_norm": 0.29406993101235734, + "learning_rate": 9.224760498643936e-06, + "loss": 0.2872, + "step": 5107 + }, + { + "epoch": 0.4086318273634527, + "grad_norm": 0.33215953460884384, + "learning_rate": 9.22441401872803e-06, + "loss": 0.2568, + "step": 5108 + }, + { + "epoch": 0.40871182576348475, + "grad_norm": 0.27812625207001357, + "learning_rate": 9.224067467912696e-06, + "loss": 0.2926, + "step": 5109 + }, + { + "epoch": 0.4087918241635167, + "grad_norm": 0.26621281573365096, + "learning_rate": 9.22372084620375e-06, + "loss": 0.3116, + "step": 5110 + }, + { + "epoch": 0.4088718225635487, + "grad_norm": 0.31073427995343855, + "learning_rate": 9.22337415360701e-06, + "loss": 0.3114, + "step": 5111 + }, + { + "epoch": 0.40895182096358074, + "grad_norm": 0.4029026120248433, + "learning_rate": 9.223027390128292e-06, + "loss": 0.2631, + "step": 5112 + }, + { + "epoch": 0.4090318193636127, + "grad_norm": 0.21258601598254037, + "learning_rate": 9.222680555773417e-06, + "loss": 0.3426, + "step": 5113 + }, + { + "epoch": 0.40911181776364475, + "grad_norm": 0.4391077492364632, + "learning_rate": 9.22233365054821e-06, + "loss": 0.2763, + "step": 5114 + }, + { + "epoch": 0.4091918161636767, + "grad_norm": 0.25896563266095185, + "learning_rate": 9.221986674458488e-06, + "loss": 0.2734, + "step": 5115 + }, + { + "epoch": 0.4092718145637087, + "grad_norm": 0.33086304884652123, + "learning_rate": 9.221639627510076e-06, + "loss": 0.2499, + "step": 5116 + }, + { + "epoch": 0.40935181296374074, + "grad_norm": 0.30054583459854994, + "learning_rate": 9.221292509708799e-06, + "loss": 0.2809, + "step": 5117 + }, + { + "epoch": 0.4094318113637727, + "grad_norm": 0.2657949215120971, + "learning_rate": 9.220945321060485e-06, + "loss": 0.3232, + "step": 5118 + }, + { + "epoch": 0.40951180976380475, + "grad_norm": 0.28552281075403557, + "learning_rate": 9.220598061570956e-06, + "loss": 0.2902, + "step": 5119 + }, + { + "epoch": 0.4095918081638367, + "grad_norm": 0.7497670639096392, + "learning_rate": 9.220250731246042e-06, + "loss": 0.2694, + "step": 5120 + }, + { + "epoch": 0.4096718065638687, + "grad_norm": 0.30911911357796185, + "learning_rate": 9.219903330091575e-06, + "loss": 0.2749, + "step": 5121 + }, + { + "epoch": 0.40975180496390073, + "grad_norm": 0.33332049506059624, + "learning_rate": 9.219555858113383e-06, + "loss": 0.2714, + "step": 5122 + }, + { + "epoch": 0.4098318033639327, + "grad_norm": 0.34156497444688066, + "learning_rate": 9.219208315317298e-06, + "loss": 0.2627, + "step": 5123 + }, + { + "epoch": 0.40991180176396474, + "grad_norm": 0.2944717076069869, + "learning_rate": 9.218860701709154e-06, + "loss": 0.2839, + "step": 5124 + }, + { + "epoch": 0.4099918001639967, + "grad_norm": 0.30347883791488056, + "learning_rate": 9.218513017294784e-06, + "loss": 0.2662, + "step": 5125 + }, + { + "epoch": 0.4100717985640287, + "grad_norm": 0.2653290454319142, + "learning_rate": 9.218165262080024e-06, + "loss": 0.3112, + "step": 5126 + }, + { + "epoch": 0.41015179696406073, + "grad_norm": 0.28810169013437004, + "learning_rate": 9.217817436070708e-06, + "loss": 0.2911, + "step": 5127 + }, + { + "epoch": 0.4102317953640927, + "grad_norm": 0.2892865111041984, + "learning_rate": 9.217469539272679e-06, + "loss": 0.2917, + "step": 5128 + }, + { + "epoch": 0.41031179376412474, + "grad_norm": 0.20711187665121045, + "learning_rate": 9.21712157169177e-06, + "loss": 0.3279, + "step": 5129 + }, + { + "epoch": 0.4103917921641567, + "grad_norm": 0.24396732886107703, + "learning_rate": 9.216773533333825e-06, + "loss": 0.3248, + "step": 5130 + }, + { + "epoch": 0.4104717905641887, + "grad_norm": 0.2063294020052452, + "learning_rate": 9.216425424204683e-06, + "loss": 0.34, + "step": 5131 + }, + { + "epoch": 0.41055178896422073, + "grad_norm": 0.20436774684527628, + "learning_rate": 9.216077244310187e-06, + "loss": 0.3527, + "step": 5132 + }, + { + "epoch": 0.4106317873642527, + "grad_norm": 0.18043212793630797, + "learning_rate": 9.21572899365618e-06, + "loss": 0.3446, + "step": 5133 + }, + { + "epoch": 0.4107117857642847, + "grad_norm": 0.3148796817690092, + "learning_rate": 9.21538067224851e-06, + "loss": 0.2733, + "step": 5134 + }, + { + "epoch": 0.4107917841643167, + "grad_norm": 0.2954961298562056, + "learning_rate": 9.21503228009302e-06, + "loss": 0.2591, + "step": 5135 + }, + { + "epoch": 0.4108717825643487, + "grad_norm": 0.26542330131988123, + "learning_rate": 9.214683817195558e-06, + "loss": 0.2963, + "step": 5136 + }, + { + "epoch": 0.41095178096438073, + "grad_norm": 0.3387772669648989, + "learning_rate": 9.21433528356197e-06, + "loss": 0.2619, + "step": 5137 + }, + { + "epoch": 0.4110317793644127, + "grad_norm": 0.2734606970327009, + "learning_rate": 9.213986679198107e-06, + "loss": 0.2471, + "step": 5138 + }, + { + "epoch": 0.4111117777644447, + "grad_norm": 0.3102462458135226, + "learning_rate": 9.213638004109824e-06, + "loss": 0.2798, + "step": 5139 + }, + { + "epoch": 0.4111917761644767, + "grad_norm": 0.28240935705461273, + "learning_rate": 9.213289258302967e-06, + "loss": 0.2926, + "step": 5140 + }, + { + "epoch": 0.4112717745645087, + "grad_norm": 0.33231753727160324, + "learning_rate": 9.212940441783392e-06, + "loss": 0.2744, + "step": 5141 + }, + { + "epoch": 0.41135177296454073, + "grad_norm": 0.35571882380393344, + "learning_rate": 9.21259155455695e-06, + "loss": 0.2997, + "step": 5142 + }, + { + "epoch": 0.4114317713645727, + "grad_norm": 0.27852116049278425, + "learning_rate": 9.212242596629504e-06, + "loss": 0.302, + "step": 5143 + }, + { + "epoch": 0.4115117697646047, + "grad_norm": 0.33236755714087285, + "learning_rate": 9.211893568006903e-06, + "loss": 0.2672, + "step": 5144 + }, + { + "epoch": 0.4115917681646367, + "grad_norm": 0.43552530304557246, + "learning_rate": 9.211544468695006e-06, + "loss": 0.2853, + "step": 5145 + }, + { + "epoch": 0.4116717665646687, + "grad_norm": 0.31638432904638675, + "learning_rate": 9.211195298699674e-06, + "loss": 0.2646, + "step": 5146 + }, + { + "epoch": 0.41175176496470073, + "grad_norm": 0.3095783844627131, + "learning_rate": 9.210846058026767e-06, + "loss": 0.2532, + "step": 5147 + }, + { + "epoch": 0.4118317633647327, + "grad_norm": 0.3221378378969911, + "learning_rate": 9.210496746682147e-06, + "loss": 0.2807, + "step": 5148 + }, + { + "epoch": 0.4119117617647647, + "grad_norm": 0.256840253566607, + "learning_rate": 9.210147364671677e-06, + "loss": 0.3304, + "step": 5149 + }, + { + "epoch": 0.4119917601647967, + "grad_norm": 0.2644823654447232, + "learning_rate": 9.209797912001218e-06, + "loss": 0.3179, + "step": 5150 + }, + { + "epoch": 0.4120717585648287, + "grad_norm": 0.3359436338492232, + "learning_rate": 9.209448388676636e-06, + "loss": 0.2454, + "step": 5151 + }, + { + "epoch": 0.4121517569648607, + "grad_norm": 0.2607042957976346, + "learning_rate": 9.209098794703796e-06, + "loss": 0.3082, + "step": 5152 + }, + { + "epoch": 0.4122317553648927, + "grad_norm": 0.2733418116361817, + "learning_rate": 9.20874913008857e-06, + "loss": 0.2807, + "step": 5153 + }, + { + "epoch": 0.4123117537649247, + "grad_norm": 0.2761646471051783, + "learning_rate": 9.208399394836823e-06, + "loss": 0.2936, + "step": 5154 + }, + { + "epoch": 0.4123917521649567, + "grad_norm": 0.30935343639940405, + "learning_rate": 9.208049588954424e-06, + "loss": 0.2609, + "step": 5155 + }, + { + "epoch": 0.4124717505649887, + "grad_norm": 0.2069829173232812, + "learning_rate": 9.207699712447246e-06, + "loss": 0.3509, + "step": 5156 + }, + { + "epoch": 0.4125517489650207, + "grad_norm": 0.31585244730559664, + "learning_rate": 9.20734976532116e-06, + "loss": 0.2531, + "step": 5157 + }, + { + "epoch": 0.4126317473650527, + "grad_norm": 0.3063345741017045, + "learning_rate": 9.206999747582039e-06, + "loss": 0.2828, + "step": 5158 + }, + { + "epoch": 0.4127117457650847, + "grad_norm": 0.24950146103579104, + "learning_rate": 9.206649659235756e-06, + "loss": 0.271, + "step": 5159 + }, + { + "epoch": 0.4127917441651167, + "grad_norm": 0.2294088381327079, + "learning_rate": 9.20629950028819e-06, + "loss": 0.3092, + "step": 5160 + }, + { + "epoch": 0.4128717425651487, + "grad_norm": 0.284899742187152, + "learning_rate": 9.205949270745217e-06, + "loss": 0.3051, + "step": 5161 + }, + { + "epoch": 0.41295174096518067, + "grad_norm": 0.30785774988989656, + "learning_rate": 9.205598970612713e-06, + "loss": 0.262, + "step": 5162 + }, + { + "epoch": 0.4130317393652127, + "grad_norm": 0.3136404434456607, + "learning_rate": 9.20524859989656e-06, + "loss": 0.2475, + "step": 5163 + }, + { + "epoch": 0.4131117377652447, + "grad_norm": 0.31898727827148876, + "learning_rate": 9.204898158602633e-06, + "loss": 0.2587, + "step": 5164 + }, + { + "epoch": 0.4131917361652767, + "grad_norm": 0.26542806851169926, + "learning_rate": 9.204547646736821e-06, + "loss": 0.2929, + "step": 5165 + }, + { + "epoch": 0.4132717345653087, + "grad_norm": 0.26897606795733425, + "learning_rate": 9.204197064305002e-06, + "loss": 0.3663, + "step": 5166 + }, + { + "epoch": 0.41335173296534067, + "grad_norm": 0.3234404814518937, + "learning_rate": 9.20384641131306e-06, + "loss": 0.2459, + "step": 5167 + }, + { + "epoch": 0.4134317313653727, + "grad_norm": 0.21860367568617967, + "learning_rate": 9.203495687766883e-06, + "loss": 0.3486, + "step": 5168 + }, + { + "epoch": 0.4135117297654047, + "grad_norm": 0.2954374144046898, + "learning_rate": 9.203144893672354e-06, + "loss": 0.3013, + "step": 5169 + }, + { + "epoch": 0.4135917281654367, + "grad_norm": 0.26815101832419375, + "learning_rate": 9.20279402903536e-06, + "loss": 0.2851, + "step": 5170 + }, + { + "epoch": 0.4136717265654687, + "grad_norm": 0.24869166232650186, + "learning_rate": 9.202443093861796e-06, + "loss": 0.3078, + "step": 5171 + }, + { + "epoch": 0.41375172496550067, + "grad_norm": 0.2943948549152174, + "learning_rate": 9.202092088157542e-06, + "loss": 0.2834, + "step": 5172 + }, + { + "epoch": 0.4138317233655327, + "grad_norm": 0.2797817038495954, + "learning_rate": 9.201741011928498e-06, + "loss": 0.2806, + "step": 5173 + }, + { + "epoch": 0.4139117217655647, + "grad_norm": 0.35045531128736945, + "learning_rate": 9.20138986518055e-06, + "loss": 0.2612, + "step": 5174 + }, + { + "epoch": 0.4139917201655967, + "grad_norm": 0.299978119866561, + "learning_rate": 9.201038647919595e-06, + "loss": 0.2766, + "step": 5175 + }, + { + "epoch": 0.4140717185656287, + "grad_norm": 0.2986304503766352, + "learning_rate": 9.200687360151527e-06, + "loss": 0.3059, + "step": 5176 + }, + { + "epoch": 0.41415171696566067, + "grad_norm": 0.24788436750353124, + "learning_rate": 9.20033600188224e-06, + "loss": 0.3213, + "step": 5177 + }, + { + "epoch": 0.4142317153656927, + "grad_norm": 0.4028020609627026, + "learning_rate": 9.199984573117633e-06, + "loss": 0.2887, + "step": 5178 + }, + { + "epoch": 0.4143117137657247, + "grad_norm": 0.29490270603628127, + "learning_rate": 9.199633073863603e-06, + "loss": 0.2588, + "step": 5179 + }, + { + "epoch": 0.4143917121657567, + "grad_norm": 0.27603424299719537, + "learning_rate": 9.19928150412605e-06, + "loss": 0.2891, + "step": 5180 + }, + { + "epoch": 0.4144717105657887, + "grad_norm": 0.3089826628831555, + "learning_rate": 9.198929863910874e-06, + "loss": 0.265, + "step": 5181 + }, + { + "epoch": 0.41455170896582066, + "grad_norm": 0.3364696696791171, + "learning_rate": 9.198578153223976e-06, + "loss": 0.2906, + "step": 5182 + }, + { + "epoch": 0.4146317073658527, + "grad_norm": 0.26033512707356476, + "learning_rate": 9.198226372071259e-06, + "loss": 0.3109, + "step": 5183 + }, + { + "epoch": 0.4147117057658847, + "grad_norm": 0.2975368377645226, + "learning_rate": 9.19787452045863e-06, + "loss": 0.265, + "step": 5184 + }, + { + "epoch": 0.4147917041659167, + "grad_norm": 0.25753871905202474, + "learning_rate": 9.19752259839199e-06, + "loss": 0.2723, + "step": 5185 + }, + { + "epoch": 0.4148717025659487, + "grad_norm": 0.31398608661861965, + "learning_rate": 9.197170605877248e-06, + "loss": 0.2772, + "step": 5186 + }, + { + "epoch": 0.41495170096598066, + "grad_norm": 0.2503903623610147, + "learning_rate": 9.19681854292031e-06, + "loss": 0.3191, + "step": 5187 + }, + { + "epoch": 0.4150316993660127, + "grad_norm": 0.2921910583877275, + "learning_rate": 9.196466409527085e-06, + "loss": 0.263, + "step": 5188 + }, + { + "epoch": 0.4151116977660447, + "grad_norm": 0.3084137953617749, + "learning_rate": 9.196114205703484e-06, + "loss": 0.2589, + "step": 5189 + }, + { + "epoch": 0.41519169616607665, + "grad_norm": 0.22663415563494682, + "learning_rate": 9.195761931455418e-06, + "loss": 0.3015, + "step": 5190 + }, + { + "epoch": 0.4152716945661087, + "grad_norm": 0.32978353164604135, + "learning_rate": 9.1954095867888e-06, + "loss": 0.2544, + "step": 5191 + }, + { + "epoch": 0.41535169296614066, + "grad_norm": 0.2281996158079269, + "learning_rate": 9.19505717170954e-06, + "loss": 0.313, + "step": 5192 + }, + { + "epoch": 0.4154316913661727, + "grad_norm": 0.24192166812926552, + "learning_rate": 9.194704686223557e-06, + "loss": 0.3194, + "step": 5193 + }, + { + "epoch": 0.4155116897662047, + "grad_norm": 0.30577745117581545, + "learning_rate": 9.194352130336764e-06, + "loss": 0.2513, + "step": 5194 + }, + { + "epoch": 0.41559168816623665, + "grad_norm": 0.2897562570127388, + "learning_rate": 9.19399950405508e-06, + "loss": 0.2408, + "step": 5195 + }, + { + "epoch": 0.4156716865662687, + "grad_norm": 0.41816273186182856, + "learning_rate": 9.193646807384421e-06, + "loss": 0.3211, + "step": 5196 + }, + { + "epoch": 0.41575168496630066, + "grad_norm": 0.2650843484348309, + "learning_rate": 9.193294040330709e-06, + "loss": 0.2729, + "step": 5197 + }, + { + "epoch": 0.4158316833663327, + "grad_norm": 0.2869447618554817, + "learning_rate": 9.192941202899861e-06, + "loss": 0.2894, + "step": 5198 + }, + { + "epoch": 0.41591168176636467, + "grad_norm": 0.29373937245755993, + "learning_rate": 9.192588295097801e-06, + "loss": 0.2413, + "step": 5199 + }, + { + "epoch": 0.41599168016639665, + "grad_norm": 0.272057923651325, + "learning_rate": 9.192235316930454e-06, + "loss": 0.2982, + "step": 5200 + }, + { + "epoch": 0.4160716785664287, + "grad_norm": 0.34316178138284675, + "learning_rate": 9.191882268403743e-06, + "loss": 0.2361, + "step": 5201 + }, + { + "epoch": 0.41615167696646066, + "grad_norm": 0.3331277987001649, + "learning_rate": 9.191529149523588e-06, + "loss": 0.2574, + "step": 5202 + }, + { + "epoch": 0.4162316753664927, + "grad_norm": 0.3111070685580852, + "learning_rate": 9.191175960295924e-06, + "loss": 0.2541, + "step": 5203 + }, + { + "epoch": 0.41631167376652467, + "grad_norm": 0.24262366178607908, + "learning_rate": 9.190822700726671e-06, + "loss": 0.334, + "step": 5204 + }, + { + "epoch": 0.41639167216655665, + "grad_norm": 0.29257639429490034, + "learning_rate": 9.190469370821764e-06, + "loss": 0.2706, + "step": 5205 + }, + { + "epoch": 0.4164716705665887, + "grad_norm": 0.21763815602611417, + "learning_rate": 9.190115970587129e-06, + "loss": 0.3132, + "step": 5206 + }, + { + "epoch": 0.41655166896662066, + "grad_norm": 0.24968064126960107, + "learning_rate": 9.189762500028698e-06, + "loss": 0.3311, + "step": 5207 + }, + { + "epoch": 0.4166316673666527, + "grad_norm": 0.23902617205940302, + "learning_rate": 9.189408959152405e-06, + "loss": 0.3173, + "step": 5208 + }, + { + "epoch": 0.41671166576668467, + "grad_norm": 0.30836418658037196, + "learning_rate": 9.189055347964182e-06, + "loss": 0.2681, + "step": 5209 + }, + { + "epoch": 0.41679166416671665, + "grad_norm": 0.2999580615531441, + "learning_rate": 9.188701666469965e-06, + "loss": 0.269, + "step": 5210 + }, + { + "epoch": 0.4168716625667487, + "grad_norm": 0.3029525933345695, + "learning_rate": 9.188347914675689e-06, + "loss": 0.2715, + "step": 5211 + }, + { + "epoch": 0.41695166096678066, + "grad_norm": 0.27874212707814683, + "learning_rate": 9.18799409258729e-06, + "loss": 0.2754, + "step": 5212 + }, + { + "epoch": 0.4170316593668127, + "grad_norm": 0.2600266357156443, + "learning_rate": 9.18764020021071e-06, + "loss": 0.2831, + "step": 5213 + }, + { + "epoch": 0.41711165776684467, + "grad_norm": 0.2965364834774281, + "learning_rate": 9.187286237551885e-06, + "loss": 0.2492, + "step": 5214 + }, + { + "epoch": 0.41719165616687665, + "grad_norm": 0.29956627287901055, + "learning_rate": 9.186932204616756e-06, + "loss": 0.2595, + "step": 5215 + }, + { + "epoch": 0.4172716545669087, + "grad_norm": 0.2668196454869672, + "learning_rate": 9.186578101411266e-06, + "loss": 0.2666, + "step": 5216 + }, + { + "epoch": 0.41735165296694066, + "grad_norm": 0.2438224529828869, + "learning_rate": 9.186223927941356e-06, + "loss": 0.3302, + "step": 5217 + }, + { + "epoch": 0.41743165136697263, + "grad_norm": 0.2938911067670263, + "learning_rate": 9.185869684212974e-06, + "loss": 0.2757, + "step": 5218 + }, + { + "epoch": 0.41751164976700467, + "grad_norm": 0.33162557376991736, + "learning_rate": 9.185515370232062e-06, + "loss": 0.2592, + "step": 5219 + }, + { + "epoch": 0.41759164816703664, + "grad_norm": 0.305928534297296, + "learning_rate": 9.185160986004566e-06, + "loss": 0.263, + "step": 5220 + }, + { + "epoch": 0.4176716465670687, + "grad_norm": 0.2769072028946352, + "learning_rate": 9.184806531536438e-06, + "loss": 0.2987, + "step": 5221 + }, + { + "epoch": 0.41775164496710065, + "grad_norm": 0.27513542857080053, + "learning_rate": 9.184452006833623e-06, + "loss": 0.2762, + "step": 5222 + }, + { + "epoch": 0.41783164336713263, + "grad_norm": 0.2890867584634828, + "learning_rate": 9.184097411902072e-06, + "loss": 0.2555, + "step": 5223 + }, + { + "epoch": 0.41791164176716467, + "grad_norm": 0.22557238944886635, + "learning_rate": 9.183742746747737e-06, + "loss": 0.3063, + "step": 5224 + }, + { + "epoch": 0.41799164016719664, + "grad_norm": 0.33861889666703715, + "learning_rate": 9.18338801137657e-06, + "loss": 0.2519, + "step": 5225 + }, + { + "epoch": 0.4180716385672287, + "grad_norm": 0.23919587310154336, + "learning_rate": 9.183033205794525e-06, + "loss": 0.338, + "step": 5226 + }, + { + "epoch": 0.41815163696726065, + "grad_norm": 0.312113249158113, + "learning_rate": 9.182678330007556e-06, + "loss": 0.2922, + "step": 5227 + }, + { + "epoch": 0.41823163536729263, + "grad_norm": 0.2325558703465777, + "learning_rate": 9.182323384021619e-06, + "loss": 0.3121, + "step": 5228 + }, + { + "epoch": 0.41831163376732466, + "grad_norm": 0.27912906922177216, + "learning_rate": 9.181968367842674e-06, + "loss": 0.3214, + "step": 5229 + }, + { + "epoch": 0.41839163216735664, + "grad_norm": 0.7267050551840867, + "learning_rate": 9.181613281476674e-06, + "loss": 0.2928, + "step": 5230 + }, + { + "epoch": 0.4184716305673887, + "grad_norm": 0.3927267879288216, + "learning_rate": 9.181258124929582e-06, + "loss": 0.2822, + "step": 5231 + }, + { + "epoch": 0.41855162896742065, + "grad_norm": 0.42614000468655916, + "learning_rate": 9.180902898207359e-06, + "loss": 0.2934, + "step": 5232 + }, + { + "epoch": 0.41863162736745263, + "grad_norm": 0.31114967860920073, + "learning_rate": 9.180547601315963e-06, + "loss": 0.3104, + "step": 5233 + }, + { + "epoch": 0.41871162576748466, + "grad_norm": 0.2809300293608719, + "learning_rate": 9.180192234261363e-06, + "loss": 0.3086, + "step": 5234 + }, + { + "epoch": 0.41879162416751664, + "grad_norm": 0.31476296330044196, + "learning_rate": 9.17983679704952e-06, + "loss": 0.25, + "step": 5235 + }, + { + "epoch": 0.4188716225675487, + "grad_norm": 0.3035885835425518, + "learning_rate": 9.179481289686398e-06, + "loss": 0.2995, + "step": 5236 + }, + { + "epoch": 0.41895162096758065, + "grad_norm": 0.29950245530443553, + "learning_rate": 9.179125712177965e-06, + "loss": 0.3142, + "step": 5237 + }, + { + "epoch": 0.41903161936761263, + "grad_norm": 0.22365127860565798, + "learning_rate": 9.178770064530191e-06, + "loss": 0.3101, + "step": 5238 + }, + { + "epoch": 0.41911161776764466, + "grad_norm": 0.26127833219000807, + "learning_rate": 9.17841434674904e-06, + "loss": 0.2779, + "step": 5239 + }, + { + "epoch": 0.41919161616767664, + "grad_norm": 0.28374757978865806, + "learning_rate": 9.178058558840488e-06, + "loss": 0.2744, + "step": 5240 + }, + { + "epoch": 0.4192716145677087, + "grad_norm": 0.2893333515800285, + "learning_rate": 9.1777027008105e-06, + "loss": 0.2449, + "step": 5241 + }, + { + "epoch": 0.41935161296774065, + "grad_norm": 0.2033571467457253, + "learning_rate": 9.177346772665054e-06, + "loss": 0.3432, + "step": 5242 + }, + { + "epoch": 0.4194316113677726, + "grad_norm": 0.2553674067238468, + "learning_rate": 9.17699077441012e-06, + "loss": 0.2848, + "step": 5243 + }, + { + "epoch": 0.41951160976780466, + "grad_norm": 0.27193427963304284, + "learning_rate": 9.176634706051676e-06, + "loss": 0.2701, + "step": 5244 + }, + { + "epoch": 0.41959160816783664, + "grad_norm": 0.3149712650959617, + "learning_rate": 9.176278567595696e-06, + "loss": 0.2528, + "step": 5245 + }, + { + "epoch": 0.4196716065678686, + "grad_norm": 0.32663286761371185, + "learning_rate": 9.175922359048156e-06, + "loss": 0.2697, + "step": 5246 + }, + { + "epoch": 0.41975160496790065, + "grad_norm": 0.3146877974573286, + "learning_rate": 9.175566080415036e-06, + "loss": 0.2679, + "step": 5247 + }, + { + "epoch": 0.4198316033679326, + "grad_norm": 0.3343882668757924, + "learning_rate": 9.175209731702313e-06, + "loss": 0.2706, + "step": 5248 + }, + { + "epoch": 0.41991160176796466, + "grad_norm": 0.2844214952317712, + "learning_rate": 9.174853312915972e-06, + "loss": 0.3069, + "step": 5249 + }, + { + "epoch": 0.41999160016799664, + "grad_norm": 0.26474572835037347, + "learning_rate": 9.174496824061992e-06, + "loss": 0.2961, + "step": 5250 + }, + { + "epoch": 0.4200715985680286, + "grad_norm": 0.2601711454875526, + "learning_rate": 9.174140265146356e-06, + "loss": 0.2641, + "step": 5251 + }, + { + "epoch": 0.42015159696806065, + "grad_norm": 0.3628405695202224, + "learning_rate": 9.173783636175051e-06, + "loss": 0.2821, + "step": 5252 + }, + { + "epoch": 0.4202315953680926, + "grad_norm": 0.25865390270665845, + "learning_rate": 9.173426937154058e-06, + "loss": 0.3224, + "step": 5253 + }, + { + "epoch": 0.42031159376812466, + "grad_norm": 0.31091930774921067, + "learning_rate": 9.173070168089367e-06, + "loss": 0.2612, + "step": 5254 + }, + { + "epoch": 0.42039159216815664, + "grad_norm": 0.32783997617721866, + "learning_rate": 9.172713328986965e-06, + "loss": 0.2669, + "step": 5255 + }, + { + "epoch": 0.4204715905681886, + "grad_norm": 0.27634542486630254, + "learning_rate": 9.172356419852841e-06, + "loss": 0.3086, + "step": 5256 + }, + { + "epoch": 0.42055158896822065, + "grad_norm": 0.24106327598852467, + "learning_rate": 9.171999440692982e-06, + "loss": 0.3347, + "step": 5257 + }, + { + "epoch": 0.4206315873682526, + "grad_norm": 0.3227879519005372, + "learning_rate": 9.171642391513384e-06, + "loss": 0.2704, + "step": 5258 + }, + { + "epoch": 0.42071158576828466, + "grad_norm": 2.6687118274508905, + "learning_rate": 9.17128527232004e-06, + "loss": 0.2641, + "step": 5259 + }, + { + "epoch": 0.42079158416831663, + "grad_norm": 0.26193567428660103, + "learning_rate": 9.170928083118937e-06, + "loss": 0.317, + "step": 5260 + }, + { + "epoch": 0.4208715825683486, + "grad_norm": 0.28812959166307556, + "learning_rate": 9.170570823916074e-06, + "loss": 0.2858, + "step": 5261 + }, + { + "epoch": 0.42095158096838065, + "grad_norm": 0.38820447629464544, + "learning_rate": 9.170213494717448e-06, + "loss": 0.2951, + "step": 5262 + }, + { + "epoch": 0.4210315793684126, + "grad_norm": 0.4206183538045783, + "learning_rate": 9.169856095529055e-06, + "loss": 0.2385, + "step": 5263 + }, + { + "epoch": 0.42111157776844466, + "grad_norm": 0.3300137230347034, + "learning_rate": 9.169498626356892e-06, + "loss": 0.3307, + "step": 5264 + }, + { + "epoch": 0.42119157616847663, + "grad_norm": 0.26450499737859773, + "learning_rate": 9.16914108720696e-06, + "loss": 0.3127, + "step": 5265 + }, + { + "epoch": 0.4212715745685086, + "grad_norm": 0.19051765385229086, + "learning_rate": 9.168783478085261e-06, + "loss": 0.3458, + "step": 5266 + }, + { + "epoch": 0.42135157296854064, + "grad_norm": 0.30551027963355376, + "learning_rate": 9.168425798997794e-06, + "loss": 0.2512, + "step": 5267 + }, + { + "epoch": 0.4214315713685726, + "grad_norm": 0.28042328376019976, + "learning_rate": 9.168068049950563e-06, + "loss": 0.304, + "step": 5268 + }, + { + "epoch": 0.42151156976860465, + "grad_norm": 0.2728669457583841, + "learning_rate": 9.167710230949573e-06, + "loss": 0.2779, + "step": 5269 + }, + { + "epoch": 0.42159156816863663, + "grad_norm": 0.3062326715161398, + "learning_rate": 9.16735234200083e-06, + "loss": 0.2662, + "step": 5270 + }, + { + "epoch": 0.4216715665686686, + "grad_norm": 0.2847921223795915, + "learning_rate": 9.166994383110338e-06, + "loss": 0.3047, + "step": 5271 + }, + { + "epoch": 0.42175156496870064, + "grad_norm": 0.21151185431073452, + "learning_rate": 9.166636354284107e-06, + "loss": 0.3362, + "step": 5272 + }, + { + "epoch": 0.4218315633687326, + "grad_norm": 0.2385011506609248, + "learning_rate": 9.166278255528143e-06, + "loss": 0.3246, + "step": 5273 + }, + { + "epoch": 0.4219115617687646, + "grad_norm": 0.2427294855982681, + "learning_rate": 9.165920086848461e-06, + "loss": 0.2966, + "step": 5274 + }, + { + "epoch": 0.42199156016879663, + "grad_norm": 0.29486245365104086, + "learning_rate": 9.165561848251066e-06, + "loss": 0.3163, + "step": 5275 + }, + { + "epoch": 0.4220715585688286, + "grad_norm": 0.27956780620980404, + "learning_rate": 9.165203539741976e-06, + "loss": 0.2749, + "step": 5276 + }, + { + "epoch": 0.42215155696886064, + "grad_norm": 0.3155036892323425, + "learning_rate": 9.164845161327203e-06, + "loss": 0.2482, + "step": 5277 + }, + { + "epoch": 0.4222315553688926, + "grad_norm": 0.31750105867283485, + "learning_rate": 9.164486713012759e-06, + "loss": 0.3014, + "step": 5278 + }, + { + "epoch": 0.4223115537689246, + "grad_norm": 0.32472418717527135, + "learning_rate": 9.164128194804663e-06, + "loss": 0.2489, + "step": 5279 + }, + { + "epoch": 0.42239155216895663, + "grad_norm": 0.2893463872434011, + "learning_rate": 9.16376960670893e-06, + "loss": 0.2916, + "step": 5280 + }, + { + "epoch": 0.4224715505689886, + "grad_norm": 0.32193818821457015, + "learning_rate": 9.16341094873158e-06, + "loss": 0.2604, + "step": 5281 + }, + { + "epoch": 0.42255154896902064, + "grad_norm": 0.29681431771585615, + "learning_rate": 9.163052220878633e-06, + "loss": 0.291, + "step": 5282 + }, + { + "epoch": 0.4226315473690526, + "grad_norm": 0.27848858788440195, + "learning_rate": 9.162693423156106e-06, + "loss": 0.3004, + "step": 5283 + }, + { + "epoch": 0.4227115457690846, + "grad_norm": 0.34259184711845087, + "learning_rate": 9.162334555570025e-06, + "loss": 0.3025, + "step": 5284 + }, + { + "epoch": 0.42279154416911663, + "grad_norm": 0.29961308736486125, + "learning_rate": 9.161975618126411e-06, + "loss": 0.2627, + "step": 5285 + }, + { + "epoch": 0.4228715425691486, + "grad_norm": 0.3346393116663815, + "learning_rate": 9.161616610831287e-06, + "loss": 0.2746, + "step": 5286 + }, + { + "epoch": 0.42295154096918064, + "grad_norm": 0.2691011270747603, + "learning_rate": 9.161257533690682e-06, + "loss": 0.3044, + "step": 5287 + }, + { + "epoch": 0.4230315393692126, + "grad_norm": 0.29726820511238716, + "learning_rate": 9.160898386710619e-06, + "loss": 0.2902, + "step": 5288 + }, + { + "epoch": 0.4231115377692446, + "grad_norm": 0.3399868764615308, + "learning_rate": 9.160539169897126e-06, + "loss": 0.2736, + "step": 5289 + }, + { + "epoch": 0.4231915361692766, + "grad_norm": 0.27436839261924617, + "learning_rate": 9.160179883256233e-06, + "loss": 0.3123, + "step": 5290 + }, + { + "epoch": 0.4232715345693086, + "grad_norm": 0.9101902372283796, + "learning_rate": 9.15982052679397e-06, + "loss": 0.2602, + "step": 5291 + }, + { + "epoch": 0.42335153296934064, + "grad_norm": 0.33587211563704966, + "learning_rate": 9.159461100516367e-06, + "loss": 0.2401, + "step": 5292 + }, + { + "epoch": 0.4234315313693726, + "grad_norm": 0.3159690901194471, + "learning_rate": 9.15910160442946e-06, + "loss": 0.278, + "step": 5293 + }, + { + "epoch": 0.4235115297694046, + "grad_norm": 0.34387493962237814, + "learning_rate": 9.158742038539275e-06, + "loss": 0.2708, + "step": 5294 + }, + { + "epoch": 0.4235915281694366, + "grad_norm": 0.31554018735901446, + "learning_rate": 9.158382402851854e-06, + "loss": 0.2576, + "step": 5295 + }, + { + "epoch": 0.4236715265694686, + "grad_norm": 0.3359679760212402, + "learning_rate": 9.15802269737323e-06, + "loss": 0.2702, + "step": 5296 + }, + { + "epoch": 0.42375152496950064, + "grad_norm": 0.2717092147292755, + "learning_rate": 9.15766292210944e-06, + "loss": 0.2787, + "step": 5297 + }, + { + "epoch": 0.4238315233695326, + "grad_norm": 0.3656748160507865, + "learning_rate": 9.157303077066523e-06, + "loss": 0.2895, + "step": 5298 + }, + { + "epoch": 0.4239115217695646, + "grad_norm": 0.3373646002280782, + "learning_rate": 9.156943162250516e-06, + "loss": 0.2729, + "step": 5299 + }, + { + "epoch": 0.4239915201695966, + "grad_norm": 0.3012919391379303, + "learning_rate": 9.156583177667464e-06, + "loss": 0.2723, + "step": 5300 + }, + { + "epoch": 0.4240715185696286, + "grad_norm": 0.25234126586826544, + "learning_rate": 9.156223123323405e-06, + "loss": 0.282, + "step": 5301 + }, + { + "epoch": 0.4241515169696606, + "grad_norm": 0.2577952385033093, + "learning_rate": 9.15586299922438e-06, + "loss": 0.3347, + "step": 5302 + }, + { + "epoch": 0.4242315153696926, + "grad_norm": 0.37470584741579593, + "learning_rate": 9.155502805376439e-06, + "loss": 0.2638, + "step": 5303 + }, + { + "epoch": 0.4243115137697246, + "grad_norm": 0.16892465842934803, + "learning_rate": 9.155142541785624e-06, + "loss": 0.3695, + "step": 5304 + }, + { + "epoch": 0.4243915121697566, + "grad_norm": 0.26403641191913835, + "learning_rate": 9.154782208457981e-06, + "loss": 0.3054, + "step": 5305 + }, + { + "epoch": 0.4244715105697886, + "grad_norm": 0.2468957529614833, + "learning_rate": 9.154421805399561e-06, + "loss": 0.3351, + "step": 5306 + }, + { + "epoch": 0.4245515089698206, + "grad_norm": 0.35693101654377846, + "learning_rate": 9.154061332616407e-06, + "loss": 0.2625, + "step": 5307 + }, + { + "epoch": 0.4246315073698526, + "grad_norm": 0.3050144741955356, + "learning_rate": 9.153700790114573e-06, + "loss": 0.2388, + "step": 5308 + }, + { + "epoch": 0.4247115057698846, + "grad_norm": 0.3101898650006513, + "learning_rate": 9.153340177900108e-06, + "loss": 0.3181, + "step": 5309 + }, + { + "epoch": 0.4247915041699166, + "grad_norm": 0.3111720657209628, + "learning_rate": 9.152979495979064e-06, + "loss": 0.2494, + "step": 5310 + }, + { + "epoch": 0.4248715025699486, + "grad_norm": 0.28244221197244207, + "learning_rate": 9.152618744357498e-06, + "loss": 0.2908, + "step": 5311 + }, + { + "epoch": 0.4249515009699806, + "grad_norm": 0.3051308415489462, + "learning_rate": 9.15225792304146e-06, + "loss": 0.2561, + "step": 5312 + }, + { + "epoch": 0.4250314993700126, + "grad_norm": 0.27985499237892836, + "learning_rate": 9.15189703203701e-06, + "loss": 0.2839, + "step": 5313 + }, + { + "epoch": 0.4251114977700446, + "grad_norm": 0.3093332654169806, + "learning_rate": 9.1515360713502e-06, + "loss": 0.2527, + "step": 5314 + }, + { + "epoch": 0.4251914961700766, + "grad_norm": 0.2025541211873972, + "learning_rate": 9.151175040987094e-06, + "loss": 0.3633, + "step": 5315 + }, + { + "epoch": 0.4252714945701086, + "grad_norm": 0.3040913820126117, + "learning_rate": 9.150813940953747e-06, + "loss": 0.3197, + "step": 5316 + }, + { + "epoch": 0.4253514929701406, + "grad_norm": 0.2702027936223868, + "learning_rate": 9.15045277125622e-06, + "loss": 0.3164, + "step": 5317 + }, + { + "epoch": 0.4254314913701726, + "grad_norm": 0.23403963520163276, + "learning_rate": 9.150091531900576e-06, + "loss": 0.3212, + "step": 5318 + }, + { + "epoch": 0.4255114897702046, + "grad_norm": 0.2527400693331135, + "learning_rate": 9.149730222892876e-06, + "loss": 0.2905, + "step": 5319 + }, + { + "epoch": 0.4255914881702366, + "grad_norm": 0.2875071502672263, + "learning_rate": 9.149368844239185e-06, + "loss": 0.2735, + "step": 5320 + }, + { + "epoch": 0.4256714865702686, + "grad_norm": 0.2655333240204377, + "learning_rate": 9.149007395945569e-06, + "loss": 0.2849, + "step": 5321 + }, + { + "epoch": 0.4257514849703006, + "grad_norm": 0.32093676377340474, + "learning_rate": 9.148645878018092e-06, + "loss": 0.2797, + "step": 5322 + }, + { + "epoch": 0.4258314833703326, + "grad_norm": 0.35001812965126505, + "learning_rate": 9.148284290462825e-06, + "loss": 0.2927, + "step": 5323 + }, + { + "epoch": 0.4259114817703646, + "grad_norm": 0.28671191560457954, + "learning_rate": 9.147922633285832e-06, + "loss": 0.2888, + "step": 5324 + }, + { + "epoch": 0.4259914801703966, + "grad_norm": 0.32575341804565866, + "learning_rate": 9.147560906493189e-06, + "loss": 0.2614, + "step": 5325 + }, + { + "epoch": 0.4260714785704286, + "grad_norm": 0.3110247774504736, + "learning_rate": 9.14719911009096e-06, + "loss": 0.2849, + "step": 5326 + }, + { + "epoch": 0.4261514769704606, + "grad_norm": 0.4078771336313896, + "learning_rate": 9.14683724408522e-06, + "loss": 0.2829, + "step": 5327 + }, + { + "epoch": 0.4262314753704926, + "grad_norm": 0.32315447542311293, + "learning_rate": 9.146475308482043e-06, + "loss": 0.2848, + "step": 5328 + }, + { + "epoch": 0.4263114737705246, + "grad_norm": 0.2836332579485243, + "learning_rate": 9.146113303287503e-06, + "loss": 0.298, + "step": 5329 + }, + { + "epoch": 0.42639147217055656, + "grad_norm": 0.32001439444728585, + "learning_rate": 9.145751228507677e-06, + "loss": 0.2417, + "step": 5330 + }, + { + "epoch": 0.4264714705705886, + "grad_norm": 0.23781742672835954, + "learning_rate": 9.14538908414864e-06, + "loss": 0.326, + "step": 5331 + }, + { + "epoch": 0.4265514689706206, + "grad_norm": 0.28253292809151725, + "learning_rate": 9.145026870216469e-06, + "loss": 0.2649, + "step": 5332 + }, + { + "epoch": 0.4266314673706526, + "grad_norm": 0.4590546957217334, + "learning_rate": 9.144664586717246e-06, + "loss": 0.2387, + "step": 5333 + }, + { + "epoch": 0.4267114657706846, + "grad_norm": 0.27281238004416963, + "learning_rate": 9.14430223365705e-06, + "loss": 0.3024, + "step": 5334 + }, + { + "epoch": 0.42679146417071656, + "grad_norm": 0.2657652222632267, + "learning_rate": 9.14393981104196e-06, + "loss": 0.2658, + "step": 5335 + }, + { + "epoch": 0.4268714625707486, + "grad_norm": 0.25597808752190165, + "learning_rate": 9.143577318878062e-06, + "loss": 0.2843, + "step": 5336 + }, + { + "epoch": 0.42695146097078057, + "grad_norm": 0.30115816032943915, + "learning_rate": 9.14321475717144e-06, + "loss": 0.2711, + "step": 5337 + }, + { + "epoch": 0.4270314593708126, + "grad_norm": 0.24670215973009077, + "learning_rate": 9.142852125928177e-06, + "loss": 0.3179, + "step": 5338 + }, + { + "epoch": 0.4271114577708446, + "grad_norm": 0.4722351211278366, + "learning_rate": 9.14248942515436e-06, + "loss": 0.2657, + "step": 5339 + }, + { + "epoch": 0.42719145617087656, + "grad_norm": 0.2721677351306439, + "learning_rate": 9.142126654856075e-06, + "loss": 0.3002, + "step": 5340 + }, + { + "epoch": 0.4272714545709086, + "grad_norm": 0.2859034552913096, + "learning_rate": 9.141763815039413e-06, + "loss": 0.2668, + "step": 5341 + }, + { + "epoch": 0.42735145297094057, + "grad_norm": 0.27799070924290403, + "learning_rate": 9.141400905710462e-06, + "loss": 0.2717, + "step": 5342 + }, + { + "epoch": 0.4274314513709726, + "grad_norm": 0.33913875873941696, + "learning_rate": 9.141037926875312e-06, + "loss": 0.2616, + "step": 5343 + }, + { + "epoch": 0.4275114497710046, + "grad_norm": 0.2305010779992306, + "learning_rate": 9.140674878540056e-06, + "loss": 0.3057, + "step": 5344 + }, + { + "epoch": 0.42759144817103656, + "grad_norm": 0.27543315966859977, + "learning_rate": 9.140311760710788e-06, + "loss": 0.271, + "step": 5345 + }, + { + "epoch": 0.4276714465710686, + "grad_norm": 0.33658752780313894, + "learning_rate": 9.139948573393602e-06, + "loss": 0.2534, + "step": 5346 + }, + { + "epoch": 0.42775144497110057, + "grad_norm": 0.30381141325256994, + "learning_rate": 9.139585316594592e-06, + "loss": 0.2638, + "step": 5347 + }, + { + "epoch": 0.4278314433711326, + "grad_norm": 0.3013122533527082, + "learning_rate": 9.139221990319856e-06, + "loss": 0.2819, + "step": 5348 + }, + { + "epoch": 0.4279114417711646, + "grad_norm": 0.25248263189537096, + "learning_rate": 9.13885859457549e-06, + "loss": 0.315, + "step": 5349 + }, + { + "epoch": 0.42799144017119656, + "grad_norm": 0.272330768502305, + "learning_rate": 9.138495129367595e-06, + "loss": 0.3022, + "step": 5350 + }, + { + "epoch": 0.4280714385712286, + "grad_norm": 0.35587849253476533, + "learning_rate": 9.13813159470227e-06, + "loss": 0.2586, + "step": 5351 + }, + { + "epoch": 0.42815143697126057, + "grad_norm": 0.32759523312285027, + "learning_rate": 9.137767990585618e-06, + "loss": 0.2603, + "step": 5352 + }, + { + "epoch": 0.4282314353712926, + "grad_norm": 0.21961314895895181, + "learning_rate": 9.137404317023738e-06, + "loss": 0.3434, + "step": 5353 + }, + { + "epoch": 0.4283114337713246, + "grad_norm": 0.36331792561609977, + "learning_rate": 9.13704057402274e-06, + "loss": 0.2847, + "step": 5354 + }, + { + "epoch": 0.42839143217135656, + "grad_norm": 0.48818216252520774, + "learning_rate": 9.13667676158872e-06, + "loss": 0.2597, + "step": 5355 + }, + { + "epoch": 0.4284714305713886, + "grad_norm": 0.2814787227297044, + "learning_rate": 9.136312879727791e-06, + "loss": 0.3299, + "step": 5356 + }, + { + "epoch": 0.42855142897142057, + "grad_norm": 0.2751027231612159, + "learning_rate": 9.135948928446057e-06, + "loss": 0.2842, + "step": 5357 + }, + { + "epoch": 0.42863142737145254, + "grad_norm": 0.430150543863991, + "learning_rate": 9.135584907749627e-06, + "loss": 0.2784, + "step": 5358 + }, + { + "epoch": 0.4287114257714846, + "grad_norm": 0.29316039071201766, + "learning_rate": 9.13522081764461e-06, + "loss": 0.2353, + "step": 5359 + }, + { + "epoch": 0.42879142417151656, + "grad_norm": 0.3212379296611071, + "learning_rate": 9.134856658137118e-06, + "loss": 0.2881, + "step": 5360 + }, + { + "epoch": 0.4288714225715486, + "grad_norm": 0.2476914429326272, + "learning_rate": 9.134492429233262e-06, + "loss": 0.3061, + "step": 5361 + }, + { + "epoch": 0.42895142097158057, + "grad_norm": 0.2934879083485212, + "learning_rate": 9.134128130939153e-06, + "loss": 0.2837, + "step": 5362 + }, + { + "epoch": 0.42903141937161254, + "grad_norm": 0.34030904430854375, + "learning_rate": 9.133763763260907e-06, + "loss": 0.257, + "step": 5363 + }, + { + "epoch": 0.4291114177716446, + "grad_norm": 0.24010735103412736, + "learning_rate": 9.13339932620464e-06, + "loss": 0.342, + "step": 5364 + }, + { + "epoch": 0.42919141617167655, + "grad_norm": 0.26883467146039147, + "learning_rate": 9.133034819776469e-06, + "loss": 0.2885, + "step": 5365 + }, + { + "epoch": 0.4292714145717086, + "grad_norm": 0.3028333320526651, + "learning_rate": 9.132670243982509e-06, + "loss": 0.2951, + "step": 5366 + }, + { + "epoch": 0.42935141297174056, + "grad_norm": 0.2815714700966718, + "learning_rate": 9.13230559882888e-06, + "loss": 0.2778, + "step": 5367 + }, + { + "epoch": 0.42943141137177254, + "grad_norm": 0.2873876960054808, + "learning_rate": 9.131940884321702e-06, + "loss": 0.2619, + "step": 5368 + }, + { + "epoch": 0.4295114097718046, + "grad_norm": 0.3695986160635828, + "learning_rate": 9.131576100467095e-06, + "loss": 0.2628, + "step": 5369 + }, + { + "epoch": 0.42959140817183655, + "grad_norm": 0.23247133757044536, + "learning_rate": 9.131211247271184e-06, + "loss": 0.3166, + "step": 5370 + }, + { + "epoch": 0.4296714065718686, + "grad_norm": 0.27384089889368796, + "learning_rate": 9.130846324740087e-06, + "loss": 0.2858, + "step": 5371 + }, + { + "epoch": 0.42975140497190056, + "grad_norm": 0.31825890381581784, + "learning_rate": 9.130481332879936e-06, + "loss": 0.2642, + "step": 5372 + }, + { + "epoch": 0.42983140337193254, + "grad_norm": 0.3048222134609976, + "learning_rate": 9.130116271696851e-06, + "loss": 0.2445, + "step": 5373 + }, + { + "epoch": 0.4299114017719646, + "grad_norm": 0.3424479237750182, + "learning_rate": 9.129751141196963e-06, + "loss": 0.2744, + "step": 5374 + }, + { + "epoch": 0.42999140017199655, + "grad_norm": 0.2655349312138144, + "learning_rate": 9.129385941386397e-06, + "loss": 0.3161, + "step": 5375 + }, + { + "epoch": 0.4300713985720286, + "grad_norm": 0.3455002075239909, + "learning_rate": 9.129020672271283e-06, + "loss": 0.2685, + "step": 5376 + }, + { + "epoch": 0.43015139697206056, + "grad_norm": 0.3726921797393575, + "learning_rate": 9.128655333857751e-06, + "loss": 0.2787, + "step": 5377 + }, + { + "epoch": 0.43023139537209254, + "grad_norm": 0.2507191399964659, + "learning_rate": 9.128289926151935e-06, + "loss": 0.273, + "step": 5378 + }, + { + "epoch": 0.4303113937721246, + "grad_norm": 0.2938005621743509, + "learning_rate": 9.127924449159966e-06, + "loss": 0.2512, + "step": 5379 + }, + { + "epoch": 0.43039139217215655, + "grad_norm": 0.3182788486296455, + "learning_rate": 9.127558902887976e-06, + "loss": 0.2828, + "step": 5380 + }, + { + "epoch": 0.4304713905721886, + "grad_norm": 0.2808318401685348, + "learning_rate": 9.127193287342103e-06, + "loss": 0.2803, + "step": 5381 + }, + { + "epoch": 0.43055138897222056, + "grad_norm": 0.3024134817342552, + "learning_rate": 9.126827602528482e-06, + "loss": 0.2559, + "step": 5382 + }, + { + "epoch": 0.43063138737225254, + "grad_norm": 0.3258480355031198, + "learning_rate": 9.12646184845325e-06, + "loss": 0.2546, + "step": 5383 + }, + { + "epoch": 0.43071138577228457, + "grad_norm": 0.25189855527432087, + "learning_rate": 9.126096025122548e-06, + "loss": 0.3065, + "step": 5384 + }, + { + "epoch": 0.43079138417231655, + "grad_norm": 0.23514439414586483, + "learning_rate": 9.125730132542511e-06, + "loss": 0.3104, + "step": 5385 + }, + { + "epoch": 0.4308713825723485, + "grad_norm": 0.2702422154633485, + "learning_rate": 9.125364170719284e-06, + "loss": 0.3144, + "step": 5386 + }, + { + "epoch": 0.43095138097238056, + "grad_norm": 0.266598087977465, + "learning_rate": 9.124998139659009e-06, + "loss": 0.2828, + "step": 5387 + }, + { + "epoch": 0.43103137937241254, + "grad_norm": 0.2953915500486957, + "learning_rate": 9.124632039367826e-06, + "loss": 0.2562, + "step": 5388 + }, + { + "epoch": 0.43111137777244457, + "grad_norm": 0.30378048699631005, + "learning_rate": 9.124265869851882e-06, + "loss": 0.2622, + "step": 5389 + }, + { + "epoch": 0.43119137617247655, + "grad_norm": 0.26777484068919577, + "learning_rate": 9.123899631117322e-06, + "loss": 0.3386, + "step": 5390 + }, + { + "epoch": 0.4312713745725085, + "grad_norm": 0.26831250017963326, + "learning_rate": 9.12353332317029e-06, + "loss": 0.2788, + "step": 5391 + }, + { + "epoch": 0.43135137297254056, + "grad_norm": 0.19623543668312327, + "learning_rate": 9.12316694601694e-06, + "loss": 0.3341, + "step": 5392 + }, + { + "epoch": 0.43143137137257254, + "grad_norm": 0.2612742401543717, + "learning_rate": 9.122800499663414e-06, + "loss": 0.2928, + "step": 5393 + }, + { + "epoch": 0.43151136977260457, + "grad_norm": 0.3020648450479614, + "learning_rate": 9.122433984115868e-06, + "loss": 0.2618, + "step": 5394 + }, + { + "epoch": 0.43159136817263655, + "grad_norm": 0.3680965398742099, + "learning_rate": 9.12206739938045e-06, + "loss": 0.2601, + "step": 5395 + }, + { + "epoch": 0.4316713665726685, + "grad_norm": 0.2768480409270497, + "learning_rate": 9.121700745463312e-06, + "loss": 0.2964, + "step": 5396 + }, + { + "epoch": 0.43175136497270056, + "grad_norm": 0.2963873015123927, + "learning_rate": 9.12133402237061e-06, + "loss": 0.2678, + "step": 5397 + }, + { + "epoch": 0.43183136337273254, + "grad_norm": 0.2726660713855438, + "learning_rate": 9.120967230108497e-06, + "loss": 0.2933, + "step": 5398 + }, + { + "epoch": 0.43191136177276457, + "grad_norm": 0.2312359857809251, + "learning_rate": 9.12060036868313e-06, + "loss": 0.3143, + "step": 5399 + }, + { + "epoch": 0.43199136017279655, + "grad_norm": 0.2673499587560985, + "learning_rate": 9.120233438100665e-06, + "loss": 0.2794, + "step": 5400 + }, + { + "epoch": 0.4320713585728285, + "grad_norm": 0.2712611159090724, + "learning_rate": 9.119866438367263e-06, + "loss": 0.2822, + "step": 5401 + }, + { + "epoch": 0.43215135697286056, + "grad_norm": 0.30234296272936134, + "learning_rate": 9.11949936948908e-06, + "loss": 0.2975, + "step": 5402 + }, + { + "epoch": 0.43223135537289253, + "grad_norm": 0.24034325263906536, + "learning_rate": 9.119132231472278e-06, + "loss": 0.3397, + "step": 5403 + }, + { + "epoch": 0.43231135377292457, + "grad_norm": 0.3027450115663048, + "learning_rate": 9.118765024323021e-06, + "loss": 0.2933, + "step": 5404 + }, + { + "epoch": 0.43239135217295654, + "grad_norm": 0.2549203469124061, + "learning_rate": 9.118397748047467e-06, + "loss": 0.3243, + "step": 5405 + }, + { + "epoch": 0.4324713505729885, + "grad_norm": 0.21570434977751268, + "learning_rate": 9.118030402651786e-06, + "loss": 0.3266, + "step": 5406 + }, + { + "epoch": 0.43255134897302056, + "grad_norm": 0.29268030067956136, + "learning_rate": 9.117662988142138e-06, + "loss": 0.2609, + "step": 5407 + }, + { + "epoch": 0.43263134737305253, + "grad_norm": 0.32298425445107, + "learning_rate": 9.117295504524692e-06, + "loss": 0.2513, + "step": 5408 + }, + { + "epoch": 0.43271134577308457, + "grad_norm": 0.25877497877995925, + "learning_rate": 9.116927951805615e-06, + "loss": 0.2925, + "step": 5409 + }, + { + "epoch": 0.43279134417311654, + "grad_norm": 0.3065392139581013, + "learning_rate": 9.116560329991077e-06, + "loss": 0.2436, + "step": 5410 + }, + { + "epoch": 0.4328713425731485, + "grad_norm": 0.24664053903373473, + "learning_rate": 9.116192639087245e-06, + "loss": 0.3231, + "step": 5411 + }, + { + "epoch": 0.43295134097318055, + "grad_norm": 0.32261343504936224, + "learning_rate": 9.115824879100294e-06, + "loss": 0.2695, + "step": 5412 + }, + { + "epoch": 0.43303133937321253, + "grad_norm": 0.2909319003310747, + "learning_rate": 9.115457050036393e-06, + "loss": 0.2798, + "step": 5413 + }, + { + "epoch": 0.4331113377732445, + "grad_norm": 0.3057968980457586, + "learning_rate": 9.115089151901715e-06, + "loss": 0.2854, + "step": 5414 + }, + { + "epoch": 0.43319133617327654, + "grad_norm": 0.2213846996831468, + "learning_rate": 9.11472118470244e-06, + "loss": 0.301, + "step": 5415 + }, + { + "epoch": 0.4332713345733085, + "grad_norm": 0.32188464289406854, + "learning_rate": 9.114353148444735e-06, + "loss": 0.2649, + "step": 5416 + }, + { + "epoch": 0.43335133297334055, + "grad_norm": 0.3066453535439091, + "learning_rate": 9.113985043134784e-06, + "loss": 0.3019, + "step": 5417 + }, + { + "epoch": 0.43343133137337253, + "grad_norm": 0.30687210335486925, + "learning_rate": 9.113616868778762e-06, + "loss": 0.2613, + "step": 5418 + }, + { + "epoch": 0.4335113297734045, + "grad_norm": 0.31677425915505125, + "learning_rate": 9.113248625382849e-06, + "loss": 0.2914, + "step": 5419 + }, + { + "epoch": 0.43359132817343654, + "grad_norm": 0.27827872711784385, + "learning_rate": 9.112880312953225e-06, + "loss": 0.2957, + "step": 5420 + }, + { + "epoch": 0.4336713265734685, + "grad_norm": 0.3176231716308831, + "learning_rate": 9.11251193149607e-06, + "loss": 0.2651, + "step": 5421 + }, + { + "epoch": 0.43375132497350055, + "grad_norm": 0.24210515622266685, + "learning_rate": 9.112143481017571e-06, + "loss": 0.3104, + "step": 5422 + }, + { + "epoch": 0.43383132337353253, + "grad_norm": 0.2753839470802295, + "learning_rate": 9.111774961523906e-06, + "loss": 0.2993, + "step": 5423 + }, + { + "epoch": 0.4339113217735645, + "grad_norm": 0.3248365130189086, + "learning_rate": 9.111406373021264e-06, + "loss": 0.2472, + "step": 5424 + }, + { + "epoch": 0.43399132017359654, + "grad_norm": 0.2847559163264054, + "learning_rate": 9.11103771551583e-06, + "loss": 0.2853, + "step": 5425 + }, + { + "epoch": 0.4340713185736285, + "grad_norm": 0.28817518942233566, + "learning_rate": 9.11066898901379e-06, + "loss": 0.2956, + "step": 5426 + }, + { + "epoch": 0.43415131697366055, + "grad_norm": 0.3169411756066201, + "learning_rate": 9.110300193521336e-06, + "loss": 0.3388, + "step": 5427 + }, + { + "epoch": 0.43423131537369253, + "grad_norm": 0.2911984102301359, + "learning_rate": 9.109931329044655e-06, + "loss": 0.294, + "step": 5428 + }, + { + "epoch": 0.4343113137737245, + "grad_norm": 0.307404375575855, + "learning_rate": 9.109562395589937e-06, + "loss": 0.2513, + "step": 5429 + }, + { + "epoch": 0.43439131217375654, + "grad_norm": 0.20205107830849464, + "learning_rate": 9.109193393163377e-06, + "loss": 0.3667, + "step": 5430 + }, + { + "epoch": 0.4344713105737885, + "grad_norm": 0.34867706845160945, + "learning_rate": 9.108824321771163e-06, + "loss": 0.2711, + "step": 5431 + }, + { + "epoch": 0.43455130897382055, + "grad_norm": 0.2751137364604493, + "learning_rate": 9.108455181419493e-06, + "loss": 0.2983, + "step": 5432 + }, + { + "epoch": 0.4346313073738525, + "grad_norm": 0.23793040387150152, + "learning_rate": 9.108085972114563e-06, + "loss": 0.3105, + "step": 5433 + }, + { + "epoch": 0.4347113057738845, + "grad_norm": 0.25447894869290316, + "learning_rate": 9.107716693862566e-06, + "loss": 0.2879, + "step": 5434 + }, + { + "epoch": 0.43479130417391654, + "grad_norm": 0.2894078075106074, + "learning_rate": 9.107347346669705e-06, + "loss": 0.2989, + "step": 5435 + }, + { + "epoch": 0.4348713025739485, + "grad_norm": 0.2477062605366648, + "learning_rate": 9.106977930542171e-06, + "loss": 0.3391, + "step": 5436 + }, + { + "epoch": 0.43495130097398055, + "grad_norm": 0.34301639426146047, + "learning_rate": 9.106608445486171e-06, + "loss": 0.3217, + "step": 5437 + }, + { + "epoch": 0.4350312993740125, + "grad_norm": 0.23364256104013012, + "learning_rate": 9.106238891507906e-06, + "loss": 0.3091, + "step": 5438 + }, + { + "epoch": 0.4351112977740445, + "grad_norm": 1.631504738519471, + "learning_rate": 9.105869268613574e-06, + "loss": 0.2439, + "step": 5439 + }, + { + "epoch": 0.43519129617407654, + "grad_norm": 0.2945974063359576, + "learning_rate": 9.10549957680938e-06, + "loss": 0.234, + "step": 5440 + }, + { + "epoch": 0.4352712945741085, + "grad_norm": 0.30557953524840054, + "learning_rate": 9.105129816101531e-06, + "loss": 0.2615, + "step": 5441 + }, + { + "epoch": 0.4353512929741405, + "grad_norm": 0.32975962712134294, + "learning_rate": 9.10475998649623e-06, + "loss": 0.2752, + "step": 5442 + }, + { + "epoch": 0.4354312913741725, + "grad_norm": 0.2920265303935656, + "learning_rate": 9.104390087999686e-06, + "loss": 0.3136, + "step": 5443 + }, + { + "epoch": 0.4355112897742045, + "grad_norm": 0.2961220762770864, + "learning_rate": 9.104020120618104e-06, + "loss": 0.2751, + "step": 5444 + }, + { + "epoch": 0.43559128817423654, + "grad_norm": 0.30111999838139236, + "learning_rate": 9.103650084357697e-06, + "loss": 0.2673, + "step": 5445 + }, + { + "epoch": 0.4356712865742685, + "grad_norm": 0.3337355756468021, + "learning_rate": 9.103279979224676e-06, + "loss": 0.2756, + "step": 5446 + }, + { + "epoch": 0.4357512849743005, + "grad_norm": 0.2960697191209259, + "learning_rate": 9.102909805225246e-06, + "loss": 0.2699, + "step": 5447 + }, + { + "epoch": 0.4358312833743325, + "grad_norm": 0.25538927646094045, + "learning_rate": 9.102539562365626e-06, + "loss": 0.3185, + "step": 5448 + }, + { + "epoch": 0.4359112817743645, + "grad_norm": 0.3067771826715026, + "learning_rate": 9.102169250652029e-06, + "loss": 0.2631, + "step": 5449 + }, + { + "epoch": 0.43599128017439653, + "grad_norm": 0.34250525373737395, + "learning_rate": 9.101798870090667e-06, + "loss": 0.2883, + "step": 5450 + }, + { + "epoch": 0.4360712785744285, + "grad_norm": 0.29995039635118365, + "learning_rate": 9.101428420687759e-06, + "loss": 0.2667, + "step": 5451 + }, + { + "epoch": 0.4361512769744605, + "grad_norm": 0.32816371536701494, + "learning_rate": 9.101057902449523e-06, + "loss": 0.2412, + "step": 5452 + }, + { + "epoch": 0.4362312753744925, + "grad_norm": 0.33348193996199105, + "learning_rate": 9.100687315382174e-06, + "loss": 0.2963, + "step": 5453 + }, + { + "epoch": 0.4363112737745245, + "grad_norm": 0.29124816195568953, + "learning_rate": 9.100316659491935e-06, + "loss": 0.3201, + "step": 5454 + }, + { + "epoch": 0.43639127217455653, + "grad_norm": 0.28815725984958035, + "learning_rate": 9.099945934785026e-06, + "loss": 0.3166, + "step": 5455 + }, + { + "epoch": 0.4364712705745885, + "grad_norm": 0.29327413954104725, + "learning_rate": 9.099575141267667e-06, + "loss": 0.2473, + "step": 5456 + }, + { + "epoch": 0.4365512689746205, + "grad_norm": 0.3364885479265867, + "learning_rate": 9.099204278946083e-06, + "loss": 0.248, + "step": 5457 + }, + { + "epoch": 0.4366312673746525, + "grad_norm": 0.29817036655446233, + "learning_rate": 9.098833347826497e-06, + "loss": 0.2566, + "step": 5458 + }, + { + "epoch": 0.4367112657746845, + "grad_norm": 0.3058383803043822, + "learning_rate": 9.098462347915136e-06, + "loss": 0.2764, + "step": 5459 + }, + { + "epoch": 0.43679126417471653, + "grad_norm": 0.22133487702698182, + "learning_rate": 9.098091279218227e-06, + "loss": 0.305, + "step": 5460 + }, + { + "epoch": 0.4368712625747485, + "grad_norm": 0.23328280040374874, + "learning_rate": 9.097720141741994e-06, + "loss": 0.3428, + "step": 5461 + }, + { + "epoch": 0.4369512609747805, + "grad_norm": 0.3785764928316921, + "learning_rate": 9.097348935492672e-06, + "loss": 0.2679, + "step": 5462 + }, + { + "epoch": 0.4370312593748125, + "grad_norm": 0.2542034275789129, + "learning_rate": 9.096977660476485e-06, + "loss": 0.3213, + "step": 5463 + }, + { + "epoch": 0.4371112577748445, + "grad_norm": 0.3080388803389258, + "learning_rate": 9.096606316699668e-06, + "loss": 0.2837, + "step": 5464 + }, + { + "epoch": 0.4371912561748765, + "grad_norm": 0.27497843638813996, + "learning_rate": 9.096234904168451e-06, + "loss": 0.251, + "step": 5465 + }, + { + "epoch": 0.4372712545749085, + "grad_norm": 0.27779179506706336, + "learning_rate": 9.09586342288907e-06, + "loss": 0.299, + "step": 5466 + }, + { + "epoch": 0.4373512529749405, + "grad_norm": 0.30373462882419044, + "learning_rate": 9.095491872867757e-06, + "loss": 0.2484, + "step": 5467 + }, + { + "epoch": 0.4374312513749725, + "grad_norm": 0.28118529768435446, + "learning_rate": 9.09512025411075e-06, + "loss": 0.2984, + "step": 5468 + }, + { + "epoch": 0.4375112497750045, + "grad_norm": 0.26226498576800605, + "learning_rate": 9.094748566624285e-06, + "loss": 0.2875, + "step": 5469 + }, + { + "epoch": 0.4375912481750365, + "grad_norm": 0.29822436571078753, + "learning_rate": 9.0943768104146e-06, + "loss": 0.2768, + "step": 5470 + }, + { + "epoch": 0.4376712465750685, + "grad_norm": 0.302418451220066, + "learning_rate": 9.094004985487935e-06, + "loss": 0.2549, + "step": 5471 + }, + { + "epoch": 0.4377512449751005, + "grad_norm": 0.27054080486850507, + "learning_rate": 9.09363309185053e-06, + "loss": 0.2766, + "step": 5472 + }, + { + "epoch": 0.4378312433751325, + "grad_norm": 0.2865973039695305, + "learning_rate": 9.093261129508625e-06, + "loss": 0.2994, + "step": 5473 + }, + { + "epoch": 0.4379112417751645, + "grad_norm": 0.2858088220550337, + "learning_rate": 9.092889098468467e-06, + "loss": 0.2739, + "step": 5474 + }, + { + "epoch": 0.4379912401751965, + "grad_norm": 0.33741181188868535, + "learning_rate": 9.092516998736296e-06, + "loss": 0.2599, + "step": 5475 + }, + { + "epoch": 0.4380712385752285, + "grad_norm": 0.3233741086896138, + "learning_rate": 9.092144830318357e-06, + "loss": 0.3152, + "step": 5476 + }, + { + "epoch": 0.4381512369752605, + "grad_norm": 0.27549994394537264, + "learning_rate": 9.0917725932209e-06, + "loss": 0.3089, + "step": 5477 + }, + { + "epoch": 0.4382312353752925, + "grad_norm": 0.232507032761216, + "learning_rate": 9.091400287450167e-06, + "loss": 0.3152, + "step": 5478 + }, + { + "epoch": 0.4383112337753245, + "grad_norm": 0.3335390863454883, + "learning_rate": 9.091027913012411e-06, + "loss": 0.2396, + "step": 5479 + }, + { + "epoch": 0.43839123217535647, + "grad_norm": 0.30545173646632695, + "learning_rate": 9.09065546991388e-06, + "loss": 0.2622, + "step": 5480 + }, + { + "epoch": 0.4384712305753885, + "grad_norm": 0.272215119987308, + "learning_rate": 9.090282958160823e-06, + "loss": 0.2861, + "step": 5481 + }, + { + "epoch": 0.4385512289754205, + "grad_norm": 0.2754642449551888, + "learning_rate": 9.089910377759494e-06, + "loss": 0.3163, + "step": 5482 + }, + { + "epoch": 0.4386312273754525, + "grad_norm": 0.29689513007836654, + "learning_rate": 9.089537728716147e-06, + "loss": 0.2845, + "step": 5483 + }, + { + "epoch": 0.4387112257754845, + "grad_norm": 0.3039318864007909, + "learning_rate": 9.089165011037036e-06, + "loss": 0.2566, + "step": 5484 + }, + { + "epoch": 0.43879122417551647, + "grad_norm": 0.29171957970357143, + "learning_rate": 9.088792224728413e-06, + "loss": 0.3131, + "step": 5485 + }, + { + "epoch": 0.4388712225755485, + "grad_norm": 0.22508671554069734, + "learning_rate": 9.088419369796539e-06, + "loss": 0.3577, + "step": 5486 + }, + { + "epoch": 0.4389512209755805, + "grad_norm": 0.2959611048981429, + "learning_rate": 9.08804644624767e-06, + "loss": 0.2882, + "step": 5487 + }, + { + "epoch": 0.4390312193756125, + "grad_norm": 0.38072388353906433, + "learning_rate": 9.087673454088063e-06, + "loss": 0.2682, + "step": 5488 + }, + { + "epoch": 0.4391112177756445, + "grad_norm": 0.31602366480939253, + "learning_rate": 9.08730039332398e-06, + "loss": 0.322, + "step": 5489 + }, + { + "epoch": 0.43919121617567647, + "grad_norm": 0.2642153685219187, + "learning_rate": 9.086927263961682e-06, + "loss": 0.298, + "step": 5490 + }, + { + "epoch": 0.4392712145757085, + "grad_norm": 0.2878770300066201, + "learning_rate": 9.08655406600743e-06, + "loss": 0.2881, + "step": 5491 + }, + { + "epoch": 0.4393512129757405, + "grad_norm": 0.3126396548096743, + "learning_rate": 9.086180799467492e-06, + "loss": 0.2482, + "step": 5492 + }, + { + "epoch": 0.43943121137577246, + "grad_norm": 0.26486259266923773, + "learning_rate": 9.085807464348127e-06, + "loss": 0.2987, + "step": 5493 + }, + { + "epoch": 0.4395112097758045, + "grad_norm": 0.2988306457756422, + "learning_rate": 9.085434060655603e-06, + "loss": 0.2977, + "step": 5494 + }, + { + "epoch": 0.43959120817583647, + "grad_norm": 0.3540172944750142, + "learning_rate": 9.085060588396188e-06, + "loss": 0.2429, + "step": 5495 + }, + { + "epoch": 0.4396712065758685, + "grad_norm": 0.270444004448149, + "learning_rate": 9.084687047576149e-06, + "loss": 0.2716, + "step": 5496 + }, + { + "epoch": 0.4397512049759005, + "grad_norm": 0.27306499461729083, + "learning_rate": 9.084313438201754e-06, + "loss": 0.2757, + "step": 5497 + }, + { + "epoch": 0.43983120337593246, + "grad_norm": 0.3179994182485001, + "learning_rate": 9.083939760279275e-06, + "loss": 0.2568, + "step": 5498 + }, + { + "epoch": 0.4399112017759645, + "grad_norm": 0.2970879780847911, + "learning_rate": 9.083566013814985e-06, + "loss": 0.2947, + "step": 5499 + }, + { + "epoch": 0.43999120017599647, + "grad_norm": 0.21804543829080744, + "learning_rate": 9.083192198815154e-06, + "loss": 0.35, + "step": 5500 + }, + { + "epoch": 0.4400711985760285, + "grad_norm": 0.34285352397650765, + "learning_rate": 9.082818315286054e-06, + "loss": 0.2604, + "step": 5501 + }, + { + "epoch": 0.4401511969760605, + "grad_norm": 0.2401506634097383, + "learning_rate": 9.082444363233967e-06, + "loss": 0.3068, + "step": 5502 + }, + { + "epoch": 0.44023119537609245, + "grad_norm": 0.2572650195998039, + "learning_rate": 9.082070342665163e-06, + "loss": 0.2935, + "step": 5503 + }, + { + "epoch": 0.4403111937761245, + "grad_norm": 0.252048732971219, + "learning_rate": 9.08169625358592e-06, + "loss": 0.2693, + "step": 5504 + }, + { + "epoch": 0.44039119217615647, + "grad_norm": 0.2591584221631981, + "learning_rate": 9.08132209600252e-06, + "loss": 0.3298, + "step": 5505 + }, + { + "epoch": 0.4404711905761885, + "grad_norm": 0.27080821908467057, + "learning_rate": 9.080947869921238e-06, + "loss": 0.2779, + "step": 5506 + }, + { + "epoch": 0.4405511889762205, + "grad_norm": 0.3074359965828897, + "learning_rate": 9.080573575348358e-06, + "loss": 0.2657, + "step": 5507 + }, + { + "epoch": 0.44063118737625245, + "grad_norm": 0.2925259111943576, + "learning_rate": 9.08019921229016e-06, + "loss": 0.2719, + "step": 5508 + }, + { + "epoch": 0.4407111857762845, + "grad_norm": 0.3623279402143395, + "learning_rate": 9.079824780752929e-06, + "loss": 0.271, + "step": 5509 + }, + { + "epoch": 0.44079118417631646, + "grad_norm": 0.30132961149451437, + "learning_rate": 9.079450280742948e-06, + "loss": 0.2569, + "step": 5510 + }, + { + "epoch": 0.4408711825763485, + "grad_norm": 0.30944446287213856, + "learning_rate": 9.079075712266501e-06, + "loss": 0.3091, + "step": 5511 + }, + { + "epoch": 0.4409511809763805, + "grad_norm": 0.31076879062229956, + "learning_rate": 9.078701075329875e-06, + "loss": 0.2661, + "step": 5512 + }, + { + "epoch": 0.44103117937641245, + "grad_norm": 0.30565880401873535, + "learning_rate": 9.078326369939361e-06, + "loss": 0.2874, + "step": 5513 + }, + { + "epoch": 0.4411111777764445, + "grad_norm": 0.2842776076681883, + "learning_rate": 9.077951596101244e-06, + "loss": 0.2993, + "step": 5514 + }, + { + "epoch": 0.44119117617647646, + "grad_norm": 0.4484252539311488, + "learning_rate": 9.077576753821815e-06, + "loss": 0.2534, + "step": 5515 + }, + { + "epoch": 0.4412711745765085, + "grad_norm": 0.3237263552254394, + "learning_rate": 9.077201843107366e-06, + "loss": 0.2406, + "step": 5516 + }, + { + "epoch": 0.4413511729765405, + "grad_norm": 0.29212045970859557, + "learning_rate": 9.076826863964188e-06, + "loss": 0.309, + "step": 5517 + }, + { + "epoch": 0.44143117137657245, + "grad_norm": 0.3090907303482042, + "learning_rate": 9.076451816398574e-06, + "loss": 0.2948, + "step": 5518 + }, + { + "epoch": 0.4415111697766045, + "grad_norm": 0.29409861796757963, + "learning_rate": 9.07607670041682e-06, + "loss": 0.2781, + "step": 5519 + }, + { + "epoch": 0.44159116817663646, + "grad_norm": 0.3320203282513383, + "learning_rate": 9.075701516025219e-06, + "loss": 0.2666, + "step": 5520 + }, + { + "epoch": 0.44167116657666844, + "grad_norm": 0.3023415606817555, + "learning_rate": 9.075326263230073e-06, + "loss": 0.2676, + "step": 5521 + }, + { + "epoch": 0.44175116497670047, + "grad_norm": 0.3036445613512114, + "learning_rate": 9.074950942037674e-06, + "loss": 0.2752, + "step": 5522 + }, + { + "epoch": 0.44183116337673245, + "grad_norm": 0.48889357604376044, + "learning_rate": 9.074575552454325e-06, + "loss": 0.2835, + "step": 5523 + }, + { + "epoch": 0.4419111617767645, + "grad_norm": 0.3114596395402801, + "learning_rate": 9.074200094486327e-06, + "loss": 0.2445, + "step": 5524 + }, + { + "epoch": 0.44199116017679646, + "grad_norm": 0.2845483961412628, + "learning_rate": 9.073824568139979e-06, + "loss": 0.2735, + "step": 5525 + }, + { + "epoch": 0.44207115857682844, + "grad_norm": 0.3256403819121603, + "learning_rate": 9.073448973421581e-06, + "loss": 0.2889, + "step": 5526 + }, + { + "epoch": 0.44215115697686047, + "grad_norm": 0.3291355906724503, + "learning_rate": 9.073073310337443e-06, + "loss": 0.2575, + "step": 5527 + }, + { + "epoch": 0.44223115537689245, + "grad_norm": 0.27322637684631557, + "learning_rate": 9.072697578893865e-06, + "loss": 0.306, + "step": 5528 + }, + { + "epoch": 0.4423111537769245, + "grad_norm": 0.29862018104240856, + "learning_rate": 9.072321779097155e-06, + "loss": 0.2521, + "step": 5529 + }, + { + "epoch": 0.44239115217695646, + "grad_norm": 0.2663899555691935, + "learning_rate": 9.07194591095362e-06, + "loss": 0.2788, + "step": 5530 + }, + { + "epoch": 0.44247115057698844, + "grad_norm": 0.29449568704359064, + "learning_rate": 9.071569974469569e-06, + "loss": 0.2958, + "step": 5531 + }, + { + "epoch": 0.44255114897702047, + "grad_norm": 0.31496612545393604, + "learning_rate": 9.071193969651308e-06, + "loss": 0.2616, + "step": 5532 + }, + { + "epoch": 0.44263114737705245, + "grad_norm": 0.5337093576316178, + "learning_rate": 9.070817896505153e-06, + "loss": 0.2508, + "step": 5533 + }, + { + "epoch": 0.4427111457770845, + "grad_norm": 0.2545939015832631, + "learning_rate": 9.070441755037411e-06, + "loss": 0.3202, + "step": 5534 + }, + { + "epoch": 0.44279114417711646, + "grad_norm": 0.3555568993738711, + "learning_rate": 9.0700655452544e-06, + "loss": 0.2538, + "step": 5535 + }, + { + "epoch": 0.44287114257714844, + "grad_norm": 0.32798846812092775, + "learning_rate": 9.069689267162425e-06, + "loss": 0.2793, + "step": 5536 + }, + { + "epoch": 0.44295114097718047, + "grad_norm": 0.3025105824838569, + "learning_rate": 9.06931292076781e-06, + "loss": 0.2962, + "step": 5537 + }, + { + "epoch": 0.44303113937721245, + "grad_norm": 0.3220798367232042, + "learning_rate": 9.068936506076869e-06, + "loss": 0.2579, + "step": 5538 + }, + { + "epoch": 0.4431111377772445, + "grad_norm": 0.2544258968850388, + "learning_rate": 9.068560023095917e-06, + "loss": 0.3207, + "step": 5539 + }, + { + "epoch": 0.44319113617727646, + "grad_norm": 0.2786166745225385, + "learning_rate": 9.068183471831276e-06, + "loss": 0.2806, + "step": 5540 + }, + { + "epoch": 0.44327113457730843, + "grad_norm": 0.2894796650763557, + "learning_rate": 9.067806852289262e-06, + "loss": 0.241, + "step": 5541 + }, + { + "epoch": 0.44335113297734047, + "grad_norm": 0.20226464380511605, + "learning_rate": 9.067430164476201e-06, + "loss": 0.3527, + "step": 5542 + }, + { + "epoch": 0.44343113137737245, + "grad_norm": 0.32119690595369493, + "learning_rate": 9.067053408398409e-06, + "loss": 0.2555, + "step": 5543 + }, + { + "epoch": 0.4435111297774045, + "grad_norm": 0.44914315103030594, + "learning_rate": 9.066676584062214e-06, + "loss": 0.2658, + "step": 5544 + }, + { + "epoch": 0.44359112817743646, + "grad_norm": 0.33500301745631045, + "learning_rate": 9.06629969147394e-06, + "loss": 0.2793, + "step": 5545 + }, + { + "epoch": 0.44367112657746843, + "grad_norm": 0.2707431956379634, + "learning_rate": 9.065922730639906e-06, + "loss": 0.2747, + "step": 5546 + }, + { + "epoch": 0.44375112497750047, + "grad_norm": 0.3664944037464088, + "learning_rate": 9.065545701566448e-06, + "loss": 0.3078, + "step": 5547 + }, + { + "epoch": 0.44383112337753244, + "grad_norm": 0.3131343809017703, + "learning_rate": 9.06516860425989e-06, + "loss": 0.2996, + "step": 5548 + }, + { + "epoch": 0.4439111217775644, + "grad_norm": 0.33206726437611067, + "learning_rate": 9.064791438726557e-06, + "loss": 0.2546, + "step": 5549 + }, + { + "epoch": 0.44399112017759645, + "grad_norm": 0.30318433147221946, + "learning_rate": 9.064414204972784e-06, + "loss": 0.2438, + "step": 5550 + }, + { + "epoch": 0.44407111857762843, + "grad_norm": 0.3439310249667007, + "learning_rate": 9.0640369030049e-06, + "loss": 0.2604, + "step": 5551 + }, + { + "epoch": 0.44415111697766047, + "grad_norm": 0.31798152020472314, + "learning_rate": 9.063659532829239e-06, + "loss": 0.2629, + "step": 5552 + }, + { + "epoch": 0.44423111537769244, + "grad_norm": 0.2916073236850165, + "learning_rate": 9.063282094452133e-06, + "loss": 0.2758, + "step": 5553 + }, + { + "epoch": 0.4443111137777244, + "grad_norm": 0.28494244531663493, + "learning_rate": 9.062904587879917e-06, + "loss": 0.2924, + "step": 5554 + }, + { + "epoch": 0.44439111217775645, + "grad_norm": 0.30645940106419894, + "learning_rate": 9.062527013118926e-06, + "loss": 0.2552, + "step": 5555 + }, + { + "epoch": 0.44447111057778843, + "grad_norm": 0.3519526889375159, + "learning_rate": 9.062149370175502e-06, + "loss": 0.2483, + "step": 5556 + }, + { + "epoch": 0.44455110897782046, + "grad_norm": 0.3633300490177178, + "learning_rate": 9.061771659055974e-06, + "loss": 0.2641, + "step": 5557 + }, + { + "epoch": 0.44463110737785244, + "grad_norm": 0.29064249432426426, + "learning_rate": 9.061393879766688e-06, + "loss": 0.2368, + "step": 5558 + }, + { + "epoch": 0.4447111057778844, + "grad_norm": 0.2369138208912775, + "learning_rate": 9.061016032313984e-06, + "loss": 0.331, + "step": 5559 + }, + { + "epoch": 0.44479110417791645, + "grad_norm": 0.30358583170583575, + "learning_rate": 9.060638116704201e-06, + "loss": 0.2952, + "step": 5560 + }, + { + "epoch": 0.44487110257794843, + "grad_norm": 0.31303486117091034, + "learning_rate": 9.060260132943682e-06, + "loss": 0.2504, + "step": 5561 + }, + { + "epoch": 0.44495110097798046, + "grad_norm": 0.29103839590360725, + "learning_rate": 9.059882081038773e-06, + "loss": 0.286, + "step": 5562 + }, + { + "epoch": 0.44503109937801244, + "grad_norm": 0.22349322831935822, + "learning_rate": 9.059503960995816e-06, + "loss": 0.3602, + "step": 5563 + }, + { + "epoch": 0.4451110977780444, + "grad_norm": 0.3293902448108499, + "learning_rate": 9.059125772821158e-06, + "loss": 0.2605, + "step": 5564 + }, + { + "epoch": 0.44519109617807645, + "grad_norm": 0.4345774723158368, + "learning_rate": 9.058747516521149e-06, + "loss": 0.2438, + "step": 5565 + }, + { + "epoch": 0.44527109457810843, + "grad_norm": 0.3150458258751355, + "learning_rate": 9.058369192102134e-06, + "loss": 0.3195, + "step": 5566 + }, + { + "epoch": 0.44535109297814046, + "grad_norm": 0.25570843852966946, + "learning_rate": 9.057990799570464e-06, + "loss": 0.2816, + "step": 5567 + }, + { + "epoch": 0.44543109137817244, + "grad_norm": 0.2783916287739102, + "learning_rate": 9.05761233893249e-06, + "loss": 0.2898, + "step": 5568 + }, + { + "epoch": 0.4455110897782044, + "grad_norm": 0.30273883791637235, + "learning_rate": 9.05723381019456e-06, + "loss": 0.2406, + "step": 5569 + }, + { + "epoch": 0.44559108817823645, + "grad_norm": 0.23948383113919694, + "learning_rate": 9.056855213363032e-06, + "loss": 0.3295, + "step": 5570 + }, + { + "epoch": 0.4456710865782684, + "grad_norm": 0.2646450057218176, + "learning_rate": 9.056476548444258e-06, + "loss": 0.3045, + "step": 5571 + }, + { + "epoch": 0.44575108497830046, + "grad_norm": 0.2693345918562001, + "learning_rate": 9.056097815444593e-06, + "loss": 0.3165, + "step": 5572 + }, + { + "epoch": 0.44583108337833244, + "grad_norm": 0.3227580408894766, + "learning_rate": 9.055719014370396e-06, + "loss": 0.3002, + "step": 5573 + }, + { + "epoch": 0.4459110817783644, + "grad_norm": 0.28757082386956906, + "learning_rate": 9.05534014522802e-06, + "loss": 0.259, + "step": 5574 + }, + { + "epoch": 0.44599108017839645, + "grad_norm": 0.2300400703266702, + "learning_rate": 9.054961208023827e-06, + "loss": 0.2989, + "step": 5575 + }, + { + "epoch": 0.4460710785784284, + "grad_norm": 0.27937683778934963, + "learning_rate": 9.054582202764175e-06, + "loss": 0.3353, + "step": 5576 + }, + { + "epoch": 0.4461510769784604, + "grad_norm": 0.205044693315194, + "learning_rate": 9.054203129455425e-06, + "loss": 0.3488, + "step": 5577 + }, + { + "epoch": 0.44623107537849244, + "grad_norm": 0.3239849069221907, + "learning_rate": 9.053823988103943e-06, + "loss": 0.2783, + "step": 5578 + }, + { + "epoch": 0.4463110737785244, + "grad_norm": 0.252336336580127, + "learning_rate": 9.053444778716085e-06, + "loss": 0.3219, + "step": 5579 + }, + { + "epoch": 0.44639107217855645, + "grad_norm": 0.3078969168285223, + "learning_rate": 9.053065501298222e-06, + "loss": 0.2399, + "step": 5580 + }, + { + "epoch": 0.4464710705785884, + "grad_norm": 0.2687630987547738, + "learning_rate": 9.052686155856716e-06, + "loss": 0.2831, + "step": 5581 + }, + { + "epoch": 0.4465510689786204, + "grad_norm": 0.37006433520735704, + "learning_rate": 9.052306742397933e-06, + "loss": 0.2637, + "step": 5582 + }, + { + "epoch": 0.44663106737865244, + "grad_norm": 0.2628079482409702, + "learning_rate": 9.051927260928243e-06, + "loss": 0.2776, + "step": 5583 + }, + { + "epoch": 0.4467110657786844, + "grad_norm": 0.29440717479146344, + "learning_rate": 9.051547711454016e-06, + "loss": 0.3144, + "step": 5584 + }, + { + "epoch": 0.44679106417871645, + "grad_norm": 0.3300556101413221, + "learning_rate": 9.051168093981619e-06, + "loss": 0.2606, + "step": 5585 + }, + { + "epoch": 0.4468710625787484, + "grad_norm": 0.3332302910193934, + "learning_rate": 9.050788408517426e-06, + "loss": 0.2585, + "step": 5586 + }, + { + "epoch": 0.4469510609787804, + "grad_norm": 0.4744145968107132, + "learning_rate": 9.050408655067806e-06, + "loss": 0.2967, + "step": 5587 + }, + { + "epoch": 0.44703105937881243, + "grad_norm": 0.2971934536629232, + "learning_rate": 9.050028833639135e-06, + "loss": 0.3283, + "step": 5588 + }, + { + "epoch": 0.4471110577788444, + "grad_norm": 0.8355916304823727, + "learning_rate": 9.049648944237788e-06, + "loss": 0.2759, + "step": 5589 + }, + { + "epoch": 0.44719105617887644, + "grad_norm": 0.4343972901271774, + "learning_rate": 9.049268986870139e-06, + "loss": 0.2748, + "step": 5590 + }, + { + "epoch": 0.4472710545789084, + "grad_norm": 0.2674819603457788, + "learning_rate": 9.048888961542565e-06, + "loss": 0.2781, + "step": 5591 + }, + { + "epoch": 0.4473510529789404, + "grad_norm": 0.38285013281028885, + "learning_rate": 9.048508868261446e-06, + "loss": 0.2597, + "step": 5592 + }, + { + "epoch": 0.44743105137897243, + "grad_norm": 0.33413971273208976, + "learning_rate": 9.048128707033159e-06, + "loss": 0.2611, + "step": 5593 + }, + { + "epoch": 0.4475110497790044, + "grad_norm": 0.2435886384062945, + "learning_rate": 9.047748477864087e-06, + "loss": 0.2789, + "step": 5594 + }, + { + "epoch": 0.44759104817903644, + "grad_norm": 0.31104568939276717, + "learning_rate": 9.04736818076061e-06, + "loss": 0.2385, + "step": 5595 + }, + { + "epoch": 0.4476710465790684, + "grad_norm": 0.3288918532548245, + "learning_rate": 9.046987815729108e-06, + "loss": 0.2917, + "step": 5596 + }, + { + "epoch": 0.4477510449791004, + "grad_norm": 0.4474752320977135, + "learning_rate": 9.04660738277597e-06, + "loss": 0.2448, + "step": 5597 + }, + { + "epoch": 0.44783104337913243, + "grad_norm": 0.31423169424439035, + "learning_rate": 9.046226881907575e-06, + "loss": 0.2916, + "step": 5598 + }, + { + "epoch": 0.4479110417791644, + "grad_norm": 0.284311799031118, + "learning_rate": 9.045846313130313e-06, + "loss": 0.2903, + "step": 5599 + }, + { + "epoch": 0.44799104017919644, + "grad_norm": 0.32099135577091886, + "learning_rate": 9.045465676450572e-06, + "loss": 0.2868, + "step": 5600 + }, + { + "epoch": 0.4480710385792284, + "grad_norm": 0.26307008946547045, + "learning_rate": 9.045084971874738e-06, + "loss": 0.3011, + "step": 5601 + }, + { + "epoch": 0.4481510369792604, + "grad_norm": 0.24024802222938746, + "learning_rate": 9.0447041994092e-06, + "loss": 0.306, + "step": 5602 + }, + { + "epoch": 0.44823103537929243, + "grad_norm": 0.2737629597786706, + "learning_rate": 9.044323359060352e-06, + "loss": 0.2805, + "step": 5603 + }, + { + "epoch": 0.4483110337793244, + "grad_norm": 0.3429648158531512, + "learning_rate": 9.043942450834582e-06, + "loss": 0.2673, + "step": 5604 + }, + { + "epoch": 0.4483910321793564, + "grad_norm": 0.25793553174001876, + "learning_rate": 9.043561474738285e-06, + "loss": 0.3112, + "step": 5605 + }, + { + "epoch": 0.4484710305793884, + "grad_norm": 0.289542396743031, + "learning_rate": 9.043180430777854e-06, + "loss": 0.272, + "step": 5606 + }, + { + "epoch": 0.4485510289794204, + "grad_norm": 0.3400782640274442, + "learning_rate": 9.042799318959684e-06, + "loss": 0.2531, + "step": 5607 + }, + { + "epoch": 0.44863102737945243, + "grad_norm": 0.29912716934376893, + "learning_rate": 9.042418139290173e-06, + "loss": 0.2862, + "step": 5608 + }, + { + "epoch": 0.4487110257794844, + "grad_norm": 0.26575761068988984, + "learning_rate": 9.042036891775715e-06, + "loss": 0.2988, + "step": 5609 + }, + { + "epoch": 0.4487910241795164, + "grad_norm": 0.29738317284111315, + "learning_rate": 9.041655576422713e-06, + "loss": 0.2869, + "step": 5610 + }, + { + "epoch": 0.4488710225795484, + "grad_norm": 0.3223219323130359, + "learning_rate": 9.041274193237565e-06, + "loss": 0.235, + "step": 5611 + }, + { + "epoch": 0.4489510209795804, + "grad_norm": 0.24892052165565023, + "learning_rate": 9.04089274222667e-06, + "loss": 0.3164, + "step": 5612 + }, + { + "epoch": 0.44903101937961243, + "grad_norm": 0.24974714072125548, + "learning_rate": 9.040511223396432e-06, + "loss": 0.3315, + "step": 5613 + }, + { + "epoch": 0.4491110177796444, + "grad_norm": 0.34450259211205, + "learning_rate": 9.040129636753253e-06, + "loss": 0.2836, + "step": 5614 + }, + { + "epoch": 0.4491910161796764, + "grad_norm": 0.31941063345613213, + "learning_rate": 9.039747982303539e-06, + "loss": 0.2714, + "step": 5615 + }, + { + "epoch": 0.4492710145797084, + "grad_norm": 0.3287485874172965, + "learning_rate": 9.039366260053693e-06, + "loss": 0.2724, + "step": 5616 + }, + { + "epoch": 0.4493510129797404, + "grad_norm": 0.3086087293584916, + "learning_rate": 9.038984470010123e-06, + "loss": 0.3125, + "step": 5617 + }, + { + "epoch": 0.4494310113797724, + "grad_norm": 0.24890016446126073, + "learning_rate": 9.038602612179236e-06, + "loss": 0.3078, + "step": 5618 + }, + { + "epoch": 0.4495110097798044, + "grad_norm": 0.2884142537736799, + "learning_rate": 9.038220686567443e-06, + "loss": 0.2991, + "step": 5619 + }, + { + "epoch": 0.4495910081798364, + "grad_norm": 0.32664211971791374, + "learning_rate": 9.037838693181151e-06, + "loss": 0.2502, + "step": 5620 + }, + { + "epoch": 0.4496710065798684, + "grad_norm": 0.30543754011929564, + "learning_rate": 9.037456632026774e-06, + "loss": 0.2608, + "step": 5621 + }, + { + "epoch": 0.4497510049799004, + "grad_norm": 0.2722700118972102, + "learning_rate": 9.037074503110719e-06, + "loss": 0.2935, + "step": 5622 + }, + { + "epoch": 0.4498310033799324, + "grad_norm": 0.2814185484101297, + "learning_rate": 9.036692306439406e-06, + "loss": 0.276, + "step": 5623 + }, + { + "epoch": 0.4499110017799644, + "grad_norm": 0.33917382616876846, + "learning_rate": 9.036310042019245e-06, + "loss": 0.2629, + "step": 5624 + }, + { + "epoch": 0.4499910001799964, + "grad_norm": 0.2644404209193627, + "learning_rate": 9.035927709856654e-06, + "loss": 0.3064, + "step": 5625 + }, + { + "epoch": 0.4500709985800284, + "grad_norm": 0.31998057162255167, + "learning_rate": 9.035545309958048e-06, + "loss": 0.2791, + "step": 5626 + }, + { + "epoch": 0.4501509969800604, + "grad_norm": 0.3144383580175707, + "learning_rate": 9.035162842329845e-06, + "loss": 0.2527, + "step": 5627 + }, + { + "epoch": 0.4502309953800924, + "grad_norm": 0.2475654942027119, + "learning_rate": 9.034780306978466e-06, + "loss": 0.2811, + "step": 5628 + }, + { + "epoch": 0.4503109937801244, + "grad_norm": 0.33728880170131154, + "learning_rate": 9.034397703910328e-06, + "loss": 0.2435, + "step": 5629 + }, + { + "epoch": 0.4503909921801564, + "grad_norm": 0.3028517258463103, + "learning_rate": 9.034015033131858e-06, + "loss": 0.2624, + "step": 5630 + }, + { + "epoch": 0.4504709905801884, + "grad_norm": 0.19293587862662287, + "learning_rate": 9.033632294649473e-06, + "loss": 0.3324, + "step": 5631 + }, + { + "epoch": 0.4505509889802204, + "grad_norm": 0.3338684285346212, + "learning_rate": 9.033249488469597e-06, + "loss": 0.2884, + "step": 5632 + }, + { + "epoch": 0.45063098738025237, + "grad_norm": 0.41663317342503803, + "learning_rate": 9.032866614598658e-06, + "loss": 0.2524, + "step": 5633 + }, + { + "epoch": 0.4507109857802844, + "grad_norm": 0.44225916933109277, + "learning_rate": 9.03248367304308e-06, + "loss": 0.2996, + "step": 5634 + }, + { + "epoch": 0.4507909841803164, + "grad_norm": 0.2628770511632072, + "learning_rate": 9.032100663809288e-06, + "loss": 0.2999, + "step": 5635 + }, + { + "epoch": 0.4508709825803484, + "grad_norm": 0.25813889565506465, + "learning_rate": 9.031717586903715e-06, + "loss": 0.2887, + "step": 5636 + }, + { + "epoch": 0.4509509809803804, + "grad_norm": 0.26502412157002514, + "learning_rate": 9.031334442332784e-06, + "loss": 0.3216, + "step": 5637 + }, + { + "epoch": 0.45103097938041237, + "grad_norm": 0.2982665930974367, + "learning_rate": 9.03095123010293e-06, + "loss": 0.2378, + "step": 5638 + }, + { + "epoch": 0.4511109777804444, + "grad_norm": 0.29201081711920746, + "learning_rate": 9.030567950220586e-06, + "loss": 0.2615, + "step": 5639 + }, + { + "epoch": 0.4511909761804764, + "grad_norm": 0.2998080542854087, + "learning_rate": 9.030184602692179e-06, + "loss": 0.2491, + "step": 5640 + }, + { + "epoch": 0.4512709745805084, + "grad_norm": 0.2986194988912872, + "learning_rate": 9.029801187524147e-06, + "loss": 0.2597, + "step": 5641 + }, + { + "epoch": 0.4513509729805404, + "grad_norm": 0.2883281759915458, + "learning_rate": 9.029417704722925e-06, + "loss": 0.2938, + "step": 5642 + }, + { + "epoch": 0.45143097138057237, + "grad_norm": 0.2430385808531927, + "learning_rate": 9.029034154294945e-06, + "loss": 0.3295, + "step": 5643 + }, + { + "epoch": 0.4515109697806044, + "grad_norm": 0.30187154664375243, + "learning_rate": 9.02865053624665e-06, + "loss": 0.3231, + "step": 5644 + }, + { + "epoch": 0.4515909681806364, + "grad_norm": 0.31179860386625846, + "learning_rate": 9.028266850584473e-06, + "loss": 0.2566, + "step": 5645 + }, + { + "epoch": 0.4516709665806684, + "grad_norm": 0.28223329319727625, + "learning_rate": 9.027883097314859e-06, + "loss": 0.2834, + "step": 5646 + }, + { + "epoch": 0.4517509649807004, + "grad_norm": 0.30544234956852817, + "learning_rate": 9.027499276444242e-06, + "loss": 0.2467, + "step": 5647 + }, + { + "epoch": 0.45183096338073236, + "grad_norm": 0.2597709208639203, + "learning_rate": 9.02711538797907e-06, + "loss": 0.3074, + "step": 5648 + }, + { + "epoch": 0.4519109617807644, + "grad_norm": 0.30266098700005173, + "learning_rate": 9.026731431925784e-06, + "loss": 0.2654, + "step": 5649 + }, + { + "epoch": 0.4519909601807964, + "grad_norm": 0.24729322250465036, + "learning_rate": 9.026347408290825e-06, + "loss": 0.3259, + "step": 5650 + }, + { + "epoch": 0.4520709585808284, + "grad_norm": 0.26801114860203573, + "learning_rate": 9.025963317080641e-06, + "loss": 0.2769, + "step": 5651 + }, + { + "epoch": 0.4521509569808604, + "grad_norm": 0.33734427242948156, + "learning_rate": 9.025579158301679e-06, + "loss": 0.2734, + "step": 5652 + }, + { + "epoch": 0.45223095538089236, + "grad_norm": 0.22481383076956757, + "learning_rate": 9.025194931960385e-06, + "loss": 0.3218, + "step": 5653 + }, + { + "epoch": 0.4523109537809244, + "grad_norm": 0.2998795172784253, + "learning_rate": 9.024810638063207e-06, + "loss": 0.2588, + "step": 5654 + }, + { + "epoch": 0.4523909521809564, + "grad_norm": 0.3139141850903015, + "learning_rate": 9.024426276616595e-06, + "loss": 0.2483, + "step": 5655 + }, + { + "epoch": 0.4524709505809884, + "grad_norm": 0.31033240773630466, + "learning_rate": 9.024041847627003e-06, + "loss": 0.2571, + "step": 5656 + }, + { + "epoch": 0.4525509489810204, + "grad_norm": 0.29880279724779935, + "learning_rate": 9.023657351100878e-06, + "loss": 0.2918, + "step": 5657 + }, + { + "epoch": 0.45263094738105236, + "grad_norm": 0.31054421730758835, + "learning_rate": 9.023272787044677e-06, + "loss": 0.2506, + "step": 5658 + }, + { + "epoch": 0.4527109457810844, + "grad_norm": 0.28541812610873313, + "learning_rate": 9.02288815546485e-06, + "loss": 0.2711, + "step": 5659 + }, + { + "epoch": 0.4527909441811164, + "grad_norm": 0.3021087151657876, + "learning_rate": 9.022503456367857e-06, + "loss": 0.2596, + "step": 5660 + }, + { + "epoch": 0.45287094258114835, + "grad_norm": 0.23484861913491784, + "learning_rate": 9.022118689760153e-06, + "loss": 0.2962, + "step": 5661 + }, + { + "epoch": 0.4529509409811804, + "grad_norm": 0.2881987201842057, + "learning_rate": 9.021733855648195e-06, + "loss": 0.2737, + "step": 5662 + }, + { + "epoch": 0.45303093938121236, + "grad_norm": 0.30361817825960186, + "learning_rate": 9.02134895403844e-06, + "loss": 0.2768, + "step": 5663 + }, + { + "epoch": 0.4531109377812444, + "grad_norm": 0.22661207778574827, + "learning_rate": 9.020963984937352e-06, + "loss": 0.3583, + "step": 5664 + }, + { + "epoch": 0.45319093618127637, + "grad_norm": 0.2670716373803451, + "learning_rate": 9.020578948351389e-06, + "loss": 0.2939, + "step": 5665 + }, + { + "epoch": 0.45327093458130835, + "grad_norm": 0.34748727809787566, + "learning_rate": 9.020193844287013e-06, + "loss": 0.2661, + "step": 5666 + }, + { + "epoch": 0.4533509329813404, + "grad_norm": 0.5706291529292663, + "learning_rate": 9.01980867275069e-06, + "loss": 0.2456, + "step": 5667 + }, + { + "epoch": 0.45343093138137236, + "grad_norm": 0.29968714206184144, + "learning_rate": 9.019423433748882e-06, + "loss": 0.2587, + "step": 5668 + }, + { + "epoch": 0.4535109297814044, + "grad_norm": 0.2705649589283515, + "learning_rate": 9.019038127288056e-06, + "loss": 0.3035, + "step": 5669 + }, + { + "epoch": 0.45359092818143637, + "grad_norm": 0.28762733187630907, + "learning_rate": 9.018652753374677e-06, + "loss": 0.2582, + "step": 5670 + }, + { + "epoch": 0.45367092658146835, + "grad_norm": 0.23547754523875356, + "learning_rate": 9.018267312015214e-06, + "loss": 0.3341, + "step": 5671 + }, + { + "epoch": 0.4537509249815004, + "grad_norm": 0.2701532552630431, + "learning_rate": 9.017881803216138e-06, + "loss": 0.3042, + "step": 5672 + }, + { + "epoch": 0.45383092338153236, + "grad_norm": 0.3500251342970308, + "learning_rate": 9.017496226983915e-06, + "loss": 0.2817, + "step": 5673 + }, + { + "epoch": 0.4539109217815644, + "grad_norm": 0.34459706625376774, + "learning_rate": 9.017110583325017e-06, + "loss": 0.2532, + "step": 5674 + }, + { + "epoch": 0.45399092018159637, + "grad_norm": 0.22861990433302665, + "learning_rate": 9.01672487224592e-06, + "loss": 0.3137, + "step": 5675 + }, + { + "epoch": 0.45407091858162835, + "grad_norm": 0.2062557821894587, + "learning_rate": 9.016339093753093e-06, + "loss": 0.3557, + "step": 5676 + }, + { + "epoch": 0.4541509169816604, + "grad_norm": 0.2498386261950635, + "learning_rate": 9.015953247853014e-06, + "loss": 0.324, + "step": 5677 + }, + { + "epoch": 0.45423091538169236, + "grad_norm": 0.2775962049889726, + "learning_rate": 9.015567334552158e-06, + "loss": 0.2948, + "step": 5678 + }, + { + "epoch": 0.4543109137817244, + "grad_norm": 0.2896902225691437, + "learning_rate": 9.015181353856998e-06, + "loss": 0.2687, + "step": 5679 + }, + { + "epoch": 0.45439091218175637, + "grad_norm": 0.30501492797322455, + "learning_rate": 9.014795305774019e-06, + "loss": 0.2698, + "step": 5680 + }, + { + "epoch": 0.45447091058178835, + "grad_norm": 0.28561895341133053, + "learning_rate": 9.014409190309695e-06, + "loss": 0.3026, + "step": 5681 + }, + { + "epoch": 0.4545509089818204, + "grad_norm": 0.30837919402429925, + "learning_rate": 9.014023007470507e-06, + "loss": 0.2508, + "step": 5682 + }, + { + "epoch": 0.45463090738185236, + "grad_norm": 0.2977031939966874, + "learning_rate": 9.013636757262938e-06, + "loss": 0.2432, + "step": 5683 + }, + { + "epoch": 0.4547109057818844, + "grad_norm": 0.25754415871267855, + "learning_rate": 9.013250439693468e-06, + "loss": 0.3002, + "step": 5684 + }, + { + "epoch": 0.45479090418191637, + "grad_norm": 0.27349375734525255, + "learning_rate": 9.012864054768584e-06, + "loss": 0.2712, + "step": 5685 + }, + { + "epoch": 0.45487090258194834, + "grad_norm": 0.26811872260216413, + "learning_rate": 9.012477602494768e-06, + "loss": 0.2816, + "step": 5686 + }, + { + "epoch": 0.4549509009819804, + "grad_norm": 0.259485351355254, + "learning_rate": 9.01209108287851e-06, + "loss": 0.3002, + "step": 5687 + }, + { + "epoch": 0.45503089938201235, + "grad_norm": 0.29037885670128794, + "learning_rate": 9.01170449592629e-06, + "loss": 0.271, + "step": 5688 + }, + { + "epoch": 0.45511089778204433, + "grad_norm": 0.3370454315786992, + "learning_rate": 9.011317841644602e-06, + "loss": 0.2484, + "step": 5689 + }, + { + "epoch": 0.45519089618207637, + "grad_norm": 0.30406566790697725, + "learning_rate": 9.010931120039934e-06, + "loss": 0.3035, + "step": 5690 + }, + { + "epoch": 0.45527089458210834, + "grad_norm": 0.30286979605168124, + "learning_rate": 9.010544331118776e-06, + "loss": 0.2899, + "step": 5691 + }, + { + "epoch": 0.4553508929821404, + "grad_norm": 0.3268703307425401, + "learning_rate": 9.01015747488762e-06, + "loss": 0.2614, + "step": 5692 + }, + { + "epoch": 0.45543089138217235, + "grad_norm": 0.2021326362922993, + "learning_rate": 9.009770551352957e-06, + "loss": 0.368, + "step": 5693 + }, + { + "epoch": 0.45551088978220433, + "grad_norm": 0.30653252424072697, + "learning_rate": 9.009383560521284e-06, + "loss": 0.2755, + "step": 5694 + }, + { + "epoch": 0.45559088818223636, + "grad_norm": 0.24797127795035148, + "learning_rate": 9.008996502399092e-06, + "loss": 0.2807, + "step": 5695 + }, + { + "epoch": 0.45567088658226834, + "grad_norm": 0.24017968391194194, + "learning_rate": 9.008609376992879e-06, + "loss": 0.3119, + "step": 5696 + }, + { + "epoch": 0.4557508849823004, + "grad_norm": 0.23524960158742408, + "learning_rate": 9.008222184309145e-06, + "loss": 0.3277, + "step": 5697 + }, + { + "epoch": 0.45583088338233235, + "grad_norm": 0.25241637842412046, + "learning_rate": 9.007834924354384e-06, + "loss": 0.2896, + "step": 5698 + }, + { + "epoch": 0.45591088178236433, + "grad_norm": 0.27192507701001134, + "learning_rate": 9.007447597135097e-06, + "loss": 0.2881, + "step": 5699 + }, + { + "epoch": 0.45599088018239636, + "grad_norm": 0.3307123795920149, + "learning_rate": 9.007060202657785e-06, + "loss": 0.2559, + "step": 5700 + }, + { + "epoch": 0.45607087858242834, + "grad_norm": 0.23930411036443908, + "learning_rate": 9.006672740928952e-06, + "loss": 0.3139, + "step": 5701 + }, + { + "epoch": 0.4561508769824604, + "grad_norm": 0.2574565588745887, + "learning_rate": 9.006285211955095e-06, + "loss": 0.2927, + "step": 5702 + }, + { + "epoch": 0.45623087538249235, + "grad_norm": 0.30533934059239326, + "learning_rate": 9.005897615742723e-06, + "loss": 0.2869, + "step": 5703 + }, + { + "epoch": 0.45631087378252433, + "grad_norm": 0.24824860185365413, + "learning_rate": 9.005509952298341e-06, + "loss": 0.3307, + "step": 5704 + }, + { + "epoch": 0.45639087218255636, + "grad_norm": 0.27551065378772505, + "learning_rate": 9.005122221628452e-06, + "loss": 0.2834, + "step": 5705 + }, + { + "epoch": 0.45647087058258834, + "grad_norm": 0.31840337583536443, + "learning_rate": 9.004734423739567e-06, + "loss": 0.2419, + "step": 5706 + }, + { + "epoch": 0.4565508689826204, + "grad_norm": 0.29808064580035876, + "learning_rate": 9.00434655863819e-06, + "loss": 0.2803, + "step": 5707 + }, + { + "epoch": 0.45663086738265235, + "grad_norm": 0.29771552682217306, + "learning_rate": 9.003958626330836e-06, + "loss": 0.288, + "step": 5708 + }, + { + "epoch": 0.4567108657826843, + "grad_norm": 0.3012665850981478, + "learning_rate": 9.003570626824013e-06, + "loss": 0.2613, + "step": 5709 + }, + { + "epoch": 0.45679086418271636, + "grad_norm": 0.30670784685946273, + "learning_rate": 9.003182560124233e-06, + "loss": 0.2518, + "step": 5710 + }, + { + "epoch": 0.45687086258274834, + "grad_norm": 0.247326398357017, + "learning_rate": 9.002794426238009e-06, + "loss": 0.3034, + "step": 5711 + }, + { + "epoch": 0.45695086098278037, + "grad_norm": 0.26153281510187193, + "learning_rate": 9.002406225171854e-06, + "loss": 0.3137, + "step": 5712 + }, + { + "epoch": 0.45703085938281235, + "grad_norm": 0.33003122461644413, + "learning_rate": 9.002017956932285e-06, + "loss": 0.238, + "step": 5713 + }, + { + "epoch": 0.4571108577828443, + "grad_norm": 0.33957556940596817, + "learning_rate": 9.001629621525819e-06, + "loss": 0.3039, + "step": 5714 + }, + { + "epoch": 0.45719085618287636, + "grad_norm": 0.25844997754384297, + "learning_rate": 9.001241218958972e-06, + "loss": 0.2798, + "step": 5715 + }, + { + "epoch": 0.45727085458290834, + "grad_norm": 0.25058774269315715, + "learning_rate": 9.000852749238263e-06, + "loss": 0.275, + "step": 5716 + }, + { + "epoch": 0.4573508529829403, + "grad_norm": 0.2564060105360013, + "learning_rate": 9.00046421237021e-06, + "loss": 0.2913, + "step": 5717 + }, + { + "epoch": 0.45743085138297235, + "grad_norm": 0.2619532662604188, + "learning_rate": 9.000075608361338e-06, + "loss": 0.2939, + "step": 5718 + }, + { + "epoch": 0.4575108497830043, + "grad_norm": 0.2797279891150205, + "learning_rate": 8.999686937218168e-06, + "loss": 0.289, + "step": 5719 + }, + { + "epoch": 0.45759084818303636, + "grad_norm": 0.27504308412081513, + "learning_rate": 8.999298198947219e-06, + "loss": 0.3024, + "step": 5720 + }, + { + "epoch": 0.45767084658306834, + "grad_norm": 0.2930297939351951, + "learning_rate": 8.998909393555022e-06, + "loss": 0.2489, + "step": 5721 + }, + { + "epoch": 0.4577508449831003, + "grad_norm": 0.28397552985575436, + "learning_rate": 8.998520521048096e-06, + "loss": 0.2885, + "step": 5722 + }, + { + "epoch": 0.45783084338313235, + "grad_norm": 0.18929032703718748, + "learning_rate": 8.998131581432972e-06, + "loss": 0.3288, + "step": 5723 + }, + { + "epoch": 0.4579108417831643, + "grad_norm": 0.2811289826699942, + "learning_rate": 8.997742574716175e-06, + "loss": 0.2719, + "step": 5724 + }, + { + "epoch": 0.45799084018319636, + "grad_norm": 0.29873891016683424, + "learning_rate": 8.997353500904234e-06, + "loss": 0.2513, + "step": 5725 + }, + { + "epoch": 0.45807083858322833, + "grad_norm": 0.25813044240358757, + "learning_rate": 8.99696436000368e-06, + "loss": 0.2659, + "step": 5726 + }, + { + "epoch": 0.4581508369832603, + "grad_norm": 0.29203999347527887, + "learning_rate": 8.996575152021045e-06, + "loss": 0.257, + "step": 5727 + }, + { + "epoch": 0.45823083538329235, + "grad_norm": 0.2810570026486525, + "learning_rate": 8.996185876962859e-06, + "loss": 0.2293, + "step": 5728 + }, + { + "epoch": 0.4583108337833243, + "grad_norm": 0.3019512217265175, + "learning_rate": 8.995796534835656e-06, + "loss": 0.2765, + "step": 5729 + }, + { + "epoch": 0.45839083218335636, + "grad_norm": 0.316066082426242, + "learning_rate": 8.995407125645972e-06, + "loss": 0.2521, + "step": 5730 + }, + { + "epoch": 0.45847083058338833, + "grad_norm": 0.4163112045672262, + "learning_rate": 8.995017649400341e-06, + "loss": 0.239, + "step": 5731 + }, + { + "epoch": 0.4585508289834203, + "grad_norm": 0.23341158775925291, + "learning_rate": 8.994628106105298e-06, + "loss": 0.3198, + "step": 5732 + }, + { + "epoch": 0.45863082738345234, + "grad_norm": 0.7548677407165855, + "learning_rate": 8.994238495767385e-06, + "loss": 0.2576, + "step": 5733 + }, + { + "epoch": 0.4587108257834843, + "grad_norm": 0.34504067280488815, + "learning_rate": 8.993848818393139e-06, + "loss": 0.2389, + "step": 5734 + }, + { + "epoch": 0.45879082418351635, + "grad_norm": 0.2588751462417314, + "learning_rate": 8.993459073989098e-06, + "loss": 0.3128, + "step": 5735 + }, + { + "epoch": 0.45887082258354833, + "grad_norm": 0.3127232439908338, + "learning_rate": 8.993069262561805e-06, + "loss": 0.3028, + "step": 5736 + }, + { + "epoch": 0.4589508209835803, + "grad_norm": 0.3024930462453285, + "learning_rate": 8.992679384117802e-06, + "loss": 0.2773, + "step": 5737 + }, + { + "epoch": 0.45903081938361234, + "grad_norm": 0.26151220410359416, + "learning_rate": 8.992289438663632e-06, + "loss": 0.2956, + "step": 5738 + }, + { + "epoch": 0.4591108177836443, + "grad_norm": 0.3343245354080847, + "learning_rate": 8.991899426205844e-06, + "loss": 0.2762, + "step": 5739 + }, + { + "epoch": 0.45919081618367635, + "grad_norm": 0.3405573810839733, + "learning_rate": 8.991509346750975e-06, + "loss": 0.2824, + "step": 5740 + }, + { + "epoch": 0.45927081458370833, + "grad_norm": 0.3048668397576305, + "learning_rate": 8.99111920030558e-06, + "loss": 0.2926, + "step": 5741 + }, + { + "epoch": 0.4593508129837403, + "grad_norm": 0.3053999848651645, + "learning_rate": 8.990728986876203e-06, + "loss": 0.2801, + "step": 5742 + }, + { + "epoch": 0.45943081138377234, + "grad_norm": 0.2933816480427734, + "learning_rate": 8.990338706469393e-06, + "loss": 0.2599, + "step": 5743 + }, + { + "epoch": 0.4595108097838043, + "grad_norm": 0.2665839337944934, + "learning_rate": 8.9899483590917e-06, + "loss": 0.2774, + "step": 5744 + }, + { + "epoch": 0.4595908081838363, + "grad_norm": 0.34123110035725057, + "learning_rate": 8.989557944749677e-06, + "loss": 0.2562, + "step": 5745 + }, + { + "epoch": 0.45967080658386833, + "grad_norm": 0.25257115847420963, + "learning_rate": 8.989167463449874e-06, + "loss": 0.3117, + "step": 5746 + }, + { + "epoch": 0.4597508049839003, + "grad_norm": 0.2677806148407579, + "learning_rate": 8.988776915198849e-06, + "loss": 0.29, + "step": 5747 + }, + { + "epoch": 0.45983080338393234, + "grad_norm": 0.3173726398976456, + "learning_rate": 8.988386300003153e-06, + "loss": 0.2582, + "step": 5748 + }, + { + "epoch": 0.4599108017839643, + "grad_norm": 0.2776671944309633, + "learning_rate": 8.987995617869341e-06, + "loss": 0.2829, + "step": 5749 + }, + { + "epoch": 0.4599908001839963, + "grad_norm": 0.26687416860938623, + "learning_rate": 8.987604868803972e-06, + "loss": 0.2632, + "step": 5750 + }, + { + "epoch": 0.46007079858402833, + "grad_norm": 0.3208482103308363, + "learning_rate": 8.987214052813605e-06, + "loss": 0.225, + "step": 5751 + }, + { + "epoch": 0.4601507969840603, + "grad_norm": 0.26742010100475155, + "learning_rate": 8.986823169904797e-06, + "loss": 0.3164, + "step": 5752 + }, + { + "epoch": 0.46023079538409234, + "grad_norm": 0.269844582091821, + "learning_rate": 8.986432220084108e-06, + "loss": 0.2761, + "step": 5753 + }, + { + "epoch": 0.4603107937841243, + "grad_norm": 0.22389317613720738, + "learning_rate": 8.986041203358101e-06, + "loss": 0.3215, + "step": 5754 + }, + { + "epoch": 0.4603907921841563, + "grad_norm": 0.291198802529429, + "learning_rate": 8.985650119733338e-06, + "loss": 0.257, + "step": 5755 + }, + { + "epoch": 0.4604707905841883, + "grad_norm": 0.2957773042894445, + "learning_rate": 8.985258969216383e-06, + "loss": 0.2808, + "step": 5756 + }, + { + "epoch": 0.4605507889842203, + "grad_norm": 0.2950509523276521, + "learning_rate": 8.9848677518138e-06, + "loss": 0.2974, + "step": 5757 + }, + { + "epoch": 0.46063078738425234, + "grad_norm": 0.28130603845741026, + "learning_rate": 8.984476467532156e-06, + "loss": 0.3018, + "step": 5758 + }, + { + "epoch": 0.4607107857842843, + "grad_norm": 0.2654643009382576, + "learning_rate": 8.984085116378015e-06, + "loss": 0.3048, + "step": 5759 + }, + { + "epoch": 0.4607907841843163, + "grad_norm": 0.3108520601587131, + "learning_rate": 8.98369369835795e-06, + "loss": 0.2498, + "step": 5760 + }, + { + "epoch": 0.4608707825843483, + "grad_norm": 0.3563608829858653, + "learning_rate": 8.983302213478525e-06, + "loss": 0.2573, + "step": 5761 + }, + { + "epoch": 0.4609507809843803, + "grad_norm": 0.31070741242122324, + "learning_rate": 8.982910661746315e-06, + "loss": 0.2908, + "step": 5762 + }, + { + "epoch": 0.46103077938441234, + "grad_norm": 0.30846740530982875, + "learning_rate": 8.982519043167888e-06, + "loss": 0.2651, + "step": 5763 + }, + { + "epoch": 0.4611107777844443, + "grad_norm": 0.24598659564488626, + "learning_rate": 8.98212735774982e-06, + "loss": 0.3232, + "step": 5764 + }, + { + "epoch": 0.4611907761844763, + "grad_norm": 0.27321142676964094, + "learning_rate": 8.981735605498683e-06, + "loss": 0.2856, + "step": 5765 + }, + { + "epoch": 0.4612707745845083, + "grad_norm": 0.28610065071086943, + "learning_rate": 8.981343786421051e-06, + "loss": 0.2784, + "step": 5766 + }, + { + "epoch": 0.4613507729845403, + "grad_norm": 0.36010062803700194, + "learning_rate": 8.9809519005235e-06, + "loss": 0.278, + "step": 5767 + }, + { + "epoch": 0.46143077138457234, + "grad_norm": 0.3192937977714049, + "learning_rate": 8.98055994781261e-06, + "loss": 0.2984, + "step": 5768 + }, + { + "epoch": 0.4615107697846043, + "grad_norm": 0.23240969099305078, + "learning_rate": 8.980167928294956e-06, + "loss": 0.3017, + "step": 5769 + }, + { + "epoch": 0.4615907681846363, + "grad_norm": 0.24092273770462636, + "learning_rate": 8.97977584197712e-06, + "loss": 0.3332, + "step": 5770 + }, + { + "epoch": 0.4616707665846683, + "grad_norm": 0.27069702263300327, + "learning_rate": 8.97938368886568e-06, + "loss": 0.2924, + "step": 5771 + }, + { + "epoch": 0.4617507649847003, + "grad_norm": 0.24862989850632858, + "learning_rate": 8.978991468967218e-06, + "loss": 0.2726, + "step": 5772 + }, + { + "epoch": 0.4618307633847323, + "grad_norm": 0.21831012753687268, + "learning_rate": 8.978599182288319e-06, + "loss": 0.3187, + "step": 5773 + }, + { + "epoch": 0.4619107617847643, + "grad_norm": 0.29600031757909434, + "learning_rate": 8.978206828835564e-06, + "loss": 0.2867, + "step": 5774 + }, + { + "epoch": 0.4619907601847963, + "grad_norm": 0.23250020902349608, + "learning_rate": 8.97781440861554e-06, + "loss": 0.3125, + "step": 5775 + }, + { + "epoch": 0.4620707585848283, + "grad_norm": 0.3355016215639556, + "learning_rate": 8.977421921634833e-06, + "loss": 0.2406, + "step": 5776 + }, + { + "epoch": 0.4621507569848603, + "grad_norm": 0.2665202993886267, + "learning_rate": 8.977029367900028e-06, + "loss": 0.2803, + "step": 5777 + }, + { + "epoch": 0.4622307553848923, + "grad_norm": 0.3061210664835922, + "learning_rate": 8.976636747417715e-06, + "loss": 0.2876, + "step": 5778 + }, + { + "epoch": 0.4623107537849243, + "grad_norm": 0.24807078690939627, + "learning_rate": 8.976244060194484e-06, + "loss": 0.2819, + "step": 5779 + }, + { + "epoch": 0.4623907521849563, + "grad_norm": 0.31685086451058636, + "learning_rate": 8.975851306236925e-06, + "loss": 0.283, + "step": 5780 + }, + { + "epoch": 0.4624707505849883, + "grad_norm": 0.3082104061076356, + "learning_rate": 8.97545848555163e-06, + "loss": 0.2366, + "step": 5781 + }, + { + "epoch": 0.4625507489850203, + "grad_norm": 0.2686830051682658, + "learning_rate": 8.97506559814519e-06, + "loss": 0.255, + "step": 5782 + }, + { + "epoch": 0.4626307473850523, + "grad_norm": 0.2658598030042549, + "learning_rate": 8.9746726440242e-06, + "loss": 0.3367, + "step": 5783 + }, + { + "epoch": 0.4627107457850843, + "grad_norm": 0.29630629414157716, + "learning_rate": 8.974279623195257e-06, + "loss": 0.2648, + "step": 5784 + }, + { + "epoch": 0.4627907441851163, + "grad_norm": 0.28807600672163663, + "learning_rate": 8.973886535664954e-06, + "loss": 0.2477, + "step": 5785 + }, + { + "epoch": 0.4628707425851483, + "grad_norm": 0.2497658765717286, + "learning_rate": 8.973493381439892e-06, + "loss": 0.2919, + "step": 5786 + }, + { + "epoch": 0.4629507409851803, + "grad_norm": 0.2698661346263241, + "learning_rate": 8.973100160526666e-06, + "loss": 0.2924, + "step": 5787 + }, + { + "epoch": 0.4630307393852123, + "grad_norm": 0.3283141863721741, + "learning_rate": 8.972706872931877e-06, + "loss": 0.2618, + "step": 5788 + }, + { + "epoch": 0.4631107377852443, + "grad_norm": 0.23351849884289716, + "learning_rate": 8.972313518662125e-06, + "loss": 0.3145, + "step": 5789 + }, + { + "epoch": 0.4631907361852763, + "grad_norm": 0.5213563944960924, + "learning_rate": 8.971920097724012e-06, + "loss": 0.2872, + "step": 5790 + }, + { + "epoch": 0.4632707345853083, + "grad_norm": 0.2753298222630535, + "learning_rate": 8.971526610124142e-06, + "loss": 0.3464, + "step": 5791 + }, + { + "epoch": 0.4633507329853403, + "grad_norm": 0.32971697956594914, + "learning_rate": 8.971133055869117e-06, + "loss": 0.2665, + "step": 5792 + }, + { + "epoch": 0.4634307313853723, + "grad_norm": 0.2504774800329252, + "learning_rate": 8.970739434965544e-06, + "loss": 0.2987, + "step": 5793 + }, + { + "epoch": 0.4635107297854043, + "grad_norm": 0.26933383989691456, + "learning_rate": 8.970345747420029e-06, + "loss": 0.2843, + "step": 5794 + }, + { + "epoch": 0.4635907281854363, + "grad_norm": 0.25718202378259497, + "learning_rate": 8.969951993239177e-06, + "loss": 0.287, + "step": 5795 + }, + { + "epoch": 0.4636707265854683, + "grad_norm": 0.3069182338044001, + "learning_rate": 8.9695581724296e-06, + "loss": 0.2505, + "step": 5796 + }, + { + "epoch": 0.4637507249855003, + "grad_norm": 0.26707907635924255, + "learning_rate": 8.969164284997905e-06, + "loss": 0.2842, + "step": 5797 + }, + { + "epoch": 0.4638307233855323, + "grad_norm": 0.27193971181991994, + "learning_rate": 8.968770330950703e-06, + "loss": 0.301, + "step": 5798 + }, + { + "epoch": 0.4639107217855643, + "grad_norm": 0.2580996315642149, + "learning_rate": 8.968376310294608e-06, + "loss": 0.2702, + "step": 5799 + }, + { + "epoch": 0.4639907201855963, + "grad_norm": 0.2689842735500525, + "learning_rate": 8.96798222303623e-06, + "loss": 0.2686, + "step": 5800 + }, + { + "epoch": 0.46407071858562826, + "grad_norm": 0.2719310774258587, + "learning_rate": 8.967588069182184e-06, + "loss": 0.292, + "step": 5801 + }, + { + "epoch": 0.4641507169856603, + "grad_norm": 0.3158973157639977, + "learning_rate": 8.967193848739087e-06, + "loss": 0.2918, + "step": 5802 + }, + { + "epoch": 0.46423071538569227, + "grad_norm": 0.18594801823625473, + "learning_rate": 8.966799561713556e-06, + "loss": 0.3329, + "step": 5803 + }, + { + "epoch": 0.4643107137857243, + "grad_norm": 0.2776274425328694, + "learning_rate": 8.966405208112202e-06, + "loss": 0.2714, + "step": 5804 + }, + { + "epoch": 0.4643907121857563, + "grad_norm": 0.23801349093970028, + "learning_rate": 8.96601078794165e-06, + "loss": 0.3211, + "step": 5805 + }, + { + "epoch": 0.46447071058578826, + "grad_norm": 0.3229035737062405, + "learning_rate": 8.965616301208517e-06, + "loss": 0.2467, + "step": 5806 + }, + { + "epoch": 0.4645507089858203, + "grad_norm": 0.3159661051094077, + "learning_rate": 8.965221747919424e-06, + "loss": 0.2537, + "step": 5807 + }, + { + "epoch": 0.46463070738585227, + "grad_norm": 0.34638462706942025, + "learning_rate": 8.964827128080995e-06, + "loss": 0.2497, + "step": 5808 + }, + { + "epoch": 0.4647107057858843, + "grad_norm": 0.38670104685900775, + "learning_rate": 8.964432441699848e-06, + "loss": 0.2759, + "step": 5809 + }, + { + "epoch": 0.4647907041859163, + "grad_norm": 0.2982318404788443, + "learning_rate": 8.964037688782614e-06, + "loss": 0.3114, + "step": 5810 + }, + { + "epoch": 0.46487070258594826, + "grad_norm": 0.30879349143825763, + "learning_rate": 8.963642869335913e-06, + "loss": 0.308, + "step": 5811 + }, + { + "epoch": 0.4649507009859803, + "grad_norm": 0.28064384084310307, + "learning_rate": 8.963247983366372e-06, + "loss": 0.2526, + "step": 5812 + }, + { + "epoch": 0.46503069938601227, + "grad_norm": 0.6168050452553306, + "learning_rate": 8.96285303088062e-06, + "loss": 0.2807, + "step": 5813 + }, + { + "epoch": 0.4651106977860443, + "grad_norm": 0.3218248903171723, + "learning_rate": 8.962458011885286e-06, + "loss": 0.2566, + "step": 5814 + }, + { + "epoch": 0.4651906961860763, + "grad_norm": 0.2895488715182279, + "learning_rate": 8.962062926386998e-06, + "loss": 0.2478, + "step": 5815 + }, + { + "epoch": 0.46527069458610826, + "grad_norm": 0.31511710745757054, + "learning_rate": 8.961667774392387e-06, + "loss": 0.2832, + "step": 5816 + }, + { + "epoch": 0.4653506929861403, + "grad_norm": 0.3118931716744917, + "learning_rate": 8.961272555908084e-06, + "loss": 0.2677, + "step": 5817 + }, + { + "epoch": 0.46543069138617227, + "grad_norm": 0.31530004663403877, + "learning_rate": 8.960877270940727e-06, + "loss": 0.2948, + "step": 5818 + }, + { + "epoch": 0.4655106897862043, + "grad_norm": 0.3134026917592955, + "learning_rate": 8.960481919496944e-06, + "loss": 0.2987, + "step": 5819 + }, + { + "epoch": 0.4655906881862363, + "grad_norm": 0.31406935245783446, + "learning_rate": 8.960086501583376e-06, + "loss": 0.2583, + "step": 5820 + }, + { + "epoch": 0.46567068658626826, + "grad_norm": 0.29388817366868586, + "learning_rate": 8.959691017206653e-06, + "loss": 0.2515, + "step": 5821 + }, + { + "epoch": 0.4657506849863003, + "grad_norm": 0.3107970543538932, + "learning_rate": 8.959295466373417e-06, + "loss": 0.273, + "step": 5822 + }, + { + "epoch": 0.46583068338633227, + "grad_norm": 0.22435008673002646, + "learning_rate": 8.958899849090306e-06, + "loss": 0.2939, + "step": 5823 + }, + { + "epoch": 0.4659106817863643, + "grad_norm": 0.3241206416335767, + "learning_rate": 8.958504165363959e-06, + "loss": 0.2827, + "step": 5824 + }, + { + "epoch": 0.4659906801863963, + "grad_norm": 0.27711610635765205, + "learning_rate": 8.958108415201017e-06, + "loss": 0.2755, + "step": 5825 + }, + { + "epoch": 0.46607067858642826, + "grad_norm": 0.2873209588122253, + "learning_rate": 8.957712598608123e-06, + "loss": 0.2899, + "step": 5826 + }, + { + "epoch": 0.4661506769864603, + "grad_norm": 0.31933428095131366, + "learning_rate": 8.957316715591918e-06, + "loss": 0.281, + "step": 5827 + }, + { + "epoch": 0.46623067538649227, + "grad_norm": 0.7691796598485064, + "learning_rate": 8.956920766159048e-06, + "loss": 0.2498, + "step": 5828 + }, + { + "epoch": 0.46631067378652424, + "grad_norm": 0.2983793390487636, + "learning_rate": 8.956524750316158e-06, + "loss": 0.2464, + "step": 5829 + }, + { + "epoch": 0.4663906721865563, + "grad_norm": 0.34662663938500254, + "learning_rate": 8.956128668069894e-06, + "loss": 0.2535, + "step": 5830 + }, + { + "epoch": 0.46647067058658825, + "grad_norm": 0.28012057632041476, + "learning_rate": 8.955732519426902e-06, + "loss": 0.3079, + "step": 5831 + }, + { + "epoch": 0.4665506689866203, + "grad_norm": 0.30403948765802663, + "learning_rate": 8.955336304393833e-06, + "loss": 0.2668, + "step": 5832 + }, + { + "epoch": 0.46663066738665226, + "grad_norm": 0.31213347266694547, + "learning_rate": 8.954940022977338e-06, + "loss": 0.2613, + "step": 5833 + }, + { + "epoch": 0.46671066578668424, + "grad_norm": 0.32791326303815566, + "learning_rate": 8.954543675184065e-06, + "loss": 0.2546, + "step": 5834 + }, + { + "epoch": 0.4667906641867163, + "grad_norm": 0.2433565302470024, + "learning_rate": 8.954147261020667e-06, + "loss": 0.3305, + "step": 5835 + }, + { + "epoch": 0.46687066258674825, + "grad_norm": 0.34837732794149967, + "learning_rate": 8.953750780493797e-06, + "loss": 0.2709, + "step": 5836 + }, + { + "epoch": 0.4669506609867803, + "grad_norm": 0.3198878433794871, + "learning_rate": 8.95335423361011e-06, + "loss": 0.2568, + "step": 5837 + }, + { + "epoch": 0.46703065938681226, + "grad_norm": 0.3213000047800948, + "learning_rate": 8.952957620376261e-06, + "loss": 0.2903, + "step": 5838 + }, + { + "epoch": 0.46711065778684424, + "grad_norm": 0.2934820533427515, + "learning_rate": 8.952560940798905e-06, + "loss": 0.2605, + "step": 5839 + }, + { + "epoch": 0.4671906561868763, + "grad_norm": 0.2893104564107298, + "learning_rate": 8.9521641948847e-06, + "loss": 0.2491, + "step": 5840 + }, + { + "epoch": 0.46727065458690825, + "grad_norm": 0.292775622086539, + "learning_rate": 8.951767382640308e-06, + "loss": 0.2549, + "step": 5841 + }, + { + "epoch": 0.4673506529869403, + "grad_norm": 0.2922770887675927, + "learning_rate": 8.951370504072385e-06, + "loss": 0.2438, + "step": 5842 + }, + { + "epoch": 0.46743065138697226, + "grad_norm": 0.32067563627927154, + "learning_rate": 8.950973559187593e-06, + "loss": 0.2623, + "step": 5843 + }, + { + "epoch": 0.46751064978700424, + "grad_norm": 0.27563222348123945, + "learning_rate": 8.950576547992593e-06, + "loss": 0.2856, + "step": 5844 + }, + { + "epoch": 0.4675906481870363, + "grad_norm": 0.3026488372225054, + "learning_rate": 8.950179470494051e-06, + "loss": 0.2571, + "step": 5845 + }, + { + "epoch": 0.46767064658706825, + "grad_norm": 0.26563865369927925, + "learning_rate": 8.94978232669863e-06, + "loss": 0.2875, + "step": 5846 + }, + { + "epoch": 0.4677506449871003, + "grad_norm": 0.24855700776861267, + "learning_rate": 8.949385116612994e-06, + "loss": 0.3227, + "step": 5847 + }, + { + "epoch": 0.46783064338713226, + "grad_norm": 0.28460494082076854, + "learning_rate": 8.948987840243812e-06, + "loss": 0.3032, + "step": 5848 + }, + { + "epoch": 0.46791064178716424, + "grad_norm": 0.2878708965975192, + "learning_rate": 8.948590497597749e-06, + "loss": 0.2405, + "step": 5849 + }, + { + "epoch": 0.46799064018719627, + "grad_norm": 0.38550085029146125, + "learning_rate": 8.948193088681474e-06, + "loss": 0.2548, + "step": 5850 + }, + { + "epoch": 0.46807063858722825, + "grad_norm": 0.3016023784671988, + "learning_rate": 8.947795613501658e-06, + "loss": 0.3357, + "step": 5851 + }, + { + "epoch": 0.4681506369872603, + "grad_norm": 0.3121298955966143, + "learning_rate": 8.947398072064972e-06, + "loss": 0.2689, + "step": 5852 + }, + { + "epoch": 0.46823063538729226, + "grad_norm": 0.40709280607609527, + "learning_rate": 8.947000464378088e-06, + "loss": 0.2758, + "step": 5853 + }, + { + "epoch": 0.46831063378732424, + "grad_norm": 0.31176806236948573, + "learning_rate": 8.946602790447677e-06, + "loss": 0.2788, + "step": 5854 + }, + { + "epoch": 0.46839063218735627, + "grad_norm": 0.29491363686560407, + "learning_rate": 8.946205050280417e-06, + "loss": 0.2844, + "step": 5855 + }, + { + "epoch": 0.46847063058738825, + "grad_norm": 0.26843393871554133, + "learning_rate": 8.94580724388298e-06, + "loss": 0.2782, + "step": 5856 + }, + { + "epoch": 0.4685506289874202, + "grad_norm": 0.28596350262461795, + "learning_rate": 8.945409371262044e-06, + "loss": 0.2716, + "step": 5857 + }, + { + "epoch": 0.46863062738745226, + "grad_norm": 0.4192351977298747, + "learning_rate": 8.945011432424287e-06, + "loss": 0.2585, + "step": 5858 + }, + { + "epoch": 0.46871062578748424, + "grad_norm": 0.23287590744755174, + "learning_rate": 8.944613427376385e-06, + "loss": 0.3497, + "step": 5859 + }, + { + "epoch": 0.46879062418751627, + "grad_norm": 0.335048066872026, + "learning_rate": 8.944215356125022e-06, + "loss": 0.2648, + "step": 5860 + }, + { + "epoch": 0.46887062258754825, + "grad_norm": 0.2858325760188751, + "learning_rate": 8.943817218676877e-06, + "loss": 0.2953, + "step": 5861 + }, + { + "epoch": 0.4689506209875802, + "grad_norm": 1.2012702829785136, + "learning_rate": 8.943419015038631e-06, + "loss": 0.2517, + "step": 5862 + }, + { + "epoch": 0.46903061938761226, + "grad_norm": 0.44540565641616214, + "learning_rate": 8.943020745216968e-06, + "loss": 0.3118, + "step": 5863 + }, + { + "epoch": 0.46911061778764424, + "grad_norm": 0.26463789242067015, + "learning_rate": 8.942622409218573e-06, + "loss": 0.2689, + "step": 5864 + }, + { + "epoch": 0.46919061618767627, + "grad_norm": 0.2727117036155042, + "learning_rate": 8.942224007050131e-06, + "loss": 0.2858, + "step": 5865 + }, + { + "epoch": 0.46927061458770825, + "grad_norm": 0.23522948077420866, + "learning_rate": 8.941825538718328e-06, + "loss": 0.3198, + "step": 5866 + }, + { + "epoch": 0.4693506129877402, + "grad_norm": 0.25995979455885676, + "learning_rate": 8.941427004229851e-06, + "loss": 0.2831, + "step": 5867 + }, + { + "epoch": 0.46943061138777226, + "grad_norm": 0.36442325635105177, + "learning_rate": 8.94102840359139e-06, + "loss": 0.2932, + "step": 5868 + }, + { + "epoch": 0.46951060978780423, + "grad_norm": 0.3361864503048774, + "learning_rate": 8.940629736809635e-06, + "loss": 0.2656, + "step": 5869 + }, + { + "epoch": 0.46959060818783627, + "grad_norm": 0.24719226764272864, + "learning_rate": 8.940231003891275e-06, + "loss": 0.316, + "step": 5870 + }, + { + "epoch": 0.46967060658786824, + "grad_norm": 0.2791908379821809, + "learning_rate": 8.939832204843003e-06, + "loss": 0.2745, + "step": 5871 + }, + { + "epoch": 0.4697506049879002, + "grad_norm": 0.28890249708525007, + "learning_rate": 8.939433339671514e-06, + "loss": 0.2854, + "step": 5872 + }, + { + "epoch": 0.46983060338793226, + "grad_norm": 0.2013530635557675, + "learning_rate": 8.939034408383502e-06, + "loss": 0.3461, + "step": 5873 + }, + { + "epoch": 0.46991060178796423, + "grad_norm": 0.3059123301646322, + "learning_rate": 8.938635410985658e-06, + "loss": 0.2695, + "step": 5874 + }, + { + "epoch": 0.46999060018799627, + "grad_norm": 0.34695752833308763, + "learning_rate": 8.938236347484684e-06, + "loss": 0.2968, + "step": 5875 + }, + { + "epoch": 0.47007059858802824, + "grad_norm": 0.2918977147130378, + "learning_rate": 8.937837217887273e-06, + "loss": 0.286, + "step": 5876 + }, + { + "epoch": 0.4701505969880602, + "grad_norm": 0.3053461446902127, + "learning_rate": 8.937438022200126e-06, + "loss": 0.289, + "step": 5877 + }, + { + "epoch": 0.47023059538809225, + "grad_norm": 0.2591159932211754, + "learning_rate": 8.937038760429944e-06, + "loss": 0.3265, + "step": 5878 + }, + { + "epoch": 0.47031059378812423, + "grad_norm": 0.29559144009668364, + "learning_rate": 8.936639432583424e-06, + "loss": 0.2901, + "step": 5879 + }, + { + "epoch": 0.47039059218815626, + "grad_norm": 0.2772054703273193, + "learning_rate": 8.936240038667275e-06, + "loss": 0.272, + "step": 5880 + }, + { + "epoch": 0.47047059058818824, + "grad_norm": 0.33263450760740615, + "learning_rate": 8.935840578688191e-06, + "loss": 0.2536, + "step": 5881 + }, + { + "epoch": 0.4705505889882202, + "grad_norm": 0.27008538275523497, + "learning_rate": 8.935441052652884e-06, + "loss": 0.295, + "step": 5882 + }, + { + "epoch": 0.47063058738825225, + "grad_norm": 0.2827234167160174, + "learning_rate": 8.935041460568055e-06, + "loss": 0.2443, + "step": 5883 + }, + { + "epoch": 0.47071058578828423, + "grad_norm": 0.2763055106015007, + "learning_rate": 8.934641802440411e-06, + "loss": 0.283, + "step": 5884 + }, + { + "epoch": 0.4707905841883162, + "grad_norm": 0.33224486110303325, + "learning_rate": 8.934242078276662e-06, + "loss": 0.2587, + "step": 5885 + }, + { + "epoch": 0.47087058258834824, + "grad_norm": 0.35265997002484256, + "learning_rate": 8.933842288083514e-06, + "loss": 0.2514, + "step": 5886 + }, + { + "epoch": 0.4709505809883802, + "grad_norm": 0.24058093082503645, + "learning_rate": 8.933442431867678e-06, + "loss": 0.316, + "step": 5887 + }, + { + "epoch": 0.47103057938841225, + "grad_norm": 0.28129880717369393, + "learning_rate": 8.933042509635866e-06, + "loss": 0.2768, + "step": 5888 + }, + { + "epoch": 0.47111057778844423, + "grad_norm": 0.30184007697442156, + "learning_rate": 8.932642521394786e-06, + "loss": 0.2534, + "step": 5889 + }, + { + "epoch": 0.4711905761884762, + "grad_norm": 0.27743091175409795, + "learning_rate": 8.932242467151156e-06, + "loss": 0.2852, + "step": 5890 + }, + { + "epoch": 0.47127057458850824, + "grad_norm": 0.31035886113688516, + "learning_rate": 8.931842346911688e-06, + "loss": 0.2348, + "step": 5891 + }, + { + "epoch": 0.4713505729885402, + "grad_norm": 0.3330061584474159, + "learning_rate": 8.931442160683094e-06, + "loss": 0.2825, + "step": 5892 + }, + { + "epoch": 0.47143057138857225, + "grad_norm": 0.2801275634358817, + "learning_rate": 8.931041908472098e-06, + "loss": 0.2938, + "step": 5893 + }, + { + "epoch": 0.47151056978860423, + "grad_norm": 0.3143147904339116, + "learning_rate": 8.930641590285412e-06, + "loss": 0.2986, + "step": 5894 + }, + { + "epoch": 0.4715905681886362, + "grad_norm": 0.3368672331390658, + "learning_rate": 8.930241206129754e-06, + "loss": 0.2438, + "step": 5895 + }, + { + "epoch": 0.47167056658866824, + "grad_norm": 0.2630154179612777, + "learning_rate": 8.929840756011847e-06, + "loss": 0.2618, + "step": 5896 + }, + { + "epoch": 0.4717505649887002, + "grad_norm": 0.3245431725744292, + "learning_rate": 8.929440239938409e-06, + "loss": 0.2482, + "step": 5897 + }, + { + "epoch": 0.47183056338873225, + "grad_norm": 0.30803423862029006, + "learning_rate": 8.929039657916166e-06, + "loss": 0.2729, + "step": 5898 + }, + { + "epoch": 0.4719105617887642, + "grad_norm": 0.31570771303494316, + "learning_rate": 8.928639009951837e-06, + "loss": 0.2331, + "step": 5899 + }, + { + "epoch": 0.4719905601887962, + "grad_norm": 0.5190775275367271, + "learning_rate": 8.928238296052148e-06, + "loss": 0.2811, + "step": 5900 + }, + { + "epoch": 0.47207055858882824, + "grad_norm": 0.24493905939111194, + "learning_rate": 8.927837516223824e-06, + "loss": 0.3136, + "step": 5901 + }, + { + "epoch": 0.4721505569888602, + "grad_norm": 0.2658373338819349, + "learning_rate": 8.927436670473592e-06, + "loss": 0.2994, + "step": 5902 + }, + { + "epoch": 0.47223055538889225, + "grad_norm": 0.27232455458262067, + "learning_rate": 8.927035758808178e-06, + "loss": 0.3053, + "step": 5903 + }, + { + "epoch": 0.4723105537889242, + "grad_norm": 0.24495463669629086, + "learning_rate": 8.926634781234311e-06, + "loss": 0.317, + "step": 5904 + }, + { + "epoch": 0.4723905521889562, + "grad_norm": 0.2904562816811924, + "learning_rate": 8.926233737758722e-06, + "loss": 0.2953, + "step": 5905 + }, + { + "epoch": 0.47247055058898824, + "grad_norm": 0.24446139877730716, + "learning_rate": 8.925832628388142e-06, + "loss": 0.3115, + "step": 5906 + }, + { + "epoch": 0.4725505489890202, + "grad_norm": 0.28410498109980487, + "learning_rate": 8.9254314531293e-06, + "loss": 0.3022, + "step": 5907 + }, + { + "epoch": 0.4726305473890522, + "grad_norm": 0.3444122776357474, + "learning_rate": 8.925030211988932e-06, + "loss": 0.2529, + "step": 5908 + }, + { + "epoch": 0.4727105457890842, + "grad_norm": 0.21459535293641052, + "learning_rate": 8.924628904973771e-06, + "loss": 0.3369, + "step": 5909 + }, + { + "epoch": 0.4727905441891162, + "grad_norm": 0.27714872896057946, + "learning_rate": 8.924227532090553e-06, + "loss": 0.2909, + "step": 5910 + }, + { + "epoch": 0.47287054258914824, + "grad_norm": 0.31695133765143274, + "learning_rate": 8.923826093346013e-06, + "loss": 0.2435, + "step": 5911 + }, + { + "epoch": 0.4729505409891802, + "grad_norm": 0.3970991386733703, + "learning_rate": 8.92342458874689e-06, + "loss": 0.2586, + "step": 5912 + }, + { + "epoch": 0.4730305393892122, + "grad_norm": 0.2774232628524979, + "learning_rate": 8.92302301829992e-06, + "loss": 0.2889, + "step": 5913 + }, + { + "epoch": 0.4731105377892442, + "grad_norm": 0.2826792846487184, + "learning_rate": 8.922621382011845e-06, + "loss": 0.2876, + "step": 5914 + }, + { + "epoch": 0.4731905361892762, + "grad_norm": 0.3245687342727812, + "learning_rate": 8.922219679889406e-06, + "loss": 0.295, + "step": 5915 + }, + { + "epoch": 0.47327053458930823, + "grad_norm": 0.3084131998490697, + "learning_rate": 8.921817911939344e-06, + "loss": 0.2617, + "step": 5916 + }, + { + "epoch": 0.4733505329893402, + "grad_norm": 0.49203562354859276, + "learning_rate": 8.9214160781684e-06, + "loss": 0.292, + "step": 5917 + }, + { + "epoch": 0.4734305313893722, + "grad_norm": 0.2520513268813035, + "learning_rate": 8.92101417858332e-06, + "loss": 0.2835, + "step": 5918 + }, + { + "epoch": 0.4735105297894042, + "grad_norm": 0.27615554711002055, + "learning_rate": 8.92061221319085e-06, + "loss": 0.291, + "step": 5919 + }, + { + "epoch": 0.4735905281894362, + "grad_norm": 0.27559113956556025, + "learning_rate": 8.920210181997736e-06, + "loss": 0.2954, + "step": 5920 + }, + { + "epoch": 0.47367052658946823, + "grad_norm": 0.2540698389484329, + "learning_rate": 8.919808085010726e-06, + "loss": 0.289, + "step": 5921 + }, + { + "epoch": 0.4737505249895002, + "grad_norm": 0.27367972626124093, + "learning_rate": 8.919405922236568e-06, + "loss": 0.2905, + "step": 5922 + }, + { + "epoch": 0.4738305233895322, + "grad_norm": 0.26152788105970687, + "learning_rate": 8.919003693682008e-06, + "loss": 0.3007, + "step": 5923 + }, + { + "epoch": 0.4739105217895642, + "grad_norm": 0.286758959816618, + "learning_rate": 8.9186013993538e-06, + "loss": 0.2414, + "step": 5924 + }, + { + "epoch": 0.4739905201895962, + "grad_norm": 0.2773187069773488, + "learning_rate": 8.918199039258697e-06, + "loss": 0.2888, + "step": 5925 + }, + { + "epoch": 0.47407051858962823, + "grad_norm": 0.25236753926421013, + "learning_rate": 8.917796613403451e-06, + "loss": 0.2951, + "step": 5926 + }, + { + "epoch": 0.4741505169896602, + "grad_norm": 0.2852609713877524, + "learning_rate": 8.917394121794814e-06, + "loss": 0.3132, + "step": 5927 + }, + { + "epoch": 0.4742305153896922, + "grad_norm": 0.27743512886149024, + "learning_rate": 8.916991564439544e-06, + "loss": 0.2651, + "step": 5928 + }, + { + "epoch": 0.4743105137897242, + "grad_norm": 0.21887002940546096, + "learning_rate": 8.916588941344393e-06, + "loss": 0.322, + "step": 5929 + }, + { + "epoch": 0.4743905121897562, + "grad_norm": 0.31514393176654626, + "learning_rate": 8.916186252516123e-06, + "loss": 0.2362, + "step": 5930 + }, + { + "epoch": 0.47447051058978823, + "grad_norm": 0.2951223519193214, + "learning_rate": 8.915783497961492e-06, + "loss": 0.3161, + "step": 5931 + }, + { + "epoch": 0.4745505089898202, + "grad_norm": 0.2950177995826164, + "learning_rate": 8.915380677687256e-06, + "loss": 0.2534, + "step": 5932 + }, + { + "epoch": 0.4746305073898522, + "grad_norm": 0.2825358411757625, + "learning_rate": 8.914977791700178e-06, + "loss": 0.2738, + "step": 5933 + }, + { + "epoch": 0.4747105057898842, + "grad_norm": 0.3518925372883448, + "learning_rate": 8.91457484000702e-06, + "loss": 0.2438, + "step": 5934 + }, + { + "epoch": 0.4747905041899162, + "grad_norm": 0.27292179135300654, + "learning_rate": 8.914171822614543e-06, + "loss": 0.2745, + "step": 5935 + }, + { + "epoch": 0.4748705025899482, + "grad_norm": 0.3603713974375238, + "learning_rate": 8.913768739529513e-06, + "loss": 0.2614, + "step": 5936 + }, + { + "epoch": 0.4749505009899802, + "grad_norm": 0.2977064200850592, + "learning_rate": 8.913365590758695e-06, + "loss": 0.2636, + "step": 5937 + }, + { + "epoch": 0.4750304993900122, + "grad_norm": 0.24917559348916168, + "learning_rate": 8.912962376308854e-06, + "loss": 0.2985, + "step": 5938 + }, + { + "epoch": 0.4751104977900442, + "grad_norm": 0.27049291311525436, + "learning_rate": 8.912559096186759e-06, + "loss": 0.293, + "step": 5939 + }, + { + "epoch": 0.4751904961900762, + "grad_norm": 0.31101898024916386, + "learning_rate": 8.912155750399176e-06, + "loss": 0.2828, + "step": 5940 + }, + { + "epoch": 0.4752704945901082, + "grad_norm": 0.26660813734316846, + "learning_rate": 8.911752338952875e-06, + "loss": 0.3172, + "step": 5941 + }, + { + "epoch": 0.4753504929901402, + "grad_norm": 1.5421789426134587, + "learning_rate": 8.911348861854628e-06, + "loss": 0.2766, + "step": 5942 + }, + { + "epoch": 0.4754304913901722, + "grad_norm": 0.2984340593968737, + "learning_rate": 8.910945319111204e-06, + "loss": 0.2667, + "step": 5943 + }, + { + "epoch": 0.4755104897902042, + "grad_norm": 0.2668255260411109, + "learning_rate": 8.910541710729379e-06, + "loss": 0.2849, + "step": 5944 + }, + { + "epoch": 0.4755904881902362, + "grad_norm": 0.3020466084076634, + "learning_rate": 8.910138036715924e-06, + "loss": 0.2539, + "step": 5945 + }, + { + "epoch": 0.47567048659026817, + "grad_norm": 0.45672845821740365, + "learning_rate": 8.909734297077618e-06, + "loss": 0.2856, + "step": 5946 + }, + { + "epoch": 0.4757504849903002, + "grad_norm": 0.3176164393174906, + "learning_rate": 8.90933049182123e-06, + "loss": 0.2692, + "step": 5947 + }, + { + "epoch": 0.4758304833903322, + "grad_norm": 0.31465453433525264, + "learning_rate": 8.908926620953545e-06, + "loss": 0.2511, + "step": 5948 + }, + { + "epoch": 0.4759104817903642, + "grad_norm": 0.32015544314426303, + "learning_rate": 8.908522684481336e-06, + "loss": 0.2739, + "step": 5949 + }, + { + "epoch": 0.4759904801903962, + "grad_norm": 0.3237046384666863, + "learning_rate": 8.908118682411387e-06, + "loss": 0.2584, + "step": 5950 + }, + { + "epoch": 0.47607047859042817, + "grad_norm": 0.30825170026179494, + "learning_rate": 8.907714614750473e-06, + "loss": 0.2586, + "step": 5951 + }, + { + "epoch": 0.4761504769904602, + "grad_norm": 0.2730951174691001, + "learning_rate": 8.907310481505378e-06, + "loss": 0.3066, + "step": 5952 + }, + { + "epoch": 0.4762304753904922, + "grad_norm": 0.22981698141948778, + "learning_rate": 8.906906282682886e-06, + "loss": 0.3541, + "step": 5953 + }, + { + "epoch": 0.4763104737905242, + "grad_norm": 0.2843489355211349, + "learning_rate": 8.90650201828978e-06, + "loss": 0.2853, + "step": 5954 + }, + { + "epoch": 0.4763904721905562, + "grad_norm": 0.31260470933364426, + "learning_rate": 8.906097688332844e-06, + "loss": 0.2585, + "step": 5955 + }, + { + "epoch": 0.47647047059058817, + "grad_norm": 0.297678785048006, + "learning_rate": 8.905693292818864e-06, + "loss": 0.2436, + "step": 5956 + }, + { + "epoch": 0.4765504689906202, + "grad_norm": 0.31529151103253006, + "learning_rate": 8.905288831754628e-06, + "loss": 0.2887, + "step": 5957 + }, + { + "epoch": 0.4766304673906522, + "grad_norm": 0.3710037981386871, + "learning_rate": 8.904884305146924e-06, + "loss": 0.2511, + "step": 5958 + }, + { + "epoch": 0.4767104657906842, + "grad_norm": 0.28976739023098963, + "learning_rate": 8.904479713002542e-06, + "loss": 0.2766, + "step": 5959 + }, + { + "epoch": 0.4767904641907162, + "grad_norm": 0.30270950744911435, + "learning_rate": 8.90407505532827e-06, + "loss": 0.2434, + "step": 5960 + }, + { + "epoch": 0.47687046259074817, + "grad_norm": 0.2905647218959929, + "learning_rate": 8.9036703321309e-06, + "loss": 0.3111, + "step": 5961 + }, + { + "epoch": 0.4769504609907802, + "grad_norm": 0.3111951423804636, + "learning_rate": 8.903265543417227e-06, + "loss": 0.2919, + "step": 5962 + }, + { + "epoch": 0.4770304593908122, + "grad_norm": 0.2869641496652721, + "learning_rate": 8.902860689194044e-06, + "loss": 0.2798, + "step": 5963 + }, + { + "epoch": 0.47711045779084416, + "grad_norm": 0.24255111454943826, + "learning_rate": 8.902455769468143e-06, + "loss": 0.331, + "step": 5964 + }, + { + "epoch": 0.4771904561908762, + "grad_norm": 0.31540067310751363, + "learning_rate": 8.902050784246324e-06, + "loss": 0.3031, + "step": 5965 + }, + { + "epoch": 0.47727045459090817, + "grad_norm": 0.3052807396173634, + "learning_rate": 8.90164573353538e-06, + "loss": 0.3041, + "step": 5966 + }, + { + "epoch": 0.4773504529909402, + "grad_norm": 0.3849216381300884, + "learning_rate": 8.901240617342111e-06, + "loss": 0.279, + "step": 5967 + }, + { + "epoch": 0.4774304513909722, + "grad_norm": 0.28365711101001334, + "learning_rate": 8.900835435673316e-06, + "loss": 0.2768, + "step": 5968 + }, + { + "epoch": 0.47751044979100415, + "grad_norm": 0.3193785217918695, + "learning_rate": 8.900430188535796e-06, + "loss": 0.2719, + "step": 5969 + }, + { + "epoch": 0.4775904481910362, + "grad_norm": 0.2940398697004442, + "learning_rate": 8.900024875936351e-06, + "loss": 0.312, + "step": 5970 + }, + { + "epoch": 0.47767044659106817, + "grad_norm": 0.290426297398639, + "learning_rate": 8.899619497881784e-06, + "loss": 0.2937, + "step": 5971 + }, + { + "epoch": 0.4777504449911002, + "grad_norm": 0.2910230791645577, + "learning_rate": 8.899214054378898e-06, + "loss": 0.3221, + "step": 5972 + }, + { + "epoch": 0.4778304433911322, + "grad_norm": 0.26701746118922837, + "learning_rate": 8.8988085454345e-06, + "loss": 0.2906, + "step": 5973 + }, + { + "epoch": 0.47791044179116415, + "grad_norm": 0.3481157102399472, + "learning_rate": 8.898402971055393e-06, + "loss": 0.2618, + "step": 5974 + }, + { + "epoch": 0.4779904401911962, + "grad_norm": 0.21062696141137857, + "learning_rate": 8.897997331248384e-06, + "loss": 0.3406, + "step": 5975 + }, + { + "epoch": 0.47807043859122816, + "grad_norm": 0.3349618195279979, + "learning_rate": 8.897591626020284e-06, + "loss": 0.2792, + "step": 5976 + }, + { + "epoch": 0.4781504369912602, + "grad_norm": 0.2428784430014319, + "learning_rate": 8.8971858553779e-06, + "loss": 0.3071, + "step": 5977 + }, + { + "epoch": 0.4782304353912922, + "grad_norm": 0.4267800827640234, + "learning_rate": 8.89678001932804e-06, + "loss": 0.2471, + "step": 5978 + }, + { + "epoch": 0.47831043379132415, + "grad_norm": 0.3839809694214864, + "learning_rate": 8.896374117877519e-06, + "loss": 0.259, + "step": 5979 + }, + { + "epoch": 0.4783904321913562, + "grad_norm": 0.45701899799500406, + "learning_rate": 8.895968151033147e-06, + "loss": 0.282, + "step": 5980 + }, + { + "epoch": 0.47847043059138816, + "grad_norm": 0.30808801033910155, + "learning_rate": 8.895562118801739e-06, + "loss": 0.2558, + "step": 5981 + }, + { + "epoch": 0.4785504289914202, + "grad_norm": 0.2451417995689836, + "learning_rate": 8.895156021190109e-06, + "loss": 0.3095, + "step": 5982 + }, + { + "epoch": 0.4786304273914522, + "grad_norm": 0.26306478982211084, + "learning_rate": 8.89474985820507e-06, + "loss": 0.2873, + "step": 5983 + }, + { + "epoch": 0.47871042579148415, + "grad_norm": 0.43681134069748473, + "learning_rate": 8.894343629853442e-06, + "loss": 0.2921, + "step": 5984 + }, + { + "epoch": 0.4787904241915162, + "grad_norm": 0.30489621652928284, + "learning_rate": 8.893937336142043e-06, + "loss": 0.3209, + "step": 5985 + }, + { + "epoch": 0.47887042259154816, + "grad_norm": 0.28936053098899633, + "learning_rate": 8.89353097707769e-06, + "loss": 0.2656, + "step": 5986 + }, + { + "epoch": 0.4789504209915802, + "grad_norm": 0.3465262010692459, + "learning_rate": 8.893124552667203e-06, + "loss": 0.252, + "step": 5987 + }, + { + "epoch": 0.47903041939161217, + "grad_norm": 0.3040600554500932, + "learning_rate": 8.892718062917405e-06, + "loss": 0.2975, + "step": 5988 + }, + { + "epoch": 0.47911041779164415, + "grad_norm": 0.27135774535392615, + "learning_rate": 8.892311507835118e-06, + "loss": 0.2958, + "step": 5989 + }, + { + "epoch": 0.4791904161916762, + "grad_norm": 0.3502376409873895, + "learning_rate": 8.891904887427164e-06, + "loss": 0.2986, + "step": 5990 + }, + { + "epoch": 0.47927041459170816, + "grad_norm": 0.26545625360116043, + "learning_rate": 8.891498201700368e-06, + "loss": 0.3075, + "step": 5991 + }, + { + "epoch": 0.47935041299174014, + "grad_norm": 0.27679202445555856, + "learning_rate": 8.891091450661556e-06, + "loss": 0.2945, + "step": 5992 + }, + { + "epoch": 0.47943041139177217, + "grad_norm": 0.259828046196747, + "learning_rate": 8.890684634317552e-06, + "loss": 0.2808, + "step": 5993 + }, + { + "epoch": 0.47951040979180415, + "grad_norm": 0.29423025654605167, + "learning_rate": 8.890277752675187e-06, + "loss": 0.3091, + "step": 5994 + }, + { + "epoch": 0.4795904081918362, + "grad_norm": 0.2597333117078829, + "learning_rate": 8.889870805741288e-06, + "loss": 0.2838, + "step": 5995 + }, + { + "epoch": 0.47967040659186816, + "grad_norm": 0.26047642178345937, + "learning_rate": 8.889463793522687e-06, + "loss": 0.2662, + "step": 5996 + }, + { + "epoch": 0.47975040499190014, + "grad_norm": 0.223619804677584, + "learning_rate": 8.889056716026213e-06, + "loss": 0.3065, + "step": 5997 + }, + { + "epoch": 0.47983040339193217, + "grad_norm": 0.27823752785917666, + "learning_rate": 8.888649573258697e-06, + "loss": 0.2816, + "step": 5998 + }, + { + "epoch": 0.47991040179196415, + "grad_norm": 0.3208355594101808, + "learning_rate": 8.888242365226975e-06, + "loss": 0.2617, + "step": 5999 + }, + { + "epoch": 0.4799904001919962, + "grad_norm": 0.3559079743089482, + "learning_rate": 8.88783509193788e-06, + "loss": 0.309, + "step": 6000 + }, + { + "epoch": 0.48007039859202816, + "grad_norm": 0.26953091566283877, + "learning_rate": 8.887427753398249e-06, + "loss": 0.2739, + "step": 6001 + }, + { + "epoch": 0.48015039699206014, + "grad_norm": 0.27038877757890944, + "learning_rate": 8.887020349614914e-06, + "loss": 0.2899, + "step": 6002 + }, + { + "epoch": 0.48023039539209217, + "grad_norm": 0.37312978896567034, + "learning_rate": 8.886612880594715e-06, + "loss": 0.3019, + "step": 6003 + }, + { + "epoch": 0.48031039379212415, + "grad_norm": 0.2411088059741634, + "learning_rate": 8.886205346344495e-06, + "loss": 0.3417, + "step": 6004 + }, + { + "epoch": 0.4803903921921562, + "grad_norm": 0.2559689392391588, + "learning_rate": 8.885797746871085e-06, + "loss": 0.2756, + "step": 6005 + }, + { + "epoch": 0.48047039059218816, + "grad_norm": 0.39328241276357806, + "learning_rate": 8.885390082181333e-06, + "loss": 0.2873, + "step": 6006 + }, + { + "epoch": 0.48055038899222013, + "grad_norm": 0.2728652678708431, + "learning_rate": 8.884982352282078e-06, + "loss": 0.3265, + "step": 6007 + }, + { + "epoch": 0.48063038739225217, + "grad_norm": 0.272205818960488, + "learning_rate": 8.884574557180165e-06, + "loss": 0.2922, + "step": 6008 + }, + { + "epoch": 0.48071038579228415, + "grad_norm": 0.264583990166896, + "learning_rate": 8.884166696882436e-06, + "loss": 0.3014, + "step": 6009 + }, + { + "epoch": 0.4807903841923162, + "grad_norm": 0.26922172038974934, + "learning_rate": 8.883758771395739e-06, + "loss": 0.2748, + "step": 6010 + }, + { + "epoch": 0.48087038259234816, + "grad_norm": 0.27759006054417085, + "learning_rate": 8.883350780726915e-06, + "loss": 0.2926, + "step": 6011 + }, + { + "epoch": 0.48095038099238013, + "grad_norm": 0.26346029869252574, + "learning_rate": 8.882942724882816e-06, + "loss": 0.2856, + "step": 6012 + }, + { + "epoch": 0.48103037939241217, + "grad_norm": 0.23300387129809144, + "learning_rate": 8.88253460387029e-06, + "loss": 0.3233, + "step": 6013 + }, + { + "epoch": 0.48111037779244414, + "grad_norm": 0.2448451361469002, + "learning_rate": 8.882126417696185e-06, + "loss": 0.3187, + "step": 6014 + }, + { + "epoch": 0.4811903761924762, + "grad_norm": 0.3308517955591014, + "learning_rate": 8.881718166367353e-06, + "loss": 0.2451, + "step": 6015 + }, + { + "epoch": 0.48127037459250815, + "grad_norm": 0.2653559160003545, + "learning_rate": 8.881309849890645e-06, + "loss": 0.3221, + "step": 6016 + }, + { + "epoch": 0.48135037299254013, + "grad_norm": 0.3075622635619129, + "learning_rate": 8.880901468272913e-06, + "loss": 0.2648, + "step": 6017 + }, + { + "epoch": 0.48143037139257217, + "grad_norm": 0.3168801818528636, + "learning_rate": 8.880493021521013e-06, + "loss": 0.2649, + "step": 6018 + }, + { + "epoch": 0.48151036979260414, + "grad_norm": 0.2975093586817609, + "learning_rate": 8.8800845096418e-06, + "loss": 0.259, + "step": 6019 + }, + { + "epoch": 0.4815903681926361, + "grad_norm": 0.30287101021594365, + "learning_rate": 8.879675932642129e-06, + "loss": 0.2486, + "step": 6020 + }, + { + "epoch": 0.48167036659266815, + "grad_norm": 0.3238328331113838, + "learning_rate": 8.87926729052886e-06, + "loss": 0.2841, + "step": 6021 + }, + { + "epoch": 0.48175036499270013, + "grad_norm": 0.2101944924046625, + "learning_rate": 8.878858583308845e-06, + "loss": 0.3393, + "step": 6022 + }, + { + "epoch": 0.48183036339273216, + "grad_norm": 0.4306022384961016, + "learning_rate": 8.87844981098895e-06, + "loss": 0.2689, + "step": 6023 + }, + { + "epoch": 0.48191036179276414, + "grad_norm": 0.38028574127863807, + "learning_rate": 8.87804097357603e-06, + "loss": 0.2385, + "step": 6024 + }, + { + "epoch": 0.4819903601927961, + "grad_norm": 0.2871861482154307, + "learning_rate": 8.877632071076952e-06, + "loss": 0.2982, + "step": 6025 + }, + { + "epoch": 0.48207035859282815, + "grad_norm": 0.2757773633451801, + "learning_rate": 8.877223103498576e-06, + "loss": 0.2783, + "step": 6026 + }, + { + "epoch": 0.48215035699286013, + "grad_norm": 0.27341700723425694, + "learning_rate": 8.876814070847766e-06, + "loss": 0.2676, + "step": 6027 + }, + { + "epoch": 0.48223035539289216, + "grad_norm": 0.25073205520297837, + "learning_rate": 8.876404973131387e-06, + "loss": 0.3184, + "step": 6028 + }, + { + "epoch": 0.48231035379292414, + "grad_norm": 0.38097853285361427, + "learning_rate": 8.875995810356306e-06, + "loss": 0.3019, + "step": 6029 + }, + { + "epoch": 0.4823903521929561, + "grad_norm": 0.3250014171644075, + "learning_rate": 8.875586582529388e-06, + "loss": 0.2463, + "step": 6030 + }, + { + "epoch": 0.48247035059298815, + "grad_norm": 0.28581214791011883, + "learning_rate": 8.875177289657502e-06, + "loss": 0.289, + "step": 6031 + }, + { + "epoch": 0.48255034899302013, + "grad_norm": 0.26504143160053084, + "learning_rate": 8.87476793174752e-06, + "loss": 0.2986, + "step": 6032 + }, + { + "epoch": 0.48263034739305216, + "grad_norm": 0.2711934730699754, + "learning_rate": 8.874358508806306e-06, + "loss": 0.3062, + "step": 6033 + }, + { + "epoch": 0.48271034579308414, + "grad_norm": 0.23616451497331176, + "learning_rate": 8.873949020840738e-06, + "loss": 0.3114, + "step": 6034 + }, + { + "epoch": 0.4827903441931161, + "grad_norm": 0.2863916929837123, + "learning_rate": 8.873539467857683e-06, + "loss": 0.2434, + "step": 6035 + }, + { + "epoch": 0.48287034259314815, + "grad_norm": 0.28409951930720534, + "learning_rate": 8.87312984986402e-06, + "loss": 0.251, + "step": 6036 + }, + { + "epoch": 0.4829503409931801, + "grad_norm": 0.25662827129188315, + "learning_rate": 8.872720166866623e-06, + "loss": 0.3126, + "step": 6037 + }, + { + "epoch": 0.48303033939321216, + "grad_norm": 0.27234315195949665, + "learning_rate": 8.872310418872364e-06, + "loss": 0.2678, + "step": 6038 + }, + { + "epoch": 0.48311033779324414, + "grad_norm": 0.3796631107619307, + "learning_rate": 8.871900605888121e-06, + "loss": 0.2589, + "step": 6039 + }, + { + "epoch": 0.4831903361932761, + "grad_norm": 0.289288538321304, + "learning_rate": 8.871490727920773e-06, + "loss": 0.2734, + "step": 6040 + }, + { + "epoch": 0.48327033459330815, + "grad_norm": 0.2537414314242945, + "learning_rate": 8.8710807849772e-06, + "loss": 0.3174, + "step": 6041 + }, + { + "epoch": 0.4833503329933401, + "grad_norm": 0.2977227889456845, + "learning_rate": 8.870670777064281e-06, + "loss": 0.2915, + "step": 6042 + }, + { + "epoch": 0.48343033139337216, + "grad_norm": 0.5136933141458111, + "learning_rate": 8.870260704188897e-06, + "loss": 0.2704, + "step": 6043 + }, + { + "epoch": 0.48351032979340414, + "grad_norm": 0.2748376330613757, + "learning_rate": 8.86985056635793e-06, + "loss": 0.2768, + "step": 6044 + }, + { + "epoch": 0.4835903281934361, + "grad_norm": 0.25301100524974496, + "learning_rate": 8.869440363578267e-06, + "loss": 0.3159, + "step": 6045 + }, + { + "epoch": 0.48367032659346815, + "grad_norm": 0.19634041250045706, + "learning_rate": 8.86903009585679e-06, + "loss": 0.3384, + "step": 6046 + }, + { + "epoch": 0.4837503249935001, + "grad_norm": 0.3273255750032447, + "learning_rate": 8.868619763200384e-06, + "loss": 0.27, + "step": 6047 + }, + { + "epoch": 0.4838303233935321, + "grad_norm": 0.3094620768881021, + "learning_rate": 8.868209365615934e-06, + "loss": 0.3044, + "step": 6048 + }, + { + "epoch": 0.48391032179356414, + "grad_norm": 0.38877805330353715, + "learning_rate": 8.867798903110331e-06, + "loss": 0.2794, + "step": 6049 + }, + { + "epoch": 0.4839903201935961, + "grad_norm": 0.3458700294414693, + "learning_rate": 8.867388375690464e-06, + "loss": 0.2755, + "step": 6050 + }, + { + "epoch": 0.48407031859362815, + "grad_norm": 0.3160104872424893, + "learning_rate": 8.866977783363219e-06, + "loss": 0.2681, + "step": 6051 + }, + { + "epoch": 0.4841503169936601, + "grad_norm": 0.2729571125003863, + "learning_rate": 8.866567126135493e-06, + "loss": 0.2869, + "step": 6052 + }, + { + "epoch": 0.4842303153936921, + "grad_norm": 0.3650429043703798, + "learning_rate": 8.866156404014175e-06, + "loss": 0.2936, + "step": 6053 + }, + { + "epoch": 0.48431031379372413, + "grad_norm": 0.3366046848790977, + "learning_rate": 8.865745617006157e-06, + "loss": 0.2524, + "step": 6054 + }, + { + "epoch": 0.4843903121937561, + "grad_norm": 0.27294679689350243, + "learning_rate": 8.865334765118335e-06, + "loss": 0.2838, + "step": 6055 + }, + { + "epoch": 0.48447031059378814, + "grad_norm": 0.27147187919255483, + "learning_rate": 8.864923848357605e-06, + "loss": 0.2783, + "step": 6056 + }, + { + "epoch": 0.4845503089938201, + "grad_norm": 0.2594736040800616, + "learning_rate": 8.864512866730862e-06, + "loss": 0.2934, + "step": 6057 + }, + { + "epoch": 0.4846303073938521, + "grad_norm": 0.3036612213967791, + "learning_rate": 8.864101820245003e-06, + "loss": 0.2331, + "step": 6058 + }, + { + "epoch": 0.48471030579388413, + "grad_norm": 0.28294286151990794, + "learning_rate": 8.863690708906931e-06, + "loss": 0.2905, + "step": 6059 + }, + { + "epoch": 0.4847903041939161, + "grad_norm": 0.27179675468251785, + "learning_rate": 8.86327953272354e-06, + "loss": 0.309, + "step": 6060 + }, + { + "epoch": 0.48487030259394814, + "grad_norm": 0.2851163954720835, + "learning_rate": 8.862868291701735e-06, + "loss": 0.2762, + "step": 6061 + }, + { + "epoch": 0.4849503009939801, + "grad_norm": 0.5114124861768686, + "learning_rate": 8.862456985848417e-06, + "loss": 0.2733, + "step": 6062 + }, + { + "epoch": 0.4850302993940121, + "grad_norm": 0.28527109179894633, + "learning_rate": 8.862045615170487e-06, + "loss": 0.2988, + "step": 6063 + }, + { + "epoch": 0.48511029779404413, + "grad_norm": 0.24748707626915728, + "learning_rate": 8.861634179674851e-06, + "loss": 0.2772, + "step": 6064 + }, + { + "epoch": 0.4851902961940761, + "grad_norm": 0.3585957527880161, + "learning_rate": 8.861222679368416e-06, + "loss": 0.2698, + "step": 6065 + }, + { + "epoch": 0.48527029459410814, + "grad_norm": 0.3103252461130635, + "learning_rate": 8.860811114258085e-06, + "loss": 0.2758, + "step": 6066 + }, + { + "epoch": 0.4853502929941401, + "grad_norm": 0.2850462172928093, + "learning_rate": 8.860399484350768e-06, + "loss": 0.3357, + "step": 6067 + }, + { + "epoch": 0.4854302913941721, + "grad_norm": 0.29143916843725176, + "learning_rate": 8.859987789653371e-06, + "loss": 0.2742, + "step": 6068 + }, + { + "epoch": 0.48551028979420413, + "grad_norm": 0.28664496910778864, + "learning_rate": 8.859576030172804e-06, + "loss": 0.2628, + "step": 6069 + }, + { + "epoch": 0.4855902881942361, + "grad_norm": 0.25841190178424, + "learning_rate": 8.85916420591598e-06, + "loss": 0.2634, + "step": 6070 + }, + { + "epoch": 0.48567028659426814, + "grad_norm": 0.3813984994640962, + "learning_rate": 8.858752316889809e-06, + "loss": 0.2714, + "step": 6071 + }, + { + "epoch": 0.4857502849943001, + "grad_norm": 0.28858054546509043, + "learning_rate": 8.858340363101204e-06, + "loss": 0.2324, + "step": 6072 + }, + { + "epoch": 0.4858302833943321, + "grad_norm": 0.28364559729022565, + "learning_rate": 8.857928344557079e-06, + "loss": 0.2456, + "step": 6073 + }, + { + "epoch": 0.48591028179436413, + "grad_norm": 0.28076197813166387, + "learning_rate": 8.85751626126435e-06, + "loss": 0.2964, + "step": 6074 + }, + { + "epoch": 0.4859902801943961, + "grad_norm": 0.257441639544771, + "learning_rate": 8.857104113229929e-06, + "loss": 0.2809, + "step": 6075 + }, + { + "epoch": 0.4860702785944281, + "grad_norm": 0.3285007928553187, + "learning_rate": 8.85669190046074e-06, + "loss": 0.2575, + "step": 6076 + }, + { + "epoch": 0.4861502769944601, + "grad_norm": 0.35666355546693773, + "learning_rate": 8.856279622963694e-06, + "loss": 0.2555, + "step": 6077 + }, + { + "epoch": 0.4862302753944921, + "grad_norm": 0.3229680715740346, + "learning_rate": 8.855867280745717e-06, + "loss": 0.2483, + "step": 6078 + }, + { + "epoch": 0.48631027379452413, + "grad_norm": 0.3155712952956712, + "learning_rate": 8.855454873813724e-06, + "loss": 0.2471, + "step": 6079 + }, + { + "epoch": 0.4863902721945561, + "grad_norm": 0.2542018498374534, + "learning_rate": 8.85504240217464e-06, + "loss": 0.3252, + "step": 6080 + }, + { + "epoch": 0.4864702705945881, + "grad_norm": 0.5113692309886261, + "learning_rate": 8.854629865835387e-06, + "loss": 0.2661, + "step": 6081 + }, + { + "epoch": 0.4865502689946201, + "grad_norm": 0.29414563543630257, + "learning_rate": 8.85421726480289e-06, + "loss": 0.2445, + "step": 6082 + }, + { + "epoch": 0.4866302673946521, + "grad_norm": 0.26241372436923976, + "learning_rate": 8.853804599084068e-06, + "loss": 0.2848, + "step": 6083 + }, + { + "epoch": 0.4867102657946841, + "grad_norm": 0.3082505933386186, + "learning_rate": 8.853391868685853e-06, + "loss": 0.216, + "step": 6084 + }, + { + "epoch": 0.4867902641947161, + "grad_norm": 0.2892346121353118, + "learning_rate": 8.852979073615172e-06, + "loss": 0.246, + "step": 6085 + }, + { + "epoch": 0.4868702625947481, + "grad_norm": 0.28124505557993795, + "learning_rate": 8.852566213878947e-06, + "loss": 0.2743, + "step": 6086 + }, + { + "epoch": 0.4869502609947801, + "grad_norm": 0.2561583334831174, + "learning_rate": 8.852153289484114e-06, + "loss": 0.3234, + "step": 6087 + }, + { + "epoch": 0.4870302593948121, + "grad_norm": 0.2595761475915162, + "learning_rate": 8.851740300437597e-06, + "loss": 0.2898, + "step": 6088 + }, + { + "epoch": 0.4871102577948441, + "grad_norm": 0.26554621854777943, + "learning_rate": 8.851327246746334e-06, + "loss": 0.2927, + "step": 6089 + }, + { + "epoch": 0.4871902561948761, + "grad_norm": 0.289659492372637, + "learning_rate": 8.850914128417252e-06, + "loss": 0.2419, + "step": 6090 + }, + { + "epoch": 0.4872702545949081, + "grad_norm": 0.267276134388402, + "learning_rate": 8.850500945457286e-06, + "loss": 0.2782, + "step": 6091 + }, + { + "epoch": 0.4873502529949401, + "grad_norm": 0.3038839274424505, + "learning_rate": 8.850087697873372e-06, + "loss": 0.2672, + "step": 6092 + }, + { + "epoch": 0.4874302513949721, + "grad_norm": 0.9867457409084249, + "learning_rate": 8.849674385672444e-06, + "loss": 0.3049, + "step": 6093 + }, + { + "epoch": 0.4875102497950041, + "grad_norm": 0.27991568562509167, + "learning_rate": 8.84926100886144e-06, + "loss": 0.2862, + "step": 6094 + }, + { + "epoch": 0.4875902481950361, + "grad_norm": 0.31782705558349744, + "learning_rate": 8.848847567447298e-06, + "loss": 0.3062, + "step": 6095 + }, + { + "epoch": 0.4876702465950681, + "grad_norm": 0.2995384082613687, + "learning_rate": 8.848434061436954e-06, + "loss": 0.2935, + "step": 6096 + }, + { + "epoch": 0.4877502449951001, + "grad_norm": 0.3275342457376998, + "learning_rate": 8.848020490837352e-06, + "loss": 0.2531, + "step": 6097 + }, + { + "epoch": 0.4878302433951321, + "grad_norm": 0.2682795915746839, + "learning_rate": 8.847606855655429e-06, + "loss": 0.2975, + "step": 6098 + }, + { + "epoch": 0.4879102417951641, + "grad_norm": 0.29402823178675375, + "learning_rate": 8.84719315589813e-06, + "loss": 0.2725, + "step": 6099 + }, + { + "epoch": 0.4879902401951961, + "grad_norm": 0.2802527364503724, + "learning_rate": 8.846779391572399e-06, + "loss": 0.287, + "step": 6100 + }, + { + "epoch": 0.4880702385952281, + "grad_norm": 0.3367822979842637, + "learning_rate": 8.846365562685178e-06, + "loss": 0.2598, + "step": 6101 + }, + { + "epoch": 0.4881502369952601, + "grad_norm": 0.23450974261413876, + "learning_rate": 8.84595166924341e-06, + "loss": 0.3497, + "step": 6102 + }, + { + "epoch": 0.4882302353952921, + "grad_norm": 0.23553521079397546, + "learning_rate": 8.845537711254048e-06, + "loss": 0.3144, + "step": 6103 + }, + { + "epoch": 0.48831023379532407, + "grad_norm": 0.2940793778242793, + "learning_rate": 8.845123688724037e-06, + "loss": 0.2683, + "step": 6104 + }, + { + "epoch": 0.4883902321953561, + "grad_norm": 0.2759623421620397, + "learning_rate": 8.844709601660323e-06, + "loss": 0.3192, + "step": 6105 + }, + { + "epoch": 0.4884702305953881, + "grad_norm": 0.26951289079493057, + "learning_rate": 8.844295450069858e-06, + "loss": 0.2846, + "step": 6106 + }, + { + "epoch": 0.4885502289954201, + "grad_norm": 0.31785066916016436, + "learning_rate": 8.843881233959592e-06, + "loss": 0.2761, + "step": 6107 + }, + { + "epoch": 0.4886302273954521, + "grad_norm": 0.2605319093679355, + "learning_rate": 8.843466953336478e-06, + "loss": 0.2851, + "step": 6108 + }, + { + "epoch": 0.48871022579548407, + "grad_norm": 0.3107064313221954, + "learning_rate": 8.843052608207468e-06, + "loss": 0.2678, + "step": 6109 + }, + { + "epoch": 0.4887902241955161, + "grad_norm": 0.302151218080538, + "learning_rate": 8.842638198579517e-06, + "loss": 0.266, + "step": 6110 + }, + { + "epoch": 0.4888702225955481, + "grad_norm": 0.32769684719788494, + "learning_rate": 8.842223724459578e-06, + "loss": 0.2519, + "step": 6111 + }, + { + "epoch": 0.4889502209955801, + "grad_norm": 0.26206886432306425, + "learning_rate": 8.84180918585461e-06, + "loss": 0.2927, + "step": 6112 + }, + { + "epoch": 0.4890302193956121, + "grad_norm": 0.3258798357815694, + "learning_rate": 8.841394582771568e-06, + "loss": 0.2946, + "step": 6113 + }, + { + "epoch": 0.48911021779564406, + "grad_norm": 0.31797944972437214, + "learning_rate": 8.840979915217412e-06, + "loss": 0.2568, + "step": 6114 + }, + { + "epoch": 0.4891902161956761, + "grad_norm": 0.31214940591085477, + "learning_rate": 8.840565183199102e-06, + "loss": 0.2588, + "step": 6115 + }, + { + "epoch": 0.4892702145957081, + "grad_norm": 0.329987287238085, + "learning_rate": 8.840150386723596e-06, + "loss": 0.2907, + "step": 6116 + }, + { + "epoch": 0.4893502129957401, + "grad_norm": 0.3441850285226875, + "learning_rate": 8.839735525797857e-06, + "loss": 0.2487, + "step": 6117 + }, + { + "epoch": 0.4894302113957721, + "grad_norm": 0.25565075419674743, + "learning_rate": 8.839320600428847e-06, + "loss": 0.3169, + "step": 6118 + }, + { + "epoch": 0.48951020979580406, + "grad_norm": 0.43858916606906057, + "learning_rate": 8.838905610623532e-06, + "loss": 0.2572, + "step": 6119 + }, + { + "epoch": 0.4895902081958361, + "grad_norm": 0.2987897965406687, + "learning_rate": 8.838490556388875e-06, + "loss": 0.236, + "step": 6120 + }, + { + "epoch": 0.4896702065958681, + "grad_norm": 0.29688482580580317, + "learning_rate": 8.838075437731844e-06, + "loss": 0.2802, + "step": 6121 + }, + { + "epoch": 0.4897502049959001, + "grad_norm": 0.2591323042020038, + "learning_rate": 8.837660254659401e-06, + "loss": 0.3215, + "step": 6122 + }, + { + "epoch": 0.4898302033959321, + "grad_norm": 0.26407900962002884, + "learning_rate": 8.837245007178522e-06, + "loss": 0.2856, + "step": 6123 + }, + { + "epoch": 0.48991020179596406, + "grad_norm": 0.33745564817412593, + "learning_rate": 8.836829695296167e-06, + "loss": 0.2577, + "step": 6124 + }, + { + "epoch": 0.4899902001959961, + "grad_norm": 0.2703920533925351, + "learning_rate": 8.836414319019314e-06, + "loss": 0.2801, + "step": 6125 + }, + { + "epoch": 0.4900701985960281, + "grad_norm": 0.5976531652704125, + "learning_rate": 8.83599887835493e-06, + "loss": 0.3097, + "step": 6126 + }, + { + "epoch": 0.4901501969960601, + "grad_norm": 0.30068138951012646, + "learning_rate": 8.83558337330999e-06, + "loss": 0.3097, + "step": 6127 + }, + { + "epoch": 0.4902301953960921, + "grad_norm": 0.2504469450390486, + "learning_rate": 8.835167803891467e-06, + "loss": 0.2779, + "step": 6128 + }, + { + "epoch": 0.49031019379612406, + "grad_norm": 0.28252254631474455, + "learning_rate": 8.834752170106334e-06, + "loss": 0.2824, + "step": 6129 + }, + { + "epoch": 0.4903901921961561, + "grad_norm": 0.28600491679106227, + "learning_rate": 8.834336471961569e-06, + "loss": 0.2535, + "step": 6130 + }, + { + "epoch": 0.49047019059618807, + "grad_norm": 0.31840214702062336, + "learning_rate": 8.833920709464146e-06, + "loss": 0.2887, + "step": 6131 + }, + { + "epoch": 0.49055018899622005, + "grad_norm": 0.30268468891513156, + "learning_rate": 8.833504882621045e-06, + "loss": 0.2536, + "step": 6132 + }, + { + "epoch": 0.4906301873962521, + "grad_norm": 0.2987049884739924, + "learning_rate": 8.833088991439245e-06, + "loss": 0.2497, + "step": 6133 + }, + { + "epoch": 0.49071018579628406, + "grad_norm": 0.32457899218494535, + "learning_rate": 8.832673035925724e-06, + "loss": 0.2396, + "step": 6134 + }, + { + "epoch": 0.4907901841963161, + "grad_norm": 0.3283392672319787, + "learning_rate": 8.832257016087464e-06, + "loss": 0.2966, + "step": 6135 + }, + { + "epoch": 0.49087018259634807, + "grad_norm": 0.3307696933621787, + "learning_rate": 8.831840931931448e-06, + "loss": 0.228, + "step": 6136 + }, + { + "epoch": 0.49095018099638005, + "grad_norm": 0.2554593438951163, + "learning_rate": 8.83142478346466e-06, + "loss": 0.2781, + "step": 6137 + }, + { + "epoch": 0.4910301793964121, + "grad_norm": 0.2953200240660326, + "learning_rate": 8.831008570694082e-06, + "loss": 0.2918, + "step": 6138 + }, + { + "epoch": 0.49111017779644406, + "grad_norm": 0.2820461701366511, + "learning_rate": 8.830592293626702e-06, + "loss": 0.2942, + "step": 6139 + }, + { + "epoch": 0.4911901761964761, + "grad_norm": 0.3003647812027781, + "learning_rate": 8.830175952269502e-06, + "loss": 0.2867, + "step": 6140 + }, + { + "epoch": 0.49127017459650807, + "grad_norm": 0.25929235632416175, + "learning_rate": 8.829759546629474e-06, + "loss": 0.2681, + "step": 6141 + }, + { + "epoch": 0.49135017299654005, + "grad_norm": 0.2720004783664826, + "learning_rate": 8.829343076713607e-06, + "loss": 0.2906, + "step": 6142 + }, + { + "epoch": 0.4914301713965721, + "grad_norm": 0.2814995606411108, + "learning_rate": 8.828926542528888e-06, + "loss": 0.2862, + "step": 6143 + }, + { + "epoch": 0.49151016979660406, + "grad_norm": 0.2775399193462214, + "learning_rate": 8.828509944082308e-06, + "loss": 0.3152, + "step": 6144 + }, + { + "epoch": 0.4915901681966361, + "grad_norm": 0.23441422011686913, + "learning_rate": 8.828093281380859e-06, + "loss": 0.3271, + "step": 6145 + }, + { + "epoch": 0.49167016659666807, + "grad_norm": 0.31252199328148905, + "learning_rate": 8.827676554431534e-06, + "loss": 0.2888, + "step": 6146 + }, + { + "epoch": 0.49175016499670005, + "grad_norm": 0.3062310465492467, + "learning_rate": 8.82725976324133e-06, + "loss": 0.2515, + "step": 6147 + }, + { + "epoch": 0.4918301633967321, + "grad_norm": 0.4664191791414356, + "learning_rate": 8.82684290781724e-06, + "loss": 0.2818, + "step": 6148 + }, + { + "epoch": 0.49191016179676406, + "grad_norm": 0.32797269820090874, + "learning_rate": 8.826425988166259e-06, + "loss": 0.2457, + "step": 6149 + }, + { + "epoch": 0.4919901601967961, + "grad_norm": 0.2504772065394578, + "learning_rate": 8.826009004295383e-06, + "loss": 0.2818, + "step": 6150 + }, + { + "epoch": 0.49207015859682807, + "grad_norm": 0.32903107691927563, + "learning_rate": 8.825591956211614e-06, + "loss": 0.2387, + "step": 6151 + }, + { + "epoch": 0.49215015699686004, + "grad_norm": 0.2772987061434365, + "learning_rate": 8.825174843921951e-06, + "loss": 0.2852, + "step": 6152 + }, + { + "epoch": 0.4922301553968921, + "grad_norm": 0.32511580835134446, + "learning_rate": 8.824757667433392e-06, + "loss": 0.2568, + "step": 6153 + }, + { + "epoch": 0.49231015379692405, + "grad_norm": 0.2889389992525257, + "learning_rate": 8.824340426752941e-06, + "loss": 0.2513, + "step": 6154 + }, + { + "epoch": 0.4923901521969561, + "grad_norm": 0.2819002758092301, + "learning_rate": 8.8239231218876e-06, + "loss": 0.2796, + "step": 6155 + }, + { + "epoch": 0.49247015059698807, + "grad_norm": 0.24224042004922952, + "learning_rate": 8.823505752844372e-06, + "loss": 0.3267, + "step": 6156 + }, + { + "epoch": 0.49255014899702004, + "grad_norm": 0.2927185254652474, + "learning_rate": 8.823088319630262e-06, + "loss": 0.2893, + "step": 6157 + }, + { + "epoch": 0.4926301473970521, + "grad_norm": 0.24280504408562034, + "learning_rate": 8.822670822252277e-06, + "loss": 0.3061, + "step": 6158 + }, + { + "epoch": 0.49271014579708405, + "grad_norm": 0.2772632867948452, + "learning_rate": 8.822253260717422e-06, + "loss": 0.3053, + "step": 6159 + }, + { + "epoch": 0.49279014419711603, + "grad_norm": 0.2816771355687215, + "learning_rate": 8.821835635032708e-06, + "loss": 0.2744, + "step": 6160 + }, + { + "epoch": 0.49287014259714806, + "grad_norm": 0.31194003166338646, + "learning_rate": 8.82141794520514e-06, + "loss": 0.2583, + "step": 6161 + }, + { + "epoch": 0.49295014099718004, + "grad_norm": 0.29170121201844873, + "learning_rate": 8.82100019124173e-06, + "loss": 0.2882, + "step": 6162 + }, + { + "epoch": 0.4930301393972121, + "grad_norm": 0.2880961067970108, + "learning_rate": 8.820582373149491e-06, + "loss": 0.2923, + "step": 6163 + }, + { + "epoch": 0.49311013779724405, + "grad_norm": 0.24713540721807353, + "learning_rate": 8.820164490935435e-06, + "loss": 0.32, + "step": 6164 + }, + { + "epoch": 0.49319013619727603, + "grad_norm": 0.2917153683615355, + "learning_rate": 8.819746544606573e-06, + "loss": 0.2934, + "step": 6165 + }, + { + "epoch": 0.49327013459730806, + "grad_norm": 0.5481727615817722, + "learning_rate": 8.819328534169922e-06, + "loss": 0.2726, + "step": 6166 + }, + { + "epoch": 0.49335013299734004, + "grad_norm": 0.3126742227168862, + "learning_rate": 8.818910459632495e-06, + "loss": 0.255, + "step": 6167 + }, + { + "epoch": 0.4934301313973721, + "grad_norm": 0.39325891364849425, + "learning_rate": 8.818492321001311e-06, + "loss": 0.2564, + "step": 6168 + }, + { + "epoch": 0.49351012979740405, + "grad_norm": 0.29462075143685634, + "learning_rate": 8.818074118283389e-06, + "loss": 0.2293, + "step": 6169 + }, + { + "epoch": 0.49359012819743603, + "grad_norm": 0.28612136843199354, + "learning_rate": 8.817655851485744e-06, + "loss": 0.2674, + "step": 6170 + }, + { + "epoch": 0.49367012659746806, + "grad_norm": 0.23975356612050455, + "learning_rate": 8.817237520615398e-06, + "loss": 0.3414, + "step": 6171 + }, + { + "epoch": 0.49375012499750004, + "grad_norm": 0.18624517951886338, + "learning_rate": 8.81681912567937e-06, + "loss": 0.3656, + "step": 6172 + }, + { + "epoch": 0.4938301233975321, + "grad_norm": 0.27388646938430544, + "learning_rate": 8.816400666684685e-06, + "loss": 0.2659, + "step": 6173 + }, + { + "epoch": 0.49391012179756405, + "grad_norm": 0.26404587869789004, + "learning_rate": 8.815982143638366e-06, + "loss": 0.2886, + "step": 6174 + }, + { + "epoch": 0.493990120197596, + "grad_norm": 0.28463380500914875, + "learning_rate": 8.815563556547434e-06, + "loss": 0.2931, + "step": 6175 + }, + { + "epoch": 0.49407011859762806, + "grad_norm": 0.2746904308973596, + "learning_rate": 8.815144905418918e-06, + "loss": 0.2608, + "step": 6176 + }, + { + "epoch": 0.49415011699766004, + "grad_norm": 0.2947171386997634, + "learning_rate": 8.81472619025984e-06, + "loss": 0.3141, + "step": 6177 + }, + { + "epoch": 0.49423011539769207, + "grad_norm": 0.3081920645616456, + "learning_rate": 8.814307411077233e-06, + "loss": 0.2512, + "step": 6178 + }, + { + "epoch": 0.49431011379772405, + "grad_norm": 0.2531982549809297, + "learning_rate": 8.81388856787812e-06, + "loss": 0.279, + "step": 6179 + }, + { + "epoch": 0.494390112197756, + "grad_norm": 0.3617405574036201, + "learning_rate": 8.813469660669532e-06, + "loss": 0.2775, + "step": 6180 + }, + { + "epoch": 0.49447011059778806, + "grad_norm": 0.22876192521100797, + "learning_rate": 8.813050689458502e-06, + "loss": 0.3301, + "step": 6181 + }, + { + "epoch": 0.49455010899782004, + "grad_norm": 0.3380061021276065, + "learning_rate": 8.812631654252061e-06, + "loss": 0.2382, + "step": 6182 + }, + { + "epoch": 0.49463010739785207, + "grad_norm": 0.28000678359119424, + "learning_rate": 8.81221255505724e-06, + "loss": 0.247, + "step": 6183 + }, + { + "epoch": 0.49471010579788405, + "grad_norm": 0.30962445175473036, + "learning_rate": 8.811793391881074e-06, + "loss": 0.2628, + "step": 6184 + }, + { + "epoch": 0.494790104197916, + "grad_norm": 0.2527738952014621, + "learning_rate": 8.811374164730599e-06, + "loss": 0.2857, + "step": 6185 + }, + { + "epoch": 0.49487010259794806, + "grad_norm": 0.2812747743444406, + "learning_rate": 8.81095487361285e-06, + "loss": 0.2976, + "step": 6186 + }, + { + "epoch": 0.49495010099798004, + "grad_norm": 0.370419002579203, + "learning_rate": 8.810535518534862e-06, + "loss": 0.2736, + "step": 6187 + }, + { + "epoch": 0.495030099398012, + "grad_norm": 0.22383486255057167, + "learning_rate": 8.810116099503675e-06, + "loss": 0.345, + "step": 6188 + }, + { + "epoch": 0.49511009779804405, + "grad_norm": 0.31843054127070447, + "learning_rate": 8.80969661652633e-06, + "loss": 0.2492, + "step": 6189 + }, + { + "epoch": 0.495190096198076, + "grad_norm": 0.31398463982348573, + "learning_rate": 8.809277069609863e-06, + "loss": 0.3059, + "step": 6190 + }, + { + "epoch": 0.49527009459810806, + "grad_norm": 0.33107502038416686, + "learning_rate": 8.80885745876132e-06, + "loss": 0.2305, + "step": 6191 + }, + { + "epoch": 0.49535009299814003, + "grad_norm": 0.2965927361348403, + "learning_rate": 8.80843778398774e-06, + "loss": 0.3142, + "step": 6192 + }, + { + "epoch": 0.495430091398172, + "grad_norm": 0.29777299696063586, + "learning_rate": 8.80801804529617e-06, + "loss": 0.2884, + "step": 6193 + }, + { + "epoch": 0.49551008979820405, + "grad_norm": 0.3284321667257422, + "learning_rate": 8.80759824269365e-06, + "loss": 0.2507, + "step": 6194 + }, + { + "epoch": 0.495590088198236, + "grad_norm": 0.30694150498535294, + "learning_rate": 8.80717837618723e-06, + "loss": 0.246, + "step": 6195 + }, + { + "epoch": 0.49567008659826806, + "grad_norm": 0.30662303846561084, + "learning_rate": 8.806758445783954e-06, + "loss": 0.2573, + "step": 6196 + }, + { + "epoch": 0.49575008499830003, + "grad_norm": 0.22145930868483196, + "learning_rate": 8.80633845149087e-06, + "loss": 0.3458, + "step": 6197 + }, + { + "epoch": 0.495830083398332, + "grad_norm": 0.2844871237289375, + "learning_rate": 8.805918393315028e-06, + "loss": 0.2827, + "step": 6198 + }, + { + "epoch": 0.49591008179836404, + "grad_norm": 0.33653259836990795, + "learning_rate": 8.805498271263477e-06, + "loss": 0.2598, + "step": 6199 + }, + { + "epoch": 0.495990080198396, + "grad_norm": 0.2334144710958175, + "learning_rate": 8.80507808534327e-06, + "loss": 0.3023, + "step": 6200 + }, + { + "epoch": 0.49607007859842805, + "grad_norm": 0.28469530950879157, + "learning_rate": 8.804657835561456e-06, + "loss": 0.2834, + "step": 6201 + }, + { + "epoch": 0.49615007699846003, + "grad_norm": 0.24360761785309337, + "learning_rate": 8.804237521925089e-06, + "loss": 0.3314, + "step": 6202 + }, + { + "epoch": 0.496230075398492, + "grad_norm": 0.33624240112833514, + "learning_rate": 8.803817144441227e-06, + "loss": 0.2513, + "step": 6203 + }, + { + "epoch": 0.49631007379852404, + "grad_norm": 0.35956965641806815, + "learning_rate": 8.80339670311692e-06, + "loss": 0.2936, + "step": 6204 + }, + { + "epoch": 0.496390072198556, + "grad_norm": 0.32813913917164916, + "learning_rate": 8.802976197959228e-06, + "loss": 0.2802, + "step": 6205 + }, + { + "epoch": 0.49647007059858805, + "grad_norm": 0.3001599528905617, + "learning_rate": 8.802555628975204e-06, + "loss": 0.261, + "step": 6206 + }, + { + "epoch": 0.49655006899862003, + "grad_norm": 0.3103453053778995, + "learning_rate": 8.802134996171913e-06, + "loss": 0.2649, + "step": 6207 + }, + { + "epoch": 0.496630067398652, + "grad_norm": 0.29820463246610635, + "learning_rate": 8.80171429955641e-06, + "loss": 0.2421, + "step": 6208 + }, + { + "epoch": 0.49671006579868404, + "grad_norm": 0.3017429531885275, + "learning_rate": 8.801293539135755e-06, + "loss": 0.2738, + "step": 6209 + }, + { + "epoch": 0.496790064198716, + "grad_norm": 0.24891872653789646, + "learning_rate": 8.800872714917013e-06, + "loss": 0.2964, + "step": 6210 + }, + { + "epoch": 0.49687006259874805, + "grad_norm": 0.30615152686626124, + "learning_rate": 8.800451826907245e-06, + "loss": 0.2487, + "step": 6211 + }, + { + "epoch": 0.49695006099878003, + "grad_norm": 0.28482174296561136, + "learning_rate": 8.800030875113517e-06, + "loss": 0.2805, + "step": 6212 + }, + { + "epoch": 0.497030059398812, + "grad_norm": 0.304139376832062, + "learning_rate": 8.79960985954289e-06, + "loss": 0.258, + "step": 6213 + }, + { + "epoch": 0.49711005779884404, + "grad_norm": 0.2347774628157303, + "learning_rate": 8.799188780202435e-06, + "loss": 0.3135, + "step": 6214 + }, + { + "epoch": 0.497190056198876, + "grad_norm": 0.27149790363672666, + "learning_rate": 8.798767637099212e-06, + "loss": 0.2923, + "step": 6215 + }, + { + "epoch": 0.497270054598908, + "grad_norm": 0.27737415917511216, + "learning_rate": 8.798346430240297e-06, + "loss": 0.3247, + "step": 6216 + }, + { + "epoch": 0.49735005299894003, + "grad_norm": 0.2981947915674584, + "learning_rate": 8.797925159632753e-06, + "loss": 0.2488, + "step": 6217 + }, + { + "epoch": 0.497430051398972, + "grad_norm": 0.3518703439828616, + "learning_rate": 8.797503825283655e-06, + "loss": 0.2611, + "step": 6218 + }, + { + "epoch": 0.49751004979900404, + "grad_norm": 0.41758276842174935, + "learning_rate": 8.79708242720007e-06, + "loss": 0.2536, + "step": 6219 + }, + { + "epoch": 0.497590048199036, + "grad_norm": 0.2766698561242508, + "learning_rate": 8.796660965389075e-06, + "loss": 0.2673, + "step": 6220 + }, + { + "epoch": 0.497670046599068, + "grad_norm": 0.26236207042969006, + "learning_rate": 8.79623943985774e-06, + "loss": 0.2844, + "step": 6221 + }, + { + "epoch": 0.4977500449991, + "grad_norm": 0.2422867368158345, + "learning_rate": 8.795817850613142e-06, + "loss": 0.3148, + "step": 6222 + }, + { + "epoch": 0.497830043399132, + "grad_norm": 0.2919719376708548, + "learning_rate": 8.795396197662355e-06, + "loss": 0.2849, + "step": 6223 + }, + { + "epoch": 0.49791004179916404, + "grad_norm": 0.34671038065839765, + "learning_rate": 8.794974481012455e-06, + "loss": 0.2272, + "step": 6224 + }, + { + "epoch": 0.497990040199196, + "grad_norm": 0.2714354108365635, + "learning_rate": 8.794552700670522e-06, + "loss": 0.2755, + "step": 6225 + }, + { + "epoch": 0.498070038599228, + "grad_norm": 0.3023993856322621, + "learning_rate": 8.794130856643635e-06, + "loss": 0.2634, + "step": 6226 + }, + { + "epoch": 0.49815003699926, + "grad_norm": 0.3121652909443062, + "learning_rate": 8.79370894893887e-06, + "loss": 0.2732, + "step": 6227 + }, + { + "epoch": 0.498230035399292, + "grad_norm": 0.3110639672821751, + "learning_rate": 8.79328697756331e-06, + "loss": 0.2489, + "step": 6228 + }, + { + "epoch": 0.49831003379932404, + "grad_norm": 0.3300889620086029, + "learning_rate": 8.792864942524042e-06, + "loss": 0.2534, + "step": 6229 + }, + { + "epoch": 0.498390032199356, + "grad_norm": 0.2511934582809575, + "learning_rate": 8.792442843828141e-06, + "loss": 0.3058, + "step": 6230 + }, + { + "epoch": 0.498470030599388, + "grad_norm": 0.24390011678622378, + "learning_rate": 8.792020681482698e-06, + "loss": 0.2723, + "step": 6231 + }, + { + "epoch": 0.49855002899942, + "grad_norm": 0.2936919726913615, + "learning_rate": 8.791598455494793e-06, + "loss": 0.2684, + "step": 6232 + }, + { + "epoch": 0.498630027399452, + "grad_norm": 0.2876063518337499, + "learning_rate": 8.791176165871515e-06, + "loss": 0.2516, + "step": 6233 + }, + { + "epoch": 0.49871002579948404, + "grad_norm": 0.2996283959206275, + "learning_rate": 8.790753812619952e-06, + "loss": 0.2887, + "step": 6234 + }, + { + "epoch": 0.498790024199516, + "grad_norm": 0.22899200505736134, + "learning_rate": 8.79033139574719e-06, + "loss": 0.3093, + "step": 6235 + }, + { + "epoch": 0.498870022599548, + "grad_norm": 0.25789596587768754, + "learning_rate": 8.789908915260322e-06, + "loss": 0.3186, + "step": 6236 + }, + { + "epoch": 0.49895002099958, + "grad_norm": 0.2937590842027953, + "learning_rate": 8.789486371166435e-06, + "loss": 0.2696, + "step": 6237 + }, + { + "epoch": 0.499030019399612, + "grad_norm": 0.32960698858358717, + "learning_rate": 8.789063763472624e-06, + "loss": 0.2764, + "step": 6238 + }, + { + "epoch": 0.49911001779964403, + "grad_norm": 0.6339535374463624, + "learning_rate": 8.788641092185978e-06, + "loss": 0.2355, + "step": 6239 + }, + { + "epoch": 0.499190016199676, + "grad_norm": 0.3113731057890879, + "learning_rate": 8.788218357313594e-06, + "loss": 0.2546, + "step": 6240 + }, + { + "epoch": 0.499270014599708, + "grad_norm": 0.21442528912868053, + "learning_rate": 8.787795558862566e-06, + "loss": 0.3426, + "step": 6241 + }, + { + "epoch": 0.49935001299974, + "grad_norm": 0.2642253887847804, + "learning_rate": 8.787372696839989e-06, + "loss": 0.283, + "step": 6242 + }, + { + "epoch": 0.499430011399772, + "grad_norm": 0.20123927417956905, + "learning_rate": 8.786949771252961e-06, + "loss": 0.3342, + "step": 6243 + }, + { + "epoch": 0.499510009799804, + "grad_norm": 0.31353754515587184, + "learning_rate": 8.786526782108579e-06, + "loss": 0.2464, + "step": 6244 + }, + { + "epoch": 0.499590008199836, + "grad_norm": 0.2751998754609181, + "learning_rate": 8.786103729413944e-06, + "loss": 0.2832, + "step": 6245 + }, + { + "epoch": 0.499670006599868, + "grad_norm": 0.3033487943481697, + "learning_rate": 8.785680613176153e-06, + "loss": 0.2926, + "step": 6246 + }, + { + "epoch": 0.4997500049999, + "grad_norm": 0.2975187575131753, + "learning_rate": 8.785257433402311e-06, + "loss": 0.2524, + "step": 6247 + }, + { + "epoch": 0.499830003399932, + "grad_norm": 0.3018289434187931, + "learning_rate": 8.784834190099519e-06, + "loss": 0.2439, + "step": 6248 + }, + { + "epoch": 0.499910001799964, + "grad_norm": 0.31375128156228366, + "learning_rate": 8.784410883274879e-06, + "loss": 0.2632, + "step": 6249 + }, + { + "epoch": 0.499990000199996, + "grad_norm": 0.2611977037643278, + "learning_rate": 8.783987512935498e-06, + "loss": 0.2814, + "step": 6250 + }, + { + "epoch": 0.500069998600028, + "grad_norm": 0.304541362898982, + "learning_rate": 8.783564079088478e-06, + "loss": 0.2527, + "step": 6251 + }, + { + "epoch": 0.50014999700006, + "grad_norm": 0.3302825008821275, + "learning_rate": 8.783140581740927e-06, + "loss": 0.2937, + "step": 6252 + }, + { + "epoch": 0.500229995400092, + "grad_norm": 0.27699535533304726, + "learning_rate": 8.782717020899957e-06, + "loss": 0.2735, + "step": 6253 + }, + { + "epoch": 0.500309993800124, + "grad_norm": 0.2689907287314529, + "learning_rate": 8.782293396572669e-06, + "loss": 0.293, + "step": 6254 + }, + { + "epoch": 0.500389992200156, + "grad_norm": 0.3012722891711402, + "learning_rate": 8.781869708766179e-06, + "loss": 0.2908, + "step": 6255 + }, + { + "epoch": 0.500469990600188, + "grad_norm": 0.2967077943637965, + "learning_rate": 8.781445957487595e-06, + "loss": 0.2406, + "step": 6256 + }, + { + "epoch": 0.50054998900022, + "grad_norm": 0.2980459589836454, + "learning_rate": 8.781022142744028e-06, + "loss": 0.2368, + "step": 6257 + }, + { + "epoch": 0.500629987400252, + "grad_norm": 0.33408051679032774, + "learning_rate": 8.780598264542597e-06, + "loss": 0.2591, + "step": 6258 + }, + { + "epoch": 0.500709985800284, + "grad_norm": 0.27978288410319896, + "learning_rate": 8.78017432289041e-06, + "loss": 0.2547, + "step": 6259 + }, + { + "epoch": 0.500789984200316, + "grad_norm": 0.3137039696457966, + "learning_rate": 8.779750317794582e-06, + "loss": 0.2499, + "step": 6260 + }, + { + "epoch": 0.500869982600348, + "grad_norm": 0.284273439729684, + "learning_rate": 8.779326249262232e-06, + "loss": 0.3136, + "step": 6261 + }, + { + "epoch": 0.50094998100038, + "grad_norm": 0.33446665193229613, + "learning_rate": 8.778902117300475e-06, + "loss": 0.2715, + "step": 6262 + }, + { + "epoch": 0.5010299794004119, + "grad_norm": 0.2570259779119952, + "learning_rate": 8.778477921916431e-06, + "loss": 0.276, + "step": 6263 + }, + { + "epoch": 0.501109977800444, + "grad_norm": 0.3860663646413159, + "learning_rate": 8.77805366311722e-06, + "loss": 0.2771, + "step": 6264 + }, + { + "epoch": 0.501189976200476, + "grad_norm": 0.2734503582315964, + "learning_rate": 8.777629340909963e-06, + "loss": 0.2966, + "step": 6265 + }, + { + "epoch": 0.501269974600508, + "grad_norm": 0.2812462874939072, + "learning_rate": 8.777204955301777e-06, + "loss": 0.2844, + "step": 6266 + }, + { + "epoch": 0.50134997300054, + "grad_norm": 0.2653240088021343, + "learning_rate": 8.77678050629979e-06, + "loss": 0.285, + "step": 6267 + }, + { + "epoch": 0.501429971400572, + "grad_norm": 0.23312541407220877, + "learning_rate": 8.77635599391112e-06, + "loss": 0.3174, + "step": 6268 + }, + { + "epoch": 0.501509969800604, + "grad_norm": 0.21975551238120355, + "learning_rate": 8.775931418142895e-06, + "loss": 0.3032, + "step": 6269 + }, + { + "epoch": 0.501589968200636, + "grad_norm": 0.31042804252077155, + "learning_rate": 8.775506779002243e-06, + "loss": 0.2507, + "step": 6270 + }, + { + "epoch": 0.501669966600668, + "grad_norm": 0.30009975331281935, + "learning_rate": 8.775082076496287e-06, + "loss": 0.2812, + "step": 6271 + }, + { + "epoch": 0.5017499650007, + "grad_norm": 0.29293481140760097, + "learning_rate": 8.774657310632157e-06, + "loss": 0.2389, + "step": 6272 + }, + { + "epoch": 0.5018299634007319, + "grad_norm": 0.32609770531104576, + "learning_rate": 8.77423248141698e-06, + "loss": 0.2545, + "step": 6273 + }, + { + "epoch": 0.501909961800764, + "grad_norm": 0.2802559829283081, + "learning_rate": 8.773807588857887e-06, + "loss": 0.2893, + "step": 6274 + }, + { + "epoch": 0.501989960200796, + "grad_norm": 0.2720664207118118, + "learning_rate": 8.773382632962011e-06, + "loss": 0.2862, + "step": 6275 + }, + { + "epoch": 0.502069958600828, + "grad_norm": 0.2883361909694832, + "learning_rate": 8.772957613736483e-06, + "loss": 0.2898, + "step": 6276 + }, + { + "epoch": 0.50214995700086, + "grad_norm": 0.26123621604761577, + "learning_rate": 8.772532531188434e-06, + "loss": 0.2749, + "step": 6277 + }, + { + "epoch": 0.502229955400892, + "grad_norm": 0.2279925274090334, + "learning_rate": 8.772107385325e-06, + "loss": 0.3048, + "step": 6278 + }, + { + "epoch": 0.502309953800924, + "grad_norm": 0.24105549729433387, + "learning_rate": 8.771682176153317e-06, + "loss": 0.2968, + "step": 6279 + }, + { + "epoch": 0.502389952200956, + "grad_norm": 0.2788393350324983, + "learning_rate": 8.77125690368052e-06, + "loss": 0.3018, + "step": 6280 + }, + { + "epoch": 0.502469950600988, + "grad_norm": 0.2298527450957488, + "learning_rate": 8.770831567913747e-06, + "loss": 0.3308, + "step": 6281 + }, + { + "epoch": 0.50254994900102, + "grad_norm": 0.31295840330347935, + "learning_rate": 8.770406168860138e-06, + "loss": 0.2763, + "step": 6282 + }, + { + "epoch": 0.5026299474010519, + "grad_norm": 0.23316640379813178, + "learning_rate": 8.76998070652683e-06, + "loss": 0.3102, + "step": 6283 + }, + { + "epoch": 0.502709945801084, + "grad_norm": 0.3189578063543914, + "learning_rate": 8.769555180920966e-06, + "loss": 0.2611, + "step": 6284 + }, + { + "epoch": 0.502789944201116, + "grad_norm": 0.24612766296188715, + "learning_rate": 8.769129592049685e-06, + "loss": 0.2964, + "step": 6285 + }, + { + "epoch": 0.5028699426011479, + "grad_norm": 0.42071400468838627, + "learning_rate": 8.768703939920133e-06, + "loss": 0.2561, + "step": 6286 + }, + { + "epoch": 0.50294994100118, + "grad_norm": 0.2713208010665168, + "learning_rate": 8.768278224539451e-06, + "loss": 0.2741, + "step": 6287 + }, + { + "epoch": 0.503029939401212, + "grad_norm": 0.5094972792558844, + "learning_rate": 8.767852445914784e-06, + "loss": 0.2686, + "step": 6288 + }, + { + "epoch": 0.503109937801244, + "grad_norm": 0.406249848697086, + "learning_rate": 8.767426604053282e-06, + "loss": 0.2609, + "step": 6289 + }, + { + "epoch": 0.503189936201276, + "grad_norm": 0.23187734044257552, + "learning_rate": 8.767000698962087e-06, + "loss": 0.2976, + "step": 6290 + }, + { + "epoch": 0.503269934601308, + "grad_norm": 0.351227817640469, + "learning_rate": 8.76657473064835e-06, + "loss": 0.251, + "step": 6291 + }, + { + "epoch": 0.50334993300134, + "grad_norm": 0.3049630067062447, + "learning_rate": 8.76614869911922e-06, + "loss": 0.2538, + "step": 6292 + }, + { + "epoch": 0.5034299314013719, + "grad_norm": 0.24265407229996766, + "learning_rate": 8.765722604381843e-06, + "loss": 0.3577, + "step": 6293 + }, + { + "epoch": 0.503509929801404, + "grad_norm": 0.21995827556366285, + "learning_rate": 8.765296446443377e-06, + "loss": 0.3164, + "step": 6294 + }, + { + "epoch": 0.503589928201436, + "grad_norm": 0.25699180894119267, + "learning_rate": 8.76487022531097e-06, + "loss": 0.3128, + "step": 6295 + }, + { + "epoch": 0.5036699266014679, + "grad_norm": 0.3344515679998034, + "learning_rate": 8.764443940991776e-06, + "loss": 0.2475, + "step": 6296 + }, + { + "epoch": 0.5037499250015, + "grad_norm": 0.32749041126611467, + "learning_rate": 8.764017593492951e-06, + "loss": 0.2675, + "step": 6297 + }, + { + "epoch": 0.503829923401532, + "grad_norm": 0.33041714098430913, + "learning_rate": 8.763591182821647e-06, + "loss": 0.2598, + "step": 6298 + }, + { + "epoch": 0.503909921801564, + "grad_norm": 0.298064282351983, + "learning_rate": 8.763164708985026e-06, + "loss": 0.3051, + "step": 6299 + }, + { + "epoch": 0.5039899202015959, + "grad_norm": 0.2570912606503043, + "learning_rate": 8.762738171990242e-06, + "loss": 0.275, + "step": 6300 + }, + { + "epoch": 0.504069918601628, + "grad_norm": 0.26716381265564565, + "learning_rate": 8.762311571844453e-06, + "loss": 0.2871, + "step": 6301 + }, + { + "epoch": 0.50414991700166, + "grad_norm": 0.2424059267181268, + "learning_rate": 8.761884908554818e-06, + "loss": 0.3185, + "step": 6302 + }, + { + "epoch": 0.5042299154016919, + "grad_norm": 0.3425031202563655, + "learning_rate": 8.761458182128503e-06, + "loss": 0.2905, + "step": 6303 + }, + { + "epoch": 0.504309913801724, + "grad_norm": 0.27336414093640604, + "learning_rate": 8.761031392572665e-06, + "loss": 0.2728, + "step": 6304 + }, + { + "epoch": 0.504389912201756, + "grad_norm": 0.30014573247936854, + "learning_rate": 8.76060453989447e-06, + "loss": 0.2803, + "step": 6305 + }, + { + "epoch": 0.5044699106017879, + "grad_norm": 0.282378096069049, + "learning_rate": 8.760177624101079e-06, + "loss": 0.3105, + "step": 6306 + }, + { + "epoch": 0.50454990900182, + "grad_norm": 0.3163778280576005, + "learning_rate": 8.75975064519966e-06, + "loss": 0.2923, + "step": 6307 + }, + { + "epoch": 0.504629907401852, + "grad_norm": 0.2892165726596172, + "learning_rate": 8.759323603197377e-06, + "loss": 0.2559, + "step": 6308 + }, + { + "epoch": 0.5047099058018839, + "grad_norm": 0.28984176988978866, + "learning_rate": 8.758896498101397e-06, + "loss": 0.2785, + "step": 6309 + }, + { + "epoch": 0.5047899042019159, + "grad_norm": 0.27680320592234975, + "learning_rate": 8.75846932991889e-06, + "loss": 0.2852, + "step": 6310 + }, + { + "epoch": 0.504869902601948, + "grad_norm": 0.31099022653542613, + "learning_rate": 8.758042098657022e-06, + "loss": 0.2772, + "step": 6311 + }, + { + "epoch": 0.50494990100198, + "grad_norm": 0.31234868951044725, + "learning_rate": 8.757614804322968e-06, + "loss": 0.2601, + "step": 6312 + }, + { + "epoch": 0.5050298994020119, + "grad_norm": 0.25319215452677285, + "learning_rate": 8.757187446923896e-06, + "loss": 0.3305, + "step": 6313 + }, + { + "epoch": 0.505109897802044, + "grad_norm": 0.37316326721559234, + "learning_rate": 8.75676002646698e-06, + "loss": 0.2472, + "step": 6314 + }, + { + "epoch": 0.505189896202076, + "grad_norm": 0.30582011068902065, + "learning_rate": 8.756332542959394e-06, + "loss": 0.261, + "step": 6315 + }, + { + "epoch": 0.5052698946021079, + "grad_norm": 0.2212791814643995, + "learning_rate": 8.75590499640831e-06, + "loss": 0.3172, + "step": 6316 + }, + { + "epoch": 0.50534989300214, + "grad_norm": 0.19928828324980044, + "learning_rate": 8.755477386820906e-06, + "loss": 0.3476, + "step": 6317 + }, + { + "epoch": 0.505429891402172, + "grad_norm": 0.29311239948504214, + "learning_rate": 8.755049714204357e-06, + "loss": 0.252, + "step": 6318 + }, + { + "epoch": 0.5055098898022039, + "grad_norm": 0.260109042448725, + "learning_rate": 8.75462197856584e-06, + "loss": 0.2653, + "step": 6319 + }, + { + "epoch": 0.5055898882022359, + "grad_norm": 0.37857891369433805, + "learning_rate": 8.75419417991254e-06, + "loss": 0.3007, + "step": 6320 + }, + { + "epoch": 0.505669886602268, + "grad_norm": 0.39538881799127834, + "learning_rate": 8.753766318251628e-06, + "loss": 0.2507, + "step": 6321 + }, + { + "epoch": 0.5057498850023, + "grad_norm": 0.2642534638613745, + "learning_rate": 8.753338393590292e-06, + "loss": 0.2949, + "step": 6322 + }, + { + "epoch": 0.5058298834023319, + "grad_norm": 0.3286064258181822, + "learning_rate": 8.752910405935708e-06, + "loss": 0.2302, + "step": 6323 + }, + { + "epoch": 0.505909881802364, + "grad_norm": 0.2624475579172414, + "learning_rate": 8.752482355295065e-06, + "loss": 0.2767, + "step": 6324 + }, + { + "epoch": 0.505989880202396, + "grad_norm": 0.2718872355779903, + "learning_rate": 8.752054241675543e-06, + "loss": 0.2483, + "step": 6325 + }, + { + "epoch": 0.5060698786024279, + "grad_norm": 0.26746631566265905, + "learning_rate": 8.751626065084328e-06, + "loss": 0.3011, + "step": 6326 + }, + { + "epoch": 0.50614987700246, + "grad_norm": 0.26966741493678964, + "learning_rate": 8.751197825528607e-06, + "loss": 0.2728, + "step": 6327 + }, + { + "epoch": 0.506229875402492, + "grad_norm": 0.2607201478904174, + "learning_rate": 8.750769523015568e-06, + "loss": 0.2611, + "step": 6328 + }, + { + "epoch": 0.5063098738025239, + "grad_norm": 0.45693938198765477, + "learning_rate": 8.750341157552396e-06, + "loss": 0.2551, + "step": 6329 + }, + { + "epoch": 0.5063898722025559, + "grad_norm": 0.233540134825675, + "learning_rate": 8.749912729146283e-06, + "loss": 0.3078, + "step": 6330 + }, + { + "epoch": 0.506469870602588, + "grad_norm": 0.2672135626925953, + "learning_rate": 8.74948423780442e-06, + "loss": 0.2744, + "step": 6331 + }, + { + "epoch": 0.50654986900262, + "grad_norm": 0.2869379882783276, + "learning_rate": 8.749055683533995e-06, + "loss": 0.2685, + "step": 6332 + }, + { + "epoch": 0.5066298674026519, + "grad_norm": 0.3018553641769092, + "learning_rate": 8.748627066342206e-06, + "loss": 0.2963, + "step": 6333 + }, + { + "epoch": 0.506709865802684, + "grad_norm": 0.2617675858803325, + "learning_rate": 8.748198386236241e-06, + "loss": 0.2967, + "step": 6334 + }, + { + "epoch": 0.506789864202716, + "grad_norm": 0.3360853030310783, + "learning_rate": 8.7477696432233e-06, + "loss": 0.2933, + "step": 6335 + }, + { + "epoch": 0.5068698626027479, + "grad_norm": 0.26300366264943653, + "learning_rate": 8.747340837310574e-06, + "loss": 0.2813, + "step": 6336 + }, + { + "epoch": 0.50694986100278, + "grad_norm": 0.2778915232778241, + "learning_rate": 8.746911968505262e-06, + "loss": 0.2954, + "step": 6337 + }, + { + "epoch": 0.507029859402812, + "grad_norm": 0.27309891370722833, + "learning_rate": 8.746483036814561e-06, + "loss": 0.3116, + "step": 6338 + }, + { + "epoch": 0.5071098578028439, + "grad_norm": 0.35358892697470456, + "learning_rate": 8.74605404224567e-06, + "loss": 0.2679, + "step": 6339 + }, + { + "epoch": 0.5071898562028759, + "grad_norm": 0.26546425661701656, + "learning_rate": 8.74562498480579e-06, + "loss": 0.318, + "step": 6340 + }, + { + "epoch": 0.507269854602908, + "grad_norm": 0.2680335957799276, + "learning_rate": 8.745195864502121e-06, + "loss": 0.279, + "step": 6341 + }, + { + "epoch": 0.5073498530029399, + "grad_norm": 0.30886579836433153, + "learning_rate": 8.744766681341866e-06, + "loss": 0.2508, + "step": 6342 + }, + { + "epoch": 0.5074298514029719, + "grad_norm": 0.2830385418970947, + "learning_rate": 8.744337435332226e-06, + "loss": 0.278, + "step": 6343 + }, + { + "epoch": 0.507509849803004, + "grad_norm": 0.30528688834790846, + "learning_rate": 8.743908126480408e-06, + "loss": 0.2475, + "step": 6344 + }, + { + "epoch": 0.507589848203036, + "grad_norm": 0.27153869652124374, + "learning_rate": 8.743478754793616e-06, + "loss": 0.2947, + "step": 6345 + }, + { + "epoch": 0.5076698466030679, + "grad_norm": 0.27529965487455693, + "learning_rate": 8.743049320279053e-06, + "loss": 0.2808, + "step": 6346 + }, + { + "epoch": 0.5077498450030999, + "grad_norm": 0.4460240629763492, + "learning_rate": 8.742619822943932e-06, + "loss": 0.2693, + "step": 6347 + }, + { + "epoch": 0.507829843403132, + "grad_norm": 0.2585966620013788, + "learning_rate": 8.742190262795458e-06, + "loss": 0.2744, + "step": 6348 + }, + { + "epoch": 0.5079098418031639, + "grad_norm": 0.3109977970819965, + "learning_rate": 8.74176063984084e-06, + "loss": 0.2556, + "step": 6349 + }, + { + "epoch": 0.5079898402031959, + "grad_norm": 0.3108898260252627, + "learning_rate": 8.741330954087291e-06, + "loss": 0.2574, + "step": 6350 + }, + { + "epoch": 0.508069838603228, + "grad_norm": 0.30929210561246107, + "learning_rate": 8.74090120554202e-06, + "loss": 0.2566, + "step": 6351 + }, + { + "epoch": 0.5081498370032599, + "grad_norm": 0.29013371787319897, + "learning_rate": 8.740471394212242e-06, + "loss": 0.2832, + "step": 6352 + }, + { + "epoch": 0.5082298354032919, + "grad_norm": 0.2816329317577217, + "learning_rate": 8.740041520105168e-06, + "loss": 0.305, + "step": 6353 + }, + { + "epoch": 0.508309833803324, + "grad_norm": 0.28996547189623834, + "learning_rate": 8.739611583228014e-06, + "loss": 0.256, + "step": 6354 + }, + { + "epoch": 0.508389832203356, + "grad_norm": 0.2788915549803398, + "learning_rate": 8.739181583587997e-06, + "loss": 0.2918, + "step": 6355 + }, + { + "epoch": 0.5084698306033879, + "grad_norm": 0.3199198931418646, + "learning_rate": 8.73875152119233e-06, + "loss": 0.2558, + "step": 6356 + }, + { + "epoch": 0.5085498290034199, + "grad_norm": 0.4434178824740245, + "learning_rate": 8.738321396048235e-06, + "loss": 0.2594, + "step": 6357 + }, + { + "epoch": 0.508629827403452, + "grad_norm": 0.26352315719387487, + "learning_rate": 8.73789120816293e-06, + "loss": 0.3191, + "step": 6358 + }, + { + "epoch": 0.5087098258034839, + "grad_norm": 0.32987147710074316, + "learning_rate": 8.737460957543633e-06, + "loss": 0.2824, + "step": 6359 + }, + { + "epoch": 0.5087898242035159, + "grad_norm": 0.31741415715751325, + "learning_rate": 8.737030644197566e-06, + "loss": 0.271, + "step": 6360 + }, + { + "epoch": 0.508869822603548, + "grad_norm": 0.28540390718501785, + "learning_rate": 8.736600268131953e-06, + "loss": 0.2542, + "step": 6361 + }, + { + "epoch": 0.5089498210035799, + "grad_norm": 0.30048979440893203, + "learning_rate": 8.736169829354012e-06, + "loss": 0.2555, + "step": 6362 + }, + { + "epoch": 0.5090298194036119, + "grad_norm": 0.2957560502764145, + "learning_rate": 8.735739327870974e-06, + "loss": 0.2497, + "step": 6363 + }, + { + "epoch": 0.509109817803644, + "grad_norm": 0.2756206478946818, + "learning_rate": 8.73530876369006e-06, + "loss": 0.2749, + "step": 6364 + }, + { + "epoch": 0.5091898162036759, + "grad_norm": 0.3023930067587958, + "learning_rate": 8.734878136818496e-06, + "loss": 0.2797, + "step": 6365 + }, + { + "epoch": 0.5092698146037079, + "grad_norm": 0.29283047878326296, + "learning_rate": 8.73444744726351e-06, + "loss": 0.2476, + "step": 6366 + }, + { + "epoch": 0.5093498130037399, + "grad_norm": 0.25960329251610215, + "learning_rate": 8.734016695032333e-06, + "loss": 0.2734, + "step": 6367 + }, + { + "epoch": 0.509429811403772, + "grad_norm": 0.26594726513257555, + "learning_rate": 8.733585880132189e-06, + "loss": 0.2742, + "step": 6368 + }, + { + "epoch": 0.5095098098038039, + "grad_norm": 0.40830614133430737, + "learning_rate": 8.733155002570315e-06, + "loss": 0.2667, + "step": 6369 + }, + { + "epoch": 0.5095898082038359, + "grad_norm": 0.27062026907067, + "learning_rate": 8.732724062353937e-06, + "loss": 0.2755, + "step": 6370 + }, + { + "epoch": 0.509669806603868, + "grad_norm": 0.23148898842123292, + "learning_rate": 8.73229305949029e-06, + "loss": 0.3359, + "step": 6371 + }, + { + "epoch": 0.5097498050038999, + "grad_norm": 0.26829143346797524, + "learning_rate": 8.731861993986608e-06, + "loss": 0.2733, + "step": 6372 + }, + { + "epoch": 0.5098298034039319, + "grad_norm": 0.27321636666393245, + "learning_rate": 8.731430865850124e-06, + "loss": 0.2871, + "step": 6373 + }, + { + "epoch": 0.509909801803964, + "grad_norm": 0.2992400669628326, + "learning_rate": 8.730999675088075e-06, + "loss": 0.2471, + "step": 6374 + }, + { + "epoch": 0.5099898002039959, + "grad_norm": 0.3173399260580309, + "learning_rate": 8.730568421707699e-06, + "loss": 0.2584, + "step": 6375 + }, + { + "epoch": 0.5100697986040279, + "grad_norm": 0.39942238959930576, + "learning_rate": 8.730137105716231e-06, + "loss": 0.2671, + "step": 6376 + }, + { + "epoch": 0.5101497970040599, + "grad_norm": 0.33097521252812084, + "learning_rate": 8.729705727120911e-06, + "loss": 0.2788, + "step": 6377 + }, + { + "epoch": 0.510229795404092, + "grad_norm": 0.27291127013991473, + "learning_rate": 8.72927428592898e-06, + "loss": 0.2749, + "step": 6378 + }, + { + "epoch": 0.5103097938041239, + "grad_norm": 0.30109509811447827, + "learning_rate": 8.728842782147679e-06, + "loss": 0.2302, + "step": 6379 + }, + { + "epoch": 0.5103897922041559, + "grad_norm": 0.2916838161999928, + "learning_rate": 8.728411215784246e-06, + "loss": 0.239, + "step": 6380 + }, + { + "epoch": 0.510469790604188, + "grad_norm": 0.32067224862764493, + "learning_rate": 8.727979586845931e-06, + "loss": 0.2578, + "step": 6381 + }, + { + "epoch": 0.5105497890042199, + "grad_norm": 0.3206381047068147, + "learning_rate": 8.727547895339974e-06, + "loss": 0.2626, + "step": 6382 + }, + { + "epoch": 0.5106297874042519, + "grad_norm": 0.3082124764253943, + "learning_rate": 8.727116141273619e-06, + "loss": 0.2369, + "step": 6383 + }, + { + "epoch": 0.5107097858042839, + "grad_norm": 0.2741492227582812, + "learning_rate": 8.726684324654115e-06, + "loss": 0.3057, + "step": 6384 + }, + { + "epoch": 0.5107897842043159, + "grad_norm": 0.339283108525902, + "learning_rate": 8.726252445488708e-06, + "loss": 0.246, + "step": 6385 + }, + { + "epoch": 0.5108697826043479, + "grad_norm": 0.8038592105658828, + "learning_rate": 8.725820503784648e-06, + "loss": 0.2774, + "step": 6386 + }, + { + "epoch": 0.5109497810043799, + "grad_norm": 0.28954994439665704, + "learning_rate": 8.725388499549182e-06, + "loss": 0.2768, + "step": 6387 + }, + { + "epoch": 0.511029779404412, + "grad_norm": 0.20660916849362523, + "learning_rate": 8.72495643278956e-06, + "loss": 0.3587, + "step": 6388 + }, + { + "epoch": 0.5111097778044439, + "grad_norm": 0.23459056698151448, + "learning_rate": 8.724524303513035e-06, + "loss": 0.3121, + "step": 6389 + }, + { + "epoch": 0.5111897762044759, + "grad_norm": 0.29070581253921846, + "learning_rate": 8.724092111726861e-06, + "loss": 0.284, + "step": 6390 + }, + { + "epoch": 0.511269774604508, + "grad_norm": 0.32627902159843886, + "learning_rate": 8.72365985743829e-06, + "loss": 0.26, + "step": 6391 + }, + { + "epoch": 0.5113497730045399, + "grad_norm": 0.3105413123160655, + "learning_rate": 8.723227540654574e-06, + "loss": 0.2855, + "step": 6392 + }, + { + "epoch": 0.5114297714045719, + "grad_norm": 0.2606233427496336, + "learning_rate": 8.722795161382974e-06, + "loss": 0.2891, + "step": 6393 + }, + { + "epoch": 0.5115097698046039, + "grad_norm": 0.3167862793327284, + "learning_rate": 8.722362719630741e-06, + "loss": 0.2495, + "step": 6394 + }, + { + "epoch": 0.5115897682046359, + "grad_norm": 0.26133039747930814, + "learning_rate": 8.72193021540514e-06, + "loss": 0.2818, + "step": 6395 + }, + { + "epoch": 0.5116697666046679, + "grad_norm": 0.24349370532312845, + "learning_rate": 8.721497648713423e-06, + "loss": 0.2863, + "step": 6396 + }, + { + "epoch": 0.5117497650046999, + "grad_norm": 0.2927263678640821, + "learning_rate": 8.721065019562854e-06, + "loss": 0.263, + "step": 6397 + }, + { + "epoch": 0.5118297634047319, + "grad_norm": 0.3175143480093446, + "learning_rate": 8.72063232796069e-06, + "loss": 0.2461, + "step": 6398 + }, + { + "epoch": 0.5119097618047639, + "grad_norm": 0.326270044178002, + "learning_rate": 8.720199573914196e-06, + "loss": 0.2845, + "step": 6399 + }, + { + "epoch": 0.5119897602047959, + "grad_norm": 0.29604109702657116, + "learning_rate": 8.719766757430637e-06, + "loss": 0.2636, + "step": 6400 + } + ], + "logging_steps": 1.0, + "max_steps": 25000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1977941640701542e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}