{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995577178239717,
  "eval_steps": 142,
  "global_step": 565,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 4.710759162902832,
      "learning_rate": 0.0005,
      "loss": 3.3182,
      "step": 1
    },
    {
      "epoch": 0.0,
      "eval_loss": 3.3362529277801514,
      "eval_runtime": 14.462,
      "eval_samples_per_second": 32.983,
      "eval_steps_per_second": 8.298,
      "step": 1
    },
    {
      "epoch": 0.0,
      "grad_norm": 4.644880771636963,
      "learning_rate": 0.001,
      "loss": 3.2788,
      "step": 2
    },
    {
      "epoch": 0.01,
      "grad_norm": 2.652876138687134,
      "learning_rate": 0.0015,
      "loss": 0.9781,
      "step": 3
    },
    {
      "epoch": 0.01,
      "grad_norm": 12.365164756774902,
      "learning_rate": 0.002,
      "loss": 0.65,
      "step": 4
    },
    {
      "epoch": 0.01,
      "grad_norm": 18.633052825927734,
      "learning_rate": 0.0025,
      "loss": 2.3289,
      "step": 5
    },
    {
      "epoch": 0.01,
      "grad_norm": 12.415583610534668,
      "learning_rate": 0.003,
      "loss": 0.7836,
      "step": 6
    },
    {
      "epoch": 0.01,
      "grad_norm": 10.400487899780273,
      "learning_rate": 0.0034999999999999996,
      "loss": 0.8244,
      "step": 7
    },
    {
      "epoch": 0.01,
      "grad_norm": 2.2445099353790283,
      "learning_rate": 0.004,
      "loss": 0.4377,
      "step": 8
    },
    {
      "epoch": 0.02,
      "grad_norm": 23.533748626708984,
      "learning_rate": 0.0045000000000000005,
      "loss": 1.1932,
      "step": 9
    },
    {
      "epoch": 0.02,
      "grad_norm": 83.60514831542969,
      "learning_rate": 0.005,
      "loss": 18.4965,
      "step": 10
    },
    {
      "epoch": 0.02,
      "grad_norm": 51.204532623291016,
      "learning_rate": 0.0049999956547994865,
      "loss": 4.8481,
      "step": 11
    },
    {
      "epoch": 0.02,
      "grad_norm": 100.4805908203125,
      "learning_rate": 0.0049999826192130515,
      "loss": 21.3498,
      "step": 12
    },
    {
      "epoch": 0.02,
      "grad_norm": 105.87885284423828,
      "learning_rate": 0.004999960893286007,
      "loss": 12.4439,
      "step": 13
    },
    {
      "epoch": 0.02,
      "grad_norm": 83.33695983886719,
      "learning_rate": 0.004999930477093878,
      "loss": 14.4507,
      "step": 14
    },
    {
      "epoch": 0.03,
      "grad_norm": 103.42640686035156,
      "learning_rate": 0.0049998913707423945,
      "loss": 15.3804,
      "step": 15
    },
    {
      "epoch": 0.03,
      "grad_norm": 36.2640380859375,
      "learning_rate": 0.0049998435743674975,
      "loss": 17.3659,
      "step": 16
    },
    {
      "epoch": 0.03,
      "grad_norm": 41.00593948364258,
      "learning_rate": 0.0049997870881353336,
      "loss": 15.5879,
      "step": 17
    },
    {
      "epoch": 0.03,
      "grad_norm": 26.99127197265625,
      "learning_rate": 0.0049997219122422595,
      "loss": 32.2726,
      "step": 18
    },
    {
      "epoch": 0.03,
      "grad_norm": 23.069412231445312,
      "learning_rate": 0.0049996480469148355,
      "loss": 35.3862,
      "step": 19
    },
    {
      "epoch": 0.04,
      "grad_norm": 16.51882553100586,
      "learning_rate": 0.004999565492409831,
      "loss": 15.8877,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 22.481494903564453,
      "learning_rate": 0.0049994742490142175,
      "loss": 27.0009,
      "step": 21
    },
    {
      "epoch": 0.04,
      "grad_norm": 25.911300659179688,
      "learning_rate": 0.004999374317045171,
      "loss": 31.2604,
      "step": 22
    },
    {
      "epoch": 0.04,
      "grad_norm": 19.360708236694336,
      "learning_rate": 0.0049992656968500734,
      "loss": 15.2702,
      "step": 23
    },
    {
      "epoch": 0.04,
      "grad_norm": 24.852096557617188,
      "learning_rate": 0.0049991483888065045,
      "loss": 19.2565,
      "step": 24
    },
    {
      "epoch": 0.04,
      "grad_norm": 21.394197463989258,
      "learning_rate": 0.004999022393322246,
      "loss": 11.2259,
      "step": 25
    },
    {
      "epoch": 0.05,
      "grad_norm": 20.673797607421875,
      "learning_rate": 0.004998887710835278,
      "loss": 12.7072,
      "step": 26
    },
    {
      "epoch": 0.05,
      "grad_norm": 22.879501342773438,
      "learning_rate": 0.004998744341813779,
      "loss": 12.8781,
      "step": 27
    },
    {
      "epoch": 0.05,
      "grad_norm": 17.807809829711914,
      "learning_rate": 0.004998592286756122,
      "loss": 10.9183,
      "step": 28
    },
    {
      "epoch": 0.05,
      "grad_norm": 18.086101531982422,
      "learning_rate": 0.004998431546190876,
      "loss": 15.1292,
      "step": 29
    },
    {
      "epoch": 0.05,
      "grad_norm": 14.039950370788574,
      "learning_rate": 0.0049982621206768,
      "loss": 13.7787,
      "step": 30
    },
    {
      "epoch": 0.05,
      "grad_norm": 20.522136688232422,
      "learning_rate": 0.004998084010802845,
      "loss": 15.6458,
      "step": 31
    },
    {
      "epoch": 0.06,
      "grad_norm": 19.700590133666992,
      "learning_rate": 0.004997897217188148,
      "loss": 12.6489,
      "step": 32
    },
    {
      "epoch": 0.06,
      "grad_norm": 9.318965911865234,
      "learning_rate": 0.004997701740482036,
      "loss": 7.3008,
      "step": 33
    },
    {
      "epoch": 0.06,
      "grad_norm": 10.423604965209961,
      "learning_rate": 0.004997497581364014,
      "loss": 4.9017,
      "step": 34
    },
    {
      "epoch": 0.06,
      "grad_norm": 8.51007080078125,
      "learning_rate": 0.004997284740543776,
      "loss": 3.6646,
      "step": 35
    },
    {
      "epoch": 0.06,
      "grad_norm": 14.993752479553223,
      "learning_rate": 0.0049970632187611875,
      "loss": 5.5905,
      "step": 36
    },
    {
      "epoch": 0.07,
      "grad_norm": 14.212651252746582,
      "learning_rate": 0.0049968330167862954,
      "loss": 5.7808,
      "step": 37
    },
    {
      "epoch": 0.07,
      "grad_norm": 10.461276054382324,
      "learning_rate": 0.004996594135419318,
      "loss": 3.9539,
      "step": 38
    },
    {
      "epoch": 0.07,
      "grad_norm": 9.1497220993042,
      "learning_rate": 0.004996346575490646,
      "loss": 3.124,
      "step": 39
    },
    {
      "epoch": 0.07,
      "grad_norm": 9.099406242370605,
      "learning_rate": 0.004996090337860836,
      "loss": 3.7992,
      "step": 40
    },
    {
      "epoch": 0.07,
      "grad_norm": 13.767452239990234,
      "learning_rate": 0.0049958254234206126,
      "loss": 4.5866,
      "step": 41
    },
    {
      "epoch": 0.07,
      "grad_norm": 12.716552734375,
      "learning_rate": 0.00499555183309086,
      "loss": 3.3461,
      "step": 42
    },
    {
      "epoch": 0.08,
      "grad_norm": 7.223677158355713,
      "learning_rate": 0.004995269567822622,
      "loss": 2.0728,
      "step": 43
    },
    {
      "epoch": 0.08,
      "grad_norm": 12.463162422180176,
      "learning_rate": 0.0049949786285970995,
      "loss": 2.7682,
      "step": 44
    },
    {
      "epoch": 0.08,
      "grad_norm": 4.0829691886901855,
      "learning_rate": 0.004994679016425641,
      "loss": 1.6774,
      "step": 45
    },
    {
      "epoch": 0.08,
      "grad_norm": 6.238979816436768,
      "learning_rate": 0.00499437073234975,
      "loss": 2.3438,
      "step": 46
    },
    {
      "epoch": 0.08,
      "grad_norm": 5.805751800537109,
      "learning_rate": 0.004994053777441069,
      "loss": 2.712,
      "step": 47
    },
    {
      "epoch": 0.08,
      "grad_norm": 5.092411041259766,
      "learning_rate": 0.004993728152801385,
      "loss": 2.2179,
      "step": 48
    },
    {
      "epoch": 0.09,
      "grad_norm": 5.180473327636719,
      "learning_rate": 0.00499339385956262,
      "loss": 2.0434,
      "step": 49
    },
    {
      "epoch": 0.09,
      "grad_norm": 10.287396430969238,
      "learning_rate": 0.004993050898886834,
      "loss": 4.3246,
      "step": 50
    },
    {
      "epoch": 0.09,
      "grad_norm": 10.666089057922363,
      "learning_rate": 0.00499269927196621,
      "loss": 4.0474,
      "step": 51
    },
    {
      "epoch": 0.09,
      "grad_norm": 9.083404541015625,
      "learning_rate": 0.004992338980023061,
      "loss": 2.7797,
      "step": 52
    },
    {
      "epoch": 0.09,
      "grad_norm": 6.394657135009766,
      "learning_rate": 0.00499197002430982,
      "loss": 2.4962,
      "step": 53
    },
    {
      "epoch": 0.1,
      "grad_norm": 7.713652610778809,
      "learning_rate": 0.004991592406109036,
      "loss": 2.9878,
      "step": 54
    },
    {
      "epoch": 0.1,
      "grad_norm": 8.85560131072998,
      "learning_rate": 0.00499120612673337,
      "loss": 3.1156,
      "step": 55
    },
    {
      "epoch": 0.1,
      "grad_norm": 6.502203941345215,
      "learning_rate": 0.004990811187525591,
      "loss": 2.0781,
      "step": 56
    },
    {
      "epoch": 0.1,
      "grad_norm": 7.197846412658691,
      "learning_rate": 0.004990407589858572,
      "loss": 2.4231,
      "step": 57
    },
    {
      "epoch": 0.1,
      "grad_norm": 7.068665504455566,
      "learning_rate": 0.004989995335135282,
      "loss": 3.0021,
      "step": 58
    },
    {
      "epoch": 0.1,
      "grad_norm": 5.702466011047363,
      "learning_rate": 0.004989574424788787,
      "loss": 2.6863,
      "step": 59
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.5532689094543457,
      "learning_rate": 0.004989144860282235,
      "loss": 1.8552,
      "step": 60
    },
    {
      "epoch": 0.11,
      "grad_norm": 6.944902420043945,
      "learning_rate": 0.004988706643108864,
      "loss": 2.413,
      "step": 61
    },
    {
      "epoch": 0.11,
      "grad_norm": 8.177939414978027,
      "learning_rate": 0.004988259774791987,
      "loss": 2.779,
      "step": 62
    },
    {
      "epoch": 0.11,
      "grad_norm": 4.095608711242676,
      "learning_rate": 0.004987804256884988,
      "loss": 2.4072,
      "step": 63
    },
    {
      "epoch": 0.11,
      "grad_norm": 6.517646789550781,
      "learning_rate": 0.004987340090971322,
      "loss": 2.3885,
      "step": 64
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.5518805980682373,
      "learning_rate": 0.0049868672786645045,
      "loss": 1.6975,
      "step": 65
    },
    {
      "epoch": 0.12,
      "grad_norm": 4.4491496086120605,
      "learning_rate": 0.004986385821608106,
      "loss": 1.61,
      "step": 66
    },
    {
      "epoch": 0.12,
      "grad_norm": 6.043672561645508,
      "learning_rate": 0.004985895721475749,
      "loss": 2.0829,
      "step": 67
    },
    {
      "epoch": 0.12,
      "grad_norm": 4.9056010246276855,
      "learning_rate": 0.004985396979971099,
      "loss": 1.7009,
      "step": 68
    },
    {
      "epoch": 0.12,
      "grad_norm": 3.501497983932495,
      "learning_rate": 0.0049848895988278625,
      "loss": 1.4123,
      "step": 69
    },
    {
      "epoch": 0.12,
      "grad_norm": 4.273284912109375,
      "learning_rate": 0.004984373579809778,
      "loss": 1.4578,
      "step": 70
    },
    {
      "epoch": 0.13,
      "grad_norm": 4.848941326141357,
      "learning_rate": 0.00498384892471061,
      "loss": 1.8438,
      "step": 71
    },
    {
      "epoch": 0.13,
      "grad_norm": 5.663628101348877,
      "learning_rate": 0.004983315635354144,
      "loss": 1.9982,
      "step": 72
    },
    {
      "epoch": 0.13,
      "grad_norm": 5.814151287078857,
      "learning_rate": 0.004982773713594179,
      "loss": 1.8503,
      "step": 73
    },
    {
      "epoch": 0.13,
      "grad_norm": 5.263804912567139,
      "learning_rate": 0.004982223161314522,
      "loss": 1.9777,
      "step": 74
    },
    {
      "epoch": 0.13,
      "grad_norm": 3.4019265174865723,
      "learning_rate": 0.00498166398042898,
      "loss": 1.8134,
      "step": 75
    },
    {
      "epoch": 0.13,
      "grad_norm": 3.5382890701293945,
      "learning_rate": 0.0049810961728813585,
      "loss": 1.3,
      "step": 76
    },
    {
      "epoch": 0.14,
      "grad_norm": 9.210183143615723,
      "learning_rate": 0.0049805197406454435,
      "loss": 2.2969,
      "step": 77
    },
    {
      "epoch": 0.14,
      "grad_norm": 4.941742420196533,
      "learning_rate": 0.004979934685725011,
      "loss": 2.0702,
      "step": 78
    },
    {
      "epoch": 0.14,
      "grad_norm": 4.069230556488037,
      "learning_rate": 0.0049793410101538005,
      "loss": 1.7945,
      "step": 79
    },
    {
      "epoch": 0.14,
      "grad_norm": 1.7246112823486328,
      "learning_rate": 0.004978738715995527,
      "loss": 1.2438,
      "step": 80
    },
    {
      "epoch": 0.14,
      "grad_norm": 3.8042044639587402,
      "learning_rate": 0.004978127805343859,
      "loss": 1.761,
      "step": 81
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.8493285179138184,
      "learning_rate": 0.0049775082803224235,
      "loss": 2.4098,
      "step": 82
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.6602845191955566,
      "learning_rate": 0.004976880143084786,
      "loss": 2.0654,
      "step": 83
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.3052616119384766,
      "learning_rate": 0.004976243395814452,
      "loss": 1.6004,
      "step": 84
    },
    {
      "epoch": 0.15,
      "grad_norm": 2.4892361164093018,
      "learning_rate": 0.00497559804072486,
      "loss": 1.5039,
      "step": 85
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.386439323425293,
      "learning_rate": 0.004974944080059365,
      "loss": 1.5869,
      "step": 86
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.3360776901245117,
      "learning_rate": 0.004974281516091241,
      "loss": 1.7258,
      "step": 87
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.726275682449341,
      "learning_rate": 0.004973610351123664,
      "loss": 1.4276,
      "step": 88
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.8582663536071777,
      "learning_rate": 0.004972930587489714,
      "loss": 1.5193,
      "step": 89
    },
    {
      "epoch": 0.16,
      "grad_norm": 4.054693222045898,
      "learning_rate": 0.004972242227552358,
      "loss": 1.7082,
      "step": 90
    },
    {
      "epoch": 0.16,
      "grad_norm": 2.651496171951294,
      "learning_rate": 0.0049715452737044445,
      "loss": 1.4398,
      "step": 91
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.8549413681030273,
      "learning_rate": 0.004970839728368696,
      "loss": 1.1277,
      "step": 92
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.3630051612854004,
      "learning_rate": 0.004970125593997705,
      "loss": 1.4225,
      "step": 93
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.9479305744171143,
      "learning_rate": 0.004969402873073914,
      "loss": 1.5494,
      "step": 94
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.186293601989746,
      "learning_rate": 0.004968671568109616,
      "loss": 1.4165,
      "step": 95
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.534365177154541,
      "learning_rate": 0.004967931681646948,
      "loss": 1.4073,
      "step": 96
    },
    {
      "epoch": 0.17,
      "grad_norm": 3.300574779510498,
      "learning_rate": 0.00496718321625787,
      "loss": 1.3459,
      "step": 97
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.7278486490249634,
      "learning_rate": 0.004966426174544171,
      "loss": 1.14,
      "step": 98
    },
    {
      "epoch": 0.18,
      "grad_norm": 3.837832450866699,
      "learning_rate": 0.004965660559137448,
      "loss": 1.3687,
      "step": 99
    },
    {
      "epoch": 0.18,
      "grad_norm": 3.386354684829712,
      "learning_rate": 0.0049648863726991024,
      "loss": 1.4498,
      "step": 100
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.825348138809204,
      "learning_rate": 0.004964103617920332,
      "loss": 1.3711,
      "step": 101
    },
    {
      "epoch": 0.18,
      "grad_norm": 4.416274547576904,
      "learning_rate": 0.004963312297522116,
      "loss": 1.5401,
      "step": 102
    },
    {
      "epoch": 0.18,
      "grad_norm": 21.538768768310547,
      "learning_rate": 0.004962512414255214,
      "loss": 3.905,
      "step": 103
    },
    {
      "epoch": 0.18,
      "grad_norm": 3.0376906394958496,
      "learning_rate": 0.0049617039709001455,
      "loss": 1.2659,
      "step": 104
    },
    {
      "epoch": 0.19,
      "grad_norm": 3.5271809101104736,
      "learning_rate": 0.004960886970267191,
      "loss": 1.3535,
      "step": 105
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.844996929168701,
      "learning_rate": 0.004960061415196374,
      "loss": 1.3717,
      "step": 106
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.2052347660064697,
      "learning_rate": 0.004959227308557459,
      "loss": 1.1754,
      "step": 107
    },
    {
      "epoch": 0.19,
      "grad_norm": 4.3411736488342285,
      "learning_rate": 0.004958384653249933,
      "loss": 1.4149,
      "step": 108
    },
    {
      "epoch": 0.19,
      "grad_norm": 3.9820494651794434,
      "learning_rate": 0.004957533452203,
      "loss": 1.5,
      "step": 109
    },
    {
      "epoch": 0.19,
      "grad_norm": 4.067386627197266,
      "learning_rate": 0.0049566737083755735,
      "loss": 1.46,
      "step": 110
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.9211413860321045,
      "learning_rate": 0.00495580542475626,
      "loss": 1.1369,
      "step": 111
    },
    {
      "epoch": 0.2,
      "grad_norm": 3.1695358753204346,
      "learning_rate": 0.004954928604363353,
      "loss": 1.3703,
      "step": 112
    },
    {
      "epoch": 0.2,
      "grad_norm": 3.406451940536499,
      "learning_rate": 0.004954043250244819,
      "loss": 1.4874,
      "step": 113
    },
    {
      "epoch": 0.2,
      "grad_norm": 2.779665470123291,
      "learning_rate": 0.004953149365478293,
      "loss": 1.2439,
      "step": 114
    },
    {
      "epoch": 0.2,
      "grad_norm": 3.7801878452301025,
      "learning_rate": 0.004952246953171062,
      "loss": 1.3442,
      "step": 115
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.9421513080596924,
      "learning_rate": 0.004951336016460053,
      "loss": 1.4753,
      "step": 116
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.5723776817321777,
      "learning_rate": 0.004950416558511832,
      "loss": 1.2204,
      "step": 117
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.7108566761016846,
      "learning_rate": 0.00494948858252258,
      "loss": 1.2844,
      "step": 118
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.7284321784973145,
      "learning_rate": 0.004948552091718092,
      "loss": 1.2804,
      "step": 119
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.006575107574463,
      "learning_rate": 0.004947607089353758,
      "loss": 1.1397,
      "step": 120
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.2837071418762207,
      "learning_rate": 0.004946653578714559,
      "loss": 1.1474,
      "step": 121
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.0283279418945312,
      "learning_rate": 0.0049456915631150514,
      "loss": 1.1666,
      "step": 122
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.005060911178589,
      "learning_rate": 0.0049447210458993555,
      "loss": 1.1475,
      "step": 123
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.9485745429992676,
      "learning_rate": 0.004943742030441145,
      "loss": 1.1236,
      "step": 124
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.6474170684814453,
      "learning_rate": 0.004942754520143634,
      "loss": 1.2125,
      "step": 125
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.2282495498657227,
      "learning_rate": 0.0049417585184395665,
      "loss": 0.9396,
      "step": 126
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.7349326610565186,
      "learning_rate": 0.004940754028791205,
      "loss": 0.8818,
      "step": 127
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.3199844360351562,
      "learning_rate": 0.004939741054690317,
      "loss": 0.8562,
      "step": 128
    },
    {
      "epoch": 0.23,
      "grad_norm": 2.7869062423706055,
      "learning_rate": 0.004938719599658161,
      "loss": 1.0136,
      "step": 129
    },
    {
      "epoch": 0.23,
      "grad_norm": 10.484325408935547,
      "learning_rate": 0.0049376896672454805,
      "loss": 2.0971,
      "step": 130
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.8717925548553467,
      "learning_rate": 0.004936651261032486,
      "loss": 1.0544,
      "step": 131
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.6124004125595093,
      "learning_rate": 0.004935604384628842,
      "loss": 0.8779,
      "step": 132
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.550537109375,
      "learning_rate": 0.004934549041673661,
      "loss": 0.8067,
      "step": 133
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.3560948371887207,
      "learning_rate": 0.0049334852358354836,
      "loss": 1.0108,
      "step": 134
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.5042353868484497,
      "learning_rate": 0.004932412970812268,
      "loss": 0.8573,
      "step": 135
    },
    {
      "epoch": 0.24,
      "grad_norm": 4.7427978515625,
      "learning_rate": 0.004931332250331382,
      "loss": 1.5866,
      "step": 136
    },
    {
      "epoch": 0.24,
      "grad_norm": 3.3485641479492188,
      "learning_rate": 0.004930243078149581,
      "loss": 1.024,
      "step": 137
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.072599172592163,
      "learning_rate": 0.004929145458053005,
      "loss": 0.9484,
      "step": 138
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.42199444770813,
      "learning_rate": 0.004928039393857155,
      "loss": 1.2085,
      "step": 139
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.3018336296081543,
      "learning_rate": 0.0049269248894068885,
      "loss": 0.8148,
      "step": 140
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.3955087661743164,
      "learning_rate": 0.004925801948576402,
      "loss": 0.8474,
      "step": 141
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.8443170189857483,
      "learning_rate": 0.004924670575269217,
      "loss": 0.9249,
      "step": 142
    },
    {
      "epoch": 0.25,
      "eval_loss": 1.2173190116882324,
      "eval_runtime": 18.0229,
      "eval_samples_per_second": 26.466,
      "eval_steps_per_second": 6.658,
      "step": 142
    },
    {
      "epoch": 0.25,
      "grad_norm": 18.21401596069336,
      "learning_rate": 0.0049235307734181695,
      "loss": 1.0142,
      "step": 143
    },
    {
      "epoch": 0.25,
      "grad_norm": 18.445880889892578,
      "learning_rate": 0.004922382546985393,
      "loss": 1.4088,
      "step": 144
    },
    {
      "epoch": 0.26,
      "grad_norm": 3.3651649951934814,
      "learning_rate": 0.004921225899962307,
      "loss": 1.3859,
      "step": 145
    },
    {
      "epoch": 0.26,
      "grad_norm": 5.419590950012207,
      "learning_rate": 0.004920060836369603,
      "loss": 1.3982,
      "step": 146
    },
    {
      "epoch": 0.26,
      "grad_norm": 6.42302942276001,
      "learning_rate": 0.004918887360257228,
      "loss": 0.8986,
      "step": 147
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.406660795211792,
      "learning_rate": 0.004917705475704374,
      "loss": 0.7824,
      "step": 148
    },
    {
      "epoch": 0.26,
      "grad_norm": 1.913580060005188,
      "learning_rate": 0.00491651518681946,
      "loss": 0.8983,
      "step": 149
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.3653571605682373,
      "learning_rate": 0.004915316497740121,
      "loss": 1.8095,
      "step": 150
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.5922982692718506,
      "learning_rate": 0.004914109412633194,
      "loss": 0.6852,
      "step": 151
    },
    {
      "epoch": 0.27,
      "grad_norm": 2.0083518028259277,
      "learning_rate": 0.0049128939356946994,
      "loss": 0.6871,
      "step": 152
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.255305051803589,
      "learning_rate": 0.004911670071149831,
      "loss": 0.7797,
      "step": 153
    },
    {
      "epoch": 0.27,
      "grad_norm": 5.756755828857422,
      "learning_rate": 0.0049104378232529364,
      "loss": 1.5976,
      "step": 154
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.5214617252349854,
      "learning_rate": 0.004909197196287509,
      "loss": 0.7824,
      "step": 155
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.516773223876953,
      "learning_rate": 0.004907948194566167,
      "loss": 0.7435,
      "step": 156
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.5353460311889648,
      "learning_rate": 0.004906690822430638,
      "loss": 0.5389,
      "step": 157
    },
    {
      "epoch": 0.28,
      "grad_norm": 21.023536682128906,
      "learning_rate": 0.004905425084251753,
      "loss": 0.6795,
      "step": 158
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.1965794563293457,
      "learning_rate": 0.0049041509844294185,
      "loss": 0.8708,
      "step": 159
    },
    {
      "epoch": 0.28,
      "grad_norm": 16.767335891723633,
      "learning_rate": 0.004902868527392611,
      "loss": 11.3741,
      "step": 160
    },
    {
      "epoch": 0.28,
      "grad_norm": 23.763965606689453,
      "learning_rate": 0.004901577717599356,
      "loss": 2.152,
      "step": 161
    },
    {
      "epoch": 0.29,
      "grad_norm": 5.418874740600586,
      "learning_rate": 0.004900278559536716,
      "loss": 2.0419,
      "step": 162
    },
    {
      "epoch": 0.29,
      "grad_norm": 3.5911929607391357,
      "learning_rate": 0.004898971057720773,
      "loss": 1.2719,
      "step": 163
    },
    {
      "epoch": 0.29,
      "grad_norm": 2.972569465637207,
      "learning_rate": 0.004897655216696613,
      "loss": 0.8896,
      "step": 164
    },
    {
      "epoch": 0.29,
      "grad_norm": 12.528779029846191,
      "learning_rate": 0.0048963310410383085,
      "loss": 3.1689,
      "step": 165
    },
    {
      "epoch": 0.29,
      "grad_norm": 11.792468070983887,
      "learning_rate": 0.00489499853534891,
      "loss": 1.2653,
      "step": 166
    },
    {
      "epoch": 0.3,
      "grad_norm": 3.2615010738372803,
      "learning_rate": 0.004893657704260419,
      "loss": 0.8104,
      "step": 167
    },
    {
      "epoch": 0.3,
      "grad_norm": 3.4236032962799072,
      "learning_rate": 0.00489230855243378,
      "loss": 0.6288,
      "step": 168
    },
    {
      "epoch": 0.3,
      "grad_norm": 4.4682698249816895,
      "learning_rate": 0.004890951084558859,
      "loss": 0.8304,
      "step": 169
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.8197399377822876,
      "learning_rate": 0.004889585305354436,
      "loss": 0.4139,
      "step": 170
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.393115758895874,
      "learning_rate": 0.004888211219568175,
      "loss": 0.496,
      "step": 171
    },
    {
      "epoch": 0.3,
      "grad_norm": 18.753366470336914,
      "learning_rate": 0.0048868288319766215,
      "loss": 1.8722,
      "step": 172
    },
    {
      "epoch": 0.31,
      "grad_norm": 3.966686248779297,
      "learning_rate": 0.004885438147385174,
      "loss": 2.0632,
      "step": 173
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.374682903289795,
      "learning_rate": 0.004884039170628077,
      "loss": 0.823,
      "step": 174
    },
    {
      "epoch": 0.31,
      "grad_norm": 3.996807813644409,
      "learning_rate": 0.004882631906568398,
      "loss": 2.6766,
      "step": 175
    },
    {
      "epoch": 0.31,
      "grad_norm": 3.3108572959899902,
      "learning_rate": 0.004881216360098012,
      "loss": 1.9905,
      "step": 176
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.809220314025879,
      "learning_rate": 0.004879792536137585,
      "loss": 0.7756,
      "step": 177
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.4348185062408447,
      "learning_rate": 0.004878360439636559,
      "loss": 0.9632,
      "step": 178
    },
    {
      "epoch": 0.32,
      "grad_norm": 3.283806324005127,
      "learning_rate": 0.004876920075573129,
      "loss": 1.0558,
      "step": 179
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.7374613285064697,
      "learning_rate": 0.004875471448954234,
      "loss": 0.5686,
      "step": 180
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.121089458465576,
      "learning_rate": 0.004874014564815531,
      "loss": 0.7455,
      "step": 181
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.558676242828369,
      "learning_rate": 0.004872549428221384,
      "loss": 0.8103,
      "step": 182
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.171405553817749,
      "learning_rate": 0.004871076044264842,
      "loss": 0.5592,
      "step": 183
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.803780198097229,
      "learning_rate": 0.004869594418067624,
      "loss": 0.6193,
      "step": 184
    },
    {
      "epoch": 0.33,
      "grad_norm": 12.401668548583984,
      "learning_rate": 0.0048681045547801,
      "loss": 0.5366,
      "step": 185
    },
    {
      "epoch": 0.33,
      "grad_norm": 59.98530197143555,
      "learning_rate": 0.004866606459581275,
      "loss": 3.0222,
      "step": 186
    },
    {
      "epoch": 0.33,
      "grad_norm": 6.432986259460449,
      "learning_rate": 0.0048651001376787675,
      "loss": 3.8615,
      "step": 187
    },
    {
      "epoch": 0.33,
      "grad_norm": 3.953183174133301,
      "learning_rate": 0.004863585594308793,
      "loss": 2.0406,
      "step": 188
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.769693374633789,
      "learning_rate": 0.0048620628347361495,
      "loss": 1.3331,
      "step": 189
    },
    {
      "epoch": 0.34,
      "grad_norm": 8.753790855407715,
      "learning_rate": 0.004860531864254192,
      "loss": 1.7073,
      "step": 190
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.858003854751587,
      "learning_rate": 0.004858992688184819,
      "loss": 1.1102,
      "step": 191
    },
    {
      "epoch": 0.34,
      "grad_norm": 4.322317600250244,
      "learning_rate": 0.0048574453118784555,
      "loss": 0.832,
      "step": 192
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.9701354503631592,
      "learning_rate": 0.004855889740714028,
      "loss": 0.809,
      "step": 193
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.3157918453216553,
      "learning_rate": 0.004854325980098951,
      "loss": 0.984,
      "step": 194
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.2397148609161377,
      "learning_rate": 0.004852754035469109,
      "loss": 0.625,
      "step": 195
    },
    {
      "epoch": 0.35,
      "grad_norm": 3.4943249225616455,
      "learning_rate": 0.004851173912288833,
      "loss": 1.0917,
      "step": 196
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.8938004970550537,
      "learning_rate": 0.004849585616050884,
      "loss": 1.0859,
      "step": 197
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.433147668838501,
      "learning_rate": 0.004847989152276435,
      "loss": 0.7359,
      "step": 198
    },
    {
      "epoch": 0.35,
      "grad_norm": 3.213465452194214,
      "learning_rate": 0.00484638452651505,
      "loss": 1.2727,
      "step": 199
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.7757675647735596,
      "learning_rate": 0.004844771744344666,
      "loss": 1.0766,
      "step": 200
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.495505928993225,
      "learning_rate": 0.0048431508113715716,
      "loss": 0.7187,
      "step": 201
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.498947858810425,
      "learning_rate": 0.004841521733230391,
      "loss": 1.0661,
      "step": 202
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.4537739753723145,
      "learning_rate": 0.00483988451558406,
      "loss": 1.2463,
      "step": 203
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.4878536462783813,
      "learning_rate": 0.004838239164123811,
      "loss": 0.7462,
      "step": 204
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.1998729705810547,
      "learning_rate": 0.004836585684569148,
      "loss": 1.1343,
      "step": 205
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.9068143367767334,
      "learning_rate": 0.0048349240826678335,
      "loss": 1.1487,
      "step": 206
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.4316679239273071,
      "learning_rate": 0.00483325436419586,
      "loss": 0.7304,
      "step": 207
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.4590532779693604,
      "learning_rate": 0.004831576534957437,
      "loss": 1.0713,
      "step": 208
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.316596269607544,
      "learning_rate": 0.004829890600784969,
      "loss": 1.1284,
      "step": 209
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.1584460735321045,
      "learning_rate": 0.004828196567539034,
      "loss": 0.5914,
      "step": 210
    },
    {
      "epoch": 0.37,
      "grad_norm": 2.8200185298919678,
      "learning_rate": 0.004826494441108362,
      "loss": 0.8873,
      "step": 211
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.6962389945983887,
      "learning_rate": 0.004824784227409819,
      "loss": 0.8854,
      "step": 212
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.9641892313957214,
      "learning_rate": 0.0048230659323883806,
      "loss": 0.4092,
      "step": 213
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.1415939331054688,
      "learning_rate": 0.004821339562017116,
      "loss": 0.9972,
      "step": 214
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.411900758743286,
      "learning_rate": 0.004819605122297167,
      "loss": 1.0741,
      "step": 215
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.072662591934204,
      "learning_rate": 0.004817862619257723,
      "loss": 0.7594,
      "step": 216
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.3440897464752197,
      "learning_rate": 0.004816112058956005,
      "loss": 0.4256,
      "step": 217
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.8536453247070312,
      "learning_rate": 0.00481435344747724,
      "loss": 0.7141,
      "step": 218
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.831053614616394,
      "learning_rate": 0.004812586790934645,
      "loss": 0.6077,
      "step": 219
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.8879019618034363,
      "learning_rate": 0.004810812095469401,
      "loss": 0.5193,
      "step": 220
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.061986207962036,
      "learning_rate": 0.004809029367250635,
      "loss": 0.8594,
      "step": 221
    },
    {
      "epoch": 0.39,
      "grad_norm": 2.325282573699951,
      "learning_rate": 0.004807238612475394,
      "loss": 0.7912,
      "step": 222
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.2494412660598755,
      "learning_rate": 0.004805439837368632,
      "loss": 0.3469,
      "step": 223
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.9265069961547852,
      "learning_rate": 0.004803633048183176,
      "loss": 0.4018,
      "step": 224
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.8633078336715698,
      "learning_rate": 0.004801818251199718,
      "loss": 0.6729,
      "step": 225
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.923722505569458,
      "learning_rate": 0.004799995452726783,
      "loss": 0.3185,
      "step": 226
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.7258334755897522,
      "learning_rate": 0.00479816465910071,
      "loss": 0.3711,
      "step": 227
    },
    {
      "epoch": 0.4,
      "grad_norm": 8.977919578552246,
      "learning_rate": 0.004796325876685632,
      "loss": 0.6389,
      "step": 228
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.6173701286315918,
      "learning_rate": 0.004794479111873451,
      "loss": 0.8077,
      "step": 229
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.3017358183860779,
      "learning_rate": 0.004792624371083819,
      "loss": 0.1717,
      "step": 230
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.2644597887992859,
      "learning_rate": 0.004790761660764111,
      "loss": 0.1666,
      "step": 231
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.32617634534835815,
      "learning_rate": 0.004788890987389409,
      "loss": 0.1912,
      "step": 232
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.325503408908844,
      "learning_rate": 0.00478701235746247,
      "loss": 0.1745,
      "step": 233
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.36416152119636536,
      "learning_rate": 0.004785125777513716,
      "loss": 0.1877,
      "step": 234
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.17476695775985718,
      "learning_rate": 0.004783231254101201,
      "loss": 0.1534,
      "step": 235
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.33042675256729126,
      "learning_rate": 0.004781328793810592,
      "loss": 0.1552,
      "step": 236
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.2617625892162323,
      "learning_rate": 0.004779418403255146,
      "loss": 0.1399,
      "step": 237
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.1109986305236816,
      "learning_rate": 0.004777500089075687,
      "loss": 0.2959,
      "step": 238
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.3491461277008057,
      "learning_rate": 0.004775573857940583,
      "loss": 0.4543,
      "step": 239
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.46592727303504944,
      "learning_rate": 0.004773639716545723,
      "loss": 0.1858,
      "step": 240
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.5546693801879883,
      "learning_rate": 0.0047716976716144915,
      "loss": 0.1889,
      "step": 241
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.0210996866226196,
      "learning_rate": 0.004769747729897749,
      "loss": 0.3422,
      "step": 242
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.28574883937835693,
      "learning_rate": 0.004767789898173806,
      "loss": 0.1658,
      "step": 243
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.7777175903320312,
      "learning_rate": 0.004765824183248399,
      "loss": 0.2562,
      "step": 244
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.6948490142822266,
      "learning_rate": 0.0047638505919546685,
      "loss": 0.2538,
      "step": 245
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.3330797553062439,
      "learning_rate": 0.0047618691311531345,
      "loss": 0.1513,
      "step": 246
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.38396719098091125,
      "learning_rate": 0.004759879807731673,
      "loss": 0.172,
      "step": 247
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.3653559684753418,
      "learning_rate": 0.00475788262860549,
      "loss": 0.1862,
      "step": 248
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.20279382169246674,
      "learning_rate": 0.004755877600717102,
      "loss": 0.1508,
      "step": 249
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.2565731704235077,
      "learning_rate": 0.004753864731036307,
      "loss": 0.1877,
      "step": 250
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.33084383606910706,
      "learning_rate": 0.004751844026560163,
      "loss": 0.1754,
      "step": 251
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.21679562330245972,
      "learning_rate": 0.004749815494312963,
      "loss": 0.1466,
      "step": 252
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.3501861095428467,
      "learning_rate": 0.0047477791413462105,
      "loss": 0.1706,
      "step": 253
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.025921905413269997,
      "learning_rate": 0.004745734974738593,
      "loss": 0.1392,
      "step": 254
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.09658165276050568,
      "learning_rate": 0.004743683001595965,
      "loss": 0.1439,
      "step": 255
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.20020923018455505,
      "learning_rate": 0.004741623229051313,
      "loss": 0.1569,
      "step": 256
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.42405426502227783,
      "learning_rate": 0.004739555664264736,
      "loss": 0.1703,
      "step": 257
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.13087014853954315,
      "learning_rate": 0.004737480314423421,
      "loss": 0.1508,
      "step": 258
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.07791759818792343,
      "learning_rate": 0.0047353971867416175,
      "loss": 0.146,
      "step": 259
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.4447411596775055,
      "learning_rate": 0.004733306288460612,
      "loss": 0.1531,
      "step": 260
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.08216764032840729,
      "learning_rate": 0.0047312076268487,
      "loss": 0.1446,
      "step": 261
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.20517706871032715,
      "learning_rate": 0.004729101209201169,
      "loss": 0.1419,
      "step": 262
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.10283233225345612,
      "learning_rate": 0.004726987042840263,
      "loss": 0.146,
      "step": 263
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.2815485894680023,
      "learning_rate": 0.004724865135115163,
      "loss": 0.1565,
      "step": 264
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.14592072367668152,
      "learning_rate": 0.00472273549340196,
      "loss": 0.1406,
      "step": 265
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.48644283413887024,
      "learning_rate": 0.0047205981251036335,
      "loss": 0.1923,
      "step": 266
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.12471989542245865,
      "learning_rate": 0.004718453037650016,
      "loss": 0.1459,
      "step": 267
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.19896233081817627,
      "learning_rate": 0.004716300238497776,
      "loss": 0.1451,
      "step": 268
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.09358244389295578,
      "learning_rate": 0.004714139735130388,
      "loss": 0.144,
      "step": 269
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.13858307898044586,
      "learning_rate": 0.0047119715350581095,
      "loss": 0.1424,
      "step": 270
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.1569807082414627,
      "learning_rate": 0.0047097956458179505,
      "loss": 0.1359,
      "step": 271
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.5818818807601929,
      "learning_rate": 0.004707612074973653,
      "loss": 0.3056,
      "step": 272
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.33774101734161377,
      "learning_rate": 0.004705420830115658,
      "loss": 0.1647,
      "step": 273
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.22037023305892944,
      "learning_rate": 0.004703221918861084,
      "loss": 0.1546,
      "step": 274
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.2596515119075775,
      "learning_rate": 0.004701015348853699,
      "loss": 0.1631,
      "step": 275
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.013330257497727871,
      "learning_rate": 0.004698801127763896,
      "loss": 0.1225,
      "step": 276
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.12322998046875,
      "learning_rate": 0.004696579263288661,
      "loss": 0.1446,
      "step": 277
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.0688777044415474,
      "learning_rate": 0.004694349763151553,
      "loss": 0.1432,
      "step": 278
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.19567111134529114,
      "learning_rate": 0.00469211263510267,
      "loss": 0.1487,
      "step": 279
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.05435417592525482,
      "learning_rate": 0.004689867886918629,
      "loss": 0.1375,
      "step": 280
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.04905528575181961,
      "learning_rate": 0.004687615526402536,
      "loss": 0.1418,
      "step": 281
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.1233174130320549,
      "learning_rate": 0.004685355561383956,
      "loss": 0.1435,
      "step": 282
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.07042816281318665,
      "learning_rate": 0.00468308799971889,
      "loss": 0.1378,
      "step": 283
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.04000591114163399,
      "learning_rate": 0.0046808128492897465,
      "loss": 0.1414,
      "step": 284
    },
    {
      "epoch": 0.5,
      "eval_loss": 0.1390625536441803,
      "eval_runtime": 20.6734,
      "eval_samples_per_second": 23.073,
      "eval_steps_per_second": 5.805,
      "step": 284
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.025082377716898918,
      "learning_rate": 0.004678530118005312,
      "loss": 0.1416,
      "step": 285
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.07912474870681763,
      "learning_rate": 0.004676239813800729,
      "loss": 0.1411,
      "step": 286
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.21243184804916382,
      "learning_rate": 0.004673941944637461,
      "loss": 0.1283,
      "step": 287
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.15224987268447876,
      "learning_rate": 0.00467163651850327,
      "loss": 0.1457,
      "step": 288
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.5148062109947205,
      "learning_rate": 0.004669323543412186,
      "loss": 0.1871,
      "step": 289
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.019062018021941185,
      "learning_rate": 0.004667003027404483,
      "loss": 0.1392,
      "step": 290
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.07946087419986725,
      "learning_rate": 0.004664674978546646,
      "loss": 0.1252,
      "step": 291
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.6038045287132263,
      "learning_rate": 0.004662339404931347,
      "loss": 0.2693,
      "step": 292
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.0627756342291832,
      "learning_rate": 0.004659996314677414,
      "loss": 0.1262,
      "step": 293
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.18904772400856018,
      "learning_rate": 0.004657645715929804,
      "loss": 0.1462,
      "step": 294
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.06582311540842056,
      "learning_rate": 0.0046552876168595774,
      "loss": 0.1279,
      "step": 295
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.075279101729393,
      "learning_rate": 0.004652922025663863,
      "loss": 0.117,
      "step": 296
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.5974221229553223,
      "learning_rate": 0.004650548950565835,
      "loss": 0.2998,
      "step": 297
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.47275882959365845,
      "learning_rate": 0.004648168399814684,
      "loss": 0.2351,
      "step": 298
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.371979683637619,
      "learning_rate": 0.004645780381685586,
      "loss": 0.1653,
      "step": 299
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.4610370695590973,
      "learning_rate": 0.004643384904479675,
      "loss": 0.2125,
      "step": 300
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.6561771631240845,
      "learning_rate": 0.004640981976524015,
      "loss": 0.3289,
      "step": 301
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.27472352981567383,
      "learning_rate": 0.004638571606171567,
      "loss": 0.1696,
      "step": 302
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.4794062077999115,
      "learning_rate": 0.004636153801801167,
      "loss": 0.2235,
      "step": 303
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.008101632818579674,
      "learning_rate": 0.004633728571817489,
      "loss": 0.1393,
      "step": 304
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.652870774269104,
      "learning_rate": 0.004631295924651024,
      "loss": 0.248,
      "step": 305
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.7754223346710205,
      "learning_rate": 0.0046288558687580415,
      "loss": 0.1255,
      "step": 306
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.47201940417289734,
      "learning_rate": 0.004626408412620567,
      "loss": 0.202,
      "step": 307
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.09795279055833817,
      "learning_rate": 0.004623953564746353,
      "loss": 0.1473,
      "step": 308
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.2842445373535156,
      "learning_rate": 0.004621491333668843,
      "loss": 0.1655,
      "step": 309
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.4557732045650482,
      "learning_rate": 0.004619021727947146,
      "loss": 0.2113,
      "step": 310
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.35528117418289185,
      "learning_rate": 0.00461654475616601,
      "loss": 0.1811,
      "step": 311
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.09643755853176117,
      "learning_rate": 0.004614060426935786,
      "loss": 0.1309,
      "step": 312
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.3083273768424988,
      "learning_rate": 0.0046115687488923985,
      "loss": 0.1479,
      "step": 313
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.6159799098968506,
      "learning_rate": 0.004609069730697322,
      "loss": 0.2834,
      "step": 314
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.3343372344970703,
      "learning_rate": 0.004606563381037544,
      "loss": 0.1887,
      "step": 315
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.04232935607433319,
      "learning_rate": 0.004604049708625538,
      "loss": 0.134,
      "step": 316
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.3046329617500305,
      "learning_rate": 0.004601528722199234,
      "loss": 0.1658,
      "step": 317
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.11442878097295761,
      "learning_rate": 0.004599000430521983,
      "loss": 0.1407,
      "step": 318
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.47239166498184204,
      "learning_rate": 0.004596464842382534,
      "loss": 0.2002,
      "step": 319
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.0836309939622879,
      "learning_rate": 0.0045939219665949976,
      "loss": 0.1431,
      "step": 320
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.16392894089221954,
      "learning_rate": 0.004591371811998817,
      "loss": 0.1487,
      "step": 321
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.342985063791275,
      "learning_rate": 0.00458881438745874,
      "loss": 0.1738,
      "step": 322
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.015053262002766132,
      "learning_rate": 0.004586249701864783,
      "loss": 0.1315,
      "step": 323
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.04966433346271515,
      "learning_rate": 0.004583677764132207,
      "loss": 0.1341,
      "step": 324
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.06436709314584732,
      "learning_rate": 0.004581098583201478,
      "loss": 0.1391,
      "step": 325
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.007449989672750235,
      "learning_rate": 0.004578512168038244,
      "loss": 0.1406,
      "step": 326
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.10672789067029953,
      "learning_rate": 0.004575918527633297,
      "loss": 0.1395,
      "step": 327
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.6618992686271667,
      "learning_rate": 0.004573317671002549,
      "loss": 0.1401,
      "step": 328
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.042477015405893326,
      "learning_rate": 0.004570709607186994,
      "loss": 0.1398,
      "step": 329
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.13720382750034332,
      "learning_rate": 0.0045680943452526815,
      "loss": 0.1406,
      "step": 330
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.09360575675964355,
      "learning_rate": 0.0045654718942906795,
      "loss": 0.1327,
      "step": 331
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.0712946355342865,
      "learning_rate": 0.00456284226341705,
      "loss": 0.1313,
      "step": 332
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.20698802173137665,
      "learning_rate": 0.0045602054617728096,
      "loss": 0.1557,
      "step": 333
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.2574722170829773,
      "learning_rate": 0.004557561498523905,
      "loss": 0.1639,
      "step": 334
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.28649410605430603,
      "learning_rate": 0.004554910382861178,
      "loss": 0.1636,
      "step": 335
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.022425998002290726,
      "learning_rate": 0.00455225212400033,
      "loss": 0.1309,
      "step": 336
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.5155848860740662,
      "learning_rate": 0.004549586731181896,
      "loss": 0.2228,
      "step": 337
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.4240152835845947,
      "learning_rate": 0.004546914213671209,
      "loss": 0.2065,
      "step": 338
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.30198466777801514,
      "learning_rate": 0.004544234580758367,
      "loss": 0.1581,
      "step": 339
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.06408875435590744,
      "learning_rate": 0.0045415478417582065,
      "loss": 0.1331,
      "step": 340
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.5334039926528931,
      "learning_rate": 0.004538854006010263,
      "loss": 0.2534,
      "step": 341
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.3273349106311798,
      "learning_rate": 0.004536153082878738,
      "loss": 0.194,
      "step": 342
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.48004212975502014,
      "learning_rate": 0.004533445081752478,
      "loss": 0.1795,
      "step": 343
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.3054547607898712,
      "learning_rate": 0.004530730012044926,
      "loss": 0.1317,
      "step": 344
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.17748180031776428,
      "learning_rate": 0.004528007883194103,
      "loss": 0.1479,
      "step": 345
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.35406604409217834,
      "learning_rate": 0.0045252787046625624,
      "loss": 0.1637,
      "step": 346
    },
    {
      "epoch": 0.61,
      "grad_norm": 3.8229777812957764,
      "learning_rate": 0.0045225424859373685,
      "loss": 0.1387,
      "step": 347
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.06019274517893791,
      "learning_rate": 0.0045197992365300565,
      "loss": 0.1359,
      "step": 348
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.12145840376615524,
      "learning_rate": 0.0045170489659766,
      "loss": 0.1338,
      "step": 349
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.04294529929757118,
      "learning_rate": 0.004514291683837383,
      "loss": 0.1242,
      "step": 350
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.01808145083487034,
      "learning_rate": 0.004511527399697158,
      "loss": 0.1099,
      "step": 351
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.4128633737564087,
      "learning_rate": 0.004508756123165021,
      "loss": 0.208,
      "step": 352
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.49105894565582275,
      "learning_rate": 0.004505977863874374,
      "loss": 0.1505,
      "step": 353
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.2189539074897766,
      "learning_rate": 0.0045031926314828925,
      "loss": 0.1322,
      "step": 354
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.21910899877548218,
      "learning_rate": 0.00450040043567249,
      "loss": 0.1407,
      "step": 355
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.19418276846408844,
      "learning_rate": 0.004497601286149288,
      "loss": 0.1345,
      "step": 356
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.20071132481098175,
      "learning_rate": 0.0044947951926435775,
      "loss": 0.1526,
      "step": 357
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.020260484889149666,
      "learning_rate": 0.004491982164909792,
      "loss": 0.1323,
      "step": 358
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.02881007082760334,
      "learning_rate": 0.0044891622127264654,
      "loss": 0.1364,
      "step": 359
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.14652033150196075,
      "learning_rate": 0.004486335345896204,
      "loss": 0.1451,
      "step": 360
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.0947147086262703,
      "learning_rate": 0.004483501574245652,
      "loss": 0.1443,
      "step": 361
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.2556053102016449,
      "learning_rate": 0.004480660907625452,
      "loss": 0.1479,
      "step": 362
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.20763279497623444,
      "learning_rate": 0.0044778133559102195,
      "loss": 0.1389,
      "step": 363
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.13740937411785126,
      "learning_rate": 0.004474958928998498,
      "loss": 0.1435,
      "step": 364
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.2058638483285904,
      "learning_rate": 0.004472097636812735,
      "loss": 0.1466,
      "step": 365
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.051496896892786026,
      "learning_rate": 0.004469229489299242,
      "loss": 0.1392,
      "step": 366
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.09042657911777496,
      "learning_rate": 0.004466354496428157,
      "loss": 0.1382,
      "step": 367
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.12760333716869354,
      "learning_rate": 0.0044634726681934194,
      "loss": 0.1433,
      "step": 368
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.27251380681991577,
      "learning_rate": 0.004460584014612724,
      "loss": 0.1623,
      "step": 369
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.05649774894118309,
| "learning_rate": 0.004457688545727497, | |
| "loss": 0.1303, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.11580526828765869, | |
| "learning_rate": 0.004454786271602848, | |
| "loss": 0.1346, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.06390184909105301, | |
| "learning_rate": 0.004451877202327553, | |
| "loss": 0.1361, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.15965820848941803, | |
| "learning_rate": 0.004448961348013999, | |
| "loss": 0.1339, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.14588314294815063, | |
| "learning_rate": 0.004446038718798166, | |
| "loss": 0.1458, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 0.12371037155389786, | |
| "learning_rate": 0.00444310932483958, | |
| "loss": 0.1391, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.07788253575563431, | |
| "learning_rate": 0.004440173176321287, | |
| "loss": 0.1364, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.11355935782194138, | |
| "learning_rate": 0.004437230283449808, | |
| "loss": 0.1482, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.2870952785015106, | |
| "learning_rate": 0.00443428065645511, | |
| "loss": 0.1406, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.2574450373649597, | |
| "learning_rate": 0.004431324305590572, | |
| "loss": 0.1395, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.2524733245372772, | |
| "learning_rate": 0.004428361241132943, | |
| "loss": 0.1589, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 0.19581332802772522, | |
| "learning_rate": 0.004425391473382309, | |
| "loss": 0.1526, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.33516693115234375, | |
| "learning_rate": 0.004422415012662061, | |
| "loss": 0.162, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.036571960896253586, | |
| "learning_rate": 0.004419431869318853, | |
| "loss": 0.132, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.4105718731880188, | |
| "learning_rate": 0.004416442053722569, | |
| "loss": 0.1958, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.04623480886220932, | |
| "learning_rate": 0.004413445576266289, | |
| "loss": 0.1099, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.049837108701467514, | |
| "learning_rate": 0.004410442447366249, | |
| "loss": 0.1171, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 0.11380172520875931, | |
| "learning_rate": 0.0044074326774618065, | |
| "loss": 0.1357, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.026750722900032997, | |
| "learning_rate": 0.004404416277015404, | |
| "loss": 0.1278, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.2501939833164215, | |
| "learning_rate": 0.004401393256512534, | |
| "loss": 0.157, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.1605241894721985, | |
| "learning_rate": 0.004398363626461701, | |
| "loss": 0.1323, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.214866042137146, | |
| "learning_rate": 0.004395327397394384, | |
| "loss": 0.1361, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 0.09074309468269348, | |
| "learning_rate": 0.004392284579865004, | |
| "loss": 0.1379, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.25873011350631714, | |
| "learning_rate": 0.004389235184450881, | |
| "loss": 0.1363, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.2037936896085739, | |
| "learning_rate": 0.004386179221752202, | |
| "loss": 0.1514, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.2722165584564209, | |
| "learning_rate": 0.004383116702391987, | |
| "loss": 0.1577, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.10281626135110855, | |
| "learning_rate": 0.004380047637016041, | |
| "loss": 0.1429, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.09306799620389938, | |
| "learning_rate": 0.00437697203629293, | |
| "loss": 0.1296, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 0.14582695066928864, | |
| "learning_rate": 0.004373889910913934, | |
| "loss": 0.1467, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.40496793389320374, | |
| "learning_rate": 0.004370801271593016, | |
| "loss": 0.2055, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.26718080043792725, | |
| "learning_rate": 0.0043677061290667805, | |
| "loss": 0.1661, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.07576692849397659, | |
| "learning_rate": 0.004364604494094441, | |
| "loss": 0.1379, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.009126567281782627, | |
| "learning_rate": 0.004361496377457777, | |
| "loss": 0.1292, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.10117870569229126, | |
| "learning_rate": 0.0043583817899611015, | |
| "loss": 0.1281, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 0.3940899968147278, | |
| "learning_rate": 0.0043552607424312195, | |
| "loss": 0.2176, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.5003737211227417, | |
| "learning_rate": 0.004352133245717394, | |
| "loss": 0.24, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.12259330600500107, | |
| "learning_rate": 0.0043489993106913035, | |
| "loss": 0.1427, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.25444668531417847, | |
| "learning_rate": 0.00434585894824701, | |
| "loss": 0.1562, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.385450154542923, | |
| "learning_rate": 0.0043427121693009165, | |
| "loss": 0.1894, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.5544012784957886, | |
| "learning_rate": 0.004339558984791732, | |
| "loss": 0.2228, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.08573274314403534, | |
| "learning_rate": 0.004336399405680431, | |
| "loss": 0.1344, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.06946069002151489, | |
| "learning_rate": 0.004333233442950219, | |
| "loss": 0.1379, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.08204808086156845, | |
| "learning_rate": 0.0043300611076064885, | |
| "loss": 0.1297, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.34511202573776245, | |
| "learning_rate": 0.004326882410676787, | |
| "loss": 0.1882, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.23568613827228546, | |
| "learning_rate": 0.004323697363210774, | |
| "loss": 0.1601, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 0.17524772882461548, | |
| "learning_rate": 0.004320505976280185, | |
| "loss": 0.1563, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.2556067407131195, | |
| "learning_rate": 0.004317308260978795, | |
| "loss": 0.1495, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.06263621896505356, | |
| "learning_rate": 0.004314104228422374, | |
| "loss": 0.1403, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.039918459951877594, | |
| "learning_rate": 0.004310893889748653, | |
| "loss": 0.1363, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.05942288413643837, | |
| "learning_rate": 0.004307677256117285, | |
| "loss": 0.134, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.40024271607398987, | |
| "learning_rate": 0.0043044543387098025, | |
| "loss": 0.1808, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 0.05047342926263809, | |
| "learning_rate": 0.0043012251487295865, | |
| "loss": 0.1361, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.1135723665356636, | |
| "learning_rate": 0.004297989697401817, | |
| "loss": 0.1451, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.2591931223869324, | |
| "learning_rate": 0.004294747995973442, | |
| "loss": 0.1599, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.2932302951812744, | |
| "learning_rate": 0.004291500055713138, | |
| "loss": 0.1634, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.03932628408074379, | |
| "learning_rate": 0.004288245887911263, | |
| "loss": 0.1441, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 0.39508238434791565, | |
| "learning_rate": 0.004284985503879828, | |
| "loss": 0.1786, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "eval_loss": 0.15055102109909058, | |
| "eval_runtime": 20.8151, | |
| "eval_samples_per_second": 22.916, | |
| "eval_steps_per_second": 5.765, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.2353733777999878, | |
| "learning_rate": 0.004281718914952452, | |
| "loss": 0.155, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.14004479348659515, | |
| "learning_rate": 0.0042784461324843195, | |
| "loss": 0.1324, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.1574845165014267, | |
| "learning_rate": 0.004275167167852149, | |
| "loss": 0.1408, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.30841565132141113, | |
| "learning_rate": 0.004271882032454147, | |
| "loss": 0.1601, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.13977472484111786, | |
| "learning_rate": 0.004268590737709972, | |
| "loss": 0.1293, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 0.02860305644571781, | |
| "learning_rate": 0.004265293295060692, | |
| "loss": 0.1345, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.03978782147169113, | |
| "learning_rate": 0.004261989715968746, | |
| "loss": 0.1283, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.13351576030254364, | |
| "learning_rate": 0.004258680011917905, | |
| "loss": 0.145, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.10464897006750107, | |
| "learning_rate": 0.004255364194413231, | |
| "loss": 0.1445, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.1459510177373886, | |
| "learning_rate": 0.00425204227498104, | |
| "loss": 0.139, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.01213156245648861, | |
| "learning_rate": 0.004248714265168853, | |
| "loss": 0.1396, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 0.3408389389514923, | |
| "learning_rate": 0.004245380176545369, | |
| "loss": 0.1747, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.22795382142066956, | |
| "learning_rate": 0.004242040020700413, | |
| "loss": 0.1561, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.1614357829093933, | |
| "learning_rate": 0.004238693809244904, | |
| "loss": 0.1425, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.11894867569208145, | |
| "learning_rate": 0.004235341553810807, | |
| "loss": 0.1422, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.14360809326171875, | |
| "learning_rate": 0.004231983266051104, | |
| "loss": 0.1434, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 0.2520577907562256, | |
| "learning_rate": 0.004228618957639738, | |
| "loss": 0.1649, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.0031091556884348392, | |
| "learning_rate": 0.004225248640271587, | |
| "loss": 0.1168, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.2224978506565094, | |
| "learning_rate": 0.0042218723256624135, | |
| "loss": 0.1546, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.044364750385284424, | |
| "learning_rate": 0.00421849002554883, | |
| "loss": 0.142, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.40695396065711975, | |
| "learning_rate": 0.004215101751688253, | |
| "loss": 0.1561, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.021591413766145706, | |
| "learning_rate": 0.004211707515858866, | |
| "loss": 0.1415, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 0.0725998729467392, | |
| "learning_rate": 0.0042083073298595786, | |
| "loss": 0.1405, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.06313532590866089, | |
| "learning_rate": 0.004204901205509981, | |
| "loss": 0.1388, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.22441565990447998, | |
| "learning_rate": 0.00420148915465031, | |
| "loss": 0.1497, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.20351390540599823, | |
| "learning_rate": 0.004198071189141399, | |
| "loss": 0.1388, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.021423619240522385, | |
| "learning_rate": 0.004194647320864647, | |
| "loss": 0.1378, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.017314398661255836, | |
| "learning_rate": 0.004191217561721966, | |
| "loss": 0.1392, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.051026392728090286, | |
| "learning_rate": 0.004187781923635753, | |
| "loss": 0.1423, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.229749858379364, | |
| "learning_rate": 0.004184340418548835, | |
| "loss": 0.1335, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.2312706857919693, | |
| "learning_rate": 0.004180893058424435, | |
| "loss": 0.1613, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.3830375075340271, | |
| "learning_rate": 0.0041774398552461315, | |
| "loss": 0.1757, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.06416508555412292, | |
| "learning_rate": 0.004173980821017812, | |
| "loss": 0.1405, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 0.1635216921567917, | |
| "learning_rate": 0.004170515967763634, | |
| "loss": 0.1471, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.13280948996543884, | |
| "learning_rate": 0.0041670453075279825, | |
| "loss": 0.145, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.04049176722764969, | |
| "learning_rate": 0.004163568852375431, | |
| "loss": 0.1401, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.0036700742784887552, | |
| "learning_rate": 0.004160086614390695, | |
| "loss": 0.1372, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.038986846804618835, | |
| "learning_rate": 0.004156598605678591, | |
| "loss": 0.1348, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.006637162528932095, | |
| "learning_rate": 0.004153104838363997, | |
| "loss": 0.1374, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 0.2121923714876175, | |
| "learning_rate": 0.00414960532459181, | |
| "loss": 0.1421, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.206923246383667, | |
| "learning_rate": 0.0041461000765269, | |
| "loss": 0.1484, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.1175004169344902, | |
| "learning_rate": 0.004142589106354071, | |
| "loss": 0.1399, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.17498064041137695, | |
| "learning_rate": 0.004139072426278021, | |
| "loss": 0.1316, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.07634201645851135, | |
| "learning_rate": 0.004135550048523292, | |
| "loss": 0.1417, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 0.16571657359600067, | |
| "learning_rate": 0.004132021985334235, | |
| "loss": 0.1455, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.015314368531107903, | |
| "learning_rate": 0.004128488248974962, | |
| "loss": 0.1112, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.20316685736179352, | |
| "learning_rate": 0.004124948851729309, | |
| "loss": 0.1488, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.014201296493411064, | |
| "learning_rate": 0.004121403805900789, | |
| "loss": 0.1216, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.19993703067302704, | |
| "learning_rate": 0.004117853123812549, | |
| "loss": 0.1512, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.2707497775554657, | |
| "learning_rate": 0.00411429681780733, | |
| "loss": 0.139, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.37889397144317627, | |
| "learning_rate": 0.0041107349002474204, | |
| "loss": 0.1834, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.2253105640411377, | |
| "learning_rate": 0.0041071673835146195, | |
| "loss": 0.1614, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.4593866467475891, | |
| "learning_rate": 0.004103594280010186, | |
| "loss": 0.2019, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.043750498443841934, | |
| "learning_rate": 0.004100015602154802, | |
| "loss": 0.1394, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.11412761360406876, | |
| "learning_rate": 0.004096431362388525, | |
| "loss": 0.1423, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.28491342067718506, | |
| "learning_rate": 0.004092841573170748, | |
| "loss": 0.17, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 0.47304004430770874, | |
| "learning_rate": 0.004089246246980154, | |
| "loss": 0.2082, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.1522788107395172, | |
| "learning_rate": 0.0040856453963146735, | |
| "loss": 0.1447, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.16730143129825592, | |
| "learning_rate": 0.00408203903369144, | |
| "loss": 0.1478, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.43677958846092224, | |
| "learning_rate": 0.0040784271716467506, | |
| "loss": 0.2072, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.4553922414779663, | |
| "learning_rate": 0.004074809822736015, | |
| "loss": 0.2006, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 0.016633370891213417, | |
| "learning_rate": 0.00407118699953372, | |
| "loss": 0.1424, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.055445339530706406, | |
| "learning_rate": 0.004067558714633378, | |
| "loss": 0.135, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.4653269052505493, | |
| "learning_rate": 0.004063924980647492, | |
| "loss": 0.2187, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.16954103112220764, | |
| "learning_rate": 0.004060285810207503, | |
| "loss": 0.1444, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.023942217230796814, | |
| "learning_rate": 0.004056641215963751, | |
| "loss": 0.1225, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.09879268705844879, | |
| "learning_rate": 0.00405299121058543, | |
| "loss": 0.1279, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 0.266255646944046, | |
| "learning_rate": 0.004049335806760545, | |
| "loss": 0.1514, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.11469101160764694, | |
| "learning_rate": 0.004045675017195866, | |
| "loss": 0.1424, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.20512178540229797, | |
| "learning_rate": 0.004042008854616883, | |
| "loss": 0.1594, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.2791603207588196, | |
| "learning_rate": 0.004038337331767768, | |
| "loss": 0.1638, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.08956770598888397, | |
| "learning_rate": 0.004034660461411321, | |
| "loss": 0.1403, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.03588682785630226, | |
| "learning_rate": 0.004030978256328936, | |
| "loss": 0.1339, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.25951525568962097, | |
| "learning_rate": 0.004027290729320545, | |
| "loss": 0.1654, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.2677713632583618, | |
| "learning_rate": 0.004023597893204586, | |
| "loss": 0.1663, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.10936828702688217, | |
| "learning_rate": 0.004019899760817948, | |
| "loss": 0.1314, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.16409318149089813, | |
| "learning_rate": 0.004016196345015933, | |
| "loss": 0.1381, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.12992282211780548, | |
| "learning_rate": 0.00401248765867221, | |
| "loss": 0.1297, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 0.14716273546218872, | |
| "learning_rate": 0.004008773714678766, | |
| "loss": 0.1468, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.07979045063257217, | |
| "learning_rate": 0.004005054525945865, | |
| "loss": 0.1406, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.007667996920645237, | |
| "learning_rate": 0.004001330105402006, | |
| "loss": 0.1337, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.30883529782295227, | |
| "learning_rate": 0.0039976004659938716, | |
| "loss": 0.157, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.31749460101127625, | |
| "learning_rate": 0.0039938656206862854, | |
| "loss": 0.1493, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.05762209743261337, | |
| "learning_rate": 0.00399012558246217, | |
| "loss": 0.1385, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 0.14837181568145752, | |
| "learning_rate": 0.003986380364322498, | |
| "loss": 0.1375, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.07137307524681091, | |
| "learning_rate": 0.003982629979286247, | |
| "loss": 0.1422, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.07783070206642151, | |
| "learning_rate": 0.003978874440390361, | |
| "loss": 0.1287, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.06229304522275925, | |
| "learning_rate": 0.003975113760689691, | |
| "loss": 0.1328, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.0366533026099205, | |
| "learning_rate": 0.003971347953256965, | |
| "loss": 0.1174, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.272078275680542, | |
| "learning_rate": 0.003967577031182733, | |
| "loss": 0.1629, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 0.15681308507919312, | |
| "learning_rate": 0.003963801007575327, | |
| "loss": 0.1505, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.03374667838215828, | |
| "learning_rate": 0.003960019895560808, | |
| "loss": 0.1395, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.660275936126709, | |
| "learning_rate": 0.0039562337082829305, | |
| "loss": 0.1399, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.34664130210876465, | |
| "learning_rate": 0.003952442458903087, | |
| "loss": 0.1449, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.46702539920806885, | |
| "learning_rate": 0.003948646160600268, | |
| "loss": 0.1969, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 0.06642913073301315, | |
| "learning_rate": 0.003944844826571018, | |
| "loss": 0.1387, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.25087377429008484, | |
| "learning_rate": 0.003941038470029382, | |
| "loss": 0.164, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.16770517826080322, | |
| "learning_rate": 0.003937227104206865, | |
| "loss": 0.1539, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.1529918909072876, | |
| "learning_rate": 0.003933410742352388, | |
| "loss": 0.1451, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.18149301409721375, | |
| "learning_rate": 0.003929589397732236, | |
| "loss": 0.135, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.27569884061813354, | |
| "learning_rate": 0.003925763083630017, | |
| "loss": 0.1421, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 0.2581075131893158, | |
| "learning_rate": 0.003921931813346611, | |
| "loss": 0.162, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.33019861578941345, | |
| "learning_rate": 0.003918095600200127, | |
| "loss": 0.1779, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.263822078704834, | |
| "learning_rate": 0.003914254457525862, | |
| "loss": 0.1634, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.17741429805755615, | |
| "learning_rate": 0.003910408398676239, | |
| "loss": 0.1335, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.0159847941249609, | |
| "learning_rate": 0.003906557437020779, | |
| "loss": 0.1361, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.21829059720039368, | |
| "learning_rate": 0.0039027015859460397, | |
| "loss": 0.1635, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 0.0067328461445868015, | |
| "learning_rate": 0.00389884085885558, | |
| "loss": 0.1327, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.056643418967723846, | |
| "learning_rate": 0.0038949752691699057, | |
| "loss": 0.1381, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.12431513518095016, | |
| "learning_rate": 0.0038911048303264272, | |
| "loss": 0.1345, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.21443720161914825, | |
| "learning_rate": 0.00388722955577941, | |
| "loss": 0.1189, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.10018088668584824, | |
| "learning_rate": 0.003883349458999931, | |
| "loss": 0.1354, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 0.01695556379854679, | |
| "learning_rate": 0.0038794645534758277, | |
| "loss": 0.1092, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.12418782711029053, | |
| "learning_rate": 0.003875574852711656, | |
| "loss": 0.1374, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.31890037655830383, | |
| "learning_rate": 0.003871680370228639, | |
| "loss": 0.1865, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.18868288397789001, | |
| "learning_rate": 0.003867781119564623, | |
| "loss": 0.1489, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.14611639082431793, | |
| "learning_rate": 0.0038638771142740294, | |
| "loss": 0.1318, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.7513630390167236, | |
| "learning_rate": 0.003859968367927805, | |
| "loss": 0.2047, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.4710327386856079, | |
| "learning_rate": 0.0038560548941133812, | |
| "loss": 0.2374, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.2790408134460449, | |
| "learning_rate": 0.003852136706434619, | |
| "loss": 0.1734, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.13020950555801392, | |
| "learning_rate": 0.003848213818511769, | |
| "loss": 0.1434, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.3076300323009491, | |
| "learning_rate": 0.0038442862439814177, | |
| "loss": 0.1752, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.4267752766609192, | |
| "learning_rate": 0.003840353996496444, | |
| "loss": 0.198, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.13199670612812042, | |
| "learning_rate": 0.003836417089725971, | |
| "loss": 0.1439, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 0.26677653193473816, | |
| "learning_rate": 0.0038324755373553188, | |
| "loss": 0.1516, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.24929864704608917, | |
| "learning_rate": 0.0038285293530859557, | |
| "loss": 0.152, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.06707077473402023, | |
| "learning_rate": 0.003824578550635451, | |
| "loss": 0.1344, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.10363825410604477, | |
| "learning_rate": 0.0038206231437374273, | |
| "loss": 0.1385, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.31158074736595154, | |
| "learning_rate": 0.003816663146141514, | |
| "loss": 0.1318, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 0.1382443606853485, | |
| "learning_rate": 0.0038126985716132977, | |
| "loss": 0.1431, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.5671953558921814, | |
| "learning_rate": 0.0038087294339342764, | |
| "loss": 0.2439, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.19111286103725433, | |
| "learning_rate": 0.003804755746901808, | |
| "loss": 0.153, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.13765619695186615, | |
| "learning_rate": 0.0038007775243290667, | |
| "loss": 0.1409, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.06567953526973724, | |
| "learning_rate": 0.003796794780044992, | |
| "loss": 0.1404, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.2712881863117218, | |
| "learning_rate": 0.003792807527894242, | |
| "loss": 0.1616, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 0.17540942132472992, | |
| "learning_rate": 0.0037888157817371456, | |
| "loss": 0.154, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.14081189036369324, | |
| "learning_rate": 0.003784819555449651, | |
| "loss": 0.1415, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.20738956332206726, | |
| "learning_rate": 0.0037808188629232836, | |
| "loss": 0.1504, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.19751325249671936, | |
| "learning_rate": 0.0037768137180650913, | |
| "loss": 0.1516, | |
| "step": 565 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1695, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 565, | |
| "total_flos": 5.169945694856806e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
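
The object above is a Hugging Face `Trainer` state checkpoint: each `log_history` entry carrying a `loss` key is a per-step training log (here `logging_steps` is 1), entries carrying `eval_loss` are periodic evaluations, and the trailer records that this state was saved at step 565 of `max_steps` 1695, i.e. at `save_steps: 565`, the end of the first of `num_train_epochs: 3`. As a minimal reading aid, the sketch below separates the train and eval entries and summarizes the run; it assumes the JSON is saved locally under the usual filename `trainer_state.json` (an assumption, not stated in the file) and uses only the Python standard library.

```python
# Minimal sketch: summarize a Hugging Face Trainer state file.
# Assumption: the JSON above is saved as "trainer_state.json".
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

last = train_logs[-1]
print(f"logged {len(train_logs)} train steps of max_steps={state['max_steps']}")
print(f"final train loss {last['loss']:.4f} at step {last['step']}")

if eval_logs:
    latest = eval_logs[-1]
    print(f"latest eval_loss {latest['eval_loss']:.4f} at step {latest['step']}")
```

On this file the summary would report a final train loss of 0.1516 at step 565, and the most recent evaluation shown logs an eval_loss of about 0.1506 at step 426, consistent with the per-step losses in this span hovering in roughly the 0.11 to 0.25 range.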