{
  "best_metric": 1.4878435134887695,
  "best_model_checkpoint": "lora_lr_pad/mistralai/Mistral-7B-Instruct-v0.2/unaligned/checkpoint-500",
  "epoch": 0.655150351887396,
  "eval_steps": 20,
  "global_step": 512,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.0012795905310300703, "grad_norm": 1.3359375, "learning_rate": 2.0000000000000003e-06, "loss": 3.2562, "step": 1 },
    { "epoch": 0.0025591810620601407, "grad_norm": 1.3671875, "learning_rate": 4.000000000000001e-06, "loss": 3.152, "step": 2 },
    { "epoch": 0.003838771593090211, "grad_norm": 1.2734375, "learning_rate": 6e-06, "loss": 3.101, "step": 3 },
    { "epoch": 0.005118362124120281, "grad_norm": 1.421875, "learning_rate": 8.000000000000001e-06, "loss": 3.2665, "step": 4 },
    { "epoch": 0.006397952655150352, "grad_norm": 1.375, "learning_rate": 1e-05, "loss": 3.2401, "step": 5 },
    { "epoch": 0.007677543186180422, "grad_norm": 1.3046875, "learning_rate": 1.2e-05, "loss": 3.1574, "step": 6 },
    { "epoch": 0.008957133717210493, "grad_norm": 1.3515625, "learning_rate": 1.4000000000000001e-05, "loss": 3.1197, "step": 7 },
    { "epoch": 0.010236724248240563, "grad_norm": 1.4140625, "learning_rate": 1.6000000000000003e-05, "loss": 3.2179, "step": 8 },
    { "epoch": 0.011516314779270634, "grad_norm": 1.421875, "learning_rate": 1.8e-05, "loss": 3.2687, "step": 9 },
    { "epoch": 0.012795905310300703, "grad_norm": 1.4609375, "learning_rate": 2e-05, "loss": 3.2973, "step": 10 },
    { "epoch": 0.014075495841330775, "grad_norm": 1.3359375, "learning_rate": 2.2000000000000003e-05, "loss": 3.1228, "step": 11 },
    { "epoch": 0.015355086372360844, "grad_norm": 1.234375, "learning_rate": 2.4e-05, "loss": 3.0316, "step": 12 },
    { "epoch": 0.016634676903390915, "grad_norm": 1.4765625, "learning_rate": 2.6000000000000002e-05, "loss": 3.2293, "step": 13 },
    { "epoch": 0.017914267434420986, "grad_norm": 1.4375, "learning_rate": 2.8000000000000003e-05, "loss": 3.2405, "step": 14 },
    { "epoch": 0.019193857965451054, "grad_norm": 1.453125, "learning_rate": 3e-05, "loss": 3.217, "step": 15 },
    { "epoch": 0.020473448496481125, "grad_norm": 1.375, "learning_rate": 3.2000000000000005e-05, "loss": 3.092, "step": 16 },
    { "epoch": 0.021753039027511197, "grad_norm": 1.59375, "learning_rate": 3.4000000000000007e-05, "loss": 3.2057, "step": 17 },
    { "epoch": 0.023032629558541268, "grad_norm": 1.3984375, "learning_rate": 3.6e-05, "loss": 2.9932, "step": 18 },
    { "epoch": 0.02431222008957134, "grad_norm": 1.546875, "learning_rate": 3.8e-05, "loss": 3.0673, "step": 19 },
    { "epoch": 0.025591810620601407, "grad_norm": 1.5703125, "learning_rate": 4e-05, "loss": 2.9652, "step": 20 },
    { "epoch": 0.025591810620601407, "eval_loss": 2.928436756134033, "eval_runtime": 103.8047, "eval_samples_per_second": 48.167, "eval_steps_per_second": 1.512, "step": 20 },
    { "epoch": 0.026871401151631478, "grad_norm": 1.6875, "learning_rate": 4.2e-05, "loss": 2.9173, "step": 21 },
    { "epoch": 0.02815099168266155, "grad_norm": 1.7890625, "learning_rate": 4.4000000000000006e-05, "loss": 2.9455, "step": 22 },
    { "epoch": 0.02943058221369162, "grad_norm": 1.7421875, "learning_rate": 4.600000000000001e-05, "loss": 2.909, "step": 23 },
    { "epoch": 0.030710172744721688, "grad_norm": 1.7734375, "learning_rate": 4.8e-05, "loss": 2.8191, "step": 24 },
    { "epoch": 0.03198976327575176, "grad_norm": 1.8515625, "learning_rate": 5e-05, "loss": 2.7853, "step": 25 },
    { "epoch": 0.03326935380678183, "grad_norm": 1.7890625, "learning_rate": 5.2000000000000004e-05, "loss": 2.7678, "step": 26 },
    { "epoch": 0.0345489443378119, "grad_norm": 1.7890625, "learning_rate": 5.4000000000000005e-05, "loss": 2.6028, "step": 27 },
    { "epoch": 0.03582853486884197, "grad_norm": 1.8046875, "learning_rate": 5.6000000000000006e-05, "loss": 2.6369, "step": 28 },
    { "epoch": 0.037108125399872044, "grad_norm": 1.8984375, "learning_rate": 5.8e-05, "loss": 2.6158, "step": 29 },
    { "epoch": 0.03838771593090211, "grad_norm": 1.59375, "learning_rate": 6e-05, "loss": 2.2824, "step": 30 },
    { "epoch": 0.03966730646193218, "grad_norm": 1.6796875, "learning_rate": 6.2e-05, "loss": 2.4286, "step": 31 },
    { "epoch": 0.04094689699296225, "grad_norm": 1.5625, "learning_rate": 6.400000000000001e-05, "loss": 2.2919, "step": 32 },
    { "epoch": 0.04222648752399232, "grad_norm": 1.546875, "learning_rate": 6.6e-05, "loss": 2.2725, "step": 33 },
    { "epoch": 0.04350607805502239, "grad_norm": 1.484375, "learning_rate": 6.800000000000001e-05, "loss": 2.202, "step": 34 },
    { "epoch": 0.044785668586052464, "grad_norm": 1.375, "learning_rate": 7e-05, "loss": 2.1415, "step": 35 },
    { "epoch": 0.046065259117082535, "grad_norm": 1.328125, "learning_rate": 7.2e-05, "loss": 2.0692, "step": 36 },
    { "epoch": 0.04734484964811261, "grad_norm": 1.2734375, "learning_rate": 7.4e-05, "loss": 2.1186, "step": 37 },
    { "epoch": 0.04862444017914268, "grad_norm": 1.046875, "learning_rate": 7.6e-05, "loss": 1.9482, "step": 38 },
    { "epoch": 0.04990403071017274, "grad_norm": 0.88671875, "learning_rate": 7.800000000000001e-05, "loss": 1.8985, "step": 39 },
    { "epoch": 0.05118362124120281, "grad_norm": 0.7421875, "learning_rate": 8e-05, "loss": 1.844, "step": 40 },
    { "epoch": 0.05118362124120281, "eval_loss": 1.8063277006149292, "eval_runtime": 103.8793, "eval_samples_per_second": 48.133, "eval_steps_per_second": 1.511, "step": 40 },
    { "epoch": 0.052463211772232884, "grad_norm": 0.6015625, "learning_rate": 8.2e-05, "loss": 1.8771, "step": 41 },
    { "epoch": 0.053742802303262956, "grad_norm": 0.515625, "learning_rate": 8.4e-05, "loss": 1.7902, "step": 42 },
    { "epoch": 0.05502239283429303, "grad_norm": 0.45703125, "learning_rate": 8.6e-05, "loss": 1.7946, "step": 43 },
    { "epoch": 0.0563019833653231, "grad_norm": 0.458984375, "learning_rate": 8.800000000000001e-05, "loss": 1.751, "step": 44 },
    { "epoch": 0.05758157389635317, "grad_norm": 0.52734375, "learning_rate": 9e-05, "loss": 1.7537, "step": 45 },
    { "epoch": 0.05886116442738324, "grad_norm": 0.4765625, "learning_rate": 9.200000000000001e-05, "loss": 1.7619, "step": 46 },
    { "epoch": 0.060140754958413305, "grad_norm": 0.5546875, "learning_rate": 9.4e-05, "loss": 1.7668, "step": 47 },
    { "epoch": 0.061420345489443376, "grad_norm": 0.625, "learning_rate": 9.6e-05, "loss": 1.7556, "step": 48 },
    { "epoch": 0.06269993602047345, "grad_norm": 0.75390625, "learning_rate": 9.8e-05, "loss": 1.7158, "step": 49 },
    { "epoch": 0.06397952655150352, "grad_norm": 0.8828125, "learning_rate": 0.0001, "loss": 1.7301, "step": 50 },
    { "epoch": 0.06525911708253358, "grad_norm": 0.9921875, "learning_rate": 0.00010200000000000001, "loss": 1.7979, "step": 51 },
    { "epoch": 0.06653870761356366, "grad_norm": 0.84765625, "learning_rate": 0.00010400000000000001, "loss": 1.6356, "step": 52 },
    { "epoch": 0.06781829814459372, "grad_norm": 0.494140625, "learning_rate": 0.00010600000000000002, "loss": 1.6094, "step": 53 },
    { "epoch": 0.0690978886756238, "grad_norm": 0.48046875, "learning_rate": 0.00010800000000000001, "loss": 1.6467, "step": 54 },
    { "epoch": 0.07037747920665387, "grad_norm": 0.44140625, "learning_rate": 0.00011000000000000002, "loss": 1.576, "step": 55 },
    { "epoch": 0.07165706973768395, "grad_norm": 0.384765625, "learning_rate": 0.00011200000000000001, "loss": 1.6307, "step": 56 },
    { "epoch": 0.07293666026871401, "grad_norm": 0.375, "learning_rate": 0.00011399999999999999, "loss": 1.6133, "step": 57 },
    { "epoch": 0.07421625079974409, "grad_norm": 0.33984375, "learning_rate": 0.000116, "loss": 1.6575, "step": 58 },
    { "epoch": 0.07549584133077415, "grad_norm": 0.310546875, "learning_rate": 0.000118, "loss": 1.5782, "step": 59 },
    { "epoch": 0.07677543186180422, "grad_norm": 0.29296875, "learning_rate": 0.00012, "loss": 1.6386, "step": 60 },
    { "epoch": 0.07677543186180422, "eval_loss": 1.5920685529708862, "eval_runtime": 103.8813, "eval_samples_per_second": 48.132, "eval_steps_per_second": 1.511, "step": 60 },
    { "epoch": 0.0780550223928343, "grad_norm": 0.2890625, "learning_rate": 0.000122, "loss": 1.5949, "step": 61 },
    { "epoch": 0.07933461292386436, "grad_norm": 0.28125, "learning_rate": 0.000124, "loss": 1.6136, "step": 62 },
    { "epoch": 0.08061420345489444, "grad_norm": 0.2890625, "learning_rate": 0.000126, "loss": 1.6135, "step": 63 },
    { "epoch": 0.0818937939859245, "grad_norm": 0.287109375, "learning_rate": 0.00012800000000000002, "loss": 1.5579, "step": 64 },
    { "epoch": 0.08317338451695458, "grad_norm": 0.306640625, "learning_rate": 0.00013000000000000002, "loss": 1.6174, "step": 65 },
    { "epoch": 0.08445297504798464, "grad_norm": 0.28125, "learning_rate": 0.000132, "loss": 1.6687, "step": 66 },
    { "epoch": 0.08573256557901472, "grad_norm": 0.30078125, "learning_rate": 0.000134, "loss": 1.604, "step": 67 },
    { "epoch": 0.08701215611004479, "grad_norm": 0.3125, "learning_rate": 0.00013600000000000003, "loss": 1.5936, "step": 68 },
    { "epoch": 0.08829174664107485, "grad_norm": 0.27734375, "learning_rate": 0.000138, "loss": 1.5744, "step": 69 },
    { "epoch": 0.08957133717210493, "grad_norm": 0.275390625, "learning_rate": 0.00014, "loss": 1.5875, "step": 70 },
    { "epoch": 0.09085092770313499, "grad_norm": 0.283203125, "learning_rate": 0.000142, "loss": 1.5938, "step": 71 },
    { "epoch": 0.09213051823416507, "grad_norm": 0.291015625, "learning_rate": 0.000144, "loss": 1.5795, "step": 72 },
    { "epoch": 0.09341010876519514, "grad_norm": 0.2578125, "learning_rate": 0.000146, "loss": 1.5642, "step": 73 },
    { "epoch": 0.09468969929622521, "grad_norm": 0.255859375, "learning_rate": 0.000148, "loss": 1.6276, "step": 74 },
    { "epoch": 0.09596928982725528, "grad_norm": 0.259765625, "learning_rate": 0.00015000000000000001, "loss": 1.6222, "step": 75 },
    { "epoch": 0.09724888035828536, "grad_norm": 0.232421875, "learning_rate": 0.000152, "loss": 1.5487, "step": 76 },
    { "epoch": 0.09852847088931542, "grad_norm": 0.259765625, "learning_rate": 0.000154, "loss": 1.6183, "step": 77 },
    { "epoch": 0.09980806142034548, "grad_norm": 0.228515625, "learning_rate": 0.00015600000000000002, "loss": 1.5813, "step": 78 },
    { "epoch": 0.10108765195137556, "grad_norm": 0.25, "learning_rate": 0.00015800000000000002, "loss": 1.5332, "step": 79 },
    { "epoch": 0.10236724248240563, "grad_norm": 0.255859375, "learning_rate": 0.00016, "loss": 1.5553, "step": 80 },
    { "epoch": 0.10236724248240563, "eval_loss": 1.5541130304336548, "eval_runtime": 103.8472, "eval_samples_per_second": 48.148, "eval_steps_per_second": 1.512, "step": 80 },
    { "epoch": 0.1036468330134357, "grad_norm": 0.248046875, "learning_rate": 0.000162, "loss": 1.5697, "step": 81 },
    { "epoch": 0.10492642354446577, "grad_norm": 0.251953125, "learning_rate": 0.000164, "loss": 1.5474, "step": 82 },
    { "epoch": 0.10620601407549585, "grad_norm": 0.2412109375, "learning_rate": 0.000166, "loss": 1.5834, "step": 83 },
    { "epoch": 0.10748560460652591, "grad_norm": 0.2294921875, "learning_rate": 0.000168, "loss": 1.542, "step": 84 },
    { "epoch": 0.10876519513755598, "grad_norm": 0.2265625, "learning_rate": 0.00017, "loss": 1.5841, "step": 85 },
    { "epoch": 0.11004478566858605, "grad_norm": 0.25390625, "learning_rate": 0.000172, "loss": 1.515, "step": 86 },
    { "epoch": 0.11132437619961612, "grad_norm": 0.244140625, "learning_rate": 0.000174, "loss": 1.5565, "step": 87 },
    { "epoch": 0.1126039667306462, "grad_norm": 0.2197265625, "learning_rate": 0.00017600000000000002, "loss": 1.6133, "step": 88 },
    { "epoch": 0.11388355726167626, "grad_norm": 0.25, "learning_rate": 0.00017800000000000002, "loss": 1.5476, "step": 89 },
    { "epoch": 0.11516314779270634, "grad_norm": 0.25, "learning_rate": 0.00018, "loss": 1.5212, "step": 90 },
    { "epoch": 0.1164427383237364, "grad_norm": 0.2216796875, "learning_rate": 0.000182, "loss": 1.5628, "step": 91 },
    { "epoch": 0.11772232885476648, "grad_norm": 0.2353515625, "learning_rate": 0.00018400000000000003, "loss": 1.4946, "step": 92 },
    { "epoch": 0.11900191938579655, "grad_norm": 0.2255859375, "learning_rate": 0.00018600000000000002, "loss": 1.6303, "step": 93 },
    { "epoch": 0.12028150991682661, "grad_norm": 0.24609375, "learning_rate": 0.000188, "loss": 1.5155, "step": 94 },
    { "epoch": 0.12156110044785669, "grad_norm": 0.220703125, "learning_rate": 0.00019, "loss": 1.5652, "step": 95 },
    { "epoch": 0.12284069097888675, "grad_norm": 0.2236328125, "learning_rate": 0.000192, "loss": 1.6042, "step": 96 },
    { "epoch": 0.12412028150991683, "grad_norm": 0.263671875, "learning_rate": 0.000194, "loss": 1.5697, "step": 97 },
    { "epoch": 0.1253998720409469, "grad_norm": 0.2294921875, "learning_rate": 0.000196, "loss": 1.4785, "step": 98 },
    { "epoch": 0.12667946257197696, "grad_norm": 0.2236328125, "learning_rate": 0.00019800000000000002, "loss": 1.4921, "step": 99 },
    { "epoch": 0.12795905310300704, "grad_norm": 0.22265625, "learning_rate": 0.0002, "loss": 1.4985, "step": 100 },
    { "epoch": 0.12795905310300704, "eval_loss": 1.5341166257858276, "eval_runtime": 103.7833, "eval_samples_per_second": 48.177, "eval_steps_per_second": 1.513, "step": 100 },
    { "epoch": 0.12923864363403711, "grad_norm": 0.232421875, "learning_rate": 0.00019951456310679614, "loss": 1.494, "step": 101 },
    { "epoch": 0.13051823416506717, "grad_norm": 0.236328125, "learning_rate": 0.00019902912621359224, "loss": 1.5407, "step": 102 },
    { "epoch": 0.13179782469609724, "grad_norm": 0.236328125, "learning_rate": 0.00019854368932038837, "loss": 1.5755, "step": 103 },
    { "epoch": 0.13307741522712732, "grad_norm": 0.2216796875, "learning_rate": 0.00019805825242718447, "loss": 1.5491, "step": 104 },
    { "epoch": 0.1343570057581574, "grad_norm": 0.2421875, "learning_rate": 0.0001975728155339806, "loss": 1.5393, "step": 105 },
    { "epoch": 0.13563659628918745, "grad_norm": 0.2236328125, "learning_rate": 0.0001970873786407767, "loss": 1.5657, "step": 106 },
    { "epoch": 0.13691618682021753, "grad_norm": 0.2119140625, "learning_rate": 0.00019660194174757283, "loss": 1.5551, "step": 107 },
    { "epoch": 0.1381957773512476, "grad_norm": 0.240234375, "learning_rate": 0.00019611650485436895, "loss": 1.5408, "step": 108 },
    { "epoch": 0.13947536788227768, "grad_norm": 0.2373046875, "learning_rate": 0.00019563106796116505, "loss": 1.5375, "step": 109 },
    { "epoch": 0.14075495841330773, "grad_norm": 0.216796875, "learning_rate": 0.00019514563106796118, "loss": 1.5578, "step": 110 },
    { "epoch": 0.1420345489443378, "grad_norm": 0.240234375, "learning_rate": 0.00019466019417475728, "loss": 1.4969, "step": 111 },
    { "epoch": 0.1433141394753679, "grad_norm": 0.244140625, "learning_rate": 0.0001941747572815534, "loss": 1.5347, "step": 112 },
    { "epoch": 0.14459373000639794, "grad_norm": 0.287109375, "learning_rate": 0.00019368932038834954, "loss": 1.549, "step": 113 },
    { "epoch": 0.14587332053742802, "grad_norm": 0.2470703125, "learning_rate": 0.00019320388349514564, "loss": 1.5749, "step": 114 },
    { "epoch": 0.1471529110684581, "grad_norm": 0.2353515625, "learning_rate": 0.00019271844660194177, "loss": 1.5311, "step": 115 },
    { "epoch": 0.14843250159948818, "grad_norm": 0.2275390625, "learning_rate": 0.00019223300970873787, "loss": 1.5678, "step": 116 },
    { "epoch": 0.14971209213051823, "grad_norm": 0.259765625, "learning_rate": 0.000191747572815534, "loss": 1.616, "step": 117 },
    { "epoch": 0.1509916826615483, "grad_norm": 0.2158203125, "learning_rate": 0.0001912621359223301, "loss": 1.5449, "step": 118 },
    { "epoch": 0.15227127319257838, "grad_norm": 0.2373046875, "learning_rate": 0.00019077669902912623, "loss": 1.4975, "step": 119 },
    { "epoch": 0.15355086372360843, "grad_norm": 0.234375, "learning_rate": 0.00019029126213592236, "loss": 1.5631, "step": 120 },
    { "epoch": 0.15355086372360843, "eval_loss": 1.5212680101394653, "eval_runtime": 103.8151, "eval_samples_per_second": 48.163, "eval_steps_per_second": 1.512, "step": 120 },
    { "epoch": 0.1548304542546385, "grad_norm": 0.2177734375, "learning_rate": 0.00018980582524271846, "loss": 1.5534, "step": 121 },
    { "epoch": 0.1561100447856686, "grad_norm": 0.2578125, "learning_rate": 0.00018932038834951458, "loss": 1.5329, "step": 122 },
    { "epoch": 0.15738963531669867, "grad_norm": 0.2255859375, "learning_rate": 0.00018883495145631069, "loss": 1.5372, "step": 123 },
    { "epoch": 0.15866922584772872, "grad_norm": 0.26171875, "learning_rate": 0.00018834951456310681, "loss": 1.5373, "step": 124 },
    { "epoch": 0.1599488163787588, "grad_norm": 0.2421875, "learning_rate": 0.00018786407766990291, "loss": 1.5492, "step": 125 },
    { "epoch": 0.16122840690978887, "grad_norm": 0.255859375, "learning_rate": 0.00018737864077669904, "loss": 1.536, "step": 126 },
    { "epoch": 0.16250799744081892, "grad_norm": 0.236328125, "learning_rate": 0.00018689320388349517, "loss": 1.5719, "step": 127 },
    { "epoch": 0.163787587971849, "grad_norm": 0.248046875, "learning_rate": 0.00018640776699029127, "loss": 1.5101, "step": 128 },
    { "epoch": 0.16506717850287908, "grad_norm": 0.22265625, "learning_rate": 0.0001859223300970874, "loss": 1.5896, "step": 129 },
    { "epoch": 0.16634676903390916, "grad_norm": 0.220703125, "learning_rate": 0.0001854368932038835, "loss": 1.5791, "step": 130 },
    { "epoch": 0.1676263595649392, "grad_norm": 0.251953125, "learning_rate": 0.00018495145631067963, "loss": 1.5013, "step": 131 },
    { "epoch": 0.1689059500959693, "grad_norm": 0.2373046875, "learning_rate": 0.00018446601941747576, "loss": 1.5421, "step": 132 },
    { "epoch": 0.17018554062699937, "grad_norm": 0.2373046875, "learning_rate": 0.00018398058252427186, "loss": 1.5329, "step": 133 },
    { "epoch": 0.17146513115802944, "grad_norm": 0.2294921875, "learning_rate": 0.00018349514563106799, "loss": 1.5631, "step": 134 },
    { "epoch": 0.1727447216890595, "grad_norm": 0.25390625, "learning_rate": 0.0001830097087378641, "loss": 1.5278, "step": 135 },
    { "epoch": 0.17402431222008957, "grad_norm": 0.2431640625, "learning_rate": 0.00018252427184466022, "loss": 1.5548, "step": 136 },
    { "epoch": 0.17530390275111965, "grad_norm": 0.275390625, "learning_rate": 0.00018203883495145632, "loss": 1.5023, "step": 137 },
    { "epoch": 0.1765834932821497, "grad_norm": 0.2333984375, "learning_rate": 0.00018155339805825244, "loss": 1.5062, "step": 138 },
    { "epoch": 0.17786308381317978, "grad_norm": 0.265625, "learning_rate": 0.00018106796116504857, "loss": 1.5637, "step": 139 },
    { "epoch": 0.17914267434420986, "grad_norm": 0.255859375, "learning_rate": 0.00018058252427184467, "loss": 1.5201, "step": 140 },
    { "epoch": 0.17914267434420986, "eval_loss": 1.513644814491272, "eval_runtime": 103.8133, "eval_samples_per_second": 48.163, "eval_steps_per_second": 1.512, "step": 140 },
    { "epoch": 0.18042226487523993, "grad_norm": 0.2353515625, "learning_rate": 0.0001800970873786408, "loss": 1.5106, "step": 141 },
    { "epoch": 0.18170185540626999, "grad_norm": 0.23046875, "learning_rate": 0.0001796116504854369, "loss": 1.5403, "step": 142 },
    { "epoch": 0.18298144593730006, "grad_norm": 0.236328125, "learning_rate": 0.00017912621359223303, "loss": 1.5373, "step": 143 },
    { "epoch": 0.18426103646833014, "grad_norm": 0.2421875, "learning_rate": 0.00017864077669902913, "loss": 1.5697, "step": 144 },
    { "epoch": 0.1855406269993602, "grad_norm": 0.240234375, "learning_rate": 0.00017815533980582526, "loss": 1.5135, "step": 145 },
    { "epoch": 0.18682021753039027, "grad_norm": 0.228515625, "learning_rate": 0.0001776699029126214, "loss": 1.5446, "step": 146 },
    { "epoch": 0.18809980806142035, "grad_norm": 0.22265625, "learning_rate": 0.0001771844660194175, "loss": 1.5226, "step": 147 },
    { "epoch": 0.18937939859245043, "grad_norm": 0.2275390625, "learning_rate": 0.00017669902912621362, "loss": 1.4343, "step": 148 },
    { "epoch": 0.19065898912348048, "grad_norm": 0.232421875, "learning_rate": 0.00017621359223300972, "loss": 1.5509, "step": 149 },
    { "epoch": 0.19193857965451055, "grad_norm": 0.234375, "learning_rate": 0.00017572815533980585, "loss": 1.5397, "step": 150 },
    { "epoch": 0.19321817018554063, "grad_norm": 0.26171875, "learning_rate": 0.00017524271844660195, "loss": 1.4796, "step": 151 },
    { "epoch": 0.1944977607165707, "grad_norm": 0.2470703125, "learning_rate": 0.00017475728155339805, "loss": 1.5855, "step": 152 },
    { "epoch": 0.19577735124760076, "grad_norm": 0.2255859375, "learning_rate": 0.00017427184466019418, "loss": 1.5221, "step": 153 },
    { "epoch": 0.19705694177863084, "grad_norm": 0.265625, "learning_rate": 0.00017378640776699028, "loss": 1.4963, "step": 154 },
    { "epoch": 0.19833653230966092, "grad_norm": 0.236328125, "learning_rate": 0.0001733009708737864, "loss": 1.5379, "step": 155 },
    { "epoch": 0.19961612284069097, "grad_norm": 0.220703125, "learning_rate": 0.00017281553398058253, "loss": 1.4701, "step": 156 },
    { "epoch": 0.20089571337172105, "grad_norm": 0.2255859375, "learning_rate": 0.00017233009708737864, "loss": 1.5133, "step": 157 },
    { "epoch": 0.20217530390275112, "grad_norm": 0.251953125, "learning_rate": 0.00017184466019417476, "loss": 1.5238, "step": 158 },
    { "epoch": 0.2034548944337812, "grad_norm": 0.2490234375, "learning_rate": 0.00017135922330097086, "loss": 1.5361, "step": 159 },
    { "epoch": 0.20473448496481125, "grad_norm": 0.232421875, "learning_rate": 0.000170873786407767, "loss": 1.4585, "step": 160 },
    { "epoch": 0.20473448496481125, "eval_loss": 1.5087493658065796, "eval_runtime": 103.8289, "eval_samples_per_second": 48.156, "eval_steps_per_second": 1.512, "step": 160 },
    { "epoch": 0.20601407549584133, "grad_norm": 0.2412109375, "learning_rate": 0.0001703883495145631, "loss": 1.5574, "step": 161 },
    { "epoch": 0.2072936660268714, "grad_norm": 0.2255859375, "learning_rate": 0.00016990291262135922, "loss": 1.4938, "step": 162 },
    { "epoch": 0.20857325655790146, "grad_norm": 0.2431640625, "learning_rate": 0.00016941747572815535, "loss": 1.5307, "step": 163 },
    { "epoch": 0.20985284708893154, "grad_norm": 0.23828125, "learning_rate": 0.00016893203883495145, "loss": 1.4849, "step": 164 },
    { "epoch": 0.21113243761996162, "grad_norm": 0.25390625, "learning_rate": 0.00016844660194174758, "loss": 1.4399, "step": 165 },
    { "epoch": 0.2124120281509917, "grad_norm": 0.2578125, "learning_rate": 0.00016796116504854368, "loss": 1.512, "step": 166 },
    { "epoch": 0.21369161868202174, "grad_norm": 0.263671875, "learning_rate": 0.0001674757281553398, "loss": 1.5586, "step": 167 },
    { "epoch": 0.21497120921305182, "grad_norm": 0.2412109375, "learning_rate": 0.00016699029126213594, "loss": 1.5673, "step": 168 },
    { "epoch": 0.2162507997440819, "grad_norm": 0.2392578125, "learning_rate": 0.00016650485436893204, "loss": 1.4893, "step": 169 },
    { "epoch": 0.21753039027511195, "grad_norm": 0.244140625, "learning_rate": 0.00016601941747572817, "loss": 1.5885, "step": 170 },
    { "epoch": 0.21880998080614203, "grad_norm": 0.275390625, "learning_rate": 0.00016553398058252427, "loss": 1.5318, "step": 171 },
    { "epoch": 0.2200895713371721, "grad_norm": 0.2392578125, "learning_rate": 0.0001650485436893204, "loss": 1.4523, "step": 172 },
    { "epoch": 0.22136916186820219, "grad_norm": 0.255859375, "learning_rate": 0.0001645631067961165, "loss": 1.5486, "step": 173 },
    { "epoch": 0.22264875239923224, "grad_norm": 0.234375, "learning_rate": 0.00016407766990291262, "loss": 1.4989, "step": 174 },
    { "epoch": 0.22392834293026231, "grad_norm": 0.2421875, "learning_rate": 0.00016359223300970875, "loss": 1.5556, "step": 175 },
    { "epoch": 0.2252079334612924, "grad_norm": 0.232421875, "learning_rate": 0.00016310679611650485, "loss": 1.545, "step": 176 },
    { "epoch": 0.22648752399232247, "grad_norm": 0.2470703125, "learning_rate": 0.00016262135922330098, "loss": 1.4939, "step": 177 },
    { "epoch": 0.22776711452335252, "grad_norm": 0.259765625, "learning_rate": 0.00016213592233009708, "loss": 1.4768, "step": 178 },
    { "epoch": 0.2290467050543826, "grad_norm": 0.2578125, "learning_rate": 0.0001616504854368932, "loss": 1.494, "step": 179 },
    { "epoch": 0.23032629558541268, "grad_norm": 0.259765625, "learning_rate": 0.0001611650485436893, "loss": 1.5361, "step": 180 },
    { "epoch": 0.23032629558541268, "eval_loss": 1.5049980878829956, "eval_runtime": 103.8247, "eval_samples_per_second": 48.158, "eval_steps_per_second": 1.512, "step": 180 },
    { "epoch": 0.23160588611644273, "grad_norm": 0.248046875, "learning_rate": 0.00016067961165048544, "loss": 1.5126, "step": 181 },
    { "epoch": 0.2328854766474728, "grad_norm": 0.2216796875, "learning_rate": 0.00016019417475728157, "loss": 1.4835, "step": 182 },
    { "epoch": 0.23416506717850288, "grad_norm": 0.251953125, "learning_rate": 0.00015970873786407767, "loss": 1.5131, "step": 183 },
    { "epoch": 0.23544465770953296, "grad_norm": 0.259765625, "learning_rate": 0.0001592233009708738, "loss": 1.4804, "step": 184 },
    { "epoch": 0.236724248240563, "grad_norm": 0.25, "learning_rate": 0.0001587378640776699, "loss": 1.6027, "step": 185 },
    { "epoch": 0.2380038387715931, "grad_norm": 0.2373046875, "learning_rate": 0.00015825242718446603, "loss": 1.5373, "step": 186 },
    { "epoch": 0.23928342930262317, "grad_norm": 0.2265625, "learning_rate": 0.00015776699029126213, "loss": 1.5531, "step": 187 },
    { "epoch": 0.24056301983365322, "grad_norm": 0.2412109375, "learning_rate": 0.00015728155339805825, "loss": 1.5101, "step": 188 },
    { "epoch": 0.2418426103646833, "grad_norm": 0.2578125, "learning_rate": 0.00015679611650485438, "loss": 1.538, "step": 189 },
    { "epoch": 0.24312220089571338, "grad_norm": 0.2451171875, "learning_rate": 0.00015631067961165048, "loss": 1.526, "step": 190 },
    { "epoch": 0.24440179142674345, "grad_norm": 0.248046875, "learning_rate": 0.0001558252427184466, "loss": 1.5275, "step": 191 },
    { "epoch": 0.2456813819577735, "grad_norm": 0.2421875, "learning_rate": 0.0001553398058252427, "loss": 1.567, "step": 192 },
    { "epoch": 0.24696097248880358, "grad_norm": 0.267578125, "learning_rate": 0.00015485436893203884, "loss": 1.4457, "step": 193 },
    { "epoch": 0.24824056301983366, "grad_norm": 0.2421875, "learning_rate": 0.00015436893203883497, "loss": 1.5728, "step": 194 },
    { "epoch": 0.2495201535508637, "grad_norm": 0.2421875, "learning_rate": 0.00015388349514563107, "loss": 1.4829, "step": 195 },
    { "epoch": 0.2507997440818938, "grad_norm": 0.23046875, "learning_rate": 0.0001533980582524272, "loss": 1.5093, "step": 196 },
    { "epoch": 0.25207933461292387, "grad_norm": 0.259765625, "learning_rate": 0.0001529126213592233, "loss": 1.5079, "step": 197 },
    { "epoch": 0.2533589251439539, "grad_norm": 0.2265625, "learning_rate": 0.00015242718446601943, "loss": 1.4996, "step": 198 },
    { "epoch": 0.254638515674984, "grad_norm": 0.2255859375, "learning_rate": 0.00015194174757281553, "loss": 1.4967, "step": 199 },
    { "epoch": 0.2559181062060141, "grad_norm": 0.287109375, "learning_rate": 0.00015145631067961166, "loss": 1.485, "step": 200 },
    { "epoch": 0.2559181062060141, "eval_loss": 1.5022693872451782, "eval_runtime": 103.8087, "eval_samples_per_second": 48.166, "eval_steps_per_second": 1.512, "step": 200 },
    { "epoch": 0.2571976967370441, "grad_norm": 0.255859375, "learning_rate": 0.00015097087378640778, "loss": 1.4926, "step": 201 },
    { "epoch": 0.25847728726807423, "grad_norm": 0.232421875, "learning_rate": 0.00015048543689320389, "loss": 1.5215, "step": 202 },
    { "epoch": 0.2597568777991043, "grad_norm": 0.2373046875, "learning_rate": 0.00015000000000000001, "loss": 1.5674, "step": 203 },
    { "epoch": 0.26103646833013433, "grad_norm": 0.240234375, "learning_rate": 0.00014951456310679611, "loss": 1.5157, "step": 204 },
    { "epoch": 0.26231605886116444, "grad_norm": 0.2451171875, "learning_rate": 0.00014902912621359224, "loss": 1.4502, "step": 205 },
    { "epoch": 0.2635956493921945, "grad_norm": 0.2578125, "learning_rate": 0.00014854368932038834, "loss": 1.5289, "step": 206 },
    { "epoch": 0.2648752399232246, "grad_norm": 0.24609375, "learning_rate": 0.00014805825242718447, "loss": 1.4454, "step": 207 },
    { "epoch": 0.26615483045425464, "grad_norm": 0.236328125, "learning_rate": 0.0001475728155339806, "loss": 1.5132, "step": 208 },
    { "epoch": 0.2674344209852847, "grad_norm": 0.28515625, "learning_rate": 0.0001470873786407767, "loss": 1.5041, "step": 209 },
    { "epoch": 0.2687140115163148, "grad_norm": 0.2353515625, "learning_rate": 0.00014660194174757283, "loss": 1.5313, "step": 210 },
    { "epoch": 0.26999360204734485, "grad_norm": 0.24609375, "learning_rate": 0.00014611650485436893, "loss": 1.5156, "step": 211 },
    { "epoch": 0.2712731925783749, "grad_norm": 0.2451171875, "learning_rate": 0.00014563106796116506, "loss": 1.4958, "step": 212 },
    { "epoch": 0.272552783109405, "grad_norm": 0.2451171875, "learning_rate": 0.0001451456310679612, "loss": 1.5324, "step": 213 },
    { "epoch": 0.27383237364043506, "grad_norm": 0.2451171875, "learning_rate": 0.0001446601941747573, "loss": 1.4894, "step": 214 },
    { "epoch": 0.2751119641714651, "grad_norm": 0.2412109375, "learning_rate": 0.00014417475728155342, "loss": 1.4462, "step": 215 },
    { "epoch": 0.2763915547024952, "grad_norm": 0.267578125, "learning_rate": 0.00014368932038834952, "loss": 1.5, "step": 216 },
    { "epoch": 0.27767114523352526, "grad_norm": 0.296875, "learning_rate": 0.00014320388349514565, "loss": 1.5317, "step": 217 },
    { "epoch": 0.27895073576455537, "grad_norm": 0.251953125, "learning_rate": 0.00014271844660194175, "loss": 1.5553, "step": 218 },
    { "epoch": 0.2802303262955854, "grad_norm": 0.2734375, "learning_rate": 0.00014223300970873787, "loss": 1.5055, "step": 219 },
    { "epoch": 0.28150991682661547, "grad_norm": 0.25, "learning_rate": 0.000141747572815534, "loss": 1.5299, "step": 220 },
    { "epoch": 0.28150991682661547, "eval_loss": 1.4998944997787476, "eval_runtime": 103.8007, "eval_samples_per_second": 48.169, "eval_steps_per_second": 1.513, "step": 220 },
    { "epoch": 0.2827895073576456, "grad_norm": 0.2734375, "learning_rate": 0.0001412621359223301, "loss": 1.5298, "step": 221 },
    { "epoch": 0.2840690978886756, "grad_norm": 0.25, "learning_rate": 0.00014077669902912623, "loss": 1.5178, "step": 222 },
    { "epoch": 0.2853486884197057, "grad_norm": 0.244140625, "learning_rate": 0.00014029126213592233, "loss": 1.4975, "step": 223 },
    { "epoch": 0.2866282789507358, "grad_norm": 0.2333984375, "learning_rate": 0.00013980582524271846, "loss": 1.5121, "step": 224 },
    { "epoch": 0.28790786948176583, "grad_norm": 0.26171875, "learning_rate": 0.00013932038834951456, "loss": 1.4838, "step": 225 },
    { "epoch": 0.2891874600127959, "grad_norm": 0.265625, "learning_rate": 0.0001388349514563107, "loss": 1.4422, "step": 226 },
    { "epoch": 0.290467050543826, "grad_norm": 0.2421875, "learning_rate": 0.00013834951456310682, "loss": 1.5315, "step": 227 },
    { "epoch": 0.29174664107485604, "grad_norm": 0.271484375, "learning_rate": 0.00013786407766990292, "loss": 1.524, "step": 228 },
    { "epoch": 0.2930262316058861, "grad_norm": 0.255859375, "learning_rate": 0.00013737864077669905, "loss": 1.4314, "step": 229 },
    { "epoch": 0.2943058221369162, "grad_norm": 0.2421875, "learning_rate": 0.00013689320388349515, "loss": 1.5496, "step": 230 },
    { "epoch": 0.29558541266794625, "grad_norm": 0.279296875, "learning_rate": 0.00013640776699029128, "loss": 1.5526, "step": 231 },
    { "epoch": 0.29686500319897635, "grad_norm": 0.25, "learning_rate": 0.0001359223300970874, "loss": 1.427, "step": 232 },
    { "epoch": 0.2981445937300064, "grad_norm": 0.267578125, "learning_rate": 0.0001354368932038835, "loss": 1.4646, "step": 233 },
    { "epoch": 0.29942418426103645, "grad_norm": 0.2734375, "learning_rate": 0.00013495145631067963, "loss": 1.5392, "step": 234 },
    { "epoch": 0.30070377479206656, "grad_norm": 0.2734375, "learning_rate": 0.00013446601941747573, "loss": 1.5333, "step": 235 },
    { "epoch": 0.3019833653230966, "grad_norm": 0.2431640625, "learning_rate": 0.00013398058252427186, "loss": 1.5282, "step": 236 },
    { "epoch": 0.30326295585412666, "grad_norm": 0.251953125, "learning_rate": 0.00013349514563106796, "loss": 1.4763, "step": 237 },
    { "epoch": 0.30454254638515676, "grad_norm": 0.2734375, "learning_rate": 0.0001330097087378641, "loss": 1.5199, "step": 238 },
    { "epoch": 0.3058221369161868, "grad_norm": 0.263671875, "learning_rate": 0.00013252427184466022, "loss": 1.5497, "step": 239 },
    { "epoch": 0.30710172744721687, "grad_norm": 0.267578125, "learning_rate": 0.00013203883495145632, "loss": 1.5539, "step": 240 },
    { "epoch": 0.30710172744721687, "eval_loss": 1.4981228113174438, "eval_runtime": 103.819, "eval_samples_per_second": 48.161, "eval_steps_per_second": 1.512, "step": 240 },
    { "epoch": 0.30838131797824697, "grad_norm": 0.255859375, "learning_rate": 0.00013155339805825245, "loss": 1.5309, "step": 241 },
    { "epoch": 0.309660908509277, "grad_norm": 0.26171875, "learning_rate": 0.00013106796116504855, "loss": 1.4503, "step": 242 },
    { "epoch": 0.31094049904030713, "grad_norm": 0.26171875, "learning_rate": 0.00013058252427184468, "loss": 1.4718, "step": 243 },
    { "epoch": 0.3122200895713372, "grad_norm": 0.2578125, "learning_rate": 0.00013009708737864078, "loss": 1.4445, "step": 244 },
    { "epoch": 0.31349968010236723, "grad_norm": 0.2392578125, "learning_rate": 0.0001296116504854369, "loss": 1.5606, "step": 245 },
    { "epoch": 0.31477927063339733, "grad_norm": 0.2734375, "learning_rate": 0.00012912621359223304, "loss": 1.4917, "step": 246 },
    { "epoch": 0.3160588611644274, "grad_norm": 0.240234375, "learning_rate": 0.00012864077669902914, "loss": 1.5295, "step": 247 },
    { "epoch": 0.31733845169545744, "grad_norm": 0.2890625, "learning_rate": 0.00012815533980582526, "loss": 1.5685, "step": 248 },
    { "epoch": 0.31861804222648754, "grad_norm": 0.28515625, "learning_rate": 0.00012766990291262137, "loss": 1.4985, "step": 249 },
    { "epoch": 0.3198976327575176, "grad_norm": 0.251953125, "learning_rate": 0.0001271844660194175, "loss": 1.4746, "step": 250 },
    { "epoch": 0.32117722328854764, "grad_norm": 0.26953125, "learning_rate": 0.00012669902912621362, "loss": 1.5615, "step": 251 },
    { "epoch": 0.32245681381957775, "grad_norm": 0.2412109375, "learning_rate": 0.00012621359223300972, "loss": 1.4777, "step": 252 },
    { "epoch": 0.3237364043506078, "grad_norm": 0.26171875, "learning_rate": 0.00012572815533980585, "loss": 1.4152, "step": 253 },
    { "epoch": 0.32501599488163785, "grad_norm": 0.25390625, "learning_rate": 0.00012524271844660195, "loss": 1.4632, "step": 254 },
    { "epoch": 0.32629558541266795, "grad_norm": 0.259765625, "learning_rate": 0.00012475728155339805, "loss": 1.5011, "step": 255 },
    { "epoch": 0.327575175943698, "grad_norm": 0.244140625, "learning_rate": 0.00012427184466019418, "loss": 1.5339, "step": 256 },
    { "epoch": 0.3288547664747281, "grad_norm": 0.271484375, "learning_rate": 0.00012378640776699028, "loss": 1.4776, "step": 257 },
    { "epoch": 0.33013435700575816, "grad_norm": 0.265625, "learning_rate": 0.0001233009708737864, "loss": 1.4585, "step": 258 },
    { "epoch": 0.3314139475367882, "grad_norm": 0.26171875, "learning_rate": 0.0001228155339805825, "loss": 1.4791, "step": 259 },
    { "epoch": 0.3326935380678183, "grad_norm": 0.263671875, "learning_rate": 0.00012233009708737864, "loss": 1.521, "step": 260 },
    { "epoch": 0.3326935380678183, "eval_loss": 1.4961707592010498, "eval_runtime": 103.7907, "eval_samples_per_second": 48.174, "eval_steps_per_second": 1.513, "step": 260 },
    { "epoch": 0.33397312859884837, "grad_norm": 0.265625, "learning_rate": 0.00012184466019417475, "loss": 1.5015, "step": 261 },
    { "epoch": 0.3352527191298784, "grad_norm": 0.26953125, "learning_rate": 0.00012135922330097087, "loss": 1.4732, "step": 262 },
    { "epoch": 0.3365323096609085, "grad_norm": 0.294921875, "learning_rate": 0.00012087378640776698, "loss": 1.4832, "step": 263 },
    { "epoch": 0.3378119001919386, "grad_norm": 0.271484375, "learning_rate": 0.0001203883495145631, "loss": 1.4629, "step": 264 },
    { "epoch": 0.3390914907229686, "grad_norm": 0.265625, "learning_rate": 0.00011990291262135923, "loss": 1.5046, "step": 265 },
    { "epoch": 0.34037108125399873, "grad_norm": 0.279296875, "learning_rate": 0.00011941747572815534, "loss": 1.5724, "step": 266 },
    { "epoch": 0.3416506717850288, "grad_norm": 0.26171875, "learning_rate": 0.00011893203883495146, "loss": 1.4481, "step": 267 },
    { "epoch": 0.3429302623160589, "grad_norm": 0.25390625, "learning_rate": 0.00011844660194174757, "loss": 1.5081, "step": 268 },
    { "epoch": 0.34420985284708894, "grad_norm": 0.26171875, "learning_rate": 0.00011796116504854368, "loss": 1.5056, "step": 269 },
    { "epoch": 0.345489443378119, "grad_norm": 0.26953125, "learning_rate": 0.0001174757281553398, "loss": 1.5279, "step": 270 },
    { "epoch": 0.3467690339091491, "grad_norm": 0.244140625, "learning_rate": 0.00011699029126213593, "loss": 1.5293, "step": 271 },
    { "epoch": 0.34804862444017914, "grad_norm": 0.271484375, "learning_rate": 0.00011650485436893204, "loss": 1.5436, "step": 272 },
    { "epoch": 0.3493282149712092, "grad_norm": 0.259765625, "learning_rate": 0.00011601941747572816, "loss": 1.5806, "step": 273 },
    { "epoch": 0.3506078055022393, "grad_norm": 0.271484375, "learning_rate": 0.00011553398058252427, "loss": 1.5412, "step": 274 },
    { "epoch": 0.35188739603326935, "grad_norm": 0.283203125, "learning_rate": 0.00011504854368932039, "loss": 1.5225, "step": 275 },
    { "epoch": 0.3531669865642994, "grad_norm": 0.2490234375, "learning_rate": 0.0001145631067961165, "loss": 1.6229, "step": 276 },
    { "epoch": 0.3544465770953295, "grad_norm": 0.2734375, "learning_rate": 0.00011407766990291261, "loss": 1.5231, "step": 277 },
    { "epoch": 0.35572616762635956, "grad_norm": 0.26171875, "learning_rate": 0.00011359223300970874, "loss": 1.481, "step": 278 },
    { "epoch": 0.3570057581573896, "grad_norm": 0.265625, "learning_rate": 0.00011310679611650486, "loss": 1.544, "step": 279 },
    { "epoch": 0.3582853486884197, "grad_norm": 0.25390625, "learning_rate": 0.00011262135922330097, "loss": 1.5186, "step": 280 },
    { "epoch": 0.3582853486884197, "eval_loss": 1.4947106838226318, "eval_runtime": 103.7757, "eval_samples_per_second": 48.181, "eval_steps_per_second": 1.513, "step": 280 },
    { "epoch": 0.35956493921944976, "grad_norm": 0.2470703125, "learning_rate": 0.00011213592233009709, "loss": 1.5507, "step": 281 },
    { "epoch": 0.36084452975047987, "grad_norm": 0.255859375, "learning_rate": 0.0001116504854368932, "loss": 1.5091, "step": 282 },
    { "epoch": 0.3621241202815099, "grad_norm": 0.28125, "learning_rate": 0.00011116504854368932, "loss": 1.4962, "step": 283 },
    { "epoch": 0.36340371081253997, "grad_norm": 0.251953125, "learning_rate": 0.00011067961165048544, "loss": 1.537, "step": 284 },
    { "epoch": 0.3646833013435701, "grad_norm": 0.271484375, "learning_rate": 0.00011019417475728156, "loss": 1.5814, "step": 285 },
    { "epoch": 0.3659628918746001, "grad_norm": 0.279296875, "learning_rate": 0.00010970873786407767, "loss": 1.5328, "step": 286 },
    { "epoch": 0.3672424824056302, "grad_norm": 0.275390625, "learning_rate": 0.00010922330097087379, "loss": 1.5429, "step": 287 },
    { "epoch": 0.3685220729366603, "grad_norm": 0.26953125, "learning_rate": 0.0001087378640776699, "loss": 1.489, "step": 288 },
    { "epoch": 0.36980166346769033, "grad_norm": 0.248046875, "learning_rate": 0.00010825242718446602, "loss": 1.518, "step": 289 },
    { "epoch": 0.3710812539987204, "grad_norm": 0.25, "learning_rate": 0.00010776699029126213, "loss": 1.5142, "step": 290 },
    { "epoch": 0.3723608445297505, "grad_norm": 0.271484375, "learning_rate": 0.00010728155339805826, "loss": 1.5229, "step": 291 },
    { "epoch": 0.37364043506078054, "grad_norm": 0.251953125, "learning_rate": 0.00010679611650485437, "loss": 1.4803, "step": 292 },
    { "epoch": 0.37492002559181065, "grad_norm": 0.2578125, "learning_rate": 0.00010631067961165049, "loss": 1.5791, "step": 293 },
    { "epoch": 0.3761996161228407, "grad_norm": 0.2470703125, "learning_rate": 0.0001058252427184466, "loss": 1.5336, "step": 294 },
    { "epoch": 0.37747920665387075, "grad_norm": 0.275390625, "learning_rate": 0.00010533980582524272, "loss": 1.4581, "step": 295 },
    { "epoch": 0.37875879718490085, "grad_norm": 0.267578125, "learning_rate": 0.00010485436893203883, "loss": 1.4919, "step": 296 },
    { "epoch": 0.3800383877159309, "grad_norm": 0.25390625, "learning_rate": 0.00010436893203883496, "loss": 1.5778, "step": 297 },
    { "epoch": 0.38131797824696095, "grad_norm": 0.2734375, "learning_rate": 0.00010388349514563107, "loss": 1.4489, "step": 298 },
    { "epoch": 0.38259756877799106, "grad_norm": 0.265625, "learning_rate": 0.00010339805825242719, "loss": 1.4704, "step": 299 },
    { "epoch": 0.3838771593090211, "grad_norm": 0.267578125, "learning_rate": 0.0001029126213592233, "loss": 1.5316, "step": 300 },
    { "epoch": 0.3838771593090211, "eval_loss": 1.4935728311538696, "eval_runtime": 103.76, "eval_samples_per_second": 48.188, "eval_steps_per_second": 1.513, "step": 300 },
    { "epoch": 0.38515674984005116, "grad_norm": 0.265625, "learning_rate": 0.00010242718446601942, "loss": 1.5093, "step": 301 },
    { "epoch": 0.38643634037108127, "grad_norm": 0.263671875, "learning_rate": 0.00010194174757281553, "loss": 1.4871, "step": 302 },
    { "epoch": 0.3877159309021113, "grad_norm": 0.29296875, "learning_rate": 0.00010145631067961166, "loss": 1.4842, "step": 303 },
    { "epoch": 0.3889955214331414, "grad_norm": 0.275390625, "learning_rate": 0.00010097087378640778, "loss": 1.5545, "step": 304 },
    { "epoch": 0.3902751119641715, "grad_norm": 0.255859375, "learning_rate": 0.00010048543689320389, "loss": 1.5671, "step": 305 },
    { "epoch": 0.3915547024952015, "grad_norm": 0.263671875, "learning_rate": 0.0001, "loss": 1.4817, "step": 306 },
    { "epoch": 0.39283429302623163, "grad_norm": 0.275390625, "learning_rate": 9.951456310679612e-05, "loss": 1.4727, "step": 307 },
    { "epoch": 0.3941138835572617, "grad_norm": 0.251953125, "learning_rate": 9.902912621359223e-05, "loss": 1.4717, "step": 308 },
    { "epoch": 0.39539347408829173, "grad_norm": 0.2734375, "learning_rate": 9.854368932038835e-05, "loss": 1.4795, "step": 309 },
    { "epoch": 0.39667306461932184, "grad_norm": 0.267578125, "learning_rate": 9.805825242718448e-05, "loss": 1.4475, "step": 310 },
    { "epoch": 0.3979526551503519, "grad_norm": 0.283203125, "learning_rate": 9.757281553398059e-05, "loss": 1.4661, "step": 311 },
    { "epoch": 0.39923224568138194, "grad_norm": 0.271484375, "learning_rate": 9.70873786407767e-05, "loss": 1.4808, "step": 312 },
    { "epoch": 0.40051183621241204, "grad_norm": 0.29296875, "learning_rate": 9.660194174757282e-05, "loss": 1.5169, "step": 313 },
    { "epoch": 0.4017914267434421, "grad_norm": 0.296875, "learning_rate": 9.611650485436893e-05, "loss": 1.4856, "step": 314 },
    { "epoch": 0.40307101727447214, "grad_norm": 0.2578125, "learning_rate": 9.563106796116505e-05, "loss": 1.4995, "step": 315 },
    { "epoch": 0.40435060780550225, "grad_norm": 0.28125, "learning_rate": 9.514563106796118e-05, "loss": 1.5682, "step": 316 },
    { "epoch": 0.4056301983365323, "grad_norm": 0.265625, "learning_rate": 9.466019417475729e-05, "loss": 1.492, "step": 317 },
    { "epoch": 0.4069097888675624, "grad_norm": 0.275390625, "learning_rate": 9.417475728155341e-05, "loss": 1.4706, "step": 318 },
    { "epoch": 0.40818937939859246, "grad_norm": 0.265625, "learning_rate": 9.368932038834952e-05, "loss": 1.5184, "step": 319 },
    { "epoch": 0.4094689699296225, "grad_norm": 0.283203125, "learning_rate": 9.320388349514564e-05, "loss": 1.5288, "step": 320 },
    { "epoch": 0.4094689699296225, "eval_loss": 1.4924702644348145, "eval_runtime": 103.7588, "eval_samples_per_second": 48.189, "eval_steps_per_second": 1.513, "step": 320 },
| { | |
| "epoch": 0.4107485604606526, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 9.271844660194175e-05, | |
| "loss": 1.5166, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.41202815099168266, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 9.223300970873788e-05, | |
| "loss": 1.5137, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.4133077415227127, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 9.174757281553399e-05, | |
| "loss": 1.4874, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.4145873320537428, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 9.126213592233011e-05, | |
| "loss": 1.4144, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.41586692258477287, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 9.077669902912622e-05, | |
| "loss": 1.529, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.4171465131158029, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 9.029126213592234e-05, | |
| "loss": 1.5507, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.418426103646833, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 8.980582524271845e-05, | |
| "loss": 1.5447, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.4197056941778631, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 8.932038834951457e-05, | |
| "loss": 1.556, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.4209852847088932, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 8.88349514563107e-05, | |
| "loss": 1.4945, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.42226487523992323, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 8.834951456310681e-05, | |
| "loss": 1.5668, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.4235444657709533, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 8.786407766990292e-05, | |
| "loss": 1.501, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.4248240563019834, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 8.737864077669902e-05, | |
| "loss": 1.5187, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.42610364683301344, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 8.689320388349514e-05, | |
| "loss": 1.5736, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.4273832373640435, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 8.640776699029127e-05, | |
| "loss": 1.5085, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.4286628278950736, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 8.592233009708738e-05, | |
| "loss": 1.4757, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.42994241842610365, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 8.54368932038835e-05, | |
| "loss": 1.5243, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.4312220089571337, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 8.495145631067961e-05, | |
| "loss": 1.5662, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.4325015994881638, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 8.446601941747573e-05, | |
| "loss": 1.433, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.43378119001919385, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 8.398058252427184e-05, | |
| "loss": 1.5378, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.4350607805502239, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 8.349514563106797e-05, | |
| "loss": 1.5276, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.4350607805502239, | |
| "eval_loss": 1.4914450645446777, | |
| "eval_runtime": 103.7522, | |
| "eval_samples_per_second": 48.192, | |
| "eval_steps_per_second": 1.513, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.436340371081254, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 8.300970873786408e-05, | |
| "loss": 1.4723, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.43761996161228406, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 8.25242718446602e-05, | |
| "loss": 1.5185, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.43889955214331416, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 8.203883495145631e-05, | |
| "loss": 1.5317, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.4401791426743442, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 8.155339805825243e-05, | |
| "loss": 1.5254, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.44145873320537427, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 8.106796116504854e-05, | |
| "loss": 1.5152, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.44273832373640437, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 8.058252427184466e-05, | |
| "loss": 1.4812, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.4440179142674344, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 8.009708737864078e-05, | |
| "loss": 1.5023, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.44529750479846447, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 7.96116504854369e-05, | |
| "loss": 1.4516, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.4465770953294946, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 7.912621359223301e-05, | |
| "loss": 1.4349, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.44785668586052463, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 7.864077669902913e-05, | |
| "loss": 1.5181, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4491362763915547, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 7.815533980582524e-05, | |
| "loss": 1.4765, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.4504158669225848, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 7.766990291262136e-05, | |
| "loss": 1.524, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.45169545745361483, | |
| "grad_norm": 0.25, | |
| "learning_rate": 7.718446601941748e-05, | |
| "loss": 1.5236, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.45297504798464494, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 7.66990291262136e-05, | |
| "loss": 1.5358, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.454254638515675, | |
| "grad_norm": 0.244140625, | |
| "learning_rate": 7.621359223300971e-05, | |
| "loss": 1.5447, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.45553422904670504, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 7.572815533980583e-05, | |
| "loss": 1.428, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.45681381957773515, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 7.524271844660194e-05, | |
| "loss": 1.3921, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.4580934101087652, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 7.475728155339806e-05, | |
| "loss": 1.5059, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.45937300063979525, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 7.427184466019417e-05, | |
| "loss": 1.4832, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.46065259117082535, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 7.37864077669903e-05, | |
| "loss": 1.5236, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.46065259117082535, | |
| "eval_loss": 1.4904447793960571, | |
| "eval_runtime": 103.753, | |
| "eval_samples_per_second": 48.191, | |
| "eval_steps_per_second": 1.513, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4619321817018554, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 7.330097087378641e-05, | |
| "loss": 1.5008, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.46321177223288545, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 7.281553398058253e-05, | |
| "loss": 1.5244, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.46449136276391556, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 7.233009708737864e-05, | |
| "loss": 1.5849, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.4657709532949456, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 7.184466019417476e-05, | |
| "loss": 1.4882, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.46705054382597566, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 7.135922330097087e-05, | |
| "loss": 1.4905, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.46833013435700577, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 7.0873786407767e-05, | |
| "loss": 1.4391, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.4696097248880358, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 7.038834951456312e-05, | |
| "loss": 1.5034, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.4708893154190659, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 6.990291262135923e-05, | |
| "loss": 1.4928, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.472168905950096, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 6.941747572815534e-05, | |
| "loss": 1.5578, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.473448496481126, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 6.893203883495146e-05, | |
| "loss": 1.5403, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.47472808701215613, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 6.844660194174757e-05, | |
| "loss": 1.5081, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.4760076775431862, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 6.79611650485437e-05, | |
| "loss": 1.5799, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.47728726807421623, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 6.747572815533982e-05, | |
| "loss": 1.5097, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.47856685860524634, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 6.699029126213593e-05, | |
| "loss": 1.5164, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.4798464491362764, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 6.650485436893205e-05, | |
| "loss": 1.5139, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.48112603966730644, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 6.601941747572816e-05, | |
| "loss": 1.4756, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.48240563019833654, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 6.553398058252428e-05, | |
| "loss": 1.5325, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.4836852207293666, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 6.504854368932039e-05, | |
| "loss": 1.4932, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.4849648112603967, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 6.456310679611652e-05, | |
| "loss": 1.5157, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.48624440179142675, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 6.407766990291263e-05, | |
| "loss": 1.4882, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.48624440179142675, | |
| "eval_loss": 1.4896763563156128, | |
| "eval_runtime": 103.7894, | |
| "eval_samples_per_second": 48.174, | |
| "eval_steps_per_second": 1.513, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.4875239923224568, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 6.359223300970875e-05, | |
| "loss": 1.546, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.4888035828534869, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 6.310679611650486e-05, | |
| "loss": 1.4907, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.49008317338451696, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 6.262135922330098e-05, | |
| "loss": 1.4512, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.491362763915547, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 6.213592233009709e-05, | |
| "loss": 1.5442, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.4926423544465771, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 6.16504854368932e-05, | |
| "loss": 1.4975, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.49392194497760716, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 6.116504854368932e-05, | |
| "loss": 1.534, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.4952015355086372, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 6.0679611650485434e-05, | |
| "loss": 1.417, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.4964811260396673, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 6.019417475728155e-05, | |
| "loss": 1.4766, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.49776071657069737, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 5.970873786407767e-05, | |
| "loss": 1.4672, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.4990403071017274, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 5.9223300970873785e-05, | |
| "loss": 1.5021, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5003198976327575, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 5.87378640776699e-05, | |
| "loss": 1.5723, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.5015994881637876, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 5.825242718446602e-05, | |
| "loss": 1.5351, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.5028790786948176, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 5.7766990291262135e-05, | |
| "loss": 1.5922, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.5041586692258477, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 5.728155339805825e-05, | |
| "loss": 1.5, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.5054382597568778, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 5.679611650485437e-05, | |
| "loss": 1.5057, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.5067178502879078, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 5.6310679611650486e-05, | |
| "loss": 1.4968, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.5079974408189379, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 5.58252427184466e-05, | |
| "loss": 1.4383, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.509277031349968, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 5.533980582524272e-05, | |
| "loss": 1.5069, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.510556621880998, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 5.4854368932038836e-05, | |
| "loss": 1.4293, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.5118362124120281, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 5.436893203883495e-05, | |
| "loss": 1.5098, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5118362124120281, | |
| "eval_loss": 1.4892088174819946, | |
| "eval_runtime": 103.7401, | |
| "eval_samples_per_second": 48.197, | |
| "eval_steps_per_second": 1.513, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5131158029430583, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 5.3883495145631065e-05, | |
| "loss": 1.4924, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.5143953934740882, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 5.339805825242719e-05, | |
| "loss": 1.5093, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.5156749840051184, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 5.29126213592233e-05, | |
| "loss": 1.5384, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.5169545745361485, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 5.2427184466019416e-05, | |
| "loss": 1.5331, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.5182341650671785, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 5.194174757281554e-05, | |
| "loss": 1.5009, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.5195137555982086, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 5.145631067961165e-05, | |
| "loss": 1.4744, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.5207933461292387, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 5.0970873786407766e-05, | |
| "loss": 1.5178, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.5220729366602687, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 5.048543689320389e-05, | |
| "loss": 1.4893, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.5233525271912988, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 5e-05, | |
| "loss": 1.5309, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.5246321177223289, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 4.951456310679612e-05, | |
| "loss": 1.4873, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.525911708253359, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 4.902912621359224e-05, | |
| "loss": 1.5083, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.527191298784389, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 4.854368932038835e-05, | |
| "loss": 1.5487, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.5284708893154191, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 4.805825242718447e-05, | |
| "loss": 1.4886, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.5297504798464492, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 4.757281553398059e-05, | |
| "loss": 1.4687, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.5310300703774792, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 4.7087378640776703e-05, | |
| "loss": 1.5612, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.5323096609085093, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 4.660194174757282e-05, | |
| "loss": 1.534, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.5335892514395394, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 4.611650485436894e-05, | |
| "loss": 1.4985, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.5348688419705694, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 4.5631067961165054e-05, | |
| "loss": 1.5042, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.5361484325015995, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 4.514563106796117e-05, | |
| "loss": 1.5292, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.5374280230326296, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 4.466019417475728e-05, | |
| "loss": 1.4526, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5374280230326296, | |
| "eval_loss": 1.4887601137161255, | |
| "eval_runtime": 103.7847, | |
| "eval_samples_per_second": 48.177, | |
| "eval_steps_per_second": 1.513, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5387076135636596, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 4.4174757281553404e-05, | |
| "loss": 1.4529, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.5399872040946897, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 4.368932038834951e-05, | |
| "loss": 1.4163, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.5412667946257198, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 4.3203883495145634e-05, | |
| "loss": 1.5169, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.5425463851567498, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 4.271844660194175e-05, | |
| "loss": 1.4888, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.5438259756877799, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 4.223300970873786e-05, | |
| "loss": 1.4733, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.54510556621881, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 4.1747572815533984e-05, | |
| "loss": 1.503, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.54638515674984, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 4.12621359223301e-05, | |
| "loss": 1.4406, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.5476647472808701, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 4.077669902912621e-05, | |
| "loss": 1.4952, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.5489443378119002, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 4.029126213592233e-05, | |
| "loss": 1.4837, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.5502239283429302, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 3.980582524271845e-05, | |
| "loss": 1.6037, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5515035188739603, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 3.9320388349514564e-05, | |
| "loss": 1.4425, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.5527831094049904, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 3.883495145631068e-05, | |
| "loss": 1.4502, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.5540626999360204, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 3.83495145631068e-05, | |
| "loss": 1.4936, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.5553422904670505, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 3.7864077669902914e-05, | |
| "loss": 1.5186, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.5566218809980806, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 3.737864077669903e-05, | |
| "loss": 1.464, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.5579014715291107, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 3.689320388349515e-05, | |
| "loss": 1.4973, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.5591810620601407, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 3.6407766990291265e-05, | |
| "loss": 1.5231, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.5604606525911708, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 3.592233009708738e-05, | |
| "loss": 1.5225, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.5617402431222009, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 3.54368932038835e-05, | |
| "loss": 1.4312, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.5630198336532309, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 3.4951456310679615e-05, | |
| "loss": 1.5529, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5630198336532309, | |
| "eval_loss": 1.4884061813354492, | |
| "eval_runtime": 103.7793, | |
| "eval_samples_per_second": 48.179, | |
| "eval_steps_per_second": 1.513, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.564299424184261, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 3.446601941747573e-05, | |
| "loss": 1.4933, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.5655790147152912, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 3.398058252427185e-05, | |
| "loss": 1.4352, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.5668586052463211, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 3.3495145631067966e-05, | |
| "loss": 1.5062, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.5681381957773513, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 3.300970873786408e-05, | |
| "loss": 1.5018, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.5694177863083814, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 3.2524271844660195e-05, | |
| "loss": 1.5037, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.5706973768394114, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 3.2038834951456316e-05, | |
| "loss": 1.5056, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.5719769673704415, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 3.155339805825243e-05, | |
| "loss": 1.567, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.5732565579014716, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 3.1067961165048545e-05, | |
| "loss": 1.4753, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.5745361484325016, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 3.058252427184466e-05, | |
| "loss": 1.5731, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.5758157389635317, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 3.0097087378640774e-05, | |
| "loss": 1.4695, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5770953294945618, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 2.9611650485436892e-05, | |
| "loss": 1.4394, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.5783749200255918, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 2.912621359223301e-05, | |
| "loss": 1.52, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.5796545105566219, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 2.8640776699029125e-05, | |
| "loss": 1.5178, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.580934101087652, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 2.8155339805825243e-05, | |
| "loss": 1.5035, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.582213691618682, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 2.766990291262136e-05, | |
| "loss": 1.5201, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.5834932821497121, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 2.7184466019417475e-05, | |
| "loss": 1.5495, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.5847728726807422, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.6699029126213593e-05, | |
| "loss": 1.483, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.5860524632117722, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 2.6213592233009708e-05, | |
| "loss": 1.5084, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.5873320537428023, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 2.5728155339805826e-05, | |
| "loss": 1.5115, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.5886116442738324, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 2.5242718446601944e-05, | |
| "loss": 1.4747, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5886116442738324, | |
| "eval_loss": 1.4881880283355713, | |
| "eval_runtime": 103.7794, | |
| "eval_samples_per_second": 48.179, | |
| "eval_steps_per_second": 1.513, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5898912348048625, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.475728155339806e-05, | |
| "loss": 1.5567, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.5911708253358925, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 2.4271844660194176e-05, | |
| "loss": 1.4473, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.5924504158669226, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 2.3786407766990294e-05, | |
| "loss": 1.5204, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.5937300063979527, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 2.330097087378641e-05, | |
| "loss": 1.537, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.5950095969289827, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 2.2815533980582527e-05, | |
| "loss": 1.4269, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.5962891874600128, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 2.233009708737864e-05, | |
| "loss": 1.452, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.5975687779910429, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 2.1844660194174756e-05, | |
| "loss": 1.4702, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.5988483685220729, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 2.1359223300970874e-05, | |
| "loss": 1.4577, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.600127959053103, | |
| "grad_norm": 0.27734375, | |
| "learning_rate": 2.0873786407766992e-05, | |
| "loss": 1.5009, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.6014075495841331, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 2.0388349514563107e-05, | |
| "loss": 1.4926, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6026871401151631, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 1.9902912621359225e-05, | |
| "loss": 1.5575, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.6039667306461932, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 1.941747572815534e-05, | |
| "loss": 1.4811, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.6052463211772233, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 1.8932038834951457e-05, | |
| "loss": 1.496, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.6065259117082533, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 1.8446601941747575e-05, | |
| "loss": 1.4922, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.6078055022392834, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.796116504854369e-05, | |
| "loss": 1.5435, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.6090850927703135, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 1.7475728155339808e-05, | |
| "loss": 1.5427, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.6103646833013435, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 1.6990291262135926e-05, | |
| "loss": 1.4921, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.6116442738323736, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 1.650485436893204e-05, | |
| "loss": 1.4226, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.6129238643634037, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 1.6019417475728158e-05, | |
| "loss": 1.4878, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.6142034548944337, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.5533980582524273e-05, | |
| "loss": 1.5165, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6142034548944337, | |
| "eval_loss": 1.4879465103149414, | |
| "eval_runtime": 103.7919, | |
| "eval_samples_per_second": 48.173, | |
| "eval_steps_per_second": 1.513, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6154830454254638, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.5048543689320387e-05, | |
| "loss": 1.5134, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.6167626359564939, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 1.4563106796116505e-05, | |
| "loss": 1.4587, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.6180422264875239, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 1.4077669902912621e-05, | |
| "loss": 1.4203, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.619321817018554, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.3592233009708738e-05, | |
| "loss": 1.4245, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.6206014075495841, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 1.3106796116504854e-05, | |
| "loss": 1.513, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.6218809980806143, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 1.2621359223300972e-05, | |
| "loss": 1.5439, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.6231605886116443, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 1.2135922330097088e-05, | |
| "loss": 1.5063, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.6244401791426744, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 1.1650485436893204e-05, | |
| "loss": 1.4927, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.6257197696737045, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 1.116504854368932e-05, | |
| "loss": 1.4943, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.6269993602047345, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 1.0679611650485437e-05, | |
| "loss": 1.5125, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.6282789507357646, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.0194174757281553e-05, | |
| "loss": 1.4396, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.6295585412667947, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 9.70873786407767e-06, | |
| "loss": 1.4928, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.6308381317978247, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 9.223300970873788e-06, | |
| "loss": 1.4664, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.6321177223288548, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 8.737864077669904e-06, | |
| "loss": 1.4305, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.6333973128598849, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 8.25242718446602e-06, | |
| "loss": 1.5016, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.6346769033909149, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 7.766990291262136e-06, | |
| "loss": 1.4641, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.635956493921945, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 7.281553398058253e-06, | |
| "loss": 1.497, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.6372360844529751, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 6.796116504854369e-06, | |
| "loss": 1.54, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.6385156749840051, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 6.310679611650486e-06, | |
| "loss": 1.4769, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.6397952655150352, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 5.825242718446602e-06, | |
| "loss": 1.5592, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6397952655150352, | |
| "eval_loss": 1.4878435134887695, | |
| "eval_runtime": 103.7542, | |
| "eval_samples_per_second": 48.191, | |
| "eval_steps_per_second": 1.513, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6410748560460653, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 5.3398058252427185e-06, | |
| "loss": 1.4929, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.6423544465770953, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 4.854368932038835e-06, | |
| "loss": 1.4825, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.6436340371081254, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 4.368932038834952e-06, | |
| "loss": 1.5464, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.6449136276391555, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 3.883495145631068e-06, | |
| "loss": 1.4797, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.6461932181701855, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 3.3980582524271844e-06, | |
| "loss": 1.5039, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.6474728087012156, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 2.912621359223301e-06, | |
| "loss": 1.4895, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.6487523992322457, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.4271844660194174e-06, | |
| "loss": 1.4887, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.6500319897632757, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 1.941747572815534e-06, | |
| "loss": 1.4734, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.6513115802943058, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 1.4563106796116506e-06, | |
| "loss": 1.4636, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.6525911708253359, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 9.70873786407767e-07, | |
| "loss": 1.5035, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.653870761356366, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 4.854368932038835e-07, | |
| "loss": 1.5264, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.655150351887396, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 0.0, | |
| "loss": 1.5441, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.655150351887396, | |
| "step": 512, | |
| "total_flos": 3.6685588640169984e+17, | |
| "train_loss": 1.622293347492814, | |
| "train_runtime": 4726.6516, | |
| "train_samples_per_second": 6.933, | |
| "train_steps_per_second": 0.108 | |
| }, | |
| { | |
| "epoch": 0.655150351887396, | |
| "eval_loss": 1.4878435134887695, | |
| "eval_runtime": 103.7194, | |
| "eval_samples_per_second": 48.207, | |
| "eval_steps_per_second": 1.514, | |
| "step": 512 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 512, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 20, | |
| "total_flos": 3.6685588640169984e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
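
The state above is the standard `trainer_state.json` a Hugging Face Trainer writes: `log_history` interleaves one training entry per step (loss, learning rate, gradient norm) with an evaluation entry every 20 steps, followed by a final summary and top-level run metadata. A minimal sketch of how to consume such a file follows; the `trainer_state.json` filename/path is an assumption (point it at wherever the Trainer saved this run), and `best_model_checkpoint` is a standard field that may be absent when no best model was tracked, so it is read defensively.

```python
import json

# Minimal sketch for consuming a trainer_state.json like the one above.
# The filename is an assumption; adjust to where the Trainer wrote it.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes per-step training entries (keyed by "loss") with
# periodic evaluation entries (keyed by "eval_loss"); split them apart.
history = state["log_history"]
train_log = [e for e in history if "loss" in e]
eval_log = [e for e in history if "eval_loss" in e]

# Locate the evaluation step with the lowest loss. In this run, eval was
# performed every 20 steps, so this identifies the best saved checkpoint.
best = min(eval_log, key=lambda e: e["eval_loss"])
print(f"{len(train_log)} train entries, {len(eval_log)} eval entries")
print(f"lowest eval_loss {best['eval_loss']:.4f} at step {best['step']}")

# When the Trainer was configured to track a best model, the state also
# records its path at top level (read with .get in case it is absent).
print("best checkpoint:", state.get("best_model_checkpoint"))
```

Cross-checking against the log above, the sketch should report the minimum eval loss at step 500, consistent with the final evaluation at step 512 matching that same value (the best model is reloaded at the end of training).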