JW17's picture
Add files using upload-large-folder tool
a117c42 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 500,
"global_step": 80,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"Batch Mean": 2.561767578125,
"accuracy": 0.375,
"epoch": 0,
"step": 0
},
{
"Batch Mean": 2.625244140625,
"accuracy": 0.5,
"epoch": 0,
"step": 0
},
{
"Batch Mean": 2.579345703125,
"accuracy": 0.6875,
"epoch": 0,
"step": 0
},
{
"Batch Mean": 2.6171875,
"accuracy": 0.34375,
"epoch": 0,
"step": 0
},
{
"epoch": 0.0025,
"grad_norm": 6.891047954559326,
"learning_rate": 1.5000000000000002e-07,
"loss": 0.6968,
"step": 1
},
{
"Batch Mean": 2.593505859375,
"accuracy": 0.5,
"epoch": 0.0025,
"step": 1
},
{
"Batch Mean": 2.5416259765625,
"accuracy": 0.5,
"epoch": 0.0025,
"step": 1
},
{
"Batch Mean": 2.53125,
"accuracy": 0.5,
"epoch": 0.0025,
"step": 1
},
{
"Batch Mean": 2.525146484375,
"accuracy": 0.5625,
"epoch": 0.0025,
"step": 1
},
{
"epoch": 0.005,
"grad_norm": 7.476072311401367,
"learning_rate": 3.0000000000000004e-07,
"loss": 0.6855,
"step": 2
},
{
"Batch Mean": 2.633056640625,
"accuracy": 0.5,
"epoch": 0.005,
"step": 2
},
{
"Batch Mean": 2.579345703125,
"accuracy": 0.4375,
"epoch": 0.005,
"step": 2
},
{
"Batch Mean": 2.6005859375,
"accuracy": 0.4375,
"epoch": 0.005,
"step": 2
},
{
"Batch Mean": 2.56005859375,
"accuracy": 0.59375,
"epoch": 0.005,
"step": 2
},
{
"epoch": 0.0075,
"grad_norm": 9.779342651367188,
"learning_rate": 4.5e-07,
"loss": 0.6984,
"step": 3
},
{
"Batch Mean": 2.668701171875,
"accuracy": 0.5625,
"epoch": 0.0075,
"step": 3
},
{
"Batch Mean": 2.6484375,
"accuracy": 0.375,
"epoch": 0.0075,
"step": 3
},
{
"Batch Mean": 2.505126953125,
"accuracy": 0.53125,
"epoch": 0.0075,
"step": 3
},
{
"Batch Mean": 2.651611328125,
"accuracy": 0.53125,
"epoch": 0.0075,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 7.470857620239258,
"learning_rate": 6.000000000000001e-07,
"loss": 0.6943,
"step": 4
},
{
"Batch Mean": 2.631591796875,
"accuracy": 0.53125,
"epoch": 0.01,
"step": 4
},
{
"Batch Mean": 2.61376953125,
"accuracy": 0.5625,
"epoch": 0.01,
"step": 4
},
{
"Batch Mean": 2.5850830078125,
"accuracy": 0.46875,
"epoch": 0.01,
"step": 4
},
{
"Batch Mean": 2.572509765625,
"accuracy": 0.5625,
"epoch": 0.01,
"step": 4
},
{
"epoch": 0.0125,
"grad_norm": 4.961440563201904,
"learning_rate": 7.5e-07,
"loss": 0.6986,
"step": 5
},
{
"Batch Mean": 2.603271484375,
"accuracy": 0.5,
"epoch": 0.0125,
"step": 5
},
{
"Batch Mean": 2.56103515625,
"accuracy": 0.46875,
"epoch": 0.0125,
"step": 5
},
{
"Batch Mean": 2.60791015625,
"accuracy": 0.5,
"epoch": 0.0125,
"step": 5
},
{
"Batch Mean": 2.577392578125,
"accuracy": 0.5,
"epoch": 0.0125,
"step": 5
},
{
"epoch": 0.015,
"grad_norm": 8.984463691711426,
"learning_rate": 9e-07,
"loss": 0.6933,
"step": 6
},
{
"Batch Mean": 2.61767578125,
"accuracy": 0.46875,
"epoch": 0.015,
"step": 6
},
{
"Batch Mean": 2.596435546875,
"accuracy": 0.4375,
"epoch": 0.015,
"step": 6
},
{
"Batch Mean": 2.617919921875,
"accuracy": 0.5,
"epoch": 0.015,
"step": 6
},
{
"Batch Mean": 2.5743408203125,
"accuracy": 0.4375,
"epoch": 0.015,
"step": 6
},
{
"epoch": 0.0175,
"grad_norm": 5.3602728843688965,
"learning_rate": 1.05e-06,
"loss": 0.7051,
"step": 7
},
{
"Batch Mean": 2.658447265625,
"accuracy": 0.5625,
"epoch": 0.0175,
"step": 7
},
{
"Batch Mean": 2.598388671875,
"accuracy": 0.5625,
"epoch": 0.0175,
"step": 7
},
{
"Batch Mean": 2.5712890625,
"accuracy": 0.40625,
"epoch": 0.0175,
"step": 7
},
{
"Batch Mean": 2.60302734375,
"accuracy": 0.5,
"epoch": 0.0175,
"step": 7
},
{
"epoch": 0.02,
"grad_norm": 6.917173385620117,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.6994,
"step": 8
},
{
"Batch Mean": 2.6435546875,
"accuracy": 0.46875,
"epoch": 0.02,
"step": 8
},
{
"Batch Mean": 2.58740234375,
"accuracy": 0.4375,
"epoch": 0.02,
"step": 8
},
{
"Batch Mean": 2.589111328125,
"accuracy": 0.46875,
"epoch": 0.02,
"step": 8
},
{
"Batch Mean": 2.61767578125,
"accuracy": 0.53125,
"epoch": 0.02,
"step": 8
},
{
"epoch": 0.0225,
"grad_norm": 6.093692302703857,
"learning_rate": 1.35e-06,
"loss": 0.6999,
"step": 9
},
{
"Batch Mean": 2.69091796875,
"accuracy": 0.59375,
"epoch": 0.0225,
"step": 9
},
{
"Batch Mean": 2.63525390625,
"accuracy": 0.5625,
"epoch": 0.0225,
"step": 9
},
{
"Batch Mean": 2.607421875,
"accuracy": 0.46875,
"epoch": 0.0225,
"step": 9
},
{
"Batch Mean": 2.65185546875,
"accuracy": 0.40625,
"epoch": 0.0225,
"step": 9
},
{
"epoch": 0.025,
"grad_norm": 6.671854496002197,
"learning_rate": 1.5e-06,
"loss": 0.6841,
"step": 10
},
{
"Batch Mean": 2.681640625,
"accuracy": 0.5625,
"epoch": 0.025,
"step": 10
},
{
"Batch Mean": 2.698974609375,
"accuracy": 0.53125,
"epoch": 0.025,
"step": 10
},
{
"Batch Mean": 2.645751953125,
"accuracy": 0.625,
"epoch": 0.025,
"step": 10
},
{
"Batch Mean": 2.648681640625,
"accuracy": 0.59375,
"epoch": 0.025,
"step": 10
},
{
"epoch": 0.0275,
"grad_norm": 7.0144758224487305,
"learning_rate": 1.65e-06,
"loss": 0.6924,
"step": 11
},
{
"Batch Mean": 2.687255859375,
"accuracy": 0.5625,
"epoch": 0.0275,
"step": 11
},
{
"Batch Mean": 2.7158203125,
"accuracy": 0.625,
"epoch": 0.0275,
"step": 11
},
{
"Batch Mean": 2.649658203125,
"accuracy": 0.4375,
"epoch": 0.0275,
"step": 11
},
{
"Batch Mean": 2.67041015625,
"accuracy": 0.53125,
"epoch": 0.0275,
"step": 11
},
{
"epoch": 0.03,
"grad_norm": 7.060955047607422,
"learning_rate": 1.8e-06,
"loss": 0.6724,
"step": 12
},
{
"Batch Mean": 2.6439208984375,
"accuracy": 0.6875,
"epoch": 0.03,
"step": 12
},
{
"Batch Mean": 2.66748046875,
"accuracy": 0.59375,
"epoch": 0.03,
"step": 12
},
{
"Batch Mean": 2.59814453125,
"accuracy": 0.59375,
"epoch": 0.03,
"step": 12
},
{
"Batch Mean": 2.6533203125,
"accuracy": 0.46875,
"epoch": 0.03,
"step": 12
},
{
"epoch": 0.0325,
"grad_norm": 7.240173816680908,
"learning_rate": 1.95e-06,
"loss": 0.6762,
"step": 13
},
{
"Batch Mean": 2.741943359375,
"accuracy": 0.5625,
"epoch": 0.0325,
"step": 13
},
{
"Batch Mean": 2.7222900390625,
"accuracy": 0.8125,
"epoch": 0.0325,
"step": 13
},
{
"Batch Mean": 2.854736328125,
"accuracy": 0.59375,
"epoch": 0.0325,
"step": 13
},
{
"Batch Mean": 2.762451171875,
"accuracy": 0.59375,
"epoch": 0.0325,
"step": 13
},
{
"epoch": 0.035,
"grad_norm": 6.017094612121582,
"learning_rate": 2.1e-06,
"loss": 0.6564,
"step": 14
},
{
"Batch Mean": 2.851318359375,
"accuracy": 0.4375,
"epoch": 0.035,
"step": 14
},
{
"Batch Mean": 2.780029296875,
"accuracy": 0.75,
"epoch": 0.035,
"step": 14
},
{
"Batch Mean": 2.72509765625,
"accuracy": 0.5625,
"epoch": 0.035,
"step": 14
},
{
"Batch Mean": 2.74609375,
"accuracy": 0.5,
"epoch": 0.035,
"step": 14
},
{
"epoch": 0.0375,
"grad_norm": 7.460608959197998,
"learning_rate": 2.25e-06,
"loss": 0.6804,
"step": 15
},
{
"Batch Mean": 2.7120361328125,
"accuracy": 0.625,
"epoch": 0.0375,
"step": 15
},
{
"Batch Mean": 2.6575927734375,
"accuracy": 0.78125,
"epoch": 0.0375,
"step": 15
},
{
"Batch Mean": 2.6646881103515625,
"accuracy": 0.5625,
"epoch": 0.0375,
"step": 15
},
{
"Batch Mean": 2.817626953125,
"accuracy": 0.5625,
"epoch": 0.0375,
"step": 15
},
{
"epoch": 0.04,
"grad_norm": 5.134560585021973,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.6439,
"step": 16
},
{
"Batch Mean": 2.87158203125,
"accuracy": 0.625,
"epoch": 0.04,
"step": 16
},
{
"Batch Mean": 2.80712890625,
"accuracy": 0.875,
"epoch": 0.04,
"step": 16
},
{
"Batch Mean": 2.92333984375,
"accuracy": 0.65625,
"epoch": 0.04,
"step": 16
},
{
"Batch Mean": 2.859375,
"accuracy": 0.6875,
"epoch": 0.04,
"step": 16
},
{
"epoch": 0.0425,
"grad_norm": 9.069143295288086,
"learning_rate": 2.55e-06,
"loss": 0.6009,
"step": 17
},
{
"Batch Mean": 2.988525390625,
"accuracy": 0.46875,
"epoch": 0.0425,
"step": 17
},
{
"Batch Mean": 2.9150390625,
"accuracy": 0.46875,
"epoch": 0.0425,
"step": 17
},
{
"Batch Mean": 2.9075927734375,
"accuracy": 0.65625,
"epoch": 0.0425,
"step": 17
},
{
"Batch Mean": 2.949462890625,
"accuracy": 0.5625,
"epoch": 0.0425,
"step": 17
},
{
"epoch": 0.045,
"grad_norm": 7.171381950378418,
"learning_rate": 2.7e-06,
"loss": 0.6478,
"step": 18
},
{
"Batch Mean": 2.8892822265625,
"accuracy": 0.75,
"epoch": 0.045,
"step": 18
},
{
"Batch Mean": 3.0479736328125,
"accuracy": 0.59375,
"epoch": 0.045,
"step": 18
},
{
"Batch Mean": 2.951904296875,
"accuracy": 0.78125,
"epoch": 0.045,
"step": 18
},
{
"Batch Mean": 2.9471435546875,
"accuracy": 0.8125,
"epoch": 0.045,
"step": 18
},
{
"epoch": 0.0475,
"grad_norm": 9.2951078414917,
"learning_rate": 2.85e-06,
"loss": 0.5669,
"step": 19
},
{
"Batch Mean": 2.9896240234375,
"accuracy": 0.5625,
"epoch": 0.0475,
"step": 19
},
{
"Batch Mean": 3.28076171875,
"accuracy": 0.59375,
"epoch": 0.0475,
"step": 19
},
{
"Batch Mean": 3.0648193359375,
"accuracy": 0.78125,
"epoch": 0.0475,
"step": 19
},
{
"Batch Mean": 3.02362060546875,
"accuracy": 0.59375,
"epoch": 0.0475,
"step": 19
},
{
"epoch": 0.05,
"grad_norm": 6.917195796966553,
"learning_rate": 3e-06,
"loss": 0.6227,
"step": 20
},
{
"Batch Mean": 3.187744140625,
"accuracy": 0.65625,
"epoch": 0.05,
"step": 20
},
{
"Batch Mean": 2.88177490234375,
"accuracy": 0.6875,
"epoch": 0.05,
"step": 20
},
{
"Batch Mean": 3.10791015625,
"accuracy": 0.8125,
"epoch": 0.05,
"step": 20
},
{
"Batch Mean": 2.94873046875,
"accuracy": 0.625,
"epoch": 0.05,
"step": 20
},
{
"epoch": 0.0525,
"grad_norm": 8.974556922912598,
"learning_rate": 2.992105263157895e-06,
"loss": 0.5974,
"step": 21
},
{
"Batch Mean": 3.12005615234375,
"accuracy": 0.5625,
"epoch": 0.0525,
"step": 21
},
{
"Batch Mean": 3.238006591796875,
"accuracy": 0.78125,
"epoch": 0.0525,
"step": 21
},
{
"Batch Mean": 3.1559066772460938,
"accuracy": 0.625,
"epoch": 0.0525,
"step": 21
},
{
"Batch Mean": 2.941864013671875,
"accuracy": 0.8125,
"epoch": 0.0525,
"step": 21
},
{
"epoch": 0.055,
"grad_norm": 6.266777038574219,
"learning_rate": 2.9842105263157896e-06,
"loss": 0.5654,
"step": 22
},
{
"Batch Mean": 3.0380859375,
"accuracy": 0.625,
"epoch": 0.055,
"step": 22
},
{
"Batch Mean": 3.34686279296875,
"accuracy": 0.625,
"epoch": 0.055,
"step": 22
},
{
"Batch Mean": 3.39178466796875,
"accuracy": 0.75,
"epoch": 0.055,
"step": 22
},
{
"Batch Mean": 3.215087890625,
"accuracy": 0.78125,
"epoch": 0.055,
"step": 22
},
{
"epoch": 0.0575,
"grad_norm": 12.593354225158691,
"learning_rate": 2.9763157894736843e-06,
"loss": 0.6144,
"step": 23
},
{
"Batch Mean": 3.6195030212402344,
"accuracy": 0.8125,
"epoch": 0.0575,
"step": 23
},
{
"Batch Mean": 3.9134521484375,
"accuracy": 0.4375,
"epoch": 0.0575,
"step": 23
},
{
"Batch Mean": 3.4212799072265625,
"accuracy": 0.6875,
"epoch": 0.0575,
"step": 23
},
{
"Batch Mean": 3.5451812744140625,
"accuracy": 0.6875,
"epoch": 0.0575,
"step": 23
},
{
"epoch": 0.06,
"grad_norm": 11.987595558166504,
"learning_rate": 2.968421052631579e-06,
"loss": 0.643,
"step": 24
},
{
"Batch Mean": 4.074462890625,
"accuracy": 0.71875,
"epoch": 0.06,
"step": 24
},
{
"Batch Mean": 3.9752197265625,
"accuracy": 0.75,
"epoch": 0.06,
"step": 24
},
{
"Batch Mean": 3.98480224609375,
"accuracy": 0.6875,
"epoch": 0.06,
"step": 24
},
{
"Batch Mean": 4.22607421875,
"accuracy": 0.8125,
"epoch": 0.06,
"step": 24
},
{
"epoch": 0.0625,
"grad_norm": 9.740213394165039,
"learning_rate": 2.960526315789474e-06,
"loss": 0.5295,
"step": 25
},
{
"Batch Mean": 4.26507568359375,
"accuracy": 0.6875,
"epoch": 0.0625,
"step": 25
},
{
"Batch Mean": 4.357688903808594,
"accuracy": 0.8125,
"epoch": 0.0625,
"step": 25
},
{
"Batch Mean": 4.076980113983154,
"accuracy": 0.59375,
"epoch": 0.0625,
"step": 25
},
{
"Batch Mean": 4.0531005859375,
"accuracy": 0.65625,
"epoch": 0.0625,
"step": 25
},
{
"epoch": 0.065,
"grad_norm": 14.751266479492188,
"learning_rate": 2.9526315789473685e-06,
"loss": 0.6701,
"step": 26
},
{
"Batch Mean": 4.2257080078125,
"accuracy": 0.65625,
"epoch": 0.065,
"step": 26
},
{
"Batch Mean": 3.6409378051757812,
"accuracy": 0.625,
"epoch": 0.065,
"step": 26
},
{
"Batch Mean": 4.32666015625,
"accuracy": 0.75,
"epoch": 0.065,
"step": 26
},
{
"Batch Mean": 4.223388671875,
"accuracy": 0.75,
"epoch": 0.065,
"step": 26
},
{
"epoch": 0.0675,
"grad_norm": 12.241596221923828,
"learning_rate": 2.9447368421052633e-06,
"loss": 0.6132,
"step": 27
},
{
"Batch Mean": 4.0788421630859375,
"accuracy": 0.75,
"epoch": 0.0675,
"step": 27
},
{
"Batch Mean": 4.0919647216796875,
"accuracy": 0.78125,
"epoch": 0.0675,
"step": 27
},
{
"Batch Mean": 3.9208831787109375,
"accuracy": 0.75,
"epoch": 0.0675,
"step": 27
},
{
"Batch Mean": 3.6530914306640625,
"accuracy": 0.78125,
"epoch": 0.0675,
"step": 27
},
{
"epoch": 0.07,
"grad_norm": 9.23661994934082,
"learning_rate": 2.936842105263158e-06,
"loss": 0.491,
"step": 28
},
{
"Batch Mean": 3.82781982421875,
"accuracy": 0.6875,
"epoch": 0.07,
"step": 28
},
{
"Batch Mean": 3.8771190643310547,
"accuracy": 0.75,
"epoch": 0.07,
"step": 28
},
{
"Batch Mean": 4.152229309082031,
"accuracy": 0.75,
"epoch": 0.07,
"step": 28
},
{
"Batch Mean": 3.3394012451171875,
"accuracy": 0.6875,
"epoch": 0.07,
"step": 28
},
{
"epoch": 0.0725,
"grad_norm": 8.272553443908691,
"learning_rate": 2.9289473684210528e-06,
"loss": 0.4942,
"step": 29
},
{
"Batch Mean": 3.72723388671875,
"accuracy": 0.625,
"epoch": 0.0725,
"step": 29
},
{
"Batch Mean": 3.4618988037109375,
"accuracy": 0.78125,
"epoch": 0.0725,
"step": 29
},
{
"Batch Mean": 4.0829925537109375,
"accuracy": 0.8125,
"epoch": 0.0725,
"step": 29
},
{
"Batch Mean": 3.5148162841796875,
"accuracy": 0.6875,
"epoch": 0.0725,
"step": 29
},
{
"epoch": 0.075,
"grad_norm": 8.980628967285156,
"learning_rate": 2.9210526315789475e-06,
"loss": 0.5918,
"step": 30
},
{
"Batch Mean": 3.8092041015625,
"accuracy": 0.625,
"epoch": 0.075,
"step": 30
},
{
"Batch Mean": 3.889404296875,
"accuracy": 0.5625,
"epoch": 0.075,
"step": 30
},
{
"Batch Mean": 3.994617462158203,
"accuracy": 0.78125,
"epoch": 0.075,
"step": 30
},
{
"Batch Mean": 3.82794189453125,
"accuracy": 0.875,
"epoch": 0.075,
"step": 30
},
{
"epoch": 0.0775,
"grad_norm": 8.450087547302246,
"learning_rate": 2.9131578947368423e-06,
"loss": 0.5721,
"step": 31
},
{
"Batch Mean": 3.980316162109375,
"accuracy": 0.6875,
"epoch": 0.0775,
"step": 31
},
{
"Batch Mean": 4.373779296875,
"accuracy": 0.625,
"epoch": 0.0775,
"step": 31
},
{
"Batch Mean": 4.037841796875,
"accuracy": 0.75,
"epoch": 0.0775,
"step": 31
},
{
"Batch Mean": 3.831298828125,
"accuracy": 0.75,
"epoch": 0.0775,
"step": 31
},
{
"epoch": 0.08,
"grad_norm": 8.705353736877441,
"learning_rate": 2.905263157894737e-06,
"loss": 0.5436,
"step": 32
},
{
"Batch Mean": 3.7490234375,
"accuracy": 0.8125,
"epoch": 0.08,
"step": 32
},
{
"Batch Mean": 3.83935546875,
"accuracy": 0.84375,
"epoch": 0.08,
"step": 32
},
{
"Batch Mean": 3.965576171875,
"accuracy": 0.71875,
"epoch": 0.08,
"step": 32
},
{
"Batch Mean": 3.752685546875,
"accuracy": 0.71875,
"epoch": 0.08,
"step": 32
},
{
"epoch": 0.0825,
"grad_norm": 8.69457721710205,
"learning_rate": 2.8973684210526318e-06,
"loss": 0.4884,
"step": 33
},
{
"Batch Mean": 3.9962158203125,
"accuracy": 0.84375,
"epoch": 0.0825,
"step": 33
},
{
"Batch Mean": 3.93798828125,
"accuracy": 0.90625,
"epoch": 0.0825,
"step": 33
},
{
"Batch Mean": 3.80908203125,
"accuracy": 0.90625,
"epoch": 0.0825,
"step": 33
},
{
"Batch Mean": 3.912109375,
"accuracy": 0.75,
"epoch": 0.0825,
"step": 33
},
{
"epoch": 0.085,
"grad_norm": 7.060486316680908,
"learning_rate": 2.8894736842105265e-06,
"loss": 0.4117,
"step": 34
},
{
"Batch Mean": 3.6049652099609375,
"accuracy": 0.78125,
"epoch": 0.085,
"step": 34
},
{
"Batch Mean": 3.53125,
"accuracy": 0.71875,
"epoch": 0.085,
"step": 34
},
{
"Batch Mean": 3.86993408203125,
"accuracy": 0.75,
"epoch": 0.085,
"step": 34
},
{
"Batch Mean": 3.9173583984375,
"accuracy": 0.59375,
"epoch": 0.085,
"step": 34
},
{
"epoch": 0.0875,
"grad_norm": 7.442059516906738,
"learning_rate": 2.8815789473684213e-06,
"loss": 0.5188,
"step": 35
},
{
"Batch Mean": 3.7060546875,
"accuracy": 0.71875,
"epoch": 0.0875,
"step": 35
},
{
"Batch Mean": 3.6419677734375,
"accuracy": 0.84375,
"epoch": 0.0875,
"step": 35
},
{
"Batch Mean": 3.79437255859375,
"accuracy": 0.71875,
"epoch": 0.0875,
"step": 35
},
{
"Batch Mean": 3.9708251953125,
"accuracy": 0.71875,
"epoch": 0.0875,
"step": 35
},
{
"epoch": 0.09,
"grad_norm": 7.1602888107299805,
"learning_rate": 2.873684210526316e-06,
"loss": 0.4938,
"step": 36
},
{
"Batch Mean": 3.74139404296875,
"accuracy": 0.625,
"epoch": 0.09,
"step": 36
},
{
"Batch Mean": 4.314697265625,
"accuracy": 0.625,
"epoch": 0.09,
"step": 36
},
{
"Batch Mean": 3.3461456298828125,
"accuracy": 0.75,
"epoch": 0.09,
"step": 36
},
{
"Batch Mean": 3.1932830810546875,
"accuracy": 0.78125,
"epoch": 0.09,
"step": 36
},
{
"epoch": 0.0925,
"grad_norm": 7.272737503051758,
"learning_rate": 2.8657894736842103e-06,
"loss": 0.5001,
"step": 37
},
{
"Batch Mean": 4.018646240234375,
"accuracy": 0.75,
"epoch": 0.0925,
"step": 37
},
{
"Batch Mean": 3.30047607421875,
"accuracy": 0.71875,
"epoch": 0.0925,
"step": 37
},
{
"Batch Mean": 3.917877197265625,
"accuracy": 0.78125,
"epoch": 0.0925,
"step": 37
},
{
"Batch Mean": 3.60589599609375,
"accuracy": 0.78125,
"epoch": 0.0925,
"step": 37
},
{
"epoch": 0.095,
"grad_norm": 7.850794792175293,
"learning_rate": 2.857894736842105e-06,
"loss": 0.5267,
"step": 38
},
{
"Batch Mean": 4.016511917114258,
"accuracy": 0.84375,
"epoch": 0.095,
"step": 38
},
{
"Batch Mean": 3.8358612060546875,
"accuracy": 0.71875,
"epoch": 0.095,
"step": 38
},
{
"Batch Mean": 4.089599609375,
"accuracy": 0.875,
"epoch": 0.095,
"step": 38
},
{
"Batch Mean": 3.9793701171875,
"accuracy": 0.65625,
"epoch": 0.095,
"step": 38
},
{
"epoch": 0.0975,
"grad_norm": 7.21096134185791,
"learning_rate": 2.85e-06,
"loss": 0.4716,
"step": 39
},
{
"Batch Mean": 3.28546142578125,
"accuracy": 0.78125,
"epoch": 0.0975,
"step": 39
},
{
"Batch Mean": 3.844146728515625,
"accuracy": 0.875,
"epoch": 0.0975,
"step": 39
},
{
"Batch Mean": 4.03277587890625,
"accuracy": 0.84375,
"epoch": 0.0975,
"step": 39
},
{
"Batch Mean": 3.28216552734375,
"accuracy": 0.78125,
"epoch": 0.0975,
"step": 39
},
{
"epoch": 0.1,
"grad_norm": 11.831915855407715,
"learning_rate": 2.8421052631578946e-06,
"loss": 0.424,
"step": 40
},
{
"Batch Mean": 4.2200927734375,
"accuracy": 0.6875,
"epoch": 0.1,
"step": 40
},
{
"Batch Mean": 3.5522689819335938,
"accuracy": 0.84375,
"epoch": 0.1,
"step": 40
},
{
"Batch Mean": 3.92266845703125,
"accuracy": 0.8125,
"epoch": 0.1,
"step": 40
},
{
"Batch Mean": 3.9715576171875,
"accuracy": 0.90625,
"epoch": 0.1,
"step": 40
},
{
"epoch": 0.1025,
"grad_norm": 8.1405668258667,
"learning_rate": 2.8342105263157897e-06,
"loss": 0.424,
"step": 41
},
{
"Batch Mean": 4.1801605224609375,
"accuracy": 0.75,
"epoch": 0.1025,
"step": 41
},
{
"Batch Mean": 4.2984619140625,
"accuracy": 0.75,
"epoch": 0.1025,
"step": 41
},
{
"Batch Mean": 4.1153564453125,
"accuracy": 0.75,
"epoch": 0.1025,
"step": 41
},
{
"Batch Mean": 4.0693817138671875,
"accuracy": 0.71875,
"epoch": 0.1025,
"step": 41
},
{
"epoch": 0.105,
"grad_norm": 13.232189178466797,
"learning_rate": 2.8263157894736845e-06,
"loss": 0.5667,
"step": 42
},
{
"Batch Mean": 3.77996826171875,
"accuracy": 0.71875,
"epoch": 0.105,
"step": 42
},
{
"Batch Mean": 3.576416015625,
"accuracy": 0.75,
"epoch": 0.105,
"step": 42
},
{
"Batch Mean": 4.18341064453125,
"accuracy": 0.8125,
"epoch": 0.105,
"step": 42
},
{
"Batch Mean": 4.179298400878906,
"accuracy": 0.84375,
"epoch": 0.105,
"step": 42
},
{
"epoch": 0.1075,
"grad_norm": 10.862428665161133,
"learning_rate": 2.8184210526315792e-06,
"loss": 0.5047,
"step": 43
},
{
"Batch Mean": 3.411527633666992,
"accuracy": 0.65625,
"epoch": 0.1075,
"step": 43
},
{
"Batch Mean": 3.7935562133789062,
"accuracy": 0.78125,
"epoch": 0.1075,
"step": 43
},
{
"Batch Mean": 4.264190673828125,
"accuracy": 0.625,
"epoch": 0.1075,
"step": 43
},
{
"Batch Mean": 3.1276397705078125,
"accuracy": 0.8125,
"epoch": 0.1075,
"step": 43
},
{
"epoch": 0.11,
"grad_norm": 10.28708267211914,
"learning_rate": 2.810526315789474e-06,
"loss": 0.4958,
"step": 44
},
{
"Batch Mean": 3.398651123046875,
"accuracy": 0.8125,
"epoch": 0.11,
"step": 44
},
{
"Batch Mean": 3.4655685424804688,
"accuracy": 0.90625,
"epoch": 0.11,
"step": 44
},
{
"Batch Mean": 3.491424560546875,
"accuracy": 0.75,
"epoch": 0.11,
"step": 44
},
{
"Batch Mean": 4.3106536865234375,
"accuracy": 0.75,
"epoch": 0.11,
"step": 44
},
{
"epoch": 0.1125,
"grad_norm": 9.006805419921875,
"learning_rate": 2.8026315789473687e-06,
"loss": 0.4885,
"step": 45
},
{
"Batch Mean": 3.865081787109375,
"accuracy": 0.8125,
"epoch": 0.1125,
"step": 45
},
{
"Batch Mean": 2.941986083984375,
"accuracy": 0.75,
"epoch": 0.1125,
"step": 45
},
{
"Batch Mean": 3.431640625,
"accuracy": 0.90625,
"epoch": 0.1125,
"step": 45
},
{
"Batch Mean": 3.38702392578125,
"accuracy": 0.78125,
"epoch": 0.1125,
"step": 45
},
{
"epoch": 0.115,
"grad_norm": 8.483081817626953,
"learning_rate": 2.7947368421052635e-06,
"loss": 0.4436,
"step": 46
},
{
"Batch Mean": 3.768218994140625,
"accuracy": 0.71875,
"epoch": 0.115,
"step": 46
},
{
"Batch Mean": 2.960357666015625,
"accuracy": 0.78125,
"epoch": 0.115,
"step": 46
},
{
"Batch Mean": 3.8204345703125,
"accuracy": 0.90625,
"epoch": 0.115,
"step": 46
},
{
"Batch Mean": 3.39542293548584,
"accuracy": 0.78125,
"epoch": 0.115,
"step": 46
},
{
"epoch": 0.1175,
"grad_norm": 13.869011878967285,
"learning_rate": 2.7868421052631578e-06,
"loss": 0.4939,
"step": 47
},
{
"Batch Mean": 3.5629959106445312,
"accuracy": 0.65625,
"epoch": 0.1175,
"step": 47
},
{
"Batch Mean": 3.121295928955078,
"accuracy": 0.8125,
"epoch": 0.1175,
"step": 47
},
{
"Batch Mean": 3.243316650390625,
"accuracy": 0.8125,
"epoch": 0.1175,
"step": 47
},
{
"Batch Mean": 3.4634838104248047,
"accuracy": 0.90625,
"epoch": 0.1175,
"step": 47
},
{
"epoch": 0.12,
"grad_norm": 10.390000343322754,
"learning_rate": 2.7789473684210525e-06,
"loss": 0.4101,
"step": 48
},
{
"Batch Mean": 2.8801422119140625,
"accuracy": 0.875,
"epoch": 0.12,
"step": 48
},
{
"Batch Mean": 2.7355971336364746,
"accuracy": 0.59375,
"epoch": 0.12,
"step": 48
},
{
"Batch Mean": 3.0902557373046875,
"accuracy": 0.8125,
"epoch": 0.12,
"step": 48
},
{
"Batch Mean": 2.7920761108398438,
"accuracy": 0.8125,
"epoch": 0.12,
"step": 48
},
{
"epoch": 0.1225,
"grad_norm": 7.682050704956055,
"learning_rate": 2.7710526315789473e-06,
"loss": 0.4488,
"step": 49
},
{
"Batch Mean": 2.1150331497192383,
"accuracy": 0.78125,
"epoch": 0.1225,
"step": 49
},
{
"Batch Mean": 2.1563758850097656,
"accuracy": 0.625,
"epoch": 0.1225,
"step": 49
},
{
"Batch Mean": 2.22674560546875,
"accuracy": 0.78125,
"epoch": 0.1225,
"step": 49
},
{
"Batch Mean": 2.350677490234375,
"accuracy": 0.625,
"epoch": 0.1225,
"step": 49
},
{
"epoch": 0.125,
"grad_norm": 10.955310821533203,
"learning_rate": 2.763157894736842e-06,
"loss": 0.543,
"step": 50
},
{
"Batch Mean": 1.7153472900390625,
"accuracy": 0.6875,
"epoch": 0.125,
"step": 50
},
{
"Batch Mean": 1.4781265258789062,
"accuracy": 0.71875,
"epoch": 0.125,
"step": 50
},
{
"Batch Mean": 1.9790096282958984,
"accuracy": 0.8125,
"epoch": 0.125,
"step": 50
},
{
"Batch Mean": 2.209127426147461,
"accuracy": 0.875,
"epoch": 0.125,
"step": 50
},
{
"epoch": 0.1275,
"grad_norm": 8.823392868041992,
"learning_rate": 2.7552631578947368e-06,
"loss": 0.4216,
"step": 51
},
{
"Batch Mean": 1.8213386535644531,
"accuracy": 0.875,
"epoch": 0.1275,
"step": 51
},
{
"Batch Mean": 1.600327491760254,
"accuracy": 0.84375,
"epoch": 0.1275,
"step": 51
},
{
"Batch Mean": 1.6077766418457031,
"accuracy": 0.71875,
"epoch": 0.1275,
"step": 51
},
{
"Batch Mean": 1.3565635681152344,
"accuracy": 0.8125,
"epoch": 0.1275,
"step": 51
},
{
"epoch": 0.13,
"grad_norm": 9.650039672851562,
"learning_rate": 2.7473684210526315e-06,
"loss": 0.4226,
"step": 52
},
{
"Batch Mean": 1.4758195877075195,
"accuracy": 0.875,
"epoch": 0.13,
"step": 52
},
{
"Batch Mean": 1.5553207397460938,
"accuracy": 0.90625,
"epoch": 0.13,
"step": 52
},
{
"Batch Mean": 1.637298583984375,
"accuracy": 0.71875,
"epoch": 0.13,
"step": 52
},
{
"Batch Mean": 1.4313430786132812,
"accuracy": 0.8125,
"epoch": 0.13,
"step": 52
},
{
"epoch": 0.1325,
"grad_norm": 8.510551452636719,
"learning_rate": 2.7394736842105263e-06,
"loss": 0.4413,
"step": 53
},
{
"Batch Mean": 1.7805156707763672,
"accuracy": 0.8125,
"epoch": 0.1325,
"step": 53
},
{
"Batch Mean": 1.9161019325256348,
"accuracy": 0.6875,
"epoch": 0.1325,
"step": 53
},
{
"Batch Mean": 1.6803773641586304,
"accuracy": 0.78125,
"epoch": 0.1325,
"step": 53
},
{
"Batch Mean": 2.052886962890625,
"accuracy": 0.84375,
"epoch": 0.1325,
"step": 53
},
{
"epoch": 0.135,
"grad_norm": 7.554328441619873,
"learning_rate": 2.7315789473684214e-06,
"loss": 0.4647,
"step": 54
},
{
"Batch Mean": 2.4049549102783203,
"accuracy": 0.6875,
"epoch": 0.135,
"step": 54
},
{
"Batch Mean": 1.8316669464111328,
"accuracy": 0.71875,
"epoch": 0.135,
"step": 54
},
{
"Batch Mean": 1.7167816162109375,
"accuracy": 0.75,
"epoch": 0.135,
"step": 54
},
{
"Batch Mean": 1.8629379272460938,
"accuracy": 0.6875,
"epoch": 0.135,
"step": 54
},
{
"epoch": 0.1375,
"grad_norm": 12.288453102111816,
"learning_rate": 2.723684210526316e-06,
"loss": 0.5379,
"step": 55
},
{
"Batch Mean": 2.3260297775268555,
"accuracy": 0.6875,
"epoch": 0.1375,
"step": 55
},
{
"Batch Mean": 2.1811676025390625,
"accuracy": 0.75,
"epoch": 0.1375,
"step": 55
},
{
"Batch Mean": 2.3114492893218994,
"accuracy": 0.78125,
"epoch": 0.1375,
"step": 55
},
{
"Batch Mean": 2.71484375,
"accuracy": 0.75,
"epoch": 0.1375,
"step": 55
},
{
"epoch": 0.14,
"grad_norm": 11.225523948669434,
"learning_rate": 2.715789473684211e-06,
"loss": 0.4975,
"step": 56
},
{
"Batch Mean": 2.4541587829589844,
"accuracy": 0.8125,
"epoch": 0.14,
"step": 56
},
{
"Batch Mean": 2.656031608581543,
"accuracy": 0.71875,
"epoch": 0.14,
"step": 56
},
{
"Batch Mean": 2.3141021728515625,
"accuracy": 0.71875,
"epoch": 0.14,
"step": 56
},
{
"Batch Mean": 2.468475341796875,
"accuracy": 0.78125,
"epoch": 0.14,
"step": 56
},
{
"epoch": 0.1425,
"grad_norm": 9.733325958251953,
"learning_rate": 2.7078947368421052e-06,
"loss": 0.5559,
"step": 57
},
{
"Batch Mean": 2.35528564453125,
"accuracy": 0.8125,
"epoch": 0.1425,
"step": 57
},
{
"Batch Mean": 2.867542266845703,
"accuracy": 0.8125,
"epoch": 0.1425,
"step": 57
},
{
"Batch Mean": 2.365894317626953,
"accuracy": 0.6875,
"epoch": 0.1425,
"step": 57
},
{
"Batch Mean": 2.246662139892578,
"accuracy": 0.78125,
"epoch": 0.1425,
"step": 57
},
{
"epoch": 0.145,
"grad_norm": 8.220906257629395,
"learning_rate": 2.7e-06,
"loss": 0.4247,
"step": 58
},
{
"Batch Mean": 2.3054046630859375,
"accuracy": 0.78125,
"epoch": 0.145,
"step": 58
},
{
"Batch Mean": 2.5247726440429688,
"accuracy": 0.90625,
"epoch": 0.145,
"step": 58
},
{
"Batch Mean": 2.8898544311523438,
"accuracy": 0.90625,
"epoch": 0.145,
"step": 58
},
{
"Batch Mean": 2.017698287963867,
"accuracy": 0.8125,
"epoch": 0.145,
"step": 58
},
{
"epoch": 0.1475,
"grad_norm": 7.708261013031006,
"learning_rate": 2.6921052631578947e-06,
"loss": 0.3713,
"step": 59
},
{
"Batch Mean": 2.355548858642578,
"accuracy": 0.8125,
"epoch": 0.1475,
"step": 59
},
{
"Batch Mean": 2.2453155517578125,
"accuracy": 0.71875,
"epoch": 0.1475,
"step": 59
},
{
"Batch Mean": 2.658827781677246,
"accuracy": 0.84375,
"epoch": 0.1475,
"step": 59
},
{
"Batch Mean": 2.4546985626220703,
"accuracy": 1.0,
"epoch": 0.1475,
"step": 59
},
{
"epoch": 0.15,
"grad_norm": 7.150002956390381,
"learning_rate": 2.6842105263157895e-06,
"loss": 0.4002,
"step": 60
},
{
"Batch Mean": 2.0813217163085938,
"accuracy": 0.65625,
"epoch": 0.15,
"step": 60
},
{
"Batch Mean": 1.9841499328613281,
"accuracy": 0.78125,
"epoch": 0.15,
"step": 60
},
{
"Batch Mean": 2.2633228302001953,
"accuracy": 0.75,
"epoch": 0.15,
"step": 60
},
{
"Batch Mean": 2.2690048217773438,
"accuracy": 0.71875,
"epoch": 0.15,
"step": 60
},
{
"epoch": 0.1525,
"grad_norm": 7.390719413757324,
"learning_rate": 2.6763157894736842e-06,
"loss": 0.5013,
"step": 61
},
{
"Batch Mean": 2.179004669189453,
"accuracy": 0.65625,
"epoch": 0.1525,
"step": 61
},
{
"Batch Mean": 2.1857452392578125,
"accuracy": 0.78125,
"epoch": 0.1525,
"step": 61
},
{
"Batch Mean": 1.4751567840576172,
"accuracy": 0.8125,
"epoch": 0.1525,
"step": 61
},
{
"Batch Mean": 2.2417678833007812,
"accuracy": 0.875,
"epoch": 0.1525,
"step": 61
},
{
"epoch": 0.155,
"grad_norm": 6.933903694152832,
"learning_rate": 2.668421052631579e-06,
"loss": 0.4227,
"step": 62
},
{
"Batch Mean": 1.8520653247833252,
"accuracy": 0.78125,
"epoch": 0.155,
"step": 62
},
{
"Batch Mean": 1.680877685546875,
"accuracy": 0.8125,
"epoch": 0.155,
"step": 62
},
{
"Batch Mean": 1.7311248779296875,
"accuracy": 0.75,
"epoch": 0.155,
"step": 62
},
{
"Batch Mean": 2.07061767578125,
"accuracy": 0.65625,
"epoch": 0.155,
"step": 62
},
{
"epoch": 0.1575,
"grad_norm": 6.547921180725098,
"learning_rate": 2.6605263157894737e-06,
"loss": 0.4736,
"step": 63
},
{
"Batch Mean": 2.2572174072265625,
"accuracy": 0.875,
"epoch": 0.1575,
"step": 63
},
{
"Batch Mean": 2.0034332275390625,
"accuracy": 0.78125,
"epoch": 0.1575,
"step": 63
},
{
"Batch Mean": 1.7276840209960938,
"accuracy": 0.84375,
"epoch": 0.1575,
"step": 63
},
{
"Batch Mean": 2.1414947509765625,
"accuracy": 0.8125,
"epoch": 0.1575,
"step": 63
},
{
"epoch": 0.16,
"grad_norm": 8.414312362670898,
"learning_rate": 2.6526315789473685e-06,
"loss": 0.4544,
"step": 64
},
{
"Batch Mean": 1.8973770141601562,
"accuracy": 0.96875,
"epoch": 0.16,
"step": 64
},
{
"Batch Mean": 2.1470298767089844,
"accuracy": 0.78125,
"epoch": 0.16,
"step": 64
},
{
"Batch Mean": 1.9893627166748047,
"accuracy": 0.75,
"epoch": 0.16,
"step": 64
},
{
"Batch Mean": 1.8890247344970703,
"accuracy": 0.8125,
"epoch": 0.16,
"step": 64
},
{
"epoch": 0.1625,
"grad_norm": 7.936588287353516,
"learning_rate": 2.644736842105263e-06,
"loss": 0.4333,
"step": 65
},
{
"Batch Mean": 1.9641265869140625,
"accuracy": 0.75,
"epoch": 0.1625,
"step": 65
},
{
"Batch Mean": 1.9673995971679688,
"accuracy": 0.875,
"epoch": 0.1625,
"step": 65
},
{
"Batch Mean": 1.6348600387573242,
"accuracy": 0.6875,
"epoch": 0.1625,
"step": 65
},
{
"Batch Mean": 1.740340232849121,
"accuracy": 0.84375,
"epoch": 0.1625,
"step": 65
},
{
"epoch": 0.165,
"grad_norm": 7.350888729095459,
"learning_rate": 2.636842105263158e-06,
"loss": 0.4376,
"step": 66
},
{
"Batch Mean": 1.7691650390625,
"accuracy": 0.8125,
"epoch": 0.165,
"step": 66
},
{
"Batch Mean": 2.187957763671875,
"accuracy": 0.875,
"epoch": 0.165,
"step": 66
},
{
"Batch Mean": 2.187624454498291,
"accuracy": 0.84375,
"epoch": 0.165,
"step": 66
},
{
"Batch Mean": 2.1125755310058594,
"accuracy": 0.71875,
"epoch": 0.165,
"step": 66
},
{
"epoch": 0.1675,
"grad_norm": 7.868091106414795,
"learning_rate": 2.6289473684210527e-06,
"loss": 0.3905,
"step": 67
},
{
"Batch Mean": 3.1567535400390625,
"accuracy": 0.84375,
"epoch": 0.1675,
"step": 67
},
{
"Batch Mean": 2.20458984375,
"accuracy": 0.84375,
"epoch": 0.1675,
"step": 67
},
{
"Batch Mean": 2.19390869140625,
"accuracy": 0.71875,
"epoch": 0.1675,
"step": 67
},
{
"Batch Mean": 2.5717415809631348,
"accuracy": 0.84375,
"epoch": 0.1675,
"step": 67
},
{
"epoch": 0.17,
"grad_norm": 7.648073673248291,
"learning_rate": 2.6210526315789474e-06,
"loss": 0.4176,
"step": 68
},
{
"Batch Mean": 3.2033939361572266,
"accuracy": 0.90625,
"epoch": 0.17,
"step": 68
},
{
"Batch Mean": 2.538959503173828,
"accuracy": 0.6875,
"epoch": 0.17,
"step": 68
},
{
"Batch Mean": 2.950897216796875,
"accuracy": 0.71875,
"epoch": 0.17,
"step": 68
},
{
"Batch Mean": 2.9720191955566406,
"accuracy": 0.65625,
"epoch": 0.17,
"step": 68
},
{
"epoch": 0.1725,
"grad_norm": 9.133621215820312,
"learning_rate": 2.613157894736842e-06,
"loss": 0.4827,
"step": 69
},
{
"Batch Mean": 2.8690185546875,
"accuracy": 0.78125,
"epoch": 0.1725,
"step": 69
},
{
"Batch Mean": 3.2310791015625,
"accuracy": 0.84375,
"epoch": 0.1725,
"step": 69
},
{
"Batch Mean": 3.4392356872558594,
"accuracy": 0.84375,
"epoch": 0.1725,
"step": 69
},
{
"Batch Mean": 3.79766845703125,
"accuracy": 0.8125,
"epoch": 0.1725,
"step": 69
},
{
"epoch": 0.175,
"grad_norm": 8.451118469238281,
"learning_rate": 2.605263157894737e-06,
"loss": 0.416,
"step": 70
},
{
"Batch Mean": 3.61334228515625,
"accuracy": 0.8125,
"epoch": 0.175,
"step": 70
},
{
"Batch Mean": 3.4711837768554688,
"accuracy": 0.78125,
"epoch": 0.175,
"step": 70
},
{
"Batch Mean": 3.1711807250976562,
"accuracy": 0.84375,
"epoch": 0.175,
"step": 70
},
{
"Batch Mean": 3.73419189453125,
"accuracy": 0.875,
"epoch": 0.175,
"step": 70
},
{
"epoch": 0.1775,
"grad_norm": 8.39415168762207,
"learning_rate": 2.5973684210526317e-06,
"loss": 0.4187,
"step": 71
},
{
"Batch Mean": 3.579629898071289,
"accuracy": 0.8125,
"epoch": 0.1775,
"step": 71
},
{
"Batch Mean": 3.81597900390625,
"accuracy": 0.90625,
"epoch": 0.1775,
"step": 71
},
{
"Batch Mean": 4.013580322265625,
"accuracy": 0.84375,
"epoch": 0.1775,
"step": 71
},
{
"Batch Mean": 3.486042022705078,
"accuracy": 0.8125,
"epoch": 0.1775,
"step": 71
},
{
"epoch": 0.18,
"grad_norm": 7.406739711761475,
"learning_rate": 2.5894736842105264e-06,
"loss": 0.2955,
"step": 72
},
{
"Batch Mean": 3.823301315307617,
"accuracy": 0.78125,
"epoch": 0.18,
"step": 72
},
{
"Batch Mean": 3.58245849609375,
"accuracy": 0.8125,
"epoch": 0.18,
"step": 72
},
{
"Batch Mean": 3.817901611328125,
"accuracy": 0.75,
"epoch": 0.18,
"step": 72
},
{
"Batch Mean": 3.1810264587402344,
"accuracy": 0.84375,
"epoch": 0.18,
"step": 72
},
{
"epoch": 0.1825,
"grad_norm": 7.441671848297119,
"learning_rate": 2.581578947368421e-06,
"loss": 0.46,
"step": 73
},
{
"Batch Mean": 4.177032470703125,
"accuracy": 0.84375,
"epoch": 0.1825,
"step": 73
},
{
"Batch Mean": 4.8514404296875,
"accuracy": 0.84375,
"epoch": 0.1825,
"step": 73
},
{
"Batch Mean": 3.1745223999023438,
"accuracy": 0.84375,
"epoch": 0.1825,
"step": 73
},
{
"Batch Mean": 3.3532562255859375,
"accuracy": 0.75,
"epoch": 0.1825,
"step": 73
},
{
"epoch": 0.185,
"grad_norm": 7.315526008605957,
"learning_rate": 2.573684210526316e-06,
"loss": 0.4084,
"step": 74
},
{
"Batch Mean": 2.9517765045166016,
"accuracy": 0.78125,
"epoch": 0.185,
"step": 74
},
{
"Batch Mean": 3.189727783203125,
"accuracy": 0.84375,
"epoch": 0.185,
"step": 74
},
{
"Batch Mean": 3.7370223999023438,
"accuracy": 0.84375,
"epoch": 0.185,
"step": 74
},
{
"Batch Mean": 3.0487213134765625,
"accuracy": 0.8125,
"epoch": 0.185,
"step": 74
},
{
"epoch": 0.1875,
"grad_norm": 7.253166198730469,
"learning_rate": 2.5657894736842107e-06,
"loss": 0.4185,
"step": 75
},
{
"Batch Mean": 3.319133758544922,
"accuracy": 0.84375,
"epoch": 0.1875,
"step": 75
},
{
"Batch Mean": 3.280853271484375,
"accuracy": 0.78125,
"epoch": 0.1875,
"step": 75
},
{
"Batch Mean": 3.16741943359375,
"accuracy": 0.71875,
"epoch": 0.1875,
"step": 75
},
{
"Batch Mean": 2.6110219955444336,
"accuracy": 0.78125,
"epoch": 0.1875,
"step": 75
},
{
"epoch": 0.19,
"grad_norm": 9.37726879119873,
"learning_rate": 2.5578947368421054e-06,
"loss": 0.4841,
"step": 76
},
{
"Batch Mean": 2.8535995483398438,
"accuracy": 0.78125,
"epoch": 0.19,
"step": 76
},
{
"Batch Mean": 2.5045166015625,
"accuracy": 0.71875,
"epoch": 0.19,
"step": 76
},
{
"Batch Mean": 3.445037841796875,
"accuracy": 0.78125,
"epoch": 0.19,
"step": 76
},
{
"Batch Mean": 3.05963134765625,
"accuracy": 0.75,
"epoch": 0.19,
"step": 76
},
{
"epoch": 0.1925,
"grad_norm": 9.00017261505127,
"learning_rate": 2.55e-06,
"loss": 0.4535,
"step": 77
},
{
"Batch Mean": 2.868865966796875,
"accuracy": 0.8125,
"epoch": 0.1925,
"step": 77
},
{
"Batch Mean": 2.8354110717773438,
"accuracy": 0.78125,
"epoch": 0.1925,
"step": 77
},
{
"Batch Mean": 3.1343002319335938,
"accuracy": 0.90625,
"epoch": 0.1925,
"step": 77
},
{
"Batch Mean": 3.1275634765625,
"accuracy": 0.84375,
"epoch": 0.1925,
"step": 77
},
{
"epoch": 0.195,
"grad_norm": 6.656139850616455,
"learning_rate": 2.542105263157895e-06,
"loss": 0.3829,
"step": 78
},
{
"Batch Mean": 2.7342681884765625,
"accuracy": 0.8125,
"epoch": 0.195,
"step": 78
},
{
"Batch Mean": 2.3845291137695312,
"accuracy": 0.875,
"epoch": 0.195,
"step": 78
},
{
"Batch Mean": 2.4807891845703125,
"accuracy": 0.84375,
"epoch": 0.195,
"step": 78
},
{
"Batch Mean": 2.4909114837646484,
"accuracy": 0.8125,
"epoch": 0.195,
"step": 78
},
{
"epoch": 0.1975,
"grad_norm": 6.564652919769287,
"learning_rate": 2.5342105263157892e-06,
"loss": 0.3979,
"step": 79
},
{
"Batch Mean": 2.9712295532226562,
"accuracy": 0.84375,
"epoch": 0.1975,
"step": 79
},
{
"Batch Mean": 2.5406951904296875,
"accuracy": 0.65625,
"epoch": 0.1975,
"step": 79
},
{
"Batch Mean": 2.974529266357422,
"accuracy": 0.9375,
"epoch": 0.1975,
"step": 79
},
{
"Batch Mean": 2.565216064453125,
"accuracy": 0.875,
"epoch": 0.1975,
"step": 79
},
{
"epoch": 0.2,
"grad_norm": 6.637782096862793,
"learning_rate": 2.526315789473684e-06,
"loss": 0.362,
"step": 80
}
],
"logging_steps": 1,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 80,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}