{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0582010582010581, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "classification_loss": 0.6695, "epoch": 0.005291005291005291, "grad_norm": 16.771141052246094, "learning_rate": 7.017543859649123e-07, "lm_loss": 2.0851, "loss": 2.7547, "step": 1 }, { "classification_loss": 0.7145, "epoch": 0.010582010582010581, "grad_norm": 27.93190574645996, "learning_rate": 1.4035087719298246e-06, "lm_loss": 1.9011, "loss": 2.6157, "step": 2 }, { "classification_loss": 0.74, "epoch": 0.015873015873015872, "grad_norm": 19.260587692260742, "learning_rate": 2.105263157894737e-06, "lm_loss": 2.152, "loss": 2.892, "step": 3 }, { "classification_loss": 0.668, "epoch": 0.021164021164021163, "grad_norm": 16.394054412841797, "learning_rate": 2.8070175438596493e-06, "lm_loss": 2.0417, "loss": 2.7098, "step": 4 }, { "classification_loss": 0.6821, "epoch": 0.026455026455026454, "grad_norm": 18.829200744628906, "learning_rate": 3.5087719298245615e-06, "lm_loss": 2.0113, "loss": 2.6933, "step": 5 }, { "classification_loss": 0.6701, "epoch": 0.031746031746031744, "grad_norm": 18.288366317749023, "learning_rate": 4.210526315789474e-06, "lm_loss": 1.8286, "loss": 2.4987, "step": 6 }, { "classification_loss": 0.6698, "epoch": 0.037037037037037035, "grad_norm": 25.650251388549805, "learning_rate": 4.912280701754386e-06, "lm_loss": 1.7178, "loss": 2.3876, "step": 7 }, { "classification_loss": 0.8098, "epoch": 0.042328042328042326, "grad_norm": 15.184894561767578, "learning_rate": 5.6140350877192985e-06, "lm_loss": 1.6045, "loss": 2.4142, "step": 8 }, { "classification_loss": 0.6963, "epoch": 0.047619047619047616, "grad_norm": 12.884407997131348, "learning_rate": 6.31578947368421e-06, "lm_loss": 1.372, "loss": 2.0683, "step": 9 }, { "classification_loss": 0.7149, "epoch": 0.05291005291005291, "grad_norm": 12.367244720458984, "learning_rate": 7.017543859649123e-06, "lm_loss": 1.0866, "loss": 1.8015, "step": 10 }, { "classification_loss": 0.7124, "epoch": 0.0582010582010582, "grad_norm": 19.37381362915039, "learning_rate": 7.719298245614036e-06, "lm_loss": 0.7991, "loss": 1.5115, "step": 11 }, { "classification_loss": 0.6034, "epoch": 0.06349206349206349, "grad_norm": 17.332901000976562, "learning_rate": 8.421052631578948e-06, "lm_loss": 0.6295, "loss": 1.2329, "step": 12 }, { "classification_loss": 0.5687, "epoch": 0.06878306878306878, "grad_norm": 12.365259170532227, "learning_rate": 9.12280701754386e-06, "lm_loss": 0.6126, "loss": 1.1812, "step": 13 }, { "classification_loss": 0.7166, "epoch": 0.07407407407407407, "grad_norm": 12.777908325195312, "learning_rate": 9.824561403508772e-06, "lm_loss": 0.5857, "loss": 1.3022, "step": 14 }, { "classification_loss": 0.7863, "epoch": 0.07936507936507936, "grad_norm": 12.364931106567383, "learning_rate": 1.0526315789473684e-05, "lm_loss": 0.4866, "loss": 1.2729, "step": 15 }, { "classification_loss": 0.6334, "epoch": 0.08465608465608465, "grad_norm": 10.23062515258789, "learning_rate": 1.1228070175438597e-05, "lm_loss": 0.3727, "loss": 1.006, "step": 16 }, { "classification_loss": 0.5849, "epoch": 0.08994708994708994, "grad_norm": 11.9069185256958, "learning_rate": 1.192982456140351e-05, "lm_loss": 0.2464, "loss": 0.8313, "step": 17 }, { "classification_loss": 0.6384, "epoch": 0.09523809523809523, "grad_norm": 12.602136611938477, "learning_rate": 1.263157894736842e-05, "lm_loss": 0.2392, "loss": 0.8776, "step": 18 }, { "classification_loss": 0.67, "epoch": 0.10052910052910052, "grad_norm": 15.037280082702637, "learning_rate": 1.3333333333333333e-05, "lm_loss": 0.1902, "loss": 0.8602, "step": 19 }, { "classification_loss": 0.6626, "epoch": 0.10582010582010581, "grad_norm": 9.703207015991211, "learning_rate": 1.4035087719298246e-05, "lm_loss": 0.1744, "loss": 0.837, "step": 20 }, { "classification_loss": 0.6218, "epoch": 0.1111111111111111, "grad_norm": 9.03652572631836, "learning_rate": 1.4736842105263159e-05, "lm_loss": 0.1287, "loss": 0.7505, "step": 21 }, { "classification_loss": 0.47, "epoch": 0.1164021164021164, "grad_norm": 8.84855842590332, "learning_rate": 1.543859649122807e-05, "lm_loss": 0.0953, "loss": 0.5653, "step": 22 }, { "classification_loss": 0.6106, "epoch": 0.12169312169312169, "grad_norm": 9.092327117919922, "learning_rate": 1.6140350877192984e-05, "lm_loss": 0.0903, "loss": 0.7009, "step": 23 }, { "classification_loss": 0.3901, "epoch": 0.12698412698412698, "grad_norm": 10.399832725524902, "learning_rate": 1.6842105263157896e-05, "lm_loss": 0.0447, "loss": 0.4348, "step": 24 }, { "classification_loss": 0.8788, "epoch": 0.13227513227513227, "grad_norm": 16.038183212280273, "learning_rate": 1.754385964912281e-05, "lm_loss": 0.083, "loss": 0.9618, "step": 25 }, { "classification_loss": 0.8995, "epoch": 0.13756613756613756, "grad_norm": 17.929433822631836, "learning_rate": 1.824561403508772e-05, "lm_loss": 0.1014, "loss": 1.0009, "step": 26 }, { "classification_loss": 0.7673, "epoch": 0.14285714285714285, "grad_norm": 11.831334114074707, "learning_rate": 1.894736842105263e-05, "lm_loss": 0.0959, "loss": 0.8631, "step": 27 }, { "classification_loss": 0.567, "epoch": 0.14814814814814814, "grad_norm": 7.2031073570251465, "learning_rate": 1.9649122807017544e-05, "lm_loss": 0.0559, "loss": 0.6229, "step": 28 }, { "classification_loss": 0.8024, "epoch": 0.15343915343915343, "grad_norm": 12.598705291748047, "learning_rate": 2.035087719298246e-05, "lm_loss": 0.1079, "loss": 0.9102, "step": 29 }, { "classification_loss": 0.5282, "epoch": 0.15873015873015872, "grad_norm": 7.753366470336914, "learning_rate": 2.105263157894737e-05, "lm_loss": 0.0315, "loss": 0.5598, "step": 30 }, { "classification_loss": 0.6261, "epoch": 0.164021164021164, "grad_norm": 8.418317794799805, "learning_rate": 2.1754385964912285e-05, "lm_loss": 0.0626, "loss": 0.6887, "step": 31 }, { "classification_loss": 0.6078, "epoch": 0.1693121693121693, "grad_norm": 10.975348472595215, "learning_rate": 2.2456140350877194e-05, "lm_loss": 0.0661, "loss": 0.6739, "step": 32 }, { "classification_loss": 0.6152, "epoch": 0.1746031746031746, "grad_norm": 7.836843490600586, "learning_rate": 2.3157894736842107e-05, "lm_loss": 0.0555, "loss": 0.6708, "step": 33 }, { "classification_loss": 0.6342, "epoch": 0.17989417989417988, "grad_norm": 9.3395414352417, "learning_rate": 2.385964912280702e-05, "lm_loss": 0.0772, "loss": 0.7114, "step": 34 }, { "classification_loss": 0.5311, "epoch": 0.18518518518518517, "grad_norm": 8.910663604736328, "learning_rate": 2.4561403508771932e-05, "lm_loss": 0.0496, "loss": 0.5807, "step": 35 }, { "classification_loss": 0.4504, "epoch": 0.19047619047619047, "grad_norm": 8.439433097839355, "learning_rate": 2.526315789473684e-05, "lm_loss": 0.0434, "loss": 0.4937, "step": 36 }, { "classification_loss": 0.4308, "epoch": 0.19576719576719576, "grad_norm": 9.815783500671387, "learning_rate": 2.5964912280701757e-05, "lm_loss": 0.0391, "loss": 0.4699, "step": 37 }, { "classification_loss": 0.5369, "epoch": 0.20105820105820105, "grad_norm": 8.448427200317383, "learning_rate": 2.6666666666666667e-05, "lm_loss": 0.0661, "loss": 0.603, "step": 38 }, { "classification_loss": 0.551, "epoch": 0.20634920634920634, "grad_norm": 9.024697303771973, "learning_rate": 2.7368421052631583e-05, "lm_loss": 0.0627, "loss": 0.6137, "step": 39 }, { "classification_loss": 0.5113, "epoch": 0.21164021164021163, "grad_norm": 10.45963191986084, "learning_rate": 2.8070175438596492e-05, "lm_loss": 0.0432, "loss": 0.5544, "step": 40 }, { "classification_loss": 0.5548, "epoch": 0.21693121693121692, "grad_norm": 9.504748344421387, "learning_rate": 2.8771929824561408e-05, "lm_loss": 0.0704, "loss": 0.6252, "step": 41 }, { "classification_loss": 0.1668, "epoch": 0.2222222222222222, "grad_norm": 6.110610485076904, "learning_rate": 2.9473684210526317e-05, "lm_loss": 0.0104, "loss": 0.1772, "step": 42 }, { "classification_loss": 0.2372, "epoch": 0.2275132275132275, "grad_norm": 9.717988967895508, "learning_rate": 3.017543859649123e-05, "lm_loss": 0.0401, "loss": 0.2773, "step": 43 }, { "classification_loss": 0.4028, "epoch": 0.2328042328042328, "grad_norm": 9.197904586791992, "learning_rate": 3.087719298245614e-05, "lm_loss": 0.0687, "loss": 0.4716, "step": 44 }, { "classification_loss": 0.8087, "epoch": 0.23809523809523808, "grad_norm": 16.595661163330078, "learning_rate": 3.157894736842106e-05, "lm_loss": 0.082, "loss": 0.8907, "step": 45 }, { "classification_loss": 0.683, "epoch": 0.24338624338624337, "grad_norm": 14.11158561706543, "learning_rate": 3.228070175438597e-05, "lm_loss": 0.0702, "loss": 0.7532, "step": 46 }, { "classification_loss": 1.045, "epoch": 0.24867724867724866, "grad_norm": 19.863332748413086, "learning_rate": 3.298245614035088e-05, "lm_loss": 0.1122, "loss": 1.1571, "step": 47 }, { "classification_loss": 0.5931, "epoch": 0.25396825396825395, "grad_norm": 11.521686553955078, "learning_rate": 3.368421052631579e-05, "lm_loss": 0.1241, "loss": 0.7173, "step": 48 }, { "classification_loss": 0.5617, "epoch": 0.25925925925925924, "grad_norm": 12.246728897094727, "learning_rate": 3.43859649122807e-05, "lm_loss": 0.0858, "loss": 0.6475, "step": 49 }, { "classification_loss": 0.5201, "epoch": 0.26455026455026454, "grad_norm": 8.966938018798828, "learning_rate": 3.508771929824562e-05, "lm_loss": 0.0585, "loss": 0.5785, "step": 50 }, { "epoch": 0.26455026455026454, "eval_accuracy": 0.8983050584793091, "eval_auroc": 0.9314866065979004, "eval_classification_loss": 0.32121941447257996, "eval_f1": 0.8604651093482971, "eval_lm_loss": 0.04069896787405014, "eval_loss": 0.3619183897972107, "eval_precision": 0.8283582329750061, "eval_recall": 0.8951612710952759, "eval_runtime": 15.993, "eval_samples_per_second": 22.135, "eval_steps_per_second": 1.438, "step": 50 }, { "classification_loss": 0.4476, "epoch": 0.2698412698412698, "grad_norm": 10.744962692260742, "learning_rate": 3.578947368421053e-05, "lm_loss": 0.0362, "loss": 0.4839, "step": 51 }, { "classification_loss": 0.5896, "epoch": 0.2751322751322751, "grad_norm": 15.367817878723145, "learning_rate": 3.649122807017544e-05, "lm_loss": 0.0429, "loss": 0.6325, "step": 52 }, { "classification_loss": 0.6608, "epoch": 0.2804232804232804, "grad_norm": 16.497859954833984, "learning_rate": 3.719298245614035e-05, "lm_loss": 0.1118, "loss": 0.7726, "step": 53 }, { "classification_loss": 0.5343, "epoch": 0.2857142857142857, "grad_norm": 10.103208541870117, "learning_rate": 3.789473684210526e-05, "lm_loss": 0.0552, "loss": 0.5895, "step": 54 }, { "classification_loss": 0.7036, "epoch": 0.291005291005291, "grad_norm": 15.235865592956543, "learning_rate": 3.859649122807018e-05, "lm_loss": 0.0706, "loss": 0.7742, "step": 55 }, { "classification_loss": 0.3018, "epoch": 0.2962962962962963, "grad_norm": 9.986725807189941, "learning_rate": 3.929824561403509e-05, "lm_loss": 0.028, "loss": 0.3298, "step": 56 }, { "classification_loss": 0.2255, "epoch": 0.30158730158730157, "grad_norm": 5.508973121643066, "learning_rate": 4e-05, "lm_loss": 0.014, "loss": 0.2395, "step": 57 }, { "classification_loss": 0.8275, "epoch": 0.30687830687830686, "grad_norm": 13.436299324035645, "learning_rate": 3.999962054697454e-05, "lm_loss": 0.1098, "loss": 0.9373, "step": 58 }, { "classification_loss": 1.3698, "epoch": 0.31216931216931215, "grad_norm": 23.956758499145508, "learning_rate": 3.999848220229662e-05, "lm_loss": 0.1758, "loss": 1.5456, "step": 59 }, { "classification_loss": 0.5866, "epoch": 0.31746031746031744, "grad_norm": 11.431910514831543, "learning_rate": 3.9996585009161056e-05, "lm_loss": 0.0556, "loss": 0.6421, "step": 60 }, { "classification_loss": 0.4283, "epoch": 0.32275132275132273, "grad_norm": 12.705597877502441, "learning_rate": 3.999392903955744e-05, "lm_loss": 0.0373, "loss": 0.4656, "step": 61 }, { "classification_loss": 0.5689, "epoch": 0.328042328042328, "grad_norm": 12.244424819946289, "learning_rate": 3.999051439426732e-05, "lm_loss": 0.0649, "loss": 0.6338, "step": 62 }, { "classification_loss": 0.662, "epoch": 0.3333333333333333, "grad_norm": 23.293601989746094, "learning_rate": 3.9986341202860467e-05, "lm_loss": 0.0678, "loss": 0.7298, "step": 63 }, { "classification_loss": 0.4212, "epoch": 0.3386243386243386, "grad_norm": 14.655537605285645, "learning_rate": 3.998140962368987e-05, "lm_loss": 0.0411, "loss": 0.4624, "step": 64 }, { "classification_loss": 0.4347, "epoch": 0.3439153439153439, "grad_norm": 9.236527442932129, "learning_rate": 3.99757198438858e-05, "lm_loss": 0.0334, "loss": 0.4682, "step": 65 }, { "classification_loss": 0.4239, "epoch": 0.3492063492063492, "grad_norm": 6.273439884185791, "learning_rate": 3.9969272079348685e-05, "lm_loss": 0.0596, "loss": 0.4835, "step": 66 }, { "classification_loss": 0.2413, "epoch": 0.3544973544973545, "grad_norm": 7.6536359786987305, "learning_rate": 3.9962066574740886e-05, "lm_loss": 0.0107, "loss": 0.2519, "step": 67 }, { "classification_loss": 0.2787, "epoch": 0.35978835978835977, "grad_norm": 6.183154106140137, "learning_rate": 3.9954103603477465e-05, "lm_loss": 0.0238, "loss": 0.3025, "step": 68 }, { "classification_loss": 0.4597, "epoch": 0.36507936507936506, "grad_norm": 6.519228935241699, "learning_rate": 3.994538346771576e-05, "lm_loss": 0.0455, "loss": 0.5052, "step": 69 }, { "classification_loss": 0.3247, "epoch": 0.37037037037037035, "grad_norm": 6.159754276275635, "learning_rate": 3.993590649834398e-05, "lm_loss": 0.0403, "loss": 0.3649, "step": 70 }, { "classification_loss": 0.5143, "epoch": 0.37566137566137564, "grad_norm": 9.48263931274414, "learning_rate": 3.992567305496859e-05, "lm_loss": 0.0586, "loss": 0.5729, "step": 71 }, { "classification_loss": 0.9599, "epoch": 0.38095238095238093, "grad_norm": 13.587491035461426, "learning_rate": 3.991468352590069e-05, "lm_loss": 0.1014, "loss": 1.0613, "step": 72 }, { "classification_loss": 0.2357, "epoch": 0.3862433862433862, "grad_norm": 4.031269550323486, "learning_rate": 3.990293832814129e-05, "lm_loss": 0.0273, "loss": 0.263, "step": 73 }, { "classification_loss": 0.7618, "epoch": 0.3915343915343915, "grad_norm": 9.462884902954102, "learning_rate": 3.989043790736547e-05, "lm_loss": 0.0755, "loss": 0.8374, "step": 74 }, { "classification_loss": 0.5417, "epoch": 0.3968253968253968, "grad_norm": 10.55236530303955, "learning_rate": 3.987718273790548e-05, "lm_loss": 0.0666, "loss": 0.6083, "step": 75 }, { "classification_loss": 0.5999, "epoch": 0.4021164021164021, "grad_norm": 10.495997428894043, "learning_rate": 3.986317332273273e-05, "lm_loss": 0.0624, "loss": 0.6623, "step": 76 }, { "classification_loss": 0.5918, "epoch": 0.4074074074074074, "grad_norm": 7.395081520080566, "learning_rate": 3.984841019343872e-05, "lm_loss": 0.0542, "loss": 0.646, "step": 77 }, { "classification_loss": 0.6787, "epoch": 0.4126984126984127, "grad_norm": 10.61036205291748, "learning_rate": 3.983289391021486e-05, "lm_loss": 0.0718, "loss": 0.7506, "step": 78 }, { "classification_loss": 0.5252, "epoch": 0.41798941798941797, "grad_norm": 11.041518211364746, "learning_rate": 3.9816625061831206e-05, "lm_loss": 0.067, "loss": 0.5922, "step": 79 }, { "classification_loss": 0.4167, "epoch": 0.42328042328042326, "grad_norm": 8.010622024536133, "learning_rate": 3.9799604265614145e-05, "lm_loss": 0.0396, "loss": 0.4563, "step": 80 }, { "classification_loss": 0.3161, "epoch": 0.42857142857142855, "grad_norm": 5.370311737060547, "learning_rate": 3.9781832167422926e-05, "lm_loss": 0.032, "loss": 0.348, "step": 81 }, { "classification_loss": 0.2709, "epoch": 0.43386243386243384, "grad_norm": 7.361512660980225, "learning_rate": 3.976330944162519e-05, "lm_loss": 0.0262, "loss": 0.2971, "step": 82 }, { "classification_loss": 0.3102, "epoch": 0.43915343915343913, "grad_norm": 5.32011604309082, "learning_rate": 3.974403679107139e-05, "lm_loss": 0.0306, "loss": 0.3408, "step": 83 }, { "classification_loss": 0.7924, "epoch": 0.4444444444444444, "grad_norm": 13.953025817871094, "learning_rate": 3.972401494706805e-05, "lm_loss": 0.0671, "loss": 0.8596, "step": 84 }, { "classification_loss": 0.6462, "epoch": 0.4497354497354497, "grad_norm": 8.769445419311523, "learning_rate": 3.970324466935013e-05, "lm_loss": 0.0593, "loss": 0.7056, "step": 85 }, { "classification_loss": 1.0483, "epoch": 0.455026455026455, "grad_norm": 20.772789001464844, "learning_rate": 3.968172674605209e-05, "lm_loss": 0.1143, "loss": 1.1626, "step": 86 }, { "classification_loss": 0.6172, "epoch": 0.4603174603174603, "grad_norm": 8.573277473449707, "learning_rate": 3.965946199367804e-05, "lm_loss": 0.0719, "loss": 0.6891, "step": 87 }, { "classification_loss": 0.4384, "epoch": 0.4656084656084656, "grad_norm": 10.491643905639648, "learning_rate": 3.9636451257070744e-05, "lm_loss": 0.0432, "loss": 0.4816, "step": 88 }, { "classification_loss": 0.6998, "epoch": 0.4708994708994709, "grad_norm": 12.389782905578613, "learning_rate": 3.9612695409379555e-05, "lm_loss": 0.0842, "loss": 0.784, "step": 89 }, { "classification_loss": 0.4868, "epoch": 0.47619047619047616, "grad_norm": 16.95037269592285, "learning_rate": 3.958819535202732e-05, "lm_loss": 0.0394, "loss": 0.5262, "step": 90 }, { "classification_loss": 0.5326, "epoch": 0.48148148148148145, "grad_norm": 16.24233055114746, "learning_rate": 3.9562952014676116e-05, "lm_loss": 0.0326, "loss": 0.5652, "step": 91 }, { "classification_loss": 0.5912, "epoch": 0.48677248677248675, "grad_norm": 8.411019325256348, "learning_rate": 3.9536966355192016e-05, "lm_loss": 0.059, "loss": 0.6503, "step": 92 }, { "classification_loss": 0.4194, "epoch": 0.49206349206349204, "grad_norm": 9.523442268371582, "learning_rate": 3.951023935960874e-05, "lm_loss": 0.0503, "loss": 0.4698, "step": 93 }, { "classification_loss": 0.4555, "epoch": 0.4973544973544973, "grad_norm": 8.235868453979492, "learning_rate": 3.948277204209021e-05, "lm_loss": 0.053, "loss": 0.5085, "step": 94 }, { "classification_loss": 0.5776, "epoch": 0.5026455026455027, "grad_norm": 7.883787155151367, "learning_rate": 3.94545654448921e-05, "lm_loss": 0.0673, "loss": 0.645, "step": 95 }, { "classification_loss": 0.3547, "epoch": 0.5079365079365079, "grad_norm": 4.894813537597656, "learning_rate": 3.942562063832228e-05, "lm_loss": 0.0338, "loss": 0.3885, "step": 96 }, { "classification_loss": 0.6996, "epoch": 0.5132275132275133, "grad_norm": 8.746223449707031, "learning_rate": 3.9395938720700196e-05, "lm_loss": 0.0804, "loss": 0.7801, "step": 97 }, { "classification_loss": 0.6367, "epoch": 0.5185185185185185, "grad_norm": 9.751365661621094, "learning_rate": 3.936552081831518e-05, "lm_loss": 0.0673, "loss": 0.704, "step": 98 }, { "classification_loss": 0.7447, "epoch": 0.5238095238095238, "grad_norm": 10.005545616149902, "learning_rate": 3.933436808538375e-05, "lm_loss": 0.0847, "loss": 0.8294, "step": 99 }, { "classification_loss": 0.3838, "epoch": 0.5291005291005291, "grad_norm": 6.678869247436523, "learning_rate": 3.930248170400578e-05, "lm_loss": 0.0474, "loss": 0.4312, "step": 100 }, { "epoch": 0.5291005291005291, "eval_accuracy": 0.8700565099716187, "eval_auroc": 0.93075031042099, "eval_classification_loss": 0.37040606141090393, "eval_f1": 0.8391608595848083, "eval_lm_loss": 0.04573189839720726, "eval_loss": 0.4161379635334015, "eval_precision": 0.7407407164573669, "eval_recall": 0.9677419066429138, "eval_runtime": 15.53, "eval_samples_per_second": 22.795, "eval_steps_per_second": 1.481, "step": 100 }, { "classification_loss": 0.317, "epoch": 0.5343915343915344, "grad_norm": 4.977490425109863, "learning_rate": 3.9269862884119666e-05, "lm_loss": 0.0596, "loss": 0.3766, "step": 101 }, { "classification_loss": 0.5167, "epoch": 0.5396825396825397, "grad_norm": 5.793409824371338, "learning_rate": 3.923651286345638e-05, "lm_loss": 0.0525, "loss": 0.5692, "step": 102 }, { "classification_loss": 0.3664, "epoch": 0.544973544973545, "grad_norm": 5.308522701263428, "learning_rate": 3.920243290749257e-05, "lm_loss": 0.0368, "loss": 0.4032, "step": 103 }, { "classification_loss": 0.3934, "epoch": 0.5502645502645502, "grad_norm": 5.193403244018555, "learning_rate": 3.916762430940245e-05, "lm_loss": 0.0431, "loss": 0.4365, "step": 104 }, { "classification_loss": 0.5176, "epoch": 0.5555555555555556, "grad_norm": 6.1582417488098145, "learning_rate": 3.913208839000882e-05, "lm_loss": 0.054, "loss": 0.5716, "step": 105 }, { "classification_loss": 0.2793, "epoch": 0.5608465608465608, "grad_norm": 4.8112359046936035, "learning_rate": 3.9095826497732894e-05, "lm_loss": 0.0276, "loss": 0.3069, "step": 106 }, { "classification_loss": 0.5567, "epoch": 0.5661375661375662, "grad_norm": 11.405841827392578, "learning_rate": 3.9058840008543136e-05, "lm_loss": 0.0422, "loss": 0.5989, "step": 107 }, { "classification_loss": 0.743, "epoch": 0.5714285714285714, "grad_norm": 7.794854164123535, "learning_rate": 3.9021130325903076e-05, "lm_loss": 0.0612, "loss": 0.8042, "step": 108 }, { "classification_loss": 0.2738, "epoch": 0.5767195767195767, "grad_norm": 5.324756622314453, "learning_rate": 3.898269888071803e-05, "lm_loss": 0.0325, "loss": 0.3063, "step": 109 }, { "classification_loss": 0.4425, "epoch": 0.582010582010582, "grad_norm": 8.351327896118164, "learning_rate": 3.894354713128081e-05, "lm_loss": 0.0522, "loss": 0.4947, "step": 110 }, { "classification_loss": 0.419, "epoch": 0.5873015873015873, "grad_norm": 5.21488618850708, "learning_rate": 3.89036765632164e-05, "lm_loss": 0.0345, "loss": 0.4535, "step": 111 }, { "classification_loss": 0.5999, "epoch": 0.5925925925925926, "grad_norm": 7.710854530334473, "learning_rate": 3.886308868942555e-05, "lm_loss": 0.0519, "loss": 0.6518, "step": 112 }, { "classification_loss": 0.4637, "epoch": 0.5978835978835979, "grad_norm": 5.4215850830078125, "learning_rate": 3.882178505002743e-05, "lm_loss": 0.0509, "loss": 0.5146, "step": 113 }, { "classification_loss": 0.402, "epoch": 0.6031746031746031, "grad_norm": 4.375611782073975, "learning_rate": 3.877976721230114e-05, "lm_loss": 0.0434, "loss": 0.4455, "step": 114 }, { "classification_loss": 0.4535, "epoch": 0.6084656084656085, "grad_norm": 8.165907859802246, "learning_rate": 3.8737036770626215e-05, "lm_loss": 0.0434, "loss": 0.4969, "step": 115 }, { "classification_loss": 0.6316, "epoch": 0.6137566137566137, "grad_norm": 8.169610977172852, "learning_rate": 3.8693595346422216e-05, "lm_loss": 0.065, "loss": 0.6967, "step": 116 }, { "classification_loss": 0.6539, "epoch": 0.6190476190476191, "grad_norm": 9.205584526062012, "learning_rate": 3.864944458808712e-05, "lm_loss": 0.0611, "loss": 0.715, "step": 117 }, { "classification_loss": 0.3923, "epoch": 0.6243386243386243, "grad_norm": 4.229909896850586, "learning_rate": 3.860458617093481e-05, "lm_loss": 0.0409, "loss": 0.4332, "step": 118 }, { "classification_loss": 0.6024, "epoch": 0.6296296296296297, "grad_norm": 7.795385837554932, "learning_rate": 3.85590217971315e-05, "lm_loss": 0.0687, "loss": 0.6711, "step": 119 }, { "classification_loss": 0.7778, "epoch": 0.6349206349206349, "grad_norm": 14.413529396057129, "learning_rate": 3.851275319563113e-05, "lm_loss": 0.1123, "loss": 0.8901, "step": 120 }, { "classification_loss": 0.6812, "epoch": 0.6402116402116402, "grad_norm": 7.452106475830078, "learning_rate": 3.846578212210979e-05, "lm_loss": 0.076, "loss": 0.7573, "step": 121 }, { "classification_loss": 0.2687, "epoch": 0.6455026455026455, "grad_norm": 3.9474825859069824, "learning_rate": 3.841811035889908e-05, "lm_loss": 0.0276, "loss": 0.2963, "step": 122 }, { "classification_loss": 0.3593, "epoch": 0.6507936507936508, "grad_norm": 4.684298515319824, "learning_rate": 3.836973971491847e-05, "lm_loss": 0.0289, "loss": 0.3882, "step": 123 }, { "classification_loss": 0.4461, "epoch": 0.656084656084656, "grad_norm": 4.967252731323242, "learning_rate": 3.832067202560668e-05, "lm_loss": 0.0395, "loss": 0.4856, "step": 124 }, { "classification_loss": 0.5545, "epoch": 0.6613756613756614, "grad_norm": 5.953970909118652, "learning_rate": 3.827090915285202e-05, "lm_loss": 0.0641, "loss": 0.6185, "step": 125 }, { "classification_loss": 0.5184, "epoch": 0.6666666666666666, "grad_norm": 7.034300327301025, "learning_rate": 3.822045298492177e-05, "lm_loss": 0.0836, "loss": 0.602, "step": 126 }, { "classification_loss": 0.5116, "epoch": 0.671957671957672, "grad_norm": 7.100449562072754, "learning_rate": 3.8169305436390474e-05, "lm_loss": 0.0455, "loss": 0.5571, "step": 127 }, { "classification_loss": 0.6806, "epoch": 0.6772486772486772, "grad_norm": 9.678451538085938, "learning_rate": 3.8117468448067345e-05, "lm_loss": 0.0689, "loss": 0.7496, "step": 128 }, { "classification_loss": 0.3966, "epoch": 0.6825396825396826, "grad_norm": 5.198436260223389, "learning_rate": 3.806494398692258e-05, "lm_loss": 0.0461, "loss": 0.4427, "step": 129 }, { "classification_loss": 0.4291, "epoch": 0.6878306878306878, "grad_norm": 5.229558944702148, "learning_rate": 3.801173404601275e-05, "lm_loss": 0.0354, "loss": 0.4645, "step": 130 }, { "classification_loss": 0.429, "epoch": 0.6931216931216931, "grad_norm": 4.4721455574035645, "learning_rate": 3.7957840644405164e-05, "lm_loss": 0.0489, "loss": 0.4779, "step": 131 }, { "classification_loss": 0.5577, "epoch": 0.6984126984126984, "grad_norm": 5.873918056488037, "learning_rate": 3.790326582710125e-05, "lm_loss": 0.0595, "loss": 0.6172, "step": 132 }, { "classification_loss": 0.4751, "epoch": 0.7037037037037037, "grad_norm": 5.666612148284912, "learning_rate": 3.784801166495896e-05, "lm_loss": 0.0501, "loss": 0.5252, "step": 133 }, { "classification_loss": 0.3547, "epoch": 0.708994708994709, "grad_norm": 4.493051528930664, "learning_rate": 3.77920802546142e-05, "lm_loss": 0.0364, "loss": 0.3911, "step": 134 }, { "classification_loss": 0.4999, "epoch": 0.7142857142857143, "grad_norm": 6.651557922363281, "learning_rate": 3.773547371840124e-05, "lm_loss": 0.0428, "loss": 0.5427, "step": 135 }, { "classification_loss": 0.3249, "epoch": 0.7195767195767195, "grad_norm": 5.5701985359191895, "learning_rate": 3.7678194204272246e-05, "lm_loss": 0.0275, "loss": 0.3524, "step": 136 }, { "classification_loss": 0.553, "epoch": 0.7248677248677249, "grad_norm": 6.871671676635742, "learning_rate": 3.7620243885715695e-05, "lm_loss": 0.0569, "loss": 0.61, "step": 137 }, { "classification_loss": 0.3173, "epoch": 0.7301587301587301, "grad_norm": 3.8207015991210938, "learning_rate": 3.756162496167396e-05, "lm_loss": 0.0298, "loss": 0.3471, "step": 138 }, { "classification_loss": 0.2213, "epoch": 0.7354497354497355, "grad_norm": 4.471009731292725, "learning_rate": 3.750233965645985e-05, "lm_loss": 0.0273, "loss": 0.2486, "step": 139 }, { "classification_loss": 0.277, "epoch": 0.7407407407407407, "grad_norm": 4.64603853225708, "learning_rate": 3.744239021967222e-05, "lm_loss": 0.0258, "loss": 0.3028, "step": 140 }, { "classification_loss": 0.6027, "epoch": 0.746031746031746, "grad_norm": 8.339837074279785, "learning_rate": 3.738177892611057e-05, "lm_loss": 0.0561, "loss": 0.6588, "step": 141 }, { "classification_loss": 0.8025, "epoch": 0.7513227513227513, "grad_norm": 11.05119514465332, "learning_rate": 3.732050807568878e-05, "lm_loss": 0.0623, "loss": 0.8648, "step": 142 }, { "classification_loss": 0.7122, "epoch": 0.7566137566137566, "grad_norm": 9.975105285644531, "learning_rate": 3.72585799933478e-05, "lm_loss": 0.0644, "loss": 0.7766, "step": 143 }, { "classification_loss": 0.3115, "epoch": 0.7619047619047619, "grad_norm": 5.718208312988281, "learning_rate": 3.719599702896745e-05, "lm_loss": 0.0328, "loss": 0.3443, "step": 144 }, { "classification_loss": 0.4022, "epoch": 0.7671957671957672, "grad_norm": 6.724981784820557, "learning_rate": 3.713276155727726e-05, "lm_loss": 0.0337, "loss": 0.4359, "step": 145 }, { "classification_loss": 0.5227, "epoch": 0.7724867724867724, "grad_norm": 8.479836463928223, "learning_rate": 3.706887597776632e-05, "lm_loss": 0.0441, "loss": 0.5668, "step": 146 }, { "classification_loss": 0.2577, "epoch": 0.7777777777777778, "grad_norm": 4.008189678192139, "learning_rate": 3.700434271459229e-05, "lm_loss": 0.0275, "loss": 0.2853, "step": 147 }, { "classification_loss": 0.564, "epoch": 0.783068783068783, "grad_norm": 6.004014492034912, "learning_rate": 3.6939164216489345e-05, "lm_loss": 0.0526, "loss": 0.6166, "step": 148 }, { "classification_loss": 0.4274, "epoch": 0.7883597883597884, "grad_norm": 4.825228214263916, "learning_rate": 3.687334295667533e-05, "lm_loss": 0.0396, "loss": 0.467, "step": 149 }, { "classification_loss": 0.4864, "epoch": 0.7936507936507936, "grad_norm": 8.514625549316406, "learning_rate": 3.680688143275786e-05, "lm_loss": 0.0408, "loss": 0.5272, "step": 150 }, { "epoch": 0.7936507936507936, "eval_accuracy": 0.8813559412956238, "eval_auroc": 0.9317671656608582, "eval_classification_loss": 0.4037400186061859, "eval_f1": 0.8489208817481995, "eval_lm_loss": 0.03990057483315468, "eval_loss": 0.4436405301094055, "eval_precision": 0.7662337422370911, "eval_recall": 0.9516128897666931, "eval_runtime": 15.5269, "eval_samples_per_second": 22.799, "eval_steps_per_second": 1.481, "step": 150 }, { "classification_loss": 0.4105, "epoch": 0.798941798941799, "grad_norm": 5.878465175628662, "learning_rate": 3.673978216663956e-05, "lm_loss": 0.0416, "loss": 0.452, "step": 151 }, { "classification_loss": 0.6657, "epoch": 0.8042328042328042, "grad_norm": 5.658477306365967, "learning_rate": 3.667204770442239e-05, "lm_loss": 0.0655, "loss": 0.7312, "step": 152 }, { "classification_loss": 0.3666, "epoch": 0.8095238095238095, "grad_norm": 7.839348316192627, "learning_rate": 3.6603680616311013e-05, "lm_loss": 0.0375, "loss": 0.4041, "step": 153 }, { "classification_loss": 0.3906, "epoch": 0.8148148148148148, "grad_norm": 4.447023868560791, "learning_rate": 3.653468349651527e-05, "lm_loss": 0.0338, "loss": 0.4244, "step": 154 }, { "classification_loss": 0.3549, "epoch": 0.8201058201058201, "grad_norm": 3.7753591537475586, "learning_rate": 3.646505896315175e-05, "lm_loss": 0.0427, "loss": 0.3976, "step": 155 }, { "classification_loss": 0.5758, "epoch": 0.8253968253968254, "grad_norm": 5.476145267486572, "learning_rate": 3.639480965814443e-05, "lm_loss": 0.0619, "loss": 0.6376, "step": 156 }, { "classification_loss": 0.4419, "epoch": 0.8306878306878307, "grad_norm": 4.684837341308594, "learning_rate": 3.632393824712444e-05, "lm_loss": 0.0543, "loss": 0.4962, "step": 157 }, { "classification_loss": 0.3925, "epoch": 0.8359788359788359, "grad_norm": 5.519379615783691, "learning_rate": 3.625244741932892e-05, "lm_loss": 0.0363, "loss": 0.4288, "step": 158 }, { "classification_loss": 0.4306, "epoch": 0.8412698412698413, "grad_norm": 4.434086322784424, "learning_rate": 3.6180339887498953e-05, "lm_loss": 0.0424, "loss": 0.4731, "step": 159 }, { "classification_loss": 0.4753, "epoch": 0.8465608465608465, "grad_norm": 4.594567775726318, "learning_rate": 3.610761838777665e-05, "lm_loss": 0.0528, "loss": 0.5281, "step": 160 }, { "classification_loss": 0.4848, "epoch": 0.8518518518518519, "grad_norm": 7.448020935058594, "learning_rate": 3.6034285679601334e-05, "lm_loss": 0.0618, "loss": 0.5466, "step": 161 }, { "classification_loss": 0.1928, "epoch": 0.8571428571428571, "grad_norm": 6.5110931396484375, "learning_rate": 3.5960344545604796e-05, "lm_loss": 0.0124, "loss": 0.2053, "step": 162 }, { "classification_loss": 0.3206, "epoch": 0.8624338624338624, "grad_norm": 4.479043006896973, "learning_rate": 3.588579779150572e-05, "lm_loss": 0.0355, "loss": 0.3561, "step": 163 }, { "classification_loss": 0.2145, "epoch": 0.8677248677248677, "grad_norm": 6.838748931884766, "learning_rate": 3.581064824600327e-05, "lm_loss": 0.0209, "loss": 0.2354, "step": 164 }, { "classification_loss": 0.5187, "epoch": 0.873015873015873, "grad_norm": 7.051374912261963, "learning_rate": 3.573489876066967e-05, "lm_loss": 0.0476, "loss": 0.5662, "step": 165 }, { "classification_loss": 0.4017, "epoch": 0.8783068783068783, "grad_norm": 5.828752517700195, "learning_rate": 3.565855220984206e-05, "lm_loss": 0.0468, "loss": 0.4484, "step": 166 }, { "classification_loss": 0.2539, "epoch": 0.8835978835978836, "grad_norm": 7.273751735687256, "learning_rate": 3.558161149051341e-05, "lm_loss": 0.0206, "loss": 0.2745, "step": 167 }, { "classification_loss": 0.8012, "epoch": 0.8888888888888888, "grad_norm": 9.21143627166748, "learning_rate": 3.55040795222226e-05, "lm_loss": 0.0898, "loss": 0.891, "step": 168 }, { "classification_loss": 0.1164, "epoch": 0.8941798941798942, "grad_norm": 4.1860032081604, "learning_rate": 3.542595924694362e-05, "lm_loss": 0.013, "loss": 0.1294, "step": 169 }, { "classification_loss": 0.5173, "epoch": 0.8994708994708994, "grad_norm": 7.0001301765441895, "learning_rate": 3.534725362897394e-05, "lm_loss": 0.0479, "loss": 0.5651, "step": 170 }, { "classification_loss": 0.2549, "epoch": 0.9047619047619048, "grad_norm": 7.007653713226318, "learning_rate": 3.526796565482206e-05, "lm_loss": 0.0301, "loss": 0.285, "step": 171 }, { "classification_loss": 0.4568, "epoch": 0.91005291005291, "grad_norm": 5.752516746520996, "learning_rate": 3.5188098333094145e-05, "lm_loss": 0.0573, "loss": 0.5141, "step": 172 }, { "classification_loss": 0.7478, "epoch": 0.9153439153439153, "grad_norm": 8.939093589782715, "learning_rate": 3.5107654694379873e-05, "lm_loss": 0.0759, "loss": 0.8238, "step": 173 }, { "classification_loss": 0.5309, "epoch": 0.9206349206349206, "grad_norm": 5.358952522277832, "learning_rate": 3.502663779113747e-05, "lm_loss": 0.0571, "loss": 0.588, "step": 174 }, { "classification_loss": 0.8604, "epoch": 0.9259259259259259, "grad_norm": 9.570755958557129, "learning_rate": 3.494505069757782e-05, "lm_loss": 0.0792, "loss": 0.9396, "step": 175 }, { "classification_loss": 0.489, "epoch": 0.9312169312169312, "grad_norm": 7.282289981842041, "learning_rate": 3.4862896509547886e-05, "lm_loss": 0.0456, "loss": 0.5346, "step": 176 }, { "classification_loss": 0.4567, "epoch": 0.9365079365079365, "grad_norm": 6.593536853790283, "learning_rate": 3.478017834441319e-05, "lm_loss": 0.0574, "loss": 0.5142, "step": 177 }, { "classification_loss": 0.329, "epoch": 0.9417989417989417, "grad_norm": 6.121315002441406, "learning_rate": 3.4696899340939517e-05, "lm_loss": 0.0372, "loss": 0.3662, "step": 178 }, { "classification_loss": 0.4779, "epoch": 0.9470899470899471, "grad_norm": 6.270368576049805, "learning_rate": 3.4613062659173865e-05, "lm_loss": 0.0542, "loss": 0.5321, "step": 179 }, { "classification_loss": 0.6646, "epoch": 0.9523809523809523, "grad_norm": 7.001364231109619, "learning_rate": 3.452867148032449e-05, "lm_loss": 0.0641, "loss": 0.7287, "step": 180 }, { "classification_loss": 0.3869, "epoch": 0.9576719576719577, "grad_norm": 4.298563003540039, "learning_rate": 3.4443729006640186e-05, "lm_loss": 0.0346, "loss": 0.4216, "step": 181 }, { "classification_loss": 0.4179, "epoch": 0.9629629629629629, "grad_norm": 4.519134044647217, "learning_rate": 3.435823846128884e-05, "lm_loss": 0.0425, "loss": 0.4604, "step": 182 }, { "classification_loss": 0.5921, "epoch": 0.9682539682539683, "grad_norm": 5.8256916999816895, "learning_rate": 3.427220308823505e-05, "lm_loss": 0.0601, "loss": 0.6522, "step": 183 }, { "classification_loss": 0.383, "epoch": 0.9735449735449735, "grad_norm": 4.641335964202881, "learning_rate": 3.418562615211707e-05, "lm_loss": 0.0435, "loss": 0.4265, "step": 184 }, { "classification_loss": 0.3616, "epoch": 0.9788359788359788, "grad_norm": 4.482592582702637, "learning_rate": 3.409851093812295e-05, "lm_loss": 0.032, "loss": 0.3936, "step": 185 }, { "classification_loss": 0.6806, "epoch": 0.9841269841269841, "grad_norm": 5.762667655944824, "learning_rate": 3.401086075186582e-05, "lm_loss": 0.061, "loss": 0.7416, "step": 186 }, { "classification_loss": 0.3452, "epoch": 0.9894179894179894, "grad_norm": 5.030579566955566, "learning_rate": 3.392267891925854e-05, "lm_loss": 0.0322, "loss": 0.3774, "step": 187 }, { "classification_loss": 0.1775, "epoch": 0.9947089947089947, "grad_norm": 3.0634727478027344, "learning_rate": 3.383396878638741e-05, "lm_loss": 0.0224, "loss": 0.1999, "step": 188 }, { "classification_loss": 0.2488, "epoch": 1.0, "grad_norm": 6.196967124938965, "learning_rate": 3.374473371938526e-05, "lm_loss": 0.021, "loss": 0.2698, "step": 189 }, { "classification_loss": 0.6167, "epoch": 1.0052910052910053, "grad_norm": 7.134896278381348, "learning_rate": 3.365497710430371e-05, "lm_loss": 0.0709, "loss": 0.6876, "step": 190 }, { "classification_loss": 0.2399, "epoch": 1.0105820105820107, "grad_norm": 5.862342834472656, "learning_rate": 3.356470234698468e-05, "lm_loss": 0.0159, "loss": 0.2558, "step": 191 }, { "classification_loss": 0.3992, "epoch": 1.0158730158730158, "grad_norm": 4.166635513305664, "learning_rate": 3.347391287293115e-05, "lm_loss": 0.0494, "loss": 0.4485, "step": 192 }, { "classification_loss": 0.0891, "epoch": 1.0211640211640212, "grad_norm": 2.7699098587036133, "learning_rate": 3.3382612127177166e-05, "lm_loss": 0.0064, "loss": 0.0956, "step": 193 }, { "classification_loss": 0.6035, "epoch": 1.0264550264550265, "grad_norm": 6.711143970489502, "learning_rate": 3.329080357415716e-05, "lm_loss": 0.0677, "loss": 0.6712, "step": 194 }, { "classification_loss": 0.4115, "epoch": 1.0317460317460316, "grad_norm": 4.826721668243408, "learning_rate": 3.319849069757446e-05, "lm_loss": 0.0396, "loss": 0.4511, "step": 195 }, { "classification_loss": 0.3025, "epoch": 1.037037037037037, "grad_norm": 4.070664882659912, "learning_rate": 3.310567700026908e-05, "lm_loss": 0.0294, "loss": 0.3319, "step": 196 }, { "classification_loss": 0.2767, "epoch": 1.0423280423280423, "grad_norm": 5.486062049865723, "learning_rate": 3.301236600408484e-05, "lm_loss": 0.0329, "loss": 0.3097, "step": 197 }, { "classification_loss": 0.4176, "epoch": 1.0476190476190477, "grad_norm": 4.901826858520508, "learning_rate": 3.291856124973575e-05, "lm_loss": 0.0482, "loss": 0.4658, "step": 198 }, { "classification_loss": 0.5174, "epoch": 1.052910052910053, "grad_norm": 5.271239757537842, "learning_rate": 3.282426629667157e-05, "lm_loss": 0.0517, "loss": 0.5692, "step": 199 }, { "classification_loss": 0.3984, "epoch": 1.0582010582010581, "grad_norm": 4.809482574462891, "learning_rate": 3.272948472294283e-05, "lm_loss": 0.0345, "loss": 0.4328, "step": 200 }, { "epoch": 1.0582010582010581, "eval_accuracy": 0.8898305296897888, "eval_auroc": 0.9264024496078491, "eval_classification_loss": 0.42330121994018555, "eval_f1": 0.8592057824134827, "eval_lm_loss": 0.0521097332239151, "eval_loss": 0.4754108786582947, "eval_precision": 0.7777777910232544, "eval_recall": 0.9596773982048035, "eval_runtime": 15.5476, "eval_samples_per_second": 22.769, "eval_steps_per_second": 1.479, "step": 200 } ], "logging_steps": 1, "max_steps": 567, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.0885255820288e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }