{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.996996996996997, "eval_steps": 500, "global_step": 166, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006006006006006006, "grad_norm": 4.622050762176514, "learning_rate": 1.1764705882352942e-05, "loss": 2.2096, "step": 1 }, { "epoch": 0.03003003003003003, "grad_norm": 4.466672420501709, "learning_rate": 5.882352941176471e-05, "loss": 2.1887, "step": 5 }, { "epoch": 0.06006006006006006, "grad_norm": 2.914835214614868, "learning_rate": 0.00011764705882352942, "loss": 2.089, "step": 10 }, { "epoch": 0.09009009009009009, "grad_norm": 2.6984052658081055, "learning_rate": 0.00017647058823529413, "loss": 1.9548, "step": 15 }, { "epoch": 0.12012012012012012, "grad_norm": 2.9828903675079346, "learning_rate": 0.00019980001615408228, "loss": 1.8137, "step": 20 }, { "epoch": 0.15015015015015015, "grad_norm": 1.9843279123306274, "learning_rate": 0.00019858078810097002, "loss": 1.6709, "step": 25 }, { "epoch": 0.18018018018018017, "grad_norm": 1.2187334299087524, "learning_rate": 0.00019626695552163578, "loss": 1.5917, "step": 30 }, { "epoch": 0.21021021021021022, "grad_norm": 1.0200163125991821, "learning_rate": 0.00019288421035528028, "loss": 1.585, "step": 35 }, { "epoch": 0.24024024024024024, "grad_norm": 1.0672650337219238, "learning_rate": 0.00018847011335021449, "loss": 1.5534, "step": 40 }, { "epoch": 0.2702702702702703, "grad_norm": 1.100464940071106, "learning_rate": 0.0001830736770032341, "loss": 1.5358, "step": 45 }, { "epoch": 0.3003003003003003, "grad_norm": 0.978142499923706, "learning_rate": 0.000176754821343025, "loss": 1.526, "step": 50 }, { "epoch": 0.3303303303303303, "grad_norm": 1.1280028820037842, "learning_rate": 0.00016958370860037717, "loss": 1.519, "step": 55 }, { "epoch": 0.36036036036036034, "grad_norm": 0.9044827222824097, "learning_rate": 0.00016163996415278424, "loss": 1.5143, "step": 60 }, { "epoch": 0.39039039039039036, "grad_norm": 0.949729323387146, "learning_rate": 0.00015301179239376938, "loss": 1.5115, "step": 65 }, { "epoch": 0.42042042042042044, "grad_norm": 0.9165012836456299, "learning_rate": 0.00014379499734399798, "loss": 1.5074, "step": 70 }, { "epoch": 0.45045045045045046, "grad_norm": 0.8735723495483398, "learning_rate": 0.0001340919188789477, "loss": 1.496, "step": 75 }, { "epoch": 0.4804804804804805, "grad_norm": 0.8276928663253784, "learning_rate": 0.00012401029638486953, "loss": 1.4897, "step": 80 }, { "epoch": 0.5105105105105106, "grad_norm": 0.8228772282600403, "learning_rate": 0.0001136620724605827, "loss": 1.4963, "step": 85 }, { "epoch": 0.5405405405405406, "grad_norm": 0.8163589835166931, "learning_rate": 0.0001031621499483559, "loss": 1.4858, "step": 90 }, { "epoch": 0.5705705705705706, "grad_norm": 0.9330477118492126, "learning_rate": 9.26271160953421e-05, "loss": 1.4865, "step": 95 }, { "epoch": 0.6006006006006006, "grad_norm": 0.8192353248596191, "learning_rate": 8.217394801200631e-05, "loss": 1.476, "step": 100 }, { "epoch": 0.6306306306306306, "grad_norm": 0.8679515719413757, "learning_rate": 7.191871380165538e-05, "loss": 1.4751, "step": 105 }, { "epoch": 0.6606606606606606, "grad_norm": 0.9080681800842285, "learning_rate": 6.197528378324665e-05, "loss": 1.4706, "step": 110 }, { "epoch": 0.6906906906906907, "grad_norm": 0.8363797664642334, "learning_rate": 5.2454066117578815e-05, "loss": 1.4791, "step": 115 }, { "epoch": 0.7207207207207207, "grad_norm": 0.865535318851471, "learning_rate": 4.346078087600412e-05, "loss": 1.475, "step": 120 }, { "epoch": 0.7507507507507507, "grad_norm": 0.8175177574157715, "learning_rate": 3.509528616394716e-05, "loss": 1.4816, "step": 125 }, { "epoch": 0.7807807807807807, "grad_norm": 0.8171746134757996, "learning_rate": 2.7450469333520855e-05, "loss": 1.4713, "step": 130 }, { "epoch": 0.8108108108108109, "grad_norm": 0.8764714002609253, "learning_rate": 2.06112155968028e-05, "loss": 1.4671, "step": 135 }, { "epoch": 0.8408408408408409, "grad_norm": 0.856728732585907, "learning_rate": 1.4653465491908003e-05, "loss": 1.4823, "step": 140 }, { "epoch": 0.8708708708708709, "grad_norm": 0.8493318557739258, "learning_rate": 9.643371667405698e-06, "loss": 1.469, "step": 145 }, { "epoch": 0.9009009009009009, "grad_norm": 0.8912323117256165, "learning_rate": 5.636564347832907e-06, "loss": 1.4737, "step": 150 }, { "epoch": 0.9309309309309309, "grad_norm": 0.9049487113952637, "learning_rate": 2.677533636303964e-06, "loss": 1.4727, "step": 155 }, { "epoch": 0.960960960960961, "grad_norm": 0.8411799669265747, "learning_rate": 7.991355128984079e-07, "loss": 1.4699, "step": 160 }, { "epoch": 0.990990990990991, "grad_norm": 0.885953426361084, "learning_rate": 2.222701403818972e-08, "loss": 1.4745, "step": 165 }, { "epoch": 0.996996996996997, "eval_loss": 2.1457931995391846, "eval_runtime": 0.6019, "eval_samples_per_second": 39.873, "eval_steps_per_second": 1.661, "step": 166 }, { "epoch": 0.996996996996997, "step": 166, "total_flos": 6.968531416388731e+17, "train_loss": 1.56508505703455, "train_runtime": 590.2519, "train_samples_per_second": 54.006, "train_steps_per_second": 0.281 } ], "logging_steps": 5, "max_steps": 166, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.968531416388731e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }