{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9895470383275261, "eval_steps": 500, "global_step": 71, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 1.0074056386947632, "eval_runtime": 98.4091, "eval_samples_per_second": 47.221, "eval_steps_per_second": 0.193, "memory/device_mem_reserved(gib)": 26.24, "memory/max_mem_active(gib)": 22.2, "memory/max_mem_allocated(gib)": 22.2, "step": 0 }, { "epoch": 0.013937282229965157, "grad_norm": 33.0, "learning_rate": 0.0, "loss": 1.0193, "memory/device_mem_reserved(gib)": 101.63, "memory/max_mem_active(gib)": 96.87, "memory/max_mem_allocated(gib)": 96.87, "step": 1 }, { "epoch": 0.027874564459930314, "grad_norm": 30.25, "learning_rate": 3.5e-06, "loss": 0.9952, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 2 }, { "epoch": 0.041811846689895474, "grad_norm": 2.171875, "learning_rate": 7e-06, "loss": 0.9583, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 3 }, { "epoch": 0.05574912891986063, "grad_norm": 1.578125, "learning_rate": 6.9963728577635466e-06, "loss": 0.9165, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 4 }, { "epoch": 0.06968641114982578, "grad_norm": 1.015625, "learning_rate": 6.98549894886036e-06, "loss": 0.8888, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 5 }, { "epoch": 0.08362369337979095, "grad_norm": 0.7421875, "learning_rate": 6.9674008111271575e-06, "loss": 0.8721, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 6 }, { "epoch": 0.0975609756097561, "grad_norm": 0.671875, "learning_rate": 6.942115955718097e-06, "loss": 0.8577, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 7 }, { "epoch": 0.11149825783972125, "grad_norm": 0.57421875, "learning_rate": 6.909696789357177e-06, "loss": 0.8596, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 8 }, { "epoch": 0.1254355400696864, "grad_norm": 0.5703125, "learning_rate": 6.870210505717297e-06, "loss": 0.8492, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 9 }, { "epoch": 0.13937282229965156, "grad_norm": 0.5625, "learning_rate": 6.8237389461511175e-06, "loss": 0.8379, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 10 }, { "epoch": 0.15331010452961671, "grad_norm": 0.458984375, "learning_rate": 6.770378430062349e-06, "loss": 0.8453, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 11 }, { "epoch": 0.1672473867595819, "grad_norm": 0.52734375, "learning_rate": 6.710239555269086e-06, "loss": 0.8091, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 12 }, { "epoch": 0.18118466898954705, "grad_norm": 0.45703125, "learning_rate": 6.643446968772936e-06, "loss": 0.838, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 13 }, { "epoch": 0.1951219512195122, "grad_norm": 0.375, "learning_rate": 6.5701391084090805e-06, "loss": 0.7947, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 14 }, { "epoch": 0.20905923344947736, "grad_norm": 0.359375, "learning_rate": 6.49046791591271e-06, "loss": 0.7993, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 15 }, { "epoch": 0.2229965156794425, "grad_norm": 0.3828125, "learning_rate": 6.404598521996588e-06, "loss": 0.8075, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 16 }, { "epoch": 0.23693379790940766, "grad_norm": 0.37109375, "learning_rate": 6.312708904092424e-06, "loss": 0.8114, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 17 }, { "epoch": 0.2508710801393728, "grad_norm": 0.38671875, "learning_rate": 6.21498951746547e-06, "loss": 0.7909, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 18 }, { "epoch": 0.26480836236933797, "grad_norm": 0.439453125, "learning_rate": 6.111642900466899e-06, "loss": 0.7892, "memory/device_mem_reserved(gib)": 128.11, "memory/max_mem_active(gib)": 122.89, "memory/max_mem_allocated(gib)": 122.89, "step": 19 }, { "epoch": 0.2787456445993031, "grad_norm": 0.330078125, "learning_rate": 6.002883254742148e-06, "loss": 0.7954, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 20 }, { "epoch": 0.2926829268292683, "grad_norm": 0.328125, "learning_rate": 5.88893600126529e-06, "loss": 0.7871, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 21 }, { "epoch": 0.30662020905923343, "grad_norm": 0.310546875, "learning_rate": 5.770037313119646e-06, "loss": 0.7897, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 22 }, { "epoch": 0.3205574912891986, "grad_norm": 0.333984375, "learning_rate": 5.646433625993007e-06, "loss": 0.8007, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 23 }, { "epoch": 0.3344947735191638, "grad_norm": 0.302734375, "learning_rate": 5.518381127402035e-06, "loss": 0.79, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 24 }, { "epoch": 0.34843205574912894, "grad_norm": 0.345703125, "learning_rate": 5.386145225704515e-06, "loss": 0.7838, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 25 }, { "epoch": 0.3623693379790941, "grad_norm": 0.306640625, "learning_rate": 5.25e-06, "loss": 0.7863, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 26 }, { "epoch": 0.37630662020905925, "grad_norm": 0.43359375, "learning_rate": 5.110227632059032e-06, "loss": 0.7719, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 27 }, { "epoch": 0.3902439024390244, "grad_norm": 0.294921875, "learning_rate": 4.967117821458325e-06, "loss": 0.7827, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 28 }, { "epoch": 0.40418118466898956, "grad_norm": 0.291015625, "learning_rate": 4.82096718513415e-06, "loss": 0.7893, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 29 }, { "epoch": 0.4181184668989547, "grad_norm": 0.2470703125, "learning_rate": 4.672078642598451e-06, "loss": 0.7885, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 30 }, { "epoch": 0.43205574912891986, "grad_norm": 0.26953125, "learning_rate": 4.5207607880918874e-06, "loss": 0.7921, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 31 }, { "epoch": 0.445993031358885, "grad_norm": 0.279296875, "learning_rate": 4.36732725097515e-06, "loss": 0.7808, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 32 }, { "epoch": 0.45993031358885017, "grad_norm": 0.259765625, "learning_rate": 4.212096045684219e-06, "loss": 0.787, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 33 }, { "epoch": 0.4738675958188153, "grad_norm": 0.244140625, "learning_rate": 4.055388912596879e-06, "loss": 0.7653, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 34 }, { "epoch": 0.4878048780487805, "grad_norm": 0.248046875, "learning_rate": 3.897530651176662e-06, "loss": 0.7802, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 35 }, { "epoch": 0.5017421602787456, "grad_norm": 0.25390625, "learning_rate": 3.7388484467763488e-06, "loss": 0.7856, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 36 }, { "epoch": 0.5156794425087108, "grad_norm": 0.302734375, "learning_rate": 3.5796711924963697e-06, "loss": 0.7722, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 37 }, { "epoch": 0.5296167247386759, "grad_norm": 0.26953125, "learning_rate": 3.42032880750363e-06, "loss": 0.7803, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 38 }, { "epoch": 0.5435540069686411, "grad_norm": 0.2578125, "learning_rate": 3.261151553223652e-06, "loss": 0.7782, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 39 }, { "epoch": 0.5574912891986062, "grad_norm": 0.2412109375, "learning_rate": 3.1024693488233373e-06, "loss": 0.7711, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 40 }, { "epoch": 0.5714285714285714, "grad_norm": 0.26953125, "learning_rate": 2.94461108740312e-06, "loss": 0.7634, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 41 }, { "epoch": 0.5853658536585366, "grad_norm": 0.296875, "learning_rate": 2.7879039543157825e-06, "loss": 0.7734, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 42 }, { "epoch": 0.5993031358885017, "grad_norm": 0.26171875, "learning_rate": 2.6326727490248506e-06, "loss": 0.7876, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 43 }, { "epoch": 0.6132404181184669, "grad_norm": 0.240234375, "learning_rate": 2.4792392119081124e-06, "loss": 0.7603, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 44 }, { "epoch": 0.627177700348432, "grad_norm": 0.251953125, "learning_rate": 2.3279213574015483e-06, "loss": 0.7671, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 45 }, { "epoch": 0.6411149825783972, "grad_norm": 0.263671875, "learning_rate": 2.17903281486585e-06, "loss": 0.7783, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 46 }, { "epoch": 0.6550522648083623, "grad_norm": 0.2734375, "learning_rate": 2.0328821785416767e-06, "loss": 0.7866, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 47 }, { "epoch": 0.6689895470383276, "grad_norm": 0.251953125, "learning_rate": 1.8897723679409675e-06, "loss": 0.782, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 48 }, { "epoch": 0.6829268292682927, "grad_norm": 0.333984375, "learning_rate": 1.7500000000000008e-06, "loss": 0.7747, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 49 }, { "epoch": 0.6968641114982579, "grad_norm": 0.255859375, "learning_rate": 1.6138547742954857e-06, "loss": 0.7854, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 50 }, { "epoch": 0.710801393728223, "grad_norm": 0.2421875, "learning_rate": 1.4816188725979652e-06, "loss": 0.7655, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 51 }, { "epoch": 0.7247386759581882, "grad_norm": 0.2490234375, "learning_rate": 1.3535663740069923e-06, "loss": 0.7726, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 52 }, { "epoch": 0.7386759581881533, "grad_norm": 0.232421875, "learning_rate": 1.229962686880354e-06, "loss": 0.7836, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 125.6, "memory/max_mem_allocated(gib)": 125.6, "step": 53 }, { "epoch": 0.7526132404181185, "grad_norm": 0.2451171875, "learning_rate": 1.1110639987347114e-06, "loss": 0.7531, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 54 }, { "epoch": 0.7665505226480837, "grad_norm": 0.24609375, "learning_rate": 9.971167452578519e-07, "loss": 0.7736, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 55 }, { "epoch": 0.7804878048780488, "grad_norm": 0.259765625, "learning_rate": 8.883570995331009e-07, "loss": 0.7662, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 56 }, { "epoch": 0.794425087108014, "grad_norm": 0.2578125, "learning_rate": 7.850104825345303e-07, "loss": 0.7646, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 57 }, { "epoch": 0.8083623693379791, "grad_norm": 0.24609375, "learning_rate": 6.872910959075762e-07, "loss": 0.7673, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 58 }, { "epoch": 0.8222996515679443, "grad_norm": 0.248046875, "learning_rate": 5.954014780034123e-07, "loss": 0.7748, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 59 }, { "epoch": 0.8362369337979094, "grad_norm": 0.24609375, "learning_rate": 5.0953208408729e-07, "loss": 0.7784, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 60 }, { "epoch": 0.8501742160278746, "grad_norm": 0.283203125, "learning_rate": 4.2986089159092006e-07, "loss": 0.7638, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 61 }, { "epoch": 0.8641114982578397, "grad_norm": 0.244140625, "learning_rate": 3.5655303122706395e-07, "loss": 0.7682, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 62 }, { "epoch": 0.8780487804878049, "grad_norm": 0.27734375, "learning_rate": 2.897604447309151e-07, "loss": 0.7706, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 63 }, { "epoch": 0.89198606271777, "grad_norm": 0.2578125, "learning_rate": 2.2962156993765138e-07, "loss": 0.777, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 64 }, { "epoch": 0.9059233449477352, "grad_norm": 0.255859375, "learning_rate": 1.7626105384888284e-07, "loss": 0.7701, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 65 }, { "epoch": 0.9198606271777003, "grad_norm": 0.248046875, "learning_rate": 1.2978949428270303e-07, "loss": 0.761, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 66 }, { "epoch": 0.9337979094076655, "grad_norm": 0.283203125, "learning_rate": 9.030321064282354e-08, "loss": 0.7711, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 67 }, { "epoch": 0.9477351916376306, "grad_norm": 0.2451171875, "learning_rate": 5.788404428190291e-08, "loss": 0.7776, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 68 }, { "epoch": 0.9616724738675958, "grad_norm": 0.236328125, "learning_rate": 3.259918887284235e-08, "loss": 0.7955, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 69 }, { "epoch": 0.975609756097561, "grad_norm": 0.255859375, "learning_rate": 1.4501051139640508e-08, "loss": 0.7704, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 70 }, { "epoch": 0.9895470383275261, "grad_norm": 0.263671875, "learning_rate": 3.627142236453551e-09, "loss": 0.7826, "memory/device_mem_reserved(gib)": 133.93, "memory/max_mem_active(gib)": 128.58, "memory/max_mem_allocated(gib)": 128.58, "step": 71 } ], "logging_steps": 1, "max_steps": 71, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4926199650791719e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }