_n_gpu: 3
accelerator_config:
  dispatch_batches: null
  even_batches: true
  gradient_accumulation_kwargs: null
  non_blocking: false
  split_batches: false
  use_configured_state: false
  use_seedable_sampler: true
activation_offloading: false
adafactor: false
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1.0e-08
assistant_only_loss: false
auto_find_batch_size: false
auto_infer_class_weights: false
average_tokens_across_devices: true
batch_eval_metrics: false
bf16: true
bf16_full_eval: false
chat_template_path: null
chosen_column: null
class_weights: null
completion_only_loss: false
data_dir: storage/data
data_seed: null
dataloader_drop_last: false
dataloader_num_workers: 0
dataloader_persistent_workers: false
dataloader_pin_memory: true
dataloader_prefetch_factor: null
dataset_kwargs: null
dataset_num_proc: null
dataset_text_field: text
ddp_backend: null
ddp_broadcast_buffers: null
ddp_bucket_cap_mb: null
ddp_find_unused_parameters: null
ddp_timeout: 1800
debug: []
deepspeed: null
disable_tqdm: false
do_eval: false
do_lora: true
do_predict: false
do_train: false
early_stopping_patience: null
early_stopping_threshold: 0.0
env_dir: /project2/jonmay_1426/ashokd/llm-utils/env/.venv/
eos_token: null
eval_accumulation_steps: null
eval_delay: 0
eval_do_concat_batches: true
eval_max_new_tokens: 512
eval_on_start: false
eval_packing: null
eval_steps: null
eval_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
- 'no'
eval_use_gather_object: false
evaluate_before_training: false
figure_dir: results/figures
figure_force_save: true
fp16: false
fp16_backend: auto
fp16_full_eval: false
fp16_opt_level: O1
fsdp: []
fsdp_config:
  min_num_params: 0
  xla: false
  xla_fsdp_grad_ckpt: false
  xla_fsdp_v2: false
fsdp_min_num_params: 0
fsdp_transformer_layer_cls_to_wrap: null
full_determinism: false
ga_forget_column: forget
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs: null
greater_is_better: null
group_by_length: false
half_precision_backend: auto
hub_always_push: false
hub_model_id: pqb_eqa_test_pretraining_small_Llama-3.1-8B-Instruct
hub_private_repo: null
hub_revision: null
hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
- every_save
hub_token: null
huggingface_hub_username: Anonymous
ignore_data_skip: false
image_input_column: image
include_for_metrics: []
include_inputs_for_metrics: false
include_num_input_tokens_seen: 'no'
include_tokens_per_second: false
input_column: input
jit_mode_eval: false
label_names: null
label_smoothing_factor: 0.0
learning_rate: 0.0005
length_column_name: length
liger_kernel_config: null
load_best_model_at_end: false
local_rank: 0
log_dir: results/logs
log_file: results/logs/log.txt
log_level: passive
log_level_replica: warning
log_on_each_node: true
log_verbose: false
logger: !!python/object/apply:logging.getLogger
- LLM-Utils
logging_dir: /project2/jessetho_1732/ashokd//KnowledgeAcquisition//models/pqb_eqa/test/pretraining/small/Llama-3.1-8B-Instruct/runs/Nov07_15-50-53_c05-03.hpc.usc.edu
logging_first_step: false
logging_nan_inf_filter: true
logging_steps: 0.1
logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
- epoch
lora_alpha: 16
lora_dropout: 0.05
lora_r: 8
lora_target_modules: null
loss_type: nll
lr_scheduler_kwargs: {}
lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
- cosine
max_grad_norm: 1.0
max_input_length: 512
max_length: 1024
max_steps: -1
max_test_samples: null
max_train_samples: null
max_valid_samples: null
metric_for_best_model: null
modality: lm
model_dir: storage/models
model_dtype: float16
model_init_kwargs: null
model_name: meta-llama/Llama-3.1-8B-Instruct
mp_parameters: ''
n_eval_output_batches: 1
neftune_noise_alpha: null
no_cuda: false
num_train_epochs: 1.0
num_workers: 4
openai_tmp_dir: ./openai_tmp_files
optim: !!python/object/apply:transformers.training_args.OptimizerNames
- adamw_torch
optim_args: null
optim_target_modules: null
output_column: output
output_dir: /project2/jessetho_1732/ashokd//KnowledgeAcquisition//models/pqb_eqa/test/pretraining/small/Llama-3.1-8B-Instruct
overwrite_output_dir: false
packing: true
packing_strategy: bfd
pad_to_multiple_of: null
pad_token: null
padding_free: false
parallelism_config: null
past_index: -1
per_device_eval_batch_size: 4
per_device_train_batch_size: 4
per_gpu_eval_batch_size: null
per_gpu_train_batch_size: null
prediction_loss_only: false
pretrain_with_output: false
project: huggingface
project_root: /home1/ashokd/projects/KnowledgeAcquisition/llm-utils
push_to_hub: true
push_to_hub_model_id: null
push_to_hub_organization: null
push_to_hub_token: null
random_seed: null
ray_scope: last
rejected_column: null
remove_unused_columns: true
report_to: wandb
restore_callback_states_from_checkpoint: false
results_dir: results
resume_from_checkpoint: false
run_name: pqb_eqa_test_pretraining_small_Llama-3.1-8B-Instruct
run_start_time: 2025-11-07-H23-M50-S53
save_on_each_node: false
save_only_model: false
save_safetensors: true
save_steps: 500
save_strategy: !!python/object/apply:transformers.trainer_utils.SaveStrategy
- steps
save_total_limit: 2
seed: 42
shuffle_buffer: 5000
skip_memory_metrics: true
storage_dir: storage
streaming: true
test_file: null
tf32: null
tmp_dir: storage/tmp
torch_compile: false
torch_compile_backend: null
torch_compile_mode: null
torch_empty_cache_steps: null
torchdynamo: null
tpu_metrics_debug: false
tpu_num_cores: null
trackio_space_id: trackio
train_file: /project2/jessetho_1732/ashokd//KnowledgeAcquisition//data/curated_data/pqb_eqa/test/pretraining/small/train.csv
train_validation_split: null
training_kind: pre
use_bnb: false
use_cpu: false
use_legacy_prediction_loop: false
use_liger_kernel: false
use_mps_device: false
use_peft: true
validation_file: null
validation_test_split: null
warmup_ratio: 0.1
warmup_steps: 0
weight_decay: 0.01
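Because the listing above is a PyYAML dump of the live config object, its !!python/object/apply tags (the transformers enums and the logging.getLogger call) are rejected by yaml.safe_load. Below is a minimal sketch of reading the dump back in Python, assuming it is saved as config.yaml (an illustrative filename) and that transformers is importable so the tagged objects can be reconstructed; this is not part of the original run.

import yaml  # PyYAML >= 5.1 provides yaml.unsafe_load

# unsafe_load executes the python/object/apply tags, so use it only on files
# you generated yourself (here, re-importing the run's own config dump).
with open("config.yaml") as f:
    cfg = yaml.unsafe_load(f)

print(cfg["model_name"])                   # meta-llama/Llama-3.1-8B-Instruct
print(cfg["learning_rate"])                # 0.0005
print(cfg["per_device_train_batch_size"])  # 4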