_n_gpu: 3
accelerator_config:
  dispatch_batches: null
  even_batches: true
  gradient_accumulation_kwargs: null
  non_blocking: false
  split_batches: false
  use_configured_state: false
  use_seedable_sampler: true
activation_offloading: false
adafactor: false
adam_beta1: 0.9
adam_beta2: 0.999
adam_epsilon: 1.0e-08
assistant_only_loss: false
auto_find_batch_size: false
auto_infer_class_weights: false
average_tokens_across_devices: true
batch_eval_metrics: false
bf16: true
bf16_full_eval: false
chat_template_path: null
chosen_column: null
class_weights: null
completion_only_loss: false
data_dir: storage/data
data_seed: null
dataloader_drop_last: false
dataloader_num_workers: 0
dataloader_persistent_workers: false
dataloader_pin_memory: true
dataloader_prefetch_factor: null
dataset_kwargs: null
dataset_num_proc: null
dataset_text_field: text
ddp_backend: null
ddp_broadcast_buffers: null
ddp_bucket_cap_mb: null
ddp_find_unused_parameters: null
ddp_timeout: 1800
debug: []
deepspeed: null
disable_tqdm: false
do_eval: false
do_lora: true
do_predict: false
do_train: false
early_stopping_patience: null
early_stopping_threshold: 0.0
env_dir: /project2/jonmay_1426/ashokd/llm-utils/env/.venv/
eos_token: null
eval_accumulation_steps: null
eval_delay: 0
eval_do_concat_batches: true
eval_max_new_tokens: 512
eval_on_start: false
eval_packing: null
eval_steps: null
eval_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
- 'no'
eval_use_gather_object: false
evaluate_before_training: false
figure_dir: results/figures
figure_force_save: true
fp16: false
fp16_backend: auto
fp16_full_eval: false
fp16_opt_level: O1
fsdp: []
fsdp_config:
  min_num_params: 0
  xla: false
  xla_fsdp_grad_ckpt: false
  xla_fsdp_v2: false
fsdp_min_num_params: 0
fsdp_transformer_layer_cls_to_wrap: null
full_determinism: false
ga_forget_column: forget
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs: null
greater_is_better: null
group_by_length: false
half_precision_backend: auto
hub_always_push: false
hub_model_id: pqb_eqa_test_pretraining_small_Llama-3.1-8B-Instruct
hub_private_repo: null
hub_revision: null
hub_strategy: !!python/object/apply:transformers.trainer_utils.HubStrategy
- every_save
hub_token: null
huggingface_hub_username: Anonymous
ignore_data_skip: false
image_input_column: image
include_for_metrics: []
include_inputs_for_metrics: false
include_num_input_tokens_seen: 'no'
include_tokens_per_second: false
input_column: input
jit_mode_eval: false
label_names: null
label_smoothing_factor: 0.0
learning_rate: 0.0005
length_column_name: length
liger_kernel_config: null
load_best_model_at_end: false
local_rank: 0
log_dir: results/logs
log_file: results/logs/log.txt
log_level: passive
log_level_replica: warning
log_on_each_node: true
log_verbose: false
logger: !!python/object/apply:logging.getLogger
- LLM-Utils
logging_dir: /project2/jessetho_1732/ashokd//KnowledgeAcquisition//models/pqb_eqa/test/pretraining/small/Llama-3.1-8B-Instruct/runs/Nov07_15-50-53_c05-03.hpc.usc.edu
logging_first_step: false
logging_nan_inf_filter: true
logging_steps: 0.1
logging_strategy: !!python/object/apply:transformers.trainer_utils.IntervalStrategy
- epoch
lora_alpha: 16
lora_dropout: 0.05
lora_r: 8
lora_target_modules: null
loss_type: nll
lr_scheduler_kwargs: {}
lr_scheduler_type: !!python/object/apply:transformers.trainer_utils.SchedulerType
- cosine
max_grad_norm: 1.0
max_input_length: 512
max_length: 1024
max_steps: -1
max_test_samples: null
max_train_samples: null
max_valid_samples: null
metric_for_best_model: null
modality: lm
model_dir: storage/models
model_dtype: float16
model_init_kwargs: null
model_name: meta-llama/Llama-3.1-8B-Instruct
mp_parameters: ''
n_eval_output_batches: 1
neftune_noise_alpha: null
no_cuda: false
num_train_epochs: 1.0
num_workers: 4
openai_tmp_dir: ./openai_tmp_files
optim: !!python/object/apply:transformers.training_args.OptimizerNames
- adamw_torch
optim_args: null
optim_target_modules: null
output_column: output
output_dir: /project2/jessetho_1732/ashokd//KnowledgeAcquisition//models/pqb_eqa/test/pretraining/small/Llama-3.1-8B-Instruct
overwrite_output_dir: false
packing: true
packing_strategy: bfd
pad_to_multiple_of: null
pad_token: null
padding_free: false
parallelism_config: null
past_index: -1
per_device_eval_batch_size: 4
per_device_train_batch_size: 4
per_gpu_eval_batch_size: null
per_gpu_train_batch_size: null
prediction_loss_only: false
pretrain_with_output: false
project: huggingface
project_root: /home1/ashokd/projects/KnowledgeAcquisition/llm-utils
push_to_hub: true
push_to_hub_model_id: null
push_to_hub_organization: null
push_to_hub_token: null
random_seed: null
ray_scope: last
rejected_column: null
remove_unused_columns: true
report_to: wandb
restore_callback_states_from_checkpoint: false
results_dir: results
resume_from_checkpoint: false
run_name: pqb_eqa_test_pretraining_small_Llama-3.1-8B-Instruct
run_start_time: 2025-11-07-H23-M50-S53
save_on_each_node: false
save_only_model: false
save_safetensors: true
save_steps: 500
save_strategy: !!python/object/apply:transformers.trainer_utils.SaveStrategy
- steps
save_total_limit: 2
seed: 42
shuffle_buffer: 5000
skip_memory_metrics: true
storage_dir: storage
streaming: true
test_file: null
tf32: null
tmp_dir: storage/tmp
torch_compile: false
torch_compile_backend: null
torch_compile_mode: null
torch_empty_cache_steps: null
torchdynamo: null
tpu_metrics_debug: false
tpu_num_cores: null
trackio_space_id: trackio
train_file: /project2/jessetho_1732/ashokd//KnowledgeAcquisition//data/curated_data/pqb_eqa/test/pretraining/small/train.csv
train_validation_split: null
training_kind: pre
use_bnb: false
use_cpu: false
use_legacy_prediction_loop: false
use_liger_kernel: false
use_mps_device: false
use_peft: true
validation_file: null
validation_test_split: null
warmup_ratio: 0.1
warmup_steps: 0
weight_decay: 0.01
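Because the listing above is a PyYAML dump of the live config object, its !!python/object/apply tags (the transformers enums and the logging.getLogger call) are rejected by yaml.safe_load. Below is a minimal sketch of reading the dump back in Python, assuming it is saved as config.yaml (an illustrative filename) and that transformers is importable so the tagged objects can be reconstructed; this is not part of the original run.

import yaml  # PyYAML >= 5.1 provides yaml.unsafe_load

# unsafe_load executes the python/object/apply tags, so use it only on files
# you generated yourself (here, re-importing the run's own config dump).
with open("config.yaml") as f:
    cfg = yaml.unsafe_load(f)

print(cfg["model_name"])                   # meta-llama/Llama-3.1-8B-Instruct
print(cfg["learning_rate"])                # 0.0005
print(cfg["per_device_train_batch_size"])  # 4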