seed: 101112

### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
trust_remote_code: true
flash_attn: auto
use_cache: false

### method
stage: sft
do_train: true
finetuning_type: lora
lora_rank: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target: k_proj,v_proj,down_proj

### dataset
dataset: hellaswag
template: llama3
cutoff_len: 2048
overwrite_cache: true
preprocessing_num_workers: 8
dataloader_num_workers: 2
packing: false

### output
output_dir: saves_multiple/lora/llama-3-8b-instruct/train_hellaswag_101112_1760638083
logging_steps: 5
save_steps: 0.05
overwrite_output_dir: true
save_only_model: false
plot_loss: true
include_num_input_tokens_seen: true
push_to_hub: true
push_to_hub_organization: rbelanec
load_best_model_at_end: true
save_total_limit: 1

### train
per_device_train_batch_size: 4
learning_rate: 5.0e-5
num_train_epochs: 20
weight_decay: 1.0e-5
lr_scheduler_type: cosine
bf16: true
ddp_timeout: 180000000
resume_from_checkpoint: null
warmup_ratio: 0.1
optim: adamw_torch
report_to:
- wandb
run_name: lora_llama-3-8b-instruct_train_hellaswag_101112_1760638083

### eval
per_device_eval_batch_size: 4
eval_strategy: steps
eval_steps: 0.05
val_size: 0.1
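
### usage (note, not part of the original config)
# A minimal sketch of how a config file like this is typically launched, assuming
# the LLaMA-Factory CLI is installed and this file is saved locally (the file name
# below is hypothetical):
#
#   llamafactory-cli train train_hellaswag_lora.yaml
#
# Fractional values such as save_steps: 0.05 and eval_steps: 0.05 are interpreted
# as a ratio of the total training steps rather than an absolute step count.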