```yaml
time_start: null
DEBUG: false
debug_model: unsloth/Qwen2.5-7B-bnb-4bit
fold: 0
random_seed: true
train_on_all_folds: false
eval_only: false
merge_adapters: false
wandb_id: null
val_split_name: val
pad_token: <pad>
response_template_ids:
- 4
num_proc: 20
hub_repo_tags:
- odesia
```
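The `pad_token` and `response_template_ids` entries above control padding and loss masking during supervised fine-tuning: loss is computed only on tokens that follow the response template. A minimal sketch of how they could be wired up, assuming a TRL-style setup (the collator choice and tokenizer handling here are assumptions, not taken from the actual training script):

```python
# Minimal sketch, assuming a TRL-based SFT pipeline.
from transformers import AutoTokenizer
from trl import DataCollatorForCompletionOnlyLM

tokenizer = AutoTokenizer.from_pretrained("mistralai/Ministral-8B-Instruct-2410")
# pad_token from the config; assumes "<pad>" already exists in the tokenizer vocabulary.
tokenizer.pad_token = "<pad>"

# response_template_ids: [4] -- tokens before this template are ignored in the loss,
# so the model is trained only on the response portion of each example.
collator = DataCollatorForCompletionOnlyLM(
    response_template=[4],
    tokenizer=tokenizer,
)
```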
```yaml
script_args:
  dataset_name: nbroad/odesia-combined-v1
  config: null
  gradient_checkpointing_use_reentrant: true
  ignore_bias_buffers: false
```
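The `script_args` block points at the training data. A minimal sketch of loading it with the `datasets` library; the split names and any fold filtering are assumptions inferred from `dataset_name`, `val_split_name`, and `fold` above, and the actual script may handle them differently:

```python
# Minimal sketch, assuming the standard `datasets` API.
from datasets import load_dataset

ds = load_dataset("nbroad/odesia-combined-v1")  # script_args.dataset_name
train_ds = ds["train"]
val_ds = ds["val"]  # val_split_name from the top-level config
```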
```yaml
model_config:
  model_name_or_path: mistralai/Ministral-8B-Instruct-2410
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2
  use_peft: true
  lora_r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj
  - up_proj
  - down_proj
  - gate_proj
  lora_modules_to_save: null
  lora_task_type: CAUSAL_LM
  use_rslora: true
  load_in_8bit: false
  load_in_4bit: false
  bnb_4bit_quant_type: nf4
  use_bnb_nested_quant: true
```
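For reference, a minimal sketch of how `model_config` could translate into `transformers` and `peft` calls. Treating these fields as TRL `ModelConfig`-style options is an assumption; the bitsandbytes settings are inactive here because `load_in_8bit` and `load_in_4bit` are both false:

```python
# Minimal sketch of model + LoRA setup matching model_config above.
import torch
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Ministral-8B-Instruct-2410",
    torch_dtype=torch.bfloat16,               # torch_dtype: bfloat16
    attn_implementation="flash_attention_2",  # attn_implementation
)

lora_config = LoraConfig(
    r=16,                      # lora_r
    lora_alpha=32,             # lora_alpha
    lora_dropout=0.05,         # lora_dropout
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj",
                    "up_proj", "down_proj", "gate_proj"],
    task_type="CAUSAL_LM",     # lora_task_type
    use_rslora=True,           # rank-stabilized LoRA scaling
)
model = get_peft_model(model, lora_config)
```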
```yaml
training_args:
  resume_from_checkpoint: null
  output_dir: ./
  num_train_epochs: 1
  per_device_train_batch_size: 8
  per_device_eval_batch_size: 8
  warmup_ratio: 0.1
  fp16: false
  bf16: true
  eval_strategy: steps
  save_strategy: steps
  eval_steps: 100
  save_steps: 100
  save_total_limit: 2
  logging_steps: 2
  run_name: null
  weight_decay: 0.01
  report_to: wandb
  learning_rate: 6.0e-05
  metric_for_best_model: loss
  greater_is_better: false
  gradient_checkpointing: true
  gradient_accumulation_steps: 8
  gradient_checkpointing_kwargs:
    use_reentrant: true
  optim: adamw_torch
  dataloader_num_workers: 4
  seed: 18
  max_grad_norm: 2.0
  load_best_model_at_end: true
  push_to_hub: true
  hub_private_repo: true
  lr_scheduler_type: cosine
  remove_unused_columns: false
  ddp_find_unused_parameters: false
  use_liger_kernel: true
```
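These `training_args` map one-to-one onto `transformers.TrainingArguments` fields (TRL's `SFTConfig` subclasses it). A minimal sketch of the equivalent construction, assuming a `transformers` version recent enough to accept `eval_strategy` and `use_liger_kernel`:

```python
# Minimal sketch: TrainingArguments built from training_args above.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=8,   # effective batch size of 64 per device
    warmup_ratio=0.1,
    bf16=True,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    logging_steps=2,
    weight_decay=0.01,
    report_to="wandb",
    learning_rate=6e-5,
    metric_for_best_model="loss",
    greater_is_better=False,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": True},
    optim="adamw_torch",
    dataloader_num_workers=4,
    seed=18,
    max_grad_norm=2.0,
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_private_repo=True,
    lr_scheduler_type="cosine",
    remove_unused_columns=False,
    ddp_find_unused_parameters=False,
    use_liger_kernel=True,
)
```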