# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Llama3 8B model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Meta-Llama-3-8B --output-dir ./model --hf-token <HF_TOKEN>
#
# To launch on 2 devices, run the following command from root:
#   tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training
# you can run:
#   tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device LoRA finetuning please use 8B_lora_single_device.yaml
# or 8B_qlora_single_device.yaml

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: ./model/original/tokenizer.model
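# Note: tokenizer.model is the tiktoken BPE file that ships in the original/
# subfolder of the Meta-Llama-3-8B download.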

# Model Arguments
model:
  _component_: torchtune.models.llama3.lora_llama3_8b
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16
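# LoRA adapters are applied only to the attention q/v projections above; the
# adapter scaling factor is lora_alpha / lora_rank (16 / 8 = 2.0 here).
# Raising lora_rank increases adapter capacity at the cost of more trainable
# parameters and memory.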

checkpointer:
  _component_: torchtune.utils.FullModelMetaCheckpointer
  checkpoint_dir: ./model/original/
  checkpoint_files: [
    consolidated.00.pth
  ]
  recipe_checkpoint: null
  output_dir: ./finetuned_model/
  model_type: LLAMA3
resume_from_checkpoint: False
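# The Meta-format checkpointer loads consolidated.00.pth from checkpoint_dir and
# writes fine-tuned checkpoints (typically both the merged model weights and the
# LoRA adapter weights) to output_dir at the end of each epoch. To resume an
# interrupted run, set resume_from_checkpoint: True and point recipe_checkpoint
# at the saved recipe state file.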

# Dataset and Sampler
# The instruct_dataset builder below is roughly equivalent to:
#   InstructDataset(
#       tokenizer=tokenizer,
#       source="grammarly/coedit",
#       template=GrammarErrorCorrectionTemplate,
#       column_map={"sentence": "src", "output": "tgt"},
#       train_on_input=False,
#       split="train",
#   )
dataset:
  _component_: torchtune.datasets.instruct_dataset
  source: grammarly/coedit
  template: GrammarErrorCorrectionTemplate
  column_map: {"sentence": "src", "output": "tgt"}
  train_on_input: False
  split: train
seed: 123
shuffle: True
batch_size: 4
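# column_map maps the coedit columns (src, tgt) onto the (sentence, output)
# fields expected by GrammarErrorCorrectionTemplate. train_on_input: False masks
# the prompt tokens out of the loss, so only the corrected sentence is trained
# on. batch_size is per device, i.e. per GPU.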

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100
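# Cosine decay after 100 linear warmup steps; the recipe derives the total
# number of scheduler steps from epochs, dataset size, batch size, and
# gradient_accumulation_steps.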

loss:
  _component_: torch.nn.CrossEntropyLoss

# Training
epochs: 2
max_steps_per_epoch: null
gradient_accumulation_steps: 32
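# max_steps_per_epoch: null runs through the full dataset each epoch. With 2
# devices, the effective batch size per optimizer step is
# batch_size * gradient_accumulation_steps * num_devices = 4 * 32 * 2 = 256.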

# Logging
output_dir: ./lora_finetune_output
metric_logger:
  _component_: torchtune.utils.metric_logging.WandBLogger
  project: torchtune
  group: llama3-grammarly
log_every_n_steps: null
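# WandBLogger requires the wandb package and an authenticated account
# (pip install wandb && wandb login). To log to local files instead, swap in
# torchtune.utils.metric_logging.DiskLogger.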

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: False
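# dtype: bf16 requires a GPU with native bfloat16 support (e.g. NVIDIA Ampere or
# newer); use fp32 otherwise. Setting enable_activation_checkpointing: True
# trades extra compute for lower activation memory if you run out of GPU memory.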