 # Config for multi-device LoRA finetuning in lora_finetune_distributed.py
 # using a Qwen2.5 0.5B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
+# This config assumes that you've run the following command before launching:
+#   tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None
 #
 # To launch on 2 devices, run the following command from root:
-#   tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora
+#   tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0.5B_lora
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+#   tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0.5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
-# This config works best when the model is being fine-tuned on 2+ GPUs.
-# For single device LoRA finetuning please use 0_5B_lora_single_device.yaml
 
-
-# Model Arguments
+# This config is for fine-tuning on 2+ GPUs.
+
+# Model arguments
 model:
   _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b
   lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
@@ -26,71 +22,66 @@ model:
   lora_alpha: 64  # usually alpha=2*rank
   lora_dropout: 0.0
 
+# Tokenizer
 tokenizer:
   _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
-  path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
-  merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
+  path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json
+  merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt
   max_seq_len: null
 
+# Checkpointer
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
-  checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
-  checkpoint_files: [
-    model.safetensors
-  ]
+  checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct
+  checkpoint_files: [model.safetensors]
   recipe_checkpoint: null
-  output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
+  output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune
   model_type: QWEN2
 resume_from_checkpoint: False
 
-# Dataset and Sampler
+# Dataset
 dataset:
   _component_: torchtune.datasets.alpaca_cleaned_dataset
   packed: False  # True increases speed
-
 seed: null
 shuffle: True
-batch_size: 4
 
-# Optimizer and Scheduler
+# Fine-tuning arguments
+epochs: 1
+max_steps_per_epoch: null
+batch_size: 2
+gradient_accumulation_steps: 8  # Use to increase virtual batch size
 optimizer:
   _component_: torch.optim.AdamW
   fused: True
   weight_decay: 0.01
   lr: 2e-3
-
 lr_scheduler:
   _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
   num_warmup_steps: 100
-
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 
-# Training
-epochs: 1
-max_steps_per_epoch: null
-gradient_accumulation_steps: 8  # Use to increase virtual batch size
-compile: False  # pytorch compile, set to true for better perf/memory
+# Training env
+device: cuda
+
+# Memory management / performance
+enable_activation_checkpointing: False  # True reduces memory
+enable_activation_offloading: False  # True reduces memory
+dtype: bf16
+compile: False  # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging
-output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
+output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: ${output_dir}
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
-# Environment
-device: cuda
-dtype: bf16
-enable_activation_checkpointing: True  # True reduces memory
-enable_activation_offloading: False  # True reduces memory
-
-# Show case the usage of pytorch profiler
-# Set enabled to False as it's only needed for debugging training
+# Profiler (disabled)
 profiler:
   _component_: torchtune.training.setup_torch_profiler
-
   enabled: False
 
   # Output directory of trace artifacts
@@ -109,6 +100,6 @@ profiler:
   # `torch.profiler.schedule` options:
   # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
   wait_steps: 5
-  warmup_steps: 5
+  warmup_steps: 3
   active_steps: 2
   num_cycles: 1
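
For a quick sanity check of the new defaults, here is a minimal sketch using plain OmegaConf (which the torchtune CLI builds on) to load the updated file and apply a dot-list override like the one shown in the header comments; the config path, override value, and num_devices constant are illustrative assumptions, not part of this change.

    from omegaconf import OmegaConf

    # Load the updated config and apply a CLI-style dot-list override, mirroring
    # `checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>` from the header comments.
    # The path and override value below are placeholders for illustration only.
    cfg = OmegaConf.load("recipes/configs/qwen2_5/0.5B_lora.yaml")
    overrides = OmegaConf.from_dotlist(["checkpointer.checkpoint_dir=/data/qwen2_5-ckpt"])
    cfg = OmegaConf.merge(cfg, overrides)

    # Effective (virtual) batch size per optimizer step under the new defaults:
    # per-device batch_size * gradient_accumulation_steps * number of devices.
    num_devices = 2  # assumed, matching the --nproc_per_node 2 launch command above
    print(cfg.batch_size * cfg.gradient_accumulation_steps * num_devices)  # 2 * 8 * 2 = 32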