Commit 1eb7785: Update Qwen2.5 configs (#1999)
1 parent 18d97f0

9 files changed: +268 -334 lines

recipes/configs/qwen2_5/0_5B_full.yaml renamed to recipes/configs/qwen2_5/0.5B_full.yaml
Lines changed: 32 additions & 39 deletions

@@ -1,82 +1,75 @@
 # Config for multi-device full finetuning in full_finetune_distributed.py
 # using a Qwen2.5 0.5B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
+# This config assumes that you've run the following command before launching:
+# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None
 #
 # To launch on 2 devices, run the following command from root:
-# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full
+# tune run --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0.5B_full
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+# tune run --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0.5B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
-# This config works best when the model is being fine-tuned on 2+ GPUs.
-# Single device full finetuning requires more memory optimizations. It's
-# best to use 0_5B_full_single_device.yaml for those cases
+# This config is for fine-tuning on 2+ GPUs.
+
+# Model arguments
+model:
+  _component_: torchtune.models.qwen2_5.qwen2_5_0_5b
 
 # Tokenizer
 tokenizer:
   _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
-  path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
-  merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
+  path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json
+  merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt
   max_seq_len: null
 
+# Checkpointer
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct
+  checkpoint_files: [model.safetensors]
+  recipe_checkpoint: null
+  output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune
+  model_type: QWEN2
+resume_from_checkpoint: False
+
 # Dataset
 dataset:
   _component_: torchtune.datasets.alpaca_cleaned_dataset
   packed: False # True increases speed
 seed: null
 shuffle: True
 
-# Model Arguments
-model:
-  _component_: torchtune.models.qwen2_5.qwen2_5_0_5b
-
-checkpointer:
-  _component_: torchtune.training.FullModelHFCheckpointer
-  checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
-  checkpoint_files: [
-    model.safetensors
-  ]
-  recipe_checkpoint: null
-  output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
-  model_type: QWEN2
-resume_from_checkpoint: False
-
 # Fine-tuning arguments
-batch_size: 2
 epochs: 1
+max_steps_per_epoch: null
+batch_size: 2
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
 optimizer:
   _component_: torch.optim.AdamW
   fused: True
   lr: 2e-5
+optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-max_steps_per_epoch: null
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
-optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
 # Training env
 device: cuda
 
-# Memory management
-enable_activation_checkpointing: True # True reduces memory
+# Memory management / performance
+enable_activation_checkpointing: False # True reduces memory
 enable_activation_offloading: False # True reduces memory
-
-# Reduced precision
 dtype: bf16
+compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging
+output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: ${output_dir}
-output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Profiler (disabled)
 profiler:
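A note on the `_component_:` entries above: each block names a dotted import path plus the keyword arguments it is constructed with. The following is a minimal sketch of how such an entry can be resolved into a Python object, assuming omegaconf is installed and the working directory is the repo root; it is illustrative only, not torchtune's own config loader (the recipes use torchtune's built-in instantiation utilities).

import importlib

from omegaconf import OmegaConf

# Load the renamed config and resolve one `_component_:` entry by hand.
cfg = OmegaConf.load("recipes/configs/qwen2_5/0.5B_full.yaml")

def instantiate(node):
    """Import the dotted `_component_` path and call it with the remaining keys."""
    kwargs = OmegaConf.to_container(node, resolve=True)
    module_path, _, name = kwargs.pop("_component_").rpartition(".")
    factory = getattr(importlib.import_module(module_path), name)
    return factory(**kwargs)

model = instantiate(cfg.model)  # calls torchtune.models.qwen2_5.qwen2_5_0_5b()

The same pattern applies to the tokenizer, checkpointer, dataset, optimizer, and loss entries in these configs.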

recipes/configs/qwen2_5/0_5B_full_single_device.yaml renamed to recipes/configs/qwen2_5/0.5B_full_single_device.yaml
Lines changed: 32 additions & 43 deletions

@@ -1,86 +1,75 @@
 # Config for single device full finetuning in full_finetune_single_device.py
 # using a Qwen2.5 0.5B
 #
-# This config assumes that you've run the following command before launching
-# this run:
-# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
-#
-# The default config uses an optimizer from bitsandbytes. If you do not have it installed,
-# you can install it with
-# pip install bitsandbytes
+# This config assumes that you've run the following command before launching:
+# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None
 #
 # To launch on a single device, run the following command from root:
-# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device
+# tune run full_finetune_single_device --config qwen2_5/0.5B_full_single_device
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+# tune run full_finetune_single_device --config qwen2_5/0.5B_full_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
 # This config works only for training on single device.
 
+# Model arguments
+model:
+  _component_: torchtune.models.qwen2_5.qwen2_5_0_5b
+
 # Tokenizer
 tokenizer:
   _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
-  path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
-  merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
+  path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json
+  merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt
   max_seq_len: null
 
+# Checkpointer
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct
+  checkpoint_files: [model.safetensors]
+  recipe_checkpoint: null
+  output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune
+  model_type: QWEN2
+resume_from_checkpoint: False
+
 # Dataset
 dataset:
   _component_: torchtune.datasets.alpaca_cleaned_dataset
   packed: False # True increases speed
 seed: null
 shuffle: True
 
-# Model Arguments
-model:
-  _component_: torchtune.models.qwen2_5.qwen2_5_0_5b
-
-checkpointer:
-  _component_: torchtune.training.FullModelHFCheckpointer
-  checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
-  checkpoint_files: [
-    model.safetensors
-  ]
-  recipe_checkpoint: null
-  output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
-  model_type: QWEN2
-resume_from_checkpoint: False
-
 # Fine-tuning arguments
-batch_size: 2
 epochs: 1
+max_steps_per_epoch: null
+batch_size: 2
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
 optimizer:
   _component_: torch.optim.AdamW
   fused: True
   lr: 2e-5
-
+optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
-optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
-max_steps_per_epoch: null
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
-
-# Training environment
+# Training env
 device: cuda
 
-# Memory management
-enable_activation_checkpointing: True # True reduces memory
+# Memory management / performance
+enable_activation_checkpointing: False # True reduces memory
 enable_activation_offloading: False # True reduces memory
-
-# Reduced precision
 dtype: bf16
+compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging
+output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: ${output_dir}
-output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Profiler (disabled)
 profiler:
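Both full-finetune configs pair `batch_size: 2` with `gradient_accumulation_steps: 8`, i.e. an effective ("virtual") batch of 2 * 8 = 16 samples per optimizer step on each device. Below is a rough sketch of the accumulation loop those two settings imply; the names are illustrative, not the recipe's actual code. It also shows why `optimizer_in_bwd: True` requires `gradient_accumulation_steps: 1`: if the optimizer step is fused into the backward pass, gradients are consumed immediately and cannot be accumulated across micro-batches.

# Illustrative loop for batch_size=2, gradient_accumulation_steps=8
# (effective batch per device = 2 * 8 = 16).
GRAD_ACCUM_STEPS = 8

def train_one_epoch(model, loss_fn, optimizer, dataloader):
    optimizer.zero_grad()
    for step, (tokens, labels) in enumerate(dataloader, start=1):
        # Scale so the accumulated gradient averages over the virtual batch.
        loss = loss_fn(model(tokens), labels) / GRAD_ACCUM_STEPS
        loss.backward()                      # gradients accumulate in .grad buffers
        if step % GRAD_ACCUM_STEPS == 0:
            optimizer.step()                 # one update per 16 samples
            optimizer.zero_grad()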
Lines changed: 33 additions & 42 deletions

@@ -1,23 +1,19 @@
 # Config for multi-device LoRA finetuning in lora_finetune_distributed.py
 # using a Qwen2.5 0.5B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
+# This config assumes that you've run the following command before launching:
+# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None
 #
 # To launch on 2 devices, run the following command from root:
-# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora
+# tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0.5B_lora
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+# tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0.5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
-# This config works best when the model is being fine-tuned on 2+ GPUs.
-# For single device LoRA finetuning please use 0_5B_lora_single_device.yaml
+# This config is for fine-tuning on 2+ GPUs.
 
-
-# Model Arguments
+# Model arguments
 model:
   _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b
   lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
@@ -26,71 +22,66 @@ model:
   lora_alpha: 64 # usually alpha=2*rank
   lora_dropout: 0.0
 
+# Tokenizer
 tokenizer:
   _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
-  path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
-  merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
+  path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json
+  merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt
   max_seq_len: null
 
+# Checkpointer
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
-  checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
-  checkpoint_files: [
-    model.safetensors
-  ]
+  checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct
+  checkpoint_files: [model.safetensors]
   recipe_checkpoint: null
-  output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
+  output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune
   model_type: QWEN2
 resume_from_checkpoint: False
 
-# Dataset and Sampler
+# Dataset
 dataset:
   _component_: torchtune.datasets.alpaca_cleaned_dataset
   packed: False # True increases speed
-
 seed: null
 shuffle: True
-batch_size: 4
 
-# Optimizer and Scheduler
+# Fine-tuning arguments
+epochs: 1
+max_steps_per_epoch: null
+batch_size: 2
+gradient_accumulation_steps: 8 # Use to increase virtual batch size
 optimizer:
   _component_: torch.optim.AdamW
   fused: True
   weight_decay: 0.01
   lr: 2e-3
-
 lr_scheduler:
   _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
   num_warmup_steps: 100
-
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 
-# Training
-epochs: 1
-max_steps_per_epoch: null
-gradient_accumulation_steps: 8 # Use to increase virtual batch size
-compile: False # pytorch compile, set to true for better perf/memory
+# Training env
+device: cuda
+
+# Memory management / performance
+enable_activation_checkpointing: False # True reduces memory
+enable_activation_offloading: False # True reduces memory
+dtype: bf16
+compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging
-output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
+output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune
 metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
-  log_dir: ${output_dir}
+  log_dir: ${output_dir}/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
-# Environment
-device: cuda
-dtype: bf16
-enable_activation_checkpointing: True # True reduces memory
-enable_activation_offloading: False # True reduces memory
-
-# Show case the usage of pytorch profiler
-# Set enabled to False as it's only needed for debugging training
+# Profiler (disabled)
 profiler:
   _component_: torchtune.training.setup_torch_profiler
-
   enabled: False
 
   #Output directory of trace artifacts
@@ -109,6 +100,6 @@ profiler:
   # `torch.profiler.schedule` options:
   # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
   wait_steps: 5
-  warmup_steps: 5
+  warmup_steps: 3
   active_steps: 2
   num_cycles: 1
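The final hunk only drops `warmup_steps` from 5 to 3. As the in-config comment says, these keys map onto `torch.profiler.schedule` (wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat). A standalone sketch with the post-change values follows; it is independent of torchtune's `setup_torch_profiler`, and the dummy matmul is only a stand-in for a training step.

import torch
from torch.profiler import ProfilerActivity, profile, schedule

# wait=5, warmup=3, active=2, repeat=1 mirrors wait_steps/warmup_steps/active_steps/num_cycles.
with profile(
    activities=[ProfilerActivity.CPU],
    schedule=schedule(wait=5, warmup=3, active=2, repeat=1),
) as prof:
    for _ in range(10):  # 5 wait + 3 warmup + 2 active steps
        torch.matmul(torch.randn(128, 128), torch.randn(128, 128))
        prof.step()      # advance the profiler schedule once per step

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))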
