Commit 73d0821

Qwen2.5 (#1863)
1 parent e99b890 commit 73d0821

29 files changed, +3673 -241 lines changed
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Qwen2.5 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
#
# To launch on 2 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single device full finetuning requires more memory optimizations. It's
# best to use 0_5B_full_single_device.yaml for those cases

# Tokenizer
tokenizer:
  _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
  path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
  merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
  max_seq_len: null

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  packed: False
seed: null
shuffle: True

# Model Arguments
model:
  _component_: torchtune.models.qwen2_5.qwen2_5_0_5b

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
  checkpoint_files: [
    model.safetensors
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
  model_type: QWEN2
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  lr: 2e-5
loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 16
compile: False

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True

# Reduced precision
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
log_every_n_steps: 1
log_peak_memory_stats: False
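Any key in this config can be overridden from the command line with the same dotted syntax shown in the header comments. For example (illustrative values only, not tuned recommendations):

tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full optimizer.lr=1e-5 batch_size=4 gradient_accumulation_steps=8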
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
# Config for single device full finetuning in full_finetune_single_device.py
# using a Qwen2.5 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
#
# If you want to swap the default torch.optim.AdamW below for a lower-memory
# optimizer from bitsandbytes (e.g. bitsandbytes.optim.PagedAdamW8bit),
# you can install it with
# pip install bitsandbytes
#
# To launch on a single device, run the following command from root:
# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training
# you can run:
# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.

# Tokenizer
tokenizer:
  _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
  path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
  merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
  max_seq_len: null

# Dataset
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  packed: False
seed: null
shuffle: True

# Model Arguments
model:
  _component_: torchtune.models.qwen2_5.qwen2_5_0_5b

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
  checkpoint_files: [
    model.safetensors
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
  model_type: QWEN2
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 1
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  lr: 2e-5

loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
optimizer_in_bwd: False

max_steps_per_epoch: null
gradient_accumulation_steps: 8
compile: False

# Training environment
device: cuda

# Memory management
enable_activation_checkpointing: True

# Reduced precision
dtype: bf16

# Logging
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
log_every_n_steps: 1
log_peak_memory_stats: False
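One memory-saving knob in this config is optimizer_in_bwd, which fuses the optimizer step into the backward pass. A hedged example of enabling it from the command line (assuming, as the recipe expects, that gradient accumulation is disabled in this mode):

tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device optimizer_in_bwd=True gradient_accumulation_steps=1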
Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Qwen2.5 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
#
# To launch on 2 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device LoRA finetuning, please use 0_5B_lora_single_device.yaml

# Model Arguments
model:
  _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 32
  lora_alpha: 64
  lora_dropout: 0.0

tokenizer:
  _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
  path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
  merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
  max_seq_len: null

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
  checkpoint_files: [
    model.safetensors
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
  model_type: QWEN2
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  packed: False

seed: null
shuffle: True
batch_size: 4

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  weight_decay: 0.01
  lr: 2e-3

lr_scheduler:
  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 4
compile: False

# Logging
output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: True

# Showcase the usage of the PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
  _component_: torchtune.training.setup_torch_profiler

  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 5
  active_steps: 2
  num_cycles: 1
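The LoRA hyperparameters above (rank, alpha, target modules) are ordinary config keys, so they can be swept from the command line like any other value. An illustrative, not prescriptive, override:

tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora model.lora_rank=16 model.lora_alpha=32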
Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
# Config for single device LoRA finetuning in lora_finetune_single_device.py
# using a Qwen2.5 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
#
# To launch on a single device, run the following command from root:
# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training
# you can run:
# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.

# Model Arguments
model:
  _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b
  lora_attn_modules: ['q_proj', 'v_proj']
  apply_lora_to_mlp: False
  apply_lora_to_output: False
  lora_rank: 32
  lora_alpha: 64
  lora_dropout: 0.0

tokenizer:
  _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
  path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
  merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
  max_seq_len: null

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
  checkpoint_files: [
    model.safetensors
  ]
  recipe_checkpoint: null
  output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
  model_type: QWEN2
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  packed: False
seed: null
shuffle: True
batch_size: 4

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  weight_decay: 0.01
  lr: 2e-3

lr_scheduler:
  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 4
compile: False

# Logging
output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
metric_logger:
  _component_: torchtune.training.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False

# Environment
device: cuda
dtype: bf16

# Activations Offloading
enable_activation_checkpointing: True
enable_activation_offloading: False

# Showcase the usage of the PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  # Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  # `torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  # Trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 5
  active_steps: 2
  num_cycles: 1
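Activation offloading and the profiler are both disabled by default in this config; either can be toggled from the command line. An illustrative example (offloading trades GPU memory for host-transfer overhead, so treat it as a sketch to benchmark rather than a recommendation):

tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device enable_activation_offloading=True profiler.enabled=True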

0 commit comments