@@ -1,82 +1,75 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Qwen2.5 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
# This config assumes that you've run the following command before launching:
# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None
#
# To launch on 2 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full
# tune run --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0.5B_full
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
# to override the checkpointer directory while launching training:
# tune run --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0.5B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single device full finetuning requires more memory optimizations. It's
# best to use 0_5B_full_single_device.yaml for those cases
# This config is for fine-tuning on 2+ GPUs.

# Model arguments
model:
_component_: torchtune.models.qwen2_5.qwen2_5_0_5b

# Tokenizer
tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json
merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt
max_seq_len: null

# Checkpointer
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct
checkpoint_files: [model.safetensors]
recipe_checkpoint: null
output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune
model_type: QWEN2
resume_from_checkpoint: False

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset
packed: False # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.qwen2_5.qwen2_5_0_5b

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
checkpoint_files: [
model.safetensors
]
recipe_checkpoint: null
output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
model_type: QWEN2
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 8 # Use to increase virtual batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
lr: 2e-5
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True # True reduces memory
# Memory management / performance
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Logging
output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: False
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
@@ -1,86 +1,75 @@
# Config for single device full finetuning in full_finetune_single_device.py
# using a Qwen2.5 0.5B
#
# This config assumes that you've run the following command before launching
# this run:
# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
#
# The default config uses an optimizer from bitsandbytes. If you do not have it installed,
# you can install it with
# pip install bitsandbytes
# This config assumes that you've run the following command before launching:
# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None
#
# To launch on a single device, run the following command from root:
# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device
# tune run full_finetune_single_device --config qwen2_5/0.5B_full_single_device
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
# to override the checkpointer directory while launching training:
# tune run full_finetune_single_device --config qwen2_5/0.5B_full_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on single device.

# Model arguments
model:
_component_: torchtune.models.qwen2_5.qwen2_5_0_5b
ebsmothers (Contributor), Nov 13, 2024:
I understand that you're making the filename change 0_5B -> 0.5B for consistency with other configs, but honestly would prefer to just move everything to 0_5B so it matches the builders (doesn't have to be in this PR though)

felipemello1 (Contributor), Nov 13, 2024:
+1, i would prefer if we avoided using periods for names, and only use them when they are a path or file type

felipemello1 (Contributor), Nov 13, 2024:
for llama3.2 we added 3.2 to the path like you did, but not the components
[image]

Member Author:
This is the same thing we have here, no?

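For readers skimming the thread: the llama3.2 precedent being referenced keeps the dotted version in the download/checkpoint paths while the builder components keep underscores, which is the same split this PR applies to Qwen2.5. A minimal sketch of that convention, assuming typical torchtune paths (the llama3_2 fragment below is illustrative, not a line from this diff):

```yaml
# Illustrative llama3.2 fragment (paths assumed) showing the convention
# the Qwen2.5 changes above mirror:
model:
  _component_: torchtune.models.llama3_2.llama3_2_1b   # underscores in the builder name
checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: /tmp/Llama-3.2-1B-Instruct            # dotted version in the path
```
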
# Tokenizer
tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json
merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt
max_seq_len: null

# Checkpointer
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct
checkpoint_files: [model.safetensors]
recipe_checkpoint: null
output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune
model_type: QWEN2
resume_from_checkpoint: False

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset
packed: False # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.qwen2_5.qwen2_5_0_5b

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
checkpoint_files: [
model.safetensors
]
recipe_checkpoint: null
output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
model_type: QWEN2
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 8 # Use to increase virtual batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
lr: 2e-5

optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory

# Training environment
# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True # True reduces memory
# Memory management / performance
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory

# Reduced precision
dtype: bf16
compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Logging
output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: False
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
@@ -1,23 +1,19 @@
# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Qwen2.5 0.5B model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None
# This config assumes that you've run the following command before launching:
# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None
#
# To launch on 2 devices, run the following command from root:
# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora
# tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0.5B_lora
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
# to override the checkpointer directory while launching training:
# tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0.5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device LoRA finetuning please use 0_5B_lora_single_device.yaml
# This config is for fine-tuning on 2+ GPUs.


# Model Arguments
# Model arguments
model:
_component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
@@ -26,71 +22,66 @@ model:
lora_alpha: 64 # usually alpha=2*rank
lora_dropout: 0.0

# Tokenizer
tokenizer:
_component_: torchtune.models.qwen2_5.qwen2_5_tokenizer
path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json
merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt
path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json
merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt
max_seq_len: null

# Checkpointer
checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct
checkpoint_files: [
model.safetensors
]
checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct
checkpoint_files: [model.safetensors]
recipe_checkpoint: null
output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune
model_type: QWEN2
resume_from_checkpoint: False

# Dataset and Sampler
# Dataset
dataset:
_component_: torchtune.datasets.alpaca_cleaned_dataset
packed: False # True increases speed

seed: null
shuffle: True
batch_size: 4

# Optimizer and Scheduler
# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 8 # Use to increase virtual batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
weight_decay: 0.01
lr: 2e-3

lr_scheduler:
_component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
num_warmup_steps: 100

loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
# Training env
device: cuda

# Memory management / performance
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory
dtype: bf16
compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Logging
output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune
output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: False
log_peak_memory_stats: True

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory

# Show case the usage of pytorch profiler
# Set enabled to False as it's only needed for debugging training
# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler

enabled: False

#Output directory of trace artifacts
@@ -109,6 +100,6 @@ profiler:
# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 5
warmup_steps: 5
warmup_steps: 3
active_steps: 2
num_cycles: 1
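
One practical note that applies to all three configs above: the "virtual batch size" mentioned next to gradient_accumulation_steps works out to the per-device batch size times the accumulation steps, times the device count in the distributed recipes. A rough worked example using the values in these configs (standard data-parallel accounting; device counts taken from the launch commands in the headers):

```yaml
# Effective samples per optimizer step, assuming plain data parallelism:
#   single-device full finetune:  2 (batch_size) * 8 (grad accum)          = 16
#   2-GPU full finetune:          2 (batch_size) * 8 (grad accum) * 2 GPUs = 32
#   2-GPU LoRA finetune:          2 (batch_size) * 8 (grad accum) * 2 GPUs = 32
```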