diff --git a/recipes/configs/qwen2_5/0_5B_full.yaml b/recipes/configs/qwen2_5/0_5B_full.yaml new file mode 100644 index 0000000000..341c054991 --- /dev/null +++ b/recipes/configs/qwen2_5/0_5B_full.yaml @@ -0,0 +1,77 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2.5 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 0_5B_full_single_device.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_0_5b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-5 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +compile: False + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/0_5B_full_single_device.yaml b/recipes/configs/qwen2_5/0_5B_full_single_device.yaml new file mode 100644 index 0000000000..58059e06a9 --- /dev/null +++ b/recipes/configs/qwen2_5/0_5B_full_single_device.yaml @@ -0,0 +1,82 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2.5 0.5B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_0_5b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-5 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +optimizer_in_bwd: False + +max_steps_per_epoch: null +gradient_accumulation_steps: 8 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/0_5B_lora.yaml b/recipes/configs/qwen2_5/0_5B_lora.yaml new file mode 100644 index 0000000000..c6a4af1ee4 --- /dev/null +++ b/recipes/configs/qwen2_5/0_5B_lora.yaml @@ -0,0 +1,114 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 0_5B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 32 + lora_alpha: 64 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False + +seed: null +shuffle: True +batch_size: 4 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 2e-3 + +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 4 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/0_5B_lora_single_device.yaml b/recipes/configs/qwen2_5/0_5B_lora_single_device.yaml new file mode 100644 index 0000000000..2d9c089774 --- /dev/null +++ b/recipes/configs/qwen2_5/0_5B_lora_single_device.yaml @@ -0,0 +1,114 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
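+#
+# As an illustrative sketch only (the keys referenced here are defined further
+# down in this file; the values are not a recommendation), the LoRA size and
+# learning rate can likewise be overridden from the command line:
+# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device model.lora_rank=16 model.lora_alpha=32 optimizer.lr=1e-3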
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 32 + lora_alpha: 64 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 4 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 2e-3 + +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 4 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/14B_lora_single_device.yaml b/recipes/configs/qwen2_5/14B_lora_single_device.yaml new file mode 100644 index 0000000000..d89710d1a6 --- /dev/null +++ b/recipes/configs/qwen2_5/14B_lora_single_device.yaml @@ -0,0 +1,120 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 14B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-14B-Instruct --output-dir /tmp/Qwen2_5-14B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/14B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/14B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
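+#
+# If memory is tight on a single GPU, two knobs defined later in this file can
+# be flipped from the command line (a hedged example; values are illustrative):
+# tune run lora_finetune_single_device --config qwen2_5/14B_lora_single_device enable_activation_offloading=True tokenizer.max_seq_len=2048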
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_14b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-14B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-14B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-14B-Instruct + checkpoint_files: [ + model-00001-of-00008.safetensors, + model-00002-of-00008.safetensors, + model-00003-of-00008.safetensors, + model-00004-of-00008.safetensors, + model-00005-of-00008.safetensors, + model-00006-of-00008.safetensors, + model-00007-of-00008.safetensors, + model-00008-of-00008.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-14B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-14B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/1_5B_full.yaml b/recipes/configs/qwen2_5/1_5B_full.yaml new file mode 100644 index 0000000000..9456200422 --- /dev/null +++ b/recipes/configs/qwen2_5/1_5B_full.yaml @@ -0,0 +1,77 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2.5 1.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/1_5B_full +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/1_5B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 1_5B_full_single_device.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_1_5b_instruct + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 3 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-5 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: False + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/1_5B_full_single_device.yaml b/recipes/configs/qwen2_5/1_5B_full_single_device.yaml new file mode 100644 index 0000000000..6a78521c80 --- /dev/null +++ b/recipes/configs/qwen2_5/1_5B_full_single_device.yaml @@ -0,0 +1,82 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2.5 1.5B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2_5/1_5B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2_5/1_5B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
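+#
+# If you prefer not to depend on bitsandbytes, the optimizer component can be
+# swapped from the command line (a sketch, not a tuned recommendation):
+# tune run full_finetune_single_device --config qwen2_5/1_5B_full_single_device optimizer._component_=torch.optim.AdamW optimizer.lr=2e-5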
+ +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_1_5b_instruct + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 2e-5 + +optimizer_in_bwd: True + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/1_5B_lora.yaml b/recipes/configs/qwen2_5/1_5B_lora.yaml new file mode 100644 index 0000000000..9e3cfad1b6 --- /dev/null +++ b/recipes/configs/qwen2_5/1_5B_lora.yaml @@ -0,0 +1,112 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 1.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/1_5B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/1_5B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 1_5B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_1_5b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 32 + lora_alpha: 64 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-5 + +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/1_5B_lora_single_device.yaml b/recipes/configs/qwen2_5/1_5B_lora_single_device.yaml new file mode 100644 index 0000000000..f35989fa4f --- /dev/null +++ b/recipes/configs/qwen2_5/1_5B_lora_single_device.yaml @@ -0,0 +1,113 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 1.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/1_5B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/1_5B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
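+#
+# The dataset is also a swappable component; as an illustration, the non-cleaned
+# Alpaca variant used elsewhere in these recipes can be selected per run:
+# tune run lora_finetune_single_device --config qwen2_5/1_5B_lora_single_device dataset._component_=torchtune.datasets.alpaca_dataset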
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_1_5b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 32 + lora_alpha: 64 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-3 + +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/32B_lora.yaml b/recipes/configs/qwen2_5/32B_lora.yaml new file mode 100644 index 0000000000..19a9356c27 --- /dev/null +++ b/recipes/configs/qwen2_5/32B_lora.yaml @@ -0,0 +1,125 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 32B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-32B-Instruct --output-dir /tmp/Qwen2_5-32B-Instruct --ignore-patterns None +# +# To launch on 8 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/32B_lora +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/32B_lora checkpointer.checkpoint_dir= + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_32b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-32B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-32B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-32B-Instruct + checkpoint_files: [ + model-00001-of-00017.safetensors, + model-00002-of-00017.safetensors, + model-00003-of-00017.safetensors, + model-00004-of-00017.safetensors, + model-00005-of-00017.safetensors, + model-00006-of-00017.safetensors, + model-00007-of-00017.safetensors, + model-00008-of-00017.safetensors, + model-00009-of-00017.safetensors, + model-00010-of-00017.safetensors, + model-00011-of-00017.safetensors, + model-00012-of-00017.safetensors, + model-00013-of-00017.safetensors, + model-00014-of-00017.safetensors, + model-00015-of-00017.safetensors, + model-00016-of-00017.safetensors, + model-00017-of-00017.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-32B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-32B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/3B_full.yaml b/recipes/configs/qwen2_5/3B_full.yaml new file mode 100644 index 0000000000..79343ca457 --- /dev/null +++ b/recipes/configs/qwen2_5/3B_full.yaml @@ -0,0 +1,78 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2.5 3B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download 
Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/3B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/3B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 3B_full_single_device.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-3B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_3b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-3B-Instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-3B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +compile: False + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-3B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/3B_full_single_device.yaml b/recipes/configs/qwen2_5/3B_full_single_device.yaml new file mode 100644 index 0000000000..09494d6c28 --- /dev/null +++ b/recipes/configs/qwen2_5/3B_full_single_device.yaml @@ -0,0 +1,80 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2.5 3B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2_5/3B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2_5/3B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
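+#
+# Metric logging defaults to DiskLogger below; assuming wandb is installed, an
+# illustrative override (the project name is arbitrary) switches it to W&B:
+# tune run full_finetune_single_device --config qwen2_5/3B_full_single_device metric_logger._component_=torchtune.training.metric_logging.WandBLogger metric_logger.project=qwen2_5_3b_full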
+ +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-3B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_3b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-3B-Instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-3B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 5e-6 +optimizer_in_bwd: True +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-3B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/3B_lora.yaml b/recipes/configs/qwen2_5/3B_lora.yaml new file mode 100644 index 0000000000..b987330a6d --- /dev/null +++ b/recipes/configs/qwen2_5/3B_lora.yaml @@ -0,0 +1,113 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 3B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/3B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/3B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 3B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_3b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-3B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-3B-Instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-3B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-3B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/3B_lora_single_device.yaml b/recipes/configs/qwen2_5/3B_lora_single_device.yaml new file mode 100644 index 0000000000..8caf08d063 --- /dev/null +++ b/recipes/configs/qwen2_5/3B_lora_single_device.yaml @@ -0,0 +1,114 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 3B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/3B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/3B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
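+#
+# The profiler section at the bottom of this file ships disabled; for a one-off
+# debugging run it can be enabled without editing the file (values illustrative):
+# tune run lora_finetune_single_device --config qwen2_5/3B_lora_single_device profiler.enabled=True profiler.profile_memory=True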
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_3b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-3B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-3B-Instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-3B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-3B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/72B_lora.yaml b/recipes/configs/qwen2_5/72B_lora.yaml new file mode 100644 index 0000000000..906e52dfde --- /dev/null +++ b/recipes/configs/qwen2_5/72B_lora.yaml @@ -0,0 +1,145 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 72B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-72B-Instruct --output-dir /tmp/Qwen2_5-72B-Instruct --ignore-patterns None +# +# To launch on 8 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/72B_lora +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/72B_lora checkpointer.checkpoint_dir= + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_72b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-72B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-72B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-72B-Instruct + checkpoint_files: [ + model-00001-of-00037.safetensors, + model-00002-of-00037.safetensors, + model-00003-of-00037.safetensors, + model-00004-of-00037.safetensors, + model-00005-of-00037.safetensors, + model-00006-of-00037.safetensors, + model-00007-of-00037.safetensors, + model-00008-of-00037.safetensors, + model-00009-of-00037.safetensors, + model-00010-of-00037.safetensors, + model-00011-of-00037.safetensors, + model-00012-of-00037.safetensors, + model-00013-of-00037.safetensors, + model-00014-of-00037.safetensors, + model-00015-of-00037.safetensors, + model-00016-of-00037.safetensors, + model-00017-of-00037.safetensors, + model-00018-of-00037.safetensors, + model-00019-of-00037.safetensors, + model-00020-of-00037.safetensors, + model-00021-of-00037.safetensors, + model-00022-of-00037.safetensors, + model-00023-of-00037.safetensors, + model-00024-of-00037.safetensors, + model-00025-of-00037.safetensors, + model-00026-of-00037.safetensors, + model-00027-of-00037.safetensors, + model-00028-of-00037.safetensors, + model-00029-of-00037.safetensors, + model-00030-of-00037.safetensors, + model-00031-of-00037.safetensors, + model-00032-of-00037.safetensors, + model-00033-of-00037.safetensors, + model-00034-of-00037.safetensors, + model-00035-of-00037.safetensors, + model-00036-of-00037.safetensors, + model-00037-of-00037.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-72B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-72B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: 
False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/7B_full.yaml b/recipes/configs/qwen2_5/7B_full.yaml new file mode 100644 index 0000000000..78313ca921 --- /dev/null +++ b/recipes/configs/qwen2_5/7B_full.yaml @@ -0,0 +1,80 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2.5 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/7B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/7B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 7B_full_single_device.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_7b_instruct + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-7B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +compile: False + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-7B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/7B_full_single_device.yaml b/recipes/configs/qwen2_5/7B_full_single_device.yaml new file mode 100644 index 0000000000..c4f464e97e --- /dev/null +++ b/recipes/configs/qwen2_5/7B_full_single_device.yaml @@ -0,0 +1,82 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2.5 7B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. 
If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2_5/7B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2_5/7B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_7b_instruct + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-7B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 5e-6 +optimizer_in_bwd: True +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-7B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/7B_lora.yaml b/recipes/configs/qwen2_5/7B_lora.yaml new file mode 100644 index 0000000000..61365316be --- /dev/null +++ b/recipes/configs/qwen2_5/7B_lora.yaml @@ -0,0 +1,115 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/7B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/7B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 7B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_7b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-7B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-7B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/7B_lora_single_device.yaml b/recipes/configs/qwen2_5/7B_lora_single_device.yaml new file mode 100644 index 0000000000..53949bc307 --- /dev/null +++ b/recipes/configs/qwen2_5/7B_lora_single_device.yaml @@ -0,0 +1,116 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/7B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/7B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
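+#
+# To give the adapter more capacity, LoRA can be applied to additional attention
+# projections and the MLP (treat this combination as an illustrative sketch, and
+# note the list override is quoted for the shell):
+# tune run lora_finetune_single_device --config qwen2_5/7B_lora_single_device model.lora_attn_modules="[q_proj,k_proj,v_proj,output_proj]" model.apply_lora_to_mlp=True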
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_7b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-7B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-7B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py index b11d6679e1..8304a7c1bf 100644 --- a/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py +++ b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py @@ -14,13 +14,12 @@ from torchtune.models.qwen2 import qwen2_tokenizer -class TestQwen2Tokenizer: - def tokenizer(self, template: bool = False, max_seq_len: Optional[int] = None): +class TestQwenTokenizer: + def tokenizer(self, max_seq_len: Optional[int] = None): return qwen2_tokenizer( path=str(ASSETS / "tiny_bpe_vocab.json"), merges_file=str(ASSETS / "tiny_bpe_merges.txt"), special_tokens_path=str(ASSETS / "tiny_bpe_tokenizer.json"), - prompt_template="torchtune.data.ChatMLTemplate" if template else None, max_seq_len=max_seq_len, ) @@ -45,8 +44,8 @@ def messages(self): ), ] - def test_tokenize_messages_chat_template(self, messages): - tokenizer = self.tokenizer(template=True) + def test_tokenize_messages(self, messages): + tokenizer = self.tokenizer() tokens, mask = 
tokenizer.tokenize_messages(messages) expected_tokens = [ 2001, @@ -236,218 +235,33 @@ def test_tokenize_messages_chat_template(self, messages): 318, 1278, 13, - 2002, - 94, 2000, ] - expected_mask = [True] * 67 + [False] * 123 + expected_mask = [True] * 67 + [False] * 121 assert expected_tokens == tokens assert expected_mask == mask formatted_messages = tokenizer.decode(tokens) expected_formatted_messages = ( f"<|im_start|>user\n{messages[0].text_content}<|im_end|>\n" - f"<|im_start|>assistant\n{messages[1].text_content}<|im_end|>\n" - "<|endoftext|>" + f"<|im_start|>assistant\n{messages[1].text_content}<|endoftext|>" ) assert expected_formatted_messages == formatted_messages - def test_tokenize_messages(self, messages): - tokenizer = self.tokenizer(template=False) - tokens, mask = tokenizer.tokenize_messages(messages) - expected_tokens = [ - 33, - 214, - 174, - 156, - 194, - 130, - 197, - 184, - 446, - 789, - 113, - 98, - 1914, - 13, - 346, - 788, - 98, - 706, - 102, - 182, - 184, - 1916, - 176, - 762, - 83, - 113, - 103, - 874, - 269, - 13, - 94, - 94, - 2, - 2, - 2, - 483, - 197, - 25, - 94, - 885, - 98, - 1226, - 1960, - 348, - 114, - 1123, - 399, - 1583, - 78, - 13, - 94, - 94, - 2, - 2, - 2, - 360, - 1733, - 102, - 182, - 25, - 94, - 40, - 1791, - 194, - 453, - 70, - 78, - 114, - 120, - 967, - 176, - 618, - 628, - 1275, - 794, - 294, - 1095, - 445, - 212, - 1356, - 120, - 1299, - 13, - 223, - 1791, - 451, - 98, - 127, - 181, - 1047, - 375, - 915, - 380, - 120, - 1448, - 1732, - 114, - 453, - 447, - 1219, - 64, - 187, - 921, - 120, - 742, - 107, - 84, - 122, - 893, - 13, - 223, - 1791, - 98, - 127, - 181, - 123, - 124, - 131, - 103, - 744, - 82, - 120, - 1506, - 416, - 114, - 128, - 1429, - 182, - 253, - 82, - 120, - 163, - 330, - 105, - 262, - 13, - 223, - 1791, - 155, - 1551, - 171, - 1951, - 628, - 296, - 64, - 237, - 886, - 1390, - 130, - 883, - 1678, - 447, - 306, - 279, - 113, - 11, - 215, - 785, - 215, - 1951, - 628, - 378, - 101, - 66, - 72, - 593, - 98, - 984, - 208, - 1580, - 167, - 510, - 737, - 318, - 1278, - 13, - 2000, - ] - expected_mask = [True] * 61 + [False] * 116 - assert expected_tokens == tokens - assert expected_mask == mask - def test_tokenize_messages_gt_max_seq_len(self, messages): # Super basic test to make sure max_seq_len is working properly - tokenizer = self.tokenizer(template=False, max_seq_len=10) + tokenizer = self.tokenizer(max_seq_len=10) tokens, mask = tokenizer.tokenize_messages(messages) assert len(tokens) == 10 assert len(mask) == 10 def test_tokenize_message_drop_eos(self, messages): - tokenizer = self.tokenizer(template=False) + tokenizer = self.tokenizer() expected_tokens = [ + 2001, + 273, + 105, + 94, 33, 214, 174, @@ -509,6 +323,13 @@ def test_tokenize_message_drop_eos(self, messages): 182, 25, 94, + 2002, + 94, + 2001, + 397, + 251, + 249, + 94, 40, 1791, 194, @@ -624,14 +445,8 @@ def test_tokenize_message_drop_eos(self, messages): 318, 1278, 13, - 2000, ] - - # Remove the EOS token - expected_tokens = expected_tokens[:-1] - # On 1 less then with eos - expected_mask = [True] * 61 + [False] * 115 - + expected_mask = [True] * 67 + [False] * 120 tokens, mask = tokenizer.tokenize_messages(messages, add_eos=False) assert tokens == expected_tokens assert mask == expected_mask diff --git a/tests/torchtune/models/qwen2_5/__init__.py b/tests/torchtune/models/qwen2_5/__init__.py new file mode 100644 index 0000000000..2e41cd717f --- /dev/null +++ b/tests/torchtune/models/qwen2_5/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/tests/torchtune/models/qwen2_5/test_tokenizer.py b/tests/torchtune/models/qwen2_5/test_tokenizer.py new file mode 100644 index 0000000000..332fef2b92 --- /dev/null +++ b/tests/torchtune/models/qwen2_5/test_tokenizer.py @@ -0,0 +1,195 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from tests.common import ASSETS + +from torchtune.data import Message +from torchtune.models.qwen2_5 import qwen2_5_tokenizer + + +class TestQwen2_5Tokenizer: # noqa: N801 + def tokenizer(self): + return qwen2_5_tokenizer( + path=str(ASSETS / "tiny_bpe_vocab.json"), + merges_file=str(ASSETS / "tiny_bpe_merges.txt"), + ) + + def test_tokenize_messages(self): + tokenizer = self.tokenizer() + messages = [ + Message(role="system", content="You are a helpful assistant."), + Message(role="user", content="Give me a short introduction to LLMs."), + Message(role="assistant", content=""), + ] + expected_tokens = [ + 151644, + 82, + 88, + 479, + 94, + 56, + 119, + 230, + 98, + 374, + 494, + 1318, + 249, + 13, + 151645, + 94, + 151644, + 273, + 105, + 94, + 38, + 229, + 362, + 98, + 1695, + 310, + 1305, + 165, + 128, + 432, + 43, + 44, + 82, + 13, + 151645, + 94, + 151644, + 397, + 251, + 249, + 94, + 151643, + ] + expected_formatted_messages = ( + "<|im_start|>system\n" + "You are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + "Give me a short introduction to LLMs.<|im_end|>\n" + "<|im_start|>assistant\n" + "<|endoftext|>" + ) + _test_tokenize_messages( + tokenizer, + messages, + expected_tokens, + expected_formatted_messages, + ) + + def test_tool_call(self): + tokenizer = self.tokenizer() + messages = [ + Message(role="system", content="a"), + Message(role="user", content="b"), + Message(role="assistant", content="test call", ipython=True), + Message(role="ipython", content="test response"), + Message(role="assistant", content=""), + ] + expected_tokens = [ + 151644, + 82, + 88, + 479, + 94, + 64, + 151645, + 94, + 151644, + 273, + 105, + 94, + 65, + 151645, + 94, + 151644, + 397, + 251, + 249, + 94, + 151657, + 94, + 83, + 269, + 107, + 330, + 94, + 151658, + 151645, + 94, + 151644, + 273, + 105, + 94, + 27, + 83, + 1364, + 62, + 237, + 79, + 102, + 182, + 29, + 94, + 83, + 269, + 706, + 102, + 182, + 94, + 1932, + 83, + 1364, + 62, + 237, + 79, + 102, + 182, + 29, + 151645, + 94, + 151644, + 397, + 251, + 249, + 94, + 151643, + ] + expected_formatted_messages = ( + "<|im_start|>system\n" + "a<|im_end|>\n" + "<|im_start|>user\n" + "b<|im_end|>\n" + "<|im_start|>assistant\n" + "\n" + "test call\n" + "<|im_end|>\n" + "<|im_start|>user\n" + "\n" + "test response\n" + "<|im_end|>\n" + "<|im_start|>assistant\n" + "<|endoftext|>" + ) + _test_tokenize_messages( + tokenizer, + messages, + expected_tokens, + expected_formatted_messages, + ) + + +def _test_tokenize_messages( + tokenizer, messages, expected_tokens, expected_formatted_messages +): + tokens, mask = tokenizer.tokenize_messages(messages) + assert len(tokens) == len(mask) + assert expected_tokens == tokens + formatted_messages = tokenizer.decode(tokens) + assert expected_formatted_messages == formatted_messages diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 
7bed74a6e7..cdb1d45f01 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -71,6 +71,22 @@ class Recipe: name="qwen2/1.5B_full_single_device", file_path="qwen2/1.5B_full_single_device.yaml", ), + Config( + name="qwen2_5/0_5B_full_single_device", + file_path="qwen2_5/0_5B_full_single_device.yaml", + ), + Config( + name="qwen2_5/1_5B_full_single_device", + file_path="qwen2_5/1_5B_full_single_device.yaml", + ), + Config( + name="qwen2_5/3B_full_single_device", + file_path="qwen2_5/3B_full_single_device.yaml", + ), + Config( + name="qwen2_5/7B_full_single_device", + file_path="qwen2_5/7B_full_single_device.yaml", + ), Config( name="llama3_2_vision/11B_full_single_device", file_path="llama3_2_vision/11B_full_single_device.yaml", @@ -97,6 +113,10 @@ class Recipe: Config(name="qwen2/7B_full", file_path="qwen2/7B_full.yaml"), Config(name="qwen2/0.5B_full", file_path="qwen2/0.5B_full.yaml"), Config(name="qwen2/1.5B_full", file_path="qwen2/1.5B_full.yaml"), + Config(name="qwen2_5/0_5B_full", file_path="qwen2_5/0_5B_full.yaml"), + Config(name="qwen2_5/1_5B_full", file_path="qwen2_5/1_5B_full.yaml"), + Config(name="qwen2_5/3B_full", file_path="qwen2_5/3B_full.yaml"), + Config(name="qwen2_5/7B_full", file_path="qwen2_5/7B_full.yaml"), Config( name="llama3_2_vision/11B_full", file_path="llama3_2_vision/11B_full.yaml", @@ -216,6 +236,26 @@ class Recipe: name="qwen2/1.5B_lora_single_device", file_path="qwen2/1.5B_lora_single_device.yaml", ), + Config( + name="qwen2_5/0_5B_lora_single_device", + file_path="qwen2_5/0_5B_lora_single_device.yaml", + ), + Config( + name="qwen2_5/1_5B_lora_single_device", + file_path="qwen2_5/1_5B_lora_single_device.yaml", + ), + Config( + name="qwen2_5/3B_lora_single_device", + file_path="qwen2_5/3B_lora_single_device.yaml", + ), + Config( + name="qwen2_5/7B_lora_single_device", + file_path="qwen2_5/7B_lora_single_device.yaml", + ), + Config( + name="qwen2_5/14B_lora_single_device", + file_path="qwen2_5/14B_lora_single_device.yaml", + ), Config( name="llama3_2_vision/11B_lora_single_device", file_path="llama3_2_vision/11B_lora_single_device.yaml", @@ -293,6 +333,12 @@ class Recipe: Config(name="qwen2/7B_lora", file_path="qwen2/7B_lora.yaml"), Config(name="qwen2/0.5B_lora", file_path="qwen2/0.5B_lora.yaml"), Config(name="qwen2/1.5B_lora", file_path="qwen2/1.5B_lora.yaml"), + Config(name="qwen2_5/0_5B_lora", file_path="qwen2_5/0_5B_lora.yaml"), + Config(name="qwen2_5/1_5B_lora", file_path="qwen2_5/1_5B_lora.yaml"), + Config(name="qwen2_5/3B_lora", file_path="qwen2_5/3B_lora.yaml"), + Config(name="qwen2_5/7B_lora", file_path="qwen2_5/7B_lora.yaml"), + Config(name="qwen2_5/32B_lora", file_path="qwen2_5/32B_lora.yaml"), + Config(name="qwen2_5/72B_lora", file_path="qwen2_5/72B_lora.yaml"), Config( name="llama3_2_vision/11B_lora", file_path="llama3_2_vision/11B_lora.yaml", diff --git a/torchtune/models/qwen2/__init__.py b/torchtune/models/qwen2/__init__.py index a2f36a3661..8e04fba85d 100644 --- a/torchtune/models/qwen2/__init__.py +++ b/torchtune/models/qwen2/__init__.py @@ -19,17 +19,17 @@ from ._tokenizer import Qwen2Tokenizer __all__ = [ - "qwen2_7b", - "qwen2_0_5b", - "qwen2_1_5b", - "qwen2_tokenizer", - "lora_qwen2_7b", - "lora_qwen2_0_5b", - "lora_qwen2_1_5b", - "qwen2", "lora_qwen2", + "qwen2", "qwen2_hf_to_tune", "qwen2_tune_to_hf", + "lora_qwen2_0_5b", + "lora_qwen2_1_5b", + "lora_qwen2_7b", + "qwen2_0_5b", + "qwen2_1_5b", + "qwen2_7b", + "qwen2_tokenizer", "Qwen2RotaryPositionalEmbeddings", "Qwen2Tokenizer", ] diff --git 
a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index 8fef948643..2a0ee06f83 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -5,13 +5,13 @@ # LICENSE file in the root directory of this source tree. from typing import List, Optional -from torchtune.models.qwen2._component_builders import qwen2, lora_qwen2 -from torchtune.models.qwen2._tokenizer import Qwen2Tokenizer +from torchtune.data._prompt_templates import _get_prompt_template, _TemplateType + +from torchtune.models.qwen2._component_builders import lora_qwen2, qwen2 +from torchtune.models.qwen2._tokenizer import QWEN2_SPECIAL_TOKENS, Qwen2Tokenizer from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES from torchtune.modules.tokenizers import parse_hf_tokenizer_json -from torchtune.data._prompt_templates import _TemplateType -from torchtune.data._prompt_templates import _get_prompt_template """ Model builders build specific instantiations using component builders. For example @@ -101,7 +101,7 @@ def qwen2_tokenizer( merges_file: str = None, special_tokens_path: Optional[str] = None, max_seq_len: Optional[int] = None, - prompt_template: Optional[_TemplateType] = "torchtune.data.ChatMLTemplate", + prompt_template: Optional[_TemplateType] = None, **kwargs, ) -> Qwen2Tokenizer: """ @@ -118,14 +118,27 @@ def qwen2_tokenizer( prompt_template (Optional[_TemplateType]): optional specified prompt template. If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface` class. If a dictionary, it is assumed to be a custom prompt template mapping role to the - prepend/append tags. Default is :class:`~torchtune.models.llama2.Llama2ChatTemplate`. + prepend/append tags. Default is None. Returns: Qwen2Tokenizer: Instantiation of the Qwen2 tokenizer """ - special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None - template = _get_prompt_template(prompt_template) if prompt_template is not None else None - return Qwen2Tokenizer(path=path, merges_file=merges_file, special_tokens=special_tokens, max_seq_len=max_seq_len, prompt_template=template, **kwargs) + special_tokens = ( + parse_hf_tokenizer_json(special_tokens_path) + if special_tokens_path is not None + else QWEN2_SPECIAL_TOKENS + ) + template = ( + _get_prompt_template(prompt_template) if prompt_template is not None else None + ) + return Qwen2Tokenizer( + path=path, + merges_file=merges_file, + special_tokens=special_tokens, + max_seq_len=max_seq_len, + prompt_template=template, + **kwargs, + ) def lora_qwen2_7b( diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py index 952100002e..0e4ee6bd35 100644 --- a/torchtune/models/qwen2/_tokenizer.py +++ b/torchtune/models/qwen2/_tokenizer.py @@ -24,12 +24,11 @@ "<|im_end|>": 151645, } - ENDOFTEXT = "<|endoftext|>" IM_START = "<|im_start|>" IM_END = "<|im_end|>" -DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE = 151646 +DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE = 152064 @lru_cache() @@ -83,7 +82,7 @@ class Qwen2Tokenizer(ModelTokenizer): merges_file (str): Path to merges.txt file. merges.txt contains all BPE merge operations, and this file is required to split a single word into byte-level BPE tokens. - special_tokens (Optional[Dict[str, int]]): Special tokens to add to the tokenizer. Default is None. + special_tokens (Dict[str, int]): Special tokens to add to the tokenizer. 
Default is QWEN2_SPECIAL_TOKENS. max_seq_len (Optional[int]): A max sequence length to truncate tokens to. Default: None prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used @@ -95,7 +94,7 @@ class Qwen2Tokenizer(ModelTokenizer): - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate` The extra text will still get tokenized as normal text, not as special tokens. - Default is :class:`~torchtune.data.ChatMLTemplate`. + Default: None errors (str): Paradigm to follow when decoding bytes to UTF-8. Defaults to "replace". See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. unk_token (Optional[str]): The unknown token. A token that is not in the vocabulary cannot be converted @@ -110,7 +109,8 @@ class Qwen2Tokenizer(ModelTokenizer): By default, we set the cache size equals to size of the official Qwen2 tokenizer. Example: - >>> tokenizer = Qwen2Tokenizer(path="/path/to/vocab.json", merges_file="/path/to/merges.txt") + >>> tokenizer = Qwen2Tokenizer( + path="/path/to/vocab.json", merges_file="/path/to/merges.txt", special_tokens=QWEN2_SPECIAL_TOKENS) >>> tokenized_text = tokenizer.encode("Hello world!") >>> print(tokenized_text) [39, 385, 78, 675, 0, 2000] @@ -120,10 +120,10 @@ def __init__( self, path: str, merges_file: str, - special_tokens: Optional[Dict[str, int]] = None, + special_tokens: Dict[str, int] = QWEN2_SPECIAL_TOKENS, max_seq_len: Optional[int] = None, *, - prompt_template: Optional[PromptTemplate] = ChatMLTemplate(), + prompt_template: Optional[PromptTemplate] = None, errors: str = "replace", unk_token: Optional[str] = ENDOFTEXT, bos_token: Optional[str] = None, @@ -151,9 +151,7 @@ def __init__( self.pat = re.compile(PRETOKENIZE_REGEX) - self.special_tokens = ( - special_tokens if special_tokens is not None else QWEN2_SPECIAL_TOKENS - ) + self.special_tokens = special_tokens self._special_tokens_reversed = {v: k for k, v in self.special_tokens.items()} self.unk_id = None if unk_token is None else self.special_tokens[unk_token] @@ -345,6 +343,10 @@ def tokenize_messages( Raises: RuntimeError: If a message contains non-text content """ + assert not isinstance(self.prompt_template, ChatMLTemplate), ( + "Using ChatMLTemplate with tokenize_messages will result in multiple <|im_*|> tokens wrapping each message." + "Please use a different template or set to None." 
+ ) templated_messages = ( self.prompt_template(messages) if self.prompt_template is not None @@ -355,29 +357,48 @@ def tokenize_messages( mask = [] for index, message in enumerate(templated_messages): tokens = [] + + # message header + if message.role != "ipython": + tokens.append(self.im_start_id) + tokens.extend( + self.encode(f"{message.role}\n", add_bos=False, add_eos=False) + ) + + # message content for item in message.content: if item["type"] == "text": - tokens = tokens + self.encode( - item["content"], - add_bos=False, - add_eos=False, + tokens.extend( + self.encode( + item["content"], + add_bos=False, + add_eos=False, + ) ) else: raise RuntimeError( f"Unsupported message content type: {item['type']}" ) + + # message footer + if message.role != "ipython" and ( + message.role != "assistant" or index != len(messages) - 1 + ): + tokens.append(self.im_end_id) + tokens.extend(self.encode("\n", add_bos=False, add_eos=False)) + tokenized_messages.extend(tokens) mask.extend([message.masked] * len(tokens)) - # If assistant message, append EOS at end - if message.role == "assistant" and add_eos: - tokenized_messages.append(self.eos_id) - mask.append(message.masked) - # Break out early if we reach max_seq_len if self.max_seq_len and len(tokenized_messages) >= self.max_seq_len: break + # Add the End-Of-Sequence token + if add_eos: + tokenized_messages.append(self.eos_id) + mask.append(mask[-1]) + # Finally, truncate if necessary if self.max_seq_len: tokenized_messages = truncate( diff --git a/torchtune/models/qwen2_5/__init__.py b/torchtune/models/qwen2_5/__init__.py new file mode 100644 index 0000000000..50d1e570b8 --- /dev/null +++ b/torchtune/models/qwen2_5/__init__.py @@ -0,0 +1,61 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from ._model_builders import ( + lora_qwen2_5_0_5b, + lora_qwen2_5_14b_base, + lora_qwen2_5_14b_instruct, + lora_qwen2_5_1_5b_base, + lora_qwen2_5_1_5b_instruct, + lora_qwen2_5_32b_base, + lora_qwen2_5_32b_instruct, + lora_qwen2_5_3b, + lora_qwen2_5_72b_base, + lora_qwen2_5_72b_instruct, + lora_qwen2_5_7b_base, + lora_qwen2_5_7b_instruct, + qwen2_5_0_5b, + qwen2_5_14b_base, + qwen2_5_14b_instruct, + qwen2_5_1_5b_base, + qwen2_5_1_5b_instruct, + qwen2_5_32b_base, + qwen2_5_32b_instruct, + qwen2_5_3b, + qwen2_5_72b_base, + qwen2_5_72b_instruct, + qwen2_5_7b_base, + qwen2_5_7b_instruct, + qwen2_5_tokenizer, +) + +__all__ = [ + "lora_qwen2_5_0_5b", + "lora_qwen2_5_14b_base", + "lora_qwen2_5_14b_instruct", + "lora_qwen2_5_1_5b_base", + "lora_qwen2_5_1_5b_instruct", + "lora_qwen2_5_32b_base", + "lora_qwen2_5_32b_instruct", + "lora_qwen2_5_3b", + "lora_qwen2_5_72b_base", + "lora_qwen2_5_72b_instruct", + "lora_qwen2_5_7b_base", + "lora_qwen2_5_7b_instruct", + "qwen2_5_0_5b", + "qwen2_5_14b_base", + "qwen2_5_14b_instruct", + "qwen2_5_1_5b_base", + "qwen2_5_1_5b_instruct", + "qwen2_5_32b_base", + "qwen2_5_32b_instruct", + "qwen2_5_3b", + "qwen2_5_72b_base", + "qwen2_5_72b_instruct", + "qwen2_5_7b_base", + "qwen2_5_7b_instruct", + "qwen2_5_tokenizer", +] diff --git a/torchtune/models/qwen2_5/_model_builders.py b/torchtune/models/qwen2_5/_model_builders.py new file mode 100644 index 0000000000..4474958862 --- /dev/null +++ b/torchtune/models/qwen2_5/_model_builders.py @@ -0,0 +1,1095 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List, Optional + +from torchtune.data._prompt_templates import _get_prompt_template, _TemplateType + +from torchtune.models.qwen2._component_builders import lora_qwen2, qwen2 +from torchtune.models.qwen2_5._tokenizer import QWEN2_5_SPECIAL_TOKENS, Qwen2_5Tokenizer +from torchtune.modules import TransformerDecoder +from torchtune.modules.peft import LORA_ATTN_MODULES +from torchtune.modules.tokenizers import parse_hf_tokenizer_json + +""" +Model builders build specific instantiations using component builders. For example +the qwen2_5_7b model builder uses the qwen2 component builder to create the +Qwen2.5 7B model. +""" + + +def qwen2_5_0_5b() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 model (base or instruct) initialized w/ the default 0.5B parameter values + from https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 0.5B model + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return qwen2( + vocab_size=151936, + num_layers=24, + num_heads=14, + num_kv_heads=2, + embed_dim=896, + intermediate_dim=4864, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + +def qwen2_5_1_5b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 1.5B parameter values + from https://huggingface.co/Qwen/Qwen2.5-1.5B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 1.5B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return qwen2( + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + +def qwen2_5_1_5b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 1.5B parameter values + from https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 1.5B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return qwen2( + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + +def qwen2_5_3b() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 model (base or instruct) initialized w/ the default 3B parameter values + from https://huggingface.co/Qwen/Qwen2.5-3B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 3B model + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. 
+ """ + return qwen2( + vocab_size=151936, + num_layers=36, + num_heads=16, + num_kv_heads=2, + embed_dim=2048, + intermediate_dim=11008, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + +def qwen2_5_7b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 7B parameter values + from https://huggingface.co/Qwen/Qwen2.5-7B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 7B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_7b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 7B parameter values + from https://huggingface.co/Qwen/Qwen2.5-7B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 7B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_14b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 14B parameter values + from https://huggingface.co/Qwen/Qwen2.5-14B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 14B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=48, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=13824, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + ) + + +def qwen2_5_14b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 14B parameter values + from https://huggingface.co/Qwen/Qwen2.5-14B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 14B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=48, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=13824, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_32b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 32B parameter values + from https://huggingface.co/Qwen/Qwen2.5-32B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 32B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. 
+ """ + return qwen2( + vocab_size=152064, + num_layers=64, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=27648, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + ) + + +def qwen2_5_32b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 32B parameter values + from https://huggingface.co/Qwen/Qwen2.5-32B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 32B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=64, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=27648, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_72b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 72B parameter values + from https://huggingface.co/Qwen/Qwen2.5-72B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 72B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=80, + num_heads=64, + num_kv_heads=8, + embed_dim=8192, + intermediate_dim=29568, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + ) + + +def qwen2_5_72b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 72B parameter values + from https://huggingface.co/Qwen/Qwen2.5-72B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 72B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=80, + num_heads=64, + num_kv_heads=8, + embed_dim=8192, + intermediate_dim=29568, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_tokenizer( + path: str, + merges_file: str, + special_tokens_path: Optional[str] = None, + max_seq_len: Optional[int] = None, + prompt_template: Optional[_TemplateType] = None, + **kwargs, +) -> Qwen2_5Tokenizer: + """ + Tokenizer for Qwen2.5. + + Args: + path (str): path to the vocab.json file. + merges_file (str): path to the merges.txt file. + special_tokens_path (Optional[str]): Path to ``tokenizer.json`` from Hugging Face + model files that contains all registered special tokens, or a local json file + structured similarly. Default is None to use the canonical Qwen2.5 special tokens. + max_seq_len (Optional[int]): A max sequence length to truncate tokens to. + Default: None + prompt_template (Optional[_TemplateType]): optional specified prompt template. + If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface` + class. If a dictionary, it is assumed to be a custom prompt template mapping role to the + prepend/append tags. + Default is None. 
+ + Returns: + Qwen2_5Tokenizer: Instantiation of the Qwen2.5 tokenizer + """ + special_tokens = ( + QWEN2_5_SPECIAL_TOKENS + if special_tokens_path is None + else parse_hf_tokenizer_json(special_tokens_path) + ) + + if prompt_template is not None: + prompt_template = _get_prompt_template(prompt_template) + + return Qwen2_5Tokenizer( + path=path, + merges_file=merges_file, + special_tokens=special_tokens, + max_seq_len=max_seq_len, + prompt_template=prompt_template, + **kwargs, + ) + + +def lora_qwen2_5_0_5b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 0.5B model (base or instruct) with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_0_5b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 0.5B model with LoRA applied + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=24, + num_heads=14, + num_kv_heads=2, + embed_dim=896, + intermediate_dim=4864, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_1_5b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 1.5B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_1_5b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. 
Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 1.5B model with LoRA applied + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_1_5b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 1.5B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_1_5b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 1.5B model with LoRA applied + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_3b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 3B model (base or instruct) with LoRA enabled. 
+ + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_3b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 3B model with LoRA applied + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=36, + num_heads=16, + num_kv_heads=2, + embed_dim=2048, + intermediate_dim=11008, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_7b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 7B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_7b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 7B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. 
+ """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_7b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 7B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_7b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 7B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_14b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 14B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_14b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. 
+ Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 14B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=48, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=13824, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_14b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 14B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_14b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 14B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. 
+ """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=48, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=13824, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_32b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 32B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_32b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 32B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=64, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=27648, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_32b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 32B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_32b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. 
+ Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 32B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=64, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=27648, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_72b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 72B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_72b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 72B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. 
+ """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=80, + num_heads=64, + num_kv_heads=8, + embed_dim=8192, + intermediate_dim=29568, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_72b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 72B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_72b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 72B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=80, + num_heads=64, + num_kv_heads=8, + embed_dim=8192, + intermediate_dim=29568, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) diff --git a/torchtune/models/qwen2_5/_tokenizer.py b/torchtune/models/qwen2_5/_tokenizer.py new file mode 100644 index 0000000000..2d3eb1d01a --- /dev/null +++ b/torchtune/models/qwen2_5/_tokenizer.py @@ -0,0 +1,242 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+
+from typing import Dict, List, Optional, Tuple
+
+from torchtune.data import ChatMLTemplate, Message, PromptTemplate, truncate
+from torchtune.models.qwen2._tokenizer import (
+    DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE,
+    ENDOFTEXT,
+    QWEN2_SPECIAL_TOKENS,
+    Qwen2Tokenizer,
+)
+
+
+QWEN2_5_SPECIAL_TOKENS = {
+    **QWEN2_SPECIAL_TOKENS,
+    "<|object_ref_start|>": 151646,
+    "<|object_ref_end|>": 151647,
+    "<|box_start|>": 151648,
+    "<|box_end|>": 151649,
+    "<|quad_start|>": 151650,
+    "<|quad_end|>": 151651,
+    "<|vision_start|>": 151652,
+    "<|vision_end|>": 151653,
+    "<|vision_pad|>": 151654,
+    "<|image_pad|>": 151655,
+    "<|video_pad|>": 151656,
+    "<tool_call>": 151657,
+    "</tool_call>": 151658,
+    "<|fim_prefix|>": 151659,
+    "<|fim_middle|>": 151660,
+    "<|fim_suffix|>": 151661,
+    "<|fim_pad|>": 151662,
+    "<|repo_name|>": 151663,
+    "<|file_sep|>": 151664,
+}
+
+
+class Qwen2_5Tokenizer(Qwen2Tokenizer):  # noqa: N801
+    """This class constructs a Qwen2.5 tokenizer, based on GPT-2 byte-level BPE tokenization.
+
+    See
+    and .
+
+    Args:
+        path (str): Path to vocab.json file.
+        merges_file (str): Path to merges.txt file.
+            merges.txt contains all BPE merge operations, and this file is required to split a single word into
+            byte-level BPE tokens.
+        special_tokens (Dict[str, int]): Special tokens to add to the tokenizer. Default is QWEN2_5_SPECIAL_TOKENS.
+        max_seq_len (Optional[int]): A max sequence length to truncate tokens to.
+            Default: None
+        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
+            to add structured text around the actual messages. The structured text is used in three scenarios:
+
+            - Task-specific templates to gear models for a particular task that it will expect after training
+            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
+              tags in Llama2 and in Mistral
+            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`
+
+            The extra text will still get tokenized as normal text, not as special tokens.
+            Default: None
+        errors (str): Paradigm to follow when decoding bytes to UTF-8. Defaults to "replace".
+            See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (Optional[str]): The unknown token. A token that is not in the vocabulary cannot be converted
+            to an ID and is set to be this token instead. Defaults to ``<|endoftext|>``.
+        bos_token (Optional[str]): The beginning of sequence token. Defaults to None.
+        eos_token (str): The end of sequence token. Defaults to ``<|endoftext|>``.
+        pad_token (Optional[str]): The token used for padding. Defaults to ``<|endoftext|>``.
+        bpe_cache_size (int): BPE token cache size in Qwen2Tokenizer.
+            NOTE: a large cache size will speed up tokenization, but the cache object will get really
+            large for long-running processes (esp. for texts in languages that do not use spaces between
+            words, e.g. Chinese); technically not a memory leak, but it appears as one.
+            By default, we set the cache size equal to the size of the official Qwen2 tokenizer.
+
+    Example:
+        >>> tokenizer = Qwen2_5Tokenizer(
+        ...     path="/path/to/vocab.json",
+        ...     merges_file="/path/to/merges.txt",
+        ...     special_tokens=QWEN2_5_SPECIAL_TOKENS,
+        ... )
+        >>> tokenized_text = tokenizer.encode("Hello world!")
+        >>> print(tokenized_text)
+        [39, 385, 78, 675, 0, 2000]
+    """
+
+    def __init__(
+        self,
+        path: str,
+        merges_file: str,
+        special_tokens: Dict[str, int] = QWEN2_5_SPECIAL_TOKENS,
+        max_seq_len: Optional[int] = None,
+        *,
+        prompt_template: Optional[PromptTemplate] = None,
+        errors: str = "replace",
+        unk_token: Optional[str] = ENDOFTEXT,
+        bos_token: Optional[str] = None,
+        eos_token: str = ENDOFTEXT,
+        pad_token: Optional[str] = ENDOFTEXT,
+        bpe_cache_size: int = DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE,
+    ):
+        super().__init__(
+            path=path,
+            merges_file=merges_file,
+            special_tokens=special_tokens,
+            max_seq_len=max_seq_len,
+            prompt_template=prompt_template,
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            bpe_cache_size=bpe_cache_size,
+        )
+
+        # Token ids used to wrap assistant tool calls in <tool_call> ... </tool_call>
+        self.tool_call_start_id = self.special_tokens["<tool_call>"]
+        self.tool_call_end_id = self.special_tokens["</tool_call>"]
+
+    def tokenize_messages(
+        self,
+        messages: List[Message],
+        *,
+        add_eos: bool = True,
+    ) -> Tuple[List[int], List[bool]]:
+        """
+        Given a list of messages, return a list of tokens for the concatenated
+        and formatted messages.
+
+        Args:
+            messages (List[Message]): The message list to tokenize.
+            add_eos (bool): Whether to add the tokenizer's eos_id at the end of the
+                sequence of messages. Default is True.
+
+        Returns:
+            Tuple[List[int], List[bool]]: The list of token ids and the list of masks.
+
+        Raises:
+            RuntimeError: If a message contains non-text content
+        """
+        assert not isinstance(self.prompt_template, ChatMLTemplate), (
+            "Using ChatMLTemplate with tokenize_messages will result in multiple <|im_*|> tokens wrapping each message. "
+            "Please use a different template or set it to None."
+        )
+        templated_messages = (
+            self.prompt_template(messages)
+            if self.prompt_template is not None
+            else messages
+        )
+
+        tokenized_messages = []
+        mask = []
+        for i, message in enumerate(templated_messages):
+            # message header
+            tokens = self._tokenize_header(templated_messages, i)
+
+            # message content
+            for item in message.content:
+                if item["type"] == "text":
+                    tokens.extend(
+                        self.encode(
+                            item["content"],
+                            add_bos=False,
+                            add_eos=False,
+                        )
+                    )
+                else:
+                    raise RuntimeError(
+                        f"Unsupported message content type: {item['type']}"
+                    )
+
+            # message footer
+            tokens.extend(self._tokenize_footer(templated_messages, i))
+
+            tokenized_messages.extend(tokens)
+            mask.extend([message.masked] * len(tokens))
+
+            # Break out early if we reach max_seq_len
+            if self.max_seq_len and len(tokenized_messages) >= self.max_seq_len:
+                break
+
+        # Add the End-Of-Sequence token
+        if add_eos:
+            tokenized_messages.append(self.eos_id)
+            mask.append(mask[-1])
+
+        # Finally, truncate if necessary
+        if self.max_seq_len:
+            tokenized_messages = truncate(
+                tokenized_messages, self.max_seq_len, self.eos_id if add_eos else None
+            )
+            mask = truncate(mask, self.max_seq_len, True if add_eos else None)
+
+        return tokenized_messages, mask
+
+    def _tokenize_header(self, messages, i):
+        tokens = []
+        message = messages[i]
+        if message.role == "ipython":
+            if i == 0 or messages[i - 1].role != "ipython":
+                # only add the "user" header if this is the first tool response message
+                self._add_message_start_tokens(tokens, "user")
+                tokens.extend(
+                    self.encode("<tool_response>\n", add_bos=False, add_eos=False)
+                )
+            else:
+                tokens.extend(
+                    self.encode("\n<tool_response>\n", add_bos=False, add_eos=False)
+                )
+        else:
+            self._add_message_start_tokens(tokens, message.role)
+            if message.role == "assistant" and message.ipython:
+                # open the tool call block for assistant tool-call messages
+                tokens.append(self.tool_call_start_id)
+                tokens.extend(self.encode("\n", add_bos=False, add_eos=False))
+        return tokens
+
+    def _tokenize_footer(self, messages, i):
+        tokens = []
+        message = messages[i]
+        if message.role == "ipython":
+            # close the tool response; only add <|im_end|> after the last
+            # message in a run of consecutive tool responses
+            if i == len(messages) - 1 or messages[i + 1].role != "ipython":
+                tokens.extend(
+                    self.encode("\n</tool_response>", add_bos=False, add_eos=False)
+                )
+                self._add_message_end_tokens(tokens)
+            else:
+                tokens.extend(
+                    self.encode("\n</tool_response>", add_bos=False, add_eos=False)
+                )
+        else:
+            if message.role == "assistant" and message.ipython:
+                tokens.extend(self.encode("\n", add_bos=False, add_eos=False))
+                tokens.append(self.tool_call_end_id)
+            if message.role != "assistant" or i != len(messages) - 1:
+                self._add_message_end_tokens(tokens)
+        return tokens
+
+    def _add_message_start_tokens(self, tokens, role):
+        tokens.append(self.im_start_id)
+        tokens.extend(self.encode(f"{role}\n", add_bos=False, add_eos=False))
+
+    def _add_message_end_tokens(self, tokens):
+        tokens.append(self.im_end_id)
+        tokens.extend(self.encode("\n", add_bos=False, add_eos=False))
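
A minimal, hypothetical usage sketch of the tokenizer defined above: the vocab/merges paths are placeholders (as in the docstring example), the tool-call and tool-response payloads are invented for illustration, and decode() is the method inherited from Qwen2Tokenizer.

    from torchtune.data import Message
    from torchtune.models.qwen2_5._tokenizer import Qwen2_5Tokenizer

    # Placeholder paths, matching the docstring example above.
    tokenizer = Qwen2_5Tokenizer(
        path="/path/to/vocab.json",
        merges_file="/path/to/merges.txt",
    )

    # An illustrative tool-calling exchange: the assistant message flagged with
    # ipython=True is wrapped in <tool_call>...</tool_call>, and the "ipython"
    # (tool result) message is wrapped in <tool_response>...</tool_response>
    # under a "user" header, mirroring the Qwen2.5 chat format.
    messages = [
        Message(role="user", content="What is 2 + 2?", masked=True),
        Message(
            role="assistant",
            content='{"name": "add", "arguments": {"a": 2, "b": 2}}',
            ipython=True,
        ),
        Message(role="ipython", content='{"result": 4}', masked=True),
        Message(role="assistant", content="2 + 2 = 4."),
    ]

    # Token ids plus a parallel mask that is True for tokens from masked messages.
    tokens, mask = tokenizer.tokenize_messages(messages)
    assert len(tokens) == len(mask)
    print(tokenizer.decode(tokens))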