diff --git a/recipes/configs/qwen2_5/0_5B_full.yaml b/recipes/configs/qwen2_5/0_5B_full.yaml new file mode 100644 index 0000000000..341c054991 --- /dev/null +++ b/recipes/configs/qwen2_5/0_5B_full.yaml @@ -0,0 +1,77 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2.5 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 0_5B_full_single_device.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_0_5b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-5 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +compile: False + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/0_5B_full_single_device.yaml b/recipes/configs/qwen2_5/0_5B_full_single_device.yaml new file mode 100644 index 0000000000..58059e06a9 --- /dev/null +++ b/recipes/configs/qwen2_5/0_5B_full_single_device.yaml @@ -0,0 +1,82 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2.5 0.5B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_0_5b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-5 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +optimizer_in_bwd: False + +max_steps_per_epoch: null +gradient_accumulation_steps: 8 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/0_5B_lora.yaml b/recipes/configs/qwen2_5/0_5B_lora.yaml new file mode 100644 index 0000000000..c6a4af1ee4 --- /dev/null +++ b/recipes/configs/qwen2_5/0_5B_lora.yaml @@ -0,0 +1,114 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 0_5B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 32 + lora_alpha: 64 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False + +seed: null +shuffle: True +batch_size: 4 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 2e-3 + +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 4 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/0_5B_lora_single_device.yaml b/recipes/configs/qwen2_5/0_5B_lora_single_device.yaml new file mode 100644 index 0000000000..2d9c089774 --- /dev/null +++ b/recipes/configs/qwen2_5/0_5B_lora_single_device.yaml @@ -0,0 +1,114 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
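+#
+# As an illustrative sketch only (the keys referenced here are defined further
+# down in this file; the values are not a recommendation), the LoRA size and
+# learning rate can likewise be overridden from the command line:
+# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device model.lora_rank=16 model.lora_alpha=32 optimizer.lr=1e-3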
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 32 + lora_alpha: 64 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 4 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 2e-3 + +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 4 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/14B_lora_single_device.yaml b/recipes/configs/qwen2_5/14B_lora_single_device.yaml new file mode 100644 index 0000000000..d89710d1a6 --- /dev/null +++ b/recipes/configs/qwen2_5/14B_lora_single_device.yaml @@ -0,0 +1,120 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 14B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-14B-Instruct --output-dir /tmp/Qwen2_5-14B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/14B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/14B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
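+#
+# If memory is tight on a single GPU, two knobs defined later in this file can
+# be flipped from the command line (a hedged example; values are illustrative):
+# tune run lora_finetune_single_device --config qwen2_5/14B_lora_single_device enable_activation_offloading=True tokenizer.max_seq_len=2048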
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_14b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-14B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-14B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-14B-Instruct + checkpoint_files: [ + model-00001-of-00008.safetensors, + model-00002-of-00008.safetensors, + model-00003-of-00008.safetensors, + model-00004-of-00008.safetensors, + model-00005-of-00008.safetensors, + model-00006-of-00008.safetensors, + model-00007-of-00008.safetensors, + model-00008-of-00008.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-14B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-14B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/1_5B_full.yaml b/recipes/configs/qwen2_5/1_5B_full.yaml new file mode 100644 index 0000000000..9456200422 --- /dev/null +++ b/recipes/configs/qwen2_5/1_5B_full.yaml @@ -0,0 +1,77 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2.5 1.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/1_5B_full +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/1_5B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 1_5B_full_single_device.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_1_5b_instruct + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 3 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-5 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: False + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/1_5B_full_single_device.yaml b/recipes/configs/qwen2_5/1_5B_full_single_device.yaml new file mode 100644 index 0000000000..6a78521c80 --- /dev/null +++ b/recipes/configs/qwen2_5/1_5B_full_single_device.yaml @@ -0,0 +1,82 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2.5 1.5B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2_5/1_5B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2_5/1_5B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
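+#
+# If you prefer not to depend on bitsandbytes, the optimizer component can be
+# swapped from the command line (a sketch, not a tuned recommendation):
+# tune run full_finetune_single_device --config qwen2_5/1_5B_full_single_device optimizer._component_=torch.optim.AdamW optimizer.lr=2e-5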
+ +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_1_5b_instruct + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 2e-5 + +optimizer_in_bwd: True + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/1_5B_lora.yaml b/recipes/configs/qwen2_5/1_5B_lora.yaml new file mode 100644 index 0000000000..9e3cfad1b6 --- /dev/null +++ b/recipes/configs/qwen2_5/1_5B_lora.yaml @@ -0,0 +1,112 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 1.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/1_5B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/1_5B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 1_5B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_1_5b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 32 + lora_alpha: 64 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-5 + +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/1_5B_lora_single_device.yaml b/recipes/configs/qwen2_5/1_5B_lora_single_device.yaml new file mode 100644 index 0000000000..f35989fa4f --- /dev/null +++ b/recipes/configs/qwen2_5/1_5B_lora_single_device.yaml @@ -0,0 +1,113 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 1.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/1_5B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/1_5B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
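+#
+# The dataset is also a swappable component; as an illustration, the non-cleaned
+# Alpaca variant used elsewhere in these recipes can be selected per run:
+# tune run lora_finetune_single_device --config qwen2_5/1_5B_lora_single_device dataset._component_=torchtune.datasets.alpaca_dataset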
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_1_5b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 32 + lora_alpha: 64 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 2e-3 + +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 8 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/32B_lora.yaml b/recipes/configs/qwen2_5/32B_lora.yaml new file mode 100644 index 0000000000..19a9356c27 --- /dev/null +++ b/recipes/configs/qwen2_5/32B_lora.yaml @@ -0,0 +1,125 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 32B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-32B-Instruct --output-dir /tmp/Qwen2_5-32B-Instruct --ignore-patterns None +# +# To launch on 8 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/32B_lora +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/32B_lora checkpointer.checkpoint_dir= + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_32b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-32B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-32B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-32B-Instruct + checkpoint_files: [ + model-00001-of-00017.safetensors, + model-00002-of-00017.safetensors, + model-00003-of-00017.safetensors, + model-00004-of-00017.safetensors, + model-00005-of-00017.safetensors, + model-00006-of-00017.safetensors, + model-00007-of-00017.safetensors, + model-00008-of-00017.safetensors, + model-00009-of-00017.safetensors, + model-00010-of-00017.safetensors, + model-00011-of-00017.safetensors, + model-00012-of-00017.safetensors, + model-00013-of-00017.safetensors, + model-00014-of-00017.safetensors, + model-00015-of-00017.safetensors, + model-00016-of-00017.safetensors, + model-00017-of-00017.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-32B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-32B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/3B_full.yaml b/recipes/configs/qwen2_5/3B_full.yaml new file mode 100644 index 0000000000..79343ca457 --- /dev/null +++ b/recipes/configs/qwen2_5/3B_full.yaml @@ -0,0 +1,78 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2.5 3B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download 
Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/3B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/3B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 3B_full_single_device.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-3B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_3b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-3B-Instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-3B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +compile: False + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-3B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/3B_full_single_device.yaml b/recipes/configs/qwen2_5/3B_full_single_device.yaml new file mode 100644 index 0000000000..09494d6c28 --- /dev/null +++ b/recipes/configs/qwen2_5/3B_full_single_device.yaml @@ -0,0 +1,80 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2.5 3B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2_5/3B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2_5/3B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
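+#
+# Metric logging defaults to DiskLogger below; assuming wandb is installed, an
+# illustrative override (the project name is arbitrary) switches it to W&B:
+# tune run full_finetune_single_device --config qwen2_5/3B_full_single_device metric_logger._component_=torchtune.training.metric_logging.WandBLogger metric_logger.project=qwen2_5_3b_full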
+ +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-3B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_3b + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-3B-Instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-3B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 5e-6 +optimizer_in_bwd: True +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-3B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/3B_lora.yaml b/recipes/configs/qwen2_5/3B_lora.yaml new file mode 100644 index 0000000000..b987330a6d --- /dev/null +++ b/recipes/configs/qwen2_5/3B_lora.yaml @@ -0,0 +1,113 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 3B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/3B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/3B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 3B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_3b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-3B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-3B-Instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-3B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-3B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/3B_lora_single_device.yaml b/recipes/configs/qwen2_5/3B_lora_single_device.yaml new file mode 100644 index 0000000000..8caf08d063 --- /dev/null +++ b/recipes/configs/qwen2_5/3B_lora_single_device.yaml @@ -0,0 +1,114 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 3B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-3B-Instruct --output-dir /tmp/Qwen2_5-3B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/3B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/3B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
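+#
+# The profiler section at the bottom of this file ships disabled; for a one-off
+# debugging run it can be enabled without editing the file (values illustrative):
+# tune run lora_finetune_single_device --config qwen2_5/3B_lora_single_device profiler.enabled=True profiler.profile_memory=True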
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_3b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-3B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-3B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-3B-Instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-3B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-3B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/72B_lora.yaml b/recipes/configs/qwen2_5/72B_lora.yaml new file mode 100644 index 0000000000..906e52dfde --- /dev/null +++ b/recipes/configs/qwen2_5/72B_lora.yaml @@ -0,0 +1,145 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 72B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-72B-Instruct --output-dir /tmp/Qwen2_5-72B-Instruct --ignore-patterns None +# +# To launch on 8 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/72B_lora +# +# You can add specific overrides through the command line. 
For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 8 lora_finetune_distributed --config qwen2_5/72B_lora checkpointer.checkpoint_dir= + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_72b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-72B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-72B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-72B-Instruct + checkpoint_files: [ + model-00001-of-00037.safetensors, + model-00002-of-00037.safetensors, + model-00003-of-00037.safetensors, + model-00004-of-00037.safetensors, + model-00005-of-00037.safetensors, + model-00006-of-00037.safetensors, + model-00007-of-00037.safetensors, + model-00008-of-00037.safetensors, + model-00009-of-00037.safetensors, + model-00010-of-00037.safetensors, + model-00011-of-00037.safetensors, + model-00012-of-00037.safetensors, + model-00013-of-00037.safetensors, + model-00014-of-00037.safetensors, + model-00015-of-00037.safetensors, + model-00016-of-00037.safetensors, + model-00017-of-00037.safetensors, + model-00018-of-00037.safetensors, + model-00019-of-00037.safetensors, + model-00020-of-00037.safetensors, + model-00021-of-00037.safetensors, + model-00022-of-00037.safetensors, + model-00023-of-00037.safetensors, + model-00024-of-00037.safetensors, + model-00025-of-00037.safetensors, + model-00026-of-00037.safetensors, + model-00027-of-00037.safetensors, + model-00028-of-00037.safetensors, + model-00029-of-00037.safetensors, + model-00030-of-00037.safetensors, + model-00031-of-00037.safetensors, + model-00032-of-00037.safetensors, + model-00033-of-00037.safetensors, + model-00034-of-00037.safetensors, + model-00035-of-00037.safetensors, + model-00036-of-00037.safetensors, + model-00037-of-00037.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-72B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-72B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: 
False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/7B_full.yaml b/recipes/configs/qwen2_5/7B_full.yaml new file mode 100644 index 0000000000..78313ca921 --- /dev/null +++ b/recipes/configs/qwen2_5/7B_full.yaml @@ -0,0 +1,80 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2.5 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/7B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/7B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 7B_full_single_device.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_7b_instruct + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-7B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + fused: True + lr: 5e-6 +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +compile: False + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-7B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/7B_full_single_device.yaml b/recipes/configs/qwen2_5/7B_full_single_device.yaml new file mode 100644 index 0000000000..c4f464e97e --- /dev/null +++ b/recipes/configs/qwen2_5/7B_full_single_device.yaml @@ -0,0 +1,82 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2.5 7B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct --ignore-patterns None +# +# The default config uses an optimizer from bitsandbytes. 
If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2_5/7B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2_5/7B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt + max_seq_len: null + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_7b_instruct + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-7B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 5e-6 +optimizer_in_bwd: True +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2_5-7B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2_5/7B_lora.yaml b/recipes/configs/qwen2_5/7B_lora.yaml new file mode 100644 index 0000000000..61365316be --- /dev/null +++ b/recipes/configs/qwen2_5/7B_lora.yaml @@ -0,0 +1,115 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2.5 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct --ignore-patterns None +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/7B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/7B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 7B_lora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_7b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-7B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-7B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2_5/7B_lora_single_device.yaml b/recipes/configs/qwen2_5/7B_lora_single_device.yaml new file mode 100644 index 0000000000..53949bc307 --- /dev/null +++ b/recipes/configs/qwen2_5/7B_lora_single_device.yaml @@ -0,0 +1,116 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2.5 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2.5-7B-Instruct --output-dir /tmp/Qwen2_5-7B-Instruct --ignore-patterns None +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2_5/7B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2_5/7B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
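+#
+# To give the adapter more capacity, LoRA can be applied to additional attention
+# projections and the MLP (treat this combination as an illustrative sketch, and
+# note the list override is quoted for the shell):
+# tune run lora_finetune_single_device --config qwen2_5/7B_lora_single_device model.lora_attn_modules="[q_proj,k_proj,v_proj,output_proj]" model.apply_lora_to_mlp=True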
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2_5.lora_qwen2_5_7b_instruct + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + lora_dropout: 0.0 + +tokenizer: + _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer + path: /tmp/Qwen2_5-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2_5-7B-Instruct/merges.txt + max_seq_len: null + +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2_5-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors, + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2_5-7B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + packed: False +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + fused: True + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torchtune.modules.loss.CEWithChunkedOutputLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/Qwen2_5-7B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.training.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 + +# Activations Offloading +enable_activation_checkpointing: True +enable_activation_offloading: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.training.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py index b11d6679e1..8304a7c1bf 100644 --- a/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py +++ b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py @@ -14,13 +14,12 @@ from torchtune.models.qwen2 import qwen2_tokenizer -class TestQwen2Tokenizer: - def tokenizer(self, template: bool = False, max_seq_len: Optional[int] = None): +class TestQwenTokenizer: + def tokenizer(self, max_seq_len: Optional[int] = None): return qwen2_tokenizer( path=str(ASSETS / "tiny_bpe_vocab.json"), merges_file=str(ASSETS / "tiny_bpe_merges.txt"), special_tokens_path=str(ASSETS / "tiny_bpe_tokenizer.json"), - prompt_template="torchtune.data.ChatMLTemplate" if template else None, max_seq_len=max_seq_len, ) @@ -45,8 +44,8 @@ def messages(self): ), ] - def test_tokenize_messages_chat_template(self, messages): - tokenizer = self.tokenizer(template=True) + def test_tokenize_messages(self, messages): + tokenizer = self.tokenizer() tokens, mask = 
tokenizer.tokenize_messages(messages) expected_tokens = [ 2001, @@ -236,218 +235,33 @@ def test_tokenize_messages_chat_template(self, messages): 318, 1278, 13, - 2002, - 94, 2000, ] - expected_mask = [True] * 67 + [False] * 123 + expected_mask = [True] * 67 + [False] * 121 assert expected_tokens == tokens assert expected_mask == mask formatted_messages = tokenizer.decode(tokens) expected_formatted_messages = ( f"<|im_start|>user\n{messages[0].text_content}<|im_end|>\n" - f"<|im_start|>assistant\n{messages[1].text_content}<|im_end|>\n" - "<|endoftext|>" + f"<|im_start|>assistant\n{messages[1].text_content}<|endoftext|>" ) assert expected_formatted_messages == formatted_messages - def test_tokenize_messages(self, messages): - tokenizer = self.tokenizer(template=False) - tokens, mask = tokenizer.tokenize_messages(messages) - expected_tokens = [ - 33, - 214, - 174, - 156, - 194, - 130, - 197, - 184, - 446, - 789, - 113, - 98, - 1914, - 13, - 346, - 788, - 98, - 706, - 102, - 182, - 184, - 1916, - 176, - 762, - 83, - 113, - 103, - 874, - 269, - 13, - 94, - 94, - 2, - 2, - 2, - 483, - 197, - 25, - 94, - 885, - 98, - 1226, - 1960, - 348, - 114, - 1123, - 399, - 1583, - 78, - 13, - 94, - 94, - 2, - 2, - 2, - 360, - 1733, - 102, - 182, - 25, - 94, - 40, - 1791, - 194, - 453, - 70, - 78, - 114, - 120, - 967, - 176, - 618, - 628, - 1275, - 794, - 294, - 1095, - 445, - 212, - 1356, - 120, - 1299, - 13, - 223, - 1791, - 451, - 98, - 127, - 181, - 1047, - 375, - 915, - 380, - 120, - 1448, - 1732, - 114, - 453, - 447, - 1219, - 64, - 187, - 921, - 120, - 742, - 107, - 84, - 122, - 893, - 13, - 223, - 1791, - 98, - 127, - 181, - 123, - 124, - 131, - 103, - 744, - 82, - 120, - 1506, - 416, - 114, - 128, - 1429, - 182, - 253, - 82, - 120, - 163, - 330, - 105, - 262, - 13, - 223, - 1791, - 155, - 1551, - 171, - 1951, - 628, - 296, - 64, - 237, - 886, - 1390, - 130, - 883, - 1678, - 447, - 306, - 279, - 113, - 11, - 215, - 785, - 215, - 1951, - 628, - 378, - 101, - 66, - 72, - 593, - 98, - 984, - 208, - 1580, - 167, - 510, - 737, - 318, - 1278, - 13, - 2000, - ] - expected_mask = [True] * 61 + [False] * 116 - assert expected_tokens == tokens - assert expected_mask == mask - def test_tokenize_messages_gt_max_seq_len(self, messages): # Super basic test to make sure max_seq_len is working properly - tokenizer = self.tokenizer(template=False, max_seq_len=10) + tokenizer = self.tokenizer(max_seq_len=10) tokens, mask = tokenizer.tokenize_messages(messages) assert len(tokens) == 10 assert len(mask) == 10 def test_tokenize_message_drop_eos(self, messages): - tokenizer = self.tokenizer(template=False) + tokenizer = self.tokenizer() expected_tokens = [ + 2001, + 273, + 105, + 94, 33, 214, 174, @@ -509,6 +323,13 @@ def test_tokenize_message_drop_eos(self, messages): 182, 25, 94, + 2002, + 94, + 2001, + 397, + 251, + 249, + 94, 40, 1791, 194, @@ -624,14 +445,8 @@ def test_tokenize_message_drop_eos(self, messages): 318, 1278, 13, - 2000, ] - - # Remove the EOS token - expected_tokens = expected_tokens[:-1] - # On 1 less then with eos - expected_mask = [True] * 61 + [False] * 115 - + expected_mask = [True] * 67 + [False] * 120 tokens, mask = tokenizer.tokenize_messages(messages, add_eos=False) assert tokens == expected_tokens assert mask == expected_mask diff --git a/tests/torchtune/models/qwen2_5/__init__.py b/tests/torchtune/models/qwen2_5/__init__.py new file mode 100644 index 0000000000..2e41cd717f --- /dev/null +++ b/tests/torchtune/models/qwen2_5/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/tests/torchtune/models/qwen2_5/test_tokenizer.py b/tests/torchtune/models/qwen2_5/test_tokenizer.py new file mode 100644 index 0000000000..332fef2b92 --- /dev/null +++ b/tests/torchtune/models/qwen2_5/test_tokenizer.py @@ -0,0 +1,195 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from tests.common import ASSETS + +from torchtune.data import Message +from torchtune.models.qwen2_5 import qwen2_5_tokenizer + + +class TestQwen2_5Tokenizer: # noqa: N801 + def tokenizer(self): + return qwen2_5_tokenizer( + path=str(ASSETS / "tiny_bpe_vocab.json"), + merges_file=str(ASSETS / "tiny_bpe_merges.txt"), + ) + + def test_tokenize_messages(self): + tokenizer = self.tokenizer() + messages = [ + Message(role="system", content="You are a helpful assistant."), + Message(role="user", content="Give me a short introduction to LLMs."), + Message(role="assistant", content=""), + ] + expected_tokens = [ + 151644, + 82, + 88, + 479, + 94, + 56, + 119, + 230, + 98, + 374, + 494, + 1318, + 249, + 13, + 151645, + 94, + 151644, + 273, + 105, + 94, + 38, + 229, + 362, + 98, + 1695, + 310, + 1305, + 165, + 128, + 432, + 43, + 44, + 82, + 13, + 151645, + 94, + 151644, + 397, + 251, + 249, + 94, + 151643, + ] + expected_formatted_messages = ( + "<|im_start|>system\n" + "You are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + "Give me a short introduction to LLMs.<|im_end|>\n" + "<|im_start|>assistant\n" + "<|endoftext|>" + ) + _test_tokenize_messages( + tokenizer, + messages, + expected_tokens, + expected_formatted_messages, + ) + + def test_tool_call(self): + tokenizer = self.tokenizer() + messages = [ + Message(role="system", content="a"), + Message(role="user", content="b"), + Message(role="assistant", content="test call", ipython=True), + Message(role="ipython", content="test response"), + Message(role="assistant", content=""), + ] + expected_tokens = [ + 151644, + 82, + 88, + 479, + 94, + 64, + 151645, + 94, + 151644, + 273, + 105, + 94, + 65, + 151645, + 94, + 151644, + 397, + 251, + 249, + 94, + 151657, + 94, + 83, + 269, + 107, + 330, + 94, + 151658, + 151645, + 94, + 151644, + 273, + 105, + 94, + 27, + 83, + 1364, + 62, + 237, + 79, + 102, + 182, + 29, + 94, + 83, + 269, + 706, + 102, + 182, + 94, + 1932, + 83, + 1364, + 62, + 237, + 79, + 102, + 182, + 29, + 151645, + 94, + 151644, + 397, + 251, + 249, + 94, + 151643, + ] + expected_formatted_messages = ( + "<|im_start|>system\n" + "a<|im_end|>\n" + "<|im_start|>user\n" + "b<|im_end|>\n" + "<|im_start|>assistant\n" + "\n" + "test call\n" + "<|im_end|>\n" + "<|im_start|>user\n" + "\n" + "test response\n" + "<|im_end|>\n" + "<|im_start|>assistant\n" + "<|endoftext|>" + ) + _test_tokenize_messages( + tokenizer, + messages, + expected_tokens, + expected_formatted_messages, + ) + + +def _test_tokenize_messages( + tokenizer, messages, expected_tokens, expected_formatted_messages +): + tokens, mask = tokenizer.tokenize_messages(messages) + assert len(tokens) == len(mask) + assert expected_tokens == tokens + formatted_messages = tokenizer.decode(tokens) + assert expected_formatted_messages == formatted_messages diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 
7bed74a6e7..cdb1d45f01 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -71,6 +71,22 @@ class Recipe: name="qwen2/1.5B_full_single_device", file_path="qwen2/1.5B_full_single_device.yaml", ), + Config( + name="qwen2_5/0_5B_full_single_device", + file_path="qwen2_5/0_5B_full_single_device.yaml", + ), + Config( + name="qwen2_5/1_5B_full_single_device", + file_path="qwen2_5/1_5B_full_single_device.yaml", + ), + Config( + name="qwen2_5/3B_full_single_device", + file_path="qwen2_5/3B_full_single_device.yaml", + ), + Config( + name="qwen2_5/7B_full_single_device", + file_path="qwen2_5/7B_full_single_device.yaml", + ), Config( name="llama3_2_vision/11B_full_single_device", file_path="llama3_2_vision/11B_full_single_device.yaml", @@ -97,6 +113,10 @@ class Recipe: Config(name="qwen2/7B_full", file_path="qwen2/7B_full.yaml"), Config(name="qwen2/0.5B_full", file_path="qwen2/0.5B_full.yaml"), Config(name="qwen2/1.5B_full", file_path="qwen2/1.5B_full.yaml"), + Config(name="qwen2_5/0_5B_full", file_path="qwen2_5/0_5B_full.yaml"), + Config(name="qwen2_5/1_5B_full", file_path="qwen2_5/1_5B_full.yaml"), + Config(name="qwen2_5/3B_full", file_path="qwen2_5/3B_full.yaml"), + Config(name="qwen2_5/7B_full", file_path="qwen2_5/7B_full.yaml"), Config( name="llama3_2_vision/11B_full", file_path="llama3_2_vision/11B_full.yaml", @@ -216,6 +236,26 @@ class Recipe: name="qwen2/1.5B_lora_single_device", file_path="qwen2/1.5B_lora_single_device.yaml", ), + Config( + name="qwen2_5/0_5B_lora_single_device", + file_path="qwen2_5/0_5B_lora_single_device.yaml", + ), + Config( + name="qwen2_5/1_5B_lora_single_device", + file_path="qwen2_5/1_5B_lora_single_device.yaml", + ), + Config( + name="qwen2_5/3B_lora_single_device", + file_path="qwen2_5/3B_lora_single_device.yaml", + ), + Config( + name="qwen2_5/7B_lora_single_device", + file_path="qwen2_5/7B_lora_single_device.yaml", + ), + Config( + name="qwen2_5/14B_lora_single_device", + file_path="qwen2_5/14B_lora_single_device.yaml", + ), Config( name="llama3_2_vision/11B_lora_single_device", file_path="llama3_2_vision/11B_lora_single_device.yaml", @@ -293,6 +333,12 @@ class Recipe: Config(name="qwen2/7B_lora", file_path="qwen2/7B_lora.yaml"), Config(name="qwen2/0.5B_lora", file_path="qwen2/0.5B_lora.yaml"), Config(name="qwen2/1.5B_lora", file_path="qwen2/1.5B_lora.yaml"), + Config(name="qwen2_5/0_5B_lora", file_path="qwen2_5/0_5B_lora.yaml"), + Config(name="qwen2_5/1_5B_lora", file_path="qwen2_5/1_5B_lora.yaml"), + Config(name="qwen2_5/3B_lora", file_path="qwen2_5/3B_lora.yaml"), + Config(name="qwen2_5/7B_lora", file_path="qwen2_5/7B_lora.yaml"), + Config(name="qwen2_5/32B_lora", file_path="qwen2_5/32B_lora.yaml"), + Config(name="qwen2_5/72B_lora", file_path="qwen2_5/72B_lora.yaml"), Config( name="llama3_2_vision/11B_lora", file_path="llama3_2_vision/11B_lora.yaml", diff --git a/torchtune/models/qwen2/__init__.py b/torchtune/models/qwen2/__init__.py index a2f36a3661..8e04fba85d 100644 --- a/torchtune/models/qwen2/__init__.py +++ b/torchtune/models/qwen2/__init__.py @@ -19,17 +19,17 @@ from ._tokenizer import Qwen2Tokenizer __all__ = [ - "qwen2_7b", - "qwen2_0_5b", - "qwen2_1_5b", - "qwen2_tokenizer", - "lora_qwen2_7b", - "lora_qwen2_0_5b", - "lora_qwen2_1_5b", - "qwen2", "lora_qwen2", + "qwen2", "qwen2_hf_to_tune", "qwen2_tune_to_hf", + "lora_qwen2_0_5b", + "lora_qwen2_1_5b", + "lora_qwen2_7b", + "qwen2_0_5b", + "qwen2_1_5b", + "qwen2_7b", + "qwen2_tokenizer", "Qwen2RotaryPositionalEmbeddings", "Qwen2Tokenizer", ] diff --git 
a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index 8fef948643..2a0ee06f83 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -5,13 +5,13 @@ # LICENSE file in the root directory of this source tree. from typing import List, Optional -from torchtune.models.qwen2._component_builders import qwen2, lora_qwen2 -from torchtune.models.qwen2._tokenizer import Qwen2Tokenizer +from torchtune.data._prompt_templates import _get_prompt_template, _TemplateType + +from torchtune.models.qwen2._component_builders import lora_qwen2, qwen2 +from torchtune.models.qwen2._tokenizer import QWEN2_SPECIAL_TOKENS, Qwen2Tokenizer from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES from torchtune.modules.tokenizers import parse_hf_tokenizer_json -from torchtune.data._prompt_templates import _TemplateType -from torchtune.data._prompt_templates import _get_prompt_template """ Model builders build specific instantiations using component builders. For example @@ -101,7 +101,7 @@ def qwen2_tokenizer( merges_file: str = None, special_tokens_path: Optional[str] = None, max_seq_len: Optional[int] = None, - prompt_template: Optional[_TemplateType] = "torchtune.data.ChatMLTemplate", + prompt_template: Optional[_TemplateType] = None, **kwargs, ) -> Qwen2Tokenizer: """ @@ -118,14 +118,27 @@ def qwen2_tokenizer( prompt_template (Optional[_TemplateType]): optional specified prompt template. If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface` class. If a dictionary, it is assumed to be a custom prompt template mapping role to the - prepend/append tags. Default is :class:`~torchtune.models.llama2.Llama2ChatTemplate`. + prepend/append tags. Default is None. Returns: Qwen2Tokenizer: Instantiation of the Qwen2 tokenizer """ - special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None - template = _get_prompt_template(prompt_template) if prompt_template is not None else None - return Qwen2Tokenizer(path=path, merges_file=merges_file, special_tokens=special_tokens, max_seq_len=max_seq_len, prompt_template=template, **kwargs) + special_tokens = ( + parse_hf_tokenizer_json(special_tokens_path) + if special_tokens_path is not None + else QWEN2_SPECIAL_TOKENS + ) + template = ( + _get_prompt_template(prompt_template) if prompt_template is not None else None + ) + return Qwen2Tokenizer( + path=path, + merges_file=merges_file, + special_tokens=special_tokens, + max_seq_len=max_seq_len, + prompt_template=template, + **kwargs, + ) def lora_qwen2_7b( diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py index 952100002e..0e4ee6bd35 100644 --- a/torchtune/models/qwen2/_tokenizer.py +++ b/torchtune/models/qwen2/_tokenizer.py @@ -24,12 +24,11 @@ "<|im_end|>": 151645, } - ENDOFTEXT = "<|endoftext|>" IM_START = "<|im_start|>" IM_END = "<|im_end|>" -DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE = 151646 +DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE = 152064 @lru_cache() @@ -83,7 +82,7 @@ class Qwen2Tokenizer(ModelTokenizer): merges_file (str): Path to merges.txt file. merges.txt contains all BPE merge operations, and this file is required to split a single word into byte-level BPE tokens. - special_tokens (Optional[Dict[str, int]]): Special tokens to add to the tokenizer. Default is None. + special_tokens (Dict[str, int]): Special tokens to add to the tokenizer. 
Default is QWEN2_SPECIAL_TOKENS. max_seq_len (Optional[int]): A max sequence length to truncate tokens to. Default: None prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used @@ -95,7 +94,7 @@ class Qwen2Tokenizer(ModelTokenizer): - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate` The extra text will still get tokenized as normal text, not as special tokens. - Default is :class:`~torchtune.data.ChatMLTemplate`. + Default: None errors (str): Paradigm to follow when decoding bytes to UTF-8. Defaults to "replace". See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. unk_token (Optional[str]): The unknown token. A token that is not in the vocabulary cannot be converted @@ -110,7 +109,8 @@ class Qwen2Tokenizer(ModelTokenizer): By default, we set the cache size equals to size of the official Qwen2 tokenizer. Example: - >>> tokenizer = Qwen2Tokenizer(path="/path/to/vocab.json", merges_file="/path/to/merges.txt") + >>> tokenizer = Qwen2Tokenizer( + path="/path/to/vocab.json", merges_file="/path/to/merges.txt", special_tokens=QWEN2_SPECIAL_TOKENS) >>> tokenized_text = tokenizer.encode("Hello world!") >>> print(tokenized_text) [39, 385, 78, 675, 0, 2000] @@ -120,10 +120,10 @@ def __init__( self, path: str, merges_file: str, - special_tokens: Optional[Dict[str, int]] = None, + special_tokens: Dict[str, int] = QWEN2_SPECIAL_TOKENS, max_seq_len: Optional[int] = None, *, - prompt_template: Optional[PromptTemplate] = ChatMLTemplate(), + prompt_template: Optional[PromptTemplate] = None, errors: str = "replace", unk_token: Optional[str] = ENDOFTEXT, bos_token: Optional[str] = None, @@ -151,9 +151,7 @@ def __init__( self.pat = re.compile(PRETOKENIZE_REGEX) - self.special_tokens = ( - special_tokens if special_tokens is not None else QWEN2_SPECIAL_TOKENS - ) + self.special_tokens = special_tokens self._special_tokens_reversed = {v: k for k, v in self.special_tokens.items()} self.unk_id = None if unk_token is None else self.special_tokens[unk_token] @@ -345,6 +343,10 @@ def tokenize_messages( Raises: RuntimeError: If a message contains non-text content """ + assert not isinstance(self.prompt_template, ChatMLTemplate), ( + "Using ChatMLTemplate with tokenize_messages will result in multiple <|im_*|> tokens wrapping each message." + "Please use a different template or set to None." 
+ ) templated_messages = ( self.prompt_template(messages) if self.prompt_template is not None @@ -355,29 +357,48 @@ def tokenize_messages( mask = [] for index, message in enumerate(templated_messages): tokens = [] + + # message header + if message.role != "ipython": + tokens.append(self.im_start_id) + tokens.extend( + self.encode(f"{message.role}\n", add_bos=False, add_eos=False) + ) + + # message content for item in message.content: if item["type"] == "text": - tokens = tokens + self.encode( - item["content"], - add_bos=False, - add_eos=False, + tokens.extend( + self.encode( + item["content"], + add_bos=False, + add_eos=False, + ) ) else: raise RuntimeError( f"Unsupported message content type: {item['type']}" ) + + # message footer + if message.role != "ipython" and ( + message.role != "assistant" or index != len(messages) - 1 + ): + tokens.append(self.im_end_id) + tokens.extend(self.encode("\n", add_bos=False, add_eos=False)) + tokenized_messages.extend(tokens) mask.extend([message.masked] * len(tokens)) - # If assistant message, append EOS at end - if message.role == "assistant" and add_eos: - tokenized_messages.append(self.eos_id) - mask.append(message.masked) - # Break out early if we reach max_seq_len if self.max_seq_len and len(tokenized_messages) >= self.max_seq_len: break + # Add the End-Of-Sequence token + if add_eos: + tokenized_messages.append(self.eos_id) + mask.append(mask[-1]) + # Finally, truncate if necessary if self.max_seq_len: tokenized_messages = truncate( diff --git a/torchtune/models/qwen2_5/__init__.py b/torchtune/models/qwen2_5/__init__.py new file mode 100644 index 0000000000..50d1e570b8 --- /dev/null +++ b/torchtune/models/qwen2_5/__init__.py @@ -0,0 +1,61 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from ._model_builders import ( + lora_qwen2_5_0_5b, + lora_qwen2_5_14b_base, + lora_qwen2_5_14b_instruct, + lora_qwen2_5_1_5b_base, + lora_qwen2_5_1_5b_instruct, + lora_qwen2_5_32b_base, + lora_qwen2_5_32b_instruct, + lora_qwen2_5_3b, + lora_qwen2_5_72b_base, + lora_qwen2_5_72b_instruct, + lora_qwen2_5_7b_base, + lora_qwen2_5_7b_instruct, + qwen2_5_0_5b, + qwen2_5_14b_base, + qwen2_5_14b_instruct, + qwen2_5_1_5b_base, + qwen2_5_1_5b_instruct, + qwen2_5_32b_base, + qwen2_5_32b_instruct, + qwen2_5_3b, + qwen2_5_72b_base, + qwen2_5_72b_instruct, + qwen2_5_7b_base, + qwen2_5_7b_instruct, + qwen2_5_tokenizer, +) + +__all__ = [ + "lora_qwen2_5_0_5b", + "lora_qwen2_5_14b_base", + "lora_qwen2_5_14b_instruct", + "lora_qwen2_5_1_5b_base", + "lora_qwen2_5_1_5b_instruct", + "lora_qwen2_5_32b_base", + "lora_qwen2_5_32b_instruct", + "lora_qwen2_5_3b", + "lora_qwen2_5_72b_base", + "lora_qwen2_5_72b_instruct", + "lora_qwen2_5_7b_base", + "lora_qwen2_5_7b_instruct", + "qwen2_5_0_5b", + "qwen2_5_14b_base", + "qwen2_5_14b_instruct", + "qwen2_5_1_5b_base", + "qwen2_5_1_5b_instruct", + "qwen2_5_32b_base", + "qwen2_5_32b_instruct", + "qwen2_5_3b", + "qwen2_5_72b_base", + "qwen2_5_72b_instruct", + "qwen2_5_7b_base", + "qwen2_5_7b_instruct", + "qwen2_5_tokenizer", +] diff --git a/torchtune/models/qwen2_5/_model_builders.py b/torchtune/models/qwen2_5/_model_builders.py new file mode 100644 index 0000000000..4474958862 --- /dev/null +++ b/torchtune/models/qwen2_5/_model_builders.py @@ -0,0 +1,1095 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List, Optional + +from torchtune.data._prompt_templates import _get_prompt_template, _TemplateType + +from torchtune.models.qwen2._component_builders import lora_qwen2, qwen2 +from torchtune.models.qwen2_5._tokenizer import QWEN2_5_SPECIAL_TOKENS, Qwen2_5Tokenizer +from torchtune.modules import TransformerDecoder +from torchtune.modules.peft import LORA_ATTN_MODULES +from torchtune.modules.tokenizers import parse_hf_tokenizer_json + +""" +Model builders build specific instantiations using component builders. For example +the qwen2_5_7b model builder uses the qwen2 component builder to create the +Qwen2.5 7B model. +""" + + +def qwen2_5_0_5b() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 model (base or instruct) initialized w/ the default 0.5B parameter values + from https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 0.5B model + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return qwen2( + vocab_size=151936, + num_layers=24, + num_heads=14, + num_kv_heads=2, + embed_dim=896, + intermediate_dim=4864, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + +def qwen2_5_1_5b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 1.5B parameter values + from https://huggingface.co/Qwen/Qwen2.5-1.5B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 1.5B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return qwen2( + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + +def qwen2_5_1_5b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 1.5B parameter values + from https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 1.5B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return qwen2( + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + +def qwen2_5_3b() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 model (base or instruct) initialized w/ the default 3B parameter values + from https://huggingface.co/Qwen/Qwen2.5-3B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 3B model + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. 
+ """ + return qwen2( + vocab_size=151936, + num_layers=36, + num_heads=16, + num_kv_heads=2, + embed_dim=2048, + intermediate_dim=11008, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + +def qwen2_5_7b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 7B parameter values + from https://huggingface.co/Qwen/Qwen2.5-7B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 7B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_7b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 7B parameter values + from https://huggingface.co/Qwen/Qwen2.5-7B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 7B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_14b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 14B parameter values + from https://huggingface.co/Qwen/Qwen2.5-14B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 14B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=48, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=13824, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + ) + + +def qwen2_5_14b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 14B parameter values + from https://huggingface.co/Qwen/Qwen2.5-14B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 14B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=48, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=13824, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_32b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 32B parameter values + from https://huggingface.co/Qwen/Qwen2.5-32B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 32B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. 
+ """ + return qwen2( + vocab_size=152064, + num_layers=64, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=27648, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + ) + + +def qwen2_5_32b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 32B parameter values + from https://huggingface.co/Qwen/Qwen2.5-32B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 32B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=64, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=27648, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_72b_base() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 base model initialized w/ the default 72B parameter values + from https://huggingface.co/Qwen/Qwen2.5-72B + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 72B model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=80, + num_heads=64, + num_kv_heads=8, + embed_dim=8192, + intermediate_dim=29568, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + ) + + +def qwen2_5_72b_instruct() -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 instruct model initialized w/ the default 72B parameter values + from https://huggingface.co/Qwen/Qwen2.5-72B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 72B instruct model + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return qwen2( + vocab_size=152064, + num_layers=80, + num_heads=64, + num_kv_heads=8, + embed_dim=8192, + intermediate_dim=29568, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + ) + + +def qwen2_5_tokenizer( + path: str, + merges_file: str, + special_tokens_path: Optional[str] = None, + max_seq_len: Optional[int] = None, + prompt_template: Optional[_TemplateType] = None, + **kwargs, +) -> Qwen2_5Tokenizer: + """ + Tokenizer for Qwen2.5. + + Args: + path (str): path to the vocab.json file. + merges_file (str): path to the merges.txt file. + special_tokens_path (Optional[str]): Path to ``tokenizer.json`` from Hugging Face + model files that contains all registered special tokens, or a local json file + structured similarly. Default is None to use the canonical Qwen2.5 special tokens. + max_seq_len (Optional[int]): A max sequence length to truncate tokens to. + Default: None + prompt_template (Optional[_TemplateType]): optional specified prompt template. + If a string, it is assumed to be the dotpath of a :class:`~torchtune.data.PromptTemplateInterface` + class. If a dictionary, it is assumed to be a custom prompt template mapping role to the + prepend/append tags. + Default is None. 
+ + Returns: + Qwen2_5Tokenizer: Instantiation of the Qwen2.5 tokenizer + """ + special_tokens = ( + QWEN2_5_SPECIAL_TOKENS + if special_tokens_path is None + else parse_hf_tokenizer_json(special_tokens_path) + ) + + if prompt_template is not None: + prompt_template = _get_prompt_template(prompt_template) + + return Qwen2_5Tokenizer( + path=path, + merges_file=merges_file, + special_tokens=special_tokens, + max_seq_len=max_seq_len, + prompt_template=prompt_template, + **kwargs, + ) + + +def lora_qwen2_5_0_5b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 0.5B model (base or instruct) with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_0_5b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 0.5B model with LoRA applied + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=24, + num_heads=14, + num_kv_heads=2, + embed_dim=896, + intermediate_dim=4864, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_1_5b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 1.5B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_1_5b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. 
Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 1.5B model with LoRA applied + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_1_5b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 1.5B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_1_5b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 1.5B model with LoRA applied + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_3b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 3B model (base or instruct) with LoRA enabled. 
+ + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_3b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 3B model with LoRA applied + + Note: + Qwen2.5 0.5B-3B model builders will enable `tie_word_embeddings` by default. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=36, + num_heads=16, + num_kv_heads=2, + embed_dim=2048, + intermediate_dim=11008, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_7b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 7B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_7b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 7B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. 
+ """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_7b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 7B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_7b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 7B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_14b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 14B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_14b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. 
+ Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 14B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=48, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=13824, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_14b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 14B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_14b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 14B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. 
+ """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=48, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=13824, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_32b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 32B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_32b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 32B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=64, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=27648, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_32b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 32B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_32b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. 
+ Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 32B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=64, + num_heads=40, + num_kv_heads=8, + embed_dim=5120, + intermediate_dim=27648, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_72b_base( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 72B base model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_72b_base`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 72B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. 
+ """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=80, + num_heads=64, + num_kv_heads=8, + embed_dim=8192, + intermediate_dim=29568, + max_seq_len=131072, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) + + +def lora_qwen2_5_72b_instruct( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.0, + use_dora: bool = False, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Qwen2.5 72B instruct model with LoRA enabled. + + The Qwen2.5 defaults are the same as in :func:`~torchtune.models.qwen2_5.qwen2_5_72b_instruct`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2.5 72B model with LoRA applied + + Note: + The base and instruct versions have slightly different architectures for all Qwen2.5 model sizes + except 0.5B and 3B. Make sure to select the correct model builder for the weights. + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=80, + num_heads=64, + num_kv_heads=8, + embed_dim=8192, + intermediate_dim=29568, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + use_dora=use_dora, + quantize_base=quantize_base, + ) diff --git a/torchtune/models/qwen2_5/_tokenizer.py b/torchtune/models/qwen2_5/_tokenizer.py new file mode 100644 index 0000000000..2d3eb1d01a --- /dev/null +++ b/torchtune/models/qwen2_5/_tokenizer.py @@ -0,0 +1,242 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+
+from typing import Dict, List, Optional, Tuple
+
+from torchtune.data import ChatMLTemplate, Message, PromptTemplate, truncate
+from torchtune.models.qwen2._tokenizer import (
+    DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE,
+    ENDOFTEXT,
+    QWEN2_SPECIAL_TOKENS,
+    Qwen2Tokenizer,
+)
+
+
+QWEN2_5_SPECIAL_TOKENS = {
+    **QWEN2_SPECIAL_TOKENS,
+    "<|object_ref_start|>": 151646,
+    "<|object_ref_end|>": 151647,
+    "<|box_start|>": 151648,
+    "<|box_end|>": 151649,
+    "<|quad_start|>": 151650,
+    "<|quad_end|>": 151651,
+    "<|vision_start|>": 151652,
+    "<|vision_end|>": 151653,
+    "<|vision_pad|>": 151654,
+    "<|image_pad|>": 151655,
+    "<|video_pad|>": 151656,
+    "<tool_call>": 151657,
+    "</tool_call>": 151658,
+    "<|fim_prefix|>": 151659,
+    "<|fim_middle|>": 151660,
+    "<|fim_suffix|>": 151661,
+    "<|fim_pad|>": 151662,
+    "<|repo_name|>": 151663,
+    "<|file_sep|>": 151664,
+}
+
+
+class Qwen2_5Tokenizer(Qwen2Tokenizer):  # noqa: N801
+    """This class constructs a Qwen2.5 tokenizer, based on GPT-2 byte-level BPE tokenization.
+
+    See
+    and .
+
+    Args:
+        path (str): Path to vocab.json file.
+        merges_file (str): Path to merges.txt file.
+            merges.txt contains all BPE merge operations, and this file is required to split a single word into
+            byte-level BPE tokens.
+        special_tokens (Dict[str, int]): Special tokens to add to the tokenizer. Default is QWEN2_5_SPECIAL_TOKENS.
+        max_seq_len (Optional[int]): A max sequence length to truncate tokens to.
+            Default: None
+        prompt_template (Optional[PromptTemplate]): template used to format the messages based on their role. This is used
+            to add structured text around the actual messages. The structured text is used in three scenarios:
+
+            - Task-specific templates to gear models for a particular task that it will expect after training
+            - Model-specific templates that are required whenever the model is prompted, such as the [INST]
+              tags in Llama2 and in Mistral
+            - Community standardized templates, such as :class:`~torchtune.data.ChatMLTemplate`
+
+            The extra text will still get tokenized as normal text, not as special tokens.
+            Default: None
+        errors (str): Paradigm to follow when decoding bytes to UTF-8. Defaults to "replace".
+            See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (Optional[str]): The unknown token. A token that is not in the vocabulary cannot be converted
+            to an ID and is set to be this token instead. Defaults to ``<|endoftext|>``.
+        bos_token (Optional[str]): The beginning of sequence token. Defaults to None.
+        eos_token (str): The end of sequence token. Defaults to ``<|endoftext|>``.
+        pad_token (Optional[str]): The token used for padding. Defaults to ``<|endoftext|>``.
+        bpe_cache_size (int): BPE token cache size in Qwen2Tokenizer.
+            NOTE: a large cache size will speed up tokenization, but the cache object will get really
+            large for long-running processes (esp. for texts in languages that do not use spaces between
+            words, e.g. Chinese); technically not a memory leak, but it appears as one.
+            By default, we set the cache size equal to the size of the official Qwen2 tokenizer.
+
+    Example:
+        >>> tokenizer = Qwen2_5Tokenizer(
+        ...     path="/path/to/vocab.json",
+        ...     merges_file="/path/to/merges.txt",
+        ...     special_tokens=QWEN2_5_SPECIAL_TOKENS,
+        ... )
+        >>> tokenized_text = tokenizer.encode("Hello world!")
+        >>> print(tokenized_text)
+        [39, 385, 78, 675, 0, 2000]
+    """
+
+    def __init__(
+        self,
+        path: str,
+        merges_file: str,
+        special_tokens: Dict[str, int] = QWEN2_5_SPECIAL_TOKENS,
+        max_seq_len: Optional[int] = None,
+        *,
+        prompt_template: Optional[PromptTemplate] = None,
+        errors: str = "replace",
+        unk_token: Optional[str] = ENDOFTEXT,
+        bos_token: Optional[str] = None,
+        eos_token: str = ENDOFTEXT,
+        pad_token: Optional[str] = ENDOFTEXT,
+        bpe_cache_size: int = DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE,
+    ):
+        super().__init__(
+            path=path,
+            merges_file=merges_file,
+            special_tokens=special_tokens,
+            max_seq_len=max_seq_len,
+            prompt_template=prompt_template,
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            bpe_cache_size=bpe_cache_size,
+        )
+
+        # Token ids used to wrap assistant tool calls in <tool_call> ... </tool_call>
+        self.tool_call_start_id = self.special_tokens["<tool_call>"]
+        self.tool_call_end_id = self.special_tokens["</tool_call>"]
+
+    def tokenize_messages(
+        self,
+        messages: List[Message],
+        *,
+        add_eos: bool = True,
+    ) -> Tuple[List[int], List[bool]]:
+        """
+        Given a list of messages, return a list of tokens for the concatenated
+        and formatted messages.
+
+        Args:
+            messages (List[Message]): The message list to tokenize.
+            add_eos (bool): Whether to add the tokenizer's eos_id at the end of the
+                sequence of messages. Default is True.
+
+        Returns:
+            Tuple[List[int], List[bool]]: The list of token ids and the list of masks.
+
+        Raises:
+            RuntimeError: If a message contains non-text content
+        """
+        assert not isinstance(self.prompt_template, ChatMLTemplate), (
+            "Using ChatMLTemplate with tokenize_messages will result in multiple <|im_*|> tokens wrapping each message. "
+            "Please use a different template or set it to None."
+        )
+        templated_messages = (
+            self.prompt_template(messages)
+            if self.prompt_template is not None
+            else messages
+        )
+
+        tokenized_messages = []
+        mask = []
+        for i, message in enumerate(templated_messages):
+            # message header
+            tokens = self._tokenize_header(templated_messages, i)
+
+            # message content
+            for item in message.content:
+                if item["type"] == "text":
+                    tokens.extend(
+                        self.encode(
+                            item["content"],
+                            add_bos=False,
+                            add_eos=False,
+                        )
+                    )
+                else:
+                    raise RuntimeError(
+                        f"Unsupported message content type: {item['type']}"
+                    )
+
+            # message footer
+            tokens.extend(self._tokenize_footer(templated_messages, i))
+
+            tokenized_messages.extend(tokens)
+            mask.extend([message.masked] * len(tokens))
+
+            # Break out early if we reach max_seq_len
+            if self.max_seq_len and len(tokenized_messages) >= self.max_seq_len:
+                break
+
+        # Add the End-Of-Sequence token
+        if add_eos:
+            tokenized_messages.append(self.eos_id)
+            mask.append(mask[-1])
+
+        # Finally, truncate if necessary
+        if self.max_seq_len:
+            tokenized_messages = truncate(
+                tokenized_messages, self.max_seq_len, self.eos_id if add_eos else None
+            )
+            mask = truncate(mask, self.max_seq_len, True if add_eos else None)
+
+        return tokenized_messages, mask
+
+    def _tokenize_header(self, messages, i):
+        tokens = []
+        message = messages[i]
+        if message.role == "ipython":
+            if i == 0 or messages[i - 1].role != "ipython":
+                # only add the "user" header if this is the first tool response message
+                self._add_message_start_tokens(tokens, "user")
+                tokens.extend(
+                    self.encode("<tool_response>\n", add_bos=False, add_eos=False)
+                )
+            else:
+                tokens.extend(
+                    self.encode("\n<tool_response>\n", add_bos=False, add_eos=False)
+                )
+        else:
+            self._add_message_start_tokens(tokens, message.role)
+            if message.role == "assistant" and message.ipython:
+                # open the tool call block for assistant tool-call messages
+                tokens.append(self.tool_call_start_id)
+                tokens.extend(self.encode("\n", add_bos=False, add_eos=False))
+        return tokens
+
+    def _tokenize_footer(self, messages, i):
+        tokens = []
+        message = messages[i]
+        if message.role == "ipython":
+            # close the tool response; only add <|im_end|> after the last
+            # message in a run of consecutive tool responses
+            if i == len(messages) - 1 or messages[i + 1].role != "ipython":
+                tokens.extend(
+                    self.encode("\n</tool_response>", add_bos=False, add_eos=False)
+                )
+                self._add_message_end_tokens(tokens)
+            else:
+                tokens.extend(
+                    self.encode("\n</tool_response>", add_bos=False, add_eos=False)
+                )
+        else:
+            if message.role == "assistant" and message.ipython:
+                tokens.extend(self.encode("\n", add_bos=False, add_eos=False))
+                tokens.append(self.tool_call_end_id)
+            if message.role != "assistant" or i != len(messages) - 1:
+                self._add_message_end_tokens(tokens)
+        return tokens
+
+    def _add_message_start_tokens(self, tokens, role):
+        tokens.append(self.im_start_id)
+        tokens.extend(self.encode(f"{role}\n", add_bos=False, add_eos=False))
+
+    def _add_message_end_tokens(self, tokens):
+        tokens.append(self.im_end_id)
+        tokens.extend(self.encode("\n", add_bos=False, add_eos=False))
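
A minimal, hypothetical usage sketch of the tokenizer defined above: the vocab/merges paths are placeholders (as in the docstring example), the tool-call and tool-response payloads are invented for illustration, and decode() is the method inherited from Qwen2Tokenizer.

    from torchtune.data import Message
    from torchtune.models.qwen2_5._tokenizer import Qwen2_5Tokenizer

    # Placeholder paths, matching the docstring example above.
    tokenizer = Qwen2_5Tokenizer(
        path="/path/to/vocab.json",
        merges_file="/path/to/merges.txt",
    )

    # An illustrative tool-calling exchange: the assistant message flagged with
    # ipython=True is wrapped in <tool_call>...</tool_call>, and the "ipython"
    # (tool result) message is wrapped in <tool_response>...</tool_response>
    # under a "user" header, mirroring the Qwen2.5 chat format.
    messages = [
        Message(role="user", content="What is 2 + 2?", masked=True),
        Message(
            role="assistant",
            content='{"name": "add", "arguments": {"a": 2, "b": 2}}',
            ipython=True,
        ),
        Message(role="ipython", content='{"result": 4}', masked=True),
        Message(role="assistant", content="2 + 2 = 4."),
    ]

    # Token ids plus a parallel mask that is True for tokens from masked messages.
    tokens, mask = tokenizer.tokenize_messages(messages)
    assert len(tokens) == len(mask)
    print(tokenizer.decode(tokens))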