From 3978d15b4e158583bfe1537d6624f622d7cfc472 Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Sun, 20 Oct 2024 14:23:38 -0400
Subject: [PATCH 01/10] add required arguments in configs

---
 recipes/configs/code_llama2/7B_full_low_memory.yaml        | 4 +++-
 recipes/configs/code_llama2/7B_lora_single_device.yaml     | 4 +++-
 recipes/configs/code_llama2/7B_qlora_single_device.yaml    | 3 ++-
 recipes/configs/dev/8B_full_experimental.yaml              | 4 +++-
 recipes/configs/gemma/2B_full.yaml                         | 4 +++-
 recipes/configs/gemma/2B_lora.yaml                         | 4 +++-
 recipes/configs/gemma/2B_lora_single_device.yaml           | 3 ++-
 recipes/configs/gemma/2B_qlora_single_device.yaml          | 3 ++-
 recipes/configs/gemma/7B_full.yaml                         | 4 +++-
 recipes/configs/gemma/7B_lora.yaml                         | 4 +++-
 recipes/configs/gemma/7B_lora_single_device.yaml           | 3 ++-
 recipes/configs/gemma/7B_qlora_single_device.yaml          | 3 ++-
 recipes/configs/llama2/13B_full.yaml                       | 4 +++-
 recipes/configs/llama2/13B_lora.yaml                       | 4 +++-
 recipes/configs/llama2/13B_qlora_single_device.yaml        | 3 ++-
 recipes/configs/llama2/70B_lora.yaml                       | 3 ++-
 recipes/configs/llama2/70B_qlora.yaml                      | 3 ++-
 recipes/configs/llama2/7B_full.yaml                        | 5 +++--
 recipes/configs/llama2/7B_full_low_memory.yaml             | 3 ++-
 recipes/configs/llama2/7B_lora.yaml                        | 7 ++++---
 recipes/configs/llama2/7B_lora_dpo.yaml                    | 4 +++-
 recipes/configs/llama2/7B_lora_dpo_single_device.yaml      | 3 ++-
 recipes/configs/llama2/7B_lora_single_device.yaml          | 3 ++-
 recipes/configs/llama2/7B_qat_full.yaml                    | 4 +++-
 recipes/configs/llama2/7B_qlora.yaml                       | 3 ++-
 recipes/configs/llama2/7B_qlora_single_device.yaml         | 3 ++-
 recipes/configs/llama3/70B_full.yaml                       | 3 ++-
 recipes/configs/llama3/70B_lora.yaml                       | 3 ++-
 recipes/configs/llama3/8B_dora.yaml                        | 4 +++-
 recipes/configs/llama3/8B_dora_single_device.yaml          | 3 ++-
 recipes/configs/llama3/8B_full.yaml                        | 5 +++--
 recipes/configs/llama3/8B_full_single_device.yaml          | 3 ++-
 recipes/configs/llama3/8B_lora.yaml                        | 4 +++-
 recipes/configs/llama3/8B_lora_single_device.yaml          | 7 ++++---
 recipes/configs/llama3/8B_qat_full.yaml                    | 4 +++-
 recipes/configs/llama3/8B_qdora_single_device.yaml         | 3 ++-
 recipes/configs/llama3/8B_qlora_single_device.yaml         | 3 ++-
 recipes/configs/llama3_1/405B_qlora.yaml                   | 1 +
 recipes/configs/llama3_1/70B_full.yaml                     | 3 ++-
 recipes/configs/llama3_1/70B_lora.yaml                     | 3 ++-
 recipes/configs/llama3_1/8B_full.yaml                      | 5 +++--
 recipes/configs/llama3_1/8B_full_single_device.yaml        | 3 ++-
 recipes/configs/llama3_1/8B_lora.yaml                      | 3 ++-
 recipes/configs/llama3_1/8B_lora_single_device.yaml        | 3 ++-
 recipes/configs/llama3_1/8B_qlora_single_device.yaml       | 3 ++-
 recipes/configs/llama3_2/1B_full.yaml                      | 3 ++-
 recipes/configs/llama3_2/1B_full_single_device.yaml        | 3 ++-
 recipes/configs/llama3_2/1B_lora.yaml                      | 3 ++-
 recipes/configs/llama3_2/1B_lora_single_device.yaml        | 3 ++-
 recipes/configs/llama3_2/1B_qlora_single_device.yaml       | 3 ++-
 recipes/configs/llama3_2/3B_full.yaml                      | 3 ++-
 recipes/configs/llama3_2/3B_full_single_device.yaml        | 3 ++-
 recipes/configs/llama3_2/3B_lora.yaml                      | 3 ++-
 recipes/configs/llama3_2/3B_lora_single_device.yaml        | 3 ++-
 recipes/configs/llama3_2/3B_qlora_single_device.yaml       | 3 ++-
 .../llama3_2/knowledge_distillation_single_device.yaml     | 4 +++-
 recipes/configs/llama3_2_vision/11B_full.yaml              | 3 ++-
 .../configs/llama3_2_vision/11B_full_single_device.yaml    | 3 ++-
 recipes/configs/llama3_2_vision/11B_lora.yaml              | 3 ++-
 .../configs/llama3_2_vision/11B_lora_single_device.yaml    | 3 ++-
 recipes/configs/mistral/7B_full.yaml                       | 4 +++-
 recipes/configs/mistral/7B_full_low_memory.yaml            | 3 ++-
 recipes/configs/mistral/7B_full_ppo_low_memory.yaml        | 3 ++-
 recipes/configs/mistral/7B_lora.yaml                       | 4 +++-
 recipes/configs/mistral/7B_lora_single_device.yaml         | 3 ++-
 recipes/configs/mistral/7B_qlora_single_device.yaml        | 3 ++-
 recipes/configs/phi3/mini_full.yaml                        | 4 +++-
 recipes/configs/phi3/mini_full_low_memory.yaml             | 3 ++-
 recipes/configs/phi3/mini_lora.yaml                        | 4 +++-
 recipes/configs/phi3/mini_lora_single_device.yaml          | 3 ++-
 recipes/configs/phi3/mini_qlora_single_device.yaml         | 3 ++-
 recipes/configs/qwen2/0.5B_full.yaml                       | 5 +++--
 recipes/configs/qwen2/0.5B_full_single_device.yaml         | 3 ++-
 recipes/configs/qwen2/0.5B_lora.yaml                       | 4 +++-
 recipes/configs/qwen2/0.5B_lora_single_device.yaml         | 3 ++-
 recipes/configs/qwen2/1.5B_full.yaml                       | 5 +++--
 recipes/configs/qwen2/1.5B_full_single_device.yaml         | 3 ++-
 recipes/configs/qwen2/1.5B_lora.yaml                       | 4 +++-
 recipes/configs/qwen2/1.5B_lora_single_device.yaml         | 3 ++-
 recipes/configs/qwen2/7B_full.yaml                         | 5 +++--
 recipes/configs/qwen2/7B_full_single_device.yaml           | 3 ++-
 recipes/configs/qwen2/7B_lora.yaml                         | 4 +++-
 recipes/configs/qwen2/7B_lora_single_device.yaml           | 3 ++-
 .../qwen2/knowledge_distillation_single_device.yaml        | 4 +++-
 recipes/full_finetune_distributed.py                       | 3 ++-
 recipes/full_finetune_single_device.py                     | 3 ++-
 recipes/knowledge_distillation_single_device.py            | 3 ++-
 recipes/lora_dpo_distributed.py                            | 3 ++-
 recipes/lora_dpo_single_device.py                          | 3 ++-
 recipes/lora_finetune_distributed.py                       | 3 ++-
 recipes/lora_finetune_single_device.py                     | 3 ++-
 recipes/ppo_full_finetune_single_device.py                 | 3 ++-
 recipes/qat_distributed.py                                 | 3 ++-
 93 files changed, 218 insertions(+), 102 deletions(-)

diff --git a/recipes/configs/code_llama2/7B_full_low_memory.yaml b/recipes/configs/code_llama2/7B_full_low_memory.yaml
index 6bca6c378f..bae760c67e 100644
--- a/recipes/configs/code_llama2/7B_full_low_memory.yaml
+++ b/recipes/configs/code_llama2/7B_full_low_memory.yaml
@@ -45,7 +45,9 @@ resume_from_checkpoint: False
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
+
 seed: null
 shuffle: True
 
@@ -75,4 +77,4 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/CodeLlama-7b-hf/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/code_llama2/7B_lora_single_device.yaml b/recipes/configs/code_llama2/7B_lora_single_device.yaml
index 263e3c12e1..1ada63446b 100644
--- a/recipes/configs/code_llama2/7B_lora_single_device.yaml
+++ b/recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -49,7 +49,9 @@ save_adapter_weights_only: False
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
+
 seed: null
 shuffle: True
 
@@ -84,7 +86,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/CodeLlama-7b-hf/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Showcase the usage of PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/code_llama2/7B_qlora_single_device.yaml b/recipes/configs/code_llama2/7B_qlora_single_device.yaml
index 4f6fd9be61..e7910d73cc 100644
--- a/recipes/configs/code_llama2/7B_qlora_single_device.yaml
+++ b/recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -49,6 +49,7 @@ save_adapter_weights_only: False
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -84,7 +85,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/CodeLlama-7b-hf/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/dev/8B_full_experimental.yaml b/recipes/configs/dev/8B_full_experimental.yaml
index 4ed8a80e09..ee1e0f650c 100644
--- a/recipes/configs/dev/8B_full_experimental.yaml
+++ b/recipes/configs/dev/8B_full_experimental.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -57,7 +58,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-
+compile: False
 
 # Training env
 device: cuda
@@ -78,3 +79,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-llama3-finetune
 log_every_n_steps: null
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma/2B_full.yaml b/recipes/configs/gemma/2B_full.yaml
index e1bd3272d0..a3b8ed59f7 100644
--- a/recipes/configs/gemma/2B_full.yaml
+++ b/recipes/configs/gemma/2B_full.yaml
@@ -23,6 +23,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -54,6 +55,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # Training env
 device: cuda
@@ -70,4 +72,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma/2B_lora.yaml b/recipes/configs/gemma/2B_lora.yaml
index b82faa39e2..8ed92dd115 100644
--- a/recipes/configs/gemma/2B_lora.yaml
+++ b/recipes/configs/gemma/2B_lora.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -66,6 +67,7 @@ batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # Training env
 device: cuda
@@ -82,4 +84,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma/2B_lora_single_device.yaml b/recipes/configs/gemma/2B_lora_single_device.yaml
index d6e1664b71..b661710caf 100644
--- a/recipes/configs/gemma/2B_lora_single_device.yaml
+++ b/recipes/configs/gemma/2B_lora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -83,7 +84,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/gemma/2B_qlora_single_device.yaml b/recipes/configs/gemma/2B_qlora_single_device.yaml
index 9b24d6c0ee..2b5cbf96bb 100644
--- a/recipes/configs/gemma/2B_qlora_single_device.yaml
+++ b/recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -83,7 +84,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/gemma/7B_full.yaml b/recipes/configs/gemma/7B_full.yaml
index a8924836fe..eb6b8c9426 100644
--- a/recipes/configs/gemma/7B_full.yaml
+++ b/recipes/configs/gemma/7B_full.yaml
@@ -23,6 +23,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -56,6 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # Training env
 device: cuda
@@ -72,4 +74,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma/7B_lora.yaml b/recipes/configs/gemma/7B_lora.yaml
index 6db9b0ab82..4d74f93671 100644
--- a/recipes/configs/gemma/7B_lora.yaml
+++ b/recipes/configs/gemma/7B_lora.yaml
@@ -23,6 +23,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -68,6 +69,7 @@ batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # Training env
 device: cuda
@@ -84,4 +86,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/gemma/7B_lora_single_device.yaml b/recipes/configs/gemma/7B_lora_single_device.yaml
index c82f0b76ba..369ba715e5 100644
--- a/recipes/configs/gemma/7B_lora_single_device.yaml
+++ b/recipes/configs/gemma/7B_lora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -85,7 +86,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/gemma/7B_qlora_single_device.yaml b/recipes/configs/gemma/7B_qlora_single_device.yaml
index fcbccb786b..301a7b4a5d 100644
--- a/recipes/configs/gemma/7B_qlora_single_device.yaml
+++ b/recipes/configs/gemma/7B_qlora_single_device.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -85,7 +86,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-gemma-lora
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/llama2/13B_full.yaml b/recipes/configs/llama2/13B_full.yaml
index f5ecffc2ab..be5a4e8b1d 100644
--- a/recipes/configs/llama2/13B_full.yaml
+++ b/recipes/configs/llama2/13B_full.yaml
@@ -43,6 +43,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -58,6 +59,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # Training env
 device: cuda
@@ -74,4 +76,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-llama2-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml
index d657754139..797abc2a63 100644
--- a/recipes/configs/llama2/13B_lora.yaml
+++ b/recipes/configs/llama2/13B_lora.yaml
@@ -52,6 +52,7 @@ tokenizer:
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -74,6 +75,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 16
+compile: False
 
 # Logging
 output_dir: /tmp/lora_finetune_output
@@ -81,7 +83,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama2/13B_qlora_single_device.yaml b/recipes/configs/llama2/13B_qlora_single_device.yaml
index 56431fdff5..9e8faaa800 100644
--- a/recipes/configs/llama2/13B_qlora_single_device.yaml
+++ b/recipes/configs/llama2/13B_qlora_single_device.yaml
@@ -47,6 +47,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -77,7 +78,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama2/70B_lora.yaml b/recipes/configs/llama2/70B_lora.yaml
index b4d0d9c9a9..9502690be2 100644
--- a/recipes/configs/llama2/70B_lora.yaml
+++ b/recipes/configs/llama2/70B_lora.yaml
@@ -52,6 +52,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -81,7 +82,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama2/70B_qlora.yaml b/recipes/configs/llama2/70B_qlora.yaml
index c1de2c2358..c0e2e320f3 100644
--- a/recipes/configs/llama2/70B_qlora.yaml
+++ b/recipes/configs/llama2/70B_qlora.yaml
@@ -57,6 +57,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
   train_on_input: True
 seed: null
@@ -91,7 +92,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml
index 2e80276c84..3a6e3c35f2 100644
--- a/recipes/configs/llama2/7B_full.yaml
+++ b/recipes/configs/llama2/7B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -57,7 +58,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-
+compile: False
 
 # Training env
 device: cuda
@@ -74,4 +75,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-llama2-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama2/7B_full_low_memory.yaml b/recipes/configs/llama2/7B_full_low_memory.yaml
index 06558009ed..b9b933c2df 100644
--- a/recipes/configs/llama2/7B_full_low_memory.yaml
+++ b/recipes/configs/llama2/7B_full_low_memory.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -79,4 +80,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-llama2-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml
index 2c9a694d7b..82276fa317 100644
--- a/recipes/configs/llama2/7B_lora.yaml
+++ b/recipes/configs/llama2/7B_lora.yaml
@@ -49,6 +49,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -78,7 +79,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
@@ -92,14 +93,14 @@ profiler:
 
   enabled: False
 
-  #Output directory of trace artifacts
+  # Output directory of trace artifacts
   output_dir: ${output_dir}/profiling_outputs
 
   #`torch.profiler.ProfilerActivity` types to trace
   cpu: True
   cuda: True
 
-  #trace options passed to `torch.profiler.profile`
+  # trace options passed to `torch.profiler.profile`
   profile_memory: False
   with_stack: False
   record_shapes: True
diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml
index 26f824814f..1a870956ff 100644
--- a/recipes/configs/llama2/7B_lora_dpo.yaml
+++ b/recipes/configs/llama2/7B_lora_dpo.yaml
@@ -46,6 +46,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # NOTE(review): confirm stack_exchange_paired_dataset accepts `packed` before enabling
   _component_: torchtune.datasets.stack_exchange_paired_dataset
 seed: null
 shuffle: True
@@ -70,6 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: 1000
 gradient_accumulation_steps: 8
+compile: False
 
 # Logging
 output_dir: /tmp/lora_dpo_output/
@@ -77,7 +79,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
index 2ad3988867..408e28a7be 100644
--- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
+++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
@@ -45,6 +45,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # NOTE(review): confirm stack_exchange_paired_dataset accepts `packed` before enabling
   _component_: torchtune.datasets.stack_exchange_paired_dataset
 seed: null
 shuffle: True
@@ -75,7 +76,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml
index ebaee584c2..a1c001b868 100644
--- a/recipes/configs/llama2/7B_lora_single_device.yaml
+++ b/recipes/configs/llama2/7B_lora_single_device.yaml
@@ -47,6 +47,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -77,7 +78,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama2/7B_qat_full.yaml b/recipes/configs/llama2/7B_qat_full.yaml
index 6fca6c4d4a..d1a408aca5 100644
--- a/recipes/configs/llama2/7B_qat_full.yaml
+++ b/recipes/configs/llama2/7B_qat_full.yaml
@@ -22,6 +22,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -53,6 +54,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # QAT arguments
 quantizer:
@@ -75,4 +77,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-llama2-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml
index 052cdb9296..26fc4faf11 100644
--- a/recipes/configs/llama2/7B_qlora.yaml
+++ b/recipes/configs/llama2/7B_qlora.yaml
@@ -48,6 +48,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
   train_on_input: True
 seed: null
@@ -82,7 +83,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml
index 0893f48579..611c5b155b 100644
--- a/recipes/configs/llama2/7B_qlora_single_device.yaml
+++ b/recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -46,6 +46,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -76,7 +77,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml
index a8b7ba619c..64e678b910 100644
--- a/recipes/configs/llama3/70B_full.yaml
+++ b/recipes/configs/llama3/70B_full.yaml
@@ -30,6 +30,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -110,4 +111,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml
index f3a921f289..baac18bedd 100644
--- a/recipes/configs/llama3/70B_lora.yaml
+++ b/recipes/configs/llama3/70B_lora.yaml
@@ -67,6 +67,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -97,7 +98,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3/8B_dora.yaml b/recipes/configs/llama3/8B_dora.yaml
index 1265c82c72..a9ea97986e 100644
--- a/recipes/configs/llama3/8B_dora.yaml
+++ b/recipes/configs/llama3/8B_dora.yaml
@@ -42,6 +42,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -64,6 +65,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # Logging
 output_dir: /tmp/dora_finetune_output
@@ -71,7 +73,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3/8B_dora_single_device.yaml b/recipes/configs/llama3/8B_dora_single_device.yaml
index 0fc0a484dc..188b54f757 100644
--- a/recipes/configs/llama3/8B_dora_single_device.yaml
+++ b/recipes/configs/llama3/8B_dora_single_device.yaml
@@ -44,6 +44,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -74,7 +75,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3/8B_full.yaml b/recipes/configs/llama3/8B_full.yaml
index 7f24376db7..baa4a79417 100644
--- a/recipes/configs/llama3/8B_full.yaml
+++ b/recipes/configs/llama3/8B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -57,7 +58,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-
+compile: False
 
 # Training env
 device: cuda
@@ -75,4 +76,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3/8B_full_single_device.yaml b/recipes/configs/llama3/8B_full_single_device.yaml
index cd3e3586ce..6b8e1ad4b8 100644
--- a/recipes/configs/llama3/8B_full_single_device.yaml
+++ b/recipes/configs/llama3/8B_full_single_device.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -78,4 +79,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml
index d65138f348..69a2349035 100644
--- a/recipes/configs/llama3/8B_lora.yaml
+++ b/recipes/configs/llama3/8B_lora.yaml
@@ -47,6 +47,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -69,6 +70,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 32
+compile: False
 
 # Logging
 output_dir: /tmp/lora_finetune_output
@@ -76,7 +78,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml
index e49afacbb1..661bbe86db 100644
--- a/recipes/configs/llama3/8B_lora_single_device.yaml
+++ b/recipes/configs/llama3/8B_lora_single_device.yaml
@@ -46,6 +46,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False  # Set to True for significant speedups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -76,7 +77,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
@@ -91,14 +92,14 @@ profiler:
   _component_: torchtune.training.setup_torch_profiler
   enabled: False
 
-  #Output directory of trace artifacts
+  # Output directory of trace artifacts
   output_dir: ${output_dir}/profiling_outputs
 
-  #`torch.profiler.ProfilerActivity` types to trace
+  # `torch.profiler.ProfilerActivity` types to trace
   cpu: True
   cuda: True
 
-  #trace options passed to `torch.profiler.profile`
+  # trace options passed to `torch.profiler.profile`
   profile_memory: False
   with_stack: False
   record_shapes: True
diff --git a/recipes/configs/llama3/8B_qat_full.yaml b/recipes/configs/llama3/8B_qat_full.yaml
index ff4d9c3195..07461e8243 100644
--- a/recipes/configs/llama3/8B_qat_full.yaml
+++ b/recipes/configs/llama3/8B_qat_full.yaml
@@ -21,6 +21,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -43,6 +44,7 @@ resume_from_checkpoint: False
 # Fine-tuning arguments
 batch_size: 2
 epochs: 3
+compile: False
 
 # QAT arguments
 quantizer:
@@ -74,4 +76,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/alpaca-llama3-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3/8B_qdora_single_device.yaml b/recipes/configs/llama3/8B_qdora_single_device.yaml
index 7180c5a72c..fafda9a123 100644
--- a/recipes/configs/llama3/8B_qdora_single_device.yaml
+++ b/recipes/configs/llama3/8B_qdora_single_device.yaml
@@ -45,6 +45,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -75,7 +76,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml
index 1eef476d17..83c0dcb9d1 100644
--- a/recipes/configs/llama3/8B_qlora_single_device.yaml
+++ b/recipes/configs/llama3/8B_qlora_single_device.yaml
@@ -45,6 +45,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -75,7 +76,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_1/405B_qlora.yaml b/recipes/configs/llama3_1/405B_qlora.yaml
index 6398a840ec..421f2bb4c2 100644
--- a/recipes/configs/llama3_1/405B_qlora.yaml
+++ b/recipes/configs/llama3_1/405B_qlora.yaml
@@ -45,6 +45,7 @@ save_adapter_weights_only: True
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
   train_on_input: True
 seed: null
diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml
index fcae062999..654b86c6e6 100644
--- a/recipes/configs/llama3_1/70B_full.yaml
+++ b/recipes/configs/llama3_1/70B_full.yaml
@@ -29,6 +29,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -112,4 +113,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3_1-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml
index 861279127a..6e98357f13 100644
--- a/recipes/configs/llama3_1/70B_lora.yaml
+++ b/recipes/configs/llama3_1/70B_lora.yaml
@@ -66,6 +66,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -96,7 +97,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_1/8B_full.yaml b/recipes/configs/llama3_1/8B_full.yaml
index 4420b0cae5..1c71813e42 100644
--- a/recipes/configs/llama3_1/8B_full.yaml
+++ b/recipes/configs/llama3_1/8B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -60,7 +61,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-
+compile: False
 
 # Training env
 device: cuda
@@ -79,4 +80,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3.1-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3_1/8B_full_single_device.yaml b/recipes/configs/llama3_1/8B_full_single_device.yaml
index 9f7d9472ce..3db2de566b 100644
--- a/recipes/configs/llama3_1/8B_full_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_full_single_device.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -78,7 +79,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3.1-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Profiler (disabled)
 profiler:
diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml
index 5f101b170f..586ab6cd19 100644
--- a/recipes/configs/llama3_1/8B_lora.yaml
+++ b/recipes/configs/llama3_1/8B_lora.yaml
@@ -50,6 +50,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -80,7 +81,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml
index 3991f728ce..4c24d330f9 100644
--- a/recipes/configs/llama3_1/8B_lora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml
@@ -49,6 +49,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -79,7 +80,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
index a9b0662105..5e66d817cc 100644
--- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
@@ -48,6 +48,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -78,7 +79,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_2/1B_full.yaml b/recipes/configs/llama3_2/1B_full.yaml
index 23b699f754..74f2e15e43 100644
--- a/recipes/configs/llama3_2/1B_full.yaml
+++ b/recipes/configs/llama3_2/1B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -75,4 +76,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3.2-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3_2/1B_full_single_device.yaml b/recipes/configs/llama3_2/1B_full_single_device.yaml
index fc4b0a507c..e7b9ca8939 100644
--- a/recipes/configs/llama3_2/1B_full_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_full_single_device.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -75,7 +76,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3.2-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Profiler (disabled)
 profiler:
diff --git a/recipes/configs/llama3_2/1B_lora.yaml b/recipes/configs/llama3_2/1B_lora.yaml
index 228e4989d5..bb182d3192 100644
--- a/recipes/configs/llama3_2/1B_lora.yaml
+++ b/recipes/configs/llama3_2/1B_lora.yaml
@@ -47,6 +47,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -77,7 +78,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_2/1B_lora_single_device.yaml b/recipes/configs/llama3_2/1B_lora_single_device.yaml
index c9ebed6dc7..eca60cd2ce 100644
--- a/recipes/configs/llama3_2/1B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_lora_single_device.yaml
@@ -46,6 +46,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -76,7 +77,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_2/1B_qlora_single_device.yaml b/recipes/configs/llama3_2/1B_qlora_single_device.yaml
index da552b2a0f..f896668a45 100644
--- a/recipes/configs/llama3_2/1B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_qlora_single_device.yaml
@@ -45,6 +45,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -75,7 +76,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_2/3B_full.yaml b/recipes/configs/llama3_2/3B_full.yaml
index 6d738331ae..dee24434ad 100644
--- a/recipes/configs/llama3_2/3B_full.yaml
+++ b/recipes/configs/llama3_2/3B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -75,4 +76,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3.2-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3_2/3B_full_single_device.yaml b/recipes/configs/llama3_2/3B_full_single_device.yaml
index 9b21f4f865..5a61d297d7 100644
--- a/recipes/configs/llama3_2/3B_full_single_device.yaml
+++ b/recipes/configs/llama3_2/3B_full_single_device.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -76,7 +77,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/full-llama3.2-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Profiler (disabled)
 profiler:
diff --git a/recipes/configs/llama3_2/3B_lora.yaml b/recipes/configs/llama3_2/3B_lora.yaml
index d13a303814..9ea9745a77 100644
--- a/recipes/configs/llama3_2/3B_lora.yaml
+++ b/recipes/configs/llama3_2/3B_lora.yaml
@@ -48,6 +48,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -78,7 +79,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_2/3B_lora_single_device.yaml b/recipes/configs/llama3_2/3B_lora_single_device.yaml
index 255c75e227..283f9eda40 100644
--- a/recipes/configs/llama3_2/3B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2/3B_lora_single_device.yaml
@@ -47,6 +47,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -77,7 +78,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_2/3B_qlora_single_device.yaml b/recipes/configs/llama3_2/3B_qlora_single_device.yaml
index 360443b9e1..f36c5ee126 100644
--- a/recipes/configs/llama3_2/3B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_2/3B_qlora_single_device.yaml
@@ -46,6 +46,7 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -76,7 +77,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml
index 9cb029666f..ba39474639 100644
--- a/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml
+++ b/recipes/configs/llama3_2/knowledge_distillation_single_device.yaml
@@ -62,6 +62,7 @@ teacher_checkpointer:
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -96,7 +97,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/llama3_2_vision/11B_full.yaml b/recipes/configs/llama3_2_vision/11B_full.yaml
index ee9180dbcf..1a4c76d307 100644
--- a/recipes/configs/llama3_2_vision/11B_full.yaml
+++ b/recipes/configs/llama3_2_vision/11B_full.yaml
@@ -42,6 +42,7 @@ resume_from_checkpoint: False
 
 # Dataset
 dataset:
+  packed: False # Keep False: multimodal datasets do not support packing yet
   _component_: torchtune.datasets.multimodal.the_cauldron_dataset
   subset: ocrvqa
 seed: null
@@ -76,4 +77,4 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml
index 3372c1a540..3e02d5d103 100644
--- a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml
@@ -44,6 +44,7 @@ resume_from_checkpoint: False
 
 # Dataset
 dataset:
+  packed: False # Keep False: multimodal datasets do not support packing yet
   _component_: torchtune.datasets.multimodal.the_cauldron_dataset
   subset: ocrvqa
 seed: null
@@ -77,7 +78,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Profiler (default is disabled)
 profiler:
diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml
index 357af64496..f0cd05d012 100644
--- a/recipes/configs/llama3_2_vision/11B_lora.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora.yaml
@@ -48,6 +48,7 @@ resume_from_checkpoint: False
 
 # Dataset
 dataset:
+  packed: False # Keep False: multimodal datasets do not support packing yet
   _component_: torchtune.datasets.multimodal.the_cauldron_dataset
   subset: ocrvqa
 seed: null
@@ -86,4 +87,4 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
index f56828c301..83e2227ca5 100644
--- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
@@ -46,6 +46,7 @@ resume_from_checkpoint: False
 
 # Dataset
 dataset:
+  packed: False # Keep False: multimodal datasets do not support packing yet
   _component_: torchtune.datasets.multimodal.the_cauldron_dataset
   subset: ocrvqa
 seed: null
@@ -85,7 +86,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Llama-3.2-11B-Vision-Instruct/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Profiler (disabled)
 profiler:
diff --git a/recipes/configs/mistral/7B_full.yaml b/recipes/configs/mistral/7B_full.yaml
index 602b3fe082..25cf783846 100644
--- a/recipes/configs/mistral/7B_full.yaml
+++ b/recipes/configs/mistral/7B_full.yaml
@@ -29,6 +29,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -60,6 +61,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # Training env
 device: cuda
@@ -76,4 +78,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Mistral-7B-v0.1/
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/mistral/7B_full_low_memory.yaml b/recipes/configs/mistral/7B_full_low_memory.yaml
index 7e68ee8066..a6cf37fa8c 100644
--- a/recipes/configs/mistral/7B_full_low_memory.yaml
+++ b/recipes/configs/mistral/7B_full_low_memory.yaml
@@ -31,6 +31,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -81,4 +82,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Mistral-7B-v0.1/
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml
index bf9aad71c3..8c583fac0b 100644
--- a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml
+++ b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml
@@ -32,6 +32,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.text_completion_dataset
   source: trl-internal-testing/sentiment-trl-style
   split: train
@@ -135,7 +136,7 @@ optimizer:
   _component_: bitsandbytes.optim.PagedAdamW
   lr: 3e-6
 optimizer_in_bwd: True
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 enable_activation_checkpointing: True
 
 # Reduced precision
diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml
index 08196660fc..a2dc801925 100644
--- a/recipes/configs/mistral/7B_lora.yaml
+++ b/recipes/configs/mistral/7B_lora.yaml
@@ -30,6 +30,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -74,6 +75,7 @@ batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
+compile: False
 
 # Training env
 device: cuda
@@ -90,4 +92,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Mistral-7B-v0.1
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/mistral/7B_lora_single_device.yaml b/recipes/configs/mistral/7B_lora_single_device.yaml
index 2ebc9f798e..21212f4983 100644
--- a/recipes/configs/mistral/7B_lora_single_device.yaml
+++ b/recipes/configs/mistral/7B_lora_single_device.yaml
@@ -27,6 +27,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -89,7 +90,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Mistral-7B-v0.1
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/mistral/7B_qlora_single_device.yaml b/recipes/configs/mistral/7B_qlora_single_device.yaml
index 3bbfebe3ba..e2f6884a9f 100644
--- a/recipes/configs/mistral/7B_qlora_single_device.yaml
+++ b/recipes/configs/mistral/7B_qlora_single_device.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_dataset
 seed: null
 shuffle: True
@@ -90,7 +91,7 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Mistral-7B-v0.1
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Show case the usage of pytorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/phi3/mini_full.yaml b/recipes/configs/phi3/mini_full.yaml
index 0ee746ddd4..0be89337a7 100644
--- a/recipes/configs/phi3/mini_full.yaml
+++ b/recipes/configs/phi3/mini_full.yaml
@@ -42,6 +42,7 @@ resume_from_checkpoint: False
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -57,6 +58,7 @@ optimizer:
   lr: 5e-6
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+compile: False
 
 # Training env
 device: cuda
@@ -71,4 +73,4 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Phi-3-mini-4k-instruct/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/phi3/mini_full_low_memory.yaml b/recipes/configs/phi3/mini_full_low_memory.yaml
index 182a4f6a98..470f4a1afe 100644
--- a/recipes/configs/phi3/mini_full_low_memory.yaml
+++ b/recipes/configs/phi3/mini_full_low_memory.yaml
@@ -44,6 +44,7 @@ resume_from_checkpoint: False
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -74,4 +75,4 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Phi-3-mini-4k-instruct/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/phi3/mini_lora.yaml b/recipes/configs/phi3/mini_lora.yaml
index fff05885ef..1af4929985 100644
--- a/recipes/configs/phi3/mini_lora.yaml
+++ b/recipes/configs/phi3/mini_lora.yaml
@@ -49,6 +49,7 @@ save_adapter_weights_only: False
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -68,6 +69,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+compile: False
 
 # Training env
 device: cuda
@@ -82,4 +84,4 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Phi-3-mini-4k-instruct/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/phi3/mini_lora_single_device.yaml b/recipes/configs/phi3/mini_lora_single_device.yaml
index b5c14b19ca..21a12a3cc1 100644
--- a/recipes/configs/phi3/mini_lora_single_device.yaml
+++ b/recipes/configs/phi3/mini_lora_single_device.yaml
@@ -47,6 +47,7 @@ save_adapter_weights_only: False
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -84,7 +85,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Phi-3-mini-4k-instruct/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Showcase the usage of PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/phi3/mini_qlora_single_device.yaml b/recipes/configs/phi3/mini_qlora_single_device.yaml
index 10114bc67a..21c9403bef 100644
--- a/recipes/configs/phi3/mini_qlora_single_device.yaml
+++ b/recipes/configs/phi3/mini_qlora_single_device.yaml
@@ -47,6 +47,7 @@ save_adapter_weights_only: False
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -84,7 +85,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: /tmp/Phi-3-mini-4k-instruct/logs
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Showcase the usage of PyTorch profiler
 # Set enabled to False as it's only needed for debugging training
diff --git a/recipes/configs/qwen2/0.5B_full.yaml b/recipes/configs/qwen2/0.5B_full.yaml
index 5bf14591f9..39748ee052 100644
--- a/recipes/configs/qwen2/0.5B_full.yaml
+++ b/recipes/configs/qwen2/0.5B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -56,7 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 16
-
+compile: False
 
 # Training env
 device: cuda
@@ -73,4 +74,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-0.5B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/0.5B_full_single_device.yaml b/recipes/configs/qwen2/0.5B_full_single_device.yaml
index 67091a4e8a..2d2afe883e 100644
--- a/recipes/configs/qwen2/0.5B_full_single_device.yaml
+++ b/recipes/configs/qwen2/0.5B_full_single_device.yaml
@@ -24,6 +24,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -74,4 +75,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-0.5B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/0.5B_lora.yaml b/recipes/configs/qwen2/0.5B_lora.yaml
index e0608eba5c..33b5e968d0 100644
--- a/recipes/configs/qwen2/0.5B_lora.yaml
+++ b/recipes/configs/qwen2/0.5B_lora.yaml
@@ -46,6 +46,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 
 seed: null
@@ -70,6 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
+compile: False
 
 # Logging
 output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune
@@ -78,7 +80,7 @@ metric_logger:
   log_dir: ${output_dir}
 
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/0.5B_lora_single_device.yaml b/recipes/configs/qwen2/0.5B_lora_single_device.yaml
index 602c63853a..beeb21b072 100644
--- a/recipes/configs/qwen2/0.5B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/0.5B_lora_single_device.yaml
@@ -45,6 +45,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -76,7 +77,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/1.5B_full.yaml b/recipes/configs/qwen2/1.5B_full.yaml
index cb7b5e2318..8e850bae50 100644
--- a/recipes/configs/qwen2/1.5B_full.yaml
+++ b/recipes/configs/qwen2/1.5B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -56,7 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-
+compile: False
 
 # Training env
 device: cuda
@@ -73,4 +74,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-1.5B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/1.5B_full_single_device.yaml b/recipes/configs/qwen2/1.5B_full_single_device.yaml
index 5da79ceb69..cc7fd5f566 100644
--- a/recipes/configs/qwen2/1.5B_full_single_device.yaml
+++ b/recipes/configs/qwen2/1.5B_full_single_device.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 
 seed: null
@@ -79,4 +80,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-1.5B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/1.5B_lora.yaml b/recipes/configs/qwen2/1.5B_lora.yaml
index a496dade08..845cb71184 100644
--- a/recipes/configs/qwen2/1.5B_lora.yaml
+++ b/recipes/configs/qwen2/1.5B_lora.yaml
@@ -44,6 +44,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -66,6 +67,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8
+compile: False
 
 # Logging
 output_dir: /tmp/Qwen2-1.5B-Instruct-lora-finetune
@@ -73,7 +75,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/1.5B_lora_single_device.yaml b/recipes/configs/qwen2/1.5B_lora_single_device.yaml
index b41269de1a..f2e8d2beb4 100644
--- a/recipes/configs/qwen2/1.5B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/1.5B_lora_single_device.yaml
@@ -44,6 +44,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -74,7 +75,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml
index 7ffc07e457..06083d908f 100644
--- a/recipes/configs/qwen2/7B_full.yaml
+++ b/recipes/configs/qwen2/7B_full.yaml
@@ -26,6 +26,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -59,7 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 16
-
+compile: False
 
 # Training env
 device: cuda
@@ -76,4 +77,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-7B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/7B_full_single_device.yaml b/recipes/configs/qwen2/7B_full_single_device.yaml
index 560dd5fc9f..13290d82a0 100644
--- a/recipes/configs/qwen2/7B_full_single_device.yaml
+++ b/recipes/configs/qwen2/7B_full_single_device.yaml
@@ -28,6 +28,7 @@ tokenizer:
 
 # Dataset
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -78,4 +79,4 @@ metric_logger:
   log_dir: ${output_dir}
 output_dir: /tmp/Qwen2-7B-Instruct-finetune
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml
index d3b63fd1df..6e778ecd7d 100644
--- a/recipes/configs/qwen2/7B_lora.yaml
+++ b/recipes/configs/qwen2/7B_lora.yaml
@@ -50,6 +50,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -72,6 +73,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 32
+compile: False
 
 # Logging
 output_dir: /tmp/Qwen2-7B-Instruct-lora-finetune
@@ -79,7 +81,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml
index 6f9fb35b15..e0b19d03a3 100644
--- a/recipes/configs/qwen2/7B_lora_single_device.yaml
+++ b/recipes/configs/qwen2/7B_lora_single_device.yaml
@@ -48,6 +48,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -78,7 +79,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml
index 9cc894a7e5..078a91c417 100644
--- a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml
+++ b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml
@@ -7,6 +7,7 @@
 #   tune download Qwen/Qwen2-1.5B-Instruct --output-dir /tmp/Qwen2-1.5B-Instruct --ignore-patterns None
 #
 # You get better results using KD if the teacher model has already been fine-tuned on the target dataset:
+  packed: False # Set to true for great speed ups
 #   tune run lora_finetune_single_device --config qwen2/1.5B_lora_single_device
 #
 # To launch on a single device, run the following command from root:
@@ -56,6 +57,7 @@ resume_from_checkpoint: False
 
 # Dataset and Sampler
 dataset:
+  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.alpaca_cleaned_dataset
 seed: null
 shuffle: True
@@ -89,7 +91,7 @@ metric_logger:
   _component_: torchtune.training.metric_logging.DiskLogger
   log_dir: ${output_dir}
 log_every_n_steps: 1
-log_peak_memory_stats: False
+log_peak_memory_stats: True
 
 # Environment
 device: cuda
diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index 6e83e575f9..50f32878af 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -472,7 +472,8 @@ def _setup_optimizer(
 
     def _setup_data(
         self,
-        cfg_dataset: DictConfig,
+        cfg_dataset:
+  packed: False # Set to true for great speed ups DictConfig,
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index 2addd92944..a35cd27079 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -478,7 +478,8 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset: DictConfig,
+        cfg_dataset:
+  packed: False # Set to true for great speed ups DictConfig,
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
index c2ee8c7cc4..63d680626c 100644
--- a/recipes/knowledge_distillation_single_device.py
+++ b/recipes/knowledge_distillation_single_device.py
@@ -494,7 +494,8 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset: DictConfig,
+        cfg_dataset:
+  packed: False # Set to true for great speed ups DictConfig,
         shuffle: bool,
         batch_size: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
index e903ab274a..830012cd02 100644
--- a/recipes/lora_dpo_distributed.py
+++ b/recipes/lora_dpo_distributed.py
@@ -446,7 +446,8 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset: DictConfig,
+        cfg_dataset:
+  packed: False # Set to true for great speed ups DictConfig,
         shuffle: bool,
         batch_size: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
index c158d17875..2b4a4bc00e 100644
--- a/recipes/lora_dpo_single_device.py
+++ b/recipes/lora_dpo_single_device.py
@@ -334,7 +334,8 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset: DictConfig,
+        cfg_dataset:
+  packed: False # Set to true for great speed ups DictConfig,
         shuffle: bool,
         batch_size: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 1569dfee63..235c376751 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -586,7 +586,8 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset: DictConfig,
+        cfg_dataset:
+  packed: False # Set to true for great speed ups DictConfig,
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
index 5d39b72086..3499c412b7 100644
--- a/recipes/lora_finetune_single_device.py
+++ b/recipes/lora_finetune_single_device.py
@@ -497,7 +497,8 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset: DictConfig,
+        cfg_dataset:
+  packed: False # Set to true for great speed ups DictConfig,
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py
index 7679af3fd3..7681319581 100644
--- a/recipes/ppo_full_finetune_single_device.py
+++ b/recipes/ppo_full_finetune_single_device.py
@@ -554,7 +554,8 @@ def _setup_optimizer(
             return optimizer
 
     def _setup_data(
-        self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int
+        self, cfg_dataset:
+  packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here.
diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
index eb2e44fae2..f660b6f9f8 100644
--- a/recipes/qat_distributed.py
+++ b/recipes/qat_distributed.py
@@ -494,7 +494,8 @@ def _setup_optimizer(
 
     def _setup_data(
         self,
-        cfg_dataset: DictConfig,
+        cfg_dataset:
+  packed: False # Set to true for great speed ups DictConfig,
         shuffle: bool,
         batch_size: int,
     ) -> Tuple[DistributedSampler, DataLoader]:

From 00d09d0742c8aa5b06f2e4933e4e236bb1a364e8 Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Sun, 20 Oct 2024 14:31:12 -0400
Subject: [PATCH 02/10] fix incorrect replacement

---
 recipes/full_finetune_distributed.py            | 3 +--
 recipes/full_finetune_single_device.py          | 3 +--
 recipes/knowledge_distillation_single_device.py | 3 +--
 recipes/lora_dpo_distributed.py                 | 3 +--
 recipes/lora_dpo_single_device.py               | 3 +--
 recipes/lora_finetune_distributed.py            | 3 +--
 recipes/lora_finetune_single_device.py          | 3 +--
 recipes/ppo_full_finetune_single_device.py      | 3 +--
 recipes/qat_distributed.py                      | 3 +--
 9 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index 50f32878af..6e83e575f9 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -472,8 +472,7 @@ def _setup_optimizer(
 
     def _setup_data(
         self,
-        cfg_dataset:
-  packed: False # Set to true for great speed ups DictConfig,
+        cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index a35cd27079..2addd92944 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -478,8 +478,7 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset:
-  packed: False # Set to true for great speed ups DictConfig,
+        cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
index 63d680626c..c2ee8c7cc4 100644
--- a/recipes/knowledge_distillation_single_device.py
+++ b/recipes/knowledge_distillation_single_device.py
@@ -494,8 +494,7 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset:
-  packed: False # Set to true for great speed ups DictConfig,
+        cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
index 830012cd02..e903ab274a 100644
--- a/recipes/lora_dpo_distributed.py
+++ b/recipes/lora_dpo_distributed.py
@@ -446,8 +446,7 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset:
-  packed: False # Set to true for great speed ups DictConfig,
+        cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
index 2b4a4bc00e..c158d17875 100644
--- a/recipes/lora_dpo_single_device.py
+++ b/recipes/lora_dpo_single_device.py
@@ -334,8 +334,7 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset:
-  packed: False # Set to true for great speed ups DictConfig,
+        cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
     ) -> Tuple[DistributedSampler, DataLoader]:
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 235c376751..1569dfee63 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -586,8 +586,7 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset:
-  packed: False # Set to true for great speed ups DictConfig,
+        cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
index 3499c412b7..5d39b72086 100644
--- a/recipes/lora_finetune_single_device.py
+++ b/recipes/lora_finetune_single_device.py
@@ -497,8 +497,7 @@ def _setup_lr_scheduler(
 
     def _setup_data(
         self,
-        cfg_dataset:
-  packed: False # Set to true for great speed ups DictConfig,
+        cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
         collate_fn: str,
diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py
index 7681319581..7679af3fd3 100644
--- a/recipes/ppo_full_finetune_single_device.py
+++ b/recipes/ppo_full_finetune_single_device.py
@@ -554,8 +554,7 @@ def _setup_optimizer(
             return optimizer
 
     def _setup_data(
-        self, cfg_dataset:
-  packed: False # Set to true for great speed ups DictConfig, shuffle: bool, batch_size: int
+        self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int
     ) -> Tuple[DistributedSampler, DataLoader]:
         """
         All data related setup happens here.
diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
index f660b6f9f8..eb2e44fae2 100644
--- a/recipes/qat_distributed.py
+++ b/recipes/qat_distributed.py
@@ -494,8 +494,7 @@ def _setup_optimizer(
 
     def _setup_data(
         self,
-        cfg_dataset:
-  packed: False # Set to true for great speed ups DictConfig,
+        cfg_dataset: DictConfig,
         shuffle: bool,
         batch_size: int,
     ) -> Tuple[DistributedSampler, DataLoader]:

From bb7648f0af863019413c0a14baaa0d5711bfa1eb Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Mon, 21 Oct 2024 11:42:43 -0400
Subject: [PATCH 03/10] remove packed from dpo

---
 recipes/configs/llama2/7B_lora_dpo.yaml               | 1 -
 recipes/configs/llama2/7B_lora_dpo_single_device.yaml | 1 -
 2 files changed, 2 deletions(-)

diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml
index 1a870956ff..1a0b4bc390 100644
--- a/recipes/configs/llama2/7B_lora_dpo.yaml
+++ b/recipes/configs/llama2/7B_lora_dpo.yaml
@@ -46,7 +46,6 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.stack_exchange_paired_dataset
 seed: null
 shuffle: True
diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
index 408e28a7be..bfe8185f06 100644
--- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
+++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml
@@ -45,7 +45,6 @@ save_adapter_weights_only: False
 
 # Dataset and Sampler
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.stack_exchange_paired_dataset
 seed: null
 shuffle: True

From 22c84abf57f099dfe557ee9b4176a5301d45532d Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Mon, 21 Oct 2024 11:43:21 -0400
Subject: [PATCH 04/10] remove packed from ppo

---
 recipes/configs/mistral/7B_full_ppo_low_memory.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml
index 8c583fac0b..db3b3f5e86 100644
--- a/recipes/configs/mistral/7B_full_ppo_low_memory.yaml
+++ b/recipes/configs/mistral/7B_full_ppo_low_memory.yaml
@@ -32,7 +32,6 @@ tokenizer:
 
 # Dataset
 dataset:
-  packed: False # Set to true for great speed ups
   _component_: torchtune.datasets.text_completion_dataset
   source: trl-internal-testing/sentiment-trl-style
   split: train

From 8acfa7558cf655f56b60081d3d5cabf6adb759c9 Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Mon, 21 Oct 2024 11:59:58 -0400
Subject: [PATCH 05/10] add comment

---
 recipes/configs/llama3/70B_full.yaml                        | 2 +-
 recipes/configs/llama3/70B_lora.yaml                        | 2 +-
 recipes/configs/llama3_1/405B_qlora.yaml                    | 2 +-
 recipes/configs/llama3_1/70B_full.yaml                      | 2 +-
 recipes/configs/llama3_1/70B_lora.yaml                      | 2 +-
 recipes/configs/llama3_1/8B_full.yaml                       | 2 +-
 recipes/configs/llama3_1/8B_full_single_device.yaml         | 2 +-
 recipes/configs/llama3_1/8B_lora.yaml                       | 2 +-
 recipes/configs/llama3_1/8B_lora_single_device.yaml         | 2 +-
 recipes/configs/llama3_1/8B_qlora_single_device.yaml        | 2 +-
 recipes/configs/llama3_2/1B_full.yaml                       | 2 +-
 recipes/configs/llama3_2/1B_full_single_device.yaml         | 2 +-
 recipes/configs/llama3_2/1B_lora.yaml                       | 2 +-
 recipes/configs/llama3_2/1B_lora_single_device.yaml         | 2 +-
 recipes/configs/llama3_2/1B_qlora_single_device.yaml        | 2 +-
 recipes/configs/llama3_2/3B_full.yaml                       | 2 +-
 recipes/configs/llama3_2/3B_full_single_device.yaml         | 2 +-
 recipes/configs/llama3_2/3B_lora.yaml                       | 2 +-
 recipes/configs/llama3_2/3B_lora_single_device.yaml         | 2 +-
 recipes/configs/llama3_2/3B_qlora_single_device.yaml        | 2 +-
 recipes/configs/llama3_2_vision/11B_full.yaml               | 2 +-
 recipes/configs/llama3_2_vision/11B_full_single_device.yaml | 2 +-
 recipes/configs/llama3_2_vision/11B_lora.yaml               | 2 +-
 recipes/configs/llama3_2_vision/11B_lora_single_device.yaml | 2 +-
 24 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml
index 64e678b910..21f0870094 100644
--- a/recipes/configs/llama3/70B_full.yaml
+++ b/recipes/configs/llama3/70B_full.yaml
@@ -100,7 +100,7 @@ device: cuda
 enable_activation_checkpointing: True
 custom_sharded_layers: ['tok_embeddings', 'output']
 fsdp_cpu_offload: True
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml
index baac18bedd..49b246a23c 100644
--- a/recipes/configs/llama3/70B_lora.yaml
+++ b/recipes/configs/llama3/70B_lora.yaml
@@ -90,7 +90,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_1/405B_qlora.yaml b/recipes/configs/llama3_1/405B_qlora.yaml
index 421f2bb4c2..8e4359b632 100644
--- a/recipes/configs/llama3_1/405B_qlora.yaml
+++ b/recipes/configs/llama3_1/405B_qlora.yaml
@@ -72,7 +72,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 16
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/qlora_finetune_output
diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml
index 654b86c6e6..b5619ac0b4 100644
--- a/recipes/configs/llama3_1/70B_full.yaml
+++ b/recipes/configs/llama3_1/70B_full.yaml
@@ -102,7 +102,7 @@ device: cuda
 enable_activation_checkpointing: True
 custom_sharded_layers: ['tok_embeddings', 'output']
 fsdp_cpu_offload: True
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml
index 6e98357f13..030eb0a5fb 100644
--- a/recipes/configs/llama3_1/70B_lora.yaml
+++ b/recipes/configs/llama3_1/70B_lora.yaml
@@ -89,7 +89,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora-llama3_1-finetune-output
diff --git a/recipes/configs/llama3_1/8B_full.yaml b/recipes/configs/llama3_1/8B_full.yaml
index 1c71813e42..43fb5a11b8 100644
--- a/recipes/configs/llama3_1/8B_full.yaml
+++ b/recipes/configs/llama3_1/8B_full.yaml
@@ -69,7 +69,7 @@ device: cuda
 # Memory management
 enable_activation_checkpointing: True
 custom_sharded_layers: ['tok_embeddings', 'output']
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3_1/8B_full_single_device.yaml b/recipes/configs/llama3_1/8B_full_single_device.yaml
index 3db2de566b..e99d77164f 100644
--- a/recipes/configs/llama3_1/8B_full_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_full_single_device.yaml
@@ -62,7 +62,7 @@ loss:
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
 optimizer_in_bwd: True
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Training environment
 device: cuda
diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml
index 586ab6cd19..6f37eeeaa8 100644
--- a/recipes/configs/llama3_1/8B_lora.yaml
+++ b/recipes/configs/llama3_1/8B_lora.yaml
@@ -73,7 +73,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 32
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml
index 4c24d330f9..41caed2a38 100644
--- a/recipes/configs/llama3_1/8B_lora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml
@@ -72,7 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 64
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
index 5e66d817cc..f3476fcb16 100644
--- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
@@ -71,7 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 16
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/qlora_finetune_output/
diff --git a/recipes/configs/llama3_2/1B_full.yaml b/recipes/configs/llama3_2/1B_full.yaml
index 74f2e15e43..97257c623a 100644
--- a/recipes/configs/llama3_2/1B_full.yaml
+++ b/recipes/configs/llama3_2/1B_full.yaml
@@ -65,7 +65,7 @@ device: cuda
 
 # Memory management
 enable_activation_checkpointing: False
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3_2/1B_full_single_device.yaml b/recipes/configs/llama3_2/1B_full_single_device.yaml
index e7b9ca8939..c69b592cf8 100644
--- a/recipes/configs/llama3_2/1B_full_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_full_single_device.yaml
@@ -59,7 +59,7 @@ loss:
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
 optimizer_in_bwd: True
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Training environment
 device: cuda
diff --git a/recipes/configs/llama3_2/1B_lora.yaml b/recipes/configs/llama3_2/1B_lora.yaml
index bb182d3192..b531461a4b 100644
--- a/recipes/configs/llama3_2/1B_lora.yaml
+++ b/recipes/configs/llama3_2/1B_lora.yaml
@@ -70,7 +70,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/1B_lora_single_device.yaml b/recipes/configs/llama3_2/1B_lora_single_device.yaml
index eca60cd2ce..e2b97a6d77 100644
--- a/recipes/configs/llama3_2/1B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_lora_single_device.yaml
@@ -69,7 +69,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/1B_qlora_single_device.yaml b/recipes/configs/llama3_2/1B_qlora_single_device.yaml
index f896668a45..0372578e23 100644
--- a/recipes/configs/llama3_2/1B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_qlora_single_device.yaml
@@ -68,7 +68,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/3B_full.yaml b/recipes/configs/llama3_2/3B_full.yaml
index dee24434ad..027a1d9e27 100644
--- a/recipes/configs/llama3_2/3B_full.yaml
+++ b/recipes/configs/llama3_2/3B_full.yaml
@@ -65,7 +65,7 @@ device: cuda
 
 # Memory management
 enable_activation_checkpointing: True
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3_2/3B_full_single_device.yaml b/recipes/configs/llama3_2/3B_full_single_device.yaml
index 5a61d297d7..29955d8d47 100644
--- a/recipes/configs/llama3_2/3B_full_single_device.yaml
+++ b/recipes/configs/llama3_2/3B_full_single_device.yaml
@@ -60,7 +60,7 @@ loss:
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
 optimizer_in_bwd: True
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Training environment
 device: cuda
diff --git a/recipes/configs/llama3_2/3B_lora.yaml b/recipes/configs/llama3_2/3B_lora.yaml
index 9ea9745a77..e30ad9d63b 100644
--- a/recipes/configs/llama3_2/3B_lora.yaml
+++ b/recipes/configs/llama3_2/3B_lora.yaml
@@ -71,7 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/3B_lora_single_device.yaml b/recipes/configs/llama3_2/3B_lora_single_device.yaml
index 283f9eda40..3e888090a8 100644
--- a/recipes/configs/llama3_2/3B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2/3B_lora_single_device.yaml
@@ -70,7 +70,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/3B_qlora_single_device.yaml b/recipes/configs/llama3_2/3B_qlora_single_device.yaml
index f36c5ee126..a98e1ac9c0 100644
--- a/recipes/configs/llama3_2/3B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_2/3B_qlora_single_device.yaml
@@ -69,7 +69,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2_vision/11B_full.yaml b/recipes/configs/llama3_2_vision/11B_full.yaml
index 1a4c76d307..2ef3298f11 100644
--- a/recipes/configs/llama3_2_vision/11B_full.yaml
+++ b/recipes/configs/llama3_2_vision/11B_full.yaml
@@ -61,7 +61,7 @@ optimizer:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 clip_grad_norm: 1.0
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Training env
 device: cuda
diff --git a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml
index 3e02d5d103..e72ccbfdbd 100644
--- a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml
@@ -63,7 +63,7 @@ optimizer_in_bwd: False
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 clip_grad_norm: 1.0
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Training env
 device: cuda
diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml
index f0cd05d012..9d94e5ce7f 100644
--- a/recipes/configs/llama3_2_vision/11B_lora.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora.yaml
@@ -71,7 +71,7 @@ lr_scheduler:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 clip_grad_norm: 1.0
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Training env
 device: cuda
diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
index 83e2227ca5..fe1c04929c 100644
--- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
@@ -70,7 +70,7 @@ lr_scheduler:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 clip_grad_norm: 1.0
-compile: False # set it to True for better memory and performance
+compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
 
 # Training env
 device: cuda

From 9b475018178d40c6d5397048c61f2e2abb56c474 Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Wed, 23 Oct 2024 09:19:12 -0400
Subject: [PATCH 06/10] add cuda check

---
 recipes/full_finetune_distributed.py            | 6 ++++++
 recipes/full_finetune_single_device.py          | 6 ++++++
 recipes/knowledge_distillation_single_device.py | 6 ++++++
 recipes/lora_dpo_distributed.py                 | 7 +++++++
 recipes/lora_dpo_single_device.py               | 6 ++++++
 recipes/lora_finetune_distributed.py            | 7 +++++++
 recipes/lora_finetune_single_device.py          | 6 ++++++
 recipes/ppo_full_finetune_single_device.py      | 6 ++++++
 recipes/qat_distributed.py                      | 6 ++++++
 9 files changed, 56 insertions(+)

diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index 6e83e575f9..8221eadc04 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -121,6 +121,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type == "cuda":
+            log.info(
+                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # _is_rank_zero is used primarily for logging. In the future, the logger
         # should directly take care of this
         _, rank = training.get_world_size_and_rank()
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index 2addd92944..ea7bd5a6e6 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -116,6 +116,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type == "cuda":
+            log.info(
+                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # Training cfg
         self._resume_from_checkpoint = cfg.resume_from_checkpoint
         self._gradient_accumulation_steps = cfg.gradient_accumulation_steps
diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
index c2ee8c7cc4..db357e031c 100644
--- a/recipes/knowledge_distillation_single_device.py
+++ b/recipes/knowledge_distillation_single_device.py
@@ -120,6 +120,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type == "cuda":
+            log.info(
+                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
         self.seed = training.set_seed(seed=cfg.seed)
diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
index e903ab274a..acc48e6390 100644
--- a/recipes/lora_dpo_distributed.py
+++ b/recipes/lora_dpo_distributed.py
@@ -130,6 +130,13 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type == "cuda":
+            log.info(
+                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
+
         # training attributes
         self._enable_activation_checkpointing = cfg.enable_activation_checkpointing
 
diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
index c158d17875..a7a882afda 100644
--- a/recipes/lora_dpo_single_device.py
+++ b/recipes/lora_dpo_single_device.py
@@ -95,6 +95,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type == "cuda":
+            log.info(
+                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats
+
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
         self.seed = training.set_seed(seed=cfg.seed)
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 1569dfee63..556a11c16b 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -151,6 +151,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type == "cuda":
+            log.info(
+                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # training attributes
         self._enable_activation_checkpointing = cfg.enable_activation_checkpointing
         self._enable_activation_offloading = cfg.get(
@@ -836,6 +842,7 @@ def train(self) -> None:
                             log_dict.update(
                                 training.get_memory_stats(device=self._device)
                             )
+
                         if self._clip_grad_norm is not None:
                             log_dict.update({"grad_norm": grad_norm})
                         self._metric_logger.log_dict(
diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
index 5d39b72086..c580389cb3 100644
--- a/recipes/lora_finetune_single_device.py
+++ b/recipes/lora_finetune_single_device.py
@@ -141,6 +141,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type == "cuda":
+            log.info(
+                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
         self.seed = training.set_seed(seed=cfg.seed)
diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py
index 7679af3fd3..aa0895b7f8 100644
--- a/recipes/ppo_full_finetune_single_device.py
+++ b/recipes/ppo_full_finetune_single_device.py
@@ -119,6 +119,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type == "cuda":
+            log.info(
+                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests
         self.seed = training.set_seed(seed=cfg.seed)
diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
index eb2e44fae2..2d7169e9da 100644
--- a/recipes/qat_distributed.py
+++ b/recipes/qat_distributed.py
@@ -127,6 +127,12 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
+        if self._log_peak_memory_stats and self._device.type == "cuda":
+            log.info(
+                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+            )
+            self._log_peak_memory_stats = False
+
         # _is_rank_zero is used primarily for logging. In the future, the logger
         # should directly take care of this
         _, rank = training.get_world_size_and_rank()

From e60eebd8bec69d9fc65aad4b9710d06c699f5437 Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Wed, 23 Oct 2024 09:20:37 -0400
Subject: [PATCH 07/10] fix lint

---
 recipes/lora_dpo_distributed.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
index acc48e6390..f008e71b91 100644
--- a/recipes/lora_dpo_distributed.py
+++ b/recipes/lora_dpo_distributed.py
@@ -136,7 +136,6 @@ def __init__(self, cfg: DictConfig) -> None:
             )
             self._log_peak_memory_stats = False
 
-
         # training attributes
         self._enable_activation_checkpointing = cfg.enable_activation_checkpointing
 

From dacc3d2b3a045352ad0e2a2f5b29f19e03cbe360 Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Thu, 24 Oct 2024 09:38:19 -0400
Subject: [PATCH 08/10] drop stray packed line, fix no-op log_peak_memory_stats reset

---
 recipes/configs/qwen2/knowledge_distillation_single_device.yaml | 1 -
 recipes/lora_dpo_single_device.py                               | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml
index 078a91c417..f7d1b191cd 100644
--- a/recipes/configs/qwen2/knowledge_distillation_single_device.yaml
+++ b/recipes/configs/qwen2/knowledge_distillation_single_device.yaml
@@ -7,7 +7,6 @@
 #   tune download Qwen/Qwen2-1.5B-Instruct --output-dir /tmp/Qwen2-1.5B-Instruct --ignore-patterns None
 #
 # You get better results using KD if the teacher model has already been fine-tuned on the target dataset:
-  packed: False # Set to true for great speed ups
 #   tune run lora_finetune_single_device --config qwen2/1.5B_lora_single_device
 #
 # To launch on a single device, run the following command from root:
diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
index a7a882afda..dca5553dac 100644
--- a/recipes/lora_dpo_single_device.py
+++ b/recipes/lora_dpo_single_device.py
@@ -99,7 +99,7 @@ def __init__(self, cfg: DictConfig) -> None:
             log.info(
                 "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
-            self._log_peak_memory_stats
+            self._log_peak_memory_stats = False
 
         # These are public properties which are updated by the checkpoint loader
         # when ``resume_from_checkpoint`` is `True` or validated in tests

From a311232e845b5052131254029351c6467bf84c32 Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Sat, 26 Oct 2024 07:36:42 -0400
Subject: [PATCH 09/10] fix compile typo

---
 recipes/configs/llama3/70B_full.yaml                        | 2 +-
 recipes/configs/llama3/70B_lora.yaml                        | 2 +-
 recipes/configs/llama3_1/405B_qlora.yaml                    | 2 +-
 recipes/configs/llama3_1/70B_full.yaml                      | 2 +-
 recipes/configs/llama3_1/70B_lora.yaml                      | 2 +-
 recipes/configs/llama3_1/8B_full.yaml                       | 2 +-
 recipes/configs/llama3_1/8B_full_single_device.yaml         | 2 +-
 recipes/configs/llama3_1/8B_lora.yaml                       | 2 +-
 recipes/configs/llama3_1/8B_lora_single_device.yaml         | 2 +-
 recipes/configs/llama3_1/8B_qlora_single_device.yaml        | 2 +-
 recipes/configs/llama3_2/1B_full.yaml                       | 2 +-
 recipes/configs/llama3_2/1B_full_single_device.yaml         | 2 +-
 recipes/configs/llama3_2/1B_lora.yaml                       | 2 +-
 recipes/configs/llama3_2/1B_lora_single_device.yaml         | 2 +-
 recipes/configs/llama3_2/1B_qlora_single_device.yaml        | 2 +-
 recipes/configs/llama3_2/3B_full.yaml                       | 2 +-
 recipes/configs/llama3_2/3B_full_single_device.yaml         | 2 +-
 recipes/configs/llama3_2/3B_lora.yaml                       | 2 +-
 recipes/configs/llama3_2/3B_lora_single_device.yaml         | 2 +-
 recipes/configs/llama3_2/3B_qlora_single_device.yaml        | 2 +-
 recipes/configs/llama3_2_vision/11B_full.yaml               | 2 +-
 recipes/configs/llama3_2_vision/11B_full_single_device.yaml | 2 +-
 recipes/configs/llama3_2_vision/11B_lora.yaml               | 2 +-
 recipes/configs/llama3_2_vision/11B_lora_single_device.yaml | 2 +-
 24 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/recipes/configs/llama3/70B_full.yaml b/recipes/configs/llama3/70B_full.yaml
index 21f0870094..91d95d2898 100644
--- a/recipes/configs/llama3/70B_full.yaml
+++ b/recipes/configs/llama3/70B_full.yaml
@@ -100,7 +100,7 @@ device: cuda
 enable_activation_checkpointing: True
 custom_sharded_layers: ['tok_embeddings', 'output']
 fsdp_cpu_offload: True
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3/70B_lora.yaml b/recipes/configs/llama3/70B_lora.yaml
index 49b246a23c..247daba5cc 100644
--- a/recipes/configs/llama3/70B_lora.yaml
+++ b/recipes/configs/llama3/70B_lora.yaml
@@ -90,7 +90,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_1/405B_qlora.yaml b/recipes/configs/llama3_1/405B_qlora.yaml
index 8e4359b632..e59b333fa1 100644
--- a/recipes/configs/llama3_1/405B_qlora.yaml
+++ b/recipes/configs/llama3_1/405B_qlora.yaml
@@ -72,7 +72,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 16
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/qlora_finetune_output
diff --git a/recipes/configs/llama3_1/70B_full.yaml b/recipes/configs/llama3_1/70B_full.yaml
index b5619ac0b4..c016015056 100644
--- a/recipes/configs/llama3_1/70B_full.yaml
+++ b/recipes/configs/llama3_1/70B_full.yaml
@@ -102,7 +102,7 @@ device: cuda
 enable_activation_checkpointing: True
 custom_sharded_layers: ['tok_embeddings', 'output']
 fsdp_cpu_offload: True
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3_1/70B_lora.yaml b/recipes/configs/llama3_1/70B_lora.yaml
index 030eb0a5fb..ad1bc64110 100644
--- a/recipes/configs/llama3_1/70B_lora.yaml
+++ b/recipes/configs/llama3_1/70B_lora.yaml
@@ -89,7 +89,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora-llama3_1-finetune-output
diff --git a/recipes/configs/llama3_1/8B_full.yaml b/recipes/configs/llama3_1/8B_full.yaml
index 43fb5a11b8..da27c91852 100644
--- a/recipes/configs/llama3_1/8B_full.yaml
+++ b/recipes/configs/llama3_1/8B_full.yaml
@@ -69,7 +69,7 @@ device: cuda
 # Memory management
 enable_activation_checkpointing: True
 custom_sharded_layers: ['tok_embeddings', 'output']
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3_1/8B_full_single_device.yaml b/recipes/configs/llama3_1/8B_full_single_device.yaml
index e99d77164f..04ba339b23 100644
--- a/recipes/configs/llama3_1/8B_full_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_full_single_device.yaml
@@ -62,7 +62,7 @@ loss:
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
 optimizer_in_bwd: True
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training environment
 device: cuda
diff --git a/recipes/configs/llama3_1/8B_lora.yaml b/recipes/configs/llama3_1/8B_lora.yaml
index 6f37eeeaa8..d0a5202847 100644
--- a/recipes/configs/llama3_1/8B_lora.yaml
+++ b/recipes/configs/llama3_1/8B_lora.yaml
@@ -73,7 +73,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 32
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_1/8B_lora_single_device.yaml b/recipes/configs/llama3_1/8B_lora_single_device.yaml
index 41caed2a38..bc9a3956f3 100644
--- a/recipes/configs/llama3_1/8B_lora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_lora_single_device.yaml
@@ -72,7 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 64
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_1/8B_qlora_single_device.yaml b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
index f3476fcb16..b194acb181 100644
--- a/recipes/configs/llama3_1/8B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_1/8B_qlora_single_device.yaml
@@ -71,7 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 16
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/qlora_finetune_output/
diff --git a/recipes/configs/llama3_2/1B_full.yaml b/recipes/configs/llama3_2/1B_full.yaml
index 97257c623a..c90fea966f 100644
--- a/recipes/configs/llama3_2/1B_full.yaml
+++ b/recipes/configs/llama3_2/1B_full.yaml
@@ -65,7 +65,7 @@ device: cuda
 
 # Memory management
 enable_activation_checkpointing: False
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3_2/1B_full_single_device.yaml b/recipes/configs/llama3_2/1B_full_single_device.yaml
index c69b592cf8..e4d1f87fac 100644
--- a/recipes/configs/llama3_2/1B_full_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_full_single_device.yaml
@@ -59,7 +59,7 @@ loss:
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
 optimizer_in_bwd: True
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training environment
 device: cuda
diff --git a/recipes/configs/llama3_2/1B_lora.yaml b/recipes/configs/llama3_2/1B_lora.yaml
index b531461a4b..b5e53900ef 100644
--- a/recipes/configs/llama3_2/1B_lora.yaml
+++ b/recipes/configs/llama3_2/1B_lora.yaml
@@ -70,7 +70,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/1B_lora_single_device.yaml b/recipes/configs/llama3_2/1B_lora_single_device.yaml
index e2b97a6d77..8c94bb0582 100644
--- a/recipes/configs/llama3_2/1B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_lora_single_device.yaml
@@ -69,7 +69,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/1B_qlora_single_device.yaml b/recipes/configs/llama3_2/1B_qlora_single_device.yaml
index 0372578e23..282d0d9e89 100644
--- a/recipes/configs/llama3_2/1B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_2/1B_qlora_single_device.yaml
@@ -68,7 +68,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/3B_full.yaml b/recipes/configs/llama3_2/3B_full.yaml
index 027a1d9e27..bfe9ef6420 100644
--- a/recipes/configs/llama3_2/3B_full.yaml
+++ b/recipes/configs/llama3_2/3B_full.yaml
@@ -65,7 +65,7 @@ device: cuda
 
 # Memory management
 enable_activation_checkpointing: True
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Reduced precision
 dtype: bf16
diff --git a/recipes/configs/llama3_2/3B_full_single_device.yaml b/recipes/configs/llama3_2/3B_full_single_device.yaml
index 29955d8d47..14a5369e71 100644
--- a/recipes/configs/llama3_2/3B_full_single_device.yaml
+++ b/recipes/configs/llama3_2/3B_full_single_device.yaml
@@ -60,7 +60,7 @@ loss:
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1
 optimizer_in_bwd: True
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training environment
 device: cuda
diff --git a/recipes/configs/llama3_2/3B_lora.yaml b/recipes/configs/llama3_2/3B_lora.yaml
index e30ad9d63b..076f9d9171 100644
--- a/recipes/configs/llama3_2/3B_lora.yaml
+++ b/recipes/configs/llama3_2/3B_lora.yaml
@@ -71,7 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/3B_lora_single_device.yaml b/recipes/configs/llama3_2/3B_lora_single_device.yaml
index 3e888090a8..b36d18f872 100644
--- a/recipes/configs/llama3_2/3B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2/3B_lora_single_device.yaml
@@ -70,7 +70,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2/3B_qlora_single_device.yaml b/recipes/configs/llama3_2/3B_qlora_single_device.yaml
index a98e1ac9c0..3efbd6c43c 100644
--- a/recipes/configs/llama3_2/3B_qlora_single_device.yaml
+++ b/recipes/configs/llama3_2/3B_qlora_single_device.yaml
@@ -69,7 +69,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 4
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama3_2_vision/11B_full.yaml b/recipes/configs/llama3_2_vision/11B_full.yaml
index 2ef3298f11..3837e8c39c 100644
--- a/recipes/configs/llama3_2_vision/11B_full.yaml
+++ b/recipes/configs/llama3_2_vision/11B_full.yaml
@@ -61,7 +61,7 @@ optimizer:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 clip_grad_norm: 1.0
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
diff --git a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml
index e72ccbfdbd..93d8c2cd11 100644
--- a/recipes/configs/llama3_2_vision/11B_full_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_full_single_device.yaml
@@ -63,7 +63,7 @@ optimizer_in_bwd: False
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 clip_grad_norm: 1.0
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
diff --git a/recipes/configs/llama3_2_vision/11B_lora.yaml b/recipes/configs/llama3_2_vision/11B_lora.yaml
index 9d94e5ce7f..449d786153 100644
--- a/recipes/configs/llama3_2_vision/11B_lora.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora.yaml
@@ -71,7 +71,7 @@ lr_scheduler:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 clip_grad_norm: 1.0
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda
diff --git a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
index fe1c04929c..15b097e6ed 100644
--- a/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
+++ b/recipes/configs/llama3_2_vision/11B_lora_single_device.yaml
@@ -70,7 +70,7 @@ lr_scheduler:
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 clip_grad_norm: 1.0
-compile=False  # pytorch compile, set to true for perf/memory improvement# set it to True for better memory and performance
+compile: False # pytorch compile, set to true for perf/memory improvement
 
 # Training env
 device: cuda

From b0b4b1410f99fb99718483b9a8ebd48418b6e8f5 Mon Sep 17 00:00:00 2001
From: krammnic <krammnic@krammnic.krammnic.com>
Date: Sat, 26 Oct 2024 07:37:56 -0400
Subject: [PATCH 10/10] fix inverted cuda check and log message typo

---
 recipes/full_finetune_distributed.py            | 4 ++--
 recipes/full_finetune_single_device.py          | 4 ++--
 recipes/knowledge_distillation_single_device.py | 4 ++--
 recipes/lora_dpo_distributed.py                 | 4 ++--
 recipes/lora_dpo_single_device.py               | 4 ++--
 recipes/lora_finetune_distributed.py            | 4 ++--
 recipes/lora_finetune_single_device.py          | 4 ++--
 recipes/ppo_full_finetune_single_device.py      | 4 ++--
 recipes/qat_distributed.py                      | 4 ++--
 9 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index 8221eadc04..a6c0d81724 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -121,9 +121,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type == "cuda":
+        if self._log_peak_memory_stats and self._device.type != "cuda":
             log.info(
-                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index ea7bd5a6e6..469732a217 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -116,9 +116,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type == "cuda":
+        if self._log_peak_memory_stats and self._device.type != "cuda":
             log.info(
-                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
index db357e031c..8612fa8f8e 100644
--- a/recipes/knowledge_distillation_single_device.py
+++ b/recipes/knowledge_distillation_single_device.py
@@ -120,9 +120,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type == "cuda":
+        if self._log_peak_memory_stats and self._device.type != "cuda":
             log.info(
-                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
index f008e71b91..dd98f72685 100644
--- a/recipes/lora_dpo_distributed.py
+++ b/recipes/lora_dpo_distributed.py
@@ -130,9 +130,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type == "cuda":
+        if self._log_peak_memory_stats and self._device.type != "cuda":
             log.info(
-                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
index dca5553dac..f34694ccc8 100644
--- a/recipes/lora_dpo_single_device.py
+++ b/recipes/lora_dpo_single_device.py
@@ -95,9 +95,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type == "cuda":
+        if self._log_peak_memory_stats and self._device.type != "cuda":
             log.info(
-                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 556a11c16b..98a09705f6 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -151,9 +151,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type == "cuda":
+        if self._log_peak_memory_stats and self._device.type != "cuda":
             log.info(
-                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
index c580389cb3..34b5406e77 100644
--- a/recipes/lora_finetune_single_device.py
+++ b/recipes/lora_finetune_single_device.py
@@ -141,9 +141,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type == "cuda":
+        if self._log_peak_memory_stats and self._device.type != "cuda":
             log.info(
-                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py
index aa0895b7f8..1030217d74 100644
--- a/recipes/ppo_full_finetune_single_device.py
+++ b/recipes/ppo_full_finetune_single_device.py
@@ -119,9 +119,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type == "cuda":
+        if self._log_peak_memory_stats and self._device.type != "cuda":
             log.info(
-                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False
 
diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
index 2d7169e9da..b717c8233a 100644
--- a/recipes/qat_distributed.py
+++ b/recipes/qat_distributed.py
@@ -127,9 +127,9 @@ def __init__(self, cfg: DictConfig) -> None:
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
         self._log_peak_memory_stats = cfg.get("log_peak_memory_stats", False)
 
-        if self._log_peak_memory_stats and self._device.type == "cuda":
+        if self._log_peak_memory_stats and self._device.type != "cuda":
             log.info(
-                "log_peak_memory_stats was se to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
+                "log_peak_memory_stats was set to True, however, training does not use cuda. Setting log_peak_memory_stats=False."
             )
             self._log_peak_memory_stats = False