Pulling llama 3.3 70b from head #12

Merged · 9 commits · Dec 8, 2024
README.md: 2 additions, 0 deletions
@@ -10,6 +10,7 @@
[**Introduction**](#introduction) | [**Installation**](#installation) | [**Get Started**](#get-started) | [**Documentation**](https://pytorch.org/torchtune/main/index.html) | [**Community**](#community) | [**License**](#license) | [**Citing torchtune**](#citing-torchtune)

### 📣 Recent updates 📣
* *December 2024*: torchtune now supports **Llama 3.3 70B**! Try it out by following our installation instructions [here](#Installation), then run any of the configs [here](recipes/configs/llama3_3).
* *November 2024*: torchtune has released [v0.4.0](https://github.com/pytorch/torchtune/releases/tag/v0.4.0) which includes stable support for exciting features like activation offloading and multimodal QLoRA
* *November 2024*: torchtune has added [Gemma2](recipes/configs/gemma2) to its models!
* *October 2024*: torchtune added support for Qwen2.5 models - find the recipes [here](recipes/configs/qwen2_5/)
@@ -39,6 +40,7 @@ torchtune currently supports the following models.

| Model | Sizes |
|-----------------------------------------------|-----------|
| [Llama3.3](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_3) | 70B [[models](torchtune/models/llama3_3/_model_builders.py), [configs](recipes/configs/llama3_3/)] |
| [Llama3.2-Vision](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2#-llama-3.2-vision-models-(11b/90b)-) | 11B, 90B [[models](torchtune/models/llama3_2_vision/_model_builders.py), [configs](recipes/configs/llama3_2_vision/)] |
| [Llama3.2](https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_2) | 1B, 3B [[models](torchtune/models/llama3_2/_model_builders.py), [configs](recipes/configs/llama3_2/)] |
| [Llama3.1](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1) | 8B, 70B, 405B [[models](torchtune/models/llama3_1/_model_builders.py), [configs](recipes/configs/llama3_1/)] |
docs/source/api_ref_models.rst: 25 additions, 0 deletions
@@ -6,6 +6,31 @@ torchtune.models

.. currentmodule:: torchtune.models

llama3.3
--------

Text-only models from version 3.3 of the `Llama3 family <https://llama.meta.com/llama3/>`_.

Important: You need to request access on `Hugging Face <https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct>`__ before downloading it.

To download the Llama-3.3-70B-Instruct model:

.. code-block:: bash

tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf-token <HF_TOKEN>

.. autosummary::
:toctree: generated/
:nosignatures:

llama3_3.llama3_3_70b
llama3_3.lora_llama3_3_70b
llama3_3.qlora_llama3_3_70b

.. note::

The Llama3.3 tokenizer reuses the :class:`~torchtune.models.llama3.llama3_tokenizer` class.
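
For orientation, here is a minimal sketch of what using these builders looks like in Python. The tokenizer path and the LoRA hyperparameters are illustrative assumptions rather than values taken from this change, and the model is built on the ``meta`` device so no weights are materialized.

.. code-block:: python

    # Sketch only: the path and the LoRA hyperparameters below are assumptions.
    import torch

    from torchtune.models.llama3 import llama3_tokenizer
    from torchtune.models.llama3_3 import lora_llama3_3_70b

    # Build the 70B LoRA model on the meta device so no real memory is allocated.
    with torch.device("meta"):
        model = lora_llama3_3_70b(
            lora_attn_modules=["q_proj", "v_proj"],
            lora_rank=8,
            lora_alpha=16,
        )

    # Llama 3.3 reuses the Llama 3 tokenizer; point it at the downloaded
    # tokenizer.model (assumed download location).
    tokenizer = llama3_tokenizer(
        path="/tmp/Llama-3.3-70B-Instruct/original/tokenizer.model",
    )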

llama3.2
--------

docs/source/api_ref_modules.rst: 2 additions, 0 deletions
@@ -23,6 +23,8 @@ Modeling Components and Building Blocks
TransformerCrossAttentionLayer
TransformerDecoder
VisionTransformer
LayerDropout
prepare_layer_dropout

Losses
------
recipes/configs/code_llama2/7B_full_low_memory.yaml: 7 additions, 5 deletions
@@ -19,6 +19,8 @@
#
# This config works only for training on single device.

output_dir: /tmp/torchtune/code_llama2_7B/full_low_memory # /tmp may be deleted by your system. Change it to your preference.

# Model arguments
model:
_component_: torchtune.models.code_llama2.code_llama2_7b
@@ -39,7 +41,7 @@ checkpointer:
pytorch_model-00003-of-00003.bin
]
recipe_checkpoint: null
output_dir: /tmp/CodeLlama-7b-hf
output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False

@@ -55,14 +57,14 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 1 # Use to increase virtual batch size
gradient_accumulation_steps: 1 # Use to increase effective batch size
optimizer:
_component_: bitsandbytes.optim.PagedAdamW
lr: 2e-5
optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False # pytorch compile, set to true for better perf/memory
compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -73,13 +75,13 @@ enable_activation_offloading: True  # True reduces memory
dtype: bf16

# Logging
output_dir: /tmp/codellama_finetune_output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: /tmp/CodeLlama-7b-hf/logs
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
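The recurring change across the config files in this PR is that each recipe now defines a single top-level `output_dir` and points the checkpointer and logger at it via `${output_dir}` interpolation. torchtune's config system is built on OmegaConf, so a minimal sketch of how these references resolve (values copied from the config above) looks like this:

```python
# Minimal sketch of the ${output_dir} interpolation used in these configs.
# Values are copied from the code_llama2 full_low_memory config above.
from omegaconf import OmegaConf

yaml_snippet = """
output_dir: /tmp/torchtune/code_llama2_7B/full_low_memory
checkpointer:
  output_dir: ${output_dir}
metric_logger:
  log_dir: ${output_dir}/logs
"""

cfg = OmegaConf.create(yaml_snippet)

# Interpolations resolve on access, so every path follows the top-level output_dir.
print(cfg.checkpointer.output_dir)  # /tmp/torchtune/code_llama2_7B/full_low_memory
print(cfg.metric_logger.log_dir)    # /tmp/torchtune/code_llama2_7B/full_low_memory/logs
```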
recipes/configs/code_llama2/7B_lora_single_device.yaml: 6 additions, 5 deletions
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

output_dir: /tmp/torchtune/code_llama2_7B/lora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Model Arguments
model:
_component_: torchtune.models.code_llama2.lora_code_llama2_7b
@@ -42,7 +44,7 @@ checkpointer:
]
adapter_checkpoint: null
recipe_checkpoint: null
output_dir: /tmp/CodeLlama-7b-hf
output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -59,7 +61,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 8 # Use to increase virtual batch size
gradient_accumulation_steps: 8 # Use to increase effective batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
@@ -70,7 +72,7 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False # pytorch compile, set to true for better perf/memory
compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -81,10 +83,9 @@ enable_activation_offloading: False  # True reduces memory
dtype: bf16

# Logging
output_dir: /tmp/codellama_lora_finetune_output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: /tmp/CodeLlama-7b-hf/logs
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

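A quick note on the `gradient_accumulation_steps` comments above: the "effective batch size" is the number of examples that contribute to each optimizer step. A back-of-the-envelope sketch for this single-device LoRA config (device count assumed to be 1):

```python
# Effective batch size for the single-device LoRA config above (1 device assumed).
batch_size = 2                   # per-device micro-batch size from the config
gradient_accumulation_steps = 8  # micro-batches accumulated before each optimizer step
num_devices = 1                  # single-device recipe

effective_batch_size = batch_size * gradient_accumulation_steps * num_devices
print(effective_batch_size)  # 16 examples per optimizer update
```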
recipes/configs/code_llama2/7B_qlora_single_device.yaml: 6 additions, 5 deletions
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

output_dir: /tmp/torchtune/code_llama2_7B/qlora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Model Arguments
model:
_component_: torchtune.models.code_llama2.qlora_code_llama2_7b
@@ -42,7 +44,7 @@ checkpointer:
]
adapter_checkpoint: null
recipe_checkpoint: null
output_dir: /tmp/CodeLlama-7b-hf
output_dir: ${output_dir}
model_type: LLAMA2
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -58,7 +60,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 8 # Use to increase virtual batch size
gradient_accumulation_steps: 8 # Use to increase effective batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
@@ -69,7 +71,7 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False # pytorch compile, set to true for better perf/memory
compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -80,10 +82,9 @@ enable_activation_offloading: False  # True reduces memory
dtype: bf16

# Logging
output_dir: /tmp/codellama_qlora_finetune_output
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: /tmp/CodeLlama-7b-hf/logs
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True

recipes/configs/dev/8B_full_experimental.yaml: 7 additions, 5 deletions
@@ -18,6 +18,8 @@
# best to use 8B_full_single_device.yaml for those cases


output_dir: /tmp/torchtune/dev_8B/full_experimental # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
@@ -42,7 +44,7 @@ checkpointer:
consolidated.00.pth
]
recipe_checkpoint: null
output_dir: /tmp/Meta-Llama-3-8B/
output_dir: ${output_dir}
model_type: LLAMA3
resume_from_checkpoint: False

@@ -57,8 +59,8 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
gradient_accumulation_steps: 1 # Use to increase effective batch size
compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -77,11 +79,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama3-finetune
log_dir: ${output_dir}/logs
log_every_n_steps: null
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
recipes/configs/gemma/2B_full.yaml: 7 additions, 5 deletions
@@ -16,6 +16,8 @@
# This config works only when the model is being fine-tuned on 2+ GPUs.


output_dir: /tmp/torchtune/gemma_2B/full # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -40,7 +42,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
output_dir: /tmp/gemma-2b
output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False

@@ -54,8 +56,8 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
gradient_accumulation_steps: 1 # Use to increase effective batch size
compile: False # torch.compile the model + loss, True increases speed + decreases memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
@@ -71,11 +73,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-finetune
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
recipes/configs/gemma/2B_lora.yaml: 7 additions, 5 deletions
@@ -15,6 +15,8 @@
#
# This config works only when the model is being fine-tuned on 2+ GPUs.

output_dir: /tmp/torchtune/gemma_2B/lora # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -44,7 +46,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
output_dir: /tmp/gemma-2b
output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False

@@ -66,8 +68,8 @@ loss:
batch_size: 4
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
gradient_accumulation_steps: 1 # Use to increase effective batch size
compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -82,11 +84,11 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True


# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
recipes/configs/gemma/2B_lora_single_device.yaml: 6 additions, 5 deletions
@@ -15,6 +15,8 @@
#
# This config works only for training on single device.

output_dir: /tmp/torchtune/gemma_2B/lora_single_device # /tmp may be deleted by your system. Change it to your preference.

# Tokenizer
tokenizer:
_component_: torchtune.models.gemma.gemma_tokenizer
@@ -44,7 +46,7 @@ checkpointer:
model-00002-of-00002.safetensors,
]
recipe_checkpoint: null
output_dir: /tmp/gemma-2b
output_dir: ${output_dir}
model_type: GEMMA
resume_from_checkpoint: False
save_adapter_weights_only: False
@@ -65,8 +67,8 @@ loss:
batch_size: 4
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
gradient_accumulation_steps: 8 # Use to increase effective batch size
compile: False # torch.compile the model + loss, True increases speed + decreases memory

# Training env
device: cuda
@@ -81,8 +83,7 @@ dtype: bf16
# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/alpaca-gemma-lora
log_dir: ${output_dir}/logs
log_every_n_steps: 1
log_peak_memory_stats: True
