Llama 3.2 🦙 3️⃣ .2️⃣ 🦙 #1679

Merged · 5 commits · Sep 25, 2024
Changes from 4 commits

69 changes: 68 additions & 1 deletion docs/source/api_ref_models.rst
@@ -6,10 +6,77 @@ torchtune.models

.. currentmodule:: torchtune.models

llama3.2
--------

Text-only models from the 3.2 version of the `Llama3 family <https://llama.meta.com/llama3/>`_.

Important: You need to request access on `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-1B-Instruct>`__ before downloading the models.

To download the Llama-3.2-1B-Instruct model:

.. code-block:: bash

tune download meta-llama/Meta-Llama-3.2-1B-Instruct --output-dir /tmp/Meta-Llama-3.2-1B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf-token <HF_TOKEN>

To download the Llama-3.2-3B-Instruct model:

.. code-block:: bash

tune download meta-llama/Meta-Llama-3.2-3B-Instruct --output-dir /tmp/Meta-Llama-3.2-3B-Instruct --ignore-patterns "original/consolidated*" --hf-token <HF_TOKEN>

.. autosummary::
:toctree: generated/
:nosignatures:

llama3_2.llama3_2_1b
llama3_2.llama3_2_3b
llama3_2.lora_llama3_2_1b
llama3_2.lora_llama3_2_3b
llama3_2.qlora_llama3_2_1b
llama3_2.qlora_llama3_2_3b

.. note::

The Llama3.2 tokenizer reuses the :class:`~torchtune.models.llama3.llama3_tokenizer` class.
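
For a quick sanity check (a minimal sketch, not part of this diff; it assumes the download location used in the command above), the builders pair with the shared tokenizer like so:

.. code-block:: python

    from torchtune.models.llama3 import llama3_tokenizer
    from torchtune.models.llama3_2 import llama3_2_1b

    # Build the 1B text-only architecture (randomly initialized here;
    # load the downloaded weights via a checkpointer for real use)
    model = llama3_2_1b()

    # Llama 3.2 reuses the Llama 3 tokenizer
    tokenizer = llama3_tokenizer(
        path="/tmp/Meta-Llama-3.2-1B-Instruct/original/tokenizer.model",
    )
    tokens = tokenizer.encode("Hello, Llama 3.2!", add_bos=True, add_eos=True)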

llama3.2 Vision
---------------

Vision-Language Models from the 3.2 version of the `Llama3 family <https://llama.meta.com/llama3/>`_.

Important: You need to request access on `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.2-11B-Vision-Instruct>`__ before downloading the model.

To download the Llama-3.2-11B-Vision-Instruct model:

.. code-block:: bash

tune download meta-llama/Meta-Llama-3.2-11B-Vision-Instruct --output-dir /tmp/Meta-Llama-3.2-11B-Vision-Instruct --hf-token <HF_TOKEN>

.. autosummary::
:toctree: generated/
:nosignatures:

llama3_2_vision.llama3_2_vision_11b
llama3_2_vision.llama3_2_vision_transform
llama3_2_vision.lora_llama3_2_vision_11b
llama3_2_vision.qlora_llama3_2_vision_11b
llama3_2_vision.llama3_2_vision_decoder
llama3_2_vision.llama3_2_vision_encoder
llama3_2_vision.lora_llama3_2_vision_decoder
llama3_2_vision.lora_llama3_2_vision_encoder
llama3_2_vision.Llama3VisionEncoder
llama3_2_vision.Llama3VisionProjectionHead
llama3_2_vision.Llama3VisionTransform

.. note::

The Llama3.2 tokenizer reuses the :class:`~torchtune.models.llama3.llama3_tokenizer` class.
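
For orientation (a hedged sketch, not part of this diff; the path assumes the download command above), the vision builders pair with the bundled transform like so:

.. code-block:: python

    from torchtune.models.llama3_2_vision import (
        llama3_2_vision_11b,
        llama3_2_vision_transform,
    )

    # Build the 11B vision-language architecture (randomly initialized here)
    model = llama3_2_vision_11b()

    # The transform combines the tokenizer with image preprocessing
    transform = llama3_2_vision_transform(
        path="/tmp/Meta-Llama-3.2-11B-Vision-Instruct/original/tokenizer.model",
        max_seq_len=8192,
    )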

llama3 & llama3.1
-----------------

All models from the `Llama3 family <https://llama.meta.com/llama3/>`_.
The Llama 3 and Llama 3.1 models from the `Llama3 family <https://llama.meta.com/llama3/>`_.

Important: You need to request access on `Hugging Face <https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct>`__ before downloading the models.

16 changes: 8 additions & 8 deletions docs/source/basics/model_transforms.rst
@@ -11,8 +11,8 @@ These are intended to be drop-in replacements for tokenizers in multimodal datas

.. code-block:: python

# torchtune.models.flamingo.FlamingoTransform
class FlamingoTransform(ModelTokenizer, Transform):
# torchtune.models.llama3_2_vision.Llama3VisionTransform
class Llama3VisionTransform(ModelTokenizer, Transform):
def __init__(...):
# Text transform - standard tokenization
self.tokenizer = llama3_tokenizer(...)
@@ -23,7 +23,7 @@ These are intended to be drop-in replacements for tokenizers in multimodal datas

.. code-block:: python

from torchtune.models.flamingo import FlamingoTransform
from torchtune.models.llama3_2_vision import Llama3VisionTransform
from torchtune.data import Message
from PIL import Image

@@ -43,7 +43,7 @@ These are intended to be drop-in replacements for tokenizers in multimodal datas
),
],
}
transform = FlamingoTransform(
transform = Llama3VisionTransform(
path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
tile_size=224,
patch_size=14,
@@ -62,9 +62,9 @@ You can pass them into any multimodal dataset builder just as you would a model
.. code-block:: python

from torchtune.datasets.multimodal import the_cauldron_dataset
from torchtune.models.flamingo import FlamingoTransform
from torchtune.models.llama3_2_vision import Llama3VisionTransform

transform = FlamingoTransform(
transform = Llama3VisionTransform(
path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
tile_size=224,
patch_size=14,
@@ -166,5 +166,5 @@ The following methods are required on the model transform:

Example model transforms
------------------------
- Flamingo
- :class:`~torchtune.models.flamingo.FlamingoTransform`
- Llama 3.2 Vision
- :class:`~torchtune.models.llama3_2_vision.Llama3VisionTransform`
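
To make the contract concrete, here is a hedged sketch of a custom model transform (``MyMultimodalTransform`` is illustrative, not part of this PR; the exact required methods are elided in the diff above): it composes a tokenizer with any extra media processing and exposes ``__call__`` over a sample dict.

.. code-block:: python

    from typing import Any, List, Mapping

    from torchtune.data import Message
    from torchtune.models.llama3 import llama3_tokenizer


    class MyMultimodalTransform:
        """Tokenizes text messages and forwards any other sample fields untouched."""

        def __init__(self, tokenizer_path: str, max_seq_len: int = 8192):
            self.tokenizer = llama3_tokenizer(tokenizer_path, max_seq_len=max_seq_len)

        def __call__(self, sample: Mapping[str, Any]) -> Mapping[str, Any]:
            messages: List[Message] = sample["messages"]
            # Text transform - standard tokenization
            tokens, mask = self.tokenizer.tokenize_messages(messages)
            return {**sample, "tokens": tokens, "mask": mask}
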
21 changes: 9 additions & 12 deletions docs/source/basics/multimodal_datasets.rst
@@ -42,15 +42,14 @@ in the text, ``"<image>"`` for where to place the image tokens. This will get re

.. code-block:: python

from torchtune.models.flamingo import FlamingoTransform
from torchtune.models.llama3_2_vision import llama3_2_vision_transform
from torchtune.datasets.multimodal import multimodal_chat_dataset

transform = FlamingoTransform(
transform = llama3_2_vision_transform(
path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
prompt_template="torchtune.data.QuestionAnswerTemplate",
max_seq_len=8192,
tile_size=224,
patch_size=14,
image_size=560,
)
ds = multimodal_chat_dataset(
model_transform=model_transform,
@@ -74,7 +73,7 @@ in the text, ``"<image>"`` for where to place the image tokens. This will get re

# In config - model_transforms takes the place of the tokenizer
model_transform:
_component_: torchtune.models.flamingo
_component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model
prompt_template: torchtune.data.QuestionAnswerTemplate
max_seq_len: 8192
@@ -118,14 +117,13 @@ For most datasets, you will also need to specify the ``split`` and/or the subset
.. code-block:: python

# In code
from torchtune.models.flamingo import FlamingoTransform
from torchtune.models.llama3_2_vision import llama3_2_vision_transform
from torchtune.datasets.multimodal import multimodal_chat_dataset

transform = FlamingoTransform(
transform = llama3_2_vision_transform(
path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
max_seq_len=8192,
tile_size=224,
patch_size=14,
image_size=560,
)
ds = multimodal_chat_dataset(
model_transform=model_transform,
@@ -140,11 +138,10 @@ For most datasets, you will also need to specify the ``split`` and/or the subset

# In config
model_transform:
_component_: torchtune.models.flamingo.FlamingoTransform
_component_: torchtune.models.llama3_2_vision.llama3_2_vision_transform
path: /tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model
max_seq_len: 8192
tile_size: 224
patch_size: 14
image_size: 560

# Tokenizer is passed into the dataset in the recipe
dataset:
77 changes: 77 additions & 0 deletions recipes/configs/llama3_2/1B_full.yaml
@@ -0,0 +1,77 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Llama3.2 1B Instruct model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download meta-llama/Meta-Llama-3.2-1B-Instruct --output-dir /tmp/Meta-Llama-3.2-1B-Instruct --ignore-patterns "original/consolidated.00.pth"
#
# To launch on 4 devices, run the following command from root:
# tune run --nproc_per_node 4 full_finetune_distributed --config llama3_2/1B_full
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nproc_per_node 4 full_finetune_distributed --config llama3_2/1B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single device full finetuning requires more memory optimizations. It's
# best to use 1B_full_single_device.yaml for those cases.


# Tokenizer
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
path: /tmp/Meta-Llama-3.2-1B-Instruct/original/tokenizer.model
max_seq_len: null

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.llama3_2.llama3_2_1b

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Meta-Llama-3.2-1B-Instruct/
checkpoint_files: [
model.safetensors
]
recipe_checkpoint: null
output_dir: /tmp/Meta-Llama-3.2-1B-Instruct/
model_type: LLAMA3
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 4
epochs: 3

optimizer:
_component_: torch.optim.AdamW
lr: 2e-5
fused: True
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 4


# Training environment
device: cuda

# Memory management
enable_activation_checkpointing: False

# Reduced precision
dtype: bf16

# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/full-llama3.2-finetune
log_every_n_steps: 1
log_peak_memory_stats: False
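
For context on how a recipe consumes this file (a hedged sketch, not part of the diff), each ``_component_`` entry is resolved to a callable and instantiated with its sibling keys, e.g. via ``torchtune.config.instantiate``:

.. code-block:: python

    from omegaconf import OmegaConf
    from torchtune import config

    cfg = OmegaConf.load("recipes/configs/llama3_2/1B_full.yaml")

    # Equivalent to torchtune.models.llama3_2.llama3_2_1b()
    model = config.instantiate(cfg.model)

    # Equivalent to torch.optim.AdamW(model.parameters(), lr=2e-5, fused=True)
    optimizer = config.instantiate(cfg.optimizer, model.parameters())
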
103 changes: 103 additions & 0 deletions recipes/configs/llama3_2/1B_full_single_device.yaml
@@ -0,0 +1,103 @@
# Config for single device full finetuning in full_finetune_single_device.py
# using a Llama3.2 1B Instruct model
#
# This config assumes that you've run the following command before launching
# this run:
# tune download meta-llama/Meta-Llama-3.2-1B-Instruct --output-dir /tmp/Meta-Llama-3.2-1B-Instruct --ignore-patterns "original/consolidated.00.pth"
#
# The default config uses an optimizer from bitsandbytes. If you do not have it installed,
# you can install it with
# pip install bitsandbytes
#
# To launch on a single device, run the following command from root:
# tune run full_finetune_single_device --config llama3_2/1B_full_single_device
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run full_finetune_single_device --config llama3_2/1B_full_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.


# Tokenizer
tokenizer:
_component_: torchtune.models.llama3.llama3_tokenizer
path: /tmp/Meta-Llama-3.2-1B-Instruct/original/tokenizer.model
max_seq_len: null

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_dataset
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.llama3_2.llama3_2_1b

checkpointer:
_component_: torchtune.training.FullModelHFCheckpointer
checkpoint_dir: /tmp/Meta-Llama-3.2-1B-Instruct/
checkpoint_files: [
model.safetensors
]
recipe_checkpoint: null
output_dir: /tmp/Meta-Llama-3.2-1B-Instruct/
model_type: LLAMA3
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 4
epochs: 3
optimizer:
_component_: bitsandbytes.optim.PagedAdamW8bit
lr: 2e-5
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
optimizer_in_bwd: True
compile: False

# Training environment
device: cuda

# Memory management
enable_activation_checkpointing: False

# Reduced precision
dtype: bf16

# Logging
metric_logger:
_component_: torchtune.training.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/full-llama3.2-finetune
log_every_n_steps: 1
log_peak_memory_stats: False

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

# Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

# `torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

# trace options passed to `torch.profiler.profile`
profile_memory: True
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 1
warmup_steps: 2
active_steps: 1
num_cycles: 1
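
For reference (a hedged sketch, not part of the diff), the schedule and trace options above map onto ``torch.profiler`` roughly as follows:

.. code-block:: python

    from torch.profiler import ProfilerActivity, profile, schedule

    # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
    prof_schedule = schedule(wait=1, warmup=2, active=1, repeat=1)

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        schedule=prof_schedule,
        profile_memory=True,
        record_shapes=True,
        with_stack=False,
        with_flops=False,
    ) as prof:
        for step in range(8):
            ...  # one training step
            prof.step()  # advance the wait/warmup/active schedule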