From 5cb914070fb269d4f8ab0d450591a76c193fdea6 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Wed, 23 Oct 2024 17:46:09 +0100 Subject: [PATCH 01/20] mm eval tests --- recipes/eleuther_eval.py | 2 + tests/cache_artifacts.sh | 3 + tests/recipes/test_eleuther_eval.py | 67 +++++++++++++++- tests/recipes/utils.py | 78 +++++++++++++++++++ tests/test_utils.py | 2 + torchtune/models/clip/_transform.py | 2 +- .../llama3_2_vision/_component_builders.py | 1 + torchtune/modules/model_fusion/_fusion.py | 5 +- 8 files changed, 157 insertions(+), 3 deletions(-) diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index 590e4f902a..b052ce3547 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -188,6 +188,8 @@ def tok_batch_multimodal_encode( pad_direction="left", pad_max_images=self._max_images_per_sample, ) + import pdb + # pdb.set_trace() utils.batch_to_device(tok_batch, self.device) # Convert the batch to the format expected by the HF diff --git a/tests/cache_artifacts.sh b/tests/cache_artifacts.sh index 81b50b5889..230d26dba0 100755 --- a/tests/cache_artifacts.sh +++ b/tests/cache_artifacts.sh @@ -18,6 +18,9 @@ SMALL_MODEL_URLS=( "https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-03082024.pt" "https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-tune-llama3-05052024.pt" "https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-reward-07122024.pt" + "https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-meta-vision-10172024.pt" + "https://ossci-datasets.s3.amazonaws.com/torchtune/small-ckpt-hf-vision-10172024.pt" + ) FULL_MODEL_URL=("s3://pytorch-multimodal/llama2-7b-torchtune.pt") TOKENIZER_URLS=( diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 1c3a7bb65f..8084fb7452 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -13,7 +13,12 @@ import pytest from tests.common import TUNE_PATH -from tests.recipes.utils import llama2_test_config, write_hf_ckpt_config +from tests.recipes.utils import ( + llama2_test_config, + llama3_2_vision_test_config, + write_hf_ckpt_config, + write_hf_vision_ckpt_config, +) from tests.test_utils import CKPT_MODEL_PATHS @@ -194,3 +199,63 @@ def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir match="QAT quantizers should only be used during quantization aware training", ): runpy.run_path(TUNE_PATH, run_name="__main__") + + @pytest.mark.integration_test + def test_meta_eval_vision(self, capsys, monkeypatch, tmpdir): + ckpt = "llama3_2_vision_meta" + ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) + ckpt_dir = ckpt_path.parent + + cmd = f""" + tune run eleuther_eval \ + --config llama3_2_vision/evaluation \ + output_dir={tmpdir} \ + checkpointer.checkpoint_dir='{ckpt_dir}' \ + checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.output_dir={tmpdir} \ + checkpointer.model_type=LLAMA3_VISION \ + tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.prompt_template=null \ + limit=1 \ + dtype=fp32 \ + device=cpu \ + """.split() + + model_config = llama3_2_vision_test_config() + cmd = cmd + model_config + + monkeypatch.setattr(sys, "argv", cmd) + with pytest.raises(SystemExit, match=""): + runpy.run_path(TUNE_PATH, run_name="__main__") + + @pytest.mark.integration_test + def test_hf_eval_vision(self, capsys, monkeypatch, tmpdir): + ckpt = "llama3_2_vision_hf" + ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) + ckpt_dir = ckpt_path.parent + + # Config file needed for model conversion. 
+ write_hf_vision_ckpt_config(ckpt_dir) + + cmd = f""" + tune run eleuther_eval \ + --config llama3_2_vision/evaluation \ + output_dir={tmpdir} \ + checkpointer=torchtune.training.FullModelHFCheckpointer \ + checkpointer.checkpoint_dir='{ckpt_dir}' \ + checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.output_dir={tmpdir} \ + checkpointer.model_type=LLAMA3_VISION \ + tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ + tokenizer.prompt_template=null \ + limit=1 \ + dtype=fp32 \ + device=cpu \ + """.split() + + model_config = llama3_2_vision_test_config() + cmd = cmd + model_config + + monkeypatch.setattr(sys, "argv", cmd) + with pytest.raises(SystemExit, match=""): + runpy.run_path(TUNE_PATH, run_name="__main__") diff --git a/tests/recipes/utils.py b/tests/recipes/utils.py index a79f5be715..eb0e3419d6 100644 --- a/tests/recipes/utils.py +++ b/tests/recipes/utils.py @@ -128,6 +128,63 @@ def llama3_test_config() -> List[str]: ] +def llama3_2_vision_test_config() -> List[str]: + return [ + "model=tests.recipes.utils.dummy_vision_model", + "tokenizer._component_=torchtune.models.llama3_2_vision._transform.Llama3VisionTransform", + "tokenizer.patch_size=9", + "tokenizer.max_num_tiles=2", + "tokenizer.tile_size=18", + "tokenizer.max_seq_len=4096", + ] + return [ + "model._component_=torchtune.modules.model_fusion.DeepFusionModel", + "model.encoder._component_=torchtune.models.llama3_2_vision._component_builders.llama3_2_vision_encoder", + "model.encoder._component_=torchtune.models.llama3_2_vision._component_builders.llama3_2_vision_decoder", + ] + + +def dummy_vision_model(): + from torchtune.models.llama3_2_vision._component_builders import ( + llama3_2_vision_decoder, + llama3_2_vision_encoder, + ) + from torchtune.modules.model_fusion import DeepFusionModel + + vision_encoder = llama3_2_vision_encoder( + clip_embed_dim=128, + clip_num_layers=4, + num_heads=4, + tile_size=18, + patch_size=9, + max_num_tiles=2, + in_channels=3, + clip_hidden_states=[0, 1], + num_layers_projection=2, + decoder_embed_dim=128, + ) + vision_decoder = llama3_2_vision_decoder( + vocab_size=128256, + num_layers=4, + fusion_interval=2, + num_special_tokens=2, + num_heads=8, + num_kv_heads=4, + embed_dim=128, + max_seq_len=4096, + encoder_max_seq_len=4096, + ) + + model = DeepFusionModel( + encoder=vision_encoder, + decoder=vision_decoder, + encoder_trainable=False, + decoder_trainable=False, + fusion_trainable=False, + ) + return model + + def lora_llama2_test_config( lora_attn_modules, apply_lora_to_mlp: bool = False, @@ -197,6 +254,27 @@ def write_hf_ckpt_config(ckpt_dir: str): json.dump(config, f) +def write_hf_vision_ckpt_config(ckpt_dir: str): + config = { + "text_config": { + "num_attention_heads": 8, # Ensure this matches your expectations + "num_key_value_heads": 4, # This should match your expected key + "hidden_size": 128, # Corresponds to dim + "vocab_size": 128256, + "cross_attention_layers": [1, 4], + }, + "vision_config": { + "hidden_size": 128, # Corresponds to encoder_dim + "image_size": 18, # This corresponds to tile_size + "max_num_tiles": 2, # Corresponds to num_tiles + "supported_aspect_ratios": [[1, 1], [1, 2], [2, 1]], + }, + } + config_file = Path.joinpath(Path(ckpt_dir), "config.json") + with config_file.open("w") as f: + json.dump(config, f) + + MODEL_TEST_CONFIGS = { "llama2": llama2_test_config(), "llama3": llama3_test_config(), diff --git a/tests/test_utils.py b/tests/test_utils.py index 8ba28f1bf4..9adcf58582 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ 
-33,6 +33,8 @@ "llama2_hf": "/tmp/test-artifacts/small-ckpt-hf-03082024.pt", "llama2_reward_hf": "/tmp/test-artifacts/small-ckpt-hf-reward-07122024.pt", "llama3_tune": "/tmp/test-artifacts/small-ckpt-tune-llama3-05052024.pt", + "llama3_2_vision_hf": "/tmp/test-artifacts/small-ckpt-hf-vision-10172024.pt", + "llama3_2_vision_meta": "/tmp/test-artifacts/small-ckpt-meta-vision-10172024.pt", "llama2_7b": "/tmp/test-artifacts/llama2-7b-torchtune.pt", } diff --git a/torchtune/models/clip/_transform.py b/torchtune/models/clip/_transform.py index a9b60624ff..5f966d1c60 100644 --- a/torchtune/models/clip/_transform.py +++ b/torchtune/models/clip/_transform.py @@ -99,7 +99,7 @@ def __init__( possible_resolutions: Optional[List[Tuple[int, int]]] = None, tile_size: int = 224, max_num_tiles: Optional[int] = 4, - dtype: torch.dtype = torch.bfloat16, + dtype: torch.dtype = torch.float32, resample: str = "bilinear", resize_to_max_canvas: bool = False, ) -> None: diff --git a/torchtune/models/llama3_2_vision/_component_builders.py b/torchtune/models/llama3_2_vision/_component_builders.py index 111393501d..14c93ee099 100644 --- a/torchtune/models/llama3_2_vision/_component_builders.py +++ b/torchtune/models/llama3_2_vision/_component_builders.py @@ -157,6 +157,7 @@ def llama3_2_vision_decoder( by :func:`~torchtune.modules.KVCache`. encoder_max_seq_len (int): maximum sequence length the encoder will be run with, as used by :func:`~torchtune.modules.KVCache`. + rope_base (int): base for the rotary positional embeddings. Default: 500_000 intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp`. diff --git a/torchtune/modules/model_fusion/_fusion.py b/torchtune/modules/model_fusion/_fusion.py index 1a5452daae..1393aef8bb 100644 --- a/torchtune/modules/model_fusion/_fusion.py +++ b/torchtune/modules/model_fusion/_fusion.py @@ -271,7 +271,10 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: tokens = torch.masked_select(input, mask) # num_fusion_tokens = (input >= vocab_size).sum() fusion_tokens = torch.masked_select(input, ~mask) - vocab_size - + import pdb + print(tokens.max()) + if input.max() > vocab_size: + pdb.set_trace() # [batch_size x num_tokens x embed_dim] embeds = self.embedding(tokens) # [batch_size x num_fusion_tokens x embed_dim] From 63ba1755b7aa95c4ba5264a75cc67c01124e4270 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Wed, 23 Oct 2024 17:52:44 +0100 Subject: [PATCH 02/20] mm eval tests --- recipes/eleuther_eval.py | 3 +-- torchtune/modules/model_fusion/_fusion.py | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index b052ce3547..36bf490570 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -188,8 +188,7 @@ def tok_batch_multimodal_encode( pad_direction="left", pad_max_images=self._max_images_per_sample, ) - import pdb - # pdb.set_trace() + utils.batch_to_device(tok_batch, self.device) # Convert the batch to the format expected by the HF diff --git a/torchtune/modules/model_fusion/_fusion.py b/torchtune/modules/model_fusion/_fusion.py index 1393aef8bb..1a5452daae 100644 --- a/torchtune/modules/model_fusion/_fusion.py +++ b/torchtune/modules/model_fusion/_fusion.py @@ -271,10 +271,7 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: tokens = torch.masked_select(input, mask) # num_fusion_tokens = (input >= vocab_size).sum() fusion_tokens = torch.masked_select(input, ~mask) - 
vocab_size - import pdb - print(tokens.max()) - if input.max() > vocab_size: - pdb.set_trace() + # [batch_size x num_tokens x embed_dim] embeds = self.embedding(tokens) # [batch_size x num_fusion_tokens x embed_dim] From 578aa48ca36333616de567a1a75a3fed6f5cf2e7 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Fri, 8 Nov 2024 14:10:37 +0000 Subject: [PATCH 03/20] adding test values --- .../llama3_2_vision/11B_evaluation.yaml | 2 +- recipes/eleuther_eval.py | 2 +- tests/recipes/test_eleuther_eval.py | 84 ++++++++++++++----- tests/regression_tests/test_llama2_7b.py | 2 +- 4 files changed, 64 insertions(+), 26 deletions(-) diff --git a/recipes/configs/llama3_2_vision/11B_evaluation.yaml b/recipes/configs/llama3_2_vision/11B_evaluation.yaml index 44f7c5d925..13bbabf549 100644 --- a/recipes/configs/llama3_2_vision/11B_evaluation.yaml +++ b/recipes/configs/llama3_2_vision/11B_evaluation.yaml @@ -7,7 +7,7 @@ # pip install lm_eval==0.4.5 # # To launch, run the following command from root torchtune directory: -# tune run eleuther_eval --config llama3_2_vision/evaluation +# tune run eleuther_eval --config llama3_2_vision/11B_evaluation # Model arguments model: diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index deeffd3934..c07d145175 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -189,7 +189,7 @@ def tok_batch_multimodal_encode( pad_max_images=self._max_images_per_sample, pad_max_tiles=self._transform.max_num_tiles, ) - + utils.batch_to_device(tok_batch, self.device) # Convert the batch to the format expected by the HF diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 8084fb7452..e6db55c1e9 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -31,6 +31,30 @@ class TestEleutherEval: ("truthfulqa_mc2", 0.4, 4), ], ) + @pytest.fixture + def hide_correct_version_number(self, monkeypatch): + import importlib.metadata + + import_orig = importlib.metadata.version + + def mocked_import(name, *args, **kwargs): + if name == "lm-eval": + return "0.4.4" # Hardcode wrong version number + return import_orig(name, *args, **kwargs) + + monkeypatch.setattr(importlib.metadata, "version", mocked_import) + + @pytest.fixture + def expected_vision_acc(self): + return { + "Science": 0.16, + "Biology": 0.4, + "Chemistry": 0.0, + "Geography": 0.2, + "Math": 0.0, + "Physics": 0.2, + } + @pytest.mark.integration_test def test_torchtune_checkpoint_eval_results( self, caplog, monkeypatch, tmpdir, eval_name, expected_acc, bsz @@ -79,22 +103,9 @@ def test_torchtune_checkpoint_eval_results( acc_result = float(search_results.group(1)) assert math.isclose(acc_result, expected_acc, abs_tol=0.05) - @pytest.fixture - def hide_correct_version_number(self, monkeypatch): - import importlib.metadata - - import_orig = importlib.metadata.version - - def mocked_import(name, *args, **kwargs): - if name == "lm-eval": - return "0.4.4" # Hardcode wrong version number - return import_orig(name, *args, **kwargs) - - monkeypatch.setattr(importlib.metadata, "version", mocked_import) - @pytest.mark.integration_test @pytest.mark.usefixtures("hide_correct_version_number") - def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir): + def test_eval_recipe_errors_without_lm_eval(self, monkeypatch, tmpdir): ckpt = "llama2_tune" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) ckpt_dir = ckpt_path.parent @@ -128,7 +139,7 @@ def test_eval_recipe_errors_without_lm_eval(self, capsys, monkeypatch, tmpdir): 
@pytest.mark.integration_test def test_eval_recipe_errors_with_quantization_hf_checkpointer( - self, capsys, monkeypatch, tmpdir + self, monkeypatch, tmpdir ): ckpt = "llama2_hf" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -167,7 +178,7 @@ def test_eval_recipe_errors_with_quantization_hf_checkpointer( runpy.run_path(TUNE_PATH, run_name="__main__") @pytest.mark.integration_test - def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir): + def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): ckpt = "llama2_tune" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) ckpt_dir = ckpt_path.parent @@ -201,22 +212,25 @@ def test_eval_recipe_errors_with_qat_quantizer(self, capsys, monkeypatch, tmpdir runpy.run_path(TUNE_PATH, run_name="__main__") @pytest.mark.integration_test - def test_meta_eval_vision(self, capsys, monkeypatch, tmpdir): + def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) ckpt_dir = ckpt_path.parent cmd = f""" tune run eleuther_eval \ - --config llama3_2_vision/evaluation \ + --config llama3_2_vision/11B_evaluation \ output_dir={tmpdir} \ + checkpointer=torchtune.training.FullModelMetaCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ - checkpointer.checkpoint_files=[{ckpt_path}]\ + checkpointer.checkpoint_files=[{ckpt_path}] \ + ~checkpointer.checkpoint_files.filename_format \ + ~checkpointer.checkpoint_files.max_filename \ checkpointer.output_dir={tmpdir} \ checkpointer.model_type=LLAMA3_VISION \ tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ tokenizer.prompt_template=null \ - limit=1 \ + limit=5 \ dtype=fp32 \ device=cpu \ """.split() @@ -228,8 +242,19 @@ def test_meta_eval_vision(self, capsys, monkeypatch, tmpdir): with pytest.raises(SystemExit, match=""): runpy.run_path(TUNE_PATH, run_name="__main__") + out = caplog.text + + pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)" + + # Find all matches in the table text + matches = re.findall(pattern, out, re.MULTILINE) + + # Print the task names and their corresponding accuracy scores + for task_name, _, accuracy in matches: + assert math.isclose(float(accuracy), expected_vision_acc[task_name]) + @pytest.mark.integration_test - def test_hf_eval_vision(self, capsys, monkeypatch, tmpdir): + def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_hf" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) ckpt_dir = ckpt_path.parent @@ -239,16 +264,18 @@ def test_hf_eval_vision(self, capsys, monkeypatch, tmpdir): cmd = f""" tune run eleuther_eval \ - --config llama3_2_vision/evaluation \ + --config llama3_2_vision/11B_evaluation \ output_dir={tmpdir} \ checkpointer=torchtune.training.FullModelHFCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ checkpointer.checkpoint_files=[{ckpt_path}]\ + ~checkpointer.checkpoint_files.filename_format \ + ~checkpointer.checkpoint_files.max_filename \ checkpointer.output_dir={tmpdir} \ checkpointer.model_type=LLAMA3_VISION \ tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ tokenizer.prompt_template=null \ - limit=1 \ + limit=5 \ dtype=fp32 \ device=cpu \ """.split() @@ -259,3 +286,14 @@ def test_hf_eval_vision(self, capsys, monkeypatch, tmpdir): monkeypatch.setattr(sys, "argv", cmd) with pytest.raises(SystemExit, match=""): runpy.run_path(TUNE_PATH, run_name="__main__") + + out = caplog.text + + pattern = 
r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)" + + # Find all matches in the table text + matches = re.findall(pattern, out, re.MULTILINE) + + # Print the task names and their corresponding accuracy scores + for task_name, _, accuracy in matches: + assert math.isclose(float(accuracy), expected_vision_acc[task_name]) diff --git a/tests/regression_tests/test_llama2_7b.py b/tests/regression_tests/test_llama2_7b.py index cba0a39032..115ab3a121 100644 --- a/tests/regression_tests/test_llama2_7b.py +++ b/tests/regression_tests/test_llama2_7b.py @@ -57,7 +57,7 @@ def test_finetune_and_eval(self, tmpdir, capsys, monkeypatch): checkpointer.checkpoint_files=[torchtune_model_0.pt] \ checkpointer.output_dir={tmpdir} \ tokenizer.path=/tmp/test-artifacts/tokenizer.model \ - tasks=['truthfulqa_mc2'] + tasks=['truthfulqa_mc2'] \ limit=10 \ device=cuda \ """.split() From f0a94d7acb10185b0133ce5dd5148a226e838a18 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Fri, 8 Nov 2024 14:24:00 +0000 Subject: [PATCH 04/20] reverting changes --- recipes/eleuther_eval.py | 1 - tests/regression_tests/test_llama2_7b.py | 2 +- torchtune/models/clip/_transform.py | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index c07d145175..68503ff63c 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -189,7 +189,6 @@ def tok_batch_multimodal_encode( pad_max_images=self._max_images_per_sample, pad_max_tiles=self._transform.max_num_tiles, ) - utils.batch_to_device(tok_batch, self.device) # Convert the batch to the format expected by the HF diff --git a/tests/regression_tests/test_llama2_7b.py b/tests/regression_tests/test_llama2_7b.py index 115ab3a121..cba0a39032 100644 --- a/tests/regression_tests/test_llama2_7b.py +++ b/tests/regression_tests/test_llama2_7b.py @@ -57,7 +57,7 @@ def test_finetune_and_eval(self, tmpdir, capsys, monkeypatch): checkpointer.checkpoint_files=[torchtune_model_0.pt] \ checkpointer.output_dir={tmpdir} \ tokenizer.path=/tmp/test-artifacts/tokenizer.model \ - tasks=['truthfulqa_mc2'] \ + tasks=['truthfulqa_mc2'] limit=10 \ device=cuda \ """.split() diff --git a/torchtune/models/clip/_transform.py b/torchtune/models/clip/_transform.py index 5f966d1c60..a9b60624ff 100644 --- a/torchtune/models/clip/_transform.py +++ b/torchtune/models/clip/_transform.py @@ -99,7 +99,7 @@ def __init__( possible_resolutions: Optional[List[Tuple[int, int]]] = None, tile_size: int = 224, max_num_tiles: Optional[int] = 4, - dtype: torch.dtype = torch.float32, + dtype: torch.dtype = torch.bfloat16, resample: str = "bilinear", resize_to_max_canvas: bool = False, ) -> None: From 60bccc67d87353329ffbb91293f4c642c173805d Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Fri, 8 Nov 2024 14:31:54 +0000 Subject: [PATCH 05/20] whoops --- tests/recipes/test_eleuther_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index e6db55c1e9..28f52e0c4a 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -195,7 +195,7 @@ def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): tokenizer.path=/tmp/test-artifacts/tokenizer.model \ tokenizer.prompt_template=null \ limit=1 \ - dtype=fp32 \ + dtype=bf16 \ device=cpu \ quantizer._component_=torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer \ quantizer.groupsize=32\ @@ -231,7 +231,7 @@ def 
test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ tokenizer.prompt_template=null \ limit=5 \ - dtype=fp32 \ + dtype=bf16 \ device=cpu \ """.split() From 668174965cacebb5bce6e4bf632f36c8ea56aeba Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Fri, 8 Nov 2024 16:08:15 +0000 Subject: [PATCH 06/20] whoops 2 --- tests/recipes/test_eleuther_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 28f52e0c4a..255634e27b 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -195,7 +195,7 @@ def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): tokenizer.path=/tmp/test-artifacts/tokenizer.model \ tokenizer.prompt_template=null \ limit=1 \ - dtype=bf16 \ + dtype=fp32 \ device=cpu \ quantizer._component_=torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer \ quantizer.groupsize=32\ @@ -276,7 +276,7 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ tokenizer.prompt_template=null \ limit=5 \ - dtype=fp32 \ + dtype=bf16 \ device=cpu \ """.split() From d214f521d806ddbf772b9461092863a859f544d7 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Fri, 8 Nov 2024 16:09:37 +0000 Subject: [PATCH 07/20] tidy tidy tidy tidy fresh clean --- tests/recipes/test_eleuther_eval.py | 6 ------ tests/recipes/utils.py | 17 ++++++----------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 255634e27b..78c39e128f 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -246,10 +246,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)" - # Find all matches in the table text matches = re.findall(pattern, out, re.MULTILINE) - - # Print the task names and their corresponding accuracy scores for task_name, _, accuracy in matches: assert math.isclose(float(accuracy), expected_vision_acc[task_name]) @@ -291,9 +288,6 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): pattern = r"^\|\s*(?:-\s*)?([^\|]+?)\s*\|\s*(\d+)\s*\|.*?\|.*?\|acc\s*\|\s*↑\s*\|\s*([\d.]+)" - # Find all matches in the table text matches = re.findall(pattern, out, re.MULTILINE) - - # Print the task names and their corresponding accuracy scores for task_name, _, accuracy in matches: assert math.isclose(float(accuracy), expected_vision_acc[task_name]) diff --git a/tests/recipes/utils.py b/tests/recipes/utils.py index 5696a289fa..7c35eedc2a 100644 --- a/tests/recipes/utils.py +++ b/tests/recipes/utils.py @@ -137,11 +137,6 @@ def llama3_2_vision_test_config() -> List[str]: "tokenizer.tile_size=18", "tokenizer.max_seq_len=4096", ] - return [ - "model._component_=torchtune.modules.model_fusion.DeepFusionModel", - "model.encoder._component_=torchtune.models.llama3_2_vision._component_builders.llama3_2_vision_encoder", - "model.encoder._component_=torchtune.models.llama3_2_vision._component_builders.llama3_2_vision_decoder", - ] def dummy_vision_model(): @@ -259,16 +254,16 @@ def write_hf_ckpt_config(ckpt_dir: str): def write_hf_vision_ckpt_config(ckpt_dir: str): config = { "text_config": { - "num_attention_heads": 8, # Ensure this 
matches your expectations - "num_key_value_heads": 4, # This should match your expected key - "hidden_size": 128, # Corresponds to dim + "num_attention_heads": 8, + "num_key_value_heads": 4, + "hidden_size": 128, "vocab_size": 128256, "cross_attention_layers": [1, 4], }, "vision_config": { - "hidden_size": 128, # Corresponds to encoder_dim - "image_size": 18, # This corresponds to tile_size - "max_num_tiles": 2, # Corresponds to num_tiles + "hidden_size": 128, + "image_size": 18, + "max_num_tiles": 2, "supported_aspect_ratios": [[1, 1], [1, 2], [2, 1]], }, } From e3155a1684a7242f60bfbba974870c9f2eee714c Mon Sep 17 00:00:00 2001 From: salman Date: Fri, 8 Nov 2024 23:48:23 +0000 Subject: [PATCH 08/20] what is this rounding nonesense? --- tests/recipes/test_eleuther_eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 78c39e128f..45acf5ca99 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -47,7 +47,7 @@ def mocked_import(name, *args, **kwargs): @pytest.fixture def expected_vision_acc(self): return { - "Science": 0.16, + "Science": 0.2, "Biology": 0.4, "Chemistry": 0.0, "Geography": 0.2, From 7add9af226d6145d3486a877ad4c46c60e492163 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Sat, 9 Nov 2024 11:01:27 +0000 Subject: [PATCH 09/20] fixing values --- tests/recipes/test_eleuther_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 45acf5ca99..ec71fe3312 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -51,8 +51,8 @@ def expected_vision_acc(self): "Biology": 0.4, "Chemistry": 0.0, "Geography": 0.2, - "Math": 0.0, - "Physics": 0.2, + "Math": 0.4, + "Physics": 0.0, } @pytest.mark.integration_test From c3246c02af3a3833b1deac41b72f39714c5c347c Mon Sep 17 00:00:00 2001 From: salman Date: Sat, 9 Nov 2024 11:50:49 +0000 Subject: [PATCH 10/20] fixing parameterize --- tests/recipes/test_eleuther_eval.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index ec71fe3312..e0e526bd3e 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -23,14 +23,6 @@ class TestEleutherEval: - @pytest.mark.parametrize( - "eval_name, expected_acc, bsz", - [ - ("truthfulqa_gen", 0.1, 4), - ("truthfulqa_gen", 0.1, 1), - ("truthfulqa_mc2", 0.4, 4), - ], - ) @pytest.fixture def hide_correct_version_number(self, monkeypatch): import importlib.metadata @@ -55,6 +47,14 @@ def expected_vision_acc(self): "Physics": 0.0, } + @pytest.mark.parametrize( + "eval_name, expected_acc, bsz", + [ + ("truthfulqa_gen", 0.1, 4), + ("truthfulqa_gen", 0.1, 1), + ("truthfulqa_mc2", 0.4, 4), + ], + ) @pytest.mark.integration_test def test_torchtune_checkpoint_eval_results( self, caplog, monkeypatch, tmpdir, eval_name, expected_acc, bsz From e3f8178792e2ee8541e2dda5063485ce1bdaace7 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Mon, 11 Nov 2024 20:24:40 +0000 Subject: [PATCH 11/20] just put it on teh gpu? 
--- tests/recipes/test_eleuther_eval.py | 10 +++++++--- tests/recipes/utils.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index ec71fe3312..047b8d3715 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -19,7 +19,7 @@ write_hf_ckpt_config, write_hf_vision_ckpt_config, ) -from tests.test_utils import CKPT_MODEL_PATHS +from tests.test_utils import CKPT_MODEL_PATHS, gpu_test class TestEleutherEval: @@ -212,6 +212,7 @@ def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): runpy.run_path(TUNE_PATH, run_name="__main__") @pytest.mark.integration_test + @gpu_test(gpu_count=1) def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -232,7 +233,8 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc tokenizer.prompt_template=null \ limit=5 \ dtype=bf16 \ - device=cpu \ + device=cuda \ + max_seq_len=1024 \ """.split() model_config = llama3_2_vision_test_config() @@ -251,6 +253,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc assert math.isclose(float(accuracy), expected_vision_acc[task_name]) @pytest.mark.integration_test + @gpu_test(gpu_count=1) def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_hf" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -274,7 +277,8 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): tokenizer.prompt_template=null \ limit=5 \ dtype=bf16 \ - device=cpu \ + device=cuda \ + max_seq_len=1024 \ """.split() model_config = llama3_2_vision_test_config() diff --git a/tests/recipes/utils.py b/tests/recipes/utils.py index 7c35eedc2a..6f94dd2399 100644 --- a/tests/recipes/utils.py +++ b/tests/recipes/utils.py @@ -135,7 +135,7 @@ def llama3_2_vision_test_config() -> List[str]: "tokenizer.patch_size=9", "tokenizer.max_num_tiles=2", "tokenizer.tile_size=18", - "tokenizer.max_seq_len=4096", + "tokenizer.max_seq_len=1024", ] From ed3f02eb3cfe7c6a1ec65ce4e5b12ca86707f37f Mon Sep 17 00:00:00 2001 From: salman Date: Tue, 12 Nov 2024 00:31:47 +0000 Subject: [PATCH 12/20] what a silly billy I am oh boy --- tests/recipes/test_eleuther_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 4e9f21fea1..91030fae04 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -234,7 +234,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc limit=5 \ dtype=bf16 \ device=cuda \ - max_seq_len=1024 \ + max_seq_length=1024 \ """.split() model_config = llama3_2_vision_test_config() @@ -278,7 +278,7 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): limit=5 \ dtype=bf16 \ device=cuda \ - max_seq_len=1024 \ + max_seq_length=1024 \ """.split() model_config = llama3_2_vision_test_config() From 8de33503ac5030cea306d9d7a46d3b6668a96958 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 12 Nov 2024 13:52:40 +0000 Subject: [PATCH 13/20] is it a python version thing? 
--- .github/workflows/gpu_test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 67b4a0705a..5cc78a8ea7 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -27,7 +27,7 @@ jobs: runs-on: linux.8xlarge.nvidia.gpu strategy: matrix: - python-version: ['3.9', '3.10', '3.11'] + python-version: ['3.11'] torch-version: ["stable", "nightly"] # Do not run against nightlies on PR exclude: From 3424c320446d2752288b4f9d6a44e1461bb318e6 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 12 Nov 2024 14:09:23 +0000 Subject: [PATCH 14/20] it is NOT. BACK TO THE CPU --- .github/workflows/gpu_test.yaml | 2 +- tests/recipes/test_eleuther_eval.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.github/workflows/gpu_test.yaml b/.github/workflows/gpu_test.yaml index 5cc78a8ea7..67b4a0705a 100644 --- a/.github/workflows/gpu_test.yaml +++ b/.github/workflows/gpu_test.yaml @@ -27,7 +27,7 @@ jobs: runs-on: linux.8xlarge.nvidia.gpu strategy: matrix: - python-version: ['3.11'] + python-version: ['3.9', '3.10', '3.11'] torch-version: ["stable", "nightly"] # Do not run against nightlies on PR exclude: diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 91030fae04..deef430c45 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -19,7 +19,7 @@ write_hf_ckpt_config, write_hf_vision_ckpt_config, ) -from tests.test_utils import CKPT_MODEL_PATHS, gpu_test +from tests.test_utils import CKPT_MODEL_PATHS class TestEleutherEval: @@ -212,7 +212,6 @@ def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): runpy.run_path(TUNE_PATH, run_name="__main__") @pytest.mark.integration_test - @gpu_test(gpu_count=1) def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -233,7 +232,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc tokenizer.prompt_template=null \ limit=5 \ dtype=bf16 \ - device=cuda \ + device=cpu \ max_seq_length=1024 \ """.split() @@ -253,7 +252,6 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc assert math.isclose(float(accuracy), expected_vision_acc[task_name]) @pytest.mark.integration_test - @gpu_test(gpu_count=1) def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_hf" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -277,7 +275,7 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): tokenizer.prompt_template=null \ limit=5 \ dtype=bf16 \ - device=cuda \ + device=cpu \ max_seq_length=1024 \ """.split() From abca4d13014fd60553435a32dc6b3879414f3d2d Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 12 Nov 2024 14:44:13 +0000 Subject: [PATCH 15/20] back to gpu.. it's a max_seq_len thing?? 
--- tests/recipes/test_eleuther_eval.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index deef430c45..972ccfadb0 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -19,7 +19,7 @@ write_hf_ckpt_config, write_hf_vision_ckpt_config, ) -from tests.test_utils import CKPT_MODEL_PATHS +from tests.test_utils import CKPT_MODEL_PATHS, gpu_test class TestEleutherEval: @@ -212,6 +212,7 @@ def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): runpy.run_path(TUNE_PATH, run_name="__main__") @pytest.mark.integration_test + @gpu_test(gpu_count=1) def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -232,8 +233,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc tokenizer.prompt_template=null \ limit=5 \ dtype=bf16 \ - device=cpu \ - max_seq_length=1024 \ + device=cuda \ """.split() model_config = llama3_2_vision_test_config() @@ -252,6 +252,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc assert math.isclose(float(accuracy), expected_vision_acc[task_name]) @pytest.mark.integration_test + @gpu_test(gpu_count=1) def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_hf" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -275,8 +276,7 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): tokenizer.prompt_template=null \ limit=5 \ dtype=bf16 \ - device=cpu \ - max_seq_length=1024 \ + device=cuda \ """.split() model_config = llama3_2_vision_test_config() From 5ab8f838f2b0f70829fe9329b8c2aa6f96ccf1d5 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 12 Nov 2024 14:53:59 +0000 Subject: [PATCH 16/20] that didn't work... 
--- tests/recipes/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/recipes/utils.py b/tests/recipes/utils.py index 6f94dd2399..7c35eedc2a 100644 --- a/tests/recipes/utils.py +++ b/tests/recipes/utils.py @@ -135,7 +135,7 @@ def llama3_2_vision_test_config() -> List[str]: "tokenizer.patch_size=9", "tokenizer.max_num_tiles=2", "tokenizer.tile_size=18", - "tokenizer.max_seq_len=1024", + "tokenizer.max_seq_len=4096", ] From 19c029eb1166f9429dcdd6645c74ea4d45992329 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 12 Nov 2024 15:08:06 +0000 Subject: [PATCH 17/20] this is a terrible experience for me --- tests/recipes/test_eleuther_eval.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 972ccfadb0..4a861ad399 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -234,6 +234,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc limit=5 \ dtype=bf16 \ device=cuda \ + max_seq_length=4096 \ """.split() model_config = llama3_2_vision_test_config() @@ -277,6 +278,7 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): limit=5 \ dtype=bf16 \ device=cuda \ + max_seq_length=4096 \ """.split() model_config = llama3_2_vision_test_config() From a691a0862d4cfceec35e1d2e2024fe38190ad62c Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 12 Nov 2024 15:27:20 +0000 Subject: [PATCH 18/20] stg if this doesn't work --- tests/recipes/test_eleuther_eval.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 4a861ad399..e0e526bd3e 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -19,7 +19,7 @@ write_hf_ckpt_config, write_hf_vision_ckpt_config, ) -from tests.test_utils import CKPT_MODEL_PATHS, gpu_test +from tests.test_utils import CKPT_MODEL_PATHS class TestEleutherEval: @@ -212,7 +212,6 @@ def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): runpy.run_path(TUNE_PATH, run_name="__main__") @pytest.mark.integration_test - @gpu_test(gpu_count=1) def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -233,8 +232,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc tokenizer.prompt_template=null \ limit=5 \ dtype=bf16 \ - device=cuda \ - max_seq_length=4096 \ + device=cpu \ """.split() model_config = llama3_2_vision_test_config() @@ -253,7 +251,6 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc assert math.isclose(float(accuracy), expected_vision_acc[task_name]) @pytest.mark.integration_test - @gpu_test(gpu_count=1) def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_hf" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -277,8 +274,7 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): tokenizer.prompt_template=null \ limit=5 \ dtype=bf16 \ - device=cuda \ - max_seq_length=4096 \ + device=cpu \ """.split() model_config = llama3_2_vision_test_config() From 3bb57faeeebf0fcfa989ba69bc7e443dc237b9bb Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 12 Nov 2024 16:10:35 +0000 Subject: [PATCH 19/20] I don't even know at this point --- tests/recipes/test_eleuther_eval.py | 24 
+++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index e0e526bd3e..68ffca0f82 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -19,7 +19,7 @@ write_hf_ckpt_config, write_hf_vision_ckpt_config, ) -from tests.test_utils import CKPT_MODEL_PATHS +from tests.test_utils import CKPT_MODEL_PATHS, gpu_test class TestEleutherEval: @@ -39,12 +39,12 @@ def mocked_import(name, *args, **kwargs): @pytest.fixture def expected_vision_acc(self): return { - "Science": 0.2, - "Biology": 0.4, - "Chemistry": 0.0, - "Geography": 0.2, - "Math": 0.4, - "Physics": 0.0, + "Science": 0.35, + "Biology": 0.25, + "Chemistry": 0.25, + "Geography": 0.5, + "Math": 0.0, + "Physics": 0.75, } @pytest.mark.parametrize( @@ -212,6 +212,7 @@ def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): runpy.run_path(TUNE_PATH, run_name="__main__") @pytest.mark.integration_test + @gpu_test(gpu_count=1) def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -230,9 +231,9 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc checkpointer.model_type=LLAMA3_VISION \ tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ tokenizer.prompt_template=null \ - limit=5 \ + limit=4 \ dtype=bf16 \ - device=cpu \ + device=cuda \ """.split() model_config = llama3_2_vision_test_config() @@ -251,6 +252,7 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc assert math.isclose(float(accuracy), expected_vision_acc[task_name]) @pytest.mark.integration_test + @gpu_test(gpu_count=1) def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_hf" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -272,9 +274,9 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): checkpointer.model_type=LLAMA3_VISION \ tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ tokenizer.prompt_template=null \ - limit=5 \ + limit=4 \ dtype=bf16 \ - device=cpu \ + device=cuda \ """.split() model_config = llama3_2_vision_test_config() From 76ff0fde5c32bf09b87c3b37e608ac50230819d7 Mon Sep 17 00:00:00 2001 From: Salman Mohammadi Date: Tue, 12 Nov 2024 16:29:45 +0000 Subject: [PATCH 20/20] OKAY this should work right? 
--- README.md | 1 - docs/source/api_ref_models.rst | 31 -- .../source/tutorials/memory_optimizations.rst | 4 +- recipes/configs/gemma2/27B_full.yaml | 74 ---- recipes/configs/gemma2/27B_lora.yaml | 86 ---- .../gemma2/27B_lora_single_device.yaml | 112 ----- .../gemma2/27B_qlora_single_device.yaml | 115 ----- recipes/configs/gemma2/2B_full.yaml | 76 ---- recipes/configs/gemma2/2B_lora.yaml | 88 ---- .../configs/gemma2/2B_lora_single_device.yaml | 114 ----- .../gemma2/2B_qlora_single_device.yaml | 114 ----- recipes/configs/gemma2/9B_full.yaml | 74 ---- recipes/configs/gemma2/9B_lora.yaml | 86 ---- .../configs/gemma2/9B_lora_single_device.yaml | 112 ----- .../gemma2/9B_qlora_single_device.yaml | 115 ----- recipes/configs/llama2/7B_qat_full.yaml | 2 +- recipes/configs/llama3/8B_qat_full.yaml | 9 +- recipes/lora_finetune_single_device.py | 1 + recipes/qat_distributed.py | 306 +++---------- tests/recipes/test_eleuther_eval.py | 40 +- torchtune/_recipe_registry.py | 30 -- torchtune/generation/_generation.py | 20 +- torchtune/models/gemma/__init__.py | 2 + torchtune/models/gemma/_component_builders.py | 6 + torchtune/models/gemma2/__init__.py | 36 -- torchtune/models/gemma2/_attention.py | 339 -------------- .../models/gemma2/_component_builders.py | 413 ------------------ torchtune/models/gemma2/_convert_weights.py | 132 ------ torchtune/models/gemma2/_model_builders.py | 286 ------------ .../training/checkpointing/_checkpointer.py | 20 - torchtune/training/checkpointing/_utils.py | 2 - 31 files changed, 115 insertions(+), 2731 deletions(-) delete mode 100644 recipes/configs/gemma2/27B_full.yaml delete mode 100644 recipes/configs/gemma2/27B_lora.yaml delete mode 100644 recipes/configs/gemma2/27B_lora_single_device.yaml delete mode 100644 recipes/configs/gemma2/27B_qlora_single_device.yaml delete mode 100644 recipes/configs/gemma2/2B_full.yaml delete mode 100644 recipes/configs/gemma2/2B_lora.yaml delete mode 100644 recipes/configs/gemma2/2B_lora_single_device.yaml delete mode 100644 recipes/configs/gemma2/2B_qlora_single_device.yaml delete mode 100644 recipes/configs/gemma2/9B_full.yaml delete mode 100644 recipes/configs/gemma2/9B_lora.yaml delete mode 100644 recipes/configs/gemma2/9B_lora_single_device.yaml delete mode 100644 recipes/configs/gemma2/9B_qlora_single_device.yaml delete mode 100644 torchtune/models/gemma2/__init__.py delete mode 100644 torchtune/models/gemma2/_attention.py delete mode 100644 torchtune/models/gemma2/_component_builders.py delete mode 100644 torchtune/models/gemma2/_convert_weights.py delete mode 100644 torchtune/models/gemma2/_model_builders.py diff --git a/README.md b/README.md index 2d885a3779..31fc280e04 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,6 @@ torchtune currently supports the following models. 
| [Code-Llama2](https://ai.meta.com/blog/code-llama-large-language-model-coding/) | 7B, 13B, 70B [[models](torchtune/models/code_llama2/_model_builders.py), [configs](recipes/configs/code_llama2/)] | | [Mistral](https://huggingface.co/mistralai) | 7B [[models](torchtune/models/mistral/_model_builders.py), [configs](recipes/configs/mistral/)] | | [Gemma](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) | 2B, 7B [[models](torchtune/models/gemma/_model_builders.py), [configs](recipes/configs/gemma/)] | -| [Gemma2](https://huggingface.co/docs/transformers/main/en/model_doc/gemma2) | 2B, 9B, 27B [[models](torchtune/models/gemma2/_model_builders.py), [configs](recipes/configs/gemma2/)] | | [Microsoft Phi3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3) | Mini [[models](torchtune/models/phi3/), [configs](recipes/configs/phi3/)] | [Qwen2](https://qwenlm.github.io/blog/qwen2/) | 0.5B, 1.5B, 7B [[models](torchtune/models/qwen2/), [configs](recipes/configs/qwen2/)] diff --git a/docs/source/api_ref_models.rst b/docs/source/api_ref_models.rst index b2d74022b1..36175bb392 100644 --- a/docs/source/api_ref_models.rst +++ b/docs/source/api_ref_models.rst @@ -361,37 +361,6 @@ To download the Gemma 7B model: gemma.gemma_tokenizer -gemma2 : --------- - -Models of size 2B, 9B, 27B from the `Gemma family `_. - -Important: You need to request access on `Hugging Face `__ to use this model. - -To download the Gemma2 2B, 9B, 27B models : - -.. code-block:: bash - - tune download google/gemma-2-b --ignore-patterns "gemma-2-b.gguf" --hf-token - - -.. autosummary:: - :toctree: generated/ - :nosignatures: - - gemma2.gemma2 - gemma2.lora_gemma2 - gemma2.gemma2_2b - gemma2.lora_gemma2_2b - gemma2.qlora_gemma2_2b - gemma2.gemma2_9b - gemma2.lora_gemma2_9b - gemma2.qlora_gemma2_9b - gemma2.gemma2_27b - gemma2.lora_gemma2_27b - gemma2.qlora_gemma2_27b - gemma.gemma_tokenizer - clip ---- diff --git a/docs/source/tutorials/memory_optimizations.rst b/docs/source/tutorials/memory_optimizations.rst index a0f6d16c91..aa75024e6a 100644 --- a/docs/source/tutorials/memory_optimizations.rst +++ b/docs/source/tutorials/memory_optimizations.rst @@ -167,7 +167,7 @@ In addition to :ref:`reducing model and optimizer precision All of our recipes support lower-precision optimizers from the `torchao `_ library. For single device recipes, we also support `bitsandbytes `_. -A good place to start might be the :class:`torchao.prototype.low_bit_optim.AdamW8bit` and :class:`bitsandbytes.optim.PagedAdamW8bit` optimizers. +A good place to start might be the :class:`torchao.prototype.low_bit_optim.torchao.AdamW8bit` and :class:`bitsandbytes.optim.PagedAdamW8bit` optimizers. Both reduce memory by quantizing the optimizer state dict. Paged optimizers will also offload to CPU if there isn't enough GPU memory available. In practice, you can expect higher memory savings from bnb's PagedAdamW8bit but higher training speed from torchao's AdamW8bit. @@ -180,7 +180,7 @@ a low precision optimizer using the :ref:`cli_label`: .. code-block:: bash tune run --config \ - optimizer=torchao.prototype.low_bit_optim.AdamW8bit + optimizer=torchao.prototype.low_bit_optim.torchao.AdamW8bit .. 
code-block:: bash diff --git a/recipes/configs/gemma2/27B_full.yaml b/recipes/configs/gemma2/27B_full.yaml deleted file mode 100644 index ddc89b38b2..0000000000 --- a/recipes/configs/gemma2/27B_full.yaml +++ /dev/null @@ -1,74 +0,0 @@ -# Config for multi-device full finetuning in full_finetune_distributed.py -# using a gemma2 27B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-27b --ignore-patterns "gemma-2-27b.gguf" --hf-token -# -# To launch on 4 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config gemma2/27B_full -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config gemma2/27B_full checkpointer.checkpoint_dir= -# -# This config works only when the model is being fine-tuned on 2+ GPUs. - - -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-27b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.gemma2_27b - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-27b/ - checkpoint_files: - filename_format: model-{}-of-{}.safetensors - max_filename: "00024" - recipe_checkpoint: null - output_dir: /tmp/gemma-2-27b - model_type: GEMMA2 -resume_from_checkpoint: False - -# Fine-tuning arguments -batch_size: 1 -epochs: 1 -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -max_steps_per_epoch: null -gradient_accumulation_steps: 1 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-27b-finetune -log_every_n_steps: 1 -log_peak_memory_stats: True diff --git a/recipes/configs/gemma2/27B_lora.yaml b/recipes/configs/gemma2/27B_lora.yaml deleted file mode 100644 index a138441199..0000000000 --- a/recipes/configs/gemma2/27B_lora.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# Config for multi-device LoRA finetuning in lora_finetune_distributed.py -# using a gemma2 27B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-27b --ignore-patterns "gemma-2-27b.gguf" --hf-token -# -# To launch on 4 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config gemma2/27B_lora -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config gemma2/27B_lora checkpointer.checkpoint_dir= -# -# This config works only when the model is being fine-tuned on 2+ GPUs. 
- - -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-27b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.lora_gemma2_27b - lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] - apply_lora_to_mlp: True - lora_rank: 64 - lora_alpha: 128 - lora_dropout: 0.0 - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-27b/ - checkpoint_files: - filename_format: model-{}-of-{}.safetensors - max_filename: "00024" - recipe_checkpoint: null - output_dir: /tmp/gemma-2-27b/ - model_type: GEMMA2 -resume_from_checkpoint: False -save_adapter_weights_only: False - -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 - -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 10 - -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss - -# Fine-tuning arguments -batch_size: 4 -epochs: 3 -max_steps_per_epoch: null -gradient_accumulation_steps: 1 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-27b-lora -log_every_n_steps: 1 -log_peak_memory_stats: True diff --git a/recipes/configs/gemma2/27B_lora_single_device.yaml b/recipes/configs/gemma2/27B_lora_single_device.yaml deleted file mode 100644 index 577b0715c5..0000000000 --- a/recipes/configs/gemma2/27B_lora_single_device.yaml +++ /dev/null @@ -1,112 +0,0 @@ -# Config for multi-device LoRA finetuning in lora_finetune_single_device.py -# using a gemma2 27B model -# -# This config assumes that you've run the following command before launching -# this run (torchtune does not use gguf so you can ignore it to save time and space): -# tune download google/gemma-2-27b --ignore-patterns "gemma-2-27b.gguf" --hf-token -# -# To launch on a single device, run the following command from root: -# tune run lora_finetune_single_device --config gemma2/27B_lora_single_device -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run lora_finetune_single_device --config gemma2/27B_lora_single_device checkpointer.checkpoint_dir= -# -# This config works only for training on single device. 
- -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-27b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.lora_gemma2_27b - lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] - apply_lora_to_mlp: True - lora_rank: 8 - lora_alpha: 16 - lora_dropout: 0.0 - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-27b/ - checkpoint_files: - filename_format: model-{}-of-{}.safetensors - max_filename: "00024" - recipe_checkpoint: null - output_dir: /tmp/gemma-2-27b/ - model_type: GEMMA2 -resume_from_checkpoint: False -save_adapter_weights_only: False - -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 5e-5 - -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 10 - -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss - -# Fine-tuning arguments -batch_size: 2 -epochs: 1 -max_steps_per_epoch: null -gradient_accumulation_steps: 8 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True -enable_activation_offloading: False - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-27b-lora -log_every_n_steps: 1 -log_peak_memory_stats: True - -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training -profiler: - _component_: torchtune.training.setup_torch_profiler - enabled: False - - #Output directory of trace artifacts - output_dir: ${output_dir}/profiling_outputs - - #`torch.profiler.ProfilerActivity` types to trace - cpu: True - cuda: True - - #trace options passed to `torch.profiler.profile` - profile_memory: False - with_stack: False - record_shapes: True - with_flops: False - - # `torch.profiler.schedule` options: - # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat - wait_steps: 5 - warmup_steps: 5 - active_steps: 2 - num_cycles: 1 diff --git a/recipes/configs/gemma2/27B_qlora_single_device.yaml b/recipes/configs/gemma2/27B_qlora_single_device.yaml deleted file mode 100644 index 14d9b75ba7..0000000000 --- a/recipes/configs/gemma2/27B_qlora_single_device.yaml +++ /dev/null @@ -1,115 +0,0 @@ -# Config for multi-device QLoRA finetuning in lora_finetune_single_device.py -# using a gemma2 27B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-27b --ignore-patterns "gemma-2-27b.gguf" --hf-token -# -# To launch on a single device, run the following command from root: -# tune run lora_finetune_single_device --config gemma2/27B_qlora_single_device -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run lora_finetune_single_device --config gemma2/27B_qlora_single_device checkpointer.checkpoint_dir= -# -# This config works only for training on single device. 
- -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-27b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.qlora_gemma2_27b - lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] - apply_lora_to_mlp: True - lora_rank: 64 - lora_alpha: 128 - lora_dropout: 0.0 - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-27b/ - checkpoint_files: - filename_format: model-{}-of-{}.safetensors - max_filename: "00024" - recipe_checkpoint: null - output_dir: /tmp/gemma-2-27b/ - model_type: GEMMA2 -resume_from_checkpoint: False -save_adapter_weights_only: False - -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 - -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 10 - -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss - -# Fine-tuning arguments -batch_size: 4 -epochs: 3 -max_steps_per_epoch: null -gradient_accumulation_steps: 4 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True -enable_activation_offloading: False - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-27b-lora -log_every_n_steps: 1 -log_peak_memory_stats: True - -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training -profiler: - _component_: torchtune.training.setup_torch_profiler - enabled: False - - #Output directory of trace artifacts - output_dir: ${output_dir}/profiling_outputs - - #`torch.profiler.ProfilerActivity` types to trace - cpu: True - cuda: True - - #trace options passed to `torch.profiler.profile` - profile_memory: False - with_stack: False - record_shapes: True - with_flops: False - - # `torch.profiler.schedule` options: - # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat - wait_steps: 5 - warmup_steps: 5 - active_steps: 2 - num_cycles: 1 - -# For colab use True -low_cpu_ram: False diff --git a/recipes/configs/gemma2/2B_full.yaml b/recipes/configs/gemma2/2B_full.yaml deleted file mode 100644 index e302dd759d..0000000000 --- a/recipes/configs/gemma2/2B_full.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# Config for multi-device full finetuning in full_finetune_distributed.py -# using a gemma2 2B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-2b --ignore-patterns "gemma-2-2b.gguf" --hf-token -# -# To launch on 4 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config gemma2/2B_full -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config gemma2/2B_full checkpointer.checkpoint_dir= -# -# This config works only when the model is being fine-tuned on 2+ GPUs. 
- - -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-2b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.gemma2_2b - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-2b/ - checkpoint_files: [ - model-00001-of-00003.safetensors, - model-00002-of-00003.safetensors, - model-00003-of-00003.safetensors, - ] - recipe_checkpoint: null - output_dir: /tmp/gemma-2-2b - model_type: GEMMA2 -resume_from_checkpoint: False - -# Fine-tuning arguments -batch_size: 2 -epochs: 3 -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -max_steps_per_epoch: null -gradient_accumulation_steps: 1 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-finetune -log_every_n_steps: 1 -log_peak_memory_stats: True diff --git a/recipes/configs/gemma2/2B_lora.yaml b/recipes/configs/gemma2/2B_lora.yaml deleted file mode 100644 index 9a439ee0a3..0000000000 --- a/recipes/configs/gemma2/2B_lora.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Config for multi-device LoRA finetuning in lora_finetune_distributed.py -# using a gemma2 2B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-2b --ignore-patterns "gemma-2-2b.gguf" --hf-token -# -# To launch on 4 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config gemma2/2B_lora -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config gemma2/2B_lora checkpointer.checkpoint_dir= -# -# This config works only when the model is being fine-tuned on 2+ GPUs. 
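The "tune run --nnodes 1 --nproc_per_node 4" launch described above spawns one process per GPU, much like torchrun. The sketch below only illustrates the environment a launcher provides to each worker (RANK/WORLD_SIZE plus a process group); it is not the recipe's actual setup code:

import os

import torch
import torch.distributed as dist


def init_worker() -> int:
    # A distributed launcher exports RANK/WORLD_SIZE (and MASTER_ADDR/MASTER_PORT)
    # for every spawned process; fall back to a single-process run otherwise.
    rank = int(os.environ.get("RANK", "0"))
    world_size = int(os.environ.get("WORLD_SIZE", "1"))
    if world_size > 1 and not dist.is_initialized():
        backend = "nccl" if torch.cuda.is_available() else "gloo"
        dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
    return rank


if __name__ == "__main__":
    print(f"worker initialized as rank {init_worker()}")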
- -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-2b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.lora_gemma2_2b - lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] - apply_lora_to_mlp: True - lora_rank: 64 - lora_alpha: 128 - lora_dropout: 0.0 - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-2b/ - checkpoint_files: [ - model-00001-of-00003.safetensors, - model-00002-of-00003.safetensors, - model-00003-of-00003.safetensors, - ] - recipe_checkpoint: null - output_dir: /tmp/gemma-2-2b - model_type: GEMMA2 -resume_from_checkpoint: False - -save_adapter_weights_only: False - -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 - -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 10 - -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss - -# Fine-tuning arguments -batch_size: 4 -epochs: 3 -max_steps_per_epoch: null -gradient_accumulation_steps: 1 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-lora -log_every_n_steps: 1 -log_peak_memory_stats: True diff --git a/recipes/configs/gemma2/2B_lora_single_device.yaml b/recipes/configs/gemma2/2B_lora_single_device.yaml deleted file mode 100644 index 1a2703fb47..0000000000 --- a/recipes/configs/gemma2/2B_lora_single_device.yaml +++ /dev/null @@ -1,114 +0,0 @@ -# Config for multi-device LoRA finetuning in lora_finetune_single_device.py -# using a gemma2 2B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-2b --ignore-patterns "gemma-2-2b.gguf" --hf-token -# -# To launch on a single device, run the following command from root: -# tune run lora_finetune_single_device --config gemma2/2B_lora_single_device -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run lora_finetune_single_device --config gemma2/2B_lora_single_device checkpointer.checkpoint_dir= -# -# This config works only for training on single device. 
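For the lora_rank/lora_alpha settings used throughout these configs, the update applied to each adapted projection is y = Wx + (alpha/rank) * B(Ax), with W frozen and only A and B trained. A minimal sketch of that idea (not torchtune's LoRALinear implementation) follows:

import torch
import torch.nn as nn


class TinyLoRALinear(nn.Module):
    """Frozen base linear plus a trainable low-rank update, scaled by alpha / rank."""

    def __init__(self, in_dim: int, out_dim: int, rank: int = 64, alpha: float = 128.0):
        super().__init__()
        self.base = nn.Linear(in_dim, out_dim, bias=False)
        self.base.weight.requires_grad_(False)  # pretrained weight stays frozen
        self.lora_a = nn.Linear(in_dim, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_dim, bias=False)
        nn.init.zeros_(self.lora_b.weight)  # adapter starts as a no-op
        self.scaling = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(x))


print(TinyLoRALinear(16, 32, rank=8, alpha=16.0)(torch.randn(2, 16)).shape)  # torch.Size([2, 32])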
- -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-2b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.lora_gemma2_2b - lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] - apply_lora_to_mlp: True - lora_rank: 64 - lora_alpha: 128 - lora_dropout: 0.0 - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-2b/ - checkpoint_files: [ - model-00001-of-00003.safetensors, - model-00002-of-00003.safetensors, - model-00003-of-00003.safetensors, - ] - recipe_checkpoint: null - output_dir: /tmp/gemma-2-2b - model_type: GEMMA2 -resume_from_checkpoint: False -save_adapter_weights_only: False - -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 - -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 10 - -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss - -# Fine-tuning arguments -batch_size: 8 -epochs: 3 -max_steps_per_epoch: null -gradient_accumulation_steps: 2 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True -enable_activation_offloading: False - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-lora -log_every_n_steps: 1 -log_peak_memory_stats: True - -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training -profiler: - _component_: torchtune.training.setup_torch_profiler - enabled: False - - #Output directory of trace artifacts - output_dir: ${output_dir}/profiling_outputs - - #`torch.profiler.ProfilerActivity` types to trace - cpu: True - cuda: True - - #trace options passed to `torch.profiler.profile` - profile_memory: False - with_stack: False - record_shapes: True - with_flops: False - - # `torch.profiler.schedule` options: - # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat - wait_steps: 5 - warmup_steps: 5 - active_steps: 2 - num_cycles: 1 diff --git a/recipes/configs/gemma2/2B_qlora_single_device.yaml b/recipes/configs/gemma2/2B_qlora_single_device.yaml deleted file mode 100644 index c2525460ff..0000000000 --- a/recipes/configs/gemma2/2B_qlora_single_device.yaml +++ /dev/null @@ -1,114 +0,0 @@ -# Config for multi-device QLoRA finetuning in lora_finetune_single_device.py -# using a gemma2 2B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-2b --ignore-patterns "gemma-2-2b.gguf" --hf-token -# -# To launch on a single device, run the following command from root: -# tune run lora_finetune_single_device --config gemma2/2B_qlora_single_device -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run lora_finetune_single_device --config gemma2/2B_qlora_single_device checkpointer.checkpoint_dir= -# -# This config works only for training on single device. 
- -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-2b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.qlora_gemma2_2b - lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] - apply_lora_to_mlp: True - lora_rank: 64 - lora_alpha: 128 - lora_dropout: 0.0 - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-2b/ - checkpoint_files: [ - model-00001-of-00003.safetensors, - model-00002-of-00003.safetensors, - model-00003-of-00003.safetensors, - ] - recipe_checkpoint: null - output_dir: /tmp/gemma-2-2b - model_type: GEMMA2 -resume_from_checkpoint: False -save_adapter_weights_only: False - -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 - -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 10 - -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss - -# Fine-tuning arguments -batch_size: 4 -epochs: 3 -max_steps_per_epoch: null -gradient_accumulation_steps: 4 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True -enable_activation_offloading: False - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-lora -log_every_n_steps: 1 -log_peak_memory_stats: True - -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training -profiler: - _component_: torchtune.training.setup_torch_profiler - enabled: False - - #Output directory of trace artifacts - output_dir: ${output_dir}/profiling_outputs - - #`torch.profiler.ProfilerActivity` types to trace - cpu: True - cuda: True - - #trace options passed to `torch.profiler.profile` - profile_memory: False - with_stack: False - record_shapes: True - with_flops: False - - # `torch.profiler.schedule` options: - # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat - wait_steps: 5 - warmup_steps: 5 - active_steps: 2 - num_cycles: 1 diff --git a/recipes/configs/gemma2/9B_full.yaml b/recipes/configs/gemma2/9B_full.yaml deleted file mode 100644 index 0fc7e6e4e4..0000000000 --- a/recipes/configs/gemma2/9B_full.yaml +++ /dev/null @@ -1,74 +0,0 @@ -# Config for multi-device full finetuning in full_finetune_distributed.py -# using a gemma2 9B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-9b --ignore-patterns "gemma-2-9b.gguf" --hf-token -# -# To launch on 4 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config gemma2/9B_full -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config gemma2/9B_full checkpointer.checkpoint_dir= -# -# This config works only when the model is being fine-tuned on 2+ GPUs. 
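The qlora_gemma2_* builders referenced in these configs differ from their LoRA counterparts by keeping the frozen base weights in a low-precision (NF4) representation and dequantizing them on the fly. The toy below only shows the structure of that idea, with int8 standing in for NF4 so the sketch stays dependency-free:

import torch
import torch.nn as nn


class ToyQuantizedLoRALinear(nn.Module):
    """Low-precision frozen weight, dequantized per forward pass, plus a LoRA update."""

    def __init__(self, in_dim: int, out_dim: int, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        w = torch.randn(out_dim, in_dim)
        scale = w.abs().amax(dim=1, keepdim=True) / 127.0
        self.register_buffer("w_q", torch.round(w / scale).to(torch.int8))
        self.register_buffer("w_scale", scale)
        self.lora_a = nn.Parameter(torch.randn(rank, in_dim) * 0.01)
        self.lora_b = nn.Parameter(torch.zeros(out_dim, rank))
        self.scaling = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = self.w_q.to(x.dtype) * self.w_scale  # dequantize for the matmul
        return x @ w.T + self.scaling * (x @ self.lora_a.T) @ self.lora_b.T


print(ToyQuantizedLoRALinear(16, 32)(torch.randn(2, 16)).shape)  # torch.Size([2, 32])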
- - -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-9b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.gemma2_9b - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-9b/ - checkpoint_files: - filename_format: model-{}-of-{}.safetensors - max_filename: "00008" - recipe_checkpoint: null - output_dir: /tmp/gemma-2-9b - model_type: GEMMA2 -resume_from_checkpoint: False - -# Fine-tuning arguments -batch_size: 1 -epochs: 1 -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -max_steps_per_epoch: null -gradient_accumulation_steps: 1 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-9b-finetune -log_every_n_steps: 1 -log_peak_memory_stats: True diff --git a/recipes/configs/gemma2/9B_lora.yaml b/recipes/configs/gemma2/9B_lora.yaml deleted file mode 100644 index 960e4fa881..0000000000 --- a/recipes/configs/gemma2/9B_lora.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# Config for multi-device LoRA finetuning in lora_finetune_distributed.py -# using a gemma2 9B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-9b --ignore-patterns "gemma-2-9b.gguf" --hf-token -# -# To launch on 4 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config gemma2/9B_lora -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed --config gemma2/9B_lora checkpointer.checkpoint_dir= -# -# This config works only when the model is being fine-tuned on 2+ GPUs. 
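CEWithChunkedOutputLoss, used as the loss in these configs, computes cross-entropy over chunks of the output so the full upcast [batch * seq, vocab] logits never have to be materialized at once. A simplified sketch of that computation (not the torchtune class itself):

from typing import List

import torch
import torch.nn.functional as F


def chunked_cross_entropy(
    logit_chunks: List[torch.Tensor],  # each [b, s_chunk, vocab]
    label_chunks: List[torch.Tensor],  # each [b, s_chunk]
    ignore_index: int = -100,
) -> torch.Tensor:
    total_loss = torch.zeros(())
    total_tokens = 0
    for logits, labels in zip(logit_chunks, label_chunks):
        # Upcast and flatten one chunk at a time instead of the whole sequence.
        total_loss = total_loss + F.cross_entropy(
            logits.float().reshape(-1, logits.size(-1)),
            labels.reshape(-1),
            ignore_index=ignore_index,
            reduction="sum",
        )
        total_tokens += (labels != ignore_index).sum().item()
    return total_loss / max(total_tokens, 1)


logits = list(torch.randn(2, 8, 32).chunk(4, dim=1))
labels = list(torch.randint(0, 32, (2, 8)).chunk(4, dim=1))
print(chunked_cross_entropy(logits, labels))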
- - -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-9b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.lora_gemma2_9b - lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] - apply_lora_to_mlp: True - lora_rank: 64 - lora_alpha: 128 - lora_dropout: 0.0 - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-9b/ - checkpoint_files: - filename_format: model-{}-of-{}.safetensors - max_filename: "00008" - recipe_checkpoint: null - output_dir: /tmp/gemma-2-9b/ - model_type: GEMMA2 -resume_from_checkpoint: False -save_adapter_weights_only: False - -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 - -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 10 - -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss - -# Fine-tuning arguments -batch_size: 4 -epochs: 3 -max_steps_per_epoch: null -gradient_accumulation_steps: 1 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-9b-lora -log_every_n_steps: 1 -log_peak_memory_stats: True diff --git a/recipes/configs/gemma2/9B_lora_single_device.yaml b/recipes/configs/gemma2/9B_lora_single_device.yaml deleted file mode 100644 index e9d6c22a73..0000000000 --- a/recipes/configs/gemma2/9B_lora_single_device.yaml +++ /dev/null @@ -1,112 +0,0 @@ -# Config for multi-device LoRA finetuning in lora_finetune_single_device.py -# using a gemma2 9B model -# -# This config assumes that you've run the following command before launching -# this run (torchtune does not use gguf so you can ignore it to save time and space): -# tune download google/gemma-2-9b --ignore-patterns "gemma-2-9b.gguf" --hf-token -# -# To launch on a single device, run the following command from root: -# tune run lora_finetune_single_device --config gemma2/9B_lora_single_device -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run lora_finetune_single_device --config gemma2/9B_lora_single_device checkpointer.checkpoint_dir= -# -# This config works only for training on single device. 
- -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-9b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.lora_gemma2_9b - lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] - apply_lora_to_mlp: True - lora_rank: 8 - lora_alpha: 16 - lora_dropout: 0.0 - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-9b/ - checkpoint_files: - filename_format: model-{}-of-{}.safetensors - max_filename: "00008" - recipe_checkpoint: null - output_dir: /tmp/gemma-2-9b/ - model_type: GEMMA2 -resume_from_checkpoint: False -save_adapter_weights_only: False - -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 5e-5 - -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 10 - -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss - -# Fine-tuning arguments -batch_size: 8 -epochs: 1 -max_steps_per_epoch: null -gradient_accumulation_steps: 2 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True -enable_activation_offloading: False - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-9b-lora -log_every_n_steps: 1 -log_peak_memory_stats: True - -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training -profiler: - _component_: torchtune.training.setup_torch_profiler - enabled: False - - #Output directory of trace artifacts - output_dir: ${output_dir}/profiling_outputs - - #`torch.profiler.ProfilerActivity` types to trace - cpu: True - cuda: True - - #trace options passed to `torch.profiler.profile` - profile_memory: False - with_stack: False - record_shapes: True - with_flops: False - - # `torch.profiler.schedule` options: - # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat - wait_steps: 5 - warmup_steps: 5 - active_steps: 2 - num_cycles: 1 diff --git a/recipes/configs/gemma2/9B_qlora_single_device.yaml b/recipes/configs/gemma2/9B_qlora_single_device.yaml deleted file mode 100644 index 8991ba9ece..0000000000 --- a/recipes/configs/gemma2/9B_qlora_single_device.yaml +++ /dev/null @@ -1,115 +0,0 @@ -# Config for multi-device QLoRA finetuning in lora_finetune_single_device.py -# using a gemma2 9B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download google/gemma-2-9b --ignore-patterns "gemma-2-9b.gguf" --hf-token -# -# To launch on a single device, run the following command from root: -# tune run lora_finetune_single_device --config gemma2/9B_qlora_single_device -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run lora_finetune_single_device --config gemma2/9B_qlora_single_device checkpointer.checkpoint_dir= -# -# This config works only for training on single device. 
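The profiler block that appears in the single-device configs maps its wait_steps/warmup_steps/active_steps/num_cycles values onto a torch.profiler schedule. A standalone sketch of the same wiring (this mirrors, but is not, torchtune's setup_torch_profiler helper):

import torch
from torch.profiler import ProfilerActivity, profile, schedule

# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
sched = schedule(wait=5, warmup=5, active=2, repeat=1)

with profile(
    activities=[ProfilerActivity.CPU],
    schedule=sched,
    record_shapes=True,
    profile_memory=False,
    with_stack=False,
) as prof:
    for _ in range(14):  # enough steps to cover one wait + warmup + active cycle
        torch.matmul(torch.randn(64, 64), torch.randn(64, 64))
        prof.step()

print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=5))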
- -# Tokenizer -tokenizer: - _component_: torchtune.models.gemma.gemma_tokenizer - path: /tmp/gemma-2-9b/tokenizer.model - -# Dataset -dataset: - packed: False # Set to true for great speed ups - _component_: torchtune.datasets.alpaca_dataset -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.gemma2.qlora_gemma2_9b - lora_attn_modules: ['q_proj', 'k_proj', 'v_proj'] - apply_lora_to_mlp: True - lora_rank: 64 - lora_alpha: 128 - lora_dropout: 0.0 - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/gemma-2-9b/ - checkpoint_files: - filename_format: model-{}-of-{}.safetensors - max_filename: "00008" - recipe_checkpoint: null - output_dir: /tmp/gemma-2-9b/ - model_type: GEMMA2 -resume_from_checkpoint: False -save_adapter_weights_only: False - -optimizer: - _component_: torch.optim.AdamW - fused: True - lr: 2e-5 - -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 10 - -loss: - _component_: torchtune.modules.loss.CEWithChunkedOutputLoss - -# Fine-tuning arguments -batch_size: 4 -epochs: 3 -max_steps_per_epoch: null -gradient_accumulation_steps: 4 -compile: False # pytorch compile, set to true for perf/memory improvement - -# Training env -device: cuda - -# Memory management -enable_activation_checkpointing: True -enable_activation_offloading: False - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-gemma2-9b-lora -log_every_n_steps: 1 -log_peak_memory_stats: True - -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training -profiler: - _component_: torchtune.training.setup_torch_profiler - enabled: False - - #Output directory of trace artifacts - output_dir: ${output_dir}/profiling_outputs - - #`torch.profiler.ProfilerActivity` types to trace - cpu: True - cuda: True - - #trace options passed to `torch.profiler.profile` - profile_memory: False - with_stack: False - record_shapes: True - with_flops: False - - # `torch.profiler.schedule` options: - # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat - wait_steps: 5 - warmup_steps: 5 - active_steps: 2 - num_cycles: 1 - -# For colab use True -low_cpu_ram: False diff --git a/recipes/configs/llama2/7B_qat_full.yaml b/recipes/configs/llama2/7B_qat_full.yaml index e404b0c4dc..0cbf6c7b7a 100644 --- a/recipes/configs/llama2/7B_qat_full.yaml +++ b/recipes/configs/llama2/7B_qat_full.yaml @@ -67,7 +67,7 @@ device: cuda # Memory management enable_activation_checkpointing: True # True reduces memory -enable_activation_offloading: False # True reduces memory +memory_efficient_fsdp_wrap: False # Reduced precision dtype: bf16 diff --git a/recipes/configs/llama3/8B_qat_full.yaml b/recipes/configs/llama3/8B_qat_full.yaml index 2b08cbb10f..ce409d1bbb 100644 --- a/recipes/configs/llama3/8B_qat_full.yaml +++ b/recipes/configs/llama3/8B_qat_full.yaml @@ -44,6 +44,8 @@ resume_from_checkpoint: False # Fine-tuning arguments batch_size: 2 epochs: 3 +compile: False # pytorch compile, set to true for better perf/memory +optimizer_in_bwd: False # True saves memory. 
Requires gradient_accumulation_steps=1 # QAT arguments quantizer: @@ -58,16 +60,13 @@ loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 # Use to increase virtual batch size -compile: False # pytorch compile, set to true for better perf/memory -optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 # Training env device: cuda # Memory management enable_activation_checkpointing: True # True reduces memory -enable_activation_offloading: False # True reduces memory -custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed. +memory_efficient_fsdp_wrap: True # Reduced precision dtype: bf16 @@ -76,7 +75,7 @@ dtype: bf16 metric_logger: _component_: torchtune.training.metric_logging.DiskLogger log_dir: ${output_dir} -output_dir: /tmp/full-llama3-finetune +output_dir: /tmp/alpaca-llama3-finetune log_every_n_steps: 1 log_peak_memory_stats: True diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index daf0ea8cdc..c84830a9f3 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -631,6 +631,7 @@ def save_checkpoint(self, epoch: int) -> None: def _loss_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: # Shape [b, s], needed for the loss not the model labels = batch.pop("labels") + # run model with self.activations_handling_ctx: logits = self._model(**batch) diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py index b1040880d0..f09ffc1c7b 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -4,6 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import os import sys import time @@ -20,13 +21,11 @@ from torch.optim import Optimizer from torch.utils.data import DataLoader, DistributedSampler from torchtune import config, modules, training, utils -from torchtune.config._utils import _get_component_from_path -from torchtune.data import padded_collate_packed +from torchtune.data import padded_collate_packed, padded_collate_sft from torchtune.datasets import ConcatDataset from torchtune.recipe_interfaces import FTRecipeInterface from torchtune.training import DummyProfiler, PROFILER_KEY from torchtune.training.activations import apply_selective_activation_checkpointing -from torchtune.training.lr_schedulers import get_lr from tqdm import tqdm @@ -51,7 +50,7 @@ class QATRecipeDistributed(FTRecipeInterface): to improved quantized accuracy. This can be specified through ``fake_quant_after_n_steps``. - FSDP. Supported using PyTorch's FSDP APIs. CPU offload of parameters, gradients, and optimizer states - is supported via ``fsdp_cpu_offload``. Resharding of parameters after the forward pass is + is supported via the ``fsdp_cpu_offload``. Resharding of parameters after the forward pass is done by default (corresponding to FULL_SHARD sharding strategy), but can be disabled by setting the config ``fsdp_reshard_after_forward`` to False (this corresponds to SHARD_GRAD_OP sharding strategy). DDP is currently not supported. Training on CPU is not supported. @@ -63,18 +62,6 @@ class QATRecipeDistributed(FTRecipeInterface): come at the cost of training performance. In most cases training can slow-down quite a bit as a result of this activation recomputation. - - Activation Offloading. 
This can be controlled using the ``enable_activation_offloading`` - flag. Activation offloading is a technique similar to activations checkpointing that helps - reduce the memory footprint to prevent OOMs on CUDA and enable bigger batches. Where activations - checkpointing drops the activation in the forward to recompute it later in the backward, - activations offloading will drop the activation in the forward to the CPU and bring it - back during the backward pass. As always, there is a tradeoff--these savings in memory can - come at the cost of training performance and CPU resources. To recover some runtime cost, - we've added an option to enable offloading on a different stream to permit overlapping with - the computation. This option is currently only available on PyTorch 2.5 or later and will - be enabled by default if an acceptable torch version is found. Activation offloading can be - used in conjunction with activation checkpointing. - - Precision. Full fp32 and bf16 training are supported. Precision is controlled using the ``dtype`` flag. When ``dtype=bf16``, all activations, gradients and optimizer states are in bfloat16. In most cases this should halve the memory footprint of full precision (fp32) training, without @@ -106,10 +93,6 @@ class QATRecipeDistributed(FTRecipeInterface): - Logging. Terminal, Disk, WandB and TensorBoard are all supported. - - Gradient Clipping. Gradient clipping is supported using the ``clip_grad_norm`` flag. By default, - ``clip_grad_norm`` is set to ``None``. If you only want to log the grad norm, you can set - ``clip_grad_norm='inf'``. - For a full list of example configs for this recipe, run ``tune ls`` on the command line. Each config has example commands for how to kick-off training. @@ -119,9 +102,6 @@ class QATRecipeDistributed(FTRecipeInterface): Raises: ValueError: If ``dtype`` is set to fp16. RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16. - RuntimeError: If ``left_pad_sequence`` is set as the data collator. - RuntimeError: If ``enable_activation_offloading`` is True and device is not CUDA. - RuntimeError: If ``enable_activation_offloading`` is True and ``enable_activation_checkpointing`` is False. """ def __init__(self, cfg: DictConfig) -> None: @@ -161,50 +141,12 @@ def __init__(self, cfg: DictConfig) -> None: # Training cfg self._resume_from_checkpoint = cfg.resume_from_checkpoint self._gradient_accumulation_steps = cfg.gradient_accumulation_steps - self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False) - self._clip_grad_norm = cfg.get("clip_grad_norm", None) + self._fsdp_sharding_strategy = torch.distributed.fsdp.ShardingStrategy[ + cfg.get("fsdp_sharding_strategy", "FULL_SHARD") + ] self._fake_quant_after_n_steps = cfg.get("fake_quant_after_n_steps", None) self._quantizer_mode = None - # Optimizer in backward is not compatible with gradient accumulation or gradient clipping - if self._optimizer_in_bwd: - if self._clip_grad_norm is not None: - raise RuntimeError( - "Gradient clipping is not supported with optimizer in bwd." - "Please set clip_grad_norm=None, or optimizer_in_bwd=False." - ) - if self._gradient_accumulation_steps > 1: - raise RuntimeError( - "Gradient accumulation is not supported with optimizer in bwd." - "Please set gradient_accumulation_steps=1, or optimizer_in_bwd=False." 
- ) - - # activation checkpointing/offloading - self._enable_activation_checkpointing = cfg.get( - "enable_activation_checkpointing", False - ) - self._enable_activation_offloading = cfg.get( - "enable_activation_offloading", False - ) - if self._enable_activation_offloading: - if self._device.type != "cuda": - raise RuntimeError( - "enable_activation_offloading should only be True when training on CUDA" - ) - if not self._enable_activation_checkpointing: - raise RuntimeError( - "enable_activation_offloading should only be True when enable_activation_checkpointing is True" - ) - elif ( - self._enable_activation_checkpointing - and cfg.checkpointer.model_type != "LLAMA3_VISION" - ): - utils.log_rank_zero( - log, - "Hint: enable_activation_checkpointing is True, but enable_activation_offloading isn't. " - "Enabling activation offloading should reduce memory further.", - ) - # These are public properties which are updated by the checkpoint loader # when ``resume_from_checkpoint`` is `True` or validated in tests self.seed = training.set_seed(seed=cfg.seed) @@ -281,11 +223,10 @@ def setup(self, cfg: DictConfig) -> None: checkpoint_dict = self.load_checkpoint(cfg_checkpointer=cfg.checkpointer) - self._compile = cfg.get("compile", False) + self._model_compile = cfg.get("compile", False) self._model = self._setup_model( cfg_model=cfg.model, - enable_activation_checkpointing=self._enable_activation_checkpointing, - enable_activation_offloading=self._enable_activation_offloading, + enable_activation_checkpointing=cfg.enable_activation_checkpointing, custom_sharded_layers=cfg.get("custom_sharded_layers", None), fsdp_cpu_offload=cfg.get("fsdp_cpu_offload", False), reshard_after_forward=cfg.get("fsdp_reshard_after_forward", True), @@ -298,7 +239,6 @@ def setup(self, cfg: DictConfig) -> None: self._optimizer = self._setup_optimizer( cfg_optimizer=cfg.optimizer, - optimizer_in_bwd=self._optimizer_in_bwd, opt_state_dict=( checkpoint_dict[training.OPT_KEY] if self._resume_from_checkpoint @@ -308,25 +248,30 @@ def setup(self, cfg: DictConfig) -> None: # initialize loss self._loss_fn = config.instantiate(cfg.loss) - - if self._compile: - training.compile_loss(self._loss_fn, verbose=self._is_rank_zero) - + backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor") if self._loss_fn.__class__.__name__ == "CEWithChunkedOutputLoss": # set num_output_chunks for model self._model.set_num_output_chunks(self._loss_fn.num_output_chunks) - - if self._is_rank_zero: - log.info("Loss is initialized.") + if self._model_compile: + log.info("Compiling loss with torch.compile...") + # For CEWithChunkedOutputLoss, if we compile the entire class + # we lose the benefits from the chunked loss. 
+ # Therefore, we only compile the cross entropy function + upcasting + self._loss_fn.compute_cross_entropy = torch.compile( + self._loss_fn.compute_cross_entropy, backend=backend + ) + else: + if self._model_compile: + log.info("Compiling loss with torch.compile...") + self._loss_fn = torch.compile(self._loss_fn, backend=backend) + log.info("Loss is initialized.") # sampler and dataloader depend on the tokenizer and loss_fn and should be # setup after both of these are initialized - collate_name = cfg.get("collate_fn", "torchtune.data.padded_collate_sft") self._sampler, self._dataloader = self._setup_data( cfg_dataset=cfg.dataset, shuffle=cfg.shuffle, batch_size=cfg.batch_size, - collate_fn=collate_name, ) # Finally update the recipe state which can only be correctly set after all of the @@ -426,7 +371,6 @@ def _setup_model( self, cfg_model: DictConfig, enable_activation_checkpointing: bool, - enable_activation_offloading: bool, fsdp_cpu_offload: bool, reshard_after_forward: bool, model_state_dict: Dict[str, Any], @@ -452,9 +396,6 @@ def _setup_model( with training.set_default_dtype(self._dtype), torch.device("meta"): model = config.instantiate(cfg_model) - if self._compile: - training.compile_model(model, verbose=self._is_rank_zero) - # We currently have two versions of activation checkpointing in this recipe # for testing and BC purposes. ``enable_activation_checkpointing`` controls # the older version of AC and this behavior is unchanged @@ -510,17 +451,7 @@ def _setup_model( # This method will convert the full model state dict into a sharded state # dict and load into the model training.load_from_full_model_state_dict( - model, - model_state_dict, - self._device, - self._is_rank_zero, - strict=True, - cpu_offload=fsdp_cpu_offload, - ) - - # activation offloading - self.activations_handling_ctx = training.get_act_offloading_ctx_manager( - model, enable_activation_offloading + model, model_state_dict, self._device, self._is_rank_zero, strict=True ) # Ensure no params and buffers are on meta device @@ -539,64 +470,25 @@ def _setup_model( return model def _setup_optimizer( - self, - cfg_optimizer: DictConfig, - optimizer_in_bwd: bool = False, - opt_state_dict: Optional[Dict[str, Any]] = None, - ) -> Optional[Optimizer]: - if optimizer_in_bwd: - # Maintain a dict of optims for every parameter. - optim_dict = { - param: config.instantiate(cfg_optimizer, [param]) - for param in self._model.parameters() - } - - # Register optimizer step hooks on the model to run optimizer in backward. - training.register_optim_in_bwd_hooks( - model=self._model, optim_dict=optim_dict + self, cfg_optimizer: DictConfig, opt_state_dict: Optional[Dict[str, Any]] = None + ) -> Optimizer: + optimizer = config.instantiate(cfg_optimizer, self._model.parameters()) + if opt_state_dict: + training.load_from_full_optimizer_state_dict( + optimizer, + opt_state_dict, + self._device, ) - # Create a wrapper for checkpoint save/load of optimizer states when running in backward. - self._optim_ckpt_wrapper = training.create_optim_in_bwd_wrapper( - model=self._model, optim_dict=optim_dict - ) - # Load optimizer states for each param. If optimizer states are being restored in an optimizer in - # backward run, these need to have been saved with the same setting. Cannot restore from runs that - # did not use optimizer in backward. 
- if opt_state_dict is not None: - for param in opt_state_dict.keys(): - try: - training.load_from_full_optimizer_state_dict( - self._optim_ckpt_wrapper.state_dict()[param], - opt_state_dict[param], - self._device, - ) - except BaseException as e: - raise RuntimeError( - "Failed loading in-backward optimizer checkpoints." - "Please make sure run being restored from was using in-backward optimizer." - ) from e - if self._is_rank_zero: - log.info("In-backward optimizers are set up.") - return None - else: - optimizer = config.instantiate(cfg_optimizer, self._model.parameters()) - if opt_state_dict: - training.load_from_full_optimizer_state_dict( - optimizer, - opt_state_dict, - self._device, - ) - if self._is_rank_zero: - log.info("Optimizer is initialized.") - return optimizer + if self._is_rank_zero: + log.info("Optimizer is initialized.") + return optimizer def _setup_data( self, cfg_dataset: DictConfig, shuffle: bool, batch_size: int, - collate_fn: str, ) -> Tuple[DistributedSampler, DataLoader]: """ All data related setup happens here. Currently this recipe only supports the @@ -607,20 +499,15 @@ def _setup_data( if isinstance(cfg_dataset, ListConfig): datasets = [ - config.instantiate(single_cfg_dataset, self._tokenizer) + config.instantiate(single_cfg_dataset, tokenizer=self._tokenizer) for single_cfg_dataset in cfg_dataset ] ds = ConcatDataset(datasets=datasets) packed = False else: - ds = config.instantiate(cfg_dataset, self._tokenizer) + ds = config.instantiate(cfg_dataset, tokenizer=self._tokenizer) packed = cfg_dataset.get("packed", False) - # Instantiate collate_fn - if "left_pad_sequence" in collate_fn: - raise RuntimeError("left_pad_sequence collator is only for inference.") - collate_fn = _get_component_from_path(collate_fn) - sampler = DistributedSampler( ds, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=0 ) @@ -632,12 +519,14 @@ def _setup_data( drop_last=True, collate_fn=( partial( - collate_fn, + padded_collate_sft, padding_idx=self._tokenizer.pad_id, ignore_idx=self._loss_fn.ignore_index, ) if not packed - else padded_collate_packed + else partial( + padded_collate_packed, + ) ), ) @@ -664,54 +553,25 @@ def save_checkpoint( checkpoint_dict = {} intermediate_checkpoint = epoch + 1 < self.total_epochs - - if self._is_rank_zero: - log.info( - "Saving checkpoint. This may take some time. Retrieving full model state dict..." 
- ) - start = time.perf_counter() - # To prevent GPU memory from spiking during checkpoint save, # we consolidate the full model and optim state dicts on CPU for rank 0 cpu_state_dict = training.get_full_model_state_dict( self._model, self._is_rank_zero, - device=self._device, ) - if self._is_rank_zero: - log.info( - f"Getting full model state dict took {time.perf_counter() - start:.2f} secs" - ) - if intermediate_checkpoint: - start = time.perf_counter() - if self._is_rank_zero: - log.info("Getting optimizer state dict...") - if not self._optimizer_in_bwd: - opt_state_dict = training.get_full_optimizer_state_dict( - self._optimizer, - self._is_rank_zero, - device=self._device, - ) - else: - opt_state_dict = {} - for param, opt in self._optim_ckpt_wrapper.optim_map.items(): - opt_state_dict[param] = training.get_full_optimizer_state_dict( - opt, self._is_rank_zero, device=self._device - ) - if self._is_rank_zero: - log.info( - f"Getting optimizer state dict took {time.perf_counter() - start:.2f} secs" - ) + opt_state_dict = training.get_full_optimizer_state_dict( + self._optimizer, + self._is_rank_zero, + ) else: opt_state_dict = None # Now that we have the model and opt state dict, create the actual checkpoint dict # to be sent to the checkpointer and ultimately written to file - if self._is_rank_zero: - start = time.perf_counter() + checkpoint_dict.update({training.MODEL_KEY: cpu_state_dict}) # if training is in-progress, checkpoint the optimizer state and recipe state @@ -732,9 +592,6 @@ def save_checkpoint( epoch=epoch, intermediate_checkpoint=intermediate_checkpoint, ) - log.info(f"Saving checkpoint took {time.perf_counter() - start:.2f} secs") - - torch.distributed.barrier() def train(self) -> None: """ @@ -742,15 +599,10 @@ def train(self) -> None: """ # clean up before training begins training.cleanup_before_training() - world_size, rank = training.get_world_size_and_rank() # zero out the gradients before starting training - if not self._optimizer_in_bwd: - self._optimizer.zero_grad() - else: - for opt in self._optim_ckpt_wrapper.optim_map.values(): - opt.zero_grad() + self._optimizer.zero_grad() # Initialize tokens count and running loss (for grad accumulation) t0 = time.perf_counter() @@ -760,6 +612,7 @@ def train(self) -> None: self._profiler.start() # self.epochs_run should be non-zero when we're resuming from a checkpoint for curr_epoch in range(self.epochs_run, self.total_epochs): + # Update the sampler to ensure data is correctly shuffled across epochs # in case shuffle is True self._sampler.set_epoch(curr_epoch) @@ -782,6 +635,13 @@ def train(self) -> None: ): torch.cuda.memory._record_memory_history() + # Both are shape [b, s] + tokens, labels = batch["tokens"], batch["labels"] + # Get the attention mask and position ids from the dataset if they + # exist. 
Currently, only sample packing in PackedDataset returns these + mask = batch.get("mask", None) # shape [b, s, s] + input_pos = batch.get("input_pos", None) # shape [b, s] + # Optionally wait N steps before enabling fake quant if self._fake_quant_after_n_steps is not None: if self.global_step == 0: @@ -803,20 +663,20 @@ def train(self) -> None: ) self._model.apply(enable_fq) - utils.batch_to_device(batch, self._device) + tokens = tokens.to(self._device) # Calculate the number of unmasked tokens in the current batch # and increment the total number of tokens seen in the step + + utils.batch_to_device(batch, self._device) + current_num_tokens = ( batch["labels"] != self._loss_fn.ignore_index ).sum() num_tokens += current_num_tokens - - # Shape [b, s], needed for the loss not the model labels = batch.pop("labels") - with self.activations_handling_ctx: - logits = self._model(**batch) + logits = self._model(**batch) # Shift labels to compute loss # equivalent to doing labels[..., 1:] and logits[..., :-1, :] @@ -829,40 +689,25 @@ def train(self) -> None: logits = logits.reshape(-1, logits.size(-1)) # Compute loss - # Loss is normalized by default so we multiply by the number of tokens - # This way we can normalize by the total number of tokens if we're accumulating gradients current_loss = self._loss_fn(logits, labels) * current_num_tokens # free logits otherwise it peaks backward memory del logits running_loss += current_loss - - # For optimizer in backward, we need to normalize before calling backward - # This case and gradient accumulation are mutually exclusive - if self._optimizer_in_bwd: - torch.distributed.all_reduce(num_tokens) - torch.distributed.all_reduce(running_loss) - current_loss = current_loss / num_tokens - current_loss.backward() # Step with optimizer if (idx + 1) % self._gradient_accumulation_steps == 0: - if not self._optimizer_in_bwd: - # Get total number of tokens across all ranks to normalize gradients - torch.distributed.all_reduce(num_tokens) - # This will ensure that the logged loss matches what we're optimizing - torch.distributed.all_reduce(running_loss) - # Manually scale the gradients from unnormalized loss by total # of tokens - training.scale_grads(self._model, 1 / num_tokens) - if self._clip_grad_norm is not None: - grad_norm = torch.nn.utils.clip_grad_norm_( - self._model.parameters(), - max_norm=float(self._clip_grad_norm), - ) - self._optimizer.step() - self._optimizer.zero_grad(set_to_none=True) + # Get total number of tokens across all ranks to normalize gradients + torch.distributed.all_reduce(num_tokens) + # This will ensure that the logged loss matches what we're optimizing + torch.distributed.all_reduce(running_loss) + # Manually scale the gradients from unnormalized loss by total # of tokens + training.scale_grads(self._model, 1 / num_tokens) + + self._optimizer.step() + self._optimizer.zero_grad(set_to_none=True) # Update the number of steps when the weights are updated self.global_step += 1 @@ -881,22 +726,15 @@ def train(self) -> None: time_per_step = time.perf_counter() - t0 log_dict = { "loss": loss_to_log, - "lr": get_lr( - ( - self._optimizer - if not self._optimizer_in_bwd - else self._optim_ckpt_wrapper - ), + "lr": self._optimizer.param_groups[0]["lr"], + "tokens_per_second_per_gpu": ( + num_tokens / time_per_step * world_size ), - "tokens_per_second_per_gpu": num_tokens - / (time_per_step * world_size), } if self._log_peak_memory_stats: log_dict.update( training.get_memory_stats(device=self._device) ) - if self._clip_grad_norm is not None: - 
log_dict.update({"grad_norm": grad_norm}) self._metric_logger.log_dict( log_dict, step=self.global_step, @@ -946,7 +784,7 @@ def recipe_main(cfg: DictConfig) -> None: """ if not training.is_distributed(): raise RuntimeError( - "Distributed finetune recipe should be run via a distributed launcher." + "Distributed QAT recipe should be run via a distributed launcher." "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]" ) init_process_group(backend="gloo" if cfg.device == "cpu" else "nccl") diff --git a/tests/recipes/test_eleuther_eval.py b/tests/recipes/test_eleuther_eval.py index 68ffca0f82..ec71fe3312 100644 --- a/tests/recipes/test_eleuther_eval.py +++ b/tests/recipes/test_eleuther_eval.py @@ -19,10 +19,18 @@ write_hf_ckpt_config, write_hf_vision_ckpt_config, ) -from tests.test_utils import CKPT_MODEL_PATHS, gpu_test +from tests.test_utils import CKPT_MODEL_PATHS class TestEleutherEval: + @pytest.mark.parametrize( + "eval_name, expected_acc, bsz", + [ + ("truthfulqa_gen", 0.1, 4), + ("truthfulqa_gen", 0.1, 1), + ("truthfulqa_mc2", 0.4, 4), + ], + ) @pytest.fixture def hide_correct_version_number(self, monkeypatch): import importlib.metadata @@ -39,22 +47,14 @@ def mocked_import(name, *args, **kwargs): @pytest.fixture def expected_vision_acc(self): return { - "Science": 0.35, - "Biology": 0.25, - "Chemistry": 0.25, - "Geography": 0.5, - "Math": 0.0, - "Physics": 0.75, + "Science": 0.2, + "Biology": 0.4, + "Chemistry": 0.0, + "Geography": 0.2, + "Math": 0.4, + "Physics": 0.0, } - @pytest.mark.parametrize( - "eval_name, expected_acc, bsz", - [ - ("truthfulqa_gen", 0.1, 4), - ("truthfulqa_gen", 0.1, 1), - ("truthfulqa_mc2", 0.4, 4), - ], - ) @pytest.mark.integration_test def test_torchtune_checkpoint_eval_results( self, caplog, monkeypatch, tmpdir, eval_name, expected_acc, bsz @@ -212,7 +212,6 @@ def test_eval_recipe_errors_with_qat_quantizer(self, monkeypatch, tmpdir): runpy.run_path(TUNE_PATH, run_name="__main__") @pytest.mark.integration_test - @gpu_test(gpu_count=1) def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -231,9 +230,9 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc checkpointer.model_type=LLAMA3_VISION \ tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ tokenizer.prompt_template=null \ - limit=4 \ + limit=5 \ dtype=bf16 \ - device=cuda \ + device=cpu \ """.split() model_config = llama3_2_vision_test_config() @@ -252,7 +251,6 @@ def test_meta_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc assert math.isclose(float(accuracy), expected_vision_acc[task_name]) @pytest.mark.integration_test - @gpu_test(gpu_count=1) def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): ckpt = "llama3_2_vision_hf" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) @@ -274,9 +272,9 @@ def test_hf_eval_vision(self, caplog, monkeypatch, tmpdir, expected_vision_acc): checkpointer.model_type=LLAMA3_VISION \ tokenizer.path=/tmp/test-artifacts/tokenizer_llama3.model \ tokenizer.prompt_template=null \ - limit=4 \ + limit=5 \ dtype=bf16 \ - device=cuda \ + device=cpu \ """.split() model_config = llama3_2_vision_test_config() diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index c40e89184b..cdb1d45f01 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -109,9 +109,6 @@ class Recipe: Config(name="mistral/7B_full", 
file_path="mistral/7B_full.yaml"), Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"), Config(name="gemma/7B_full", file_path="gemma/7B_full.yaml"), - Config(name="gemma2/2B_full", file_path="gemma2/2B_full.yaml"), - Config(name="gemma2/9B_full", file_path="gemma2/9B_full.yaml"), - Config(name="gemma2/27B_full", file_path="gemma2/27B_full.yaml"), Config(name="phi3/mini_full", file_path="phi3/mini_full.yaml"), Config(name="qwen2/7B_full", file_path="qwen2/7B_full.yaml"), Config(name="qwen2/0.5B_full", file_path="qwen2/0.5B_full.yaml"), @@ -219,30 +216,6 @@ class Recipe: name="gemma/7B_qlora_single_device", file_path="gemma/7B_qlora_single_device.yaml", ), - Config( - name="gemma2/2B_lora_single_device", - file_path="gemma2/2B_lora_single_device.yaml", - ), - Config( - name="gemma2/2B_qlora_single_device", - file_path="gemma2/2B_qlora_single_device.yaml", - ), - Config( - name="gemma2/9B_lora_single_device", - file_path="gemma2/9B_lora_single_device.yaml", - ), - Config( - name="gemma2/9B_qlora_single_device", - file_path="gemma2/9B_qlora_single_device.yaml", - ), - Config( - name="gemma2/27B_lora_single_device", - file_path="gemma2/27B_lora_single_device.yaml", - ), - Config( - name="gemma2/27B_qlora_single_device", - file_path="gemma2/27B_qlora_single_device.yaml", - ), Config( name="phi3/mini_lora_single_device", file_path="phi3/mini_lora_single_device.yaml", @@ -356,9 +329,6 @@ class Recipe: Config(name="mistral/7B_lora", file_path="mistral/7B_lora.yaml"), Config(name="gemma/2B_lora", file_path="gemma/2B_lora.yaml"), Config(name="gemma/7B_lora", file_path="gemma/7B_lora.yaml"), - Config(name="gemma2/2B_lora", file_path="gemma2/2B_lora.yaml"), - Config(name="gemma2/9B_lora", file_path="gemma2/9B_lora.yaml"), - Config(name="gemma2/27B_lora", file_path="gemma2/27B_lora.yaml"), Config(name="phi3/mini_lora", file_path="phi3/mini_lora.yaml"), Config(name="qwen2/7B_lora", file_path="qwen2/7B_lora.yaml"), Config(name="qwen2/0.5B_lora", file_path="qwen2/0.5B_lora.yaml"), diff --git a/torchtune/generation/_generation.py b/torchtune/generation/_generation.py index bb4b1ff0b0..c2d60a7373 100644 --- a/torchtune/generation/_generation.py +++ b/torchtune/generation/_generation.py @@ -67,7 +67,7 @@ def generate_next_token( model: TransformerDecoder, input_pos: torch.Tensor, x: torch.Tensor, - q: Optional[torch.Tensor] = None, + q: torch.Tensor, *, mask: Optional[torch.Tensor] = None, temperature: float = 1.0, @@ -82,7 +82,7 @@ def generate_next_token( with shape [bsz x seq_length]. x (torch.Tensor): tensor with the token IDs associated with the given prompt, with shape [bsz x seq_length]. - q (Optional[torch.Tensor]): randomly sampled tensor for softmax sampling trick. + q (torch.Tensor): randomly sampled tensor for softmax sampling trick. See https://github.com/pytorch-labs/gpt-fast/blob/32971d3129541c5bfb4f715abc33d1c5f408d204/generate.py#L40 mask (Optional[torch.Tensor]): attention mask with shape [bsz x seq_length x seq_length], default None. 
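The q argument documented above feeds the exponential-sampling ("softmax sampling") trick from gpt-fast: draw q ~ Exponential(1) once, then take argmax(probs / q), which is equivalent to sampling from the categorical distribution given by probs while keeping the random draw (optionally from a seeded generator) outside the decode step. A hedged, standalone sketch; the Optional/rng handling in torchtune's generate is richer than this:

from typing import Optional

import torch


def sample_next_token(
    logits: torch.Tensor,  # [bsz, vocab_size]
    temperature: float = 1.0,
    rng: Optional[torch.Generator] = None,
) -> torch.Tensor:
    probs = torch.softmax(logits / temperature, dim=-1)
    # q ~ Exponential(1); argmax(probs / q) is a draw from Categorical(probs)
    # (the exponential-races form of the Gumbel-max trick).
    q = torch.empty_like(probs).exponential_(1, generator=rng)
    return torch.argmax(probs / q, dim=-1, keepdim=True)


g = torch.Generator().manual_seed(0)
print(sample_next_token(torch.randn(2, 10), rng=g))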
@@ -302,11 +302,9 @@ def generate( # tensors are of identical shape to the prompt curr_masks = masks[:, :prompt_length, :prompt_length] - q = None - if rng is not None: - q = torch.empty( - (bsz, model.tok_embeddings.num_embeddings), device=prompt.device - ).exponential_(1, generator=rng) + q = torch.empty( + (bsz, model.tok_embeddings.num_embeddings), device=prompt.device + ).exponential_(1, generator=rng) tokens, generated_logits = generate_next_token( model, input_pos=input_pos[:, :prompt_length].squeeze(), @@ -362,11 +360,9 @@ def generate( curr_input_pos = input_pos[:, : curr_pos + 1] curr_masks = masks[:, : curr_pos + 1, : curr_pos + 1] - q = None - if rng is not None: - q = torch.empty( - (bsz, model.tok_embeddings.num_embeddings), device=prompt.device - ).exponential_(1, generator=rng) + q = torch.empty( + (bsz, model.tok_embeddings.num_embeddings), device=prompt.device + ).exponential_(1, generator=rng) tokens, logits = custom_generate_next_token( model, input_pos=curr_input_pos, diff --git a/torchtune/models/gemma/__init__.py b/torchtune/models/gemma/__init__.py index f762de86b6..48e4e84b10 100644 --- a/torchtune/models/gemma/__init__.py +++ b/torchtune/models/gemma/__init__.py @@ -27,4 +27,6 @@ "lora_gemma_7b", "qlora_gemma_2b", "qlora_gemma_7b", + "gemma_hf_to_tune", + "gemma_tune_to_hf", ] diff --git a/torchtune/models/gemma/_component_builders.py b/torchtune/models/gemma/_component_builders.py index ba5b666c98..e7ab9b224c 100644 --- a/torchtune/models/gemma/_component_builders.py +++ b/torchtune/models/gemma/_component_builders.py @@ -46,6 +46,7 @@ def gemma( attn_dropout: float = 0.0, norm_eps: float = 1e-6, rope_base: int = 10_000, + norm_embeddings: bool = True, ) -> TransformerDecoder: """ Build the decoder associated with the gemma model. This includes: @@ -71,6 +72,8 @@ def gemma( Default: 0.0 norm_eps (float): epsilon in RMS norms Default: 1e-6 rope_base (int): base for the rotary positional embeddings. Default: 10_000 + norm_embeddings (bool): whether to apply layer norm before the self-attention + and mlp layers. Default: True Returns: TransformerDecoder: Instantiation of gemma model. @@ -143,6 +146,7 @@ def lora_gemma( attn_dropout: float = 0.0, norm_eps: float = 1e-6, rope_base: int = 10_000, + norm_embeddings: bool = True, # LoRA args lora_rank: int, lora_alpha: float, @@ -173,6 +177,8 @@ def lora_gemma( Default: 0.0 norm_eps (float): epsilon in RMS norms Default: 1e-6 rope_base (int): base for the rotary positional embeddings. Default: 10_000 + norm_embeddings (bool): whether to apply layer norm before the self-attention + and mlp layers. Default: True lora_rank (int): rank of each low-rank approximation lora_alpha (float): scaling factor for the low-rank approximation lora_dropout (float): LoRA dropout probability. Default: 0.0 diff --git a/torchtune/models/gemma2/__init__.py b/torchtune/models/gemma2/__init__.py deleted file mode 100644 index 9fe11db7ab..0000000000 --- a/torchtune/models/gemma2/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -from ..gemma._model_builders import gemma_tokenizer -from ..gemma._tokenizer import GemmaTokenizer # noqa -from ._component_builders import gemma2, lora_gemma2 # noqa -from ._model_builders import ( # noqa - gemma2_27b, - gemma2_2b, - gemma2_9b, - lora_gemma2_27b, - lora_gemma2_2b, - lora_gemma2_9b, - qlora_gemma2_27b, - qlora_gemma2_2b, - qlora_gemma2_9b, -) - -__all__ = [ - "GemmaTokenizer", - "gemma2", - "gemma2_2b", - "gemma2_9b", - "gemma2_27b", - "gemma_tokenizer", - "lora_gemma2", - "lora_gemma2_2b", - "lora_gemma2_9b", - "lora_gemma2_27b", - "qlora_gemma2_2b", - "qlora_gemma2_9b", - "qlora_gemma2_27b", -] diff --git a/torchtune/models/gemma2/_attention.py b/torchtune/models/gemma2/_attention.py deleted file mode 100644 index b00612d032..0000000000 --- a/torchtune/models/gemma2/_attention.py +++ /dev/null @@ -1,339 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -import logging -from typing import Optional - -import torch -import torch.nn.functional as F -from torch import nn -from torchtune.modules.attention_utils import _MaskType -from torchtune.modules.kv_cache import KVCache - -logger = logging.getLogger(__name__) - - -class Gemma2Attention(nn.Module): - """ - Adapated from official Google Pytorch Implementation: - https://github.com/google/gemma_pytorch/blob/80881c2e6e797ef1913a4a705d4b40394791cc58/gemma/model.py#L213 - to match torchtune style. - A new attention had to be added since nn.functional.scaled_dot_product_attention does allow soft capping - Args: - embed_dim (int): embedding dimension for the model - num_heads (int): number of query heads. For MHA this is also the - number of heads for key and value - num_kv_heads (int): number of key and value heads. User should ensure - ``num_heads % num_kv_heads == 0``. For standard MHA set ``num_kv_heads == num_heads``, - for GQA ``num_kv_heads < num_heads``, and for MQA set ``num_kv_heads == 1``. - head_dim (int): dimension of each head, calculated by ``embed_dim // num_heads``. - q_proj (nn.Module): projection layer for query. - k_proj (nn.Module): projection layer for key. - v_proj (nn.Module): projection layer for value. - output_proj (nn.Module): projection layer for output. - pos_embeddings (Optional[nn.Module]): positional embeddings layer, e.g. RotaryPositionalEmbeddings. - q_norm (Optional[nn.Module]): normalization layer for query, e.g. RMSNorm. For decoding, this is applied - before updating from kv_cache. This means it will only support token wide normalization and not - batch or sequence wide normalization. - k_norm (Optional[nn.Module]): normalization layer for key, must be set if q_norm is. - kv_cache (Optional[KVCache]): KVCache object used to cache key and value - max_seq_len (int): maximum sequence length supported by the model. - This is needed to compute the RoPE Cache. Default: 4096. - is_causal (bool): sets the default mask to causal when no mask is provided - attn_dropout (float): dropout value passed onto the - scaled_dot_product_attention function. This argument is ignored if the - self.training is False. Default value is 0.0. 
- sliding_window_size (Optional[int]): size of the sliding window if None no sliding window is applied - softcapping (Optional[float]): capping value used for soft caping, if None no capping is performed - query_pre_attn_scalar (Optional[int]): value used for pre attention normalisation, if None head_dim is used instead - Raises: - ValueError: If ``num_heads % num_kv_heads != 0`` - ValueError: If ``embed_dim % num_heads != 0`` - ValueError: If ``attn_dropout < 0`` or ``attn_dropout > 1`` - ValueError: if q_norm is defined without k_norm or vice versa - """ - - def __init__( - self, - *, - embed_dim: int, - num_heads: int, - num_kv_heads: int, - head_dim: int, - q_proj: nn.Module, - k_proj: nn.Module, - v_proj: nn.Module, - output_proj: nn.Module, - pos_embeddings: Optional[nn.Module] = None, - q_norm: Optional[nn.Module] = None, - k_norm: Optional[nn.Module] = None, - kv_cache: Optional[KVCache] = None, - max_seq_len: int = 4096, - is_causal: bool = True, - attn_dropout: float = 0.0, - sliding_window_size: Optional[int] = None, - softcapping: Optional[float] = 50.0, - query_pre_attn_scalar: Optional[int] = None, - ) -> None: - super().__init__() - if num_heads % num_kv_heads != 0: - raise ValueError( - f"num_heads ({num_heads}) must be divisible by " - f"num_kv_heads ({num_kv_heads})" - ) - - if embed_dim % num_heads != 0: - raise ValueError( - f"embed_dim ({embed_dim}) must be divisible by " - f"num_heads ({num_heads})" - ) - - if attn_dropout < 0 or attn_dropout > 1: - raise ValueError(f"attn_dropout ({embed_dim}) must be between 0.0 and 1.0") - - if bool(q_norm) ^ bool(k_norm): - raise ValueError("q and k norm must be set together") - - # Set attributes - self.num_heads = num_heads - self.num_kv_heads = num_kv_heads - self.embed_dim = embed_dim - self.attn_dropout = attn_dropout - self.head_dim = head_dim - self.max_seq_len = max_seq_len - self.is_causal = is_causal - - # Set layers - self.kv_cache = kv_cache - self.q_proj = q_proj - self.k_proj = k_proj - self.v_proj = v_proj - self.output_proj = output_proj - self.q_norm = q_norm - self.k_norm = k_norm - self.pos_embeddings = pos_embeddings - - # gemma related parameters - self.sliding_window_size = sliding_window_size - self.softcapping = softcapping - if query_pre_attn_scalar is not None: - self.scaling = query_pre_attn_scalar**-0.5 - else: - self.scaling = self.head_dim**-0.5 - - # this flag indicates whether to update the kv-cache during forward - # passes. when disabled, we can have the cache setup but still - # perform normal forward passes - self.cache_enabled = False - - def setup_cache( - self, batch_size: int, dtype: torch.dtype, max_seq_len: int - ) -> None: - """Setup key value caches for attention calculation. If called - after kv_cache is already setup, this will be skipped. - - Args: - batch_size (int): batch size for the caches. - dtype (torch.dtype): dtype for the caches. - max_seq_len (int): maximum sequence length model will be run with. - """ - # Don't overwrite user defined kv_cache from init - if self.kv_cache is not None: - logger.warning( - "Key value caches are already setup. You cannot call ``setup_caches()`` twice. Skipping." - ) - else: - self.kv_cache = KVCache( - batch_size=batch_size, - max_seq_len=max_seq_len, - num_heads=self.num_heads, - head_dim=self.head_dim, - dtype=dtype, - ) - self.cache_enabled = True - - def reset_cache(self): - """Reset the key value caches.""" - if self.kv_cache is None: - raise RuntimeError( - "Key value caches are not setup. Call ``setup_caches()`` first." 
- ) - self.kv_cache.reset() - - def forward( - self, - x: torch.Tensor, - y: Optional[torch.Tensor] = None, - *, - mask: Optional[_MaskType] = None, - input_pos: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - """ - Args: - x (torch.Tensor): input tensor with shape [b x s_x x d] for the query - y (Optional[torch.Tensor]): second input tensor with shape [b x s_y x d], is the input - for k and v. For self attention, x=y. Optional only with kv_cache enabled. - mask (Optional[_MaskType]): Used to mask the scores after the query-key multiplication - and before the softmax. Either: - - A boolean tensor with shape ``[b x s x s]``, ``[b x s x self.encoder_max_cache_seq_len]``, - or ``[b x s x self.encoder_max_cache_seq_len]`` if using KV-cacheing with encoder/decoder layers. - A value of True in row ``i`` and column ``j`` means token ``i`` attends to token ``j``. A value of False means - token ``i`` does not attend to token ``j``. If no mask is specified, a causal mask - is used by default. - - A :class:`~torch.nn.attention.flex_attention.BlockMask` for document masking in a packed sequence - created via `create_block_mask `_. We use - :func:`~torch.nn.attention.flex_attention.flex_attention` when computing attention with block masks. - Default is None. - input_pos (Optional[torch.Tensor]): Optional tensor which contains the position ids - of each token. During training, this is used to indicate the positions - of each token relative to its sample when packed, shape [b x s]. - During inference, this indicates the position of the current token. - If none, assume the index of the token is its position id. Default is None. - - Raises: - ValueError: If no ``y`` input and ``kv_cache`` is not enabled. - - Returns: - torch.Tensor: output tensor with attention applied - - Notation used for tensor shapes: - - b: batch size - - s_x: sequence length for x - - s_y: sequence length for y - - n_h: num heads - - n_kv: num kv heads - - d: embed dim - - h_d: head dim - """ - # until flex attention implementation exists, we do not accept block masks - if (mask is not None) and (type(mask) != torch.Tensor()): - raise NotImplementedError( - "Block masks are not implemeted yet, use packed=False" - ) - - # x has shape [b, s_x, d] - # y has shape [b, s_y, d] - b, s_x, _ = x.shape - s_y = y.shape[1] if y is not None else 0 - - # q has shape [b, s_x, num_heads * head_dim] - q = self.q_proj(x) - - # number of queries per key/value - q_per_kv = self.num_heads // self.num_kv_heads - q = q.view(b, s_x, self.num_kv_heads * q_per_kv, self.head_dim) - - # Apply positional embeddings - if self.pos_embeddings is not None: - q = self.pos_embeddings(q, input_pos=input_pos) - - # [b, n_h, s_x, h_d] - q = q.transpose(1, 2) - - # Normalize q - if self.q_norm is not None: - q = self.q_norm(q) - - if y is None: - if self.kv_cache is None: - raise ValueError( - "Must provide y input or use kv_cache to enable streaming decoding" - ) - k = self.kv_cache.k_cache - v = self.kv_cache.v_cache - else: - # Update k and v shape, positional embeddings, and normalization - - # k has shape [b, s_y, num_kv_heads * head_dim] - # v has shape [b, s_y, num_kv_heads * head_dim] - k = self.k_proj(y) - v = self.v_proj(y) - - # Apply positional embeddings - # k: [b, s_y, n_kv, h_d] - k = k.view(b, s_y, -1, self.head_dim) - if self.pos_embeddings is not None: - k = self.pos_embeddings(k, input_pos=input_pos) - - # View + expand + reshape bring num_kv_heads to num_heads for k and v - # to match q. 
- - # k: [b, s_y, n_kv, 1, h_d] - # v: [b, s_y, n_kv, 1, h_d] - k = k.view(b, s_y, self.num_kv_heads, 1, self.head_dim) - v = v.view(b, s_y, self.num_kv_heads, 1, self.head_dim) - - # If needed, expand the key and value tensors to have the same shape - # as the query tensor by copying values across the relevant dim - if self.num_heads != self.num_kv_heads: - k = k.expand(b, s_y, self.num_kv_heads, q_per_kv, self.head_dim) - v = v.expand(b, s_y, self.num_kv_heads, q_per_kv, self.head_dim) - - # [b, s, n_h, h_d] - k = k.reshape(b, s_y, -1, self.head_dim) - v = v.reshape(b, s_y, -1, self.head_dim) - - # [b, n_h, s, h_d] - k = k.transpose(1, 2) - v = v.transpose(1, 2) - - # Normalize k - if self.k_norm is not None: - k = self.k_norm(k) - - # Update key-value cache - if self.kv_cache is not None and self.cache_enabled: - k, v = self.kv_cache.update(k, v) - - q.mul_(self.scaling) - output = torch.matmul( - q, k.transpose(2, 3) - ) # [batch_size, n_local_heads, input_len, head_dim] - - # if mask is None: default to causal mask - if mask is None: - mask = torch.tril( - torch.ones( - size=(s_x, s_x), - dtype=torch.bool, - ).to(x.device) - ) - - # update masks bias to be 0 for visible tokens and -2.3819763e38 otherwise - # this is similar to what torch sdpa is doing: - # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html - if mask.dtype == torch.bool: - mask = torch.where(mask.logical_not(), -2.3819763e38, 0) - - if self.sliding_window_size is not None: - all_ones = torch.ones_like(mask) - - sliding_mask = torch.triu( - all_ones, -1 * self.sliding_window_size + 1 - ) * torch.tril(all_ones, self.sliding_window_size - 1) - mask = torch.where(sliding_mask == 1, mask, -2.3819763e38) - - if mask.dim() == 3: - # This is the case for block masks where attention is different per sample - # we want mask to be broadcastable with output so we aim for (bs, 1, s_x, s_y) - mask = mask.unsqueeze(1) - - if self.softcapping is not None: - output = output / self.softcapping - output = torch.tanh(output) - output = output * self.softcapping - - output = output + mask - output = F.softmax(output.float(), dim=-1).type_as(q) - - # [batch_size, n_local_heads, input_len, head_dim] - output = torch.matmul(output, v) - - # reshape the output to be the same shape as the input - output = output.transpose(1, 2).contiguous().view(b, s_x, -1) - return self.output_proj(output) diff --git a/torchtune/models/gemma2/_component_builders.py b/torchtune/models/gemma2/_component_builders.py deleted file mode 100644 index 0ddef36857..0000000000 --- a/torchtune/models/gemma2/_component_builders.py +++ /dev/null @@ -1,413 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
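
Two details of the removed `Gemma2Attention.forward` are easy to miss: attention scores are soft-capped with `tanh(scores / cap) * cap` before the additive mask is applied, and on alternating layers a sliding-window band is intersected with the causal mask using the same large negative fill value. A small self-contained sketch of both follows; the sequence length and the window of 3 are toy values chosen only for illustration, while the cap of 50 and the fill constant come from the code above.

import torch

def softcap(scores: torch.Tensor, cap: float) -> torch.Tensor:
    # Bound pre-softmax attention scores to (-cap, cap), roughly linear near zero.
    return torch.tanh(scores / cap) * cap

def sliding_window_causal_mask(seq_len: int, window: int) -> torch.Tensor:
    # Additive mask: 0 where position j is one of the `window` most recent
    # positions <= i, and a large negative value everywhere else.
    ones = torch.ones(seq_len, seq_len, dtype=torch.bool)
    allowed = torch.tril(ones) & torch.triu(ones, diagonal=-(window - 1))
    mask = torch.zeros(seq_len, seq_len)
    return mask.masked_fill(~allowed, -2.3819763e38)

scores = 100.0 * torch.randn(1, 1, 6, 6)  # [batch, heads, seq, seq]
attn_in = softcap(scores, cap=50.0) + sliding_window_causal_mask(6, window=3)
attn = torch.softmax(attn_in.float(), dim=-1)
print(attn[0, 0])  # each row attends to at most the 3 most recent tokens

Note the ordering: capping happens before the mask is added so that the large negative mask values are not squashed by the tanh.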
- -from torch import nn -import torch -from typing import List -from torchtune.modules.common_utils import _register_reparametrize_state_dict_hooks -from typing import List, Optional - -from torchtune.modules import ( - FrozenNF4Linear, - RotaryPositionalEmbeddings, - TransformerSelfAttentionLayer, -) - -from torchtune.models.gemma2._attention import Gemma2Attention -from torchtune.models.gemma.rms_norm import GemmaRMSNorm -from torchtune.modules import TransformerDecoder, TiedLinear -from torchtune.models.gemma.gemma_norm_embedding import GemmaNormEmbeddings -from torchtune.modules.peft import DoRALinear, LORA_ATTN_MODULES, LoRALinear -from torchtune.models.gemma._component_builders import gemma_mlp, lora_gemma_mlp - -""" -Component builders for the Gemma2 2B, 9B models and popular variants such as LoRA. - -torchtune provides composable building blocks. Builder functions help -stitch these building blocks into higher-level components. This design has -two benefits: -- The building blocks themselves are very flexible. For example, ``MultiHeadAttention`` -can take either nn.Linear or nn.LoRALinear for ``q_proj``. -- Builder functions expose a set of configurable params which keep the constructors of -the building blocks simple. -""" - -class TanhSoftCapping(nn.Module): - def __init__( - self, - capping_value: float, - ) -> None: - super().__init__() - self.capping_value = capping_value - - def forward(self, attn_weights): - attn_weights = attn_weights / self.capping_value - attn_weights = torch.tanh(attn_weights) - attn_weights = attn_weights * self.capping_value - return attn_weights - -class Gemma2FinalNorm(nn.Module): - """ - Combines RMSNorm and SoftCapping - """ - def __init__( - self, - capping_value: float, - embed_dim: int, - eps: float - ) -> None: - super().__init__() - self.capping_value = capping_value - self.rms_norm = GemmaRMSNorm(embed_dim, eps=eps) - self.logit_capping = TanhSoftCapping(capping_value) - - def forward(self, x): - x = self.rms_norm(x) - x = self.logit_capping(x) - return x - - -def gemma2( - vocab_size: int, - num_layers: int, - num_heads: int, - head_dim: int, - num_kv_heads: int, - embed_dim: int, - intermediate_dim: int, - max_seq_len: int, - attn_dropout: float = 0.0, - norm_eps: float = 1e-6, - rope_base: int = 10_000, - hidden_capping_value: float = 50., - final_capping_value: float = 30., - sliding_window_size: int = 4096, - query_pre_attn_scalar: Optional[int] = None, -) -> TransformerDecoder: - """ - Build the decoder associated with the gemma2 model. This includes: - - Token embeddings - - num_layers number of TransformerSelfAttentionLayer blocks - - RMS Norm layer applied to the output of the transformer - - Final projection into token space - - - Args: - vocab_size (int): number of tokens in vocabulary. - num_layers (int): number of layers in the transformer decoder. - num_heads (int): number of query heads. For MHA this is also the - number of heads for key and value - head_dim (int): dimension of head - num_kv_heads (int): number of key and value heads. - embed_dim (int): embedding dimension for self-attention - intermediate_dim (int): intermediate dimension for MLP - max_seq_len (int): maximum sequence length the model will be run with, - attn_dropout (float): dropout value passed onto scaled_dot_product_attention. - Default: 0.0 - norm_eps (float): epsilon in RMS norms Default: 1e-6 - rope_base (int): base for the rotary positional embeddings. Default: 10_000 - - Returns: - TransformerDecoder: Instantiation of gemma model. 
- """ - rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) - - layers = torch.nn.ModuleList() - - for layer_idx in range(num_layers): - - mlp = gemma_mlp(dim=embed_dim, hidden_dim=intermediate_dim) - - self_att = Gemma2Attention( - embed_dim=embed_dim, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - head_dim=head_dim, - q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=False), - k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), - v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), - output_proj=nn.Linear(num_heads * head_dim, embed_dim, bias=False), - pos_embeddings=rope, - kv_cache=None, - max_seq_len=max_seq_len, - attn_dropout=attn_dropout, - # perform sliding window on half of the layers only - sliding_window_size=sliding_window_size if (layer_idx % 2)==0 else None, - softcapping=hidden_capping_value, - query_pre_attn_scalar=query_pre_attn_scalar - ) - - layer = TransformerSelfAttentionLayer( - attn=self_att, - mlp=mlp, - sa_norm=GemmaRMSNorm(embed_dim, eps=norm_eps), - mlp_norm=GemmaRMSNorm(embed_dim, eps=norm_eps), - sa_scale=GemmaRMSNorm(embed_dim, eps=norm_eps), - mlp_scale=GemmaRMSNorm(embed_dim, eps=norm_eps), - ) - layers.append(layer) - tok_embeddings = GemmaNormEmbeddings(vocab_size, embed_dim) - output_proj = TiedLinear(tok_embeddings) - model = TransformerDecoder( - tok_embeddings=tok_embeddings, - layers=layers, - max_seq_len=max_seq_len, - num_heads=num_heads, - output=output_proj, - head_dim=head_dim, - norm=Gemma2FinalNorm(final_capping_value, embed_dim, eps=norm_eps), - ) - return model - - - -def lora_gemma2( - lora_attn_modules: List[LORA_ATTN_MODULES], - apply_lora_to_mlp: bool = False, - *, - # gemma args - vocab_size: int, - num_layers: int, - num_heads: int, - head_dim: int, - num_kv_heads: int, - embed_dim: int, - intermediate_dim: int, - max_seq_len: int, - attn_dropout: float = 0.0, - norm_eps: float = 1e-6, - rope_base: int = 10_000, - hidden_capping_value: float = 50., - final_capping_value: float = 30., - sliding_window_size: int = 4096, - query_pre_attn_scalar: Optional[int] = None, - # LoRA args - lora_rank: int, - lora_alpha: float, - lora_dropout: float = 0.0, - use_dora: bool = False, - quantize_base: bool = False, -) -> TransformerDecoder: - """ - Return a version of Gemma with LoRA applied based on the passed in configuration. - Note: output projection lora is not supported because it is tied to token embeddings - - Args: - lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers - LoRA should be applied to in each self-attention block. Options are - ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. - apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. - Default: False - vocab_size (int): number of tokens in vocabulary. - num_layers (int): number of layers in the transformer decoder. - num_heads (int): number of query heads. For MHA this is also the - number of heads for key and value - head_dim (int): dimension of head - num_kv_heads (int): number of key and value heads. - embed_dim (int): embedding dimension for self-attention - intermediate_dim (int): intermediate dimension for MLP - max_seq_len (int): maximum sequence length the model will be run with, - attn_dropout (float): dropout value passed onto scaled_dot_product_attention. - Default: 0.0 - norm_eps (float): epsilon in RMS norms Default: 1e-6 - rope_base (int): base for the rotary positional embeddings. 
Default: 10_000 - lora_rank (int): rank of each low-rank approximation - lora_alpha (float): scaling factor for the low-rank approximation - lora_dropout (float): LoRA dropout probability. Default: 0.0 - use_dora (bool): Decompose the LoRA weight into magnitude and direction, as - introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353). - quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base - weights within linear layers LoRA is applied to. The final output linear projection is not - supported for quantization currently. - - Returns: - TransformerDecoder: Instantiation of Gemma model with LoRA applied to - a subset of the attention projections in each layer. - """ - - tok_embeddings = GemmaNormEmbeddings(vocab_size, embed_dim) - output_proj = TiedLinear(tok_embeddings) - - layers = torch.nn.ModuleList() - - for layer_idx in range(num_layers): - if apply_lora_to_mlp: - mlp = lora_gemma_mlp( - dim=embed_dim, - hidden_dim=intermediate_dim, - lora_rank=lora_rank, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - use_dora=use_dora, - quantize_base=quantize_base, - ) - else: - mlp = gemma_mlp(dim=embed_dim, hidden_dim=intermediate_dim, quantize_base=quantize_base) - self_att = lora_gemma2_self_attention( - lora_modules=lora_attn_modules, - embed_dim=embed_dim, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - head_dim=head_dim, - rope_base=rope_base, - max_seq_len=max_seq_len, - attn_dropout=attn_dropout, - # perform sliding window on half of the layers only - sliding_window_size=sliding_window_size if (layer_idx % 2)==0 else None, - softcapping=hidden_capping_value, - query_pre_attn_scalar=query_pre_attn_scalar, - lora_rank=lora_rank, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - use_dora = use_dora, - quantize_base = quantize_base, - ) - - layer = TransformerSelfAttentionLayer( - attn=self_att, - mlp=mlp, - sa_norm=GemmaRMSNorm(embed_dim, eps=norm_eps), - mlp_norm=GemmaRMSNorm(embed_dim, eps=norm_eps), - sa_scale=GemmaRMSNorm(embed_dim, eps=norm_eps), - mlp_scale=GemmaRMSNorm(embed_dim, eps=norm_eps), - ) - layers.append(layer) - - model = TransformerDecoder( - tok_embeddings=tok_embeddings, - layers=layers, - max_seq_len=max_seq_len, - num_heads=num_heads, - output=output_proj, - head_dim=head_dim, - norm=Gemma2FinalNorm(final_capping_value, embed_dim, eps=norm_eps) - ) - - if quantize_base: - # For QLoRA, we reparametrize 4-bit tensors to higher precision, and offload to CPU on the fly - # so as to not increase peak memory - # TODO this is clowny, figure out a better way to get what precision the rest - # of the model is in - _register_reparametrize_state_dict_hooks(model, dtype=tok_embeddings.weight.dtype) - - return model - - -def lora_gemma2_self_attention( - lora_modules: List[LORA_ATTN_MODULES], - *, - # MultiHeadAttention args - embed_dim: int, - num_heads: int, - head_dim: int, - num_kv_heads: int, - max_seq_len: int, - attn_dropout: float = 0.0, - rope_base: int = 10_000, - sliding_window_size: Optional[int] = None, - softcapping: Optional[float] = 50., - query_pre_attn_scalar: Optional[int], - # LoRA args - lora_rank: int, - lora_alpha: float, - lora_dropout: float = 0.0, - use_dora: bool = False, - quantize_base: bool = False, - -) -> Gemma2Attention: - if not lora_modules: - raise ValueError( - f"Must pass one or more of {LORA_ATTN_MODULES} as lora_modules" - ) - - num_kv_heads = num_kv_heads if num_kv_heads else num_heads - adapter_cls = DoRALinear if use_dora else LoRALinear - - 
q_proj = ( - adapter_cls( - embed_dim, - num_heads * head_dim, - rank=lora_rank, - alpha=lora_alpha, - dropout=lora_dropout, - quantize_base=quantize_base, - ) - if "q_proj" in lora_modules - else ( - nn.Linear(embed_dim, num_heads * head_dim, bias=False) - if not quantize_base - else FrozenNF4Linear(embed_dim, num_heads * head_dim, bias=False) - ) - ) - k_proj = ( - adapter_cls( - embed_dim, - num_kv_heads * head_dim, - rank=lora_rank, - alpha=lora_alpha, - dropout=lora_dropout, - quantize_base=quantize_base, - ) - if "k_proj" in lora_modules - else ( - nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) - if not quantize_base - else FrozenNF4Linear(embed_dim, num_kv_heads * head_dim, bias=False) - ) - ) - v_proj = ( - adapter_cls( - embed_dim, - num_kv_heads * head_dim, - rank=lora_rank, - alpha=lora_alpha, - dropout=lora_dropout, - quantize_base=quantize_base, - ) - if "v_proj" in lora_modules - else ( - nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) - if not quantize_base - else FrozenNF4Linear(embed_dim, num_kv_heads * head_dim, bias=False) - ) - ) - output_proj = ( - adapter_cls( - num_heads * head_dim, - embed_dim, - rank=lora_rank, - alpha=lora_alpha, - dropout=lora_dropout, - quantize_base=quantize_base, - ) - if "output_proj" in lora_modules - else ( - nn.Linear(num_heads * head_dim, embed_dim, bias=False) - if not quantize_base - else FrozenNF4Linear(num_heads * head_dim, embed_dim, bias=False) - ) - ) - - rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) - - self_att = Gemma2Attention( - embed_dim=embed_dim, - num_heads=num_heads, - num_kv_heads=num_kv_heads, - head_dim=head_dim, - q_proj=q_proj, - k_proj=k_proj, - v_proj=v_proj, - output_proj=output_proj, - pos_embeddings=rope, - kv_cache=None, - max_seq_len=max_seq_len, - attn_dropout=attn_dropout, - sliding_window_size=sliding_window_size, - softcapping=softcapping, - query_pre_attn_scalar=query_pre_attn_scalar - ) - return self_att \ No newline at end of file diff --git a/torchtune/models/gemma2/_convert_weights.py b/torchtune/models/gemma2/_convert_weights.py deleted file mode 100644 index fa4df0e469..0000000000 --- a/torchtune/models/gemma2/_convert_weights.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Dict - -import torch - -from torchtune.models.convert_weights import get_mapped_key - -""" -Gemma 2 and Gemma original implementations share different normalization but with -the same name, so it is mandatory to differentiate their state dict in order to map -correctly the different weights. -They are essentially the same except for "model.layers.{}.post_attention_layernorm.weight" key. 
-See discussion here: https://github.com/pytorch/torchtune/pull/1835#discussion_r1803410251 -""" - -_GEMMA2_FROM_HF = { - "model.embed_tokens.weight": "tok_embeddings.weight", - "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attn.q_proj.weight", - "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attn.k_proj.weight", - "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attn.v_proj.weight", - "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attn.output_proj.weight", - "model.layers.{}.self_attn.rotary_emb.inv_freq": None, - "model.layers.{}.mlp.gate_proj.weight": "layers.{}.mlp.w1.weight", - "model.layers.{}.mlp.up_proj.weight": "layers.{}.mlp.w3.weight", - "model.layers.{}.mlp.down_proj.weight": "layers.{}.mlp.w2.weight", - "model.layers.{}.input_layernorm.weight": "layers.{}.sa_norm.scale", - "model.layers.{}.post_attention_layernorm.weight": "layers.{}.sa_scale.scale", - "model.layers.{}.post_feedforward_layernorm.weight": "layers.{}.mlp_norm.scale", - "model.layers.{}.pre_feedforward_layernorm.weight": "layers.{}.mlp_scale.scale", - "model.norm.weight": "norm.rms_norm.scale", - "lm_head.weight": "output.weight", -} - - -def gemma2_hf_to_tune( - state_dict: Dict[str, torch.Tensor], - num_heads: int = 32, - num_kv_heads: int = 32, - dim: int = 4096, - head_dim: int = None, -) -> Dict[str, torch.Tensor]: - """ - Convert a state dict from HF's format to torchtune's format. State dicts - from multiple checkpoint files should be consolidated into a single state dict - before calling this function. - - Eg of HF-format state dict can be found in the ``meta-llama/Llama-2-7b-hf`` - repo in HF (https://huggingface.co/meta-llama/Llama-2-7b-hf). - - Args: - state_dict (Dict[str, torch.Tensor]): State dict in HF's format. - num_heads (int): Number of heads in the model. - num_kv_heads (int): Number of heads in the key/value projection layers. - dim (int): Dimension of the model. - head_dim (int): Dimension of the head. If not provided, it will be calculated - as dim // num_heads. - - Returns: - Dict[str, torch.Tensor]: State dict in torchtune's format. - """ - converted_state_dict = {} - if head_dim is None: - head_dim = dim // num_heads - - def _permute(t, n_heads): - return ( - t.view(n_heads, 2, head_dim // 2, dim) - .transpose(1, 2) - .reshape((head_dim * n_heads), dim) - ) - - for key, value in state_dict.items(): - if "rotary_emb.inv_freq" not in key: # Skip loading the position embeddings - new_key = get_mapped_key(key, _GEMMA2_FROM_HF) - if "q_proj" in key: - value = _permute(value, num_heads) - elif "k_proj" in key: - value = _permute(value, num_kv_heads) - - converted_state_dict[new_key] = value - return converted_state_dict - - -def gemma2_tune_to_hf( - state_dict: Dict[str, torch.Tensor], - num_heads: int = 32, - num_kv_heads: int = 32, - dim: int = 4096, - head_dim: int = None, -): - """ - Convert a state dict from torchtune's format to HF's format. This function - doesn't handle any sharding or splitting of state dicts. It follows the - state_dict IN -> state_dict OUT pattern. - - Args: - state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format. - num_heads (int): Number of heads in the model. - num_kv_heads (int): Number of heads in the key/value projection layers. - dim (int): Dimension of the model. - head_dim (int): Dimension of model attention heads. Default None. - - Returns: - Dict[str, torch.Tensor]: State dict in HF's format. 
- """ - converted_state_dict = {} - inverted_mapping_dict = {v: k for k, v in _GEMMA2_FROM_HF.items()} - - if head_dim is None: - head_dim = dim // num_heads - - def _permute(t, n_heads): - return ( - t.view(n_heads, head_dim // 2, 2, dim) - .transpose(1, 2) - .reshape((head_dim * n_heads), dim) - ) - - for key, value in state_dict.items(): - new_key = get_mapped_key(key, inverted_mapping_dict) - if "q_proj" in key: - value = _permute(value, num_heads) - elif "k_proj" in key: - value = _permute(value, num_kv_heads) - converted_state_dict[new_key] = value - - return converted_state_dict diff --git a/torchtune/models/gemma2/_model_builders.py b/torchtune/models/gemma2/_model_builders.py deleted file mode 100644 index a07021c518..0000000000 --- a/torchtune/models/gemma2/_model_builders.py +++ /dev/null @@ -1,286 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. -from typing import List - -from torchtune.models.gemma2._component_builders import gemma2, lora_gemma2 -from torchtune.modules import TransformerDecoder - -from torchtune.modules.peft import LORA_ATTN_MODULES -from functools import partial - -""" -Model builders build specific instantiations using component builders. For example -the ``gemma_2b`` model builder uses the ``gemma2`` component builder. -""" - - -def gemma2_2b() -> TransformerDecoder: - """ - Builder for creating a Gemma2 2B model initialized w/ the default 2b parameter values - from: https://github.com/google/gemma_pytorch/blob/main/gemma/config.py - - Returns: - TransformerDecoder: Instantiation of Gemma2 2B model - """ - return gemma2( - vocab_size=256_000, - num_layers=26, - num_heads=8, - head_dim=256, - num_kv_heads=4, - embed_dim=2304, - intermediate_dim=9216, - max_seq_len=8192, - attn_dropout=0.0, - norm_eps=1e-6, - hidden_capping_value=30.0, - final_capping_value=50.0, - sliding_window_size=4096, - ) - - -def lora_gemma2_2b( - lora_attn_modules: List[LORA_ATTN_MODULES], - apply_lora_to_mlp: bool = False, - lora_rank: int = 8, - lora_alpha: float = 16, - lora_dropout: float = 0.0, - use_dora: bool = False, - quantize_base: bool = False, -) -> TransformerDecoder: - """ - Builder for creating a Gemma2 2B model with LoRA enabled. - - The Gemma defaults are the same as in :func:`~torchtune.models.gemma.gemma_2b`, - while LoRA default params are based on - https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. - - Args: - lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers - LoRA should be applied to in each self-attention block. Options are - ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. - apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. - Default: False - lora_rank (int): rank of each low-rank approximation - lora_alpha (float): scaling factor for the low-rank approximation - lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 - use_dora (bool): Decompose the LoRA weight into magnitude and direction, as - introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353). 
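
The `_permute` helpers in the removed `_convert_weights.py` above regroup the two rotary halves of the q/k projection weights between HF's interleaved per-head layout and torchtune's, and the two reshapes are exact inverses of each other. A small round-trip check follows; the function names and shapes are toy illustrations, not real Gemma 2 dimensions or torchtune APIs.

import torch

def permute_hf_to_tune(w: torch.Tensor, n_heads: int, head_dim: int, dim: int) -> torch.Tensor:
    # HF keeps the two rotary halves interleaved within each head; regroup them.
    return (
        w.view(n_heads, 2, head_dim // 2, dim)
        .transpose(1, 2)
        .reshape(n_heads * head_dim, dim)
    )

def permute_tune_to_hf(w: torch.Tensor, n_heads: int, head_dim: int, dim: int) -> torch.Tensor:
    # Inverse regrouping back to the interleaved layout.
    return (
        w.view(n_heads, head_dim // 2, 2, dim)
        .transpose(1, 2)
        .reshape(n_heads * head_dim, dim)
    )

n_heads, head_dim, dim = 4, 8, 32
w = torch.randn(n_heads * head_dim, dim)
roundtrip = permute_tune_to_hf(
    permute_hf_to_tune(w, n_heads, head_dim, dim), n_heads, head_dim, dim
)
assert torch.equal(roundtrip, w)
print("q/k permutation round-trips exactly")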
- quantize_base (bool): Whether to quantize base model weights - - Returns: - TransformerDecoder: Instantiation of Gemma2 2B model with LoRA applied - """ - return lora_gemma2( - lora_attn_modules=lora_attn_modules, - apply_lora_to_mlp=apply_lora_to_mlp, - vocab_size=256_000, - num_layers=26, - num_heads=8, - head_dim=256, - num_kv_heads=4, - embed_dim=2304, - intermediate_dim=9216, - max_seq_len=8192, - attn_dropout=0.0, - norm_eps=1e-6, - hidden_capping_value=30.0, - final_capping_value=50.0, - sliding_window_size=4096, - lora_rank=lora_rank, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - use_dora=use_dora, - quantize_base=quantize_base, - ) - -qlora_gemma2_2b = partial(lora_gemma2_2b, quantize_base=True) - -qlora_gemma2_2b.__doc__ = """ -Builder for creating a Gemma2 model with QLoRA enabled. Base model weights in linear layers -that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314. -Please see `lora_gemm2a_2b` for full API arguments. -""" - - - -def gemma2_9b() -> TransformerDecoder: - """ - Builder for creating a Gemma2 9B model initialized w/ the default 9b parameter values - from: https://github.com/google/gemma_pytorch/blob/main/gemma/config.py - - Returns: - TransformerDecoder: Instantiation of Gemma 9B model - """ - return gemma2( - vocab_size=256_000, - num_layers=42, - num_heads=16, - head_dim=256, - num_kv_heads=8, - embed_dim=3584, - intermediate_dim=14336, - max_seq_len=8192, - attn_dropout=0.0, - norm_eps=1e-6, - hidden_capping_value=30.0, - final_capping_value=50.0, - sliding_window_size=4096, - ) - - -def lora_gemma2_9b( - lora_attn_modules: List[LORA_ATTN_MODULES], - apply_lora_to_mlp: bool = False, - lora_rank: int = 8, - lora_alpha: float = 16, - lora_dropout: float = 0.0, - use_dora: bool = False, - quantize_base: bool = False, -) -> TransformerDecoder: - """ - Builder for creating a Gemma 9B model with LoRA enabled. - - The Gemma defaults are the same as in :func:`~torchtune.models.gemma.gemma_7b`, - while LoRA default params are based on - https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. - - Args: - lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers - LoRA should be applied to in each self-attention block. Options are - ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. - apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. - Default: False - lora_rank (int): rank of each low-rank approximation - lora_alpha (float): scaling factor for the low-rank approximation - lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 - use_dora (bool): Decompose the LoRA weight into magnitude and direction, as - introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353). 
- quantize_base (bool): Whether to quantize base model weights - - Returns: - TransformerDecoder: Instantiation of Gemma2 9B model with LoRA applied - """ - return lora_gemma2( - lora_attn_modules=lora_attn_modules, - apply_lora_to_mlp=apply_lora_to_mlp, - vocab_size=256_000, - num_layers=42, - num_heads=16, - head_dim=256, - num_kv_heads=8, - embed_dim=3584, - intermediate_dim=14336, - max_seq_len=8192, - attn_dropout=0.0, - norm_eps=1e-6, - hidden_capping_value=30.0, - final_capping_value=50.0, - sliding_window_size=4096, - lora_rank=lora_rank, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - use_dora=use_dora, - quantize_base=quantize_base, - ) - -qlora_gemma2_9b = partial(lora_gemma2_9b, quantize_base=True) - -qlora_gemma2_9b.__doc__ = """ -Builder for creating a Gemma model with QLoRA enabled. Base model weights in linear layers -that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314. -Please see `lora_gemma2_9b` for full API arguments. -""" - -def gemma2_27b() -> TransformerDecoder: - """ - Builder for creating a Gemma2 27B model initialized w/ the default 27b parameter values - from: https://github.com/google/gemma_pytorch/blob/main/gemma/config.py - - Returns: - TransformerDecoder: Instantiation of Gemma2 27B model - """ - return gemma2( - vocab_size=256_000, - num_layers=46, - num_heads=32, - head_dim=128, - num_kv_heads=16, - embed_dim=4608, - intermediate_dim=36864, - max_seq_len=8192, - attn_dropout=0.0, - norm_eps=1e-6, - hidden_capping_value=30.0, - final_capping_value=50.0, - sliding_window_size=4096, - query_pre_attn_scalar=144, - ) - - -def lora_gemma2_27b( - lora_attn_modules: List[LORA_ATTN_MODULES], - apply_lora_to_mlp: bool = False, - lora_rank: int = 8, - lora_alpha: float = 16, - lora_dropout: float = 0.0, - use_dora: bool = False, - quantize_base: bool = False, -) -> TransformerDecoder: - """ - Builder for creating a Gemma2 27B model with LoRA enabled. - - The Gemma defaults are the same as in :func:`~torchtune.models.gemma.gemma_7b`, - while LoRA default params are based on - https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. - - Args: - lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers - LoRA should be applied to in each self-attention block. Options are - ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. - apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. - Default: False - lora_rank (int): rank of each low-rank approximation - lora_alpha (float): scaling factor for the low-rank approximation - lora_dropout (float): dropout probability for the low-rank approximation. Default: 0.0 - use_dora (bool): Decompose the LoRA weight into magnitude and direction, as - introduced in "DoRA: Weight-Decomposed Low-Rank Adaptation" (https://arxiv.org/abs/2402.09353). 
- quantize_base (bool): Whether to quantize base model weights - - Returns: - TransformerDecoder: Instantiation of Gemma2 27B model with LoRA applied - """ - return lora_gemma2( - lora_attn_modules=lora_attn_modules, - apply_lora_to_mlp=apply_lora_to_mlp, - vocab_size=256_000, - num_layers=46, - num_heads=32, - head_dim=128, - num_kv_heads=16, - embed_dim=4608, - intermediate_dim=36864, - max_seq_len=8192, - attn_dropout=0.0, - norm_eps=1e-6, - hidden_capping_value=30.0, - final_capping_value=50.0, - sliding_window_size=4096, - query_pre_attn_scalar=144, - lora_rank=lora_rank, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - use_dora=use_dora, - quantize_base=quantize_base, - ) - -qlora_gemma2_27b = partial(lora_gemma2_27b, quantize_base=True) - -qlora_gemma2_27b.__doc__ = """ -Builder for creating a Gemma model with QLoRA enabled. Base model weights in linear layers -that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314. -Please see `lora_gemma2_27b` for full API arguments. -""" diff --git a/torchtune/training/checkpointing/_checkpointer.py b/torchtune/training/checkpointing/_checkpointer.py index ec79f0a4ba..8255fdac8c 100644 --- a/torchtune/training/checkpointing/_checkpointer.py +++ b/torchtune/training/checkpointing/_checkpointer.py @@ -488,16 +488,6 @@ def load_checkpoint(self) -> Dict[str, Any]: "supported_aspect_ratios", None ), ) - elif self._model_type == ModelType.GEMMA2: - from torchtune.models.gemma2._convert_weights import gemma2_hf_to_tune - - converted_state_dict[training.MODEL_KEY] = gemma2_hf_to_tune( - merged_state_dict, - num_heads=self._config["num_attention_heads"], - num_kv_heads=self._config["num_key_value_heads"], - dim=self._config["hidden_size"], - head_dim=self._config.get("head_dim", None), - ) else: converted_state_dict[training.MODEL_KEY] = convert_weights.hf_to_tune( merged_state_dict, @@ -588,16 +578,6 @@ def save_checkpoint( "supported_aspect_ratios", None ), ) - elif self._model_type == ModelType.GEMMA2: - from torchtune.models.gemma2._convert_weights import gemma2_tune_to_hf - - state_dict[training.MODEL_KEY] = gemma2_tune_to_hf( - state_dict[training.MODEL_KEY], - num_heads=self._config["num_attention_heads"], - num_kv_heads=self._config["num_key_value_heads"], - dim=self._config["hidden_size"], - head_dim=self._config.get("head_dim", None), - ) else: state_dict[training.MODEL_KEY] = convert_weights.tune_to_hf( state_dict[training.MODEL_KEY], diff --git a/torchtune/training/checkpointing/_utils.py b/torchtune/training/checkpointing/_utils.py index 2fa7265194..2d353b007c 100644 --- a/torchtune/training/checkpointing/_utils.py +++ b/torchtune/training/checkpointing/_utils.py @@ -45,7 +45,6 @@ class ModelType(Enum): Attributes: GEMMA (str): Gemma family of models. See :func:`~torchtune.models.gemma.gemma` - GEMMA2 (str): Gemma 2 family of models. See :func:`~torchtune.models.gemma2.gemma2` LLAMA2 (str): Llama2 family of models. See :func:`~torchtune.models.llama2.llama2` LLAMA3 (str): Llama3 family of models. See :func:`~torchtune.models.llama3.llama3` LLAMA3_2 (str): Llama3.2 family of models. See :func:`~torchtune.models.llama3_2.llama3_2` @@ -66,7 +65,6 @@ class ModelType(Enum): """ GEMMA: str = "gemma" - GEMMA2: str = "gemma2" LLAMA2: str = "llama2" LLAMA3: str = "llama3" LLAMA3_2: str = "llama3_2"