diff --git a/README.md b/README.md index dd39769439..02e71a2992 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,11 @@ ![Recipe Integration Test](https://github.com/pytorch/torchtune/actions/workflows/recipe_test.yaml/badge.svg) [![](https://dcbadge.vercel.app/api/server/4Xsdn8Rr9Q?style=flat)](https://discord.gg/4Xsdn8Rr9Q) +  +  + +**Note: torchtune now supports Llama3! Currently we support the Llama3 8B Model with LoRA, QLoRA and Full fine-tune. Find more details in the [Llama3](#llama3) section!** + # torchtune @@ -40,6 +45,7 @@ torchtune currently supports the following models. | Model | Sizes | |-----------------------------------------------|-----------| +| [Llama3](https://llama.meta.com/llama3) | 8B [[models](torchtune/models/llama3/_model_builders.py), [configs](recipes/configs/llama3/)] | | [Llama2](https://llama.meta.com/llama2/) | 7B, 13B [[models](torchtune/models/llama2/_model_builders.py), [configs](recipes/configs/llama2/)] | | [Mistral](https://huggingface.co/mistralai) | 7B [[model](torchtune/models/mistral/_model_builders.py), [configs](recipes/configs/mistral/)] | | [Gemma](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) | 2B [[model](torchtune/models/gemma/_model_builders.py), [configs](recipes/configs/gemma/)] | @@ -54,8 +60,8 @@ torchtune provides the following fine-tuning recipes. | Training | Fine-tuning Method | |------------------------------------|------------------------------------| -| Distributed Training [1 to 8 GPUs] | Full [[code](recipes/full_finetune_distributed.py), [example](recipes/configs/llama2/7B_full.yaml)], LoRA [[code](recipes/lora_finetune_distributed.py), [example](recipes/configs/llama2/7B_lora.yaml)] | -| Single Device / Low Memory [1 GPU] | Full [[code](recipes/full_finetune_single_device.py), [example](recipes/configs/llama2/7B_full_low_memory.yaml)], LoRA + QLoRA [[code](recipes/lora_finetune_single_device.py), [example](recipes/configs/llama2/7B_qlora_single_device.yaml)] | +| Distributed Training [1 to 8 GPUs] | Full [[code](recipes/full_finetune_distributed.py), [example](recipes/configs/llama3/8B_full.yaml)], LoRA [[code](recipes/lora_finetune_distributed.py), [example](recipes/configs/llama3/8B_lora.yaml)] | +| Single Device / Low Memory [1 GPU] | Full [[code](recipes/full_finetune_single_device.py), [example](recipes/configs/llama3/8B_full_single_device.yaml)], LoRA + QLoRA [[code](recipes/lora_finetune_single_device.py), [example](recipes/configs/llama3/8B_lora_single_device.yaml)] | | Single Device [1 GPU] | DPO [[code](recipes/full_finetune_distributed.py), [example](recipes/configs/llama2/7B_lora_dpo_single_device.yaml)]   @@ -69,14 +75,47 @@ This table captures the minimum memory requirements for our different recipes us | Example HW Resources | Finetuning Method | Config | Model | Peak Memory per GPU |--------------|-------------------|---------|------------|---------------------| -| 1 x RTX 4090 | QLoRA | [qlora_finetune_single_device](recipes/configs/llama2/7B_qlora_single_device.yaml) | Llama-7B | 9.29 GB | -| 2 x RTX 4090 | LoRA | [lora_finetune_distributed](recipes/configs/llama2/7B_lora.yaml) | Llama-7B | 20.95 GB | -| 1 x RTX 4090 | LoRA | [lora_finetune_single_device](recipes/configs/llama2/7B_lora_single_device.yaml) | Llama-7B | 17.18 GB | -| 1 x RTX 4090 | Full finetune | [full_finetune_single_device](recipes/configs/llama2/7B_full_low_memory.yaml) | Llama-7B | 14.97 GB | -| 4 x RTX 4090 | Full finetune | [full_finetune_distributed](recipes/configs/llama2/7B_full.yaml) | Llama-7B 
| 22.9 GB |
+| 1 x RTX 4090 | QLoRA | [qlora_finetune_single_device](recipes/configs/llama2/7B_qlora_single_device.yaml) | Llama2-7B | 8.57 GB |
+| 2 x RTX 4090 | LoRA | [lora_finetune_distributed](recipes/configs/llama2/7B_lora.yaml) | Llama2-7B | 20.95 GB |
+| 1 x RTX 4090 | LoRA | [lora_finetune_single_device](recipes/configs/llama2/7B_lora_single_device.yaml) | Llama2-7B | 17.18 GB |
+| 1 x RTX 4090 | Full finetune | [full_finetune_single_device](recipes/configs/llama2/7B_full_low_memory.yaml) | Llama2-7B | 14.97 GB |
+| 4 x RTX 4090 | Full finetune | [full_finetune_distributed](recipes/configs/llama2/7B_full.yaml) | Llama2-7B | 22.9 GB |
 
 * these are averaged over multiple runs, but there might be some variance based on the setup. We'll update this table regularly.
 
+&nbsp;
+
+## Llama3
+
+torchtune supports fine-tuning for the Llama3 8B model, with support for 70B on the way. We currently support LoRA, QLoRA and full fine-tuning on a single GPU, as well as LoRA and full fine-tuning on multiple devices. For all the details, take a look at our [tutorial](https://pytorch.org/torchtune/main/tutorials/llama3.html).
+
+
+In our initial experiments, QLoRA has a peak allocated memory of ``~9GB`` while LoRA on a single GPU has a peak allocated memory of ``~19GB``. To get started, you can use our default configs to kick off training.
+
+- LoRA on a single GPU
+
+```bash
+tune run lora_finetune_single_device --config llama3/8B_lora_single_device
+```
+
+- QLoRA on a single GPU
+
+```bash
+tune run lora_finetune_single_device --config llama3/8B_qlora_single_device
+```
+
+- LoRA on 2 GPUs
+
+```bash
+tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
+```
+
+- Full fine-tune on 2 GPUs
+
+```bash
+tune run --nproc_per_node 2 full_finetune_distributed --config llama3/8B_full
+```
+
+&nbsp;

diff --git a/docs/source/api_ref_models.rst b/docs/source/api_ref_models.rst
index 2467599b2d..e9a29ede9c 100644
--- a/docs/source/api_ref_models.rst
+++ b/docs/source/api_ref_models.rst
@@ -4,6 +4,25 @@ torchtune.models
 
 .. currentmodule:: torchtune.models
 
+llama3
+------
+
+All models from the `Llama3 family <https://llama.meta.com/llama3/>`_.
+
+.. code-block:: bash
+
+    tune download meta-llama/Meta-Llama-3-8B --hf-token <HF_TOKEN>
+
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    llama3.llama3_8b
+    llama3.lora_llama3_8b
+    llama3.qlora_llama3_8b
+
+
 llama2
 ------
 
@@ -26,6 +45,7 @@ Pre-trained models can be downloaded from the Hugging Face Hub with the followin
     llama2.lora_llama2_13b
     llama2.qlora_llama2_13b
 
+
 mistral
 -------
 
diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst
index 70d545357f..e38926f36f 100644
--- a/docs/source/api_ref_modules.rst
+++ b/docs/source/api_ref_modules.rst
@@ -17,10 +17,18 @@ Modeling Components and Building Blocks
     get_cosine_schedule_with_warmup
     RotaryPositionalEmbeddings
     RMSNorm
-    Tokenizer
     TransformerDecoderLayer
     TransformerDecoder
 
+Tokenizers
+------------------------
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    tokenizers.SentencePieceTokenizer
+    tokenizers.TikTokenTokenizer
 
 PEFT Components
 ---------------
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 78340bd769..c55c723634 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -43,6 +43,13 @@ torchtune tutorials.
 
 .. customcardstart::
 
+.. customcarditem::
+   :header: Llama3 in torchtune
+   :card_description: Fine-tuning Llama3 with torchtune
+   :image: _static/img/generic-pytorch-logo.png
+   :link: tutorials/llama3.html
+   :tags: finetuning,llama3
+
 ..
customcarditem:: :header: Finetuning with LoRA in torchtune :card_description: Parameter-efficient finetuning of Llama2 using LoRA @@ -88,6 +95,7 @@ torchtune tutorials. :caption: Tutorials :hidden: + tutorials/llama3 tutorials/lora_finetune tutorials/qlora_finetune tutorials/e2e_flow diff --git a/docs/source/tutorials/first_finetune_tutorial.rst b/docs/source/tutorials/first_finetune_tutorial.rst index dbff6a2d29..072cb2d79c 100644 --- a/docs/source/tutorials/first_finetune_tutorial.rst +++ b/docs/source/tutorials/first_finetune_tutorial.rst @@ -98,6 +98,8 @@ a single device. For a more in-depth discussion on LoRA in torchtune, you can se | +.. _tune_cp_label: + Modifying a config ------------------ YAML configs hold most of the important information needed for running your recipe. diff --git a/docs/source/tutorials/llama3.rst b/docs/source/tutorials/llama3.rst new file mode 100644 index 0000000000..ff1c0120e1 --- /dev/null +++ b/docs/source/tutorials/llama3.rst @@ -0,0 +1,328 @@ +==================== +Llama3 in torchtune +==================== + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn how to: + + * Download the Llama3-8B weights and tokenizer + * Fine-tune Llama3-8B with LoRA and QLoRA + * Evaluate your fine-tuned Llama3-8B model + * Generate text with your fine-tuned model + * Quantize your model to speed up generation + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + + * Be familiar with :ref:`torchtune` + * Make sure to :ref:`install torchtune` + + +Llama3-8B +---------- + +`Llama3-8B `_ is a new model released by Meta AI that improves upon the performance of the Llama2 family +of models across a `range of different benchmarks `_. +There are a few main changes between Llama2-7B and Llama3-8B models: + +- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B +- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models) +- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_) +- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B +- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_ + +| + +Getting access to Llama3-8B +--------------------------- + +First, let's download the model from Hugging Face. You will need to follow the instructions +on the `official Meta page `_ to gain access to the model. +Next, make sure you grab your Hugging Face token from `here `_. + + +.. code-block:: bash + + tune download meta-llama/Meta-Llama-3-8B \ + --output-dir \ + --hf-token + +| + +Fine-tuning Llama3-8B in torchtune +---------------------------------- + +torchtune provides `LoRA `_, `QLoRA `_, and full fine-tuning +recipes for fine-tuning Llama3-8B on one or more GPUs. For more on LoRA in torchtune, see our :ref:`LoRA Tutorial `. +For more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `. + +Let's take a look at how we can fine-tune Llama3-8B with LoRA on a single device using torchtune. In this example, we will fine-tune +for one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is + +.. code-block:: bash + + tune run lora_finetune_single_device --config llama3/8B_lora_single_device + +.. note:: + To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line. + +We can also add command-line overrides as needed, e.g. + +.. 
code-block:: bash
+
+    tune run lora_finetune_single_device --config llama3/8B_lora_single_device \
+    checkpointer.checkpoint_dir=<checkpoint_dir> \
+    tokenizer.path=<checkpoint_dir>/tokenizer.model \
+    checkpointer.output_dir=<checkpoint_dir>
+
+This will load the Llama3-8B checkpoint and tokenizer from ``<checkpoint_dir>`` used in the ``tune download`` command above,
+then save a final checkpoint in the same directory following the original format. For more details on the
+checkpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.
+
+.. note::
+    To see the full set of configurable parameters for this config (and others), we can use ``tune cp`` to copy (and modify)
+    the default config. ``tune cp`` can be used with recipe scripts too, in case you want to make more custom changes
+    that cannot be achieved by directly modifying existing configurable parameters. For more on ``tune cp`` see the section on
+    :ref:`modifying configs <tune_cp_label>`.
+
+Once training is complete, the model checkpoints will be saved and their locations will be logged. For
+LoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights
+will be saved separately.
+
+In our experiments, we observed a peak memory usage of 18.5 GB, so the default config can be trained on a consumer GPU with 24 GB of VRAM.
+
+If you have multiple GPUs available, you can run the distributed version of the recipe.
+torchtune makes use of the `FSDP <https://pytorch.org/docs/stable/fsdp.html>`_ APIs from PyTorch Distributed
+to shard the model, optimizer states, and gradients. This should enable you to increase your batch size, resulting in faster training.
+For example, on two devices:
+
+.. code-block:: bash
+
+    tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
+
+Finally, if we want to use even less memory, we can leverage torchtune's QLoRA recipe via:
+
+.. code-block:: bash
+
+    tune run lora_finetune_single_device --config llama3/8B_qlora_single_device
+
+Since our default configs enable full bfloat16 training, all of the above commands can be run with
+devices having at least 24 GB of VRAM, and in fact the QLoRA recipe should have peak allocated memory
+below 10 GB. You can also experiment with different configurations of LoRA and QLoRA, or even run a full fine-tune.
+Try it out!
+
+|
+
+Evaluating fine-tuned Llama3-8B models with EleutherAI's Eval Harness
+---------------------------------------------------------------------
+
+Now that we've fine-tuned Llama3-8B, what's next? Let's take our LoRA-finetuned model from the
+preceding section and look at a couple of different ways we can evaluate its performance on the tasks we care about.
+
+First, torchtune provides an integration with
+`EleutherAI's evaluation harness <https://github.com/EleutherAI/lm-evaluation-harness>`_
+for model evaluation on common benchmark tasks.
+
+.. note::
+    Make sure you've first installed the evaluation harness via :code:`pip install "lm_eval==0.4.*"`.
+
+For this tutorial we'll use the ``truthfulqa_mc2`` task from the harness.
+This task measures a model's propensity to be truthful when answering questions and
+measures the model's zero-shot accuracy on a question followed by one or more true
+responses and one or more false responses. First, let's copy the config so we can point the YAML
+file to our fine-tuned checkpoint files.
+
+.. code-block:: bash
+
+    tune cp eleuther_evaluation ./custom_eval_config.yaml
+
+Next, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoints.
+
+..
code-block:: yaml + + checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + + # directory with the checkpoint files + # this should match the output_dir specified during + # fine-tuning + checkpoint_dir: + + # checkpoint files for the fine-tuned model. These will be logged + # at the end of your fine-tune + checkpoint_files: [ + consolidated.00.pth + ] + + output_dir: + model_type: LLAMA3 + + # Make sure to update the tokenizer path to the right + # checkpoint directory as well + tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tokenizer.model + +Finally, we can run evaluation using our modified config. + +.. code-block:: bash + + tune run eleuther_eval --config ./custom_eval_config.yaml + +Try it for yourself and see what accuracy your model gets! + +| + +Generating text with our fine-tuned Llama3-8B model +--------------------------------------------------- + +Next, let's look at one other way we can evaluate our model: generating text! torchtune provides a +`recipe for generation `_ as well. + +Similar to what we did, let's copy and modify the default generation config. + +.. code-block:: bash + + tune cp generation ./custom_generation_config.yaml + +Now we modify ``custom_generation_config.yaml`` to point to our checkpoint and tokenizer. + +.. code-block:: yaml + + checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + + # directory with the checkpoint files + # this should match the output_dir specified during + # fine-tuning + checkpoint_dir: + + # checkpoint files for the fine-tuned model. These will be logged + # at the end of your fine-tune + checkpoint_files: [ + consolidated.00.pth + ] + + output_dir: + model_type: LLAMA3 + + # Make sure to update the tokenizer path to the right + # checkpoint directory as well + tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tokenizer.model + +Running generation with our LoRA-finetuned model, we see the following output: + +.. code-block:: bash + + tune run generate --config ./custom_generation_config.yaml \ + prompt="Hello, my name is" + + [generate.py:122] Hello, my name is Sarah and I am a busy working mum of two young children, living in the North East of England. + ... + [generate.py:135] Time for inference: 10.88 sec total, 18.94 tokens/sec + [generate.py:138] Bandwidth achieved: 346.09 GB/s + [generate.py:139] Memory used: 18.31 GB + +Faster generation via quantization +---------------------------------- + +We can see that the model took just under 11 seconds, generating almost 19 tokens per second. +We can speed this up a bit by quantizing our model. Here we'll use 4-bit weights-only quantization +as provided by `torchao `_. + +If you've been following along this far, you know the drill by now. +Let's copy the quantization config and point it at our fine-tuned model. + +.. code-block:: bash + + tune cp quantization ./custom_quantization_config.yaml + +And update ``custom_quantization_config.yaml`` with the following: + +.. code-block:: yaml + + checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + + # directory with the checkpoint files + # this should match the output_dir specified during + # fine-tuning + checkpoint_dir: + + # checkpoint files for the fine-tuned model. These will be logged + # at the end of your fine-tune + checkpoint_files: [ + consolidated.00.pth + ] + + output_dir: + model_type: LLAMA3 + +To quantize the model, we can now run: + +.. 
code-block:: bash
+
+    tune run quantize --config ./custom_quantization_config.yaml
+
+    [quantize.py:90] Time for quantization: 2.93 sec
+    [quantize.py:91] Memory used: 23.13 GB
+    [quantize.py:104] Model checkpoint of size 4.92 GB saved to /tmp/Llama-3-8B-hf/meta_model_0-4w.pt
+
+We can see that the model is now under 5 GB, or about five bits for each of the 8B parameters.
+
+.. note::
+    Unlike the fine-tuned checkpoints, the quantization recipe outputs a single checkpoint file. This is
+    because our quantization APIs currently don't support any conversion across formats.
+    As a result you won't be able to use these quantized models outside of torchtune.
+    But you should be able to use these with the generation and evaluation recipes within
+    torchtune. These results will help inform which quantization methods you should use
+    with your favorite inference engine.
+
+Let's take our quantized model and run the same generation again.
+First, we'll make one more change to our ``custom_generation_config.yaml``.
+
+.. code-block:: yaml
+
+    checkpointer:
+        # we need to use the custom TorchTune checkpointer
+        # instead of the Meta checkpointer for loading
+        # quantized models
+        _component_: torchtune.utils.FullModelTorchTuneCheckpointer
+
+        # directory with the checkpoint files
+        # this should match the output_dir specified during
+        # fine-tuning
+        checkpoint_dir: <checkpoint_dir>
+
+        # checkpoint files point to the quantized model
+        checkpoint_files: [
+            meta_model_0-4w.pt,
+        ]
+
+        output_dir: <output_dir>
+        model_type: LLAMA3
+
+    # we also need to update the quantizer to what was used during
+    # quantization
+    quantizer:
+        _component_: torchtune.utils.quantization.Int4WeightOnlyQuantizer
+        groupsize: 256
+
+Let's re-run generation!
+
+.. code-block:: bash
+
+    tune run generate --config ./custom_generation_config.yaml \
+    prompt="Hello, my name is"
+
+    [generate.py:122] Hello, my name is Jake.
+    I am a multi-disciplined artist with a passion for creating, drawing and painting.
+    ...
+    Time for inference: 1.62 sec total, 57.95 tokens/sec
+
+By quantizing the model and running ``torch.compile`` we get over a 3x speedup!
+
+This is just the beginning of what you can do with Llama3-8B using torchtune and the broader ecosystem.
+We look forward to seeing what you build!
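A quick way to sanity-check the pieces in the tutorial above before launching a full run is to load the new Llama3 tokenizer directly in Python. The sketch below is illustrative only and is not part of this patch: it assumes the ``llama3_tokenizer`` builder accepts the ``path`` argument shown in the configs and exposes ``encode``/``decode`` methods analogous to the existing SentencePiece tokenizer; adjust if the actual ``TikTokenTokenizer`` API differs.

```python
# Hypothetical sanity check -- not part of this patch.
# Assumes llama3_tokenizer(path=...) plus encode()/decode() methods that
# mirror the SentencePieceTokenizer interface; the real TikTokenTokenizer
# signatures may differ slightly.
from torchtune.models.llama3 import llama3_tokenizer

# Same path used by the llama3 configs added in this diff.
tokenizer = llama3_tokenizer(path="/tmp/Meta-Llama-3-8B/original/tokenizer.model")

prompt = "Hello, my name is"
token_ids = tokenizer.encode(prompt, add_bos=True, add_eos=False)
print(f"{len(token_ids)} tokens: {token_ids}")

# Round-trip back to text (special tokens aside) to confirm that the
# tokenizer.model file is the one the fine-tuning configs expect.
print(tokenizer.decode(token_ids))
```

If this loads and round-trips the prompt, the ``tokenizer.path`` entries in the configs point at a usable tokenizer file and the fine-tuning, evaluation, and generation commands above should find it without further changes.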
diff --git a/pyproject.toml b/pyproject.toml index 72f17d1cb3..158a538f3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,8 +15,12 @@ dependencies = [ "huggingface_hub", "safetensors", - # Miscellaneous + # Tokenization "sentencepiece", + "tiktoken", + "blobfile>=2", + + # Miscellaneous "tqdm", "omegaconf", @@ -35,7 +39,7 @@ tune = "torchtune._cli.tune:main" [project.optional-dependencies] dev = [ - "bitsandbytes", + "bitsandbytes>=0.43.0", "pre-commit", "pytest", "pytest-cov", diff --git a/recipes/configs/generation.yaml b/recipes/configs/generation.yaml index 6cd9c1ba87..96a54d3e5c 100644 --- a/recipes/configs/generation.yaml +++ b/recipes/configs/generation.yaml @@ -30,7 +30,7 @@ tokenizer: # Generation arguments; defaults taken from gpt-fast prompt: "Hello, my name is" max_new_tokens: 300 -temperature: 0.8 +temperature: 0.6 # 0.8 and 0.6 are popular values to try top_k: 300 quantizer: null diff --git a/recipes/configs/llama3/8B_full.yaml b/recipes/configs/llama3/8B_full.yaml new file mode 100644 index 0000000000..d2d060d269 --- /dev/null +++ b/recipes/configs/llama3/8B_full.yaml @@ -0,0 +1,77 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# To launch on 4 devices, run the following command from root: +# tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. 
It's +# best to use 8B_full_single_device.yaml for those cases + + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset + train_on_input: True +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.llama3.llama3_8b + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 3 + +optimizer: + _component_: torch.optim.AdamW + lr: 2e-5 + foreach: False + +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 + + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/alpaca-llama3-finetune +log_every_n_steps: null diff --git a/recipes/configs/llama3/8B_full_single_device.yaml b/recipes/configs/llama3/8B_full_single_device.yaml new file mode 100644 index 0000000000..1ecc5e7b61 --- /dev/null +++ b/recipes/configs/llama3/8B_full_single_device.yaml @@ -0,0 +1,77 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config llama3/8B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config llama3/8B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset + train_on_input: True +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.llama3.llama3_8b + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 3 +optimizer: + _component_: bitsandbytes.optim.AdamW8bit + lr: 2e-5 +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +optimizer_in_bwd: True +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/alpaca-llama3-finetune +log_every_n_steps: null diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml new file mode 100644 index 0000000000..3b8479a823 --- /dev/null +++ b/recipes/configs/llama3/8B_lora.yaml @@ -0,0 +1,80 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# To launch on 2 devices, run the following command from root: +# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 8B_lora_single_device.yaml +# or 8B_qlora_single_device.yaml + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +# Model Arguments +model: + _component_: torchtune.models.llama3.lora_llama3_8b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + train_on_input: True +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 + +# Logging +output_dir: /tmp/lora_finetune_output +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: null + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml new file mode 100644 index 0000000000..b6b33466ca --- /dev/null +++ b/recipes/configs/llama3/8B_lora_single_device.yaml @@ -0,0 +1,85 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config llama3/8B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config llama3/8B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ + +# Model Arguments +model: + _component_: torchtune.models.llama3.lora_llama3_8b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + train_on_input: True +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/lora_finetune_output +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: null + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Profiler (disabled) +profiler: + _component_: torchtune.utils.profiler + enabled: False diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml new file mode 100644 index 0000000000..a951b9d660 --- /dev/null +++ b/recipes/configs/llama3/8B_qlora_single_device.yaml @@ -0,0 +1,86 @@ +# Config for single device QLoRA with lora_finetune_single_device.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config llama3/8B_qlora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config llama3/8B_qlora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ +# Model Arguments +model: + _component_: torchtune.models.llama3.qlora_llama3_8b + lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + train_on_input: True +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +# Note: compile for QLoRA is only supported on nightly +# PyTorch (>= 2.4.0.dev20240408) +compile: False + +# Logging +output_dir: /tmp/qlora_finetune_output/ +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Profiler (disabled) +profiler: + _component_: torchtune.utils.profiler + enabled: False diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index 81c0253f35..c6911886fa 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -15,7 +15,8 @@ from torch import nn from torchtune import config, utils -from torchtune.modules import Tokenizer, TransformerDecoder +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import Tokenizer from torchtune.recipe_interfaces import EvalRecipeInterface diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 95aedacc17..8ea06343aa 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -396,7 +396,6 @@ def train(self) -> None: == self.max_steps_per_epoch ): break - input_ids, labels = batch input_ids = input_ids.to(self._device) labels = labels.to(self._device) diff --git a/tests/assets/tiktoken_small.model b/tests/assets/tiktoken_small.model new file mode 100644 index 0000000000..4bfad62542 --- /dev/null +++ b/tests/assets/tiktoken_small.model @@ -0,0 +1,2000 @@ +AA== 0 +AQ== 1 +Ag== 2 +Aw== 3 +BA== 4 +BQ== 5 +Bg== 6 +Bw== 7 +CA== 8 +CQ== 9 +Cg== 10 +Cw== 11 +DA== 12 +DQ== 13 +Dg== 14 +Dw== 15 +EA== 16 +EQ== 17 +Eg== 18 +Ew== 19 +FA== 20 +FQ== 21 +Fg== 22 +Fw== 23 +GA== 24 +GQ== 25 +Gg== 26 +Gw== 27 +HA== 28 +HQ== 29 +Hg== 30 +Hw== 31 +IA== 32 +IQ== 33 +Ig== 34 +Iw== 35 +JA== 36 +JQ== 37 +Jg== 38 +Jw== 39 +KA== 40 +KQ== 41 +Kg== 42 +Kw== 43 +LA== 44 +LQ== 45 +Lg== 46 +Lw== 47 +MA== 48 +MQ== 49 +Mg== 50 +Mw== 51 +NA== 52 +NQ== 53 +Ng== 54 +Nw== 55 +OA== 56 +OQ== 57 +Og== 58 +Ow== 59 +PA== 60 +PQ== 61 +Pg== 62 +Pw== 63 +QA== 64 +QQ== 65 +Qg== 66 +Qw== 67 +RA== 68 +RQ== 69 +Rg== 70 +Rw== 71 +SA== 72 +SQ== 73 +Sg== 74 +Sw== 75 +TA== 76 +TQ== 77 +Tg== 78 +Tw== 79 +UA== 80 +UQ== 81 +Ug== 82 +Uw== 83 +VA== 84 +VQ== 85 +Vg== 86 +Vw== 87 +WA== 88 +WQ== 89 +Wg== 90 +Ww== 91 +XA== 92 +XQ== 93 +Xg== 94 +Xw== 95 +YA== 96 
+YQ== 97 +Yg== 98 +Yw== 99 +ZA== 100 +ZQ== 101 +Zg== 102 +Zw== 103 +aA== 104 +aQ== 105 +ag== 106 +aw== 107 +bA== 108 +bQ== 109 +bg== 110 +bw== 111 +cA== 112 +cQ== 113 +cg== 114 +cw== 115 +dA== 116 +dQ== 117 +dg== 118 +dw== 119 +eA== 120 +eQ== 121 +eg== 122 +ew== 123 +fA== 124 +fQ== 125 +fg== 126 +fw== 127 +gA== 128 +gQ== 129 +gg== 130 +gw== 131 +hA== 132 +hQ== 133 +hg== 134 +hw== 135 +iA== 136 +iQ== 137 +ig== 138 +iw== 139 +jA== 140 +jQ== 141 +jg== 142 +jw== 143 +kA== 144 +kQ== 145 +kg== 146 +kw== 147 +lA== 148 +lQ== 149 +lg== 150 +lw== 151 +mA== 152 +mQ== 153 +mg== 154 +mw== 155 +nA== 156 +nQ== 157 +ng== 158 +nw== 159 +oA== 160 +oQ== 161 +og== 162 +ow== 163 +pA== 164 +pQ== 165 +pg== 166 +pw== 167 +qA== 168 +qQ== 169 +qg== 170 +qw== 171 +rA== 172 +rQ== 173 +rg== 174 +rw== 175 +sA== 176 +sQ== 177 +sg== 178 +sw== 179 +tA== 180 +tQ== 181 +tg== 182 +tw== 183 +uA== 184 +uQ== 185 +ug== 186 +uw== 187 +vA== 188 +vQ== 189 +vg== 190 +vw== 191 +wA== 192 +wQ== 193 +wg== 194 +ww== 195 +xA== 196 +xQ== 197 +xg== 198 +xw== 199 +yA== 200 +yQ== 201 +yg== 202 +yw== 203 +zA== 204 +zQ== 205 +zg== 206 +zw== 207 +0A== 208 +0Q== 209 +0g== 210 +0w== 211 +1A== 212 +1Q== 213 +1g== 214 +1w== 215 +2A== 216 +2Q== 217 +2g== 218 +2w== 219 +3A== 220 +3Q== 221 +3g== 222 +3w== 223 +4A== 224 +4Q== 225 +4g== 226 +4w== 227 +5A== 228 +5Q== 229 +5g== 230 +5w== 231 +6A== 232 +6Q== 233 +6g== 234 +6w== 235 +7A== 236 +7Q== 237 +7g== 238 +7w== 239 +8A== 240 +8Q== 241 +8g== 242 +8w== 243 +9A== 244 +9Q== 245 +9g== 246 +9w== 247 ++A== 248 ++Q== 249 ++g== 250 ++w== 251 +/A== 252 +/Q== 253 +/g== 254 +/w== 255 +IHQ= 256 +aGU= 257 +IGE= 258 +aW4= 259 +IHM= 260 +IHc= 261 +IHRoZQ== 262 +IG8= 263 +cmU= 264 +IGI= 265 +b3U= 266 +ZWQ= 267 +IG0= 268 +bmQ= 269 +IEk= 270 +aGE= 271 +aXQ= 272 +ZXI= 273 +aW5n 274 +IGY= 275 +aXM= 276 +IHRv 277 +ZW4= 278 +b24= 279 +b3I= 280 +YXM= 281 +IGM= 282 +IG9m 283 +IGFuZA== 284 +IGQ= 285 +bGw= 286 +YXQ= 287 +YW4= 288 +YXI= 289 +IHA= 290 +IG4= 291 +IGlu 292 +bGU= 293 +b20= 294 +b3Q= 295 +IGJl 296 +IGg= 297 +dXQ= 298 +b3c= 299 +ZXM= 300 +aGF0 301 +IGc= 302 +IGhl 303 +IGhh 304 +IGw= 305 +IHdhcw== 306 +bGQ= 307 +Z2g= 308 +aWQ= 309 +Y2g= 310 +IHRo 311 +IGl0 312 +YXk= 313 +IG9u 314 +Y2U= 315 +c2U= 316 +ZW50 317 +IHN0 318 +bHk= 319 +dmU= 320 +ZXQ= 321 +c3Q= 322 +IFQ= 323 +IGU= 324 +IHk= 325 +Z2h0 326 +aXI= 327 +IG1l 328 +b28= 329 +YWw= 330 +aXRo 331 +IHJl 332 +aW0= 333 +IHRoYXQ= 334 +IGFz 335 +b3VsZA== 336 +cm8= 337 +YWQ= 338 +aW9u 339 +Lgo= 340 +aGVy 341 +IG15 342 +Y3Q= 343 +IG5vdA== 344 +IHdpdGg= 345 +IGZvcg== 346 +IHU= 347 +a2U= 348 +IHlvdQ== 349 +IFM= 350 +IGlz 351 +aWdodA== 352 +Igo= 353 +YW0= 354 +aWM= 355 +dXI= 356 +IGF0 357 +Li4= 358 +YWM= 359 +dGVy 360 +IHdo 361 +IGFu 362 +IHdl 363 +IFRoZQ== 364 +aWY= 365 +IG9y 366 +IGJ1dA== 367 +dmVy 368 +ICI= 369 +IHI= 370 +b3V0 371 +b21l 372 +IGhhZA== 373 +cHA= 374 +cXU= 375 +IHN1 376 +IHRoaXM= 377 +cmVk 378 +YXJk 379 +IHNv 380 +ZWxs 381 +IHdvdWxk 382 +IGhpcw== 383 +IHNo 384 +aW5l 385 +cmE= 386 +IHNl 387 +IGJ5 388 +LiIK 389 +IFA= 390 +aGVu 391 +IEE= 392 +IGhhdmU= 393 +IGZy 394 +IHNh 395 +IEg= 396 +IG9uZQ== 397 +ZW0= 398 +a2Vk 399 +aXJ0 400 +ZWN0 401 +IGhpbQ== 402 +IGxp 403 +IGFi 404 +YXRpb24= 405 +aGluZw== 406 +dGhl 407 +IFI= 408 +IGxl 409 +c3M= 410 +IFc= 411 +Y3U= 412 +aWxs 413 +J3Q= 414 +YXJ0 415 +YWxs 416 +LAo= 417 +b3du 418 +b3Jl 419 +IGFsbA== 420 +IGs= 421 +IGdv 422 +aGlydA== 423 +YW5k 424 +IG91dA== 425 +YW1l 426 +YWlu 427 +IGlm 428 +IG5v 429 +IGRv 430 +IHRoZXk= 431 +b29s 432 +dW4= 433 +dG8= 434 +IHVw 435 +IFJlZA== 436 +IG5l 437 +IEs= 438 +IGZyb20= 439 +IFNoaXJ0 440 
+IHdvcg== 441 +b25n 442 +IHRoZXJl 443 +IHNhaWQ= 444 +cmk= 445 +YW50 446 +IEI= 447 +IGFueQ== 448 +dWQ= 449 +aW5k 450 +IHdoaQ== 451 +YWI= 452 +b3VuZA== 453 +IGFib3V0 454 +IHRoZW0= 455 +Y3Vw 456 +YWs= 457 +IGRl 458 +IHRl 459 +IE0= 460 +YWtl 461 +Y3VwaW5l 462 +aWc= 463 +IHdlcmU= 464 +b3JjdXBpbmU= 465 +aWw= 466 +Y2hvb2w= 467 +IHJv 468 +b29k 469 +IGFyZQ== 470 +aXZl 471 +IGxpa2U= 472 +eW8= 473 +IGhvdQ== 474 +J3M= 475 +b25l 476 +dXM= 477 +ZWw= 478 +dWw= 479 +YWNr 480 +b3A= 481 +LCI= 482 +dGg= 483 +YWNoZXI= 484 +dW0= 485 +YW5n 486 +IGZh 487 +YWc= 488 +IHNjaG9vbA== 489 +IGo= 490 +dGU= 491 +b2s= 492 +ZXNz 493 +dXN0 494 +ZXJz 495 +Li4uLg== 496 +IEM= 497 +dGhlcg== 498 +aGFu 499 +IHdoZW4= 500 +IHNw 501 +IG1hbg== 502 +IGNhbg== 503 +b3VnaA== 504 +IHdobw== 505 +IGdldA== 506 +IGRpZA== 507 +IHBv 508 +Y2k= 509 +IGFs 510 +aXN0 511 +IGNvbQ== 512 +bGY= 513 +YXU= 514 +IFBvcmN1cGluZQ== 515 +IHdoaWNo 516 +dmVu 517 +IGFm 518 +d24= 519 +YXNz 520 +YmVy 521 +IGV4 522 +b3Vz 523 +ZXN0 524 +bG8= 525 +IHRy 526 +ZWxsb3c= 527 +IHNheQ== 528 +b3VnaHQ= 529 +IHJvb20= 530 +IHNvbWU= 531 +LS0= 532 +IE8= 533 +YXRl 534 +IHY= 535 +aGVk 536 +YXA= 537 +IHR3 538 +IGJlYw== 539 +cmVl 540 +amVjdA== 541 +a3M= 542 +IGNvbg== 543 +IGJlZW4= 544 +ZW50cw== 545 +aWRl 546 +IGNvdWxk 547 +IEc= 548 +ZXA= 549 +IHBybw== 550 +bnQ= 551 +IGhvdXNl 552 +IGFn 553 +IElm 554 +IGtu 555 +IGZlbGxvdw== 556 +IHdoYXQ= 557 +d2F5 558 +aXNo 559 +IGFt 560 +aXRl 561 +bmRlcg== 562 +aW1l 563 +IHBy 564 +IHRlYWNoZXI= 565 +YXJl 566 +IGJv 567 +IHNoZQ== 568 +IE4= 569 +aWNl 570 +YXN0 571 +dXJl 572 +aWU= 573 +IHN1Y2g= 574 +dXRlbg== 575 +dXRlbmJlcg== 576 +dXRlbmJlcmc= 577 +IHF1 578 +bG93bg== 579 +IHdy 580 +cHQ= 581 +IEhl 582 +IHN0dWQ= 583 +aGVyZQ== 584 +IG1vcmU= 585 +cnk= 586 +dHRlcg== 587 +IFk= 588 +IG1heQ== 589 +aXR5 590 +IGxvbw== 591 +IG90aGVy 592 +aGlz 593 +IFBybw== 594 +IHdpbGw= 595 +IEl0 596 +b3J0 597 +IHNob3VsZA== 598 +dmVyeQ== 599 +d2U= 600 +IHBs 601 +YXNo 602 +LiI= 603 +IGFwcA== 604 +IGRheQ== 605 +dXJu 606 +cG8= 607 +IGhlcg== 608 +ICA= 609 +bm90 610 +Y2s= 611 +IHVu 612 +aGk= 613 +dmluZw== 614 +IG9sZA== 615 +IHRpbWU= 616 +IlQ= 617 +IHdheQ== 618 +YWJsZQ== 619 +PyIK 620 +IENsb3du 621 +IG9ubHk= 622 +dWI= 623 +YWNo 624 +IG9mZg== 625 +IHRoYW4= 626 +YWxseQ== 627 +IHRoZWly 628 +YmU= 629 +a2luZw== 630 +b3RoZXI= 631 +YXJ5 632 +YW5z 633 +YXRlZA== 634 +c2VsZg== 635 +IGdvaW5n 636 +dWNo 637 +b2xs 638 +IGJhY2s= 639 +aXlv 640 +LXQ= 641 +YW5jZQ== 642 +YWRl 643 +IFByb2plY3Q= 644 +c3A= 645 +IHR3bw== 646 +IHRob3VnaHQ= 647 +c28= 648 +IHJpZ2h0 649 +IGhlYWQ= 650 +dmVk 651 +IEQ= 652 +IHByZQ== 653 +IHNlZQ== 654 +IHVz 655 +IHN0dWRlbnRz 656 +Y2lw 657 +IGRvbg== 658 +IG5pZ2h0 659 +aW5jaXA= 660 +IEtpeW8= 661 +cGw= 662 +YXJlZA== 663 +IEd1dGVuYmVyZw== 664 +IGNv 665 +IGhvdw== 666 +b21ldA== 667 +ZmY= 668 +Ikk= 669 +LC0t 670 +IGFza2Vk 671 +aW5jaXBhbA== 672 +ZXZlcg== 673 +IGFj 674 +IEY= 675 +IG1ha2U= 676 +aXR0 677 +IG1pZ2h0 678 +Z2U= 679 +bGVk 680 +IGFmdGVy 681 +aWdu 682 +IGdy 683 +IG1hZGU= 684 +ZGQ= 685 +IGtub3c= 686 +IGNvbWU= 687 +IGJy 688 +dGhpbmc= 689 +IEJ1dA== 690 +IG1hdA== 691 +IE9u 692 +b3J5 693 +Y2w= 694 +IEU= 695 +Ymxl 696 +b2c= 697 +IHlvdXI= 698 +dWxs 699 +IHdvcms= 700 +ZWFy 701 +IHRocmVl 702 +aWVk 703 +YnV0 704 +VGhl 705 +cGU= 706 +YWNl 707 +IHN0YXJ0 708 +aWNr 709 +IG92ZXI= 710 +b3Vy 711 +IG11Y2g= 712 +IHdhbnQ= 713 +aW1w 714 +IHBhcnQ= 715 +aG8= 716 +aW5r 717 +ZW5jZQ== 718 +IGRvd24= 719 +IGV2ZW4= 720 +IHByaW5jaXBhbA== 721 +bGluZw== 722 +b3VudA== 723 +YXVzZQ== 724 +IGNs 725 +IGJs 726 +LXRt 727 +b21ldGhpbmc= 728 +IGludG8= 729 +b3Jt 730 +b2t5bw== 731 +IGRpcw== 732 +IGZl 733 +IGZhY2U= 734 +Li4uLi4u 735 
+cmVzcw== 736 +bWVudA== 737 +aXJl 738 +IGFy 739 +dHk= 740 +IG1v 741 +cmVhdA== 742 +IGZpcg== 743 +cGVy 744 +IG91cg== 745 +Y28= 746 +IHRoZW4= 747 +IHRvbGQ= 748 +aW5ncw== 749 +IHRha2U= 750 +IGJlZw== 751 +bmVy 752 +aXRpb24= 753 +b3Nl 754 +IG93bg== 755 +IGFnYWlu 756 +IHNlZW0= 757 +aXNl 758 +IHdhdA== 759 +Ilc= 760 +IGZhcg== 761 +YWtpbmc= 762 +Zm9yZQ== 763 +YWR5 764 +LXM= 765 +bGVzcw== 766 +IHJldA== 767 +IHNoYQ== 768 +IGNhbWU= 769 +Z2Vy 770 +IGdvb2Q= 771 +YXRoZXI= 772 +YXJr 773 +cm93 774 +IGtl 775 +J20= 776 +IGhhcw== 777 +YXRo 778 +cHBlZA== 779 +IHdlbnQ= 780 +IHRlbGw= 781 +cXVhc2g= 782 +IGVu 783 +IGZpcnN0 784 +IGhvdA== 785 +aXo= 786 +IGF3YXk= 787 +IHNvbWV0aGluZw== 788 +IHJlbQ== 789 +IHRvd24= 790 +IHNt 791 +IFRoaXM= 792 +IGJldHRlcg== 793 +IFRoZW4= 794 +d2Fz 795 +b2Y= 796 +YmFyZA== 797 +IEw= 798 +bGk= 799 +ZmU= 800 +IFRva3lv 801 +IGxvbmc= 802 +aWx5 803 +IHN1cmU= 804 +IGxvb2tlZA== 805 +dWJiYXJk 806 +Y3Rpb24= 807 +b3Jk 808 +IG1hbnk= 809 +aW91cw== 810 +IHRvbw== 811 +IGhlcmU= 812 +b3M= 813 +IHVuZGVy 814 +YXNl 815 +bmc= 816 +cGVk 817 +b2Q= 818 +bWU= 819 +IGp1c3Q= 820 +IG5vdw== 821 +aW5jZQ== 822 +IGhlYXJk 823 +IGtpbmQ= 824 +IFRoZXk= 825 +IGJlZm9yZQ== 826 +aHk= 827 +IElu 828 +IGVudA== 829 +IGJvYXJk 830 +ISI= 831 +d2FyZA== 832 +IGJlaW5n 833 +IHdlbGw= 834 +ZXJt 835 +cmllZA== 836 +IHdyb25n 837 +YWlk 838 +eHQ= 839 +IHJldHVybg== 840 +aXRlZA== 841 +IHllbg== 842 +IG1hdHRlcg== 843 +IGNhbGw= 844 +IHRhbA== 845 +IFlvdQ== 846 +Y2Vk 847 +aXNlZA== 848 +IGNoYQ== 849 +b25z 850 +IHNhbWU= 851 +IG9uY2U= 852 +ZGF5 853 +ZnQ= 854 +IHN3 855 +IGJlY2F1c2U= 856 +IHRoaW5r 857 +IHdoZXJl 858 +IE5v 859 +IEh1YmJhcmQ= 860 +IFNxdWFzaA== 861 +IGNvcA== 862 +d2l0aA== 863 +ZXJlZA== 864 +b2xsb3c= 865 +IHBsYWNl 866 +aWRk 867 +Y2Vzcw== 868 +IHNob3c= 869 +aXNoYQ== 870 +IHJh 871 +IGxldHRlcg== 872 +bmU= 873 +dmVz 874 +YXRpbmc= 875 +cmFuZw== 876 +IGFmZg== 877 +IGhhbmQ= 878 +IHNj 879 +IHBlcnM= 880 +aW50 881 +cHI= 882 +c2lkZQ== 883 +ZnRlcg== 884 +IHNheWluZw== 885 +IGxhdQ== 886 +dGhhdA== 887 +IHdpdGhvdXQ= 888 +cm9u 889 +YWly 890 +bGVjdA== 891 +IFdoYXQ= 892 +ZWx0 893 +IHdoaWxl 894 +b2dh 895 +YXBlcg== 896 +IHBl 897 +b3k= 898 +IHNhdA== 899 +aWVz 900 +IGFkZA== 901 +IGRheXM= 902 +IHNwZQ== 903 +IGhv 904 +IGFucw== 905 +IGhhcg== 906 +IFdoZW4= 907 +IGFueXRoaW5n 908 +cGVu 909 +XQo= 910 +dGFpbg== 911 +IG11c3Q= 912 +IG5ldw== 913 +bGlj 914 +IHZv 915 +aGlsZQ== 916 +Z2V0 917 +IEFz 918 +IHZlcnk= 919 +J3Jl 920 +IGV2ZXJ5 921 +YXZl 922 +PyI= 923 +YWRnZXI= 924 +IEtvZ2E= 925 +IE1y 926 +cm91Z2g= 927 +dWx0 928 +IGZvbGxvdw== 929 +dGluZw== 930 +aWZl 931 +aWRkbGU= 932 +ZnVs 933 +YW5r 934 +IFNv 935 +IHNlZW1lZA== 936 +IEFuZA== 937 +aXg= 938 +IHNldA== 939 +IGNhcmU= 940 +IHJlcw== 941 +IG5ldmVy 942 +IGZvdW5k 943 +IGxv 944 +Y2lk 945 +aW5lZA== 946 +IGNsYXNz 947 +IG15c2VsZg== 948 +YXc= 949 +IHdvbQ== 950 +YXRpb25z 951 +IGxlZnQ= 952 +IFdl 953 +IHRlYWNoZXJz 954 +Ilk= 955 +bmE= 956 +b250 957 +IGRlcw== 958 +IHRob3Nl 959 +aXJlZA== 960 +IHNlbg== 961 +eWluZw== 962 +IHRoZXNl 963 +YXo= 964 +IFRoZXJl 965 +Y2VwdA== 966 +IGRhbmc= 967 +IFU= 968 +Ikg= 969 +Ym9k 970 +Ym9keQ== 971 +IGhhdmluZw== 972 +YWxhcnk= 973 +IHdhdGNo 974 +IGdpdmU= 975 +YWdl 976 +IGl0cw== 977 +IGFwcGU= 978 +dWU= 979 +IGNvdW50 980 +IGhhcmQ= 981 +IGJlbA== 982 +b3R0 983 +IGRpc3Q= 984 +IlM= 985 +IE1hZA== 986 +LW4= 987 +cmlidXQ= 988 +Z2Vk 989 +IGF0dA== 990 +ZmVyZQ== 991 +aXRoZXI= 992 +IHVwb24= 993 +IHRlbQ== 994 +IHBlcnNvbg== 995 +bmluZw== 996 +IGNoZQ== 997 +YXJseQ== 998 +b25leQ== 999 +IHNvb24= 1000 +ZW1lbnQ= 1001 +ICg= 1002 +IHRyYW5z 1003 +IGV4cA== 1004 +IHNlcg== 1005 +IHJlZw== 1006 +YXNvbg== 1007 +IHNhdw== 1008 +IG5leHQ= 1009 +b290 
1010 +IGhhbGY= 1011 +IHRvb2s= 1012 +IGJhZA== 1013 +IGhvdXI= 1014 +IHNhbGFyeQ== 1015 +IGJlZ2Fu 1016 +cmlnaHQ= 1017 +b25uYQ== 1018 +LXNhbg== 1019 +IHdvcmtz 1020 +IEo= 1021 +Zm9ybQ== 1022 +aWNhbA== 1023 +IHRyYQ== 1024 +bWFu 1025 +IG5vdGhpbmc= 1026 +IHN0aWxs 1027 +ZWFycw== 1028 +IHN1cHA= 1029 +IHR1cm4= 1030 +IGZlbHQ= 1031 +IHdvbWFu 1032 +IHN0YXJ0ZWQ= 1033 +b3VibGU= 1034 +dXJh 1035 +aXNoaW5n 1036 +Ogo= 1037 +bGVjdHJvbg== 1038 +bGVjdHJvbmlj 1039 +b29r 1040 +IGNvcHk= 1041 +IGZ1bGw= 1042 +Y29uZA== 1043 +bWF0 1044 +IG1pZGRsZQ== 1045 +IGxvb2s= 1046 +IGNvbW0= 1047 +d2VyZWQ= 1048 +IGJlY2FtZQ== 1049 +IGZlbGxvd3M= 1050 +d291bGQ= 1051 +IGdvdA== 1052 +IGds 1053 +IGd1 1054 +IGtlZXA= 1055 +IGdl 1056 +IE1hZG9ubmE= 1057 +aXRlcg== 1058 +aXNoZWQ= 1059 +IHVuZGVyc3Q= 1060 +IHN0cmE= 1061 +c2lk 1062 +IGNvdW50cnk= 1063 +b3BsZQ== 1064 +IHByb3Y= 1065 +IHB1dA== 1066 +bm8= 1067 +J2xs 1068 +IHNsZQ== 1069 +cmFuZ2U= 1070 +IFNoZQ== 1071 +cG9z 1072 +IG1pbmQ= 1073 +IHBhc3M= 1074 +IHRocm91Z2g= 1075 +IHF1aXRl 1076 +IGluZA== 1077 +IGJvYXJkaW5n 1078 +dGVhY2hlcg== 1079 +cGxl 1080 +UG9yY3VwaW5l 1081 +IHBsZQ== 1082 +IGdlaXNoYQ== 1083 +ICAgIA== 1084 +b3N0 1085 +ZW5zZQ== 1086 +Tm8= 1087 +aWJsZQ== 1088 +IHJlYWQ= 1089 +IHJlZA== 1090 +ZW50aW9u 1091 +ZW5lZA== 1092 +ISIK 1093 +IHJlZg== 1094 +IGFk 1095 +IGZs 1096 +IHN0YXk= 1097 +dXA= 1098 +IHJvdW5k 1099 +IGNsZQ== 1100 +IG9wZW4= 1101 +IG9i 1102 +dGVuZA== 1103 +IGZpbmQ= 1104 +IHBlcg== 1105 +IGNhbGxlZA== 1106 +IHN1cg== 1107 +cmV3 1108 +IHBhcGVy 1109 +IEJhZGdlcg== 1110 +IG1lZXQ= 1111 +aXNz 1112 +IlRoYXQ= 1113 +ZXJtcw== 1114 +VEU= 1115 +aXR0ZW4= 1116 +YWJseQ== 1117 +bmVzcw== 1118 +IGNhbm5vdA== 1119 +IHNpbXA= 1120 +Y29u 1121 +IHJlYXNvbg== 1122 +eW91 1123 +IGhvbWU= 1124 +Ynk= 1125 +IGZpZ2h0 1126 +aXR0bGU= 1127 +IHRoaW5ncw== 1128 +IGVhcw== 1129 +IGltcA== 1130 +cmVzc2Vk 1131 +IG1lYW4= 1132 +IGFwcGVhcmVk 1133 +IG5hdA== 1134 +IGhlbA== 1135 +cmV0 1136 +YWtlbg== 1137 +IHN0cmFpZ2h0 1138 +IGFmZmFpcg== 1139 +aXRpbmc= 1140 +IGVk 1141 +IHNpbmNl 1142 +bG9n 1143 +IHBheQ== 1144 +IGZyb250 1145 +bXk= 1146 +IHZvaWNl 1147 +cmVhZHk= 1148 +IGZvb2w= 1149 +b3VuZGF0aW9u 1150 +IGVsZWN0cm9uaWM= 1151 +IHRlcm1z 1152 +IG1hcg== 1153 +YXBhbg== 1154 +YW55 1155 +IHJlc3A= 1156 +IGVuZA== 1157 +YXBw 1158 +d2hhdA== 1159 +c3Ry 1160 +cmFw 1161 +aWFs 1162 +aWN1bA== 1163 +IGFjYw== 1164 +b3Ro 1165 +IHNlY29uZA== 1166 +IGZsbw== 1167 +IHNpeA== 1168 +IGZlZXQ= 1169 +YnI= 1170 +aWV0 1171 +IGxpdHRsZQ== 1172 +bGVz 1173 +IG1vbmV5 1174 +IGRlY2w= 1175 +IGV5 1176 +IGNvbXA= 1177 +YXJpbmc= 1178 +IGFncmU= 1179 +d2hlcmU= 1180 +IFN0 1181 +IHN0cmU= 1182 +ZXg= 1183 +cmFjdA== 1184 +IGludA== 1185 +IGRpcmU= 1186 +IGJlY29tZQ== 1187 +IGhvbg== 1188 +IGNvbnNpZA== 1189 +ZXJ0YWlu 1190 +bm93 1191 +IHNs 1192 +aXRvcg== 1193 +Z2c= 1194 +IGp1bQ== 1195 +IGJ1 1196 +IHRoaW5n 1197 +IGFuc3dlcmVk 1198 +b2Vz 1199 +eWE= 1200 +IFRoYXQ= 1201 +aXpl 1202 +b25k 1203 +YWN0 1204 +IGVmZg== 1205 +IGJhbmc= 1206 +YWJvdXQ= 1207 +IGJlZA== 1208 +b3Jyb3c= 1209 +dW5n 1210 +IFRv 1211 +IGtlcHQ= 1212 +IHdhbA== 1213 +IGJhdGg= 1214 +IGRyYQ== 1215 +IkE= 1216 +cmluZ3M= 1217 +aG9wcA== 1218 +IHJlc2lnbg== 1219 +IGRpbg== 1220 +IGxhZHk= 1221 +LkU= 1222 +IHVzZQ== 1223 +bGlzaA== 1224 +b3Jz 1225 +IHdyaXR0ZW4= 1226 +ZW5l 1227 +aXY= 1228 +IGRpZg== 1229 +IHN0ZQ== 1230 +IHN0b3J5 1231 +Y29t 1232 +cmVz 1233 +ZW50bHk= 1234 +IGZhY3Q= 1235 +aGVz 1236 +d2F5cw== 1237 +IHdoeQ== 1238 +IHRob3VnaA== 1239 +IHN0cg== 1240 +b25kZXI= 1241 +aGVhZA== 1242 +IGNvdXI= 1243 +IG1vbg== 1244 +IHNr 1245 +IGJlbGll 1246 +IGxldA== 1247 +ZmVy 1248 +IHJlcXU= 1249 +IGxpbmU= 1250 +cm9vbQ== 1251 +LWRheQ== 1252 +IGRvbmU= 1253 +IGRvZXM= 1254 
+IE9uZQ== 1255 +IGRhbmdv 1256 +YXNzaG9wcA== 1257 +IGNvbnNpZGVy 1258 +IGRpbm5lcg== 1259 +IEZvdW5kYXRpb24= 1260 +Kio= 1261 +ZW1wdA== 1262 +ZXNl 1263 +IHdvcmQ= 1264 +cmVzdA== 1265 +IGVub3VnaA== 1266 +IGdyZWF0 1267 +IG5hbWU= 1268 +IHB1Yg== 1269 +IG1hbm5lcg== 1270 +d2Vy 1271 +aWN0 1272 +aW5lc3M= 1273 +IGhpbXNlbGY= 1274 +IHBlb3BsZQ== 1275 +ZXc= 1276 +IGNvcg== 1277 +ZXN0aW9u 1278 +IGJpZw== 1279 +ZWU= 1280 +IHJp 1281 +aWRlcw== 1282 +IGJyb3RoZXI= 1283 +IGhlYXJ0 1284 +ZWN0ZWQ= 1285 +ZWVk 1286 +IG90aGVycw== 1287 +c29s 1288 +dGVk 1289 +IGV5ZXM= 1290 +IHRyb3VibGU= 1291 +IHRlYWNo 1292 +IGJvYXQ= 1293 +IGZvdXI= 1294 +IGFscmVhZHk= 1295 +cm9t 1296 +Z2hlZA== 1297 +IHNxdQ== 1298 +IHBvbA== 1299 +Y2Vz 1300 +IEhvdHQ= 1301 +IGxlYXZl 1302 +IGRpc3RyaWJ1dA== 1303 +YXN0ZXI= 1304 +Q0g= 1305 +dWM= 1306 +IGlt 1307 +IGhvd2V2ZXI= 1308 +dGhlcmU= 1309 +YXBhbmVzZQ== 1310 +IGxhc3Q= 1311 +IGNy 1312 +aWxpdHk= 1313 +IHNpbXBsZQ== 1314 +IGxpZmU= 1315 +LWM= 1316 +IHJlZ2FyZA== 1317 +IGZpbg== 1318 +dWFs 1319 +IG1lYW5z 1320 +IHN0YW5k 1321 +YXRjaA== 1322 +IHNob3J0 1323 +bmVk 1324 +IHNlZW4= 1325 +IGhhcHA= 1326 +LWs= 1327 +IGFnYWluc3Q= 1328 +aGlt 1329 +YW1lZA== 1330 +IHN0b29k 1331 +IGdyYQ== 1332 +IG1vdGhlcg== 1333 +IGZpc2g= 1334 +IHdhdGVy 1335 +YWls 1336 +Y2Vp 1337 +IHJhdGhlcg== 1338 +IGlucw== 1339 +IGZlZWw= 1340 +IGFsc28= 1341 +IG9yZA== 1342 +IGNvbWluZw== 1343 +aWNz 1344 +IGVpdGhlcg== 1345 +bmNl 1346 +ICc= 1347 +IGtpZA== 1348 +IGxhdWdoZWQ= 1349 +bGlrZQ== 1350 +IEFy 1351 +Z3I= 1352 +IEhvdHRh 1353 +IHRhbGs= 1354 +Z2V0aGVy 1355 +IFNpcg== 1356 +IHB1bg== 1357 +UHJv 1358 +YXRz 1359 +bW9zdA== 1360 +IHJlcA== 1361 +IGdp 1362 +aXNm 1363 +YmFibHk= 1364 +YWtlcw== 1365 +IE5vdA== 1366 +bnk= 1367 +IGFwcGVhcg== 1368 +bXA= 1369 +Y2hh 1370 +IGFjdA== 1371 +YmVk 1372 +aWVm 1373 +dWZm 1374 +IGFwbw== 1375 +IG1ldA== 1376 +IHJldHVybmVk 1377 +IHNvdW5k 1378 +dXNpbmVzcw== 1379 +IGxhdWdo 1380 +IGNsZWFy 1381 +IG5lZWQ= 1382 +ZmVzcw== 1383 +ZXN0ZWQ= 1384 +IGludg== 1385 +IGFjY2VwdA== 1386 +dW5kZXI= 1387 +Owo= 1388 +IHN1cnBy 1389 +ZGU= 1390 +IHRyYWlu 1391 +IGhvdGVs 1392 +IHNsZWVw 1393 +IGRy 1394 +IGhvbGQ= 1395 +bG9jaw== 1396 +cHVyYQ== 1397 +IHNwcmluZ3M= 1398 +IC4uLi4uLg== 1399 +IGFncmVlbWVudA== 1400 +IERhcg== 1401 +IHJlc3Q= 1402 +Y2x1ZA== 1403 +YXRvcg== 1404 +YXY= 1405 +IG9yaWc= 1406 +IG9yaWdpbg== 1407 +IGVs 1408 +IG5vcg== 1409 +IHByZXM= 1410 +IHVuZGVyc3RhbmQ= 1411 +IHRha2Vu 1412 +IGxpZ2h0 1413 +ZW5lcg== 1414 +c29tZQ== 1415 +IGJyb3VnaHQ= 1416 +cmFwaA== 1417 +IG1vc3Q= 1418 +b2tl 1419 +LXc= 1420 +IHVudA== 1421 +IGZhdGhlcg== 1422 +IHVzZWQ= 1423 +IGVhdA== 1424 +IHllYXJz 1425 +IFdoaWxl 1426 +IGNoYW4= 1427 +IHN1ZGQ= 1428 +IHN1ZGRlbg== 1429 +IGFwb2xvZw== 1430 +IHNldHQ= 1431 +IHRoaW4= 1432 +IE15 1433 +IHRlbg== 1434 +aW1lcw== 1435 +Zm9y 1436 +b3Vk 1437 +V2hlbg== 1438 +IGRldA== 1439 +IGxpdmU= 1440 +IG9j 1441 +IGZpdmU= 1442 +IGNvbnQ= 1443 +IGhlbHA= 1444 +IHdh 1445 +IHBhc3NlZA== 1446 +IHJ1bg== 1447 +IG1ha2luZw== 1448 +IHN0cmFuZ2U= 1449 +IHRha2luZw== 1450 +IGVhY2g= 1451 +IllvdQ== 1452 +IGFub3RoZXI= 1453 +IlNheQ== 1454 +IlRoZQ== 1455 +YXRlcw== 1456 +IHBsZWFz 1457 +YXNzaG9wcGVycw== 1458 +IG1vbQ== 1459 +IG1vbWVudA== 1460 +ZW50bGU= 1461 +bmdsaXNo 1462 +Q0hB 1463 +IG9yaWdpbmFs 1464 +aW9ucw== 1465 +dXJpbmc= 1466 +IHB1YmxpYw== 1467 +dWN0 1468 +dWNr 1469 +IHF1ZXN0aW9u 1470 +YWk= 1471 +Y3k= 1472 +ZWs= 1473 +IGZsb29y 1474 +IGNhcg== 1475 +b3VzZQ== 1476 +IHNpZGU= 1477 +LXlh 1478 +IGNlcnRhaW4= 1479 +aHlz 1480 +LWQ= 1481 +aWdo 1482 +YWdpbg== 1483 +d2VldA== 1484 +IHBvb3I= 1485 +IGRlY2lk 1486 +dWFsbHk= 1487 +IGJ1c2luZXNz 1488 +cHJv 1489 +cGxhaW4= 1490 +IHN0b3A= 1491 +IQo= 1492 +IEhvdw== 1493 
+IldoYXQ= 1494 +Y2Fu 1495 +IFVu 1496 +cHM= 1497 +dW5k 1498 +LW5pZ2h0 1499 +IG1lZXRpbmc= 1500 +ZWRv 1501 +IHJhaXNl 1502 +R3V0ZW5iZXJn 1503 +IERhcmxpbmc= 1504 +dW1l 1505 +IEVuZ2xpc2g= 1506 +VEVS 1507 +YWRpbmc= 1508 +IHRyYW5zbA== 1509 +IGFibGU= 1510 +c3NpYmxl 1511 +IHNhdGlzZg== 1512 +IHdhbnRlZA== 1513 +IHN1Yg== 1514 +IGNhc2U= 1515 +aWZpYw== 1516 +aXRlcmFyeQ== 1517 +IG1haWQ= 1518 +IGluYw== 1519 +IHBvcw== 1520 +IHBvc2l0aW9u 1521 +IHBhdA== 1522 +dXJlZA== 1523 +b3JyeQ== 1524 +IGFjY291bnQ= 1525 +IGJvdGg= 1526 +IGZyaWU= 1527 +IGZyaWVuZA== 1528 +dGhpcw== 1529 +IGFsd2F5cw== 1530 +IHBhcnRpY3Vs 1531 +V2hhdA== 1532 +IHNtYWxs 1533 +ZW50eQ== 1534 +dXNoZWQ= 1535 +IG1pcw== 1536 +dWxseQ== 1537 +IHJlY2Vp 1538 +WW91 1539 +IHlldA== 1540 +IGdhdmU= 1541 +QnV0 1542 +aGFk 1543 +IGFuc3dlcg== 1544 +IGFicw== 1545 +aWxl 1546 +Y2tldA== 1547 +IG5vb2Q= 1548 +IGNvdXJzZQ== 1549 +IGZvcm0= 1550 +IGV2ZXJ5dGhpbmc= 1551 +ZWN0aW9u 1552 +SWY= 1553 +cGFydA== 1554 +IHNpbmc= 1555 +IHNpdA== 1556 +IHB1cg== 1557 +aXA= 1558 +IGZpc2hpbmc= 1559 +IGVo 1560 +IHBhcg== 1561 +IHRvZ2V0aGVy 1562 +SGU= 1563 +IHdoZQ== 1564 +IHdoZXRoZXI= 1565 +IGJyYQ== 1566 +Illlcw== 1567 +IHB1bmlzaA== 1568 +U2hpcnQ= 1569 +IFllZG8= 1570 +IGZhcmV3 1571 +IGZhcmV3ZWxs 1572 +IGRhbmNl 1573 +IGxlc3M= 1574 +dXJhbA== 1575 +IGRlZg== 1576 +IGF0dGVtcHQ= 1577 +d2Vlbg== 1578 +IHNpZ24= 1579 +IHN5 1580 +ZmVyZW50 1581 +IGxlYXN0 1582 +c2Vy 1583 +b2I= 1584 +bmRpbmc= 1585 +IHNvcnJ5 1586 +IGp1bXBlZA== 1587 +IGphbg== 1588 +IGphbml0b3I= 1589 +aXplZA== 1590 +IHRvd2FyZA== 1591 +IG1vcg== 1592 +YXZpbmc= 1593 +IGJpdA== 1594 +IlRoaXM= 1595 +IHJlbWFyaw== 1596 +IGZ1dA== 1597 +IHdvbmRlcg== 1598 +IGZ1bg== 1599 +VGhlbg== 1600 +IGRlYw== 1601 +IHdob20= 1602 +IGRpZG4= 1603 +IHJlYw== 1604 +YmVj 1605 +Iklm 1606 +IGtuZXc= 1607 +YWZ0ZXI= 1608 +IHRodXM= 1609 +IGlzbg== 1610 +IHNpZ2h0 1611 +bWVk 1612 +W0Y= 1613 +dXNz 1614 +Y2lkZW50 1615 +dGhlbQ== 1616 +IGZpZg== 1617 +IGRyYXc= 1618 +IGhlYXI= 1619 +IHdyaXRpbmc= 1620 +IGdldHRpbmc= 1621 +c2g= 1622 +ZmVyZW5jZQ== 1623 +IHJhaXNlZA== 1624 +dGhleQ== 1625 +YXg= 1626 +IGZpbmU= 1627 +c2Vs 1628 +IE5vYmU= 1629 +IE5vYmVvaw== 1630 +IE5vYmVva2E= 1631 +b3JtYWw= 1632 +IGVC 1633 +aWNlbnNl 1634 +MDA= 1635 +IGJlc3Q= 1636 +d29y 1637 +Zmlj 1638 +dGVyZXN0 1639 +IHJlbWFy 1640 +Ymw= 1641 +YXJ0ZWQ= 1642 +IGRhcms= 1643 +IHlvdW5n 1644 +dXNo 1645 +IGJldA== 1646 +b3V0aA== 1647 +aG91c2U= 1648 +YXVnaHQ= 1649 +IHBoeXM= 1650 +IHN0cm9uZw== 1651 +IGZ1cg== 1652 +IHJvbGw= 1653 +Y292ZQ== 1654 +Y2hpZWY= 1655 +YXdh 1656 +IGZvbGxvd2Vk 1657 +IGZvbmQ= 1658 +IGZ1dHVyZQ== 1659 +aXJk 1660 +ZnVsbHk= 1661 +IGVmZm9ydA== 1662 +QWZ0ZXI= 1663 +b3dhcmQ= 1664 +IHJlYWxseQ== 1665 +IGFtb25n 1666 +IGFyb3VuZA== 1667 +IGNvbXBs 1668 +IGdheg== 1669 +IGJvdw== 1670 +YXRlcg== 1671 +IGluc2lzdA== 1672 +IHR1cm5lZA== 1673 +aGVs 1674 +cmVt 1675 +IGhvdXJz 1676 +IGRlY2lkZWQ= 1677 +eXM= 1678 +IG1vbnRo 1679 +LWE= 1680 +IGFkdg== 1681 +IGJlbGlldmU= 1682 +IHRlYWNoaW5n 1683 +IGVhc3k= 1684 +IGRpcmVjdGlvbg== 1685 +b29rZWQ= 1686 +IHdhcg== 1687 +IHVubGVzcw== 1688 +aGF2ZQ== 1689 +IHNxdWFyZQ== 1690 +dmls 1691 +IHF1aWV0 1692 +IGh1bmc= 1693 +IGdvZXM= 1694 +IHBhaWQ= 1695 +IHNoYWxs 1696 +Ik5v 1697 +IHB1bmlzaG1lbnQ= 1698 +cG9zZQ== 1699 +IHN3ZWV0 1700 +J3Zl 1701 +IldlbGw= 1702 +IGdlbnRsZQ== 1703 +IG5vcm1hbA== 1704 +YWdyYXBo 1705 +Y2hpdmU= 1706 +Y2hhbg== 1707 +IGluY2x1ZA== 1708 +d3c= 1709 +b3Jn 1710 +dGVt 1711 +QVI= 1712 +IFRI 1713 +IGVxdQ== 1714 +IHRvbmU= 1715 +IHBvc3NpYmxl 1716 +IGJlY29t 1717 +IEphcGFuZXNl 1718 +dmVycw== 1719 +IGZvbGxvd2luZw== 1720 +IHBhaW4= 1721 +IHdob2xl 1722 +d3I= 1723 +IHNlcmlvdXM= 1724 +IG5hcg== 1725 +IHRpcmVk 1726 +SW4= 1727 
+IHBsYXk= 1728 +IHByb20= 1729 +IGdhbWU= 1730 +IFNvbWU= 1731 +IGhhcHBlbmVk 1732 +IGN1dA== 1733 +IHR3ZW50eQ== 1734 +IGRvb3I= 1735 +IG1vcm5pbmc= 1736 +aGluZA== 1737 +IGJyZQ== 1738 +IGluc2lkZQ== 1739 +b3Zl 1740 +YWx0aA== 1741 +dWs= 1742 +YXJnZQ== 1743 +YW1i 1744 +IGRhbQ== 1745 +IHdvcnJ5 1746 +YXRpdmU= 1747 +IGV4cGVjdGVk 1748 +IGZhbQ== 1749 +IHByYQ== 1750 +IHBvY2tldA== 1751 +b29rcw== 1752 +Y2hlZA== 1753 +IHNpbA== 1754 +b2w= 1755 +IGZhdg== 1756 +IGVsc2U= 1757 +IGhpZ2g= 1758 +IHJlYWw= 1759 +IGFsb25n 1760 +IG1lZA== 1761 +aGlr 1762 +aGVtYXQ= 1763 +aGVtYXRpY3M= 1764 +IGxpc3Q= 1765 +IHNpY2s= 1766 +b2ludA== 1767 +W0Zvb3Q= 1768 +W0Zvb3Rub3Q= 1769 +W0Zvb3Rub3Rl 1770 +Ll0K 1771 +bmlnaHQ= 1772 +c2Vz 1773 +aW9y 1774 +IHNheXM= 1775 +IG1vdXRo 1776 +aG93 1777 +bWluZw== 1778 +IGNsbw== 1779 +IGN1cg== 1780 +Z2luZw== 1781 +IHN1ZGRlbmx5 1782 +LWFo 1783 +YW1w 1784 +IGJsYWNr 1785 +cm9zcw== 1786 +IGZhYw== 1787 +c2VsdmVz 1788 +aWV3 1789 +aXNzaW9u 1790 +IGNvcHlyaWdodA== 1791 +IHBhcmFncmFwaA== 1792 +IEFyY2hpdmU= 1793 +IGRvbmF0aW9ucw== 1794 +UHJvamVjdA== 1795 +IGNvc3Q= 1796 +Lm9yZw== 1797 +TEk= 1798 +dWNlZA== 1799 +IHN1Yw== 1800 +eWxl 1801 +IGZvcmNl 1802 +am95 1803 +b3VjaA== 1804 +dHI= 1805 +SXQ= 1806 +IHRyYWQ= 1807 +IHByZXNlbnQ= 1808 +IGV4dA== 1809 +YXNlZA== 1810 +cmVkaXQ= 1811 +IGZhdWx0 1812 +aWI= 1813 +LW0= 1814 +dXJk 1815 +IHRyaWVk 1816 +dGltZQ== 1817 +IHByZXQ= 1818 +IHNwZWU= 1819 +b3dlcg== 1820 +IHdvcmRz 1821 +Q0hBUA== 1822 +Q0hBUFRFUg== 1823 +c2Nob29s 1824 +IGFzaw== 1825 +IGRvaW5n 1826 +YXRlbHk= 1827 +IHVudGls 1828 +Ym91dA== 1829 +IHRyZWU= 1830 +Y2FsbA== 1831 +YW1hc2g= 1832 +YW1hc2hpcg== 1833 +YW1hc2hpcm8= 1834 +c3Rl 1835 +IGJlaGluZA== 1836 +b2xk 1837 +IHdhbGw= 1838 +aXRvcnk= 1839 +IHJvbGxlZA== 1840 +IG1vdmU= 1841 +IGFwb2xvZ2l6ZQ== 1842 +IGxhcmdl 1843 +YW1ib28= 1844 +c3U= 1845 +IHNldHRsZWQ= 1846 +Ikhl 1847 +d28= 1848 +IHRoaW5raW5n 1849 +dXNlZA== 1850 +aWZpZWQ= 1851 +IGFsbW9zdA== 1852 +IHRyZQ== 1853 +IHRyZWF0 1854 +IG5vb2RsZQ== 1855 +IG5vdGU= 1856 +IEFsbA== 1857 +IGJlYXQ= 1858 +IG9iamVjdA== 1859 +IHNlZW1z 1860 +IGlkZQ== 1861 +WWVz 1862 +b3dz 1863 +IHJlbWFpbg== 1864 +IGJlZ2lu 1865 +dWdodA== 1866 +bWVudHM= 1867 +IGFsb25l 1868 +c3BlY3Q= 1869 +IG1hdGhlbWF0aWNz 1870 +IHJvdWdo 1871 +IG91dHNpZGU= 1872 +IGNvbWVz 1873 +YmFjaw== 1874 +IHdpbmQ= 1875 +c2Vk 1876 +IHdvdWxkbg== 1877 +ZWVy 1878 +aW51dA== 1879 +ZnJvbQ== 1880 +IHJlcGw= 1881 +IG5hcnJvdw== 1882 +IGluY2lkZW50 1883 +IGFpcg== 1884 +IHNlYQ== 1885 +dHM= 1886 +IHN1cnByaXNlZA== 1887 +IHRlYQ== 1888 +UmVk 1889 +IHRhbGtpbmc= 1890 +IGJvc3M= 1891 +cXVl 1892 +IHBpY3Q= 1893 +aXJ0eQ== 1894 +IGNl 1895 +IGxpbQ== 1896 +IFdoeQ== 1897 +IHBvaW50 1898 +IGxhdw== 1899 +Y2lhdGVk 1900 +IG1vb24= 1901 +aXJjdQ== 1902 +Z290 1903 +IElz 1904 +IGhhbmRz 1905 +IGhvbm9y 1906 +YXV0 1907 +cmdl 1908 +IHN0YXRl 1909 +IExpdGVyYXJ5 1910 +LkY= 1911 +VGhpcw== 1912 +bGluZQ== 1913 +Lmc= 1914 +Lmd1dGVuYmVyZw== 1915 +IE9G 1916 +RU4= 1917 +cmFjdGVy 1918 +IGJlbmU= 1919 +IEV2ZW4= 1920 +b3Vi 1921 +IG1ha2Vz 1922 +IGludGVyZXN0 1923 +b3Bl 1924 +bXM= 1925 +IHJlc3BvbnM= 1926 +IGZvcmU= 1927 +IHNvbWV3aGF0 1928 +IGhvbmVzdA== 1929 +b2Nr 1930 +aXJpdA== 1931 +IGhlbGQ= 1932 +IGFkZGVk 1933 +ZnU= 1934 +YWRlZA== 1935 +YWxz 1936 +YXR0 1937 +dGVybg== 1938 +IHBlcnNvbmFs 1939 +IGFzcw== 1940 +IFdpdGg= 1941 +dGlj 1942 +VG9reW8= 1943 +IHNob3V0 1944 +IHByZXR0eQ== 1945 +dW1i 1946 +IGVhcmx5 1947 +b3BwZWQ= 1948 +IGZ1cnRoZXI= 1949 +IGZyZQ== 1950 +ZXNpZGVz 1951 +IGJhbWJvbw== 1952 +IGly 1953 +bW9yZQ== 1954 +IGxpdmluZw== 1955 +IHJlY2VpdmVk 1956 +IGxpdmVk 1957 +IG1lYW50 1958 +IGNvd2FyZA== 1959 +cG9zaXRpb24= 1960 +IGxvYw== 1961 +aWxlZA== 1962 
+IHRlbmRlcg== 1963 +IGNo 1964 +IEFmdGVy 1965 +Y2Vy 1966 +IGZhdm9y 1967 +d2hv 1968 +IGxpa2Vk 1969 +cmFuY2U= 1970 +IHByaQ== 1971 +a2lzaGE= 1972 +IHN0dWR5 1973 +IG9yZGVy 1974 +IGFmdGVyd2FyZA== 1975 +IGdyZWF0bHk= 1976 +IHVuYWJsZQ== 1977 +Z28= 1978 +IHdhaXQ= 1979 +ZXBpbmc= 1980 +aWRpbmc= 1981 +IGZvcnR5 1982 +IHNreQ== 1983 +IG9mZmljZQ== 1984 +d2lsbA== 1985 +IkQ= 1986 +d2Vs 1987 +IHN0YXRpb24= 1988 +Ym8= 1989 +aG90 1990 +c3VjaA== 1991 +IGxvdWQ= 1992 +IGF3 1993 +bGFuZA== 1994 +Pwo= 1995 +IHJlc3BlY3Q= 1996 +YW5jZXM= 1997 +aWVudA== 1998 +IG91Z2h0 1999 diff --git a/tests/test_utils.py b/tests/test_utils.py index c575134e8f..8c2536a15d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -18,7 +18,7 @@ import torch from torch import nn -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer skip_if_cuda_not_available = unittest.skipIf( not torch.cuda.is_available(), "CUDA is not available" @@ -39,8 +39,8 @@ def torch_version_ge(version: str) -> bool: return version in torch.__version__ or torch.__version__ >= version -# Inherit from tokenizer class to reuse its tokenize_messages method -class DummyTokenizer(Tokenizer): +# Inherit from SentencePieceTokenizer class to reuse its tokenize_messages method +class DummyTokenizer(SentencePieceTokenizer): def __init__(self): self.encodes_whitespace = False diff --git a/tests/torchtune/datasets/test_alpaca_dataset.py b/tests/torchtune/datasets/test_alpaca_dataset.py index 9b9cb56b07..2a05cefd06 100644 --- a/tests/torchtune/datasets/test_alpaca_dataset.py +++ b/tests/torchtune/datasets/test_alpaca_dataset.py @@ -10,9 +10,8 @@ from tests.test_utils import get_assets_path from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX - from torchtune.datasets import alpaca_cleaned_dataset, alpaca_dataset -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer class TestAlpacaDataset: @@ -20,7 +19,7 @@ class TestAlpacaDataset: def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(get_assets_path() / "m.model")) + return SentencePieceTokenizer(str(get_assets_path() / "m.model")) @patch("torchtune.datasets._instruct.load_dataset") def test_label_no_masking(self, load_dataset, tokenizer): diff --git a/tests/torchtune/datasets/test_grammar_dataset.py b/tests/torchtune/datasets/test_grammar_dataset.py index 5fb41d39eb..20c209f004 100644 --- a/tests/torchtune/datasets/test_grammar_dataset.py +++ b/tests/torchtune/datasets/test_grammar_dataset.py @@ -12,7 +12,7 @@ from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX from torchtune.datasets import grammar_dataset -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer class TestGrammarDataset: @@ -20,7 +20,7 @@ class TestGrammarDataset: def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(get_assets_path() / "m.model")) + return SentencePieceTokenizer(str(get_assets_path() / "m.model")) @patch("torchtune.datasets._instruct.load_dataset") def test_label_no_masking(self, load_dataset, tokenizer): diff --git a/tests/torchtune/datasets/test_samsum_dataset.py b/tests/torchtune/datasets/test_samsum_dataset.py index 972b8bbb25..6ec6a52679 100644 
--- a/tests/torchtune/datasets/test_samsum_dataset.py +++ b/tests/torchtune/datasets/test_samsum_dataset.py @@ -12,7 +12,7 @@ from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX from torchtune.datasets import samsum_dataset -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer class TestSamsumDataset: @@ -20,7 +20,7 @@ class TestSamsumDataset: def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(get_assets_path() / "m.model")) + return SentencePieceTokenizer(str(get_assets_path() / "m.model")) @patch("torchtune.datasets._instruct.load_dataset") def test_label_no_masking(self, load_dataset, tokenizer): diff --git a/tests/torchtune/datasets/test_slimorca_dataset.py b/tests/torchtune/datasets/test_slimorca_dataset.py index 725b60d49d..03a8396271 100644 --- a/tests/torchtune/datasets/test_slimorca_dataset.py +++ b/tests/torchtune/datasets/test_slimorca_dataset.py @@ -10,7 +10,7 @@ from tests.test_utils import get_assets_path from torchtune.datasets import slimorca_dataset -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer class TestSlimOrcaDataset: @@ -18,7 +18,7 @@ class TestSlimOrcaDataset: def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(get_assets_path() / "m.model")) + return SentencePieceTokenizer(str(get_assets_path() / "m.model")) @patch("torchtune.datasets._chat.load_dataset") def test_value_error(self, load_dataset, tokenizer): diff --git a/tests/torchtune/models/test_llama3.py b/tests/torchtune/models/test_llama3.py new file mode 100644 index 0000000000..190eaf413e --- /dev/null +++ b/tests/torchtune/models/test_llama3.py @@ -0,0 +1,46 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
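# Editor's sketch (not part of this diff): the dataset-test fixtures above rebuild their
# tokenizer from tests/assets/m.model, a tiny SentencePiece model. Assuming the
# sentencepiece package and some small plain-text corpus (the input path below is
# hypothetical), a comparable asset could be regenerated roughly like this:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    "--input=/path/to/small_corpus.txt --model_prefix=m --vocab_size=2000"
)
sp = spm.SentencePieceProcessor()
sp.load("m.model")
print(sp.encode("Hello world!"))  # sanity check: a short list of token ids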
+ +import pytest +import torch +from tests.test_utils import fixed_init_model +from torchtune.models.llama3 import llama3 +from torchtune.utils.seed import set_seed + +EMBED_DIM = 128 +NUM_LAYERS = 4 +NUM_HEADS = 16 +NUM_KV_HEADS = 8 +VOCAB_SIZE = 32000 +MAX_SEQ_LEN = 2048 +BSZ = 2 +SEQ_LEN = 100 + + +@pytest.fixture(autouse=True) +def random(): + set_seed(16) + + +class TestLlama3: + @pytest.fixture + def inputs(self): + return torch.randint(0, VOCAB_SIZE, (BSZ, SEQ_LEN)) + + def test_forward(self, inputs): + model = llama3( + vocab_size=VOCAB_SIZE, + num_layers=NUM_LAYERS, + num_heads=NUM_HEADS, + num_kv_heads=NUM_KV_HEADS, + embed_dim=EMBED_DIM, + max_seq_len=MAX_SEQ_LEN, + ) + fixed_init_model(model, min_val=-0.25, max_val=0.5) + actual = model(inputs) + expected = torch.tensor(3.9763) + assert actual.shape == (BSZ, SEQ_LEN, VOCAB_SIZE) + torch.testing.assert_close(actual.mean(), expected, atol=1e-4, rtol=1e-4) diff --git a/tests/torchtune/modules/test_tokenizer.py b/tests/torchtune/modules/tokenizers/test_sentencepiece.py similarity index 96% rename from tests/torchtune/modules/test_tokenizer.py rename to tests/torchtune/modules/tokenizers/test_sentencepiece.py index 5ac4255e01..bc8f61c2a1 100644 --- a/tests/torchtune/modules/test_tokenizer.py +++ b/tests/torchtune/modules/tokenizers/test_sentencepiece.py @@ -8,17 +8,17 @@ import pytest from torchtune.data._types import Message -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer -ASSETS = Path(__file__).parent.parent.parent / "assets" +ASSETS = Path(__file__).parent.parent.parent.parent / "assets" -class TestTokenizer: +class TestSentencePieceTokenizer: @pytest.fixture def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(ASSETS / "m.model")) + return SentencePieceTokenizer(str(ASSETS / "m.model")) def test_encode(self, tokenizer): assert tokenizer.encode("Hello world!") == [ diff --git a/tests/torchtune/modules/tokenizers/test_tiktoken.py b/tests/torchtune/modules/tokenizers/test_tiktoken.py new file mode 100644 index 0000000000..8796213e76 --- /dev/null +++ b/tests/torchtune/modules/tokenizers/test_tiktoken.py @@ -0,0 +1,192 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from pathlib import Path + +import pytest +from torchtune.data._types import Message +from torchtune.modules.tokenizers import TikTokenTokenizer + +ASSETS = Path(__file__).parent.parent.parent.parent / "assets" + + +class TestTikTokenTokenizer: + @pytest.fixture + def tokenizer(self): + # Pretrained tiktoken model generated via the script in + # https://gist.github.com/ebsmothers/54b133dd87db6679b14318545aaa2de4 + return TikTokenTokenizer(str(ASSETS / "tiktoken_small.model")) + + @pytest.fixture + def texts(self): + return [ + "I can see the sun. 
But even if I cannot see the sun, I know that it exists.", + "And to know that the sun is there - that is living.", + ] + + @pytest.fixture + def messages(self, texts): + return [ + Message(role="user", content=texts[0], masked=True), + Message(role="assistant", content=texts[1], masked=False), + ] + + @pytest.fixture + def token_ids(self): + return [ + 73, + 503, + 654, + 262, + 376, + 110, + 46, + 690, + 720, + 428, + 270, + 1119, + 654, + 262, + 376, + 110, + 44, + 270, + 686, + 334, + 312, + 522, + 511, + 115, + 46, + ] + + @pytest.fixture + def tokenized_messages(self, token_ids): + return ( + [2000, 2006, 477, 273, 2007, 10, 10] + + token_ids + + [ + 2009, + 2006, + 520, + 511, + 446, + 2007, + 10, + 10, + 65, + 269, + 277, + 686, + 334, + 262, + 376, + 110, + 351, + 443, + 32, + 45, + 334, + 351, + 1955, + 46, + 2009, + 2001, + ], + [ + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + True, + ], + ) + + def test_encode(self, tokenizer, texts, token_ids): + assert tokenizer.encode(texts[0], add_bos=True, add_eos=True) == [ + tokenizer.bos_id + ] + token_ids + [tokenizer.eos_id] + assert tokenizer.encode(texts[0], add_bos=False, add_eos=False) == token_ids + + def test_decode(self, tokenizer, texts, token_ids): + assert tokenizer.decode(token_ids) == texts[0] + + def test_encode_and_decode(self, tokenizer, texts): + token_ids = tokenizer.encode(texts[0], add_bos=True, add_eos=True) + decoded_text = tokenizer.decode(token_ids) + assert texts[0] == decoded_text + + def test_token_ids(self, tokenizer): + assert tokenizer.bos_id == 2000 + assert tokenizer.eos_id == 2001 + assert tokenizer.pad_id == -1 + assert tokenizer.step_id == 2005 + assert tokenizer.start_header_id == 2006 + assert tokenizer.end_header_id == 2007 + assert tokenizer.eom_id == 2008 + assert tokenizer.eot_id == 2009 + assert tokenizer.python_tag == 2255 + + def test_tokenizer_vocab_size(self, tokenizer): + assert tokenizer.base_vocab_size == 2000 + assert tokenizer.vocab_size == 2256 + + def test_tokenize_messages(self, tokenizer, messages, tokenized_messages): + assert tokenizer.tokenize_messages(messages) == tokenized_messages diff --git a/tests/torchtune/modules/tokenizers/test_utils.py b/tests/torchtune/modules/tokenizers/test_utils.py new file mode 100644 index 0000000000..023f86f3fd --- /dev/null +++ b/tests/torchtune/modules/tokenizers/test_utils.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
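# Editor's sketch (not part of this diff): how the (tokens, mask) pair returned by
# TikTokenTokenizer.tokenize_messages (exercised in the test above) is typically turned
# into supervised labels, with masked positions ignored by the loss. The asset path and
# message contents mirror the test fixtures.
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
from torchtune.data._types import Message
from torchtune.modules.tokenizers import TikTokenTokenizer

tokenizer = TikTokenTokenizer("tests/assets/tiktoken_small.model")
messages = [
    Message(role="user", content="I can see the sun.", masked=True),
    Message(role="assistant", content="And to know that the sun is there - that is living.", masked=False),
]
tokens, mask = tokenizer.tokenize_messages(messages)
# Prompt (masked) positions do not contribute to the loss; assistant tokens do.
labels = [CROSS_ENTROPY_IGNORE_IDX if masked else tok for tok, masked in zip(tokens, mask)]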
+ +import pytest +from torchtune.modules.tokenizers._utils import _split_long_repetitions + + +class TestUtils: + def test_split_long_repetitions(self): + normal_str = "Here is a normal string" + ten_spaces = "".join(10 * [" "]) + space_str = ten_spaces.join( + ["Here", "is", "a", "string", "with", "long", "spaces"] + ) + no_space_str = "".join(10 * ["ab"]) + + actual_split = _split_long_repetitions(normal_str, 5) + expected_split = ["Here is a norma", "l strin", "g"] + for actual_substr, expected_substr in zip(actual_split, expected_split): + assert actual_substr == expected_substr + with pytest.raises(StopIteration): + next(actual_split) + + actual_split = _split_long_repetitions(space_str, 9) + expected_split = [ + "Here" + ten_spaces[:-1], + " is" + ten_spaces[:-1], + " a" + ten_spaces[:-1], + " string" + ten_spaces[:-1], + " with" + ten_spaces[:-1], + " long" + ten_spaces[:-1], + " spaces", + ] + for actual_substr, expected_substr in zip(actual_split, expected_split): + assert actual_substr == expected_substr + with pytest.raises(StopIteration): + next(actual_split) + + actual_split = _split_long_repetitions(no_space_str, 4) + expected_split = ["abab"] * 5 + for actual_substr, expected_substr in zip(actual_split, expected_split): + assert actual_substr == expected_substr + with pytest.raises(StopIteration): + next(actual_split) diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index ddab8f45f1..d4b14a7486 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -31,6 +31,10 @@ class Recipe: name="llama2/7B_full_low_memory", file_path="llama2/7B_full_low_memory.yaml", ), + Config( + name="llama3/8B_full_single_device", + file_path="llama3/8B_full_single_device.yaml", + ), Config( name="mistral/7B_full_low_memory", file_path="mistral/7B_full_low_memory.yaml", @@ -44,6 +48,7 @@ class Recipe: configs=[ Config(name="llama2/7B_full", file_path="llama2/7B_full.yaml"), Config(name="llama2/13B_full", file_path="llama2/13B_full.yaml"), + Config(name="llama3/8B_full", file_path="llama3/8B_full.yaml"), Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"), Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"), ], @@ -61,6 +66,14 @@ class Recipe: name="llama2/7B_qlora_single_device", file_path="llama2/7B_qlora_single_device.yaml", ), + Config( + name="llama3/8B_lora_single_device", + file_path="llama3/8B_lora_single_device.yaml", + ), + Config( + name="llama3/8B_qlora_single_device", + file_path="llama3/8B_qlora_single_device.yaml", + ), Config( name="llama2/13B_qlora_single_device", file_path="llama2/13B_qlora_single_device.yaml", @@ -94,6 +107,7 @@ class Recipe: Config(name="llama2/7B_lora", file_path="llama2/7B_lora.yaml"), Config(name="llama2/13B_lora", file_path="llama2/13B_lora.yaml"), Config(name="llama2/70B_lora", file_path="llama2/70B_lora.yaml"), + Config(name="llama3/8B_lora", file_path="llama3/8B_lora.yaml"), Config(name="mistral/7B_lora", file_path="mistral/7B_lora.yaml"), ], supports_distributed=True, diff --git a/torchtune/data/_converters.py b/torchtune/data/_converters.py index 2c6025e8c8..5208220738 100644 --- a/torchtune/data/_converters.py +++ b/torchtune/data/_converters.py @@ -42,10 +42,11 @@ def sharegpt_to_llama2_messages( Returns: List[Message]: a list of messages with "role" and "content" fields. See `torchtune.datasets._types.Message` - and `torchtune.datasets._types.Dialogue` for more details. + for more details. 
""" role_map = {"system": "system", "human": "user", "gpt": "assistant"} conversations = sample["conversations"] + messages = [] for message in conversations: role = role_map[message["from"]] diff --git a/torchtune/data/_types.py b/torchtune/data/_types.py index 4aba199e3c..087cafa008 100644 --- a/torchtune/data/_types.py +++ b/torchtune/data/_types.py @@ -12,6 +12,23 @@ @dataclass class Message: + """ + This dataclass represents individual messages in an instruction or chat dataset. + + Note that the fields ipython and eot are only relevant when tokenizing with tiktoken, + as they inform handling of special tokens in that case. + + Attributes: + role (Role): role of the message writer. Can be "system", "user", "assistant". + content (str): content of the message. + masked (bool): whether the message is masked in the sample. Default: False + ipython (bool): whether the message is an ipython call. Default: False + eot (bool): whether the message corresponds to the end of a turn. Should be true + except in the case of multiple consecutive assistant messages. Default: True + """ + role: Role content: str masked: bool = False + ipython: bool = False + eot: bool = True diff --git a/torchtune/datasets/_alpaca.py b/torchtune/datasets/_alpaca.py index c339086468..52cbce6d52 100644 --- a/torchtune/datasets/_alpaca.py +++ b/torchtune/datasets/_alpaca.py @@ -8,7 +8,7 @@ from torchtune.data import AlpacaInstructTemplate from torchtune.datasets._instruct import InstructDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def alpaca_dataset( diff --git a/torchtune/datasets/_chat.py b/torchtune/datasets/_chat.py index 69c7b2f2e5..3f026aad73 100644 --- a/torchtune/datasets/_chat.py +++ b/torchtune/datasets/_chat.py @@ -18,7 +18,7 @@ sharegpt_to_llama2_messages, validate_messages, ) -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer class ChatDataset(Dataset): diff --git a/torchtune/datasets/_grammar.py b/torchtune/datasets/_grammar.py index e87c261faf..c7b4e05121 100644 --- a/torchtune/datasets/_grammar.py +++ b/torchtune/datasets/_grammar.py @@ -6,7 +6,7 @@ from torchtune.data import GrammarErrorCorrectionTemplate from torchtune.datasets._instruct import InstructDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def grammar_dataset( diff --git a/torchtune/datasets/_instruct.py b/torchtune/datasets/_instruct.py index 53f573c186..46e6ea3bba 100644 --- a/torchtune/datasets/_instruct.py +++ b/torchtune/datasets/_instruct.py @@ -10,15 +10,13 @@ from datasets import load_dataset from torch.utils.data import Dataset from torchtune.config._utils import _get_instruct_template - from torchtune.data import ( CROSS_ENTROPY_IGNORE_IDX, InstructTemplate, Message, validate_messages, ) - -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer class InstructDataset(Dataset): diff --git a/torchtune/datasets/_preference.py b/torchtune/datasets/_preference.py index 199ddfbae8..18871fefaf 100644 --- a/torchtune/datasets/_preference.py +++ b/torchtune/datasets/_preference.py @@ -12,7 +12,7 @@ from torchtune.data import CROSS_ENTROPY_IGNORE_IDX, InstructTemplate, Message -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer class PreferenceDataset(Dataset): diff --git a/torchtune/datasets/_samsum.py b/torchtune/datasets/_samsum.py index ba5561b64a..4fe750178e 100644 --- a/torchtune/datasets/_samsum.py +++ 
b/torchtune/datasets/_samsum.py @@ -6,7 +6,7 @@ from torchtune.data import SummarizeTemplate from torchtune.datasets import InstructDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def samsum_dataset( diff --git a/torchtune/datasets/_slimorca.py b/torchtune/datasets/_slimorca.py index 188aa692ba..dd70456f9f 100644 --- a/torchtune/datasets/_slimorca.py +++ b/torchtune/datasets/_slimorca.py @@ -8,7 +8,7 @@ from torchtune.datasets._chat import ChatDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def slimorca_dataset( diff --git a/torchtune/datasets/_stack_exchanged_paired.py b/torchtune/datasets/_stack_exchanged_paired.py index 5781cb4e55..f37b5d13cb 100644 --- a/torchtune/datasets/_stack_exchanged_paired.py +++ b/torchtune/datasets/_stack_exchanged_paired.py @@ -6,7 +6,7 @@ from torchtune.data import StackExchangedPairedTemplate from torchtune.datasets._preference import PreferenceDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def stack_exchanged_paired_dataset( diff --git a/torchtune/models/gemma/_model_builders.py b/torchtune/models/gemma/_model_builders.py index ea7ad953ed..f598510ac2 100644 --- a/torchtune/models/gemma/_model_builders.py +++ b/torchtune/models/gemma/_model_builders.py @@ -5,7 +5,8 @@ # LICENSE file in the root directory of this source tree. from torchtune.models.gemma._component_builders import gemma -from torchtune.modules import Tokenizer, TransformerDecoder +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import SentencePieceTokenizer """ Model builders build specific instantiations using component builders. For example @@ -35,7 +36,7 @@ def gemma_2b() -> TransformerDecoder: ) -def gemma_tokenizer(path: str) -> Tokenizer: - tokenizer = Tokenizer.from_file(path) +def gemma_tokenizer(path: str) -> SentencePieceTokenizer: + tokenizer = SentencePieceTokenizer(path) tokenizer.pad_id = 0 return tokenizer diff --git a/torchtune/models/llama2/_model_builders.py b/torchtune/models/llama2/_model_builders.py index 839b10580f..15db69e146 100644 --- a/torchtune/models/llama2/_model_builders.py +++ b/torchtune/models/llama2/_model_builders.py @@ -8,7 +8,8 @@ from torchtune.models.llama2._component_builders import llama2, lora_llama2 -from torchtune.modules import Tokenizer, TransformerDecoder +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import SentencePieceTokenizer from torchtune.modules.peft import LORA_ATTN_MODULES @@ -39,8 +40,8 @@ def llama2_7b() -> TransformerDecoder: ) -def llama2_tokenizer(path: str) -> Tokenizer: - tokenizer = Tokenizer.from_file(path) +def llama2_tokenizer(path: str) -> SentencePieceTokenizer: + tokenizer = SentencePieceTokenizer(path) # Original tokenizer has no pad_id, which causes indexing errors when batch training tokenizer.pad_id = 0 return tokenizer diff --git a/torchtune/models/llama3/__init__.py b/torchtune/models/llama3/__init__.py new file mode 100644 index 0000000000..99309b9300 --- /dev/null +++ b/torchtune/models/llama3/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
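# Editor's sketch (not part of this diff): minimal usage of the public builders exported
# by this new llama3 package. These construct randomly initialized modules; checkpoint
# loading is handled elsewhere in torchtune.
from torchtune.models.llama3 import llama3_8b, lora_llama3_8b, qlora_llama3_8b

base_model = llama3_8b()  # full 8B TransformerDecoder
lora_model = lora_llama3_8b(
    lora_attn_modules=["q_proj", "v_proj"],  # apply LoRA only to the Q and V projections
    lora_rank=8,
    lora_alpha=16,
)
qlora_model = qlora_llama3_8b(  # same as lora_llama3_8b with quantize_base=True
    lora_attn_modules=["q_proj", "v_proj"],
)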
+ +from ._component_builders import llama3, lora_llama3 + +from ._model_builders import ( # noqa + llama3_8b, + llama3_tokenizer, + lora_llama3_8b, + qlora_llama3_8b, +) +from ._model_utils import scale_hidden_dim_for_mlp + +__all__ = [ + "llama3", + "llama3_8b", + "llama3_tokenizer", + "lora_llama3", + "lora_llama3_8b", + "qlora_llama3_8b", + "scale_hidden_dim_for_mlp", +] diff --git a/torchtune/models/llama3/_component_builders.py b/torchtune/models/llama3/_component_builders.py new file mode 100644 index 0000000000..0828285645 --- /dev/null +++ b/torchtune/models/llama3/_component_builders.py @@ -0,0 +1,410 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from functools import partial +from typing import List, Literal, Optional + +from torch import nn + +from torchtune.models.llama3._model_utils import scale_hidden_dim_for_mlp + +from torchtune.modules import ( + CausalSelfAttention, + FeedForward, + KVCache, + RMSNorm, + RotaryPositionalEmbeddings, + TransformerDecoder, + TransformerDecoderLayer, +) + +from torchtune.modules.common_utils import reparametrize_as_dtype_state_dict_post_hook + +from torchtune.modules.peft import LORA_ATTN_MODULES, LoRALinear + +""" +Component builders for the Llama3 model and popular variants such as LoRA. + +TorchTune provides composable building blocks. Builder functions help +stitch these building blocks into higher-level components. This design has +two benefits: +- The building blocks themselves are very flexible. For example, ``CausalSelfAttention`` +can take either nn.Linear or nn.LoRALinear for ``q_proj``. +- Builder functions expose a set of configurable params which keep the constructors of +the building blocks simple. +""" + + +# ------------------ Vanilla Llama3 ------------------ + +def llama3( + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + max_seq_len: int, + attn_dropout: float = 0.0, + rope_base: int = 500000.0, + intermediate_dim: Optional[int] = None, + norm_eps: float = 1e-5, +) -> TransformerDecoder: + """ + Build the decoder associated with the Llama3 model. This includes: + - Token embeddings + - num_layers number of TransformerDecoderLayer blocks + - RMS Norm layer applied to the output of the transformer + - Final projection into token space + + Args: + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. If specified, + user should ensure `num_heads` % `num_kv_heads` == 0. Default value is + `None`, in which case this is the same as MHA + embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp` + norm_eps (float): epsilon in RMS norms. + + Returns: + TransformerDecoder: Instantiation of Llama3 model. 
+ """ + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = CausalSelfAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=False), + k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), + v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), + output_proj=nn.Linear(embed_dim, embed_dim, bias=False), + pos_embeddings=rope, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + hidden_dim = intermediate_dim if intermediate_dim else scale_hidden_dim_for_mlp(embed_dim) + mlp = llama3_mlp(dim=embed_dim, hidden_dim=hidden_dim) + layer = TransformerDecoderLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + output_proj = nn.Linear(embed_dim, vocab_size, bias=False) + return TransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=head_dim, + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + +def llama3_mlp(dim: int, hidden_dim: int) -> FeedForward: + """ + Build the MLP layer associated with the Llama model. + """ + gate_proj = nn.Linear(dim, hidden_dim, bias=False) + down_proj = nn.Linear(hidden_dim, dim, bias=False) + up_proj = nn.Linear(dim, hidden_dim, bias=False) + return FeedForward(gate_proj=gate_proj, down_proj=down_proj, up_proj=up_proj) + + + +# ------------------ LoRA Llama3 ------------------ + + +def lora_llama3( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + *, + # llama3 args + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + max_seq_len: int, + intermediate_dim: Optional[int] = None, + attn_dropout: float = 0.0, + norm_eps: float = 1e-5, + rope_base: float = 500000.0, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + # Quantization args + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Return a version of Llama3 (an instance of :func:`~torchtune.modules.TransformerDecoder`) + with LoRA applied to some of the linear layers in its self-attention modules. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. If specified, + user should ensure `num_heads` % `num_kv_heads` == 0. 
Default value is + `None`, in which case this is the same as MHA + embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp` + norm_eps (float): epsilon in RMS norms. + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base + weights within linear layers LoRA is applied to. The final output linear projection is not + supported for quantization currently. + + Returns: + TransformerDecoder: Instantiation of Llama3 model with LoRA applied to + a subset of the attention projections in each layer. + + """ + + self_attn = lora_llama3_self_attention( + lora_modules=lora_attn_modules, + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + rope_base=rope_base, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + quantize_base=quantize_base, + ) + + hidden_dim = intermediate_dim if intermediate_dim else scale_hidden_dim_for_mlp(embed_dim) + if apply_lora_to_mlp: + mlp = lora_llama3_mlp( + dim=embed_dim, + hidden_dim=hidden_dim, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + quantize_base=quantize_base, + ) + else: + mlp = llama3_mlp(dim=embed_dim, hidden_dim=hidden_dim) + + layer = TransformerDecoderLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + + # TODO: quantize_base is not applied to final output_proj currently. + output_proj = ( + LoRALinear(embed_dim, vocab_size, rank=lora_rank, alpha=lora_alpha) + if apply_lora_to_output + else nn.Linear(embed_dim, vocab_size, bias=False) + ) + model = TransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=(embed_dim // num_heads), + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + + if quantize_base: + # For QLoRA, we reparametrize 4-bit tensors to bf16, and offload to CPU on the fly + # so as to not increase peak memory + model._register_state_dict_hook( + partial(reparametrize_as_dtype_state_dict_post_hook, offload_to_cpu=True) + ) + + return model + + +def lora_llama3_self_attention( + lora_modules: List[LORA_ATTN_MODULES], + *, + # CausalSelfAttention args + embed_dim: int, + num_heads: int, + num_kv_heads: int, + max_seq_len: int, + attn_dropout: float = 0.0, + rope_base: float = 500000.0, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + quantize_base: bool = False, +) -> CausalSelfAttention: + """ + Return an instance of :func:`~torchtune.modules.CausalSelfAttention` with LoRA + applied to a subset of its linear layers + + Args: + lora_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to. Options are ``{"q_proj", "k_proj", "v_proj", + "output_proj"}``. 
+ embed_dim (int): embedding dimension for self-attention + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. If specified, + user should ensure `num_heads` % `num_kv_heads` == 0. Default value is + `None`, in which case this is the same as MHA + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base (bool): Whether to quantize base model parameters for linear layers + LoRA is being applied to. Default is ``False``. + + Returns: + CausalSelfAttention: instantiation of self-attention module with LoRA + applied to a subset of Q, K, V, output projections. + + Raises: + ValueError: If lora_modules arg is an empty list + """ + if not lora_modules: + raise ValueError( + f"Must pass one or more of {LORA_ATTN_MODULES} as lora_modules" + ) + + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + q_proj = ( + LoRALinear( + embed_dim, + num_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + quantize_base=quantize_base, + ) + if "q_proj" in lora_modules + else nn.Linear(embed_dim, num_heads * head_dim, bias=False) + ) + k_proj = ( + LoRALinear( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + quantize_base=quantize_base, + ) + if "k_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) + ) + v_proj = ( + LoRALinear( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + quantize_base=quantize_base, + ) + if "v_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) + ) + output_proj = ( + LoRALinear( + embed_dim, + embed_dim, + rank=lora_rank, + alpha=lora_alpha, + quantize_base=quantize_base, + ) + if "output_proj" in lora_modules + else nn.Linear(embed_dim, embed_dim, bias=False) + ) + rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = CausalSelfAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=q_proj, + k_proj=k_proj, + v_proj=v_proj, + output_proj=output_proj, + pos_embeddings=rope, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + return self_attn + + +def lora_llama3_mlp( + *, + dim: int, + hidden_dim: int, + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + quantize_base: bool = False, +) -> FeedForward: + gate_proj = LoRALinear( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + down_proj = LoRALinear( + in_dim=hidden_dim, + out_dim=dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + up_proj = LoRALinear( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + return FeedForward( + gate_proj=gate_proj, + down_proj=down_proj, + up_proj=up_proj, + ) diff --git a/torchtune/models/llama3/_model_builders.py b/torchtune/models/llama3/_model_builders.py new file mode 100644 index 0000000000..4286a0f145 --- /dev/null +++ 
b/torchtune/models/llama3/_model_builders.py @@ -0,0 +1,109 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List, Optional +from functools import partial + +from torch import nn + +from torchtune.models.llama3._component_builders import llama3, lora_llama3 +from torchtune.models.llama3._model_utils import scale_hidden_dim_for_mlp + +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import TikTokenTokenizer +from torchtune.modules.peft import LORA_ATTN_MODULES + + +""" +Model builders build specific instantiations using component builders. For example +the llama3_8b model builder uses the llama3 component builder to create the +Llama3 8B model. +""" + + +def llama3_8b() -> TransformerDecoder: + """ + Builder for creating a Llama3 model initialized w/ the default 8b parameter values. + + Returns: + TransformerDecoder: Instantiation of Llama3 8B model + """ + return llama3( + vocab_size=128_256, + num_layers=32, + num_heads=32, + num_kv_heads=8, + embed_dim=4096, + max_seq_len=4096, + intermediate_dim=14336, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=500000.0, + ) + + +def llama3_tokenizer(path: str) -> TikTokenTokenizer: + tiktoken = TikTokenTokenizer(path) + tiktoken.pad_id = 0 + return tiktoken + + +def lora_llama3_8b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Llama3 8B model with LoRA enabled. + + The Llama3 defaults are the same as in :func:`~torchtune.models.llama3.llama3_8b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Llama3 8B model with LoRA applied + """ + return lora_llama3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=128_256, + num_layers=32, + num_heads=32, + num_kv_heads=8, + embed_dim=4096, + max_seq_len=4096, + intermediate_dim=14336, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=500000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=0.05, + quantize_base=quantize_base, + ) + +qlora_llama3_8b = partial(lora_llama3_8b, quantize_base=True) + +qlora_llama3_8b.__doc__ = """ +Builder for creating a Llama3 model with QLoRA enabled. Base model weights in linear layers +that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314. +Please see `lora_llama3_8b` for full API arguments. 
+""" diff --git a/torchtune/models/llama3/_model_utils.py b/torchtune/models/llama3/_model_utils.py new file mode 100644 index 0000000000..010c1bcc2f --- /dev/null +++ b/torchtune/models/llama3/_model_utils.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +def scale_hidden_dim_for_mlp(dim: int, multiple_of: int = 256) -> int: + """Scale hidden dimension for MLP to keep number of parameters and computation constant. + + Args: + dim (int): Input dimension. + multiple_of (int): Round scaled dimension to nearest multiple of `multiple_of` for clean computation. + + Returns: + Scaled hidden dimension. + """ + # Scale hidden dimension by (2/3)4d for SwiGLU to keep number of + # parameters and computation constant + hidden_dim = 4 * int(2 * dim / 3) + # Round hidden dimension to nearest multiple of `multiple_of` + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + return hidden_dim diff --git a/torchtune/models/mistral/_model_builders.py b/torchtune/models/mistral/_model_builders.py index 3071608eb0..940d1820ae 100644 --- a/torchtune/models/mistral/_model_builders.py +++ b/torchtune/models/mistral/_model_builders.py @@ -7,7 +7,8 @@ from torchtune.models.mistral._component_builders import mistral, lora_mistral -from torchtune.modules import Tokenizer, TransformerDecoder +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import SentencePieceTokenizer from torchtune.modules.peft import LORA_ATTN_MODULES from functools import partial @@ -40,8 +41,8 @@ def mistral_7b() -> TransformerDecoder: ) -def mistral_tokenizer(path: str) -> Tokenizer: - tokenizer = Tokenizer.from_file(path) +def mistral_tokenizer(path: str) -> SentencePieceTokenizer: + tokenizer = SentencePieceTokenizer(path) # Original tokenizer has no pad_id, which causes indexing errors when batch training tokenizer.pad_id = 0 return tokenizer diff --git a/torchtune/modules/__init__.py b/torchtune/modules/__init__.py index 7d08ea5bd2..46b8e93b0f 100644 --- a/torchtune/modules/__init__.py +++ b/torchtune/modules/__init__.py @@ -11,7 +11,6 @@ from .lr_schedulers import get_cosine_schedule_with_warmup # noqa from .position_embeddings import RotaryPositionalEmbeddings # noqa from .rms_norm import RMSNorm # noqa -from .tokenizer import Tokenizer # noqa from .transformer import TransformerDecoder, TransformerDecoderLayer # noqa __all__ = [ @@ -21,7 +20,6 @@ "KVCache", "RotaryPositionalEmbeddings", "RMSNorm", - "Tokenizer", "TransformerDecoder", "TransformerDecoderLayer", "reparametrize_as_dtype_state_dict_post_hook", diff --git a/torchtune/modules/tokenizers/__init__.py b/torchtune/modules/tokenizers/__init__.py new file mode 100644 index 0000000000..069849bf35 --- /dev/null +++ b/torchtune/modules/tokenizers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from ._sentencepiece import SentencePieceTokenizer +from ._tiktoken import TikTokenTokenizer +from ._utils import Tokenizer + +__all__ = ["SentencePieceTokenizer", "TikTokenTokenizer", "Tokenizer"] diff --git a/torchtune/modules/tokenizer.py b/torchtune/modules/tokenizers/_sentencepiece.py similarity index 84% rename from torchtune/modules/tokenizer.py rename to torchtune/modules/tokenizers/_sentencepiece.py index 07c0268fb4..94104faa2f 100644 --- a/torchtune/modules/tokenizer.py +++ b/torchtune/modules/tokenizers/_sentencepiece.py @@ -13,37 +13,31 @@ WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"] -class Tokenizer: +class SentencePieceTokenizer: """A wrapper around SentencePieceProcessor. Args: - spm_model (SentencePieceProcessor): The SentencePiece model. - vocab_size (int): The size of the vocabulary. - bos_id (int): The ID of the beginning-of-sentence token. - eos_id (int): The ID of the end-of-sentence token. - pad_id (int): The ID of the padding token. + path (str): Path to pretrained tokenizer file. Example: # Accepts only non-batched input for now - >>> tokenizer = Tokenizer.from_file("/path/to/spm_model") - >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True) + >>> tokenizer = SentencePieceTokenizer("/path/to/spm_model") + >>> tokenized_text = SentencePieceTokenizer.encode("Hello world!", add_bos=True, add_eos=True) >>> print(tokenized_text) [1, 31587, 29644, 102, 2] """ def __init__( self, - spm_model: SentencePieceProcessor, - vocab_size: int, - bos_id: int, - eos_id: int, - pad_id: int, + path: str, ): + spm_model = SentencePieceProcessor() + spm_model.load(path) self.spm_model = spm_model - self.vocab_size = vocab_size - self.bos_id = bos_id - self.eos_id = eos_id - self.pad_id = pad_id + self.vocab_size = spm_model.vocab_size() + self.bos_id = spm_model.bos_id() + self.eos_id = spm_model.eos_id() + self.pad_id = spm_model.pad_id() # This is used in tokenize_messages: if the tokenizer does not # encode whitespace, then we can more easily split strings @@ -52,20 +46,6 @@ def __init__( [self.spm_model.encode(c) for c in WHITESPACE_CHARS] ) - @classmethod - def from_file(cls, path: str) -> "Tokenizer": - """Initialize a `Tokenizer` instance from a SentencePiece model file. - - Args: - path (str): The path to the SentencePiece model file. - - Returns: - Tokenizer: A `Tokenizer` instance. - """ - spm = SentencePieceProcessor() - spm.load(path) - return cls(spm, spm.vocab_size(), spm.bos_id(), spm.eos_id(), spm.pad_id()) - def encode( self, text: str, @@ -135,7 +115,7 @@ def tokenize_messages( beginning off the tokenized s2. Example: - >>> tokenizer = Tokenizer.from_file(tokenizer_path) + >>> tokenizer = SentencePieceTokenizer(tokenizer_path) >>> messages = [ Message(role="system", content="system message\n", masked=True), Message(role="user", content="user prompt\n", masked=True), diff --git a/torchtune/modules/tokenizers/_tiktoken.py b/torchtune/modules/tokenizers/_tiktoken.py new file mode 100644 index 0000000000..104cee1353 --- /dev/null +++ b/torchtune/modules/tokenizers/_tiktoken.py @@ -0,0 +1,367 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
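# Editor's sketch (not part of this diff): the rename above also changes construction.
# Old API:  tokenizer = Tokenizer.from_file("/path/to/m.model")
# New API (this PR) loads the SentencePiece file directly in __init__; the path below
# is hypothetical.
from torchtune.modules.tokenizers import SentencePieceTokenizer

tokenizer = SentencePieceTokenizer("/path/to/m.model")
ids = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
text = tokenizer.decode(ids)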
+ +from typing import Dict, List, Optional, Tuple + +from tiktoken import Encoding +from tiktoken.load import load_tiktoken_bpe +from torchtune.data._types import Message +from torchtune.modules.tokenizers._utils import ( + _split_long_repetitions, + Tokenizer, + truncate, +) + + +CL100K_PATTERN = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" # noqa + +# bos and eos tokens +BEGIN_OF_TEXT = "<|begin_of_text|>" +END_OF_TEXT = "<|end_of_text|>" +# fill-in-the-middle tags +FIM_PREFIX = "<|fim_prefix|>" +FIM_MIDDLE = "<|fim_middle|>" +FIM_SUFFIX = "<|fim_suffix|>" +# start and end header tokens for formatting chat messages +START_HEADER_ID = "<|start_header_id|>" +END_HEADER_ID = "<|end_header_id|>" +STEP_ID = "<|step_id|>" +# different end of message tags +EOM_ID = "<|eom_id|>" +EOT_ID = "<|eot_id|>" +# special token for ipython messages +PYTHON_TAG = "<|python_tag|>" + +ALL_SPECIAL_TOKENS = [ + BEGIN_OF_TEXT, + END_OF_TEXT, + FIM_PREFIX, + FIM_MIDDLE, + FIM_SUFFIX, + STEP_ID, + START_HEADER_ID, + END_HEADER_ID, + EOM_ID, + EOT_ID, + PYTHON_TAG, +] + +PAD_ID = -1 + +# Constants controlling encode logic +MAX_ENCODE_CHARS = 400_000 +MAX_NO_WHITESPACE_CHARS = 25_000 + + +class TikTokenTokenizer(Tokenizer): + """A wrapper around tiktoken Encoding. + + Args: + path (str): Path to pretrained tokenizer checkpoint file. + name (str): Name of the tokenizer (used by tiktoken for identification). + pattern (str): Regex pattern used to for string parsing. + all_special_tokens (Optional[List[str]]): List of all special tokens. First element + must be bos token, second element must be eos token, final element must be + python tag. All elements must be unique. Length must be at most 256. + Default: None (will use ALL_SPECIAL_TOKENS) + bos_token (str): Beginning of sequence token. Defaults to BEGIN_OF_TEXT. + eos_token (str): End of sequence token. Defaults to END_OF_TEXT. + start_header_id (str): Start header token. Defaults to START_HEADER_ID. + end_header_id (str): End header token. Defaults to END_HEADER_ID. + step_id (str): Step token. Defaults to STEP_ID. + eom_id (str): End of message token. Defaults to EOM_ID. + eot_id (str): End of turn token. Defaults to EOT_ID. + python_tag (str): Python tag token. Defaults to PYTHON_TAG. 
+ """ + + def __init__( + self, + path: str, + *, + name: str = "llama3_tiktoken", + pattern: str = CL100K_PATTERN, + all_special_tokens: Optional[List[str]] = None, + bos_token: str = BEGIN_OF_TEXT, + eos_token: str = END_OF_TEXT, + start_header_id: str = START_HEADER_ID, + end_header_id: str = END_HEADER_ID, + step_id: str = STEP_ID, + eom_id: str = EOM_ID, + eot_id: str = EOT_ID, + python_tag: str = PYTHON_TAG, + ): + self.path = path + self.num_reserved_special_tokens = 256 + all_special_tokens = all_special_tokens or ALL_SPECIAL_TOKENS + self._validate_special_tokens( + all_special_tokens=all_special_tokens, + bos_token=bos_token, + eos_token=eos_token, + step_id=step_id, + start_header_id=start_header_id, + end_header_id=end_header_id, + eom_id=eom_id, + eot_id=eot_id, + python_tag=python_tag, + ) + self.all_special_tokens = all_special_tokens + + mergeable_ranks = load_tiktoken_bpe(self.path) + self.base_vocab_size = len(mergeable_ranks) + all_special_tokens_with_ids = self._get_all_special_tokens_with_ids() + self.tt_model = Encoding( + name=name, + pat_str=pattern, + mergeable_ranks=mergeable_ranks, + special_tokens={**all_special_tokens_with_ids}, + ) + + # Encode BOS and EOS, define pad ID + self.bos_id = self._encode_special_token(self.all_special_tokens[0]) + self.eos_id = self._encode_special_token(self.all_special_tokens[1]) + self.pad_id = PAD_ID + + self.vocab_size = self.tt_model.n_vocab + + # Encode extra special tokens + self.step_id = self._encode_special_token(step_id) + self.start_header_id = self._encode_special_token(start_header_id) + self.end_header_id = self._encode_special_token(end_header_id) + self.eom_id = self._encode_special_token(eom_id) + self.eot_id = self._encode_special_token(eot_id) + self.python_tag = self._encode_special_token(python_tag) + + def _validate_special_tokens( + self, + *, + all_special_tokens: List[str], + bos_token: str, + eos_token: str, + step_id: str, + start_header_id: str, + end_header_id: str, + eom_id: str, + eot_id: str, + python_tag: str, + ): + """ + Validate all the special tokens are as expected. Should satisfy: + + (1) bos_token, eos_token, step_id, start_header_id, end_header_id, eom_id, + eot_id, python_tag are all in all_special_tokens, + (2) bos_token should be first, eos_token should be second, python_tag should be last, + (3) all special tokens are unique, and + (4) at most 256 special tokens + """ + for token in [ + bos_token, + eos_token, + step_id, + start_header_id, + end_header_id, + eom_id, + eot_id, + python_tag, + ]: + assert ( + token in all_special_tokens + ), f"{token} missing from all_special_tokens" + assert ( + all_special_tokens[0] == bos_token + ), f"First special token must be bos, got {all_special_tokens[0]}" + assert ( + all_special_tokens[1] == eos_token + ), f"Second special token must be eos, got {all_special_tokens[1]}" + assert ( + all_special_tokens[-1] == python_tag + ), f"Last special token must be python_tag, got {all_special_tokens[-1]}" + assert len(set(all_special_tokens)) == len( + all_special_tokens + ), "Special tokens must be unique." + assert ( + len(all_special_tokens) <= self.num_reserved_special_tokens + ), "The total number of basic and extra special tokens cannot exceed the number of reserved tokens." + + def _get_all_special_tokens_with_ids(self) -> Dict[str, int]: + """ + Returns a dictionary of all special tokens and their corresponding ids to be passed + to tiktoken Encoding. 
+ + There are 256 slots for special tokens, any remaining spaces beyond self.all_special_tokens + will be filled with dummy reserved tokens. Tokens will be added in the order: + (1) all special tokens but python_tag, (2) all reserved tokens, (3) python_tag. + """ + reserved_tokens = [ + f"<|reserved_special_token_{i}|>" + for i in range( + self.num_reserved_special_tokens - len(self.all_special_tokens) + ) + ] + # Python tag special token should come last (validated in __init__) + all_special_tokens = ( + self.all_special_tokens[:-1] + + reserved_tokens + + [self.all_special_tokens[-1]] + ) + + return { + token: self.base_vocab_size + i + for i, token in enumerate(all_special_tokens) + } + + def _encode_special_token(self, token: str) -> int: + """ + Encodes a special token. + + Args: + token (str): The special token to encode. + + Returns: + int: The encoded special token. + """ + return self.tt_model.encode( + token, + allowed_special="all", + disallowed_special=(), + )[0] + + def encode( + self, + text: str, + add_bos: bool, + add_eos: bool, + ) -> List[int]: + """ + Encode a string into a list of token ids. Assumes that the string + contains no special tokens. + + Args: + text (str): The string to encode. + add_bos (bool): Whether to add the beginning of sequence token. + add_eos (bool): Whether to add the end of sequence token. + + Returns: + List[int]: The list of token ids. + """ + substrs: List[str] = [] + tokens = [] + for i in range(0, len(text), MAX_ENCODE_CHARS): + substr = text[i : i + MAX_ENCODE_CHARS] + # See https://github.com/openai/tiktoken/issues/195 + sliced_substr = _split_long_repetitions(substr, MAX_NO_WHITESPACE_CHARS) + substrs.extend(sliced_substr) + for substr in substrs: + # allowed_special and disallowed_special are used by tiktoken to define + # how special tokens are encoded. Our setting here is to encode any + # special token as regular text and prevent tiktoken from raising errors. + # This means we should only call encode on strings not containing special tokens. + tokens.extend( + self.tt_model.encode( + substr, + allowed_special=set(), + disallowed_special=(), + ) + ) + if add_bos: + tokens.insert(0, self.bos_id) + if add_eos: + tokens.append(self.eos_id) + return tokens + + def decode( + self, + token_ids: List[int], + truncate_at_eos: bool = True, + ) -> str: + """ + Decode a list of token ids into a string. + + Args: + token_ids (List[int]): The list of token ids. + truncate_at_eos (bool): Whether to truncate the string at the end of + sequence token. + + Returns: + str: The decoded string. + """ + if truncate_at_eos: + try: + k = token_ids.index(self.eos_id) + except ValueError: + k = None + if k: + token_ids = token_ids[:k] + token_ids = [token_id for token_id in token_ids if token_id != self.bos_id] + return self.tt_model.decode(token_ids) + + def tokenize_message( + self, message: Message, tokenize_header: bool = False + ) -> List[int]: + """ + Tokenize a message into a list of token ids. + + Args: + message (Message): The message to tokenize. + tokenize_header (bool): Whether to prepend a tokenized header to each message. + + Returns: + List[int]: The list of token ids. 
+ """ + if tokenize_header: + tokenized_header = ( + [self.start_header_id] + + self.encode(message.role.strip(), add_bos=False, add_eos=False) + + [self.end_header_id] + + self.encode("\n\n", add_bos=False, add_eos=False) + ) + else: + tokenized_header = [] + tokenized_body = self.encode( + message.content.strip(), add_bos=False, add_eos=False + ) + if message.ipython: + tokenized_body = [self.python_tag] + tokenized_body + tokenized_message = tokenized_header + tokenized_body + if message.eot: + tokenized_message = tokenized_message + [self.eot_id] + else: + tokenized_message = tokenized_message + [self.eom_id] + return tokenized_message + + def tokenize_messages( + self, + messages: List[Message], + max_seq_len: Optional[int] = None, + tokenize_header: bool = True, + ) -> Tuple[List[int], List[bool]]: + """ + Tokenize a list of messages into a list of token ids and masks. + + Args: + messages (List[Message]): The list of messages to tokenize. + max_seq_len (Optional[int]): The maximum sequence length. + tokenize_header (bool): Whether to prepend a tokenized header to each message. + + Returns: + Tuple[List[int], List[bool]]: The list of token ids and the list of masks. + """ + tokens = [self.bos_id] + # bos and eos are always masked + mask = [True] + for message in messages: + tokenized_message = self.tokenize_message( + message, tokenize_header=tokenize_header + ) + tokens = tokens + tokenized_message + mask = mask + ([message.masked] * len(tokenized_message)) + if max_seq_len and len(tokens) >= max_seq_len: + break + tokens = tokens + [self.eos_id] + mask = mask + [True] + if max_seq_len: + tokens = truncate(tokens, max_seq_len, self.eos_id) + mask = truncate(mask, max_seq_len, True) + return tokens, mask diff --git a/torchtune/modules/tokenizers/_utils.py b/torchtune/modules/tokenizers/_utils.py new file mode 100644 index 0000000000..cdfaddfcdf --- /dev/null +++ b/torchtune/modules/tokenizers/_utils.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Iterator, List, Protocol, Union + +from torchtune.data._types import Message + + +class Tokenizer(Protocol): + """Abstract tokenizer""" + + bos_id: int + eos_id: int + pad_id: int + + def encode(self, text: str, **kwargs) -> List[int]: + """ + Given a string, return the a list of token ids. + """ + + def decode( + self, token_ids: List[int], add_bos: bool, add_eos: bool, **kwargs + ) -> str: + """ + Given a list of token ids, return the decoded text. + """ + + def tokenize_messages(self, token_ids: List[Message], **kwargs): + """ + Given a list of messages, return a list of tokens for the concatenated + and formatted messages. 
+ """ + pass + + +def truncate( + tokens: List[int], + max_seq_len: int, + eos_id: Union[int, bool], +): + tokens_truncated = tokens[:max_seq_len] + if tokens_truncated[-1] != eos_id: + tokens_truncated[-1] = eos_id + return tokens_truncated + + +def _split_long_repetitions(s: str, max_consecutive_slice_len: int) -> Iterator[str]: + """ + Split the string `s` so that each substring contains no more than `max_consecutive_slice_len` + consecutive whitespaces or consecutive non-whitespaces + """ + current_slice_len = 0 + current_slice_is_space = s[0].isspace() if len(s) > 0 else False + slice_start = 0 + + for i in range(len(s)): + is_now_space = s[i].isspace() + + if current_slice_is_space ^ is_now_space: + current_slice_len = 1 + current_slice_is_space = is_now_space + else: + current_slice_len += 1 + if current_slice_len > max_consecutive_slice_len: + yield s[slice_start:i] + slice_start = i + current_slice_len = 1 + yield s[slice_start:] diff --git a/torchtune/utils/_checkpointing/_checkpointer_utils.py b/torchtune/utils/_checkpointing/_checkpointer_utils.py index ec1f4a47cd..a1391028e0 100644 --- a/torchtune/utils/_checkpointing/_checkpointer_utils.py +++ b/torchtune/utils/_checkpointing/_checkpointer_utils.py @@ -22,6 +22,7 @@ class ModelType(Enum): LLAMA2 = "llama2" MISTRAL = "mistral" GEMMA = "gemma" + LLAMA3 = "llama3" def get_path(input_dir: Path, filename: str, missing_ok: bool = False) -> Path: