diff --git a/README.md b/README.md
index dd39769439..02e71a2992 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,11 @@
![Recipe Integration Test](https://github.com/pytorch/torchtune/actions/workflows/recipe_test.yaml/badge.svg)
[![](https://dcbadge.vercel.app/api/server/4Xsdn8Rr9Q?style=flat)](https://discord.gg/4Xsdn8Rr9Q)
+
+
+
+**Note: torchtune now supports Llama3! We currently support the Llama3 8B model with LoRA, QLoRA, and full fine-tuning. Find more details in the [Llama3](#llama3) section!**
+
# torchtune
@@ -40,6 +45,7 @@ torchtune currently supports the following models.
| Model | Sizes |
|-----------------------------------------------|-----------|
+| [Llama3](https://llama.meta.com/llama3) | 8B [[models](torchtune/models/llama3/_model_builders.py), [configs](recipes/configs/llama3/)] |
| [Llama2](https://llama.meta.com/llama2/) | 7B, 13B [[models](torchtune/models/llama2/_model_builders.py), [configs](recipes/configs/llama2/)] |
| [Mistral](https://huggingface.co/mistralai) | 7B [[model](torchtune/models/mistral/_model_builders.py), [configs](recipes/configs/mistral/)] |
| [Gemma](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) | 2B [[model](torchtune/models/gemma/_model_builders.py), [configs](recipes/configs/gemma/)] |
@@ -54,8 +60,8 @@ torchtune provides the following fine-tuning recipes.
| Training | Fine-tuning Method |
|------------------------------------|------------------------------------|
-| Distributed Training [1 to 8 GPUs] | Full [[code](recipes/full_finetune_distributed.py), [example](recipes/configs/llama2/7B_full.yaml)], LoRA [[code](recipes/lora_finetune_distributed.py), [example](recipes/configs/llama2/7B_lora.yaml)] |
-| Single Device / Low Memory [1 GPU] | Full [[code](recipes/full_finetune_single_device.py), [example](recipes/configs/llama2/7B_full_low_memory.yaml)], LoRA + QLoRA [[code](recipes/lora_finetune_single_device.py), [example](recipes/configs/llama2/7B_qlora_single_device.yaml)] |
+| Distributed Training [1 to 8 GPUs] | Full [[code](recipes/full_finetune_distributed.py), [example](recipes/configs/llama3/8B_full.yaml)], LoRA [[code](recipes/lora_finetune_distributed.py), [example](recipes/configs/llama3/8B_lora.yaml)] |
+| Single Device / Low Memory [1 GPU] | Full [[code](recipes/full_finetune_single_device.py), [example](recipes/configs/llama3/8B_full_single_device.yaml)], LoRA + QLoRA [[code](recipes/lora_finetune_single_device.py), [example](recipes/configs/llama3/8B_lora_single_device.yaml)] |
| Single Device [1 GPU] | DPO [[code](recipes/full_finetune_distributed.py), [example](recipes/configs/llama2/7B_lora_dpo_single_device.yaml)]
@@ -69,14 +75,47 @@ This table captures the minimum memory requirements for our different recipes us
| Example HW Resources | Finetuning Method | Config | Model | Peak Memory per GPU
|--------------|-------------------|---------|------------|---------------------|
-| 1 x RTX 4090 | QLoRA | [qlora_finetune_single_device](recipes/configs/llama2/7B_qlora_single_device.yaml) | Llama-7B | 9.29 GB |
-| 2 x RTX 4090 | LoRA | [lora_finetune_distributed](recipes/configs/llama2/7B_lora.yaml) | Llama-7B | 20.95 GB |
-| 1 x RTX 4090 | LoRA | [lora_finetune_single_device](recipes/configs/llama2/7B_lora_single_device.yaml) | Llama-7B | 17.18 GB |
-| 1 x RTX 4090 | Full finetune | [full_finetune_single_device](recipes/configs/llama2/7B_full_low_memory.yaml) | Llama-7B | 14.97 GB |
-| 4 x RTX 4090 | Full finetune | [full_finetune_distributed](recipes/configs/llama2/7B_full.yaml) | Llama-7B | 22.9 GB |
+| 1 x RTX 4090 | QLoRA | [qlora_finetune_single_device](recipes/configs/llama2/7B_qlora_single_device.yaml) | Llama2-7B | 8.57 GB |
+| 2 x RTX 4090 | LoRA | [lora_finetune_distributed](recipes/configs/llama2/7B_lora.yaml) | Llama2-7B | 20.95 GB |
+| 1 x RTX 4090 | LoRA | [lora_finetune_single_device](recipes/configs/llama2/7B_lora_single_device.yaml) | Llama2-7B | 17.18 GB |
+| 1 x RTX 4090 | Full finetune | [full_finetune_single_device](recipes/configs/llama2/7B_full_low_memory.yaml) | Llama2-7B | 14.97 GB |
+| 4 x RTX 4090 | Full finetune | [full_finetune_distributed](recipes/configs/llama2/7B_full.yaml) | Llama2-7B | 22.9 GB |
* these are averaged over multiple runs, but there might be some variance based on the setup. We'll update this table regularly.
+
+
+## Llama3
+
+torchtune supports fine-tuning for the Llama3 8B model, with support for 70B on the way. We currently support LoRA, QLoRA, and full fine-tuning on a single GPU, as well as LoRA and full fine-tuning on multiple devices. For all the details, take a look at our [tutorial](https://pytorch.org/torchtune/main/tutorials/llama3.html).
+
+
+In our initial experiments, QLoRA has a peak allocated memory of ``~9GB`` while LoRA on a single GPU has a peak allocated memory of ``~19GB``. To get started, you can use our default configs to kick off training.
+
+- LoRA on a single GPU
+
+```bash
+tune run lora_finetune_single_device --config llama3/8B_lora_single_device
+```
+
+- QLoRA on a single GPU
+
+```bash
+tune run lora_finetune_single_device --config llama3/8B_qlora_single_device
+```
+
+- LoRA on 2 GPUs
+
+```bash
+tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
+```
+
+- Full fine-tune on 2 GPUs
+
+```bash
+tune run --nproc_per_node 2 full_finetune_distributed --config llama3/8B_full
+```
+
diff --git a/docs/source/api_ref_models.rst b/docs/source/api_ref_models.rst
index 2467599b2d..e9a29ede9c 100644
--- a/docs/source/api_ref_models.rst
+++ b/docs/source/api_ref_models.rst
@@ -4,6 +4,25 @@ torchtune.models
.. currentmodule:: torchtune.models
+llama3
+------
+
+All models from the `Llama3 family <https://llama.meta.com/llama3>`_.
+
+.. code-block:: bash
+
+ tune download meta-llama/Meta-Llama-3-8B --hf-token <HF_TOKEN>
+
+
+.. autosummary::
+ :toctree: generated/
+ :nosignatures:
+
+ llama3.llama3_8b
+ llama3.lora_llama3_8b
+ llama3.qlora_llama3_8b
+
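+For a quick sense of what these builders produce, here is a minimal usage sketch. The LoRA
+arguments mirror the defaults in the bundled ``recipes/configs/llama3`` configs and are shown
+purely as an example:
+
+.. code-block:: python
+
+ from torchtune.models.llama3 import llama3_8b, lora_llama3_8b
+
+ # Base Llama3-8B model
+ base_model = llama3_8b()
+
+ # LoRA-adapted Llama3-8B (same settings as llama3/8B_lora_single_device.yaml)
+ lora_model = lora_llama3_8b(
+     lora_attn_modules=["q_proj", "v_proj"],
+     apply_lora_to_mlp=False,
+     apply_lora_to_output=False,
+     lora_rank=8,
+     lora_alpha=16,
+ )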
+
llama2
------
@@ -26,6 +45,7 @@ Pre-trained models can be downloaded from the Hugging Face Hub with the followin
llama2.lora_llama2_13b
llama2.qlora_llama2_13b
+
mistral
-------
diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst
index 70d545357f..e38926f36f 100644
--- a/docs/source/api_ref_modules.rst
+++ b/docs/source/api_ref_modules.rst
@@ -17,10 +17,18 @@ Modeling Components and Building Blocks
get_cosine_schedule_with_warmup
RotaryPositionalEmbeddings
RMSNorm
- Tokenizer
TransformerDecoderLayer
TransformerDecoder
+Tokenizers
+------------------------
+
+.. autosummary::
+ :toctree: generated/
+ :nosignatures:
+
+ tokenizers.SentencePieceTokenizer
+ tokenizers.TikTokenTokenizer
PEFT Components
---------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 78340bd769..c55c723634 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -43,6 +43,13 @@ torchtune tutorials.
.. customcardstart::
+.. customcarditem::
+ :header: Llama3 in torchtune
+ :card_description: Fine-tuning Llama3 with LoRA, QLoRA, and full fine-tuning
+ :image: _static/img/generic-pytorch-logo.png
+ :link: tutorials/llama3.html
+ :tags: finetuning,llama3
+
.. customcarditem::
:header: Finetuning with LoRA in torchtune
:card_description: Parameter-efficient finetuning of Llama2 using LoRA
@@ -88,6 +95,7 @@ torchtune tutorials.
:caption: Tutorials
:hidden:
+ tutorials/llama3
tutorials/lora_finetune
tutorials/qlora_finetune
tutorials/e2e_flow
diff --git a/docs/source/tutorials/first_finetune_tutorial.rst b/docs/source/tutorials/first_finetune_tutorial.rst
index dbff6a2d29..072cb2d79c 100644
--- a/docs/source/tutorials/first_finetune_tutorial.rst
+++ b/docs/source/tutorials/first_finetune_tutorial.rst
@@ -98,6 +98,8 @@ a single device. For a more in-depth discussion on LoRA in torchtune, you can se
|
+.. _tune_cp_label:
+
Modifying a config
------------------
YAML configs hold most of the important information needed for running your recipe.
diff --git a/docs/source/tutorials/llama3.rst b/docs/source/tutorials/llama3.rst
new file mode 100644
index 0000000000..ff1c0120e1
--- /dev/null
+++ b/docs/source/tutorials/llama3.rst
@@ -0,0 +1,328 @@
+====================
+Llama3 in torchtune
+====================
+
+.. grid:: 2
+
+ .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn how to:
+
+ * Download the Llama3-8B weights and tokenizer
+ * Fine-tune Llama3-8B with LoRA and QLoRA
+ * Evaluate your fine-tuned Llama3-8B model
+ * Generate text with your fine-tuned model
+ * Quantize your model to speed up generation
+
+ .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites
+
+ * Be familiar with :ref:`torchtune`
+ * Make sure to :ref:`install torchtune`
+
+
+Llama3-8B
+----------
+
+`Llama3-8B <https://llama.meta.com/llama3>`_ is a new model released by Meta AI that improves upon the performance of the Llama2 family
+of models across a `range of different benchmarks `_.
+There are a few main changes between Llama2-7B and Llama3-8B models:
+
+- Llama3-8B uses `grouped-query attention <https://arxiv.org/abs/2305.13245>`_ instead of the standard multi-head attention from Llama2-7B
+- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models)
+- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken <https://github.com/openai/tiktoken>`_ instead of `sentencepiece <https://github.com/google/sentencepiece>`_)
+- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B
+- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings <https://arxiv.org/abs/2104.09864>`_ (illustrated in the short sketch below)
+
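+To make the last point concrete, here is a small, self-contained sketch (plain PyTorch, not
+torchtune internals) of the standard RoPE frequency schedule with the two base values; a head
+dimension of 128 applies to both models:
+
+.. code-block:: python
+
+ import torch
+
+ def rope_freqs(head_dim: int, base: float) -> torch.Tensor:
+     # One rotary frequency per pair of dimensions
+     return 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
+
+ llama2_freqs = rope_freqs(128, 10_000.0)   # Llama2-style base
+ llama3_freqs = rope_freqs(128, 500_000.0)  # Llama3 uses a much larger base
+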
+|
+
+Getting access to Llama3-8B
+---------------------------
+
+First, let's download the model from Hugging Face. You will need to follow the instructions
+on the `official Meta page `_ to gain access to the model.
+Next, make sure you grab your Hugging Face token from `here <https://huggingface.co/settings/tokens>`_.
+
+
+.. code-block:: bash
+
+ tune download meta-llama/Meta-Llama-3-8B \
+ --output-dir <checkpoint_dir> \
+ --hf-token <HF_TOKEN>
+
+|
+
+Fine-tuning Llama3-8B in torchtune
+----------------------------------
+
+torchtune provides `LoRA <https://arxiv.org/abs/2106.09685>`_, `QLoRA <https://arxiv.org/abs/2305.14314>`_, and full fine-tuning
+recipes for fine-tuning Llama3-8B on one or more GPUs. For more on LoRA in torchtune, see our :ref:`LoRA Tutorial `.
+For more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `.
+
+Let's take a look at how we can fine-tune Llama3-8B with LoRA on a single device using torchtune. In this example, we will fine-tune
+for one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is
+
+.. code-block:: bash
+
+ tune run lora_finetune_single_device --config llama3/8B_lora_single_device
+
+.. note::
+ To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line.
+
+We can also add command-line overrides as needed, e.g.
+
+.. code-block:: bash
+
+ tune run lora_finetune_single_device --config llama3/8B_lora_single_device \
+ checkpointer.checkpoint_dir=<checkpoint_dir> \
+ tokenizer.path=<checkpoint_dir>/tokenizer.model \
+ checkpointer.output_dir=<checkpoint_dir>
+
+This will load the Llama3-8B checkpoint and tokenizer from ``<checkpoint_dir>`` used in the ``tune download`` command above,
+then save a final checkpoint in the same directory following the original format. For more details on the
+checkpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.
+
+.. note::
+ To see the full set of configurable parameters for this (and other) configs we can use ``tune cp`` to copy (and modify)
+ the default config. ``tune cp`` can be used with recipe scripts too, in case you want to make more custom changes
+ that cannot be achieved by directly modifying existing configurable parameters. For more on ``tune cp`` see the section on
+ :ref:`modifying configs <tune_cp_label>`.
+
+Once training is complete, the model checkpoints will be saved and their locations will be logged. For
+LoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights
+will be saved separately.
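+
+To make "merged weights" concrete, here is a hedged sketch of the underlying math for a single
+LoRA-adapted linear layer (illustrative only; torchtune performs this merge for you when saving
+the final checkpoint):
+
+.. code-block:: python
+
+ import torch
+
+ def merge_lora_weight(w: torch.Tensor, lora_a: torch.Tensor, lora_b: torch.Tensor,
+                       rank: int, alpha: float) -> torch.Tensor:
+     # w:      [out_features, in_features]  frozen base weight
+     # lora_a: [rank, in_features]          low-rank "A" matrix
+     # lora_b: [out_features, rank]         low-rank "B" matrix
+     # Folding the low-rank update back into the base weight gives the merged weight.
+     return w + (alpha / rank) * (lora_b @ lora_a)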
+
+In our experiments, we observed a peak memory usage of 18.5 GB. The default config can be trained on a consumer GPU with 24 GB VRAM.
+
+If you have multiple GPUs available, you can run the distributed version of the recipe.
+torchtune makes use of the `FSDP <https://pytorch.org/docs/stable/fsdp.html>`_ APIs from PyTorch Distributed
+to shard the model, optimizer states, and gradients. This should enable you to increase your batch size, resulting in faster training.
+For example, on two devices:
+
+.. code-block:: bash
+
+ tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
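+
+Under the hood, the distributed recipe shards parameters, gradients, and optimizer states across
+ranks via FSDP rather than replicating them on every GPU. A stripped-down, hypothetical sketch of
+the idea (standalone PyTorch, not the recipe code):
+
+.. code-block:: python
+
+ import torch
+ import torch.distributed as dist
+ from torch import nn
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+
+ dist.init_process_group(backend="nccl")  # one process per GPU, e.g. launched via torchrun
+ model = nn.Sequential(nn.Linear(4096, 4096), nn.Linear(4096, 4096)).cuda()  # stand-in for the LLM
+ model = FSDP(model)  # parameters and gradients are sharded across ranks
+ optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)  # optimizer state follows the sharded params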
+
+Finally, if we want to use even less memory, we can leverage torchtune's QLoRA recipe via:
+
+.. code-block:: bash
+
+ tune run lora_finetune_single_device --config llama3/8B_qlora_single_device
+
+Since our default configs enable full bfloat16 training, all of the above commands can be run on
+devices with at least 24 GB of VRAM, and in fact the QLoRA recipe should have peak allocated memory
+below 10 GB. You can also experiment with different configurations of LoRA and QLoRA, or even run a full fine-tune.
+Try it out!
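+
+For a rough sense of why these memory numbers work out, consider the weight storage alone
+(illustrative arithmetic only; real peak memory also includes activations, gradients, and
+optimizer state):
+
+.. code-block:: python
+
+ n_params = 8e9                  # ~8B parameters
+ bf16_gb = n_params * 2 / 1e9    # bf16 stores 2 bytes per parameter -> ~16 GB of weights
+ int4_gb = n_params * 0.5 / 1e9  # ~4-bit quantized base weights under QLoRA -> ~4 GB
+ print(f"bf16 weights: ~{bf16_gb:.0f} GB, 4-bit weights: ~{int4_gb:.0f} GB")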
+
+|
+
+Evaluating fine-tuned Llama3-8B models with EleutherAI's Eval Harness
+---------------------------------------------------------------------
+
+Now that we've fine-tuned Llama3-8B, what's next? Let's take our LoRA-finetuned model from the
+preceding section and look at a couple different ways we can evaluate its performance on the tasks we care about.
+
+First, torchtune provides an integration with
+`EleutherAI's evaluation harness <https://github.com/EleutherAI/lm-evaluation-harness>`_
+for model evaluation on common benchmark tasks.
+
+.. note::
+ Make sure you've first installed the evaluation harness via :code:`pip install "lm_eval==0.4.*"`.
+
+For this tutorial we'll use the ``truthfulqa_mc2`` task from the harness.
+This task measures a model's propensity to be truthful when answering questions, reporting
+zero-shot accuracy on a question followed by one or more true responses and one or more
+false responses. First, let's copy the config so we can point the YAML file to our
+fine-tuned checkpoint files.
+
+.. code-block:: bash
+
+ tune cp eleuther_evaluation ./custom_eval_config.yaml
+
+Next, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoints.
+
+.. code-block:: yaml
+
+ checkpointer:
+ _component_: torchtune.utils.FullModelMetaCheckpointer
+
+ # directory with the checkpoint files
+ # this should match the output_dir specified during
+ # fine-tuning
+ checkpoint_dir: <checkpoint_dir>
+
+ # checkpoint files for the fine-tuned model. These will be logged
+ # at the end of your fine-tune
+ checkpoint_files: [
+ consolidated.00.pth
+ ]
+
+ output_dir: <output_dir>
+ model_type: LLAMA3
+
+ # Make sure to update the tokenizer path to the right
+ # checkpoint directory as well
+ tokenizer:
+ _component_: torchtune.models.llama3.llama3_tokenizer
+ path: <checkpoint_dir>/tokenizer.model
+
+Finally, we can run evaluation using our modified config.
+
+.. code-block:: bash
+
+ tune run eleuther_eval --config ./custom_eval_config.yaml
+
+Try it for yourself and see what accuracy your model gets!
+
+|
+
+Generating text with our fine-tuned Llama3-8B model
+---------------------------------------------------
+
+Next, let's look at one other way we can evaluate our model: generating text! torchtune provides a
+`recipe for generation `_ as well.
+
+As we did for evaluation, let's copy and modify the default generation config.
+
+.. code-block:: bash
+
+ tune cp generation ./custom_generation_config.yaml
+
+Now we modify ``custom_generation_config.yaml`` to point to our checkpoint and tokenizer.
+
+.. code-block:: yaml
+
+ checkpointer:
+ _component_: torchtune.utils.FullModelMetaCheckpointer
+
+ # directory with the checkpoint files
+ # this should match the output_dir specified during
+ # fine-tuning
+ checkpoint_dir: <checkpoint_dir>
+
+ # checkpoint files for the fine-tuned model. These will be logged
+ # at the end of your fine-tune
+ checkpoint_files: [
+ consolidated.00.pth
+ ]
+
+ output_dir: <output_dir>
+ model_type: LLAMA3
+
+ # Make sure to update the tokenizer path to the right
+ # checkpoint directory as well
+ tokenizer:
+ _component_: torchtune.models.llama3.llama3_tokenizer
+ path: <checkpoint_dir>/tokenizer.model
+
+Running generation with our LoRA-finetuned model, we see the following output:
+
+.. code-block:: bash
+
+ tune run generate --config ./custom_generation_config.yaml \
+ prompt="Hello, my name is"
+
+ [generate.py:122] Hello, my name is Sarah and I am a busy working mum of two young children, living in the North East of England.
+ ...
+ [generate.py:135] Time for inference: 10.88 sec total, 18.94 tokens/sec
+ [generate.py:138] Bandwidth achieved: 346.09 GB/s
+ [generate.py:139] Memory used: 18.31 GB
+
+Faster generation via quantization
+----------------------------------
+
+We can see that the model took just under 11 seconds, generating almost 19 tokens per second.
+We can speed this up a bit by quantizing our model. Here we'll use 4-bit weights-only quantization
+as provided by `torchao `_.
+
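+To give a feel for what "4-bit weights-only" means, here is a simplified, hypothetical sketch of
+group-wise int4 quantization. This is not torchao's implementation, just the basic idea of storing
+4-bit integers plus one scale per group:
+
+.. code-block:: python
+
+ import torch
+
+ def quantize_int4_groupwise(w: torch.Tensor, groupsize: int = 256):
+     # w: a 2D weight matrix whose number of elements is divisible by `groupsize`
+     groups = w.reshape(-1, groupsize)
+     scale = groups.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 7.0  # int4 range is roughly [-8, 7]
+     q = torch.clamp(torch.round(groups / scale), -8, 7).to(torch.int8)
+     return q, scale
+
+ def dequantize_int4_groupwise(q: torch.Tensor, scale: torch.Tensor, shape) -> torch.Tensor:
+     return (q.float() * scale).reshape(shape)
+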
+If you've been following along this far, you know the drill by now.
+Let's copy the quantization config and point it at our fine-tuned model.
+
+.. code-block:: bash
+
+ tune cp quantization ./custom_quantization_config.yaml
+
+And update ``custom_quantization_config.yaml`` with the following:
+
+.. code-block:: yaml
+
+ checkpointer:
+ _component_: torchtune.utils.FullModelMetaCheckpointer
+
+ # directory with the checkpoint files
+ # this should match the output_dir specified during
+ # fine-tuning
+ checkpoint_dir: <checkpoint_dir>
+
+ # checkpoint files for the fine-tuned model. These will be logged
+ # at the end of your fine-tune
+ checkpoint_files: [
+ consolidated.00.pth
+ ]
+
+ output_dir: <output_dir>
+ model_type: LLAMA3
+
+To quantize the model, we can now run:
+
+.. code-block:: bash
+
+ tune run quantize --config ./custom_quantization_config.yaml
+
+ [quantize.py:90] Time for quantization: 2.93 sec
+ [quantize.py:91] Memory used: 23.13 GB
+ [quantize.py:104] Model checkpoint of size 4.92 GB saved to /tmp/Llama-3-8B-hf/meta_model_0-4w.pt
+
+We can see that the model is now under 5 GB, which works out to a little under five bits per parameter for the 8B model.
+
+.. note::
+ Unlike the fine-tuned checkpoints, the quantization recipe outputs a single checkpoint file. This is
+ because our quantization APIs currently don't support any conversion across formats.
+ As a result you won't be able to use these quantized models outside of torchtune.
+ But you should be able to use these with the generation and evaluation recipes within
+ torchtune. These results will help inform which quantization methods you should use
+ with your favorite inference engine.
+
+Let's take our quantized model and run the same generation again.
+First, we'll make one more change to our ``custom_generation_config.yaml``.
+
+.. code-block:: yaml
+
+ checkpointer:
+ # we need to use the custom TorchTune checkpointer
+ # instead of the HF checkpointer for loading
+ # quantized models
+ _component_: torchtune.utils.FullModelTorchTuneCheckpointer
+
+ # directory with the checkpoint files
+ # this should match the output_dir specified during
+ # fine-tuning
+ checkpoint_dir: <checkpoint_dir>
+
+ # checkpoint files point to the quantized model
+ checkpoint_files: [
+ meta_model_0-4w.pt,
+ ]
+
+ output_dir: <output_dir>
+ model_type: LLAMA3
+
+ # we also need to update the quantizer to what was used during
+ # quantization
+ quantizer:
+ _component_: torchtune.utils.quantization.Int4WeightOnlyQuantizer
+ groupsize: 256
+
+Let's re-run generation!
+
+.. code-block:: bash
+
+ tune run generate --config ./custom_generation_config.yaml \
+ prompt="Hello, my name is"
+
+ [generate.py:122] Hello, my name is Jake.
+ I am a multi-disciplined artist with a passion for creating, drawing and painting.
+ ...
+ Time for inference: 1.62 sec total, 57.95 tokens/sec
+
+By quantizing the model and running ``torch.compile`` we get over a 3x speedup!
+
+This is just the beginning of what you can do with Llama3-8B using torchtune and the broader ecosystem.
+We look forward to seeing what you build!
diff --git a/pyproject.toml b/pyproject.toml
index 72f17d1cb3..158a538f3a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,8 +15,12 @@ dependencies = [
"huggingface_hub",
"safetensors",
- # Miscellaneous
+ # Tokenization
"sentencepiece",
+ "tiktoken",
+ "blobfile>=2",
+
+ # Miscellaneous
"tqdm",
"omegaconf",
@@ -35,7 +39,7 @@ tune = "torchtune._cli.tune:main"
[project.optional-dependencies]
dev = [
- "bitsandbytes",
+ "bitsandbytes>=0.43.0",
"pre-commit",
"pytest",
"pytest-cov",
diff --git a/recipes/configs/generation.yaml b/recipes/configs/generation.yaml
index 6cd9c1ba87..96a54d3e5c 100644
--- a/recipes/configs/generation.yaml
+++ b/recipes/configs/generation.yaml
@@ -30,7 +30,7 @@ tokenizer:
# Generation arguments; defaults taken from gpt-fast
prompt: "Hello, my name is"
max_new_tokens: 300
-temperature: 0.8
+temperature: 0.6 # 0.8 and 0.6 are popular values to try
top_k: 300
quantizer: null
diff --git a/recipes/configs/llama3/8B_full.yaml b/recipes/configs/llama3/8B_full.yaml
new file mode 100644
index 0000000000..d2d060d269
--- /dev/null
+++ b/recipes/configs/llama3/8B_full.yaml
@@ -0,0 +1,77 @@
+# Config for multi-device full finetuning in full_finetune_distributed.py
+# using a Llama3 8B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token <HF_TOKEN>
+#
+# To launch on 4 devices, run the following command from root:
+# tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full checkpointer.checkpoint_dir=<checkpoint_dir>
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# Single device full finetuning requires more memory optimizations. It's
+# best to use 8B_full_single_device.yaml for those cases
+
+
+# Tokenizer
+tokenizer:
+ _component_: torchtune.models.llama3.llama3_tokenizer
+ path: /tmp/Meta-Llama-3-8B/original/tokenizer.model
+
+# Dataset
+dataset:
+ _component_: torchtune.datasets.alpaca_dataset
+ train_on_input: True
+seed: null
+shuffle: True
+
+# Model Arguments
+model:
+ _component_: torchtune.models.llama3.llama3_8b
+
+checkpointer:
+ _component_: torchtune.utils.FullModelMetaCheckpointer
+ checkpoint_dir: /tmp/Meta-Llama-3-8B/original/
+ checkpoint_files: [
+ consolidated.00.pth
+ ]
+ recipe_checkpoint: null
+ output_dir: /tmp/Meta-Llama-3-8B/
+ model_type: LLAMA3
+resume_from_checkpoint: False
+
+# Fine-tuning arguments
+batch_size: 2
+epochs: 3
+
+optimizer:
+ _component_: torch.optim.AdamW
+ lr: 2e-5
+ foreach: False
+
+loss:
+ _component_: torch.nn.CrossEntropyLoss
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1
+
+
+# Training env
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True
+
+# Reduced precision
+dtype: bf16
+
+# Logging
+metric_logger:
+ _component_: torchtune.utils.metric_logging.DiskLogger
+ log_dir: ${output_dir}
+output_dir: /tmp/alpaca-llama3-finetune
+log_every_n_steps: null
diff --git a/recipes/configs/llama3/8B_full_single_device.yaml b/recipes/configs/llama3/8B_full_single_device.yaml
new file mode 100644
index 0000000000..1ecc5e7b61
--- /dev/null
+++ b/recipes/configs/llama3/8B_full_single_device.yaml
@@ -0,0 +1,77 @@
+# Config for single device full finetuning in full_finetune_single_device.py
+# using a Llama3 8B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token <HF_TOKEN>
+#
+# The default config uses an optimizer from bitsandbytes. If you do not have it installed,
+# you can install it with
+# pip install bitsandbytes
+#
+# To launch on a single device, run the following command from root:
+# tune run full_finetune_single_device --config llama3/8B_full_single_device
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run full_finetune_single_device --config llama3/8B_full_single_device checkpointer.checkpoint_dir=<checkpoint_dir>
+#
+# This config works only for training on a single device.
+
+
+# Tokenizer
+tokenizer:
+ _component_: torchtune.models.llama3.llama3_tokenizer
+ path: /tmp/Meta-Llama-3-8B/original/tokenizer.model
+
+# Dataset
+dataset:
+ _component_: torchtune.datasets.alpaca_dataset
+ train_on_input: True
+seed: null
+shuffle: True
+
+# Model Arguments
+model:
+ _component_: torchtune.models.llama3.llama3_8b
+
+checkpointer:
+ _component_: torchtune.utils.FullModelMetaCheckpointer
+ checkpoint_dir: /tmp/Meta-Llama-3-8B/original/
+ checkpoint_files: [
+ consolidated.00.pth
+ ]
+ recipe_checkpoint: null
+ output_dir: /tmp/Meta-Llama-3-8B/
+ model_type: LLAMA3
+resume_from_checkpoint: False
+
+# Fine-tuning arguments
+batch_size: 2
+epochs: 3
+optimizer:
+ _component_: bitsandbytes.optim.AdamW8bit
+ lr: 2e-5
+loss:
+ _component_: torch.nn.CrossEntropyLoss
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1
+optimizer_in_bwd: True
+compile: False
+
+# Training environment
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True
+
+# Reduced precision
+dtype: bf16
+
+# Logging
+metric_logger:
+ _component_: torchtune.utils.metric_logging.DiskLogger
+ log_dir: ${output_dir}
+output_dir: /tmp/alpaca-llama3-finetune
+log_every_n_steps: null
diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml
new file mode 100644
index 0000000000..3b8479a823
--- /dev/null
+++ b/recipes/configs/llama3/8B_lora.yaml
@@ -0,0 +1,80 @@
+# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
+# using a Llama3 8B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token <HF_TOKEN>
+#
+# To launch on 2 devices, run the following command from root:
+# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora checkpointer.checkpoint_dir=<checkpoint_dir>
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# For single device LoRA finetuning please use 8B_lora_single_device.yaml
+# or 8B_qlora_single_device.yaml
+
+# Tokenizer
+tokenizer:
+ _component_: torchtune.models.llama3.llama3_tokenizer
+ path: /tmp/Meta-Llama-3-8B/original/tokenizer.model
+
+# Model Arguments
+model:
+ _component_: torchtune.models.llama3.lora_llama3_8b
+ lora_attn_modules: ['q_proj', 'v_proj']
+ apply_lora_to_mlp: False
+ apply_lora_to_output: False
+ lora_rank: 8
+ lora_alpha: 16
+
+checkpointer:
+ _component_: torchtune.utils.FullModelMetaCheckpointer
+ checkpoint_dir: /tmp/Meta-Llama-3-8B/original/
+ checkpoint_files: [
+ consolidated.00.pth
+ ]
+ recipe_checkpoint: null
+ output_dir: /tmp/Meta-Llama-3-8B/
+ model_type: LLAMA3
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+ _component_: torchtune.datasets.alpaca_cleaned_dataset
+ train_on_input: True
+seed: null
+shuffle: True
+batch_size: 2
+
+# Optimizer and Scheduler
+optimizer:
+ _component_: torch.optim.AdamW
+ weight_decay: 0.01
+ lr: 3e-4
+lr_scheduler:
+ _component_: torchtune.modules.get_cosine_schedule_with_warmup
+ num_warmup_steps: 100
+
+loss:
+ _component_: torch.nn.CrossEntropyLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 32
+
+# Logging
+output_dir: /tmp/lora_finetune_output
+metric_logger:
+ _component_: torchtune.utils.metric_logging.DiskLogger
+ log_dir: ${output_dir}
+log_every_n_steps: null
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: False
diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml
new file mode 100644
index 0000000000..b6b33466ca
--- /dev/null
+++ b/recipes/configs/llama3/8B_lora_single_device.yaml
@@ -0,0 +1,85 @@
+# Config for single device LoRA finetuning in lora_finetune_single_device.py
+# using a Llama3 8B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token <HF_TOKEN>
+#
+# To launch on a single device, run the following command from root:
+# tune run lora_finetune_single_device --config llama3/8B_lora_single_device
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run lora_finetune_single_device --config llama3/8B_lora_single_device checkpointer.checkpoint_dir=<checkpoint_dir>
+#
+# This config works only for training on a single device.
+
+
+# Model Arguments
+model:
+ _component_: torchtune.models.llama3.lora_llama3_8b
+ lora_attn_modules: ['q_proj', 'v_proj']
+ apply_lora_to_mlp: False
+ apply_lora_to_output: False
+ lora_rank: 8
+ lora_alpha: 16
+
+# Tokenizer
+tokenizer:
+ _component_: torchtune.models.llama3.llama3_tokenizer
+ path: /tmp/Meta-Llama-3-8B/original/tokenizer.model
+
+checkpointer:
+ _component_: torchtune.utils.FullModelMetaCheckpointer
+ checkpoint_dir: /tmp/Meta-Llama-3-8B/original/
+ checkpoint_files: [
+ consolidated.00.pth
+ ]
+ recipe_checkpoint: null
+ output_dir: /tmp/Meta-Llama-3-8B/
+ model_type: LLAMA3
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+ _component_: torchtune.datasets.alpaca_cleaned_dataset
+ train_on_input: True
+seed: null
+shuffle: True
+batch_size: 2
+
+# Optimizer and Scheduler
+optimizer:
+ _component_: torch.optim.AdamW
+ weight_decay: 0.01
+ lr: 3e-4
+lr_scheduler:
+ _component_: torchtune.modules.get_cosine_schedule_with_warmup
+ num_warmup_steps: 100
+
+loss:
+ _component_: torch.nn.CrossEntropyLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 64
+compile: False
+
+# Logging
+output_dir: /tmp/lora_finetune_output
+metric_logger:
+ _component_: torchtune.utils.metric_logging.DiskLogger
+ log_dir: ${output_dir}
+log_every_n_steps: null
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: True
+
+# Profiler (disabled)
+profiler:
+ _component_: torchtune.utils.profiler
+ enabled: False
diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml
new file mode 100644
index 0000000000..a951b9d660
--- /dev/null
+++ b/recipes/configs/llama3/8B_qlora_single_device.yaml
@@ -0,0 +1,86 @@
+# Config for single device QLoRA with lora_finetune_single_device.py
+# using a Llama3 8B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token <HF_TOKEN>
+#
+# To launch on a single device, run the following command from root:
+# tune run lora_finetune_single_device --config llama3/8B_qlora_single_device
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+# tune run lora_finetune_single_device --config llama3/8B_qlora_single_device checkpointer.checkpoint_dir=<checkpoint_dir>
+#
+# This config works only for training on a single device.
+
+# Model Arguments
+model:
+ _component_: torchtune.models.llama3.qlora_llama3_8b
+ lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
+ apply_lora_to_mlp: True
+ apply_lora_to_output: False
+ lora_rank: 8
+ lora_alpha: 16
+
+# Tokenizer
+tokenizer:
+ _component_: torchtune.models.llama3.llama3_tokenizer
+ path: /tmp/Meta-Llama-3-8B/original/tokenizer.model
+
+checkpointer:
+ _component_: torchtune.utils.FullModelMetaCheckpointer
+ checkpoint_dir: /tmp/Meta-Llama-3-8B/original/
+ checkpoint_files: [
+ consolidated.00.pth
+ ]
+ recipe_checkpoint: null
+ output_dir: /tmp/Meta-Llama-3-8B/
+ model_type: LLAMA3
+resume_from_checkpoint: False
+
+# Dataset and Sampler
+dataset:
+ _component_: torchtune.datasets.alpaca_cleaned_dataset
+ train_on_input: True
+seed: null
+shuffle: True
+batch_size: 2
+
+# Optimizer and Scheduler
+optimizer:
+ _component_: torch.optim.AdamW
+ weight_decay: 0.01
+ lr: 3e-4
+lr_scheduler:
+ _component_: torchtune.modules.get_cosine_schedule_with_warmup
+ num_warmup_steps: 100
+
+loss:
+ _component_: torch.nn.CrossEntropyLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 16
+# Note: compile for QLoRA is only supported on nightly
+# PyTorch (>= 2.4.0.dev20240408)
+compile: False
+
+# Logging
+output_dir: /tmp/qlora_finetune_output/
+metric_logger:
+ _component_: torchtune.utils.metric_logging.DiskLogger
+ log_dir: ${output_dir}
+log_every_n_steps: 1
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: True
+
+# Profiler (disabled)
+profiler:
+ _component_: torchtune.utils.profiler
+ enabled: False
diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py
index 81c0253f35..c6911886fa 100644
--- a/recipes/eleuther_eval.py
+++ b/recipes/eleuther_eval.py
@@ -15,7 +15,8 @@
from torch import nn
from torchtune import config, utils
-from torchtune.modules import Tokenizer, TransformerDecoder
+from torchtune.modules import TransformerDecoder
+from torchtune.modules.tokenizers import Tokenizer
from torchtune.recipe_interfaces import EvalRecipeInterface
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index 95aedacc17..8ea06343aa 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -396,7 +396,6 @@ def train(self) -> None:
== self.max_steps_per_epoch
):
break
-
input_ids, labels = batch
input_ids = input_ids.to(self._device)
labels = labels.to(self._device)
diff --git a/tests/assets/tiktoken_small.model b/tests/assets/tiktoken_small.model
new file mode 100644
index 0000000000..4bfad62542
--- /dev/null
+++ b/tests/assets/tiktoken_small.model
@@ -0,0 +1,2000 @@
+AA== 0
+AQ== 1
+Ag== 2
+Aw== 3
+BA== 4
+BQ== 5
+Bg== 6
+Bw== 7
+CA== 8
+CQ== 9
+Cg== 10
+Cw== 11
+DA== 12
+DQ== 13
+Dg== 14
+Dw== 15
+EA== 16
+EQ== 17
+Eg== 18
+Ew== 19
+FA== 20
+FQ== 21
+Fg== 22
+Fw== 23
+GA== 24
+GQ== 25
+Gg== 26
+Gw== 27
+HA== 28
+HQ== 29
+Hg== 30
+Hw== 31
+IA== 32
+IQ== 33
+Ig== 34
+Iw== 35
+JA== 36
+JQ== 37
+Jg== 38
+Jw== 39
+KA== 40
+KQ== 41
+Kg== 42
+Kw== 43
+LA== 44
+LQ== 45
+Lg== 46
+Lw== 47
+MA== 48
+MQ== 49
+Mg== 50
+Mw== 51
+NA== 52
+NQ== 53
+Ng== 54
+Nw== 55
+OA== 56
+OQ== 57
+Og== 58
+Ow== 59
+PA== 60
+PQ== 61
+Pg== 62
+Pw== 63
+QA== 64
+QQ== 65
+Qg== 66
+Qw== 67
+RA== 68
+RQ== 69
+Rg== 70
+Rw== 71
+SA== 72
+SQ== 73
+Sg== 74
+Sw== 75
+TA== 76
+TQ== 77
+Tg== 78
+Tw== 79
+UA== 80
+UQ== 81
+Ug== 82
+Uw== 83
+VA== 84
+VQ== 85
+Vg== 86
+Vw== 87
+WA== 88
+WQ== 89
+Wg== 90
+Ww== 91
+XA== 92
+XQ== 93
+Xg== 94
+Xw== 95
+YA== 96
+YQ== 97
+Yg== 98
+Yw== 99
+ZA== 100
+ZQ== 101
+Zg== 102
+Zw== 103
+aA== 104
+aQ== 105
+ag== 106
+aw== 107
+bA== 108
+bQ== 109
+bg== 110
+bw== 111
+cA== 112
+cQ== 113
+cg== 114
+cw== 115
+dA== 116
+dQ== 117
+dg== 118
+dw== 119
+eA== 120
+eQ== 121
+eg== 122
+ew== 123
+fA== 124
+fQ== 125
+fg== 126
+fw== 127
+gA== 128
+gQ== 129
+gg== 130
+gw== 131
+hA== 132
+hQ== 133
+hg== 134
+hw== 135
+iA== 136
+iQ== 137
+ig== 138
+iw== 139
+jA== 140
+jQ== 141
+jg== 142
+jw== 143
+kA== 144
+kQ== 145
+kg== 146
+kw== 147
+lA== 148
+lQ== 149
+lg== 150
+lw== 151
+mA== 152
+mQ== 153
+mg== 154
+mw== 155
+nA== 156
+nQ== 157
+ng== 158
+nw== 159
+oA== 160
+oQ== 161
+og== 162
+ow== 163
+pA== 164
+pQ== 165
+pg== 166
+pw== 167
+qA== 168
+qQ== 169
+qg== 170
+qw== 171
+rA== 172
+rQ== 173
+rg== 174
+rw== 175
+sA== 176
+sQ== 177
+sg== 178
+sw== 179
+tA== 180
+tQ== 181
+tg== 182
+tw== 183
+uA== 184
+uQ== 185
+ug== 186
+uw== 187
+vA== 188
+vQ== 189
+vg== 190
+vw== 191
+wA== 192
+wQ== 193
+wg== 194
+ww== 195
+xA== 196
+xQ== 197
+xg== 198
+xw== 199
+yA== 200
+yQ== 201
+yg== 202
+yw== 203
+zA== 204
+zQ== 205
+zg== 206
+zw== 207
+0A== 208
+0Q== 209
+0g== 210
+0w== 211
+1A== 212
+1Q== 213
+1g== 214
+1w== 215
+2A== 216
+2Q== 217
+2g== 218
+2w== 219
+3A== 220
+3Q== 221
+3g== 222
+3w== 223
+4A== 224
+4Q== 225
+4g== 226
+4w== 227
+5A== 228
+5Q== 229
+5g== 230
+5w== 231
+6A== 232
+6Q== 233
+6g== 234
+6w== 235
+7A== 236
+7Q== 237
+7g== 238
+7w== 239
+8A== 240
+8Q== 241
+8g== 242
+8w== 243
+9A== 244
+9Q== 245
+9g== 246
+9w== 247
++A== 248
++Q== 249
++g== 250
++w== 251
+/A== 252
+/Q== 253
+/g== 254
+/w== 255
+IHQ= 256
+aGU= 257
+IGE= 258
+aW4= 259
+IHM= 260
+IHc= 261
+IHRoZQ== 262
+IG8= 263
+cmU= 264
+IGI= 265
+b3U= 266
+ZWQ= 267
+IG0= 268
+bmQ= 269
+IEk= 270
+aGE= 271
+aXQ= 272
+ZXI= 273
+aW5n 274
+IGY= 275
+aXM= 276
+IHRv 277
+ZW4= 278
+b24= 279
+b3I= 280
+YXM= 281
+IGM= 282
+IG9m 283
+IGFuZA== 284
+IGQ= 285
+bGw= 286
+YXQ= 287
+YW4= 288
+YXI= 289
+IHA= 290
+IG4= 291
+IGlu 292
+bGU= 293
+b20= 294
+b3Q= 295
+IGJl 296
+IGg= 297
+dXQ= 298
+b3c= 299
+ZXM= 300
+aGF0 301
+IGc= 302
+IGhl 303
+IGhh 304
+IGw= 305
+IHdhcw== 306
+bGQ= 307
+Z2g= 308
+aWQ= 309
+Y2g= 310
+IHRo 311
+IGl0 312
+YXk= 313
+IG9u 314
+Y2U= 315
+c2U= 316
+ZW50 317
+IHN0 318
+bHk= 319
+dmU= 320
+ZXQ= 321
+c3Q= 322
+IFQ= 323
+IGU= 324
+IHk= 325
+Z2h0 326
+aXI= 327
+IG1l 328
+b28= 329
+YWw= 330
+aXRo 331
+IHJl 332
+aW0= 333
+IHRoYXQ= 334
+IGFz 335
+b3VsZA== 336
+cm8= 337
+YWQ= 338
+aW9u 339
+Lgo= 340
+aGVy 341
+IG15 342
+Y3Q= 343
+IG5vdA== 344
+IHdpdGg= 345
+IGZvcg== 346
+IHU= 347
+a2U= 348
+IHlvdQ== 349
+IFM= 350
+IGlz 351
+aWdodA== 352
+Igo= 353
+YW0= 354
+aWM= 355
+dXI= 356
+IGF0 357
+Li4= 358
+YWM= 359
+dGVy 360
+IHdo 361
+IGFu 362
+IHdl 363
+IFRoZQ== 364
+aWY= 365
+IG9y 366
+IGJ1dA== 367
+dmVy 368
+ICI= 369
+IHI= 370
+b3V0 371
+b21l 372
+IGhhZA== 373
+cHA= 374
+cXU= 375
+IHN1 376
+IHRoaXM= 377
+cmVk 378
+YXJk 379
+IHNv 380
+ZWxs 381
+IHdvdWxk 382
+IGhpcw== 383
+IHNo 384
+aW5l 385
+cmE= 386
+IHNl 387
+IGJ5 388
+LiIK 389
+IFA= 390
+aGVu 391
+IEE= 392
+IGhhdmU= 393
+IGZy 394
+IHNh 395
+IEg= 396
+IG9uZQ== 397
+ZW0= 398
+a2Vk 399
+aXJ0 400
+ZWN0 401
+IGhpbQ== 402
+IGxp 403
+IGFi 404
+YXRpb24= 405
+aGluZw== 406
+dGhl 407
+IFI= 408
+IGxl 409
+c3M= 410
+IFc= 411
+Y3U= 412
+aWxs 413
+J3Q= 414
+YXJ0 415
+YWxs 416
+LAo= 417
+b3du 418
+b3Jl 419
+IGFsbA== 420
+IGs= 421
+IGdv 422
+aGlydA== 423
+YW5k 424
+IG91dA== 425
+YW1l 426
+YWlu 427
+IGlm 428
+IG5v 429
+IGRv 430
+IHRoZXk= 431
+b29s 432
+dW4= 433
+dG8= 434
+IHVw 435
+IFJlZA== 436
+IG5l 437
+IEs= 438
+IGZyb20= 439
+IFNoaXJ0 440
+IHdvcg== 441
+b25n 442
+IHRoZXJl 443
+IHNhaWQ= 444
+cmk= 445
+YW50 446
+IEI= 447
+IGFueQ== 448
+dWQ= 449
+aW5k 450
+IHdoaQ== 451
+YWI= 452
+b3VuZA== 453
+IGFib3V0 454
+IHRoZW0= 455
+Y3Vw 456
+YWs= 457
+IGRl 458
+IHRl 459
+IE0= 460
+YWtl 461
+Y3VwaW5l 462
+aWc= 463
+IHdlcmU= 464
+b3JjdXBpbmU= 465
+aWw= 466
+Y2hvb2w= 467
+IHJv 468
+b29k 469
+IGFyZQ== 470
+aXZl 471
+IGxpa2U= 472
+eW8= 473
+IGhvdQ== 474
+J3M= 475
+b25l 476
+dXM= 477
+ZWw= 478
+dWw= 479
+YWNr 480
+b3A= 481
+LCI= 482
+dGg= 483
+YWNoZXI= 484
+dW0= 485
+YW5n 486
+IGZh 487
+YWc= 488
+IHNjaG9vbA== 489
+IGo= 490
+dGU= 491
+b2s= 492
+ZXNz 493
+dXN0 494
+ZXJz 495
+Li4uLg== 496
+IEM= 497
+dGhlcg== 498
+aGFu 499
+IHdoZW4= 500
+IHNw 501
+IG1hbg== 502
+IGNhbg== 503
+b3VnaA== 504
+IHdobw== 505
+IGdldA== 506
+IGRpZA== 507
+IHBv 508
+Y2k= 509
+IGFs 510
+aXN0 511
+IGNvbQ== 512
+bGY= 513
+YXU= 514
+IFBvcmN1cGluZQ== 515
+IHdoaWNo 516
+dmVu 517
+IGFm 518
+d24= 519
+YXNz 520
+YmVy 521
+IGV4 522
+b3Vz 523
+ZXN0 524
+bG8= 525
+IHRy 526
+ZWxsb3c= 527
+IHNheQ== 528
+b3VnaHQ= 529
+IHJvb20= 530
+IHNvbWU= 531
+LS0= 532
+IE8= 533
+YXRl 534
+IHY= 535
+aGVk 536
+YXA= 537
+IHR3 538
+IGJlYw== 539
+cmVl 540
+amVjdA== 541
+a3M= 542
+IGNvbg== 543
+IGJlZW4= 544
+ZW50cw== 545
+aWRl 546
+IGNvdWxk 547
+IEc= 548
+ZXA= 549
+IHBybw== 550
+bnQ= 551
+IGhvdXNl 552
+IGFn 553
+IElm 554
+IGtu 555
+IGZlbGxvdw== 556
+IHdoYXQ= 557
+d2F5 558
+aXNo 559
+IGFt 560
+aXRl 561
+bmRlcg== 562
+aW1l 563
+IHBy 564
+IHRlYWNoZXI= 565
+YXJl 566
+IGJv 567
+IHNoZQ== 568
+IE4= 569
+aWNl 570
+YXN0 571
+dXJl 572
+aWU= 573
+IHN1Y2g= 574
+dXRlbg== 575
+dXRlbmJlcg== 576
+dXRlbmJlcmc= 577
+IHF1 578
+bG93bg== 579
+IHdy 580
+cHQ= 581
+IEhl 582
+IHN0dWQ= 583
+aGVyZQ== 584
+IG1vcmU= 585
+cnk= 586
+dHRlcg== 587
+IFk= 588
+IG1heQ== 589
+aXR5 590
+IGxvbw== 591
+IG90aGVy 592
+aGlz 593
+IFBybw== 594
+IHdpbGw= 595
+IEl0 596
+b3J0 597
+IHNob3VsZA== 598
+dmVyeQ== 599
+d2U= 600
+IHBs 601
+YXNo 602
+LiI= 603
+IGFwcA== 604
+IGRheQ== 605
+dXJu 606
+cG8= 607
+IGhlcg== 608
+ICA= 609
+bm90 610
+Y2s= 611
+IHVu 612
+aGk= 613
+dmluZw== 614
+IG9sZA== 615
+IHRpbWU= 616
+IlQ= 617
+IHdheQ== 618
+YWJsZQ== 619
+PyIK 620
+IENsb3du 621
+IG9ubHk= 622
+dWI= 623
+YWNo 624
+IG9mZg== 625
+IHRoYW4= 626
+YWxseQ== 627
+IHRoZWly 628
+YmU= 629
+a2luZw== 630
+b3RoZXI= 631
+YXJ5 632
+YW5z 633
+YXRlZA== 634
+c2VsZg== 635
+IGdvaW5n 636
+dWNo 637
+b2xs 638
+IGJhY2s= 639
+aXlv 640
+LXQ= 641
+YW5jZQ== 642
+YWRl 643
+IFByb2plY3Q= 644
+c3A= 645
+IHR3bw== 646
+IHRob3VnaHQ= 647
+c28= 648
+IHJpZ2h0 649
+IGhlYWQ= 650
+dmVk 651
+IEQ= 652
+IHByZQ== 653
+IHNlZQ== 654
+IHVz 655
+IHN0dWRlbnRz 656
+Y2lw 657
+IGRvbg== 658
+IG5pZ2h0 659
+aW5jaXA= 660
+IEtpeW8= 661
+cGw= 662
+YXJlZA== 663
+IEd1dGVuYmVyZw== 664
+IGNv 665
+IGhvdw== 666
+b21ldA== 667
+ZmY= 668
+Ikk= 669
+LC0t 670
+IGFza2Vk 671
+aW5jaXBhbA== 672
+ZXZlcg== 673
+IGFj 674
+IEY= 675
+IG1ha2U= 676
+aXR0 677
+IG1pZ2h0 678
+Z2U= 679
+bGVk 680
+IGFmdGVy 681
+aWdu 682
+IGdy 683
+IG1hZGU= 684
+ZGQ= 685
+IGtub3c= 686
+IGNvbWU= 687
+IGJy 688
+dGhpbmc= 689
+IEJ1dA== 690
+IG1hdA== 691
+IE9u 692
+b3J5 693
+Y2w= 694
+IEU= 695
+Ymxl 696
+b2c= 697
+IHlvdXI= 698
+dWxs 699
+IHdvcms= 700
+ZWFy 701
+IHRocmVl 702
+aWVk 703
+YnV0 704
+VGhl 705
+cGU= 706
+YWNl 707
+IHN0YXJ0 708
+aWNr 709
+IG92ZXI= 710
+b3Vy 711
+IG11Y2g= 712
+IHdhbnQ= 713
+aW1w 714
+IHBhcnQ= 715
+aG8= 716
+aW5r 717
+ZW5jZQ== 718
+IGRvd24= 719
+IGV2ZW4= 720
+IHByaW5jaXBhbA== 721
+bGluZw== 722
+b3VudA== 723
+YXVzZQ== 724
+IGNs 725
+IGJs 726
+LXRt 727
+b21ldGhpbmc= 728
+IGludG8= 729
+b3Jt 730
+b2t5bw== 731
+IGRpcw== 732
+IGZl 733
+IGZhY2U= 734
+Li4uLi4u 735
+cmVzcw== 736
+bWVudA== 737
+aXJl 738
+IGFy 739
+dHk= 740
+IG1v 741
+cmVhdA== 742
+IGZpcg== 743
+cGVy 744
+IG91cg== 745
+Y28= 746
+IHRoZW4= 747
+IHRvbGQ= 748
+aW5ncw== 749
+IHRha2U= 750
+IGJlZw== 751
+bmVy 752
+aXRpb24= 753
+b3Nl 754
+IG93bg== 755
+IGFnYWlu 756
+IHNlZW0= 757
+aXNl 758
+IHdhdA== 759
+Ilc= 760
+IGZhcg== 761
+YWtpbmc= 762
+Zm9yZQ== 763
+YWR5 764
+LXM= 765
+bGVzcw== 766
+IHJldA== 767
+IHNoYQ== 768
+IGNhbWU= 769
+Z2Vy 770
+IGdvb2Q= 771
+YXRoZXI= 772
+YXJr 773
+cm93 774
+IGtl 775
+J20= 776
+IGhhcw== 777
+YXRo 778
+cHBlZA== 779
+IHdlbnQ= 780
+IHRlbGw= 781
+cXVhc2g= 782
+IGVu 783
+IGZpcnN0 784
+IGhvdA== 785
+aXo= 786
+IGF3YXk= 787
+IHNvbWV0aGluZw== 788
+IHJlbQ== 789
+IHRvd24= 790
+IHNt 791
+IFRoaXM= 792
+IGJldHRlcg== 793
+IFRoZW4= 794
+d2Fz 795
+b2Y= 796
+YmFyZA== 797
+IEw= 798
+bGk= 799
+ZmU= 800
+IFRva3lv 801
+IGxvbmc= 802
+aWx5 803
+IHN1cmU= 804
+IGxvb2tlZA== 805
+dWJiYXJk 806
+Y3Rpb24= 807
+b3Jk 808
+IG1hbnk= 809
+aW91cw== 810
+IHRvbw== 811
+IGhlcmU= 812
+b3M= 813
+IHVuZGVy 814
+YXNl 815
+bmc= 816
+cGVk 817
+b2Q= 818
+bWU= 819
+IGp1c3Q= 820
+IG5vdw== 821
+aW5jZQ== 822
+IGhlYXJk 823
+IGtpbmQ= 824
+IFRoZXk= 825
+IGJlZm9yZQ== 826
+aHk= 827
+IElu 828
+IGVudA== 829
+IGJvYXJk 830
+ISI= 831
+d2FyZA== 832
+IGJlaW5n 833
+IHdlbGw= 834
+ZXJt 835
+cmllZA== 836
+IHdyb25n 837
+YWlk 838
+eHQ= 839
+IHJldHVybg== 840
+aXRlZA== 841
+IHllbg== 842
+IG1hdHRlcg== 843
+IGNhbGw= 844
+IHRhbA== 845
+IFlvdQ== 846
+Y2Vk 847
+aXNlZA== 848
+IGNoYQ== 849
+b25z 850
+IHNhbWU= 851
+IG9uY2U= 852
+ZGF5 853
+ZnQ= 854
+IHN3 855
+IGJlY2F1c2U= 856
+IHRoaW5r 857
+IHdoZXJl 858
+IE5v 859
+IEh1YmJhcmQ= 860
+IFNxdWFzaA== 861
+IGNvcA== 862
+d2l0aA== 863
+ZXJlZA== 864
+b2xsb3c= 865
+IHBsYWNl 866
+aWRk 867
+Y2Vzcw== 868
+IHNob3c= 869
+aXNoYQ== 870
+IHJh 871
+IGxldHRlcg== 872
+bmU= 873
+dmVz 874
+YXRpbmc= 875
+cmFuZw== 876
+IGFmZg== 877
+IGhhbmQ= 878
+IHNj 879
+IHBlcnM= 880
+aW50 881
+cHI= 882
+c2lkZQ== 883
+ZnRlcg== 884
+IHNheWluZw== 885
+IGxhdQ== 886
+dGhhdA== 887
+IHdpdGhvdXQ= 888
+cm9u 889
+YWly 890
+bGVjdA== 891
+IFdoYXQ= 892
+ZWx0 893
+IHdoaWxl 894
+b2dh 895
+YXBlcg== 896
+IHBl 897
+b3k= 898
+IHNhdA== 899
+aWVz 900
+IGFkZA== 901
+IGRheXM= 902
+IHNwZQ== 903
+IGhv 904
+IGFucw== 905
+IGhhcg== 906
+IFdoZW4= 907
+IGFueXRoaW5n 908
+cGVu 909
+XQo= 910
+dGFpbg== 911
+IG11c3Q= 912
+IG5ldw== 913
+bGlj 914
+IHZv 915
+aGlsZQ== 916
+Z2V0 917
+IEFz 918
+IHZlcnk= 919
+J3Jl 920
+IGV2ZXJ5 921
+YXZl 922
+PyI= 923
+YWRnZXI= 924
+IEtvZ2E= 925
+IE1y 926
+cm91Z2g= 927
+dWx0 928
+IGZvbGxvdw== 929
+dGluZw== 930
+aWZl 931
+aWRkbGU= 932
+ZnVs 933
+YW5r 934
+IFNv 935
+IHNlZW1lZA== 936
+IEFuZA== 937
+aXg= 938
+IHNldA== 939
+IGNhcmU= 940
+IHJlcw== 941
+IG5ldmVy 942
+IGZvdW5k 943
+IGxv 944
+Y2lk 945
+aW5lZA== 946
+IGNsYXNz 947
+IG15c2VsZg== 948
+YXc= 949
+IHdvbQ== 950
+YXRpb25z 951
+IGxlZnQ= 952
+IFdl 953
+IHRlYWNoZXJz 954
+Ilk= 955
+bmE= 956
+b250 957
+IGRlcw== 958
+IHRob3Nl 959
+aXJlZA== 960
+IHNlbg== 961
+eWluZw== 962
+IHRoZXNl 963
+YXo= 964
+IFRoZXJl 965
+Y2VwdA== 966
+IGRhbmc= 967
+IFU= 968
+Ikg= 969
+Ym9k 970
+Ym9keQ== 971
+IGhhdmluZw== 972
+YWxhcnk= 973
+IHdhdGNo 974
+IGdpdmU= 975
+YWdl 976
+IGl0cw== 977
+IGFwcGU= 978
+dWU= 979
+IGNvdW50 980
+IGhhcmQ= 981
+IGJlbA== 982
+b3R0 983
+IGRpc3Q= 984
+IlM= 985
+IE1hZA== 986
+LW4= 987
+cmlidXQ= 988
+Z2Vk 989
+IGF0dA== 990
+ZmVyZQ== 991
+aXRoZXI= 992
+IHVwb24= 993
+IHRlbQ== 994
+IHBlcnNvbg== 995
+bmluZw== 996
+IGNoZQ== 997
+YXJseQ== 998
+b25leQ== 999
+IHNvb24= 1000
+ZW1lbnQ= 1001
+ICg= 1002
+IHRyYW5z 1003
+IGV4cA== 1004
+IHNlcg== 1005
+IHJlZw== 1006
+YXNvbg== 1007
+IHNhdw== 1008
+IG5leHQ= 1009
+b290 1010
+IGhhbGY= 1011
+IHRvb2s= 1012
+IGJhZA== 1013
+IGhvdXI= 1014
+IHNhbGFyeQ== 1015
+IGJlZ2Fu 1016
+cmlnaHQ= 1017
+b25uYQ== 1018
+LXNhbg== 1019
+IHdvcmtz 1020
+IEo= 1021
+Zm9ybQ== 1022
+aWNhbA== 1023
+IHRyYQ== 1024
+bWFu 1025
+IG5vdGhpbmc= 1026
+IHN0aWxs 1027
+ZWFycw== 1028
+IHN1cHA= 1029
+IHR1cm4= 1030
+IGZlbHQ= 1031
+IHdvbWFu 1032
+IHN0YXJ0ZWQ= 1033
+b3VibGU= 1034
+dXJh 1035
+aXNoaW5n 1036
+Ogo= 1037
+bGVjdHJvbg== 1038
+bGVjdHJvbmlj 1039
+b29r 1040
+IGNvcHk= 1041
+IGZ1bGw= 1042
+Y29uZA== 1043
+bWF0 1044
+IG1pZGRsZQ== 1045
+IGxvb2s= 1046
+IGNvbW0= 1047
+d2VyZWQ= 1048
+IGJlY2FtZQ== 1049
+IGZlbGxvd3M= 1050
+d291bGQ= 1051
+IGdvdA== 1052
+IGds 1053
+IGd1 1054
+IGtlZXA= 1055
+IGdl 1056
+IE1hZG9ubmE= 1057
+aXRlcg== 1058
+aXNoZWQ= 1059
+IHVuZGVyc3Q= 1060
+IHN0cmE= 1061
+c2lk 1062
+IGNvdW50cnk= 1063
+b3BsZQ== 1064
+IHByb3Y= 1065
+IHB1dA== 1066
+bm8= 1067
+J2xs 1068
+IHNsZQ== 1069
+cmFuZ2U= 1070
+IFNoZQ== 1071
+cG9z 1072
+IG1pbmQ= 1073
+IHBhc3M= 1074
+IHRocm91Z2g= 1075
+IHF1aXRl 1076
+IGluZA== 1077
+IGJvYXJkaW5n 1078
+dGVhY2hlcg== 1079
+cGxl 1080
+UG9yY3VwaW5l 1081
+IHBsZQ== 1082
+IGdlaXNoYQ== 1083
+ICAgIA== 1084
+b3N0 1085
+ZW5zZQ== 1086
+Tm8= 1087
+aWJsZQ== 1088
+IHJlYWQ= 1089
+IHJlZA== 1090
+ZW50aW9u 1091
+ZW5lZA== 1092
+ISIK 1093
+IHJlZg== 1094
+IGFk 1095
+IGZs 1096
+IHN0YXk= 1097
+dXA= 1098
+IHJvdW5k 1099
+IGNsZQ== 1100
+IG9wZW4= 1101
+IG9i 1102
+dGVuZA== 1103
+IGZpbmQ= 1104
+IHBlcg== 1105
+IGNhbGxlZA== 1106
+IHN1cg== 1107
+cmV3 1108
+IHBhcGVy 1109
+IEJhZGdlcg== 1110
+IG1lZXQ= 1111
+aXNz 1112
+IlRoYXQ= 1113
+ZXJtcw== 1114
+VEU= 1115
+aXR0ZW4= 1116
+YWJseQ== 1117
+bmVzcw== 1118
+IGNhbm5vdA== 1119
+IHNpbXA= 1120
+Y29u 1121
+IHJlYXNvbg== 1122
+eW91 1123
+IGhvbWU= 1124
+Ynk= 1125
+IGZpZ2h0 1126
+aXR0bGU= 1127
+IHRoaW5ncw== 1128
+IGVhcw== 1129
+IGltcA== 1130
+cmVzc2Vk 1131
+IG1lYW4= 1132
+IGFwcGVhcmVk 1133
+IG5hdA== 1134
+IGhlbA== 1135
+cmV0 1136
+YWtlbg== 1137
+IHN0cmFpZ2h0 1138
+IGFmZmFpcg== 1139
+aXRpbmc= 1140
+IGVk 1141
+IHNpbmNl 1142
+bG9n 1143
+IHBheQ== 1144
+IGZyb250 1145
+bXk= 1146
+IHZvaWNl 1147
+cmVhZHk= 1148
+IGZvb2w= 1149
+b3VuZGF0aW9u 1150
+IGVsZWN0cm9uaWM= 1151
+IHRlcm1z 1152
+IG1hcg== 1153
+YXBhbg== 1154
+YW55 1155
+IHJlc3A= 1156
+IGVuZA== 1157
+YXBw 1158
+d2hhdA== 1159
+c3Ry 1160
+cmFw 1161
+aWFs 1162
+aWN1bA== 1163
+IGFjYw== 1164
+b3Ro 1165
+IHNlY29uZA== 1166
+IGZsbw== 1167
+IHNpeA== 1168
+IGZlZXQ= 1169
+YnI= 1170
+aWV0 1171
+IGxpdHRsZQ== 1172
+bGVz 1173
+IG1vbmV5 1174
+IGRlY2w= 1175
+IGV5 1176
+IGNvbXA= 1177
+YXJpbmc= 1178
+IGFncmU= 1179
+d2hlcmU= 1180
+IFN0 1181
+IHN0cmU= 1182
+ZXg= 1183
+cmFjdA== 1184
+IGludA== 1185
+IGRpcmU= 1186
+IGJlY29tZQ== 1187
+IGhvbg== 1188
+IGNvbnNpZA== 1189
+ZXJ0YWlu 1190
+bm93 1191
+IHNs 1192
+aXRvcg== 1193
+Z2c= 1194
+IGp1bQ== 1195
+IGJ1 1196
+IHRoaW5n 1197
+IGFuc3dlcmVk 1198
+b2Vz 1199
+eWE= 1200
+IFRoYXQ= 1201
+aXpl 1202
+b25k 1203
+YWN0 1204
+IGVmZg== 1205
+IGJhbmc= 1206
+YWJvdXQ= 1207
+IGJlZA== 1208
+b3Jyb3c= 1209
+dW5n 1210
+IFRv 1211
+IGtlcHQ= 1212
+IHdhbA== 1213
+IGJhdGg= 1214
+IGRyYQ== 1215
+IkE= 1216
+cmluZ3M= 1217
+aG9wcA== 1218
+IHJlc2lnbg== 1219
+IGRpbg== 1220
+IGxhZHk= 1221
+LkU= 1222
+IHVzZQ== 1223
+bGlzaA== 1224
+b3Jz 1225
+IHdyaXR0ZW4= 1226
+ZW5l 1227
+aXY= 1228
+IGRpZg== 1229
+IHN0ZQ== 1230
+IHN0b3J5 1231
+Y29t 1232
+cmVz 1233
+ZW50bHk= 1234
+IGZhY3Q= 1235
+aGVz 1236
+d2F5cw== 1237
+IHdoeQ== 1238
+IHRob3VnaA== 1239
+IHN0cg== 1240
+b25kZXI= 1241
+aGVhZA== 1242
+IGNvdXI= 1243
+IG1vbg== 1244
+IHNr 1245
+IGJlbGll 1246
+IGxldA== 1247
+ZmVy 1248
+IHJlcXU= 1249
+IGxpbmU= 1250
+cm9vbQ== 1251
+LWRheQ== 1252
+IGRvbmU= 1253
+IGRvZXM= 1254
+IE9uZQ== 1255
+IGRhbmdv 1256
+YXNzaG9wcA== 1257
+IGNvbnNpZGVy 1258
+IGRpbm5lcg== 1259
+IEZvdW5kYXRpb24= 1260
+Kio= 1261
+ZW1wdA== 1262
+ZXNl 1263
+IHdvcmQ= 1264
+cmVzdA== 1265
+IGVub3VnaA== 1266
+IGdyZWF0 1267
+IG5hbWU= 1268
+IHB1Yg== 1269
+IG1hbm5lcg== 1270
+d2Vy 1271
+aWN0 1272
+aW5lc3M= 1273
+IGhpbXNlbGY= 1274
+IHBlb3BsZQ== 1275
+ZXc= 1276
+IGNvcg== 1277
+ZXN0aW9u 1278
+IGJpZw== 1279
+ZWU= 1280
+IHJp 1281
+aWRlcw== 1282
+IGJyb3RoZXI= 1283
+IGhlYXJ0 1284
+ZWN0ZWQ= 1285
+ZWVk 1286
+IG90aGVycw== 1287
+c29s 1288
+dGVk 1289
+IGV5ZXM= 1290
+IHRyb3VibGU= 1291
+IHRlYWNo 1292
+IGJvYXQ= 1293
+IGZvdXI= 1294
+IGFscmVhZHk= 1295
+cm9t 1296
+Z2hlZA== 1297
+IHNxdQ== 1298
+IHBvbA== 1299
+Y2Vz 1300
+IEhvdHQ= 1301
+IGxlYXZl 1302
+IGRpc3RyaWJ1dA== 1303
+YXN0ZXI= 1304
+Q0g= 1305
+dWM= 1306
+IGlt 1307
+IGhvd2V2ZXI= 1308
+dGhlcmU= 1309
+YXBhbmVzZQ== 1310
+IGxhc3Q= 1311
+IGNy 1312
+aWxpdHk= 1313
+IHNpbXBsZQ== 1314
+IGxpZmU= 1315
+LWM= 1316
+IHJlZ2FyZA== 1317
+IGZpbg== 1318
+dWFs 1319
+IG1lYW5z 1320
+IHN0YW5k 1321
+YXRjaA== 1322
+IHNob3J0 1323
+bmVk 1324
+IHNlZW4= 1325
+IGhhcHA= 1326
+LWs= 1327
+IGFnYWluc3Q= 1328
+aGlt 1329
+YW1lZA== 1330
+IHN0b29k 1331
+IGdyYQ== 1332
+IG1vdGhlcg== 1333
+IGZpc2g= 1334
+IHdhdGVy 1335
+YWls 1336
+Y2Vp 1337
+IHJhdGhlcg== 1338
+IGlucw== 1339
+IGZlZWw= 1340
+IGFsc28= 1341
+IG9yZA== 1342
+IGNvbWluZw== 1343
+aWNz 1344
+IGVpdGhlcg== 1345
+bmNl 1346
+ICc= 1347
+IGtpZA== 1348
+IGxhdWdoZWQ= 1349
+bGlrZQ== 1350
+IEFy 1351
+Z3I= 1352
+IEhvdHRh 1353
+IHRhbGs= 1354
+Z2V0aGVy 1355
+IFNpcg== 1356
+IHB1bg== 1357
+UHJv 1358
+YXRz 1359
+bW9zdA== 1360
+IHJlcA== 1361
+IGdp 1362
+aXNm 1363
+YmFibHk= 1364
+YWtlcw== 1365
+IE5vdA== 1366
+bnk= 1367
+IGFwcGVhcg== 1368
+bXA= 1369
+Y2hh 1370
+IGFjdA== 1371
+YmVk 1372
+aWVm 1373
+dWZm 1374
+IGFwbw== 1375
+IG1ldA== 1376
+IHJldHVybmVk 1377
+IHNvdW5k 1378
+dXNpbmVzcw== 1379
+IGxhdWdo 1380
+IGNsZWFy 1381
+IG5lZWQ= 1382
+ZmVzcw== 1383
+ZXN0ZWQ= 1384
+IGludg== 1385
+IGFjY2VwdA== 1386
+dW5kZXI= 1387
+Owo= 1388
+IHN1cnBy 1389
+ZGU= 1390
+IHRyYWlu 1391
+IGhvdGVs 1392
+IHNsZWVw 1393
+IGRy 1394
+IGhvbGQ= 1395
+bG9jaw== 1396
+cHVyYQ== 1397
+IHNwcmluZ3M= 1398
+IC4uLi4uLg== 1399
+IGFncmVlbWVudA== 1400
+IERhcg== 1401
+IHJlc3Q= 1402
+Y2x1ZA== 1403
+YXRvcg== 1404
+YXY= 1405
+IG9yaWc= 1406
+IG9yaWdpbg== 1407
+IGVs 1408
+IG5vcg== 1409
+IHByZXM= 1410
+IHVuZGVyc3RhbmQ= 1411
+IHRha2Vu 1412
+IGxpZ2h0 1413
+ZW5lcg== 1414
+c29tZQ== 1415
+IGJyb3VnaHQ= 1416
+cmFwaA== 1417
+IG1vc3Q= 1418
+b2tl 1419
+LXc= 1420
+IHVudA== 1421
+IGZhdGhlcg== 1422
+IHVzZWQ= 1423
+IGVhdA== 1424
+IHllYXJz 1425
+IFdoaWxl 1426
+IGNoYW4= 1427
+IHN1ZGQ= 1428
+IHN1ZGRlbg== 1429
+IGFwb2xvZw== 1430
+IHNldHQ= 1431
+IHRoaW4= 1432
+IE15 1433
+IHRlbg== 1434
+aW1lcw== 1435
+Zm9y 1436
+b3Vk 1437
+V2hlbg== 1438
+IGRldA== 1439
+IGxpdmU= 1440
+IG9j 1441
+IGZpdmU= 1442
+IGNvbnQ= 1443
+IGhlbHA= 1444
+IHdh 1445
+IHBhc3NlZA== 1446
+IHJ1bg== 1447
+IG1ha2luZw== 1448
+IHN0cmFuZ2U= 1449
+IHRha2luZw== 1450
+IGVhY2g= 1451
+IllvdQ== 1452
+IGFub3RoZXI= 1453
+IlNheQ== 1454
+IlRoZQ== 1455
+YXRlcw== 1456
+IHBsZWFz 1457
+YXNzaG9wcGVycw== 1458
+IG1vbQ== 1459
+IG1vbWVudA== 1460
+ZW50bGU= 1461
+bmdsaXNo 1462
+Q0hB 1463
+IG9yaWdpbmFs 1464
+aW9ucw== 1465
+dXJpbmc= 1466
+IHB1YmxpYw== 1467
+dWN0 1468
+dWNr 1469
+IHF1ZXN0aW9u 1470
+YWk= 1471
+Y3k= 1472
+ZWs= 1473
+IGZsb29y 1474
+IGNhcg== 1475
+b3VzZQ== 1476
+IHNpZGU= 1477
+LXlh 1478
+IGNlcnRhaW4= 1479
+aHlz 1480
+LWQ= 1481
+aWdo 1482
+YWdpbg== 1483
+d2VldA== 1484
+IHBvb3I= 1485
+IGRlY2lk 1486
+dWFsbHk= 1487
+IGJ1c2luZXNz 1488
+cHJv 1489
+cGxhaW4= 1490
+IHN0b3A= 1491
+IQo= 1492
+IEhvdw== 1493
+IldoYXQ= 1494
+Y2Fu 1495
+IFVu 1496
+cHM= 1497
+dW5k 1498
+LW5pZ2h0 1499
+IG1lZXRpbmc= 1500
+ZWRv 1501
+IHJhaXNl 1502
+R3V0ZW5iZXJn 1503
+IERhcmxpbmc= 1504
+dW1l 1505
+IEVuZ2xpc2g= 1506
+VEVS 1507
+YWRpbmc= 1508
+IHRyYW5zbA== 1509
+IGFibGU= 1510
+c3NpYmxl 1511
+IHNhdGlzZg== 1512
+IHdhbnRlZA== 1513
+IHN1Yg== 1514
+IGNhc2U= 1515
+aWZpYw== 1516
+aXRlcmFyeQ== 1517
+IG1haWQ= 1518
+IGluYw== 1519
+IHBvcw== 1520
+IHBvc2l0aW9u 1521
+IHBhdA== 1522
+dXJlZA== 1523
+b3JyeQ== 1524
+IGFjY291bnQ= 1525
+IGJvdGg= 1526
+IGZyaWU= 1527
+IGZyaWVuZA== 1528
+dGhpcw== 1529
+IGFsd2F5cw== 1530
+IHBhcnRpY3Vs 1531
+V2hhdA== 1532
+IHNtYWxs 1533
+ZW50eQ== 1534
+dXNoZWQ= 1535
+IG1pcw== 1536
+dWxseQ== 1537
+IHJlY2Vp 1538
+WW91 1539
+IHlldA== 1540
+IGdhdmU= 1541
+QnV0 1542
+aGFk 1543
+IGFuc3dlcg== 1544
+IGFicw== 1545
+aWxl 1546
+Y2tldA== 1547
+IG5vb2Q= 1548
+IGNvdXJzZQ== 1549
+IGZvcm0= 1550
+IGV2ZXJ5dGhpbmc= 1551
+ZWN0aW9u 1552
+SWY= 1553
+cGFydA== 1554
+IHNpbmc= 1555
+IHNpdA== 1556
+IHB1cg== 1557
+aXA= 1558
+IGZpc2hpbmc= 1559
+IGVo 1560
+IHBhcg== 1561
+IHRvZ2V0aGVy 1562
+SGU= 1563
+IHdoZQ== 1564
+IHdoZXRoZXI= 1565
+IGJyYQ== 1566
+Illlcw== 1567
+IHB1bmlzaA== 1568
+U2hpcnQ= 1569
+IFllZG8= 1570
+IGZhcmV3 1571
+IGZhcmV3ZWxs 1572
+IGRhbmNl 1573
+IGxlc3M= 1574
+dXJhbA== 1575
+IGRlZg== 1576
+IGF0dGVtcHQ= 1577
+d2Vlbg== 1578
+IHNpZ24= 1579
+IHN5 1580
+ZmVyZW50 1581
+IGxlYXN0 1582
+c2Vy 1583
+b2I= 1584
+bmRpbmc= 1585
+IHNvcnJ5 1586
+IGp1bXBlZA== 1587
+IGphbg== 1588
+IGphbml0b3I= 1589
+aXplZA== 1590
+IHRvd2FyZA== 1591
+IG1vcg== 1592
+YXZpbmc= 1593
+IGJpdA== 1594
+IlRoaXM= 1595
+IHJlbWFyaw== 1596
+IGZ1dA== 1597
+IHdvbmRlcg== 1598
+IGZ1bg== 1599
+VGhlbg== 1600
+IGRlYw== 1601
+IHdob20= 1602
+IGRpZG4= 1603
+IHJlYw== 1604
+YmVj 1605
+Iklm 1606
+IGtuZXc= 1607
+YWZ0ZXI= 1608
+IHRodXM= 1609
+IGlzbg== 1610
+IHNpZ2h0 1611
+bWVk 1612
+W0Y= 1613
+dXNz 1614
+Y2lkZW50 1615
+dGhlbQ== 1616
+IGZpZg== 1617
+IGRyYXc= 1618
+IGhlYXI= 1619
+IHdyaXRpbmc= 1620
+IGdldHRpbmc= 1621
+c2g= 1622
+ZmVyZW5jZQ== 1623
+IHJhaXNlZA== 1624
+dGhleQ== 1625
+YXg= 1626
+IGZpbmU= 1627
+c2Vs 1628
+IE5vYmU= 1629
+IE5vYmVvaw== 1630
+IE5vYmVva2E= 1631
+b3JtYWw= 1632
+IGVC 1633
+aWNlbnNl 1634
+MDA= 1635
+IGJlc3Q= 1636
+d29y 1637
+Zmlj 1638
+dGVyZXN0 1639
+IHJlbWFy 1640
+Ymw= 1641
+YXJ0ZWQ= 1642
+IGRhcms= 1643
+IHlvdW5n 1644
+dXNo 1645
+IGJldA== 1646
+b3V0aA== 1647
+aG91c2U= 1648
+YXVnaHQ= 1649
+IHBoeXM= 1650
+IHN0cm9uZw== 1651
+IGZ1cg== 1652
+IHJvbGw= 1653
+Y292ZQ== 1654
+Y2hpZWY= 1655
+YXdh 1656
+IGZvbGxvd2Vk 1657
+IGZvbmQ= 1658
+IGZ1dHVyZQ== 1659
+aXJk 1660
+ZnVsbHk= 1661
+IGVmZm9ydA== 1662
+QWZ0ZXI= 1663
+b3dhcmQ= 1664
+IHJlYWxseQ== 1665
+IGFtb25n 1666
+IGFyb3VuZA== 1667
+IGNvbXBs 1668
+IGdheg== 1669
+IGJvdw== 1670
+YXRlcg== 1671
+IGluc2lzdA== 1672
+IHR1cm5lZA== 1673
+aGVs 1674
+cmVt 1675
+IGhvdXJz 1676
+IGRlY2lkZWQ= 1677
+eXM= 1678
+IG1vbnRo 1679
+LWE= 1680
+IGFkdg== 1681
+IGJlbGlldmU= 1682
+IHRlYWNoaW5n 1683
+IGVhc3k= 1684
+IGRpcmVjdGlvbg== 1685
+b29rZWQ= 1686
+IHdhcg== 1687
+IHVubGVzcw== 1688
+aGF2ZQ== 1689
+IHNxdWFyZQ== 1690
+dmls 1691
+IHF1aWV0 1692
+IGh1bmc= 1693
+IGdvZXM= 1694
+IHBhaWQ= 1695
+IHNoYWxs 1696
+Ik5v 1697
+IHB1bmlzaG1lbnQ= 1698
+cG9zZQ== 1699
+IHN3ZWV0 1700
+J3Zl 1701
+IldlbGw= 1702
+IGdlbnRsZQ== 1703
+IG5vcm1hbA== 1704
+YWdyYXBo 1705
+Y2hpdmU= 1706
+Y2hhbg== 1707
+IGluY2x1ZA== 1708
+d3c= 1709
+b3Jn 1710
+dGVt 1711
+QVI= 1712
+IFRI 1713
+IGVxdQ== 1714
+IHRvbmU= 1715
+IHBvc3NpYmxl 1716
+IGJlY29t 1717
+IEphcGFuZXNl 1718
+dmVycw== 1719
+IGZvbGxvd2luZw== 1720
+IHBhaW4= 1721
+IHdob2xl 1722
+d3I= 1723
+IHNlcmlvdXM= 1724
+IG5hcg== 1725
+IHRpcmVk 1726
+SW4= 1727
+IHBsYXk= 1728
+IHByb20= 1729
+IGdhbWU= 1730
+IFNvbWU= 1731
+IGhhcHBlbmVk 1732
+IGN1dA== 1733
+IHR3ZW50eQ== 1734
+IGRvb3I= 1735
+IG1vcm5pbmc= 1736
+aGluZA== 1737
+IGJyZQ== 1738
+IGluc2lkZQ== 1739
+b3Zl 1740
+YWx0aA== 1741
+dWs= 1742
+YXJnZQ== 1743
+YW1i 1744
+IGRhbQ== 1745
+IHdvcnJ5 1746
+YXRpdmU= 1747
+IGV4cGVjdGVk 1748
+IGZhbQ== 1749
+IHByYQ== 1750
+IHBvY2tldA== 1751
+b29rcw== 1752
+Y2hlZA== 1753
+IHNpbA== 1754
+b2w= 1755
+IGZhdg== 1756
+IGVsc2U= 1757
+IGhpZ2g= 1758
+IHJlYWw= 1759
+IGFsb25n 1760
+IG1lZA== 1761
+aGlr 1762
+aGVtYXQ= 1763
+aGVtYXRpY3M= 1764
+IGxpc3Q= 1765
+IHNpY2s= 1766
+b2ludA== 1767
+W0Zvb3Q= 1768
+W0Zvb3Rub3Q= 1769
+W0Zvb3Rub3Rl 1770
+Ll0K 1771
+bmlnaHQ= 1772
+c2Vz 1773
+aW9y 1774
+IHNheXM= 1775
+IG1vdXRo 1776
+aG93 1777
+bWluZw== 1778
+IGNsbw== 1779
+IGN1cg== 1780
+Z2luZw== 1781
+IHN1ZGRlbmx5 1782
+LWFo 1783
+YW1w 1784
+IGJsYWNr 1785
+cm9zcw== 1786
+IGZhYw== 1787
+c2VsdmVz 1788
+aWV3 1789
+aXNzaW9u 1790
+IGNvcHlyaWdodA== 1791
+IHBhcmFncmFwaA== 1792
+IEFyY2hpdmU= 1793
+IGRvbmF0aW9ucw== 1794
+UHJvamVjdA== 1795
+IGNvc3Q= 1796
+Lm9yZw== 1797
+TEk= 1798
+dWNlZA== 1799
+IHN1Yw== 1800
+eWxl 1801
+IGZvcmNl 1802
+am95 1803
+b3VjaA== 1804
+dHI= 1805
+SXQ= 1806
+IHRyYWQ= 1807
+IHByZXNlbnQ= 1808
+IGV4dA== 1809
+YXNlZA== 1810
+cmVkaXQ= 1811
+IGZhdWx0 1812
+aWI= 1813
+LW0= 1814
+dXJk 1815
+IHRyaWVk 1816
+dGltZQ== 1817
+IHByZXQ= 1818
+IHNwZWU= 1819
+b3dlcg== 1820
+IHdvcmRz 1821
+Q0hBUA== 1822
+Q0hBUFRFUg== 1823
+c2Nob29s 1824
+IGFzaw== 1825
+IGRvaW5n 1826
+YXRlbHk= 1827
+IHVudGls 1828
+Ym91dA== 1829
+IHRyZWU= 1830
+Y2FsbA== 1831
+YW1hc2g= 1832
+YW1hc2hpcg== 1833
+YW1hc2hpcm8= 1834
+c3Rl 1835
+IGJlaGluZA== 1836
+b2xk 1837
+IHdhbGw= 1838
+aXRvcnk= 1839
+IHJvbGxlZA== 1840
+IG1vdmU= 1841
+IGFwb2xvZ2l6ZQ== 1842
+IGxhcmdl 1843
+YW1ib28= 1844
+c3U= 1845
+IHNldHRsZWQ= 1846
+Ikhl 1847
+d28= 1848
+IHRoaW5raW5n 1849
+dXNlZA== 1850
+aWZpZWQ= 1851
+IGFsbW9zdA== 1852
+IHRyZQ== 1853
+IHRyZWF0 1854
+IG5vb2RsZQ== 1855
+IG5vdGU= 1856
+IEFsbA== 1857
+IGJlYXQ= 1858
+IG9iamVjdA== 1859
+IHNlZW1z 1860
+IGlkZQ== 1861
+WWVz 1862
+b3dz 1863
+IHJlbWFpbg== 1864
+IGJlZ2lu 1865
+dWdodA== 1866
+bWVudHM= 1867
+IGFsb25l 1868
+c3BlY3Q= 1869
+IG1hdGhlbWF0aWNz 1870
+IHJvdWdo 1871
+IG91dHNpZGU= 1872
+IGNvbWVz 1873
+YmFjaw== 1874
+IHdpbmQ= 1875
+c2Vk 1876
+IHdvdWxkbg== 1877
+ZWVy 1878
+aW51dA== 1879
+ZnJvbQ== 1880
+IHJlcGw= 1881
+IG5hcnJvdw== 1882
+IGluY2lkZW50 1883
+IGFpcg== 1884
+IHNlYQ== 1885
+dHM= 1886
+IHN1cnByaXNlZA== 1887
+IHRlYQ== 1888
+UmVk 1889
+IHRhbGtpbmc= 1890
+IGJvc3M= 1891
+cXVl 1892
+IHBpY3Q= 1893
+aXJ0eQ== 1894
+IGNl 1895
+IGxpbQ== 1896
+IFdoeQ== 1897
+IHBvaW50 1898
+IGxhdw== 1899
+Y2lhdGVk 1900
+IG1vb24= 1901
+aXJjdQ== 1902
+Z290 1903
+IElz 1904
+IGhhbmRz 1905
+IGhvbm9y 1906
+YXV0 1907
+cmdl 1908
+IHN0YXRl 1909
+IExpdGVyYXJ5 1910
+LkY= 1911
+VGhpcw== 1912
+bGluZQ== 1913
+Lmc= 1914
+Lmd1dGVuYmVyZw== 1915
+IE9G 1916
+RU4= 1917
+cmFjdGVy 1918
+IGJlbmU= 1919
+IEV2ZW4= 1920
+b3Vi 1921
+IG1ha2Vz 1922
+IGludGVyZXN0 1923
+b3Bl 1924
+bXM= 1925
+IHJlc3BvbnM= 1926
+IGZvcmU= 1927
+IHNvbWV3aGF0 1928
+IGhvbmVzdA== 1929
+b2Nr 1930
+aXJpdA== 1931
+IGhlbGQ= 1932
+IGFkZGVk 1933
+ZnU= 1934
+YWRlZA== 1935
+YWxz 1936
+YXR0 1937
+dGVybg== 1938
+IHBlcnNvbmFs 1939
+IGFzcw== 1940
+IFdpdGg= 1941
+dGlj 1942
+VG9reW8= 1943
+IHNob3V0 1944
+IHByZXR0eQ== 1945
+dW1i 1946
+IGVhcmx5 1947
+b3BwZWQ= 1948
+IGZ1cnRoZXI= 1949
+IGZyZQ== 1950
+ZXNpZGVz 1951
+IGJhbWJvbw== 1952
+IGly 1953
+bW9yZQ== 1954
+IGxpdmluZw== 1955
+IHJlY2VpdmVk 1956
+IGxpdmVk 1957
+IG1lYW50 1958
+IGNvd2FyZA== 1959
+cG9zaXRpb24= 1960
+IGxvYw== 1961
+aWxlZA== 1962
+IHRlbmRlcg== 1963
+IGNo 1964
+IEFmdGVy 1965
+Y2Vy 1966
+IGZhdm9y 1967
+d2hv 1968
+IGxpa2Vk 1969
+cmFuY2U= 1970
+IHByaQ== 1971
+a2lzaGE= 1972
+IHN0dWR5 1973
+IG9yZGVy 1974
+IGFmdGVyd2FyZA== 1975
+IGdyZWF0bHk= 1976
+IHVuYWJsZQ== 1977
+Z28= 1978
+IHdhaXQ= 1979
+ZXBpbmc= 1980
+aWRpbmc= 1981
+IGZvcnR5 1982
+IHNreQ== 1983
+IG9mZmljZQ== 1984
+d2lsbA== 1985
+IkQ= 1986
+d2Vs 1987
+IHN0YXRpb24= 1988
+Ym8= 1989
+aG90 1990
+c3VjaA== 1991
+IGxvdWQ= 1992
+IGF3 1993
+bGFuZA== 1994
+Pwo= 1995
+IHJlc3BlY3Q= 1996
+YW5jZXM= 1997
+aWVudA== 1998
+IG91Z2h0 1999
diff --git a/tests/test_utils.py b/tests/test_utils.py
index c575134e8f..8c2536a15d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -18,7 +18,7 @@
import torch
from torch import nn
-from torchtune.modules import Tokenizer
+from torchtune.modules.tokenizers import SentencePieceTokenizer
skip_if_cuda_not_available = unittest.skipIf(
not torch.cuda.is_available(), "CUDA is not available"
@@ -39,8 +39,8 @@ def torch_version_ge(version: str) -> bool:
return version in torch.__version__ or torch.__version__ >= version
-# Inherit from tokenizer class to reuse its tokenize_messages method
-class DummyTokenizer(Tokenizer):
+# Inherit from SentencePieceTokenizer class to reuse its tokenize_messages method
+class DummyTokenizer(SentencePieceTokenizer):
def __init__(self):
self.encodes_whitespace = False
diff --git a/tests/torchtune/datasets/test_alpaca_dataset.py b/tests/torchtune/datasets/test_alpaca_dataset.py
index 9b9cb56b07..2a05cefd06 100644
--- a/tests/torchtune/datasets/test_alpaca_dataset.py
+++ b/tests/torchtune/datasets/test_alpaca_dataset.py
@@ -10,9 +10,8 @@
from tests.test_utils import get_assets_path
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
-
from torchtune.datasets import alpaca_cleaned_dataset, alpaca_dataset
-from torchtune.modules.tokenizer import Tokenizer
+from torchtune.modules.tokenizers import SentencePieceTokenizer
class TestAlpacaDataset:
@@ -20,7 +19,7 @@ class TestAlpacaDataset:
def tokenizer(self):
# m.model is a pretrained Sentencepiece model using the following command:
# spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000')
- return Tokenizer.from_file(str(get_assets_path() / "m.model"))
+ return SentencePieceTokenizer(str(get_assets_path() / "m.model"))
@patch("torchtune.datasets._instruct.load_dataset")
def test_label_no_masking(self, load_dataset, tokenizer):
diff --git a/tests/torchtune/datasets/test_grammar_dataset.py b/tests/torchtune/datasets/test_grammar_dataset.py
index 5fb41d39eb..20c209f004 100644
--- a/tests/torchtune/datasets/test_grammar_dataset.py
+++ b/tests/torchtune/datasets/test_grammar_dataset.py
@@ -12,7 +12,7 @@
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
from torchtune.datasets import grammar_dataset
-from torchtune.modules.tokenizer import Tokenizer
+from torchtune.modules.tokenizers import SentencePieceTokenizer
class TestGrammarDataset:
@@ -20,7 +20,7 @@ class TestGrammarDataset:
def tokenizer(self):
# m.model is a pretrained Sentencepiece model using the following command:
# spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000')
- return Tokenizer.from_file(str(get_assets_path() / "m.model"))
+ return SentencePieceTokenizer(str(get_assets_path() / "m.model"))
@patch("torchtune.datasets._instruct.load_dataset")
def test_label_no_masking(self, load_dataset, tokenizer):
diff --git a/tests/torchtune/datasets/test_samsum_dataset.py b/tests/torchtune/datasets/test_samsum_dataset.py
index 972b8bbb25..6ec6a52679 100644
--- a/tests/torchtune/datasets/test_samsum_dataset.py
+++ b/tests/torchtune/datasets/test_samsum_dataset.py
@@ -12,7 +12,7 @@
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
from torchtune.datasets import samsum_dataset
-from torchtune.modules.tokenizer import Tokenizer
+from torchtune.modules.tokenizers import SentencePieceTokenizer
class TestSamsumDataset:
@@ -20,7 +20,7 @@ class TestSamsumDataset:
def tokenizer(self):
# m.model is a pretrained Sentencepiece model using the following command:
# spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000')
- return Tokenizer.from_file(str(get_assets_path() / "m.model"))
+ return SentencePieceTokenizer(str(get_assets_path() / "m.model"))
@patch("torchtune.datasets._instruct.load_dataset")
def test_label_no_masking(self, load_dataset, tokenizer):
diff --git a/tests/torchtune/datasets/test_slimorca_dataset.py b/tests/torchtune/datasets/test_slimorca_dataset.py
index 725b60d49d..03a8396271 100644
--- a/tests/torchtune/datasets/test_slimorca_dataset.py
+++ b/tests/torchtune/datasets/test_slimorca_dataset.py
@@ -10,7 +10,7 @@
from tests.test_utils import get_assets_path
from torchtune.datasets import slimorca_dataset
-from torchtune.modules.tokenizer import Tokenizer
+from torchtune.modules.tokenizers import SentencePieceTokenizer
class TestSlimOrcaDataset:
@@ -18,7 +18,7 @@ class TestSlimOrcaDataset:
def tokenizer(self):
# m.model is a pretrained Sentencepiece model using the following command:
# spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000')
- return Tokenizer.from_file(str(get_assets_path() / "m.model"))
+ return SentencePieceTokenizer(str(get_assets_path() / "m.model"))
@patch("torchtune.datasets._chat.load_dataset")
def test_value_error(self, load_dataset, tokenizer):
diff --git a/tests/torchtune/models/test_llama3.py b/tests/torchtune/models/test_llama3.py
new file mode 100644
index 0000000000..190eaf413e
--- /dev/null
+++ b/tests/torchtune/models/test_llama3.py
@@ -0,0 +1,46 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pytest
+import torch
+from tests.test_utils import fixed_init_model
+from torchtune.models.llama3 import llama3
+from torchtune.utils.seed import set_seed
+
+EMBED_DIM = 128
+NUM_LAYERS = 4
+NUM_HEADS = 16
+NUM_KV_HEADS = 8
+VOCAB_SIZE = 32000
+MAX_SEQ_LEN = 2048
+BSZ = 2
+SEQ_LEN = 100
+
+
+@pytest.fixture(autouse=True)
+def random():
+ set_seed(16)
+
+
+class TestLlama3:
+ @pytest.fixture
+ def inputs(self):
+ return torch.randint(0, VOCAB_SIZE, (BSZ, SEQ_LEN))
+
+ def test_forward(self, inputs):
+ model = llama3(
+ vocab_size=VOCAB_SIZE,
+ num_layers=NUM_LAYERS,
+ num_heads=NUM_HEADS,
+ num_kv_heads=NUM_KV_HEADS,
+ embed_dim=EMBED_DIM,
+ max_seq_len=MAX_SEQ_LEN,
+ )
+ fixed_init_model(model, min_val=-0.25, max_val=0.5)
+ actual = model(inputs)
+ expected = torch.tensor(3.9763)
+ assert actual.shape == (BSZ, SEQ_LEN, VOCAB_SIZE)
+ torch.testing.assert_close(actual.mean(), expected, atol=1e-4, rtol=1e-4)
diff --git a/tests/torchtune/modules/test_tokenizer.py b/tests/torchtune/modules/tokenizers/test_sentencepiece.py
similarity index 96%
rename from tests/torchtune/modules/test_tokenizer.py
rename to tests/torchtune/modules/tokenizers/test_sentencepiece.py
index 5ac4255e01..bc8f61c2a1 100644
--- a/tests/torchtune/modules/test_tokenizer.py
+++ b/tests/torchtune/modules/tokenizers/test_sentencepiece.py
@@ -8,17 +8,17 @@
import pytest
from torchtune.data._types import Message
-from torchtune.modules.tokenizer import Tokenizer
+from torchtune.modules.tokenizers import SentencePieceTokenizer
-ASSETS = Path(__file__).parent.parent.parent / "assets"
+ASSETS = Path(__file__).parent.parent.parent.parent / "assets"
-class TestTokenizer:
+class TestSentencePieceTokenizer:
@pytest.fixture
def tokenizer(self):
# m.model is a pretrained Sentencepiece model using the following command:
# spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000')
- return Tokenizer.from_file(str(ASSETS / "m.model"))
+ return SentencePieceTokenizer(str(ASSETS / "m.model"))
def test_encode(self, tokenizer):
assert tokenizer.encode("Hello world!") == [
diff --git a/tests/torchtune/modules/tokenizers/test_tiktoken.py b/tests/torchtune/modules/tokenizers/test_tiktoken.py
new file mode 100644
index 0000000000..8796213e76
--- /dev/null
+++ b/tests/torchtune/modules/tokenizers/test_tiktoken.py
@@ -0,0 +1,192 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from pathlib import Path
+
+import pytest
+from torchtune.data._types import Message
+from torchtune.modules.tokenizers import TikTokenTokenizer
+
+ASSETS = Path(__file__).parent.parent.parent.parent / "assets"
+
+
+class TestTikTokenTokenizer:
+ @pytest.fixture
+ def tokenizer(self):
+ # Pretrained tiktoken model generated via the script in
+ # https://gist.github.com/ebsmothers/54b133dd87db6679b14318545aaa2de4
+ return TikTokenTokenizer(str(ASSETS / "tiktoken_small.model"))
+
+ @pytest.fixture
+ def texts(self):
+ return [
+ "I can see the sun. But even if I cannot see the sun, I know that it exists.",
+ "And to know that the sun is there - that is living.",
+ ]
+
+ @pytest.fixture
+ def messages(self, texts):
+ return [
+ Message(role="user", content=texts[0], masked=True),
+ Message(role="assistant", content=texts[1], masked=False),
+ ]
+
+ @pytest.fixture
+ def token_ids(self):
+ return [
+ 73,
+ 503,
+ 654,
+ 262,
+ 376,
+ 110,
+ 46,
+ 690,
+ 720,
+ 428,
+ 270,
+ 1119,
+ 654,
+ 262,
+ 376,
+ 110,
+ 44,
+ 270,
+ 686,
+ 334,
+ 312,
+ 522,
+ 511,
+ 115,
+ 46,
+ ]
+
+ @pytest.fixture
+ def tokenized_messages(self, token_ids):
+ return (
+ [2000, 2006, 477, 273, 2007, 10, 10]
+ + token_ids
+ + [
+ 2009,
+ 2006,
+ 520,
+ 511,
+ 446,
+ 2007,
+ 10,
+ 10,
+ 65,
+ 269,
+ 277,
+ 686,
+ 334,
+ 262,
+ 376,
+ 110,
+ 351,
+ 443,
+ 32,
+ 45,
+ 334,
+ 351,
+ 1955,
+ 46,
+ 2009,
+ 2001,
+ ],
+ [
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ False,
+ True,
+ ],
+ )
+
+ def test_encode(self, tokenizer, texts, token_ids):
+ assert tokenizer.encode(texts[0], add_bos=True, add_eos=True) == [
+ tokenizer.bos_id
+ ] + token_ids + [tokenizer.eos_id]
+ assert tokenizer.encode(texts[0], add_bos=False, add_eos=False) == token_ids
+
+ def test_decode(self, tokenizer, texts, token_ids):
+ assert tokenizer.decode(token_ids) == texts[0]
+
+ def test_encode_and_decode(self, tokenizer, texts):
+ token_ids = tokenizer.encode(texts[0], add_bos=True, add_eos=True)
+ decoded_text = tokenizer.decode(token_ids)
+ assert texts[0] == decoded_text
+
+ def test_token_ids(self, tokenizer):
+ assert tokenizer.bos_id == 2000
+ assert tokenizer.eos_id == 2001
+ assert tokenizer.pad_id == -1
+ assert tokenizer.step_id == 2005
+ assert tokenizer.start_header_id == 2006
+ assert tokenizer.end_header_id == 2007
+ assert tokenizer.eom_id == 2008
+ assert tokenizer.eot_id == 2009
+ assert tokenizer.python_tag == 2255
+
+ def test_tokenizer_vocab_size(self, tokenizer):
+ assert tokenizer.base_vocab_size == 2000
+ assert tokenizer.vocab_size == 2256
+
+ def test_tokenize_messages(self, tokenizer, messages, tokenized_messages):
+ assert tokenizer.tokenize_messages(messages) == tokenized_messages
diff --git a/tests/torchtune/modules/tokenizers/test_utils.py b/tests/torchtune/modules/tokenizers/test_utils.py
new file mode 100644
index 0000000000..023f86f3fd
--- /dev/null
+++ b/tests/torchtune/modules/tokenizers/test_utils.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import pytest
+from torchtune.modules.tokenizers._utils import _split_long_repetitions
+
+
+class TestUtils:
+ def test_split_long_repetitions(self):
+ normal_str = "Here is a normal string"
+ ten_spaces = "".join(10 * [" "])
+ space_str = ten_spaces.join(
+ ["Here", "is", "a", "string", "with", "long", "spaces"]
+ )
+ no_space_str = "".join(10 * ["ab"])
+
+ actual_split = _split_long_repetitions(normal_str, 5)
+ expected_split = ["Here is a norma", "l strin", "g"]
+ for actual_substr, expected_substr in zip(actual_split, expected_split):
+ assert actual_substr == expected_substr
+ with pytest.raises(StopIteration):
+ next(actual_split)
+
+ actual_split = _split_long_repetitions(space_str, 9)
+ expected_split = [
+ "Here" + ten_spaces[:-1],
+ " is" + ten_spaces[:-1],
+ " a" + ten_spaces[:-1],
+ " string" + ten_spaces[:-1],
+ " with" + ten_spaces[:-1],
+ " long" + ten_spaces[:-1],
+ " spaces",
+ ]
+ for actual_substr, expected_substr in zip(actual_split, expected_split):
+ assert actual_substr == expected_substr
+ with pytest.raises(StopIteration):
+ next(actual_split)
+
+ actual_split = _split_long_repetitions(no_space_str, 4)
+ expected_split = ["abab"] * 5
+ for actual_substr, expected_substr in zip(actual_split, expected_split):
+ assert actual_substr == expected_substr
+ with pytest.raises(StopIteration):
+ next(actual_split)
diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py
index ddab8f45f1..d4b14a7486 100644
--- a/torchtune/_recipe_registry.py
+++ b/torchtune/_recipe_registry.py
@@ -31,6 +31,10 @@ class Recipe:
name="llama2/7B_full_low_memory",
file_path="llama2/7B_full_low_memory.yaml",
),
+ Config(
+ name="llama3/8B_full_single_device",
+ file_path="llama3/8B_full_single_device.yaml",
+ ),
Config(
name="mistral/7B_full_low_memory",
file_path="mistral/7B_full_low_memory.yaml",
@@ -44,6 +48,7 @@ class Recipe:
configs=[
Config(name="llama2/7B_full", file_path="llama2/7B_full.yaml"),
Config(name="llama2/13B_full", file_path="llama2/13B_full.yaml"),
+ Config(name="llama3/8B_full", file_path="llama3/8B_full.yaml"),
Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"),
Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"),
],
@@ -61,6 +66,14 @@ class Recipe:
name="llama2/7B_qlora_single_device",
file_path="llama2/7B_qlora_single_device.yaml",
),
+ Config(
+ name="llama3/8B_lora_single_device",
+ file_path="llama3/8B_lora_single_device.yaml",
+ ),
+ Config(
+ name="llama3/8B_qlora_single_device",
+ file_path="llama3/8B_qlora_single_device.yaml",
+ ),
Config(
name="llama2/13B_qlora_single_device",
file_path="llama2/13B_qlora_single_device.yaml",
@@ -94,6 +107,7 @@ class Recipe:
Config(name="llama2/7B_lora", file_path="llama2/7B_lora.yaml"),
Config(name="llama2/13B_lora", file_path="llama2/13B_lora.yaml"),
Config(name="llama2/70B_lora", file_path="llama2/70B_lora.yaml"),
+ Config(name="llama3/8B_lora", file_path="llama3/8B_lora.yaml"),
Config(name="mistral/7B_lora", file_path="mistral/7B_lora.yaml"),
],
supports_distributed=True,
diff --git a/torchtune/data/_converters.py b/torchtune/data/_converters.py
index 2c6025e8c8..5208220738 100644
--- a/torchtune/data/_converters.py
+++ b/torchtune/data/_converters.py
@@ -42,10 +42,11 @@ def sharegpt_to_llama2_messages(
Returns:
List[Message]: a list of messages with "role" and "content" fields. See `torchtune.datasets._types.Message`
- and `torchtune.datasets._types.Dialogue` for more details.
+ for more details.
"""
role_map = {"system": "system", "human": "user", "gpt": "assistant"}
conversations = sample["conversations"]
+
messages = []
for message in conversations:
role = role_map[message["from"]]
diff --git a/torchtune/data/_types.py b/torchtune/data/_types.py
index 4aba199e3c..087cafa008 100644
--- a/torchtune/data/_types.py
+++ b/torchtune/data/_types.py
@@ -12,6 +12,23 @@
@dataclass
class Message:
+ """
+ This dataclass represents individual messages in an instruction or chat dataset.
+
+ Note that the fields ipython and eot are only relevant when tokenizing with tiktoken,
+ as they inform handling of special tokens in that case.
+
+ Attributes:
+ role (Role): role of the message writer. Can be "system", "user", "assistant".
+ content (str): content of the message.
+ masked (bool): whether the message is masked in the sample. Default: False
+ ipython (bool): whether the message is an ipython call. Default: False
+ eot (bool): whether the message corresponds to the end of a turn. Should be true
+ except in the case of multiple consecutive assistant messages. Default: True
+ """
+
role: Role
content: str
masked: bool = False
+ ipython: bool = False
+ eot: bool = True
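+
+
+# A minimal usage sketch (the message contents below are illustrative only):
+#   prompt = Message(role="user", content="What is 2 + 2?", masked=True)
+#   reply = Message(role="assistant", content="It is 4.", masked=False, eot=True)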
diff --git a/torchtune/datasets/_alpaca.py b/torchtune/datasets/_alpaca.py
index c339086468..52cbce6d52 100644
--- a/torchtune/datasets/_alpaca.py
+++ b/torchtune/datasets/_alpaca.py
@@ -8,7 +8,7 @@
from torchtune.data import AlpacaInstructTemplate
from torchtune.datasets._instruct import InstructDataset
-from torchtune.modules import Tokenizer
+from torchtune.modules.tokenizers import Tokenizer
def alpaca_dataset(
diff --git a/torchtune/datasets/_chat.py b/torchtune/datasets/_chat.py
index 69c7b2f2e5..3f026aad73 100644
--- a/torchtune/datasets/_chat.py
+++ b/torchtune/datasets/_chat.py
@@ -18,7 +18,7 @@
sharegpt_to_llama2_messages,
validate_messages,
)
-from torchtune.modules import Tokenizer
+from torchtune.modules.tokenizers import Tokenizer
class ChatDataset(Dataset):
diff --git a/torchtune/datasets/_grammar.py b/torchtune/datasets/_grammar.py
index e87c261faf..c7b4e05121 100644
--- a/torchtune/datasets/_grammar.py
+++ b/torchtune/datasets/_grammar.py
@@ -6,7 +6,7 @@
from torchtune.data import GrammarErrorCorrectionTemplate
from torchtune.datasets._instruct import InstructDataset
-from torchtune.modules import Tokenizer
+from torchtune.modules.tokenizers import Tokenizer
def grammar_dataset(
diff --git a/torchtune/datasets/_instruct.py b/torchtune/datasets/_instruct.py
index 53f573c186..46e6ea3bba 100644
--- a/torchtune/datasets/_instruct.py
+++ b/torchtune/datasets/_instruct.py
@@ -10,15 +10,13 @@
from datasets import load_dataset
from torch.utils.data import Dataset
from torchtune.config._utils import _get_instruct_template
-
from torchtune.data import (
CROSS_ENTROPY_IGNORE_IDX,
InstructTemplate,
Message,
validate_messages,
)
-
-from torchtune.modules import Tokenizer
+from torchtune.modules.tokenizers import Tokenizer
class InstructDataset(Dataset):
diff --git a/torchtune/datasets/_preference.py b/torchtune/datasets/_preference.py
index 199ddfbae8..18871fefaf 100644
--- a/torchtune/datasets/_preference.py
+++ b/torchtune/datasets/_preference.py
@@ -12,7 +12,7 @@
from torchtune.data import CROSS_ENTROPY_IGNORE_IDX, InstructTemplate, Message
-from torchtune.modules import Tokenizer
+from torchtune.modules.tokenizers import Tokenizer
class PreferenceDataset(Dataset):
diff --git a/torchtune/datasets/_samsum.py b/torchtune/datasets/_samsum.py
index ba5561b64a..4fe750178e 100644
--- a/torchtune/datasets/_samsum.py
+++ b/torchtune/datasets/_samsum.py
@@ -6,7 +6,7 @@
from torchtune.data import SummarizeTemplate
from torchtune.datasets import InstructDataset
-from torchtune.modules import Tokenizer
+from torchtune.modules.tokenizers import Tokenizer
def samsum_dataset(
diff --git a/torchtune/datasets/_slimorca.py b/torchtune/datasets/_slimorca.py
index 188aa692ba..dd70456f9f 100644
--- a/torchtune/datasets/_slimorca.py
+++ b/torchtune/datasets/_slimorca.py
@@ -8,7 +8,7 @@
from torchtune.datasets._chat import ChatDataset
-from torchtune.modules import Tokenizer
+from torchtune.modules.tokenizers import Tokenizer
def slimorca_dataset(
diff --git a/torchtune/datasets/_stack_exchanged_paired.py b/torchtune/datasets/_stack_exchanged_paired.py
index 5781cb4e55..f37b5d13cb 100644
--- a/torchtune/datasets/_stack_exchanged_paired.py
+++ b/torchtune/datasets/_stack_exchanged_paired.py
@@ -6,7 +6,7 @@
from torchtune.data import StackExchangedPairedTemplate
from torchtune.datasets._preference import PreferenceDataset
-from torchtune.modules import Tokenizer
+from torchtune.modules.tokenizers import Tokenizer
def stack_exchanged_paired_dataset(
diff --git a/torchtune/models/gemma/_model_builders.py b/torchtune/models/gemma/_model_builders.py
index ea7ad953ed..f598510ac2 100644
--- a/torchtune/models/gemma/_model_builders.py
+++ b/torchtune/models/gemma/_model_builders.py
@@ -5,7 +5,8 @@
# LICENSE file in the root directory of this source tree.
from torchtune.models.gemma._component_builders import gemma
-from torchtune.modules import Tokenizer, TransformerDecoder
+from torchtune.modules import TransformerDecoder
+from torchtune.modules.tokenizers import SentencePieceTokenizer
"""
Model builders build specific instantiations using component builders. For example
@@ -35,7 +36,7 @@ def gemma_2b() -> TransformerDecoder:
)
-def gemma_tokenizer(path: str) -> Tokenizer:
- tokenizer = Tokenizer.from_file(path)
+def gemma_tokenizer(path: str) -> SentencePieceTokenizer:
+ tokenizer = SentencePieceTokenizer(path)
tokenizer.pad_id = 0
return tokenizer
diff --git a/torchtune/models/llama2/_model_builders.py b/torchtune/models/llama2/_model_builders.py
index 839b10580f..15db69e146 100644
--- a/torchtune/models/llama2/_model_builders.py
+++ b/torchtune/models/llama2/_model_builders.py
@@ -8,7 +8,8 @@
from torchtune.models.llama2._component_builders import llama2, lora_llama2
-from torchtune.modules import Tokenizer, TransformerDecoder
+from torchtune.modules import TransformerDecoder
+from torchtune.modules.tokenizers import SentencePieceTokenizer
from torchtune.modules.peft import LORA_ATTN_MODULES
@@ -39,8 +40,8 @@ def llama2_7b() -> TransformerDecoder:
)
-def llama2_tokenizer(path: str) -> Tokenizer:
- tokenizer = Tokenizer.from_file(path)
+def llama2_tokenizer(path: str) -> SentencePieceTokenizer:
+ tokenizer = SentencePieceTokenizer(path)
# Original tokenizer has no pad_id, which causes indexing errors when batch training
tokenizer.pad_id = 0
return tokenizer
diff --git a/torchtune/models/llama3/__init__.py b/torchtune/models/llama3/__init__.py
new file mode 100644
index 0000000000..99309b9300
--- /dev/null
+++ b/torchtune/models/llama3/__init__.py
@@ -0,0 +1,25 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from ._component_builders import llama3, lora_llama3
+
+from ._model_builders import ( # noqa
+ llama3_8b,
+ llama3_tokenizer,
+ lora_llama3_8b,
+ qlora_llama3_8b,
+)
+from ._model_utils import scale_hidden_dim_for_mlp
+
+__all__ = [
+ "llama3",
+ "llama3_8b",
+ "llama3_tokenizer",
+ "lora_llama3",
+ "lora_llama3_8b",
+ "qlora_llama3_8b",
+ "scale_hidden_dim_for_mlp",
+]
diff --git a/torchtune/models/llama3/_component_builders.py b/torchtune/models/llama3/_component_builders.py
new file mode 100644
index 0000000000..0828285645
--- /dev/null
+++ b/torchtune/models/llama3/_component_builders.py
@@ -0,0 +1,410 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from functools import partial
+from typing import List, Literal, Optional
+
+from torch import nn
+
+from torchtune.models.llama3._model_utils import scale_hidden_dim_for_mlp
+
+from torchtune.modules import (
+ CausalSelfAttention,
+ FeedForward,
+ KVCache,
+ RMSNorm,
+ RotaryPositionalEmbeddings,
+ TransformerDecoder,
+ TransformerDecoderLayer,
+)
+
+from torchtune.modules.common_utils import reparametrize_as_dtype_state_dict_post_hook
+
+from torchtune.modules.peft import LORA_ATTN_MODULES, LoRALinear
+
+"""
+Component builders for the Llama3 model and popular variants such as LoRA.
+
+TorchTune provides composable building blocks. Builder functions help
+stitch these building blocks into higher-level components. This design has
+two benefits:
+- The building blocks themselves are very flexible. For example, ``CausalSelfAttention``
+can take either ``nn.Linear`` or ``LoRALinear`` for ``q_proj``.
+- Builder functions expose a set of configurable params which keep the constructors of
+the building blocks simple.
+"""
+
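+# Illustrative sketch of the flexibility described above (the dimensions are made
+# up for the example): ``q_proj`` can be a plain linear layer or a LoRA-wrapped one.
+#   q_proj = nn.Linear(4096, 4096, bias=False)
+#   q_proj = LoRALinear(4096, 4096, rank=8, alpha=16)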
+
+# ------------------ Vanilla Llama3 ------------------
+
+def llama3(
+ vocab_size: int,
+ num_layers: int,
+ num_heads: int,
+ num_kv_heads: int,
+ embed_dim: int,
+ max_seq_len: int,
+ attn_dropout: float = 0.0,
+    rope_base: float = 500000.0,
+ intermediate_dim: Optional[int] = None,
+ norm_eps: float = 1e-5,
+) -> TransformerDecoder:
+ """
+ Build the decoder associated with the Llama3 model. This includes:
+ - Token embeddings
+ - num_layers number of TransformerDecoderLayer blocks
+ - RMS Norm layer applied to the output of the transformer
+ - Final projection into token space
+
+ Args:
+ vocab_size (int): number of tokens in vocabulary.
+ num_layers (int): number of layers in the transformer decoder.
+ num_heads (int): number of query heads. For MHA this is also the
+ number of heads for key and value
+        num_kv_heads (int): number of key and value heads. User should ensure
+            `num_heads` % `num_kv_heads` == 0. To use standard MHA, set
+            `num_kv_heads` equal to `num_heads`.
+ embed_dim (int): embedding dimension for self-attention
+ max_seq_len (int): maximum sequence length the model will be run with, as used
+ by :func:`~torchtune.modules.KVCache`
+ attn_dropout (float): dropout value passed onto scaled_dot_product_attention.
+            Default: 0.0
+        rope_base (float): base for the rotary positional embeddings (RoPE theta).
+            Default: 500000.0
+        intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified,
+            this is computed using :func:`~torchtune.models.llama3.scale_hidden_dim_for_mlp`
+        norm_eps (float): epsilon in RMS norms.
+
+ Returns:
+ TransformerDecoder: Instantiation of Llama3 model.
+ """
+ head_dim = embed_dim // num_heads
+ num_kv_heads = num_kv_heads if num_kv_heads else num_heads
+ rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base)
+ self_attn = CausalSelfAttention(
+ embed_dim=embed_dim,
+ num_heads=num_heads,
+ num_kv_heads=num_kv_heads,
+ head_dim=head_dim,
+ q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=False),
+ k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
+ v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False),
+ output_proj=nn.Linear(embed_dim, embed_dim, bias=False),
+ pos_embeddings=rope,
+ max_seq_len=max_seq_len,
+ attn_dropout=attn_dropout,
+ )
+ hidden_dim = intermediate_dim if intermediate_dim else scale_hidden_dim_for_mlp(embed_dim)
+ mlp = llama3_mlp(dim=embed_dim, hidden_dim=hidden_dim)
+ layer = TransformerDecoderLayer(
+ attn=self_attn,
+ mlp=mlp,
+ sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps),
+ mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps),
+ )
+ tok_embeddings = nn.Embedding(vocab_size, embed_dim)
+ output_proj = nn.Linear(embed_dim, vocab_size, bias=False)
+ return TransformerDecoder(
+ tok_embeddings=tok_embeddings,
+ layer=layer,
+ num_layers=num_layers,
+ max_seq_len=max_seq_len,
+ num_heads=num_heads,
+ head_dim=head_dim,
+ norm=RMSNorm(embed_dim, eps=norm_eps),
+ output=output_proj,
+ )
+
+def llama3_mlp(dim: int, hidden_dim: int) -> FeedForward:
+ """
+ Build the MLP layer associated with the Llama model.
+ """
+ gate_proj = nn.Linear(dim, hidden_dim, bias=False)
+ down_proj = nn.Linear(hidden_dim, dim, bias=False)
+ up_proj = nn.Linear(dim, hidden_dim, bias=False)
+ return FeedForward(gate_proj=gate_proj, down_proj=down_proj, up_proj=up_proj)
+
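+# Note on llama3_mlp above: assuming torchtune's FeedForward applies the standard
+# SwiGLU gating, the forward pass is roughly
+#   down_proj(silu(gate_proj(x)) * up_proj(x))
+# which is why three projections are built instead of the usual two.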
+
+
+# ------------------ LoRA Llama3 ------------------
+
+
+def lora_llama3(
+ lora_attn_modules: List[LORA_ATTN_MODULES],
+ apply_lora_to_mlp: bool = False,
+ apply_lora_to_output: bool = False,
+ *,
+ # llama3 args
+ vocab_size: int,
+ num_layers: int,
+ num_heads: int,
+ num_kv_heads: int,
+ embed_dim: int,
+ max_seq_len: int,
+ intermediate_dim: Optional[int] = None,
+ attn_dropout: float = 0.0,
+ norm_eps: float = 1e-5,
+ rope_base: float = 500000.0,
+ # LoRA args
+ lora_rank: int,
+ lora_alpha: float,
+ lora_dropout: float = 0.0,
+ # Quantization args
+ quantize_base: bool = False,
+) -> TransformerDecoder:
+ """
+ Return a version of Llama3 (an instance of :func:`~torchtune.modules.TransformerDecoder`)
+ with LoRA applied to some of the linear layers in its self-attention modules.
+
+ Args:
+ lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers
+ LoRA should be applied to in each self-attention block. Options are
+ ``{"q_proj", "k_proj", "v_proj", "output_proj"}``.
+ apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer.
+ Default: False
+ apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection.
+ Default: False
+ vocab_size (int): number of tokens in vocabulary.
+ num_layers (int): number of layers in the transformer decoder.
+ num_heads (int): number of query heads. For MHA this is also the
+ number of heads for key and value
+        num_kv_heads (int): number of key and value heads. User should ensure
+            `num_heads` % `num_kv_heads` == 0. To use standard MHA, set
+            `num_kv_heads` equal to `num_heads`.
+ embed_dim (int): embedding dimension for self-attention
+ max_seq_len (int): maximum sequence length the model will be run with, as used
+ by :func:`~torchtune.modules.KVCache`
+ attn_dropout (float): dropout value passed onto scaled_dot_product_attention.
+ Default: 0.0
+        intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified,
+            this is computed using :func:`~torchtune.models.llama3.scale_hidden_dim_for_mlp`
+        norm_eps (float): epsilon in RMS norms.
+        rope_base (float): base for the rotary positional embeddings (RoPE theta).
+            Default: 500000.0
+ lora_rank (int): rank of each low-rank approximation
+ lora_alpha (float): scaling factor for the low-rank approximation
+ lora_dropout (float): LoRA dropout probability. Default: 0.0
+        quantize_base (bool): Whether to quantize base model weights. Only applied to the base
+            weights within linear layers that LoRA is applied to. The final output linear projection is not
+ supported for quantization currently.
+
+ Returns:
+ TransformerDecoder: Instantiation of Llama3 model with LoRA applied to
+ a subset of the attention projections in each layer.
+
+ """
+
+ self_attn = lora_llama3_self_attention(
+ lora_modules=lora_attn_modules,
+ embed_dim=embed_dim,
+ num_heads=num_heads,
+ num_kv_heads=num_kv_heads,
+ max_seq_len=max_seq_len,
+ attn_dropout=attn_dropout,
+ rope_base=rope_base,
+ lora_rank=lora_rank,
+ lora_alpha=lora_alpha,
+ lora_dropout=lora_dropout,
+ quantize_base=quantize_base,
+ )
+
+ hidden_dim = intermediate_dim if intermediate_dim else scale_hidden_dim_for_mlp(embed_dim)
+ if apply_lora_to_mlp:
+ mlp = lora_llama3_mlp(
+ dim=embed_dim,
+ hidden_dim=hidden_dim,
+ lora_rank=lora_rank,
+ lora_alpha=lora_alpha,
+ quantize_base=quantize_base,
+ )
+ else:
+ mlp = llama3_mlp(dim=embed_dim, hidden_dim=hidden_dim)
+
+ layer = TransformerDecoderLayer(
+ attn=self_attn,
+ mlp=mlp,
+ sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps),
+ mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps),
+ )
+
+ tok_embeddings = nn.Embedding(vocab_size, embed_dim)
+
+ # TODO: quantize_base is not applied to final output_proj currently.
+ output_proj = (
+ LoRALinear(embed_dim, vocab_size, rank=lora_rank, alpha=lora_alpha)
+ if apply_lora_to_output
+ else nn.Linear(embed_dim, vocab_size, bias=False)
+ )
+ model = TransformerDecoder(
+ tok_embeddings=tok_embeddings,
+ layer=layer,
+ num_layers=num_layers,
+ max_seq_len=max_seq_len,
+ num_heads=num_heads,
+ head_dim=(embed_dim // num_heads),
+ norm=RMSNorm(embed_dim, eps=norm_eps),
+ output=output_proj,
+ )
+
+ if quantize_base:
+ # For QLoRA, we reparametrize 4-bit tensors to bf16, and offload to CPU on the fly
+ # so as to not increase peak memory
+ model._register_state_dict_hook(
+ partial(reparametrize_as_dtype_state_dict_post_hook, offload_to_cpu=True)
+ )
+
+ return model
+
+
+def lora_llama3_self_attention(
+ lora_modules: List[LORA_ATTN_MODULES],
+ *,
+ # CausalSelfAttention args
+ embed_dim: int,
+ num_heads: int,
+ num_kv_heads: int,
+ max_seq_len: int,
+ attn_dropout: float = 0.0,
+ rope_base: float = 500000.0,
+ # LoRA args
+ lora_rank: int,
+ lora_alpha: float,
+ lora_dropout: float = 0.0,
+ quantize_base: bool = False,
+) -> CausalSelfAttention:
+ """
+ Return an instance of :func:`~torchtune.modules.CausalSelfAttention` with LoRA
+ applied to a subset of its linear layers
+
+ Args:
+ lora_modules (List[LORA_ATTN_MODULES]): list of which linear layers
+ LoRA should be applied to. Options are ``{"q_proj", "k_proj", "v_proj",
+ "output_proj"}``.
+ embed_dim (int): embedding dimension for self-attention
+ num_heads (int): number of query heads. For MHA this is also the
+ number of heads for key and value
+        num_kv_heads (int): number of key and value heads. User should ensure
+            `num_heads` % `num_kv_heads` == 0. To use standard MHA, set
+            `num_kv_heads` equal to `num_heads`.
+ max_seq_len (int): maximum sequence length the model will be run with, as used
+ by :func:`~torchtune.modules.KVCache`
+ attn_dropout (float): dropout value passed onto scaled_dot_product_attention.
+ Default: 0.0
+ lora_rank (int): rank of each low-rank approximation
+ lora_alpha (float): scaling factor for the low-rank approximation
+ lora_dropout (float): LoRA dropout probability. Default: 0.0
+ quantize_base (bool): Whether to quantize base model parameters for linear layers
+ LoRA is being applied to. Default is ``False``.
+
+ Returns:
+ CausalSelfAttention: instantiation of self-attention module with LoRA
+ applied to a subset of Q, K, V, output projections.
+
+ Raises:
+ ValueError: If lora_modules arg is an empty list
+ """
+ if not lora_modules:
+ raise ValueError(
+ f"Must pass one or more of {LORA_ATTN_MODULES} as lora_modules"
+ )
+
+ head_dim = embed_dim // num_heads
+ num_kv_heads = num_kv_heads if num_kv_heads else num_heads
+ q_proj = (
+ LoRALinear(
+ embed_dim,
+ num_heads * head_dim,
+ rank=lora_rank,
+ alpha=lora_alpha,
+ quantize_base=quantize_base,
+ )
+ if "q_proj" in lora_modules
+ else nn.Linear(embed_dim, num_heads * head_dim, bias=False)
+ )
+ k_proj = (
+ LoRALinear(
+ embed_dim,
+ num_kv_heads * head_dim,
+ rank=lora_rank,
+ alpha=lora_alpha,
+ quantize_base=quantize_base,
+ )
+ if "k_proj" in lora_modules
+ else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False)
+ )
+ v_proj = (
+ LoRALinear(
+ embed_dim,
+ num_kv_heads * head_dim,
+ rank=lora_rank,
+ alpha=lora_alpha,
+ quantize_base=quantize_base,
+ )
+ if "v_proj" in lora_modules
+ else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False)
+ )
+ output_proj = (
+ LoRALinear(
+ embed_dim,
+ embed_dim,
+ rank=lora_rank,
+ alpha=lora_alpha,
+ quantize_base=quantize_base,
+ )
+ if "output_proj" in lora_modules
+ else nn.Linear(embed_dim, embed_dim, bias=False)
+ )
+ rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base)
+ self_attn = CausalSelfAttention(
+ embed_dim=embed_dim,
+ num_heads=num_heads,
+ num_kv_heads=num_kv_heads,
+ head_dim=head_dim,
+ q_proj=q_proj,
+ k_proj=k_proj,
+ v_proj=v_proj,
+ output_proj=output_proj,
+ pos_embeddings=rope,
+ max_seq_len=max_seq_len,
+ attn_dropout=attn_dropout,
+ )
+ return self_attn
+
+
+def lora_llama3_mlp(
+ *,
+ dim: int,
+ hidden_dim: int,
+ lora_rank: int,
+ lora_alpha: float,
+ lora_dropout: float = 0.0,
+ quantize_base: bool = False,
+) -> FeedForward:
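+    """Build the Llama3 FeedForward layer with LoRA applied to the gate, up, and down projections."""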
+ gate_proj = LoRALinear(
+ in_dim=dim,
+ out_dim=hidden_dim,
+ rank=lora_rank,
+ alpha=lora_alpha,
+ dropout=lora_dropout,
+ quantize_base=quantize_base,
+ )
+ down_proj = LoRALinear(
+ in_dim=hidden_dim,
+ out_dim=dim,
+ rank=lora_rank,
+ alpha=lora_alpha,
+ dropout=lora_dropout,
+ quantize_base=quantize_base,
+ )
+ up_proj = LoRALinear(
+ in_dim=dim,
+ out_dim=hidden_dim,
+ rank=lora_rank,
+ alpha=lora_alpha,
+ dropout=lora_dropout,
+ quantize_base=quantize_base,
+ )
+ return FeedForward(
+ gate_proj=gate_proj,
+ down_proj=down_proj,
+ up_proj=up_proj,
+ )
diff --git a/torchtune/models/llama3/_model_builders.py b/torchtune/models/llama3/_model_builders.py
new file mode 100644
index 0000000000..4286a0f145
--- /dev/null
+++ b/torchtune/models/llama3/_model_builders.py
@@ -0,0 +1,109 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List, Optional
+from functools import partial
+
+from torch import nn
+
+from torchtune.models.llama3._component_builders import llama3, lora_llama3
+from torchtune.models.llama3._model_utils import scale_hidden_dim_for_mlp
+
+from torchtune.modules import TransformerDecoder
+from torchtune.modules.tokenizers import TikTokenTokenizer
+from torchtune.modules.peft import LORA_ATTN_MODULES
+
+
+"""
+Model builders build specific instantiations using component builders. For example
+the llama3_8b model builder uses the llama3 component builder to create the
+Llama3 8B model.
+"""
+
+
+def llama3_8b() -> TransformerDecoder:
+ """
+    Builder for creating a Llama3 model initialized with the default 8B parameter values.
+
+ Returns:
+ TransformerDecoder: Instantiation of Llama3 8B model
+ """
+ return llama3(
+ vocab_size=128_256,
+ num_layers=32,
+ num_heads=32,
+ num_kv_heads=8,
+ embed_dim=4096,
+ max_seq_len=4096,
+ intermediate_dim=14336,
+ attn_dropout=0.0,
+ norm_eps=1e-5,
+ rope_base=500000.0,
+ )
+
+
+def llama3_tokenizer(path: str) -> TikTokenTokenizer:
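+    """Build a TikTokenTokenizer from the given Llama3 tokenizer file, overriding
+    ``pad_id`` to 0 for batch training, mirroring the SentencePiece model builders.
+    """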
+ tiktoken = TikTokenTokenizer(path)
+ tiktoken.pad_id = 0
+ return tiktoken
+
+
+def lora_llama3_8b(
+ lora_attn_modules: List[LORA_ATTN_MODULES],
+ apply_lora_to_mlp: bool = False,
+ apply_lora_to_output: bool = False,
+ lora_rank: int = 8,
+ lora_alpha: float = 16,
+ quantize_base: bool = False,
+) -> TransformerDecoder:
+ """
+ Builder for creating a Llama3 8B model with LoRA enabled.
+
+ The Llama3 defaults are the same as in :func:`~torchtune.models.llama3.llama3_8b`,
+ while LoRA default params are based on
+ https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43.
+
+ Args:
+ lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers
+ LoRA should be applied to in each self-attention block. Options are
+ ``{"q_proj", "k_proj", "v_proj", "output_proj"}``.
+ apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer.
+ Default: False
+ apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection.
+ Default: False
+ lora_rank (int): rank of each low-rank approximation
+ lora_alpha (float): scaling factor for the low-rank approximation
+ quantize_base (bool): Whether to quantize base model weights
+
+ Returns:
+ TransformerDecoder: Instantiation of Llama3 8B model with LoRA applied
+ """
+ return lora_llama3(
+ lora_attn_modules=lora_attn_modules,
+ apply_lora_to_mlp=apply_lora_to_mlp,
+ apply_lora_to_output=apply_lora_to_output,
+ vocab_size=128_256,
+ num_layers=32,
+ num_heads=32,
+ num_kv_heads=8,
+ embed_dim=4096,
+ max_seq_len=4096,
+ intermediate_dim=14336,
+ attn_dropout=0.0,
+ norm_eps=1e-5,
+ rope_base=500000.0,
+ lora_rank=lora_rank,
+ lora_alpha=lora_alpha,
+ lora_dropout=0.05,
+ quantize_base=quantize_base,
+ )
+
+qlora_llama3_8b = partial(lora_llama3_8b, quantize_base=True)
+
+qlora_llama3_8b.__doc__ = """
+Builder for creating a Llama3 model with QLoRA enabled. Base model weights in linear layers
+that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314.
+Please see `lora_llama3_8b` for full API arguments.
+"""
diff --git a/torchtune/models/llama3/_model_utils.py b/torchtune/models/llama3/_model_utils.py
new file mode 100644
index 0000000000..010c1bcc2f
--- /dev/null
+++ b/torchtune/models/llama3/_model_utils.py
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+def scale_hidden_dim_for_mlp(dim: int, multiple_of: int = 256) -> int:
+ """Scale hidden dimension for MLP to keep number of parameters and computation constant.
+
+ Args:
+ dim (int): Input dimension.
+        multiple_of (int): Round the scaled dimension up to the nearest multiple of `multiple_of`.
+
+    Returns:
+        int: Scaled hidden dimension.
+ """
+ # Scale hidden dimension by (2/3)4d for SwiGLU to keep number of
+ # parameters and computation constant
+ hidden_dim = 4 * int(2 * dim / 3)
+ # Round hidden dimension to nearest multiple of `multiple_of`
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+ return hidden_dim
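+
+
+# Worked example: for dim=4096 this returns 4 * int(2 * 4096 / 3) = 10920, rounded up
+# to the next multiple of 256, i.e. 11008. The Llama3 8B builders do not rely on this
+# default and instead pass intermediate_dim=14336 explicitly.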
diff --git a/torchtune/models/mistral/_model_builders.py b/torchtune/models/mistral/_model_builders.py
index 3071608eb0..940d1820ae 100644
--- a/torchtune/models/mistral/_model_builders.py
+++ b/torchtune/models/mistral/_model_builders.py
@@ -7,7 +7,8 @@
from torchtune.models.mistral._component_builders import mistral, lora_mistral
-from torchtune.modules import Tokenizer, TransformerDecoder
+from torchtune.modules import TransformerDecoder
+from torchtune.modules.tokenizers import SentencePieceTokenizer
from torchtune.modules.peft import LORA_ATTN_MODULES
from functools import partial
@@ -40,8 +41,8 @@ def mistral_7b() -> TransformerDecoder:
)
-def mistral_tokenizer(path: str) -> Tokenizer:
- tokenizer = Tokenizer.from_file(path)
+def mistral_tokenizer(path: str) -> SentencePieceTokenizer:
+ tokenizer = SentencePieceTokenizer(path)
# Original tokenizer has no pad_id, which causes indexing errors when batch training
tokenizer.pad_id = 0
return tokenizer
diff --git a/torchtune/modules/__init__.py b/torchtune/modules/__init__.py
index 7d08ea5bd2..46b8e93b0f 100644
--- a/torchtune/modules/__init__.py
+++ b/torchtune/modules/__init__.py
@@ -11,7 +11,6 @@
from .lr_schedulers import get_cosine_schedule_with_warmup # noqa
from .position_embeddings import RotaryPositionalEmbeddings # noqa
from .rms_norm import RMSNorm # noqa
-from .tokenizer import Tokenizer # noqa
from .transformer import TransformerDecoder, TransformerDecoderLayer # noqa
__all__ = [
@@ -21,7 +20,6 @@
"KVCache",
"RotaryPositionalEmbeddings",
"RMSNorm",
- "Tokenizer",
"TransformerDecoder",
"TransformerDecoderLayer",
"reparametrize_as_dtype_state_dict_post_hook",
diff --git a/torchtune/modules/tokenizers/__init__.py b/torchtune/modules/tokenizers/__init__.py
new file mode 100644
index 0000000000..069849bf35
--- /dev/null
+++ b/torchtune/modules/tokenizers/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from ._sentencepiece import SentencePieceTokenizer
+from ._tiktoken import TikTokenTokenizer
+from ._utils import Tokenizer
+
+__all__ = ["SentencePieceTokenizer", "TikTokenTokenizer", "Tokenizer"]
diff --git a/torchtune/modules/tokenizer.py b/torchtune/modules/tokenizers/_sentencepiece.py
similarity index 84%
rename from torchtune/modules/tokenizer.py
rename to torchtune/modules/tokenizers/_sentencepiece.py
index 07c0268fb4..94104faa2f 100644
--- a/torchtune/modules/tokenizer.py
+++ b/torchtune/modules/tokenizers/_sentencepiece.py
@@ -13,37 +13,31 @@
WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"]
-class Tokenizer:
+class SentencePieceTokenizer:
"""A wrapper around SentencePieceProcessor.
Args:
- spm_model (SentencePieceProcessor): The SentencePiece model.
- vocab_size (int): The size of the vocabulary.
- bos_id (int): The ID of the beginning-of-sentence token.
- eos_id (int): The ID of the end-of-sentence token.
- pad_id (int): The ID of the padding token.
+ path (str): Path to pretrained tokenizer file.
Example:
# Accepts only non-batched input for now
- >>> tokenizer = Tokenizer.from_file("/path/to/spm_model")
- >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
+ >>> tokenizer = SentencePieceTokenizer("/path/to/spm_model")
+    >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
>>> print(tokenized_text)
[1, 31587, 29644, 102, 2]
"""
def __init__(
self,
- spm_model: SentencePieceProcessor,
- vocab_size: int,
- bos_id: int,
- eos_id: int,
- pad_id: int,
+ path: str,
):
+ spm_model = SentencePieceProcessor()
+ spm_model.load(path)
self.spm_model = spm_model
- self.vocab_size = vocab_size
- self.bos_id = bos_id
- self.eos_id = eos_id
- self.pad_id = pad_id
+ self.vocab_size = spm_model.vocab_size()
+ self.bos_id = spm_model.bos_id()
+ self.eos_id = spm_model.eos_id()
+ self.pad_id = spm_model.pad_id()
# This is used in tokenize_messages: if the tokenizer does not
# encode whitespace, then we can more easily split strings
@@ -52,20 +46,6 @@ def __init__(
[self.spm_model.encode(c) for c in WHITESPACE_CHARS]
)
- @classmethod
- def from_file(cls, path: str) -> "Tokenizer":
- """Initialize a `Tokenizer` instance from a SentencePiece model file.
-
- Args:
- path (str): The path to the SentencePiece model file.
-
- Returns:
- Tokenizer: A `Tokenizer` instance.
- """
- spm = SentencePieceProcessor()
- spm.load(path)
- return cls(spm, spm.vocab_size(), spm.bos_id(), spm.eos_id(), spm.pad_id())
-
def encode(
self,
text: str,
@@ -135,7 +115,7 @@ def tokenize_messages(
beginning off the tokenized s2.
Example:
- >>> tokenizer = Tokenizer.from_file(tokenizer_path)
+ >>> tokenizer = SentencePieceTokenizer(tokenizer_path)
>>> messages = [
Message(role="system", content="system message\n", masked=True),
Message(role="user", content="user prompt\n", masked=True),
diff --git a/torchtune/modules/tokenizers/_tiktoken.py b/torchtune/modules/tokenizers/_tiktoken.py
new file mode 100644
index 0000000000..104cee1353
--- /dev/null
+++ b/torchtune/modules/tokenizers/_tiktoken.py
@@ -0,0 +1,367 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict, List, Optional, Tuple
+
+from tiktoken import Encoding
+from tiktoken.load import load_tiktoken_bpe
+from torchtune.data._types import Message
+from torchtune.modules.tokenizers._utils import (
+ _split_long_repetitions,
+ Tokenizer,
+ truncate,
+)
+
+
+CL100K_PATTERN = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" # noqa
+
+# bos and eos tokens
+BEGIN_OF_TEXT = "<|begin_of_text|>"
+END_OF_TEXT = "<|end_of_text|>"
+# fill-in-the-middle tags
+FIM_PREFIX = "<|fim_prefix|>"
+FIM_MIDDLE = "<|fim_middle|>"
+FIM_SUFFIX = "<|fim_suffix|>"
+# start and end header tokens for formatting chat messages
+START_HEADER_ID = "<|start_header_id|>"
+END_HEADER_ID = "<|end_header_id|>"
+STEP_ID = "<|step_id|>"
+# different end of message tags
+EOM_ID = "<|eom_id|>"
+EOT_ID = "<|eot_id|>"
+# special token for ipython messages
+PYTHON_TAG = "<|python_tag|>"
+
+ALL_SPECIAL_TOKENS = [
+ BEGIN_OF_TEXT,
+ END_OF_TEXT,
+ FIM_PREFIX,
+ FIM_MIDDLE,
+ FIM_SUFFIX,
+ STEP_ID,
+ START_HEADER_ID,
+ END_HEADER_ID,
+ EOM_ID,
+ EOT_ID,
+ PYTHON_TAG,
+]
+
+PAD_ID = -1
+
+# Constants controlling encode logic
+MAX_ENCODE_CHARS = 400_000
+MAX_NO_WHITESPACE_CHARS = 25_000
+
+
+class TikTokenTokenizer(Tokenizer):
+ """A wrapper around tiktoken Encoding.
+
+ Args:
+ path (str): Path to pretrained tokenizer checkpoint file.
+ name (str): Name of the tokenizer (used by tiktoken for identification).
+        pattern (str): Regex pattern used for string parsing.
+ all_special_tokens (Optional[List[str]]): List of all special tokens. First element
+ must be bos token, second element must be eos token, final element must be
+ python tag. All elements must be unique. Length must be at most 256.
+ Default: None (will use ALL_SPECIAL_TOKENS)
+ bos_token (str): Beginning of sequence token. Defaults to BEGIN_OF_TEXT.
+ eos_token (str): End of sequence token. Defaults to END_OF_TEXT.
+ start_header_id (str): Start header token. Defaults to START_HEADER_ID.
+ end_header_id (str): End header token. Defaults to END_HEADER_ID.
+ step_id (str): Step token. Defaults to STEP_ID.
+ eom_id (str): End of message token. Defaults to EOM_ID.
+ eot_id (str): End of turn token. Defaults to EOT_ID.
+ python_tag (str): Python tag token. Defaults to PYTHON_TAG.
+ """
+
+ def __init__(
+ self,
+ path: str,
+ *,
+ name: str = "llama3_tiktoken",
+ pattern: str = CL100K_PATTERN,
+ all_special_tokens: Optional[List[str]] = None,
+ bos_token: str = BEGIN_OF_TEXT,
+ eos_token: str = END_OF_TEXT,
+ start_header_id: str = START_HEADER_ID,
+ end_header_id: str = END_HEADER_ID,
+ step_id: str = STEP_ID,
+ eom_id: str = EOM_ID,
+ eot_id: str = EOT_ID,
+ python_tag: str = PYTHON_TAG,
+ ):
+ self.path = path
+ self.num_reserved_special_tokens = 256
+ all_special_tokens = all_special_tokens or ALL_SPECIAL_TOKENS
+ self._validate_special_tokens(
+ all_special_tokens=all_special_tokens,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ step_id=step_id,
+ start_header_id=start_header_id,
+ end_header_id=end_header_id,
+ eom_id=eom_id,
+ eot_id=eot_id,
+ python_tag=python_tag,
+ )
+ self.all_special_tokens = all_special_tokens
+
+ mergeable_ranks = load_tiktoken_bpe(self.path)
+ self.base_vocab_size = len(mergeable_ranks)
+ all_special_tokens_with_ids = self._get_all_special_tokens_with_ids()
+ self.tt_model = Encoding(
+ name=name,
+ pat_str=pattern,
+ mergeable_ranks=mergeable_ranks,
+ special_tokens={**all_special_tokens_with_ids},
+ )
+
+ # Encode BOS and EOS, define pad ID
+ self.bos_id = self._encode_special_token(self.all_special_tokens[0])
+ self.eos_id = self._encode_special_token(self.all_special_tokens[1])
+ self.pad_id = PAD_ID
+
+ self.vocab_size = self.tt_model.n_vocab
+
+ # Encode extra special tokens
+ self.step_id = self._encode_special_token(step_id)
+ self.start_header_id = self._encode_special_token(start_header_id)
+ self.end_header_id = self._encode_special_token(end_header_id)
+ self.eom_id = self._encode_special_token(eom_id)
+ self.eot_id = self._encode_special_token(eot_id)
+ self.python_tag = self._encode_special_token(python_tag)
+
+ def _validate_special_tokens(
+ self,
+ *,
+ all_special_tokens: List[str],
+ bos_token: str,
+ eos_token: str,
+ step_id: str,
+ start_header_id: str,
+ end_header_id: str,
+ eom_id: str,
+ eot_id: str,
+ python_tag: str,
+ ):
+ """
+ Validate all the special tokens are as expected. Should satisfy:
+
+ (1) bos_token, eos_token, step_id, start_header_id, end_header_id, eom_id,
+ eot_id, python_tag are all in all_special_tokens,
+ (2) bos_token should be first, eos_token should be second, python_tag should be last,
+ (3) all special tokens are unique, and
+ (4) at most 256 special tokens
+ """
+ for token in [
+ bos_token,
+ eos_token,
+ step_id,
+ start_header_id,
+ end_header_id,
+ eom_id,
+ eot_id,
+ python_tag,
+ ]:
+ assert (
+ token in all_special_tokens
+ ), f"{token} missing from all_special_tokens"
+ assert (
+ all_special_tokens[0] == bos_token
+ ), f"First special token must be bos, got {all_special_tokens[0]}"
+ assert (
+ all_special_tokens[1] == eos_token
+ ), f"Second special token must be eos, got {all_special_tokens[1]}"
+ assert (
+ all_special_tokens[-1] == python_tag
+ ), f"Last special token must be python_tag, got {all_special_tokens[-1]}"
+ assert len(set(all_special_tokens)) == len(
+ all_special_tokens
+ ), "Special tokens must be unique."
+ assert (
+ len(all_special_tokens) <= self.num_reserved_special_tokens
+ ), "The total number of basic and extra special tokens cannot exceed the number of reserved tokens."
+
+ def _get_all_special_tokens_with_ids(self) -> Dict[str, int]:
+ """
+ Returns a dictionary of all special tokens and their corresponding ids to be passed
+ to tiktoken Encoding.
+
+        There are 256 slots for special tokens; any slots remaining after self.all_special_tokens
+        are filled with dummy reserved tokens. Tokens are added in the order:
+ (1) all special tokens but python_tag, (2) all reserved tokens, (3) python_tag.
+ """
+ reserved_tokens = [
+ f"<|reserved_special_token_{i}|>"
+ for i in range(
+ self.num_reserved_special_tokens - len(self.all_special_tokens)
+ )
+ ]
+ # Python tag special token should come last (validated in __init__)
+ all_special_tokens = (
+ self.all_special_tokens[:-1]
+ + reserved_tokens
+ + [self.all_special_tokens[-1]]
+ )
+
+ return {
+ token: self.base_vocab_size + i
+ for i, token in enumerate(all_special_tokens)
+ }
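+
+    # With the 2000-merge test asset in tests/assets/tiktoken_small.model, this mapping
+    # yields bos_id=2000, eos_id=2001, step_id=2005, start_header_id=2006,
+    # end_header_id=2007, eom_id=2008, eot_id=2009, and python_tag=2255 (the last of
+    # the 256 reserved slots), matching the ids asserted in test_tiktoken.py.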
+
+ def _encode_special_token(self, token: str) -> int:
+ """
+ Encodes a special token.
+
+ Args:
+ token (str): The special token to encode.
+
+ Returns:
+ int: The encoded special token.
+ """
+ return self.tt_model.encode(
+ token,
+ allowed_special="all",
+ disallowed_special=(),
+ )[0]
+
+ def encode(
+ self,
+ text: str,
+ add_bos: bool,
+ add_eos: bool,
+ ) -> List[int]:
+ """
+ Encode a string into a list of token ids. Assumes that the string
+ contains no special tokens.
+
+ Args:
+ text (str): The string to encode.
+ add_bos (bool): Whether to add the beginning of sequence token.
+ add_eos (bool): Whether to add the end of sequence token.
+
+ Returns:
+ List[int]: The list of token ids.
+ """
+ substrs: List[str] = []
+ tokens = []
+ for i in range(0, len(text), MAX_ENCODE_CHARS):
+ substr = text[i : i + MAX_ENCODE_CHARS]
+ # See https://github.com/openai/tiktoken/issues/195
+ sliced_substr = _split_long_repetitions(substr, MAX_NO_WHITESPACE_CHARS)
+ substrs.extend(sliced_substr)
+ for substr in substrs:
+ # allowed_special and disallowed_special are used by tiktoken to define
+ # how special tokens are encoded. Our setting here is to encode any
+ # special token as regular text and prevent tiktoken from raising errors.
+ # This means we should only call encode on strings not containing special tokens.
+ tokens.extend(
+ self.tt_model.encode(
+ substr,
+ allowed_special=set(),
+ disallowed_special=(),
+ )
+ )
+ if add_bos:
+ tokens.insert(0, self.bos_id)
+ if add_eos:
+ tokens.append(self.eos_id)
+ return tokens
+
+ def decode(
+ self,
+ token_ids: List[int],
+ truncate_at_eos: bool = True,
+ ) -> str:
+ """
+ Decode a list of token ids into a string.
+
+ Args:
+ token_ids (List[int]): The list of token ids.
+ truncate_at_eos (bool): Whether to truncate the string at the end of
+ sequence token.
+
+ Returns:
+ str: The decoded string.
+ """
+        if truncate_at_eos:
+            try:
+                k = token_ids.index(self.eos_id)
+            except ValueError:
+                k = None
+            if k is not None:
+                token_ids = token_ids[:k]
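+        # Strip any bos ids before decoding so they do not show up in the output text.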
+ token_ids = [token_id for token_id in token_ids if token_id != self.bos_id]
+ return self.tt_model.decode(token_ids)
+
+ def tokenize_message(
+ self, message: Message, tokenize_header: bool = False
+ ) -> List[int]:
+ """
+ Tokenize a message into a list of token ids.
+
+ Args:
+ message (Message): The message to tokenize.
+            tokenize_header (bool): Whether to prepend a tokenized header to the message.
+
+ Returns:
+ List[int]: The list of token ids.
+ """
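+        # The optional header is [start_header_id] + encoded role + [end_header_id] + "\n\n".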
+ if tokenize_header:
+ tokenized_header = (
+ [self.start_header_id]
+ + self.encode(message.role.strip(), add_bos=False, add_eos=False)
+ + [self.end_header_id]
+ + self.encode("\n\n", add_bos=False, add_eos=False)
+ )
+ else:
+ tokenized_header = []
+ tokenized_body = self.encode(
+ message.content.strip(), add_bos=False, add_eos=False
+ )
+ if message.ipython:
+ tokenized_body = [self.python_tag] + tokenized_body
+ tokenized_message = tokenized_header + tokenized_body
+ if message.eot:
+ tokenized_message = tokenized_message + [self.eot_id]
+ else:
+ tokenized_message = tokenized_message + [self.eom_id]
+ return tokenized_message
+
+ def tokenize_messages(
+ self,
+ messages: List[Message],
+ max_seq_len: Optional[int] = None,
+ tokenize_header: bool = True,
+ ) -> Tuple[List[int], List[bool]]:
+ """
+ Tokenize a list of messages into a list of token ids and masks.
+
+ Args:
+ messages (List[Message]): The list of messages to tokenize.
+ max_seq_len (Optional[int]): The maximum sequence length.
+ tokenize_header (bool): Whether to prepend a tokenized header to each message.
+
+ Returns:
+ Tuple[List[int], List[bool]]: The list of token ids and the list of masks.
+ """
+ tokens = [self.bos_id]
+ # bos and eos are always masked
+ mask = [True]
+ for message in messages:
+ tokenized_message = self.tokenize_message(
+ message, tokenize_header=tokenize_header
+ )
+ tokens = tokens + tokenized_message
+ mask = mask + ([message.masked] * len(tokenized_message))
+ if max_seq_len and len(tokens) >= max_seq_len:
+ break
+ tokens = tokens + [self.eos_id]
+ mask = mask + [True]
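+        # If max_seq_len was exceeded, truncate and keep eos (and its mask entry) as the final element.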
+ if max_seq_len:
+ tokens = truncate(tokens, max_seq_len, self.eos_id)
+ mask = truncate(mask, max_seq_len, True)
+ return tokens, mask
diff --git a/torchtune/modules/tokenizers/_utils.py b/torchtune/modules/tokenizers/_utils.py
new file mode 100644
index 0000000000..cdfaddfcdf
--- /dev/null
+++ b/torchtune/modules/tokenizers/_utils.py
@@ -0,0 +1,71 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Iterator, List, Protocol, Union
+
+from torchtune.data._types import Message
+
+
+class Tokenizer(Protocol):
+ """Abstract tokenizer"""
+
+ bos_id: int
+ eos_id: int
+ pad_id: int
+
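+    # Tokenizer is a typing.Protocol: concrete tokenizers only need to be structurally
+    # compatible with these methods; they do not have to inherit from this class.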
+ def encode(self, text: str, **kwargs) -> List[int]:
+ """
+        Given a string, return a list of token ids.
+ """
+
+    def decode(self, token_ids: List[int], **kwargs) -> str:
+ """
+ Given a list of token ids, return the decoded text.
+ """
+
+    def tokenize_messages(self, messages: List[Message], **kwargs):
+ """
+ Given a list of messages, return a list of tokens for the concatenated
+ and formatted messages.
+ """
+
+
+def truncate(
+    tokens: List[Union[int, bool]],
+    max_seq_len: int,
+    eos_id: Union[int, bool],
+) -> List[Union[int, bool]]:
+    """
+    Truncate ``tokens`` (token ids or mask values) to at most ``max_seq_len``
+    elements, overwriting the final element with ``eos_id`` if it is not
+    already ``eos_id``.
+    """
+    tokens_truncated = tokens[:max_seq_len]
+    if tokens_truncated[-1] != eos_id:
+        tokens_truncated[-1] = eos_id
+    return tokens_truncated
+
+
+def _split_long_repetitions(s: str, max_consecutive_slice_len: int) -> Iterator[str]:
+ """
+    Split the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+    consecutive whitespace or consecutive non-whitespace characters.
+ """
+ current_slice_len = 0
+ current_slice_is_space = s[0].isspace() if len(s) > 0 else False
+ slice_start = 0
+
+ for i in range(len(s)):
+ is_now_space = s[i].isspace()
+
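+        # A switch between whitespace and non-whitespace starts a new slice.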
+ if current_slice_is_space ^ is_now_space:
+ current_slice_len = 1
+ current_slice_is_space = is_now_space
+ else:
+ current_slice_len += 1
+ if current_slice_len > max_consecutive_slice_len:
+ yield s[slice_start:i]
+ slice_start = i
+ current_slice_len = 1
+ yield s[slice_start:]
diff --git a/torchtune/utils/_checkpointing/_checkpointer_utils.py b/torchtune/utils/_checkpointing/_checkpointer_utils.py
index ec1f4a47cd..a1391028e0 100644
--- a/torchtune/utils/_checkpointing/_checkpointer_utils.py
+++ b/torchtune/utils/_checkpointing/_checkpointer_utils.py
@@ -22,6 +22,7 @@ class ModelType(Enum):
LLAMA2 = "llama2"
MISTRAL = "mistral"
GEMMA = "gemma"
+ LLAMA3 = "llama3"
def get_path(input_dir: Path, filename: str, missing_ok: bool = False) -> Path: