diff --git a/README.md b/README.md index dd39769439..02e71a2992 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,11 @@ ![Recipe Integration Test](https://github.com/pytorch/torchtune/actions/workflows/recipe_test.yaml/badge.svg) [![](https://dcbadge.vercel.app/api/server/4Xsdn8Rr9Q?style=flat)](https://discord.gg/4Xsdn8Rr9Q) +  +  + +**Note: torchtune now supports Llama3! Currently we support the Llama3 8B Model with LoRA, QLoRA and Full fine-tune. Find more details in the [Llama3](#llama3) section!** + # torchtune @@ -40,6 +45,7 @@ torchtune currently supports the following models. | Model | Sizes | |-----------------------------------------------|-----------| +| [Llama3](https://llama.meta.com/llama3) | 8B [[models](torchtune/models/llama3/_model_builders.py), [configs](recipes/configs/llama3/)] | | [Llama2](https://llama.meta.com/llama2/) | 7B, 13B [[models](torchtune/models/llama2/_model_builders.py), [configs](recipes/configs/llama2/)] | | [Mistral](https://huggingface.co/mistralai) | 7B [[model](torchtune/models/mistral/_model_builders.py), [configs](recipes/configs/mistral/)] | | [Gemma](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) | 2B [[model](torchtune/models/gemma/_model_builders.py), [configs](recipes/configs/gemma/)] | @@ -54,8 +60,8 @@ torchtune provides the following fine-tuning recipes. | Training | Fine-tuning Method | |------------------------------------|------------------------------------| -| Distributed Training [1 to 8 GPUs] | Full [[code](recipes/full_finetune_distributed.py), [example](recipes/configs/llama2/7B_full.yaml)], LoRA [[code](recipes/lora_finetune_distributed.py), [example](recipes/configs/llama2/7B_lora.yaml)] | -| Single Device / Low Memory [1 GPU] | Full [[code](recipes/full_finetune_single_device.py), [example](recipes/configs/llama2/7B_full_low_memory.yaml)], LoRA + QLoRA [[code](recipes/lora_finetune_single_device.py), [example](recipes/configs/llama2/7B_qlora_single_device.yaml)] | +| Distributed Training [1 to 8 GPUs] | Full [[code](recipes/full_finetune_distributed.py), [example](recipes/configs/llama3/8B_full.yaml)], LoRA [[code](recipes/lora_finetune_distributed.py), [example](recipes/configs/llama3/8B_lora.yaml)] | +| Single Device / Low Memory [1 GPU] | Full [[code](recipes/full_finetune_single_device.py), [example](recipes/configs/llama3/8B_full_single_device.yaml)], LoRA + QLoRA [[code](recipes/lora_finetune_single_device.py), [example](recipes/configs/llama3/8B_lora_single_device.yaml)] | | Single Device [1 GPU] | DPO [[code](recipes/full_finetune_distributed.py), [example](recipes/configs/llama2/7B_lora_dpo_single_device.yaml)]   @@ -69,14 +75,47 @@ This table captures the minimum memory requirements for our different recipes us | Example HW Resources | Finetuning Method | Config | Model | Peak Memory per GPU |--------------|-------------------|---------|------------|---------------------| -| 1 x RTX 4090 | QLoRA | [qlora_finetune_single_device](recipes/configs/llama2/7B_qlora_single_device.yaml) | Llama-7B | 9.29 GB | -| 2 x RTX 4090 | LoRA | [lora_finetune_distributed](recipes/configs/llama2/7B_lora.yaml) | Llama-7B | 20.95 GB | -| 1 x RTX 4090 | LoRA | [lora_finetune_single_device](recipes/configs/llama2/7B_lora_single_device.yaml) | Llama-7B | 17.18 GB | -| 1 x RTX 4090 | Full finetune | [full_finetune_single_device](recipes/configs/llama2/7B_full_low_memory.yaml) | Llama-7B | 14.97 GB | -| 4 x RTX 4090 | Full finetune | [full_finetune_distributed](recipes/configs/llama2/7B_full.yaml) | Llama-7B 
| 22.9 GB |
+| 1 x RTX 4090 | QLoRA | [qlora_finetune_single_device](recipes/configs/llama2/7B_qlora_single_device.yaml) | Llama2-7B | 8.57 GB |
+| 2 x RTX 4090 | LoRA | [lora_finetune_distributed](recipes/configs/llama2/7B_lora.yaml) | Llama2-7B | 20.95 GB |
+| 1 x RTX 4090 | LoRA | [lora_finetune_single_device](recipes/configs/llama2/7B_lora_single_device.yaml) | Llama2-7B | 17.18 GB |
+| 1 x RTX 4090 | Full finetune | [full_finetune_single_device](recipes/configs/llama2/7B_full_low_memory.yaml) | Llama2-7B | 14.97 GB |
+| 4 x RTX 4090 | Full finetune | [full_finetune_distributed](recipes/configs/llama2/7B_full.yaml) | Llama2-7B | 22.9 GB |
 
 * these are averaged over multiple runs, but there might be some variance based on the setup. We'll update this table regularly.
 
+&nbsp;
+
+## Llama3
+
+torchtune supports fine-tuning for the Llama3 8B model, with support for 70B on the way. We currently support LoRA, QLoRA and full fine-tuning on a single GPU, as well as LoRA and full fine-tuning on multiple devices. For all the details, take a look at our [tutorial](https://pytorch.org/torchtune/main/tutorials/llama3.html).
+
+
+In our initial experiments, QLoRA has a peak allocated memory of ``~9GB`` while LoRA on a single GPU has a peak allocated memory of ``~19GB``. To get started, you can use our default configs to kick off training.
+
+- LoRA on a single GPU
+
+```bash
+tune run lora_finetune_single_device --config llama3/8B_lora_single_device
+```
+
+- QLoRA on a single GPU
+
+```bash
+tune run lora_finetune_single_device --config llama3/8B_qlora_single_device
+```
+
+- LoRA on 2 GPUs
+
+```bash
+tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
+```
+
+- Full fine-tune on 2 GPUs
+
+```bash
+tune run --nproc_per_node 2 full_finetune_distributed --config llama3/8B_full
+```
+
+&nbsp;

diff --git a/docs/source/api_ref_models.rst b/docs/source/api_ref_models.rst
index 2467599b2d..e9a29ede9c 100644
--- a/docs/source/api_ref_models.rst
+++ b/docs/source/api_ref_models.rst
@@ -4,6 +4,25 @@ torchtune.models
 
 .. currentmodule:: torchtune.models
 
+llama3
+------
+
+All models from the `Llama3 family <https://llama.meta.com/llama3/>`_.
+
+.. code-block:: bash
+
+    tune download meta-llama/Meta-Llama-3-8B --hf-token <HF_TOKEN>
+
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    llama3.llama3_8b
+    llama3.lora_llama3_8b
+    llama3.qlora_llama3_8b
+
+
 llama2
 ------
 
@@ -26,6 +45,7 @@ Pre-trained models can be downloaded from the Hugging Face Hub with the followin
     llama2.lora_llama2_13b
     llama2.qlora_llama2_13b
 
+
 mistral
 -------
 
diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst
index 70d545357f..e38926f36f 100644
--- a/docs/source/api_ref_modules.rst
+++ b/docs/source/api_ref_modules.rst
@@ -17,10 +17,18 @@ Modeling Components and Building Blocks
     get_cosine_schedule_with_warmup
     RotaryPositionalEmbeddings
     RMSNorm
-    Tokenizer
     TransformerDecoderLayer
     TransformerDecoder
 
+Tokenizers
+------------------------
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    tokenizers.SentencePieceTokenizer
+    tokenizers.TikTokenTokenizer
 
 PEFT Components
 ---------------
 
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 78340bd769..c55c723634 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -43,6 +43,13 @@ torchtune tutorials.
 
 .. customcardstart::
 
+.. customcarditem::
+   :header: Llama3 in torchtune
+   :card_description: Fine-tuning Llama3 with torchtune
+   :image: _static/img/generic-pytorch-logo.png
+   :link: tutorials/llama3.html
+   :tags: finetuning,llama3
+
 ..
customcarditem:: :header: Finetuning with LoRA in torchtune :card_description: Parameter-efficient finetuning of Llama2 using LoRA @@ -88,6 +95,7 @@ torchtune tutorials. :caption: Tutorials :hidden: + tutorials/llama3 tutorials/lora_finetune tutorials/qlora_finetune tutorials/e2e_flow diff --git a/docs/source/tutorials/first_finetune_tutorial.rst b/docs/source/tutorials/first_finetune_tutorial.rst index dbff6a2d29..072cb2d79c 100644 --- a/docs/source/tutorials/first_finetune_tutorial.rst +++ b/docs/source/tutorials/first_finetune_tutorial.rst @@ -98,6 +98,8 @@ a single device. For a more in-depth discussion on LoRA in torchtune, you can se | +.. _tune_cp_label: + Modifying a config ------------------ YAML configs hold most of the important information needed for running your recipe. diff --git a/docs/source/tutorials/llama3.rst b/docs/source/tutorials/llama3.rst new file mode 100644 index 0000000000..ff1c0120e1 --- /dev/null +++ b/docs/source/tutorials/llama3.rst @@ -0,0 +1,328 @@ +==================== +Llama3 in torchtune +==================== + +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn how to: + + * Download the Llama3-8B weights and tokenizer + * Fine-tune Llama3-8B with LoRA and QLoRA + * Evaluate your fine-tuned Llama3-8B model + * Generate text with your fine-tuned model + * Quantize your model to speed up generation + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + + * Be familiar with :ref:`torchtune` + * Make sure to :ref:`install torchtune` + + +Llama3-8B +---------- + +`Llama3-8B `_ is a new model released by Meta AI that improves upon the performance of the Llama2 family +of models across a `range of different benchmarks `_. +There are a few main changes between Llama2-7B and Llama3-8B models: + +- Llama3-8B uses `grouped-query attention `_ instead of the standard multi-head attention from Llama2-7B +- Llama3-8B has a larger vocab size (128,256 instead of 32,000 from Llama2 models) +- Llama3-8B uses a different tokenizer than Llama2 models (`tiktoken `_ instead of `sentencepiece `_) +- Llama3-8B uses a larger intermediate dimension in its MLP layers than Llama2-7B +- Llama3-8B uses a higher base value to calculate theta in its `rotary positional embeddings `_ + +| + +Getting access to Llama3-8B +--------------------------- + +First, let's download the model from Hugging Face. You will need to follow the instructions +on the `official Meta page `_ to gain access to the model. +Next, make sure you grab your Hugging Face token from `here `_. + + +.. code-block:: bash + + tune download meta-llama/Meta-Llama-3-8B \ + --output-dir \ + --hf-token + +| + +Fine-tuning Llama3-8B in torchtune +---------------------------------- + +torchtune provides `LoRA `_, `QLoRA `_, and full fine-tuning +recipes for fine-tuning Llama3-8B on one or more GPUs. For more on LoRA in torchtune, see our :ref:`LoRA Tutorial `. +For more on QLoRA in torchtune, see our :ref:`QLoRA Tutorial `. + +Let's take a look at how we can fine-tune Llama3-8B with LoRA on a single device using torchtune. In this example, we will fine-tune +for one epoch on a common instruct dataset for illustrative purposes. The basic command for a single-device LoRA fine-tune is + +.. code-block:: bash + + tune run lora_finetune_single_device --config llama3/8B_lora_single_device + +.. note:: + To see a full list of recipes and their corresponding configs, simply run ``tune ls`` from the command line. + +We can also add command-line overrides as needed, e.g. + +.. 
code-block:: bash
+
+    tune run lora_finetune_single_device --config llama3/8B_lora_single_device \
+    checkpointer.checkpoint_dir=<checkpoint_dir> \
+    tokenizer.path=<checkpoint_dir>/tokenizer.model \
+    checkpointer.output_dir=<checkpoint_dir>
+
+This will load the Llama3-8B checkpoint and tokenizer from ``<checkpoint_dir>`` used in the ``tune download`` command above,
+then save a final checkpoint in the same directory following the original format. For more details on the
+checkpoint formats supported in torchtune, see our :ref:`checkpointing deep-dive `.
+
+.. note::
+    To see the full set of configurable parameters for this config (and others), we can use ``tune cp`` to copy (and modify)
+    the default config. ``tune cp`` can be used with recipe scripts too, in case you want to make more custom changes
+    that cannot be achieved by directly modifying existing configurable parameters. For more on ``tune cp`` see the section on
+    :ref:`modifying configs <tune_cp_label>`.
+
+Once training is complete, the model checkpoints will be saved and their locations will be logged. For
+LoRA fine-tuning, the final checkpoint will contain the merged weights, and a copy of just the (much smaller) LoRA weights
+will be saved separately.
+
+In our experiments, we observed a peak memory usage of 18.5 GB, so the default config can be trained on a consumer GPU with 24 GB of VRAM.
+
+If you have multiple GPUs available, you can run the distributed version of the recipe.
+torchtune makes use of the `FSDP <https://pytorch.org/docs/stable/fsdp.html>`_ APIs from PyTorch Distributed
+to shard the model, optimizer states, and gradients. This should enable you to increase your batch size, resulting in faster training.
+For example, on two devices:
+
+.. code-block:: bash
+
+    tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora
+
+Finally, if we want to use even less memory, we can leverage torchtune's QLoRA recipe via:
+
+.. code-block:: bash
+
+    tune run lora_finetune_single_device --config llama3/8B_qlora_single_device
+
+Since our default configs enable full bfloat16 training, all of the above commands can be run with
+devices having at least 24 GB of VRAM, and in fact the QLoRA recipe should have peak allocated memory
+below 10 GB. You can also experiment with different configurations of LoRA and QLoRA, or even run a full fine-tune.
+Try it out!
+
+|
+
+Evaluating fine-tuned Llama3-8B models with EleutherAI's Eval Harness
+---------------------------------------------------------------------
+
+Now that we've fine-tuned Llama3-8B, what's next? Let's take our LoRA-finetuned model from the
+preceding section and look at a couple of different ways we can evaluate its performance on the tasks we care about.
+
+First, torchtune provides an integration with
+`EleutherAI's evaluation harness <https://github.com/EleutherAI/lm-evaluation-harness>`_
+for model evaluation on common benchmark tasks.
+
+.. note::
+    Make sure you've first installed the evaluation harness via :code:`pip install "lm_eval==0.4.*"`.
+
+For this tutorial we'll use the ``truthfulqa_mc2`` task from the harness.
+This task measures a model's propensity to be truthful when answering questions and
+measures the model's zero-shot accuracy on a question followed by one or more true
+responses and one or more false responses. First, let's copy the config so we can point the YAML
+file to our fine-tuned checkpoint files.
+
+.. code-block:: bash
+
+    tune cp eleuther_evaluation ./custom_eval_config.yaml
+
+Next, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoints.
+
+..
code-block:: yaml + + checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + + # directory with the checkpoint files + # this should match the output_dir specified during + # fine-tuning + checkpoint_dir: + + # checkpoint files for the fine-tuned model. These will be logged + # at the end of your fine-tune + checkpoint_files: [ + consolidated.00.pth + ] + + output_dir: + model_type: LLAMA3 + + # Make sure to update the tokenizer path to the right + # checkpoint directory as well + tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tokenizer.model + +Finally, we can run evaluation using our modified config. + +.. code-block:: bash + + tune run eleuther_eval --config ./custom_eval_config.yaml + +Try it for yourself and see what accuracy your model gets! + +| + +Generating text with our fine-tuned Llama3-8B model +--------------------------------------------------- + +Next, let's look at one other way we can evaluate our model: generating text! torchtune provides a +`recipe for generation `_ as well. + +Similar to what we did, let's copy and modify the default generation config. + +.. code-block:: bash + + tune cp generation ./custom_generation_config.yaml + +Now we modify ``custom_generation_config.yaml`` to point to our checkpoint and tokenizer. + +.. code-block:: yaml + + checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + + # directory with the checkpoint files + # this should match the output_dir specified during + # fine-tuning + checkpoint_dir: + + # checkpoint files for the fine-tuned model. These will be logged + # at the end of your fine-tune + checkpoint_files: [ + consolidated.00.pth + ] + + output_dir: + model_type: LLAMA3 + + # Make sure to update the tokenizer path to the right + # checkpoint directory as well + tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tokenizer.model + +Running generation with our LoRA-finetuned model, we see the following output: + +.. code-block:: bash + + tune run generate --config ./custom_generation_config.yaml \ + prompt="Hello, my name is" + + [generate.py:122] Hello, my name is Sarah and I am a busy working mum of two young children, living in the North East of England. + ... + [generate.py:135] Time for inference: 10.88 sec total, 18.94 tokens/sec + [generate.py:138] Bandwidth achieved: 346.09 GB/s + [generate.py:139] Memory used: 18.31 GB + +Faster generation via quantization +---------------------------------- + +We can see that the model took just under 11 seconds, generating almost 19 tokens per second. +We can speed this up a bit by quantizing our model. Here we'll use 4-bit weights-only quantization +as provided by `torchao `_. + +If you've been following along this far, you know the drill by now. +Let's copy the quantization config and point it at our fine-tuned model. + +.. code-block:: bash + + tune cp quantization ./custom_quantization_config.yaml + +And update ``custom_quantization_config.yaml`` with the following: + +.. code-block:: yaml + + checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + + # directory with the checkpoint files + # this should match the output_dir specified during + # fine-tuning + checkpoint_dir: + + # checkpoint files for the fine-tuned model. These will be logged + # at the end of your fine-tune + checkpoint_files: [ + consolidated.00.pth + ] + + output_dir: + model_type: LLAMA3 + +To quantize the model, we can now run: + +.. 
code-block:: bash
+
+    tune run quantize --config ./custom_quantization_config.yaml
+
+    [quantize.py:90] Time for quantization: 2.93 sec
+    [quantize.py:91] Memory used: 23.13 GB
+    [quantize.py:104] Model checkpoint of size 4.92 GB saved to /tmp/Llama-3-8B-hf/meta_model_0-4w.pt
+
+We can see that the model is now under 5 GB, or about five bits for each of the 8B parameters.
+
+.. note::
+    Unlike the fine-tuned checkpoints, the quantization recipe outputs a single checkpoint file. This is
+    because our quantization APIs currently don't support any conversion across formats.
+    As a result you won't be able to use these quantized models outside of torchtune.
+    But you should be able to use these with the generation and evaluation recipes within
+    torchtune. These results will help inform which quantization methods you should use
+    with your favorite inference engine.
+
+Let's take our quantized model and run the same generation again.
+First, we'll make one more change to our ``custom_generation_config.yaml``.
+
+.. code-block:: yaml
+
+    checkpointer:
+        # we need to use the custom TorchTune checkpointer
+        # instead of the Meta checkpointer for loading
+        # quantized models
+        _component_: torchtune.utils.FullModelTorchTuneCheckpointer
+
+        # directory with the checkpoint files
+        # this should match the output_dir specified during
+        # fine-tuning
+        checkpoint_dir: <checkpoint_dir>
+
+        # checkpoint files point to the quantized model
+        checkpoint_files: [
+            meta_model_0-4w.pt,
+        ]
+
+        output_dir: <output_dir>
+        model_type: LLAMA3
+
+    # we also need to update the quantizer to what was used during
+    # quantization
+    quantizer:
+        _component_: torchtune.utils.quantization.Int4WeightOnlyQuantizer
+        groupsize: 256
+
+Let's re-run generation!
+
+.. code-block:: bash
+
+    tune run generate --config ./custom_generation_config.yaml \
+    prompt="Hello, my name is"
+
+    [generate.py:122] Hello, my name is Jake.
+    I am a multi-disciplined artist with a passion for creating, drawing and painting.
+    ...
+    Time for inference: 1.62 sec total, 57.95 tokens/sec
+
+By quantizing the model and running ``torch.compile`` we get over a 3x speedup!
+
+This is just the beginning of what you can do with Llama3-8B using torchtune and the broader ecosystem.
+We look forward to seeing what you build!
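A quick way to sanity-check the pieces in the tutorial above before launching a full run is to load the new Llama3 tokenizer directly in Python. The sketch below is illustrative only and is not part of this patch: it assumes the ``llama3_tokenizer`` builder accepts the ``path`` argument shown in the configs and exposes ``encode``/``decode`` methods analogous to the existing SentencePiece tokenizer; adjust if the actual ``TikTokenTokenizer`` API differs.

```python
# Hypothetical sanity check -- not part of this patch.
# Assumes llama3_tokenizer(path=...) plus encode()/decode() methods that
# mirror the SentencePieceTokenizer interface; the real TikTokenTokenizer
# signatures may differ slightly.
from torchtune.models.llama3 import llama3_tokenizer

# Same path used by the llama3 configs added in this diff.
tokenizer = llama3_tokenizer(path="/tmp/Meta-Llama-3-8B/original/tokenizer.model")

prompt = "Hello, my name is"
token_ids = tokenizer.encode(prompt, add_bos=True, add_eos=False)
print(f"{len(token_ids)} tokens: {token_ids}")

# Round-trip back to text (special tokens aside) to confirm that the
# tokenizer.model file is the one the fine-tuning configs expect.
print(tokenizer.decode(token_ids))
```

If this loads and round-trips the prompt, the ``tokenizer.path`` entries in the configs point at a usable tokenizer file and the fine-tuning, evaluation, and generation commands above should find it without further changes.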
diff --git a/pyproject.toml b/pyproject.toml index 72f17d1cb3..158a538f3a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,8 +15,12 @@ dependencies = [ "huggingface_hub", "safetensors", - # Miscellaneous + # Tokenization "sentencepiece", + "tiktoken", + "blobfile>=2", + + # Miscellaneous "tqdm", "omegaconf", @@ -35,7 +39,7 @@ tune = "torchtune._cli.tune:main" [project.optional-dependencies] dev = [ - "bitsandbytes", + "bitsandbytes>=0.43.0", "pre-commit", "pytest", "pytest-cov", diff --git a/recipes/configs/generation.yaml b/recipes/configs/generation.yaml index 6cd9c1ba87..96a54d3e5c 100644 --- a/recipes/configs/generation.yaml +++ b/recipes/configs/generation.yaml @@ -30,7 +30,7 @@ tokenizer: # Generation arguments; defaults taken from gpt-fast prompt: "Hello, my name is" max_new_tokens: 300 -temperature: 0.8 +temperature: 0.6 # 0.8 and 0.6 are popular values to try top_k: 300 quantizer: null diff --git a/recipes/configs/llama3/8B_full.yaml b/recipes/configs/llama3/8B_full.yaml new file mode 100644 index 0000000000..d2d060d269 --- /dev/null +++ b/recipes/configs/llama3/8B_full.yaml @@ -0,0 +1,77 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# To launch on 4 devices, run the following command from root: +# tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. 
It's +# best to use 8B_full_single_device.yaml for those cases + + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset + train_on_input: True +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.llama3.llama3_8b + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 3 + +optimizer: + _component_: torch.optim.AdamW + lr: 2e-5 + foreach: False + +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 + + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/alpaca-llama3-finetune +log_every_n_steps: null diff --git a/recipes/configs/llama3/8B_full_single_device.yaml b/recipes/configs/llama3/8B_full_single_device.yaml new file mode 100644 index 0000000000..1ecc5e7b61 --- /dev/null +++ b/recipes/configs/llama3/8B_full_single_device.yaml @@ -0,0 +1,77 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config llama3/8B_full_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config llama3/8B_full_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset + train_on_input: True +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.llama3.llama3_8b + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 3 +optimizer: + _component_: bitsandbytes.optim.AdamW8bit + lr: 2e-5 +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +optimizer_in_bwd: True +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/alpaca-llama3-finetune +log_every_n_steps: null diff --git a/recipes/configs/llama3/8B_lora.yaml b/recipes/configs/llama3/8B_lora.yaml new file mode 100644 index 0000000000..3b8479a823 --- /dev/null +++ b/recipes/configs/llama3/8B_lora.yaml @@ -0,0 +1,80 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# To launch on 2 devices, run the following command from root: +# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nproc_per_node 2 lora_finetune_distributed --config llama3/8B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 8B_lora_single_device.yaml +# or 8B_qlora_single_device.yaml + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +# Model Arguments +model: + _component_: torchtune.models.llama3.lora_llama3_8b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + train_on_input: True +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 + +# Logging +output_dir: /tmp/lora_finetune_output +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: null + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False diff --git a/recipes/configs/llama3/8B_lora_single_device.yaml b/recipes/configs/llama3/8B_lora_single_device.yaml new file mode 100644 index 0000000000..b6b33466ca --- /dev/null +++ b/recipes/configs/llama3/8B_lora_single_device.yaml @@ -0,0 +1,85 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config llama3/8B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config llama3/8B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ + +# Model Arguments +model: + _component_: torchtune.models.llama3.lora_llama3_8b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + train_on_input: True +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/lora_finetune_output +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: null + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Profiler (disabled) +profiler: + _component_: torchtune.utils.profiler + enabled: False diff --git a/recipes/configs/llama3/8B_qlora_single_device.yaml b/recipes/configs/llama3/8B_qlora_single_device.yaml new file mode 100644 index 0000000000..a951b9d660 --- /dev/null +++ b/recipes/configs/llama3/8B_qlora_single_device.yaml @@ -0,0 +1,86 @@ +# Config for single device QLoRA with lora_finetune_single_device.py +# using a Llama3 8B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download meta-llama/Meta-Llama-3-8B --output-dir /tmp/Meta-Llama-3-8B --hf-token +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config llama3/8B_qlora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config llama3/8B_qlora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ +# Model Arguments +model: + _component_: torchtune.models.llama3.qlora_llama3_8b + lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Meta-Llama-3-8B/original/tokenizer.model + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/Meta-Llama-3-8B/original/ + checkpoint_files: [ + consolidated.00.pth + ] + recipe_checkpoint: null + output_dir: /tmp/Meta-Llama-3-8B/ + model_type: LLAMA3 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset + train_on_input: True +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +# Note: compile for QLoRA is only supported on nightly +# PyTorch (>= 2.4.0.dev20240408) +compile: False + +# Logging +output_dir: /tmp/qlora_finetune_output/ +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Profiler (disabled) +profiler: + _component_: torchtune.utils.profiler + enabled: False diff --git a/recipes/eleuther_eval.py b/recipes/eleuther_eval.py index 81c0253f35..c6911886fa 100644 --- a/recipes/eleuther_eval.py +++ b/recipes/eleuther_eval.py @@ -15,7 +15,8 @@ from torch import nn from torchtune import config, utils -from torchtune.modules import Tokenizer, TransformerDecoder +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import Tokenizer from torchtune.recipe_interfaces import EvalRecipeInterface diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 95aedacc17..8ea06343aa 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -396,7 +396,6 @@ def train(self) -> None: == self.max_steps_per_epoch ): break - input_ids, labels = batch input_ids = input_ids.to(self._device) labels = labels.to(self._device) diff --git a/tests/assets/tiktoken_small.model b/tests/assets/tiktoken_small.model new file mode 100644 index 0000000000..4bfad62542 --- /dev/null +++ b/tests/assets/tiktoken_small.model @@ -0,0 +1,2000 @@ +AA== 0 +AQ== 1 +Ag== 2 +Aw== 3 +BA== 4 +BQ== 5 +Bg== 6 +Bw== 7 +CA== 8 +CQ== 9 +Cg== 10 +Cw== 11 +DA== 12 +DQ== 13 +Dg== 14 +Dw== 15 +EA== 16 +EQ== 17 +Eg== 18 +Ew== 19 +FA== 20 +FQ== 21 +Fg== 22 +Fw== 23 +GA== 24 +GQ== 25 +Gg== 26 +Gw== 27 +HA== 28 +HQ== 29 +Hg== 30 +Hw== 31 +IA== 32 +IQ== 33 +Ig== 34 +Iw== 35 +JA== 36 +JQ== 37 +Jg== 38 +Jw== 39 +KA== 40 +KQ== 41 +Kg== 42 +Kw== 43 +LA== 44 +LQ== 45 +Lg== 46 +Lw== 47 +MA== 48 +MQ== 49 +Mg== 50 +Mw== 51 +NA== 52 +NQ== 53 +Ng== 54 +Nw== 55 +OA== 56 +OQ== 57 +Og== 58 +Ow== 59 +PA== 60 +PQ== 61 +Pg== 62 +Pw== 63 +QA== 64 +QQ== 65 +Qg== 66 +Qw== 67 +RA== 68 +RQ== 69 +Rg== 70 +Rw== 71 +SA== 72 +SQ== 73 +Sg== 74 +Sw== 75 +TA== 76 +TQ== 77 +Tg== 78 +Tw== 79 +UA== 80 +UQ== 81 +Ug== 82 +Uw== 83 +VA== 84 +VQ== 85 +Vg== 86 +Vw== 87 +WA== 88 +WQ== 89 +Wg== 90 +Ww== 91 +XA== 92 +XQ== 93 +Xg== 94 +Xw== 95 +YA== 96 
+YQ== 97 +Yg== 98 +Yw== 99 +ZA== 100 +ZQ== 101 +Zg== 102 +Zw== 103 +aA== 104 +aQ== 105 +ag== 106 +aw== 107 +bA== 108 +bQ== 109 +bg== 110 +bw== 111 +cA== 112 +cQ== 113 +cg== 114 +cw== 115 +dA== 116 +dQ== 117 +dg== 118 +dw== 119 +eA== 120 +eQ== 121 +eg== 122 +ew== 123 +fA== 124 +fQ== 125 +fg== 126 +fw== 127 +gA== 128 +gQ== 129 +gg== 130 +gw== 131 +hA== 132 +hQ== 133 +hg== 134 +hw== 135 +iA== 136 +iQ== 137 +ig== 138 +iw== 139 +jA== 140 +jQ== 141 +jg== 142 +jw== 143 +kA== 144 +kQ== 145 +kg== 146 +kw== 147 +lA== 148 +lQ== 149 +lg== 150 +lw== 151 +mA== 152 +mQ== 153 +mg== 154 +mw== 155 +nA== 156 +nQ== 157 +ng== 158 +nw== 159 +oA== 160 +oQ== 161 +og== 162 +ow== 163 +pA== 164 +pQ== 165 +pg== 166 +pw== 167 +qA== 168 +qQ== 169 +qg== 170 +qw== 171 +rA== 172 +rQ== 173 +rg== 174 +rw== 175 +sA== 176 +sQ== 177 +sg== 178 +sw== 179 +tA== 180 +tQ== 181 +tg== 182 +tw== 183 +uA== 184 +uQ== 185 +ug== 186 +uw== 187 +vA== 188 +vQ== 189 +vg== 190 +vw== 191 +wA== 192 +wQ== 193 +wg== 194 +ww== 195 +xA== 196 +xQ== 197 +xg== 198 +xw== 199 +yA== 200 +yQ== 201 +yg== 202 +yw== 203 +zA== 204 +zQ== 205 +zg== 206 +zw== 207 +0A== 208 +0Q== 209 +0g== 210 +0w== 211 +1A== 212 +1Q== 213 +1g== 214 +1w== 215 +2A== 216 +2Q== 217 +2g== 218 +2w== 219 +3A== 220 +3Q== 221 +3g== 222 +3w== 223 +4A== 224 +4Q== 225 +4g== 226 +4w== 227 +5A== 228 +5Q== 229 +5g== 230 +5w== 231 +6A== 232 +6Q== 233 +6g== 234 +6w== 235 +7A== 236 +7Q== 237 +7g== 238 +7w== 239 +8A== 240 +8Q== 241 +8g== 242 +8w== 243 +9A== 244 +9Q== 245 +9g== 246 +9w== 247 ++A== 248 ++Q== 249 ++g== 250 ++w== 251 +/A== 252 +/Q== 253 +/g== 254 +/w== 255 +IHQ= 256 +aGU= 257 +IGE= 258 +aW4= 259 +IHM= 260 +IHc= 261 +IHRoZQ== 262 +IG8= 263 +cmU= 264 +IGI= 265 +b3U= 266 +ZWQ= 267 +IG0= 268 +bmQ= 269 +IEk= 270 +aGE= 271 +aXQ= 272 +ZXI= 273 +aW5n 274 +IGY= 275 +aXM= 276 +IHRv 277 +ZW4= 278 +b24= 279 +b3I= 280 +YXM= 281 +IGM= 282 +IG9m 283 +IGFuZA== 284 +IGQ= 285 +bGw= 286 +YXQ= 287 +YW4= 288 +YXI= 289 +IHA= 290 +IG4= 291 +IGlu 292 +bGU= 293 +b20= 294 +b3Q= 295 +IGJl 296 +IGg= 297 +dXQ= 298 +b3c= 299 +ZXM= 300 +aGF0 301 +IGc= 302 +IGhl 303 +IGhh 304 +IGw= 305 +IHdhcw== 306 +bGQ= 307 +Z2g= 308 +aWQ= 309 +Y2g= 310 +IHRo 311 +IGl0 312 +YXk= 313 +IG9u 314 +Y2U= 315 +c2U= 316 +ZW50 317 +IHN0 318 +bHk= 319 +dmU= 320 +ZXQ= 321 +c3Q= 322 +IFQ= 323 +IGU= 324 +IHk= 325 +Z2h0 326 +aXI= 327 +IG1l 328 +b28= 329 +YWw= 330 +aXRo 331 +IHJl 332 +aW0= 333 +IHRoYXQ= 334 +IGFz 335 +b3VsZA== 336 +cm8= 337 +YWQ= 338 +aW9u 339 +Lgo= 340 +aGVy 341 +IG15 342 +Y3Q= 343 +IG5vdA== 344 +IHdpdGg= 345 +IGZvcg== 346 +IHU= 347 +a2U= 348 +IHlvdQ== 349 +IFM= 350 +IGlz 351 +aWdodA== 352 +Igo= 353 +YW0= 354 +aWM= 355 +dXI= 356 +IGF0 357 +Li4= 358 +YWM= 359 +dGVy 360 +IHdo 361 +IGFu 362 +IHdl 363 +IFRoZQ== 364 +aWY= 365 +IG9y 366 +IGJ1dA== 367 +dmVy 368 +ICI= 369 +IHI= 370 +b3V0 371 +b21l 372 +IGhhZA== 373 +cHA= 374 +cXU= 375 +IHN1 376 +IHRoaXM= 377 +cmVk 378 +YXJk 379 +IHNv 380 +ZWxs 381 +IHdvdWxk 382 +IGhpcw== 383 +IHNo 384 +aW5l 385 +cmE= 386 +IHNl 387 +IGJ5 388 +LiIK 389 +IFA= 390 +aGVu 391 +IEE= 392 +IGhhdmU= 393 +IGZy 394 +IHNh 395 +IEg= 396 +IG9uZQ== 397 +ZW0= 398 +a2Vk 399 +aXJ0 400 +ZWN0 401 +IGhpbQ== 402 +IGxp 403 +IGFi 404 +YXRpb24= 405 +aGluZw== 406 +dGhl 407 +IFI= 408 +IGxl 409 +c3M= 410 +IFc= 411 +Y3U= 412 +aWxs 413 +J3Q= 414 +YXJ0 415 +YWxs 416 +LAo= 417 +b3du 418 +b3Jl 419 +IGFsbA== 420 +IGs= 421 +IGdv 422 +aGlydA== 423 +YW5k 424 +IG91dA== 425 +YW1l 426 +YWlu 427 +IGlm 428 +IG5v 429 +IGRv 430 +IHRoZXk= 431 +b29s 432 +dW4= 433 +dG8= 434 +IHVw 435 +IFJlZA== 436 +IG5l 437 +IEs= 438 +IGZyb20= 439 +IFNoaXJ0 440 
+IHdvcg== 441 +b25n 442 +IHRoZXJl 443 +IHNhaWQ= 444 +cmk= 445 +YW50 446 +IEI= 447 +IGFueQ== 448 +dWQ= 449 +aW5k 450 +IHdoaQ== 451 +YWI= 452 +b3VuZA== 453 +IGFib3V0 454 +IHRoZW0= 455 +Y3Vw 456 +YWs= 457 +IGRl 458 +IHRl 459 +IE0= 460 +YWtl 461 +Y3VwaW5l 462 +aWc= 463 +IHdlcmU= 464 +b3JjdXBpbmU= 465 +aWw= 466 +Y2hvb2w= 467 +IHJv 468 +b29k 469 +IGFyZQ== 470 +aXZl 471 +IGxpa2U= 472 +eW8= 473 +IGhvdQ== 474 +J3M= 475 +b25l 476 +dXM= 477 +ZWw= 478 +dWw= 479 +YWNr 480 +b3A= 481 +LCI= 482 +dGg= 483 +YWNoZXI= 484 +dW0= 485 +YW5n 486 +IGZh 487 +YWc= 488 +IHNjaG9vbA== 489 +IGo= 490 +dGU= 491 +b2s= 492 +ZXNz 493 +dXN0 494 +ZXJz 495 +Li4uLg== 496 +IEM= 497 +dGhlcg== 498 +aGFu 499 +IHdoZW4= 500 +IHNw 501 +IG1hbg== 502 +IGNhbg== 503 +b3VnaA== 504 +IHdobw== 505 +IGdldA== 506 +IGRpZA== 507 +IHBv 508 +Y2k= 509 +IGFs 510 +aXN0 511 +IGNvbQ== 512 +bGY= 513 +YXU= 514 +IFBvcmN1cGluZQ== 515 +IHdoaWNo 516 +dmVu 517 +IGFm 518 +d24= 519 +YXNz 520 +YmVy 521 +IGV4 522 +b3Vz 523 +ZXN0 524 +bG8= 525 +IHRy 526 +ZWxsb3c= 527 +IHNheQ== 528 +b3VnaHQ= 529 +IHJvb20= 530 +IHNvbWU= 531 +LS0= 532 +IE8= 533 +YXRl 534 +IHY= 535 +aGVk 536 +YXA= 537 +IHR3 538 +IGJlYw== 539 +cmVl 540 +amVjdA== 541 +a3M= 542 +IGNvbg== 543 +IGJlZW4= 544 +ZW50cw== 545 +aWRl 546 +IGNvdWxk 547 +IEc= 548 +ZXA= 549 +IHBybw== 550 +bnQ= 551 +IGhvdXNl 552 +IGFn 553 +IElm 554 +IGtu 555 +IGZlbGxvdw== 556 +IHdoYXQ= 557 +d2F5 558 +aXNo 559 +IGFt 560 +aXRl 561 +bmRlcg== 562 +aW1l 563 +IHBy 564 +IHRlYWNoZXI= 565 +YXJl 566 +IGJv 567 +IHNoZQ== 568 +IE4= 569 +aWNl 570 +YXN0 571 +dXJl 572 +aWU= 573 +IHN1Y2g= 574 +dXRlbg== 575 +dXRlbmJlcg== 576 +dXRlbmJlcmc= 577 +IHF1 578 +bG93bg== 579 +IHdy 580 +cHQ= 581 +IEhl 582 +IHN0dWQ= 583 +aGVyZQ== 584 +IG1vcmU= 585 +cnk= 586 +dHRlcg== 587 +IFk= 588 +IG1heQ== 589 +aXR5 590 +IGxvbw== 591 +IG90aGVy 592 +aGlz 593 +IFBybw== 594 +IHdpbGw= 595 +IEl0 596 +b3J0 597 +IHNob3VsZA== 598 +dmVyeQ== 599 +d2U= 600 +IHBs 601 +YXNo 602 +LiI= 603 +IGFwcA== 604 +IGRheQ== 605 +dXJu 606 +cG8= 607 +IGhlcg== 608 +ICA= 609 +bm90 610 +Y2s= 611 +IHVu 612 +aGk= 613 +dmluZw== 614 +IG9sZA== 615 +IHRpbWU= 616 +IlQ= 617 +IHdheQ== 618 +YWJsZQ== 619 +PyIK 620 +IENsb3du 621 +IG9ubHk= 622 +dWI= 623 +YWNo 624 +IG9mZg== 625 +IHRoYW4= 626 +YWxseQ== 627 +IHRoZWly 628 +YmU= 629 +a2luZw== 630 +b3RoZXI= 631 +YXJ5 632 +YW5z 633 +YXRlZA== 634 +c2VsZg== 635 +IGdvaW5n 636 +dWNo 637 +b2xs 638 +IGJhY2s= 639 +aXlv 640 +LXQ= 641 +YW5jZQ== 642 +YWRl 643 +IFByb2plY3Q= 644 +c3A= 645 +IHR3bw== 646 +IHRob3VnaHQ= 647 +c28= 648 +IHJpZ2h0 649 +IGhlYWQ= 650 +dmVk 651 +IEQ= 652 +IHByZQ== 653 +IHNlZQ== 654 +IHVz 655 +IHN0dWRlbnRz 656 +Y2lw 657 +IGRvbg== 658 +IG5pZ2h0 659 +aW5jaXA= 660 +IEtpeW8= 661 +cGw= 662 +YXJlZA== 663 +IEd1dGVuYmVyZw== 664 +IGNv 665 +IGhvdw== 666 +b21ldA== 667 +ZmY= 668 +Ikk= 669 +LC0t 670 +IGFza2Vk 671 +aW5jaXBhbA== 672 +ZXZlcg== 673 +IGFj 674 +IEY= 675 +IG1ha2U= 676 +aXR0 677 +IG1pZ2h0 678 +Z2U= 679 +bGVk 680 +IGFmdGVy 681 +aWdu 682 +IGdy 683 +IG1hZGU= 684 +ZGQ= 685 +IGtub3c= 686 +IGNvbWU= 687 +IGJy 688 +dGhpbmc= 689 +IEJ1dA== 690 +IG1hdA== 691 +IE9u 692 +b3J5 693 +Y2w= 694 +IEU= 695 +Ymxl 696 +b2c= 697 +IHlvdXI= 698 +dWxs 699 +IHdvcms= 700 +ZWFy 701 +IHRocmVl 702 +aWVk 703 +YnV0 704 +VGhl 705 +cGU= 706 +YWNl 707 +IHN0YXJ0 708 +aWNr 709 +IG92ZXI= 710 +b3Vy 711 +IG11Y2g= 712 +IHdhbnQ= 713 +aW1w 714 +IHBhcnQ= 715 +aG8= 716 +aW5r 717 +ZW5jZQ== 718 +IGRvd24= 719 +IGV2ZW4= 720 +IHByaW5jaXBhbA== 721 +bGluZw== 722 +b3VudA== 723 +YXVzZQ== 724 +IGNs 725 +IGJs 726 +LXRt 727 +b21ldGhpbmc= 728 +IGludG8= 729 +b3Jt 730 +b2t5bw== 731 +IGRpcw== 732 +IGZl 733 +IGZhY2U= 734 +Li4uLi4u 735 
+cmVzcw== 736 +bWVudA== 737 +aXJl 738 +IGFy 739 +dHk= 740 +IG1v 741 +cmVhdA== 742 +IGZpcg== 743 +cGVy 744 +IG91cg== 745 +Y28= 746 +IHRoZW4= 747 +IHRvbGQ= 748 +aW5ncw== 749 +IHRha2U= 750 +IGJlZw== 751 +bmVy 752 +aXRpb24= 753 +b3Nl 754 +IG93bg== 755 +IGFnYWlu 756 +IHNlZW0= 757 +aXNl 758 +IHdhdA== 759 +Ilc= 760 +IGZhcg== 761 +YWtpbmc= 762 +Zm9yZQ== 763 +YWR5 764 +LXM= 765 +bGVzcw== 766 +IHJldA== 767 +IHNoYQ== 768 +IGNhbWU= 769 +Z2Vy 770 +IGdvb2Q= 771 +YXRoZXI= 772 +YXJr 773 +cm93 774 +IGtl 775 +J20= 776 +IGhhcw== 777 +YXRo 778 +cHBlZA== 779 +IHdlbnQ= 780 +IHRlbGw= 781 +cXVhc2g= 782 +IGVu 783 +IGZpcnN0 784 +IGhvdA== 785 +aXo= 786 +IGF3YXk= 787 +IHNvbWV0aGluZw== 788 +IHJlbQ== 789 +IHRvd24= 790 +IHNt 791 +IFRoaXM= 792 +IGJldHRlcg== 793 +IFRoZW4= 794 +d2Fz 795 +b2Y= 796 +YmFyZA== 797 +IEw= 798 +bGk= 799 +ZmU= 800 +IFRva3lv 801 +IGxvbmc= 802 +aWx5 803 +IHN1cmU= 804 +IGxvb2tlZA== 805 +dWJiYXJk 806 +Y3Rpb24= 807 +b3Jk 808 +IG1hbnk= 809 +aW91cw== 810 +IHRvbw== 811 +IGhlcmU= 812 +b3M= 813 +IHVuZGVy 814 +YXNl 815 +bmc= 816 +cGVk 817 +b2Q= 818 +bWU= 819 +IGp1c3Q= 820 +IG5vdw== 821 +aW5jZQ== 822 +IGhlYXJk 823 +IGtpbmQ= 824 +IFRoZXk= 825 +IGJlZm9yZQ== 826 +aHk= 827 +IElu 828 +IGVudA== 829 +IGJvYXJk 830 +ISI= 831 +d2FyZA== 832 +IGJlaW5n 833 +IHdlbGw= 834 +ZXJt 835 +cmllZA== 836 +IHdyb25n 837 +YWlk 838 +eHQ= 839 +IHJldHVybg== 840 +aXRlZA== 841 +IHllbg== 842 +IG1hdHRlcg== 843 +IGNhbGw= 844 +IHRhbA== 845 +IFlvdQ== 846 +Y2Vk 847 +aXNlZA== 848 +IGNoYQ== 849 +b25z 850 +IHNhbWU= 851 +IG9uY2U= 852 +ZGF5 853 +ZnQ= 854 +IHN3 855 +IGJlY2F1c2U= 856 +IHRoaW5r 857 +IHdoZXJl 858 +IE5v 859 +IEh1YmJhcmQ= 860 +IFNxdWFzaA== 861 +IGNvcA== 862 +d2l0aA== 863 +ZXJlZA== 864 +b2xsb3c= 865 +IHBsYWNl 866 +aWRk 867 +Y2Vzcw== 868 +IHNob3c= 869 +aXNoYQ== 870 +IHJh 871 +IGxldHRlcg== 872 +bmU= 873 +dmVz 874 +YXRpbmc= 875 +cmFuZw== 876 +IGFmZg== 877 +IGhhbmQ= 878 +IHNj 879 +IHBlcnM= 880 +aW50 881 +cHI= 882 +c2lkZQ== 883 +ZnRlcg== 884 +IHNheWluZw== 885 +IGxhdQ== 886 +dGhhdA== 887 +IHdpdGhvdXQ= 888 +cm9u 889 +YWly 890 +bGVjdA== 891 +IFdoYXQ= 892 +ZWx0 893 +IHdoaWxl 894 +b2dh 895 +YXBlcg== 896 +IHBl 897 +b3k= 898 +IHNhdA== 899 +aWVz 900 +IGFkZA== 901 +IGRheXM= 902 +IHNwZQ== 903 +IGhv 904 +IGFucw== 905 +IGhhcg== 906 +IFdoZW4= 907 +IGFueXRoaW5n 908 +cGVu 909 +XQo= 910 +dGFpbg== 911 +IG11c3Q= 912 +IG5ldw== 913 +bGlj 914 +IHZv 915 +aGlsZQ== 916 +Z2V0 917 +IEFz 918 +IHZlcnk= 919 +J3Jl 920 +IGV2ZXJ5 921 +YXZl 922 +PyI= 923 +YWRnZXI= 924 +IEtvZ2E= 925 +IE1y 926 +cm91Z2g= 927 +dWx0 928 +IGZvbGxvdw== 929 +dGluZw== 930 +aWZl 931 +aWRkbGU= 932 +ZnVs 933 +YW5r 934 +IFNv 935 +IHNlZW1lZA== 936 +IEFuZA== 937 +aXg= 938 +IHNldA== 939 +IGNhcmU= 940 +IHJlcw== 941 +IG5ldmVy 942 +IGZvdW5k 943 +IGxv 944 +Y2lk 945 +aW5lZA== 946 +IGNsYXNz 947 +IG15c2VsZg== 948 +YXc= 949 +IHdvbQ== 950 +YXRpb25z 951 +IGxlZnQ= 952 +IFdl 953 +IHRlYWNoZXJz 954 +Ilk= 955 +bmE= 956 +b250 957 +IGRlcw== 958 +IHRob3Nl 959 +aXJlZA== 960 +IHNlbg== 961 +eWluZw== 962 +IHRoZXNl 963 +YXo= 964 +IFRoZXJl 965 +Y2VwdA== 966 +IGRhbmc= 967 +IFU= 968 +Ikg= 969 +Ym9k 970 +Ym9keQ== 971 +IGhhdmluZw== 972 +YWxhcnk= 973 +IHdhdGNo 974 +IGdpdmU= 975 +YWdl 976 +IGl0cw== 977 +IGFwcGU= 978 +dWU= 979 +IGNvdW50 980 +IGhhcmQ= 981 +IGJlbA== 982 +b3R0 983 +IGRpc3Q= 984 +IlM= 985 +IE1hZA== 986 +LW4= 987 +cmlidXQ= 988 +Z2Vk 989 +IGF0dA== 990 +ZmVyZQ== 991 +aXRoZXI= 992 +IHVwb24= 993 +IHRlbQ== 994 +IHBlcnNvbg== 995 +bmluZw== 996 +IGNoZQ== 997 +YXJseQ== 998 +b25leQ== 999 +IHNvb24= 1000 +ZW1lbnQ= 1001 +ICg= 1002 +IHRyYW5z 1003 +IGV4cA== 1004 +IHNlcg== 1005 +IHJlZw== 1006 +YXNvbg== 1007 +IHNhdw== 1008 +IG5leHQ= 1009 +b290 
1010 +IGhhbGY= 1011 +IHRvb2s= 1012 +IGJhZA== 1013 +IGhvdXI= 1014 +IHNhbGFyeQ== 1015 +IGJlZ2Fu 1016 +cmlnaHQ= 1017 +b25uYQ== 1018 +LXNhbg== 1019 +IHdvcmtz 1020 +IEo= 1021 +Zm9ybQ== 1022 +aWNhbA== 1023 +IHRyYQ== 1024 +bWFu 1025 +IG5vdGhpbmc= 1026 +IHN0aWxs 1027 +ZWFycw== 1028 +IHN1cHA= 1029 +IHR1cm4= 1030 +IGZlbHQ= 1031 +IHdvbWFu 1032 +IHN0YXJ0ZWQ= 1033 +b3VibGU= 1034 +dXJh 1035 +aXNoaW5n 1036 +Ogo= 1037 +bGVjdHJvbg== 1038 +bGVjdHJvbmlj 1039 +b29r 1040 +IGNvcHk= 1041 +IGZ1bGw= 1042 +Y29uZA== 1043 +bWF0 1044 +IG1pZGRsZQ== 1045 +IGxvb2s= 1046 +IGNvbW0= 1047 +d2VyZWQ= 1048 +IGJlY2FtZQ== 1049 +IGZlbGxvd3M= 1050 +d291bGQ= 1051 +IGdvdA== 1052 +IGds 1053 +IGd1 1054 +IGtlZXA= 1055 +IGdl 1056 +IE1hZG9ubmE= 1057 +aXRlcg== 1058 +aXNoZWQ= 1059 +IHVuZGVyc3Q= 1060 +IHN0cmE= 1061 +c2lk 1062 +IGNvdW50cnk= 1063 +b3BsZQ== 1064 +IHByb3Y= 1065 +IHB1dA== 1066 +bm8= 1067 +J2xs 1068 +IHNsZQ== 1069 +cmFuZ2U= 1070 +IFNoZQ== 1071 +cG9z 1072 +IG1pbmQ= 1073 +IHBhc3M= 1074 +IHRocm91Z2g= 1075 +IHF1aXRl 1076 +IGluZA== 1077 +IGJvYXJkaW5n 1078 +dGVhY2hlcg== 1079 +cGxl 1080 +UG9yY3VwaW5l 1081 +IHBsZQ== 1082 +IGdlaXNoYQ== 1083 +ICAgIA== 1084 +b3N0 1085 +ZW5zZQ== 1086 +Tm8= 1087 +aWJsZQ== 1088 +IHJlYWQ= 1089 +IHJlZA== 1090 +ZW50aW9u 1091 +ZW5lZA== 1092 +ISIK 1093 +IHJlZg== 1094 +IGFk 1095 +IGZs 1096 +IHN0YXk= 1097 +dXA= 1098 +IHJvdW5k 1099 +IGNsZQ== 1100 +IG9wZW4= 1101 +IG9i 1102 +dGVuZA== 1103 +IGZpbmQ= 1104 +IHBlcg== 1105 +IGNhbGxlZA== 1106 +IHN1cg== 1107 +cmV3 1108 +IHBhcGVy 1109 +IEJhZGdlcg== 1110 +IG1lZXQ= 1111 +aXNz 1112 +IlRoYXQ= 1113 +ZXJtcw== 1114 +VEU= 1115 +aXR0ZW4= 1116 +YWJseQ== 1117 +bmVzcw== 1118 +IGNhbm5vdA== 1119 +IHNpbXA= 1120 +Y29u 1121 +IHJlYXNvbg== 1122 +eW91 1123 +IGhvbWU= 1124 +Ynk= 1125 +IGZpZ2h0 1126 +aXR0bGU= 1127 +IHRoaW5ncw== 1128 +IGVhcw== 1129 +IGltcA== 1130 +cmVzc2Vk 1131 +IG1lYW4= 1132 +IGFwcGVhcmVk 1133 +IG5hdA== 1134 +IGhlbA== 1135 +cmV0 1136 +YWtlbg== 1137 +IHN0cmFpZ2h0 1138 +IGFmZmFpcg== 1139 +aXRpbmc= 1140 +IGVk 1141 +IHNpbmNl 1142 +bG9n 1143 +IHBheQ== 1144 +IGZyb250 1145 +bXk= 1146 +IHZvaWNl 1147 +cmVhZHk= 1148 +IGZvb2w= 1149 +b3VuZGF0aW9u 1150 +IGVsZWN0cm9uaWM= 1151 +IHRlcm1z 1152 +IG1hcg== 1153 +YXBhbg== 1154 +YW55 1155 +IHJlc3A= 1156 +IGVuZA== 1157 +YXBw 1158 +d2hhdA== 1159 +c3Ry 1160 +cmFw 1161 +aWFs 1162 +aWN1bA== 1163 +IGFjYw== 1164 +b3Ro 1165 +IHNlY29uZA== 1166 +IGZsbw== 1167 +IHNpeA== 1168 +IGZlZXQ= 1169 +YnI= 1170 +aWV0 1171 +IGxpdHRsZQ== 1172 +bGVz 1173 +IG1vbmV5 1174 +IGRlY2w= 1175 +IGV5 1176 +IGNvbXA= 1177 +YXJpbmc= 1178 +IGFncmU= 1179 +d2hlcmU= 1180 +IFN0 1181 +IHN0cmU= 1182 +ZXg= 1183 +cmFjdA== 1184 +IGludA== 1185 +IGRpcmU= 1186 +IGJlY29tZQ== 1187 +IGhvbg== 1188 +IGNvbnNpZA== 1189 +ZXJ0YWlu 1190 +bm93 1191 +IHNs 1192 +aXRvcg== 1193 +Z2c= 1194 +IGp1bQ== 1195 +IGJ1 1196 +IHRoaW5n 1197 +IGFuc3dlcmVk 1198 +b2Vz 1199 +eWE= 1200 +IFRoYXQ= 1201 +aXpl 1202 +b25k 1203 +YWN0 1204 +IGVmZg== 1205 +IGJhbmc= 1206 +YWJvdXQ= 1207 +IGJlZA== 1208 +b3Jyb3c= 1209 +dW5n 1210 +IFRv 1211 +IGtlcHQ= 1212 +IHdhbA== 1213 +IGJhdGg= 1214 +IGRyYQ== 1215 +IkE= 1216 +cmluZ3M= 1217 +aG9wcA== 1218 +IHJlc2lnbg== 1219 +IGRpbg== 1220 +IGxhZHk= 1221 +LkU= 1222 +IHVzZQ== 1223 +bGlzaA== 1224 +b3Jz 1225 +IHdyaXR0ZW4= 1226 +ZW5l 1227 +aXY= 1228 +IGRpZg== 1229 +IHN0ZQ== 1230 +IHN0b3J5 1231 +Y29t 1232 +cmVz 1233 +ZW50bHk= 1234 +IGZhY3Q= 1235 +aGVz 1236 +d2F5cw== 1237 +IHdoeQ== 1238 +IHRob3VnaA== 1239 +IHN0cg== 1240 +b25kZXI= 1241 +aGVhZA== 1242 +IGNvdXI= 1243 +IG1vbg== 1244 +IHNr 1245 +IGJlbGll 1246 +IGxldA== 1247 +ZmVy 1248 +IHJlcXU= 1249 +IGxpbmU= 1250 +cm9vbQ== 1251 +LWRheQ== 1252 +IGRvbmU= 1253 +IGRvZXM= 1254 
+IE9uZQ== 1255 +IGRhbmdv 1256 +YXNzaG9wcA== 1257 +IGNvbnNpZGVy 1258 +IGRpbm5lcg== 1259 +IEZvdW5kYXRpb24= 1260 +Kio= 1261 +ZW1wdA== 1262 +ZXNl 1263 +IHdvcmQ= 1264 +cmVzdA== 1265 +IGVub3VnaA== 1266 +IGdyZWF0 1267 +IG5hbWU= 1268 +IHB1Yg== 1269 +IG1hbm5lcg== 1270 +d2Vy 1271 +aWN0 1272 +aW5lc3M= 1273 +IGhpbXNlbGY= 1274 +IHBlb3BsZQ== 1275 +ZXc= 1276 +IGNvcg== 1277 +ZXN0aW9u 1278 +IGJpZw== 1279 +ZWU= 1280 +IHJp 1281 +aWRlcw== 1282 +IGJyb3RoZXI= 1283 +IGhlYXJ0 1284 +ZWN0ZWQ= 1285 +ZWVk 1286 +IG90aGVycw== 1287 +c29s 1288 +dGVk 1289 +IGV5ZXM= 1290 +IHRyb3VibGU= 1291 +IHRlYWNo 1292 +IGJvYXQ= 1293 +IGZvdXI= 1294 +IGFscmVhZHk= 1295 +cm9t 1296 +Z2hlZA== 1297 +IHNxdQ== 1298 +IHBvbA== 1299 +Y2Vz 1300 +IEhvdHQ= 1301 +IGxlYXZl 1302 +IGRpc3RyaWJ1dA== 1303 +YXN0ZXI= 1304 +Q0g= 1305 +dWM= 1306 +IGlt 1307 +IGhvd2V2ZXI= 1308 +dGhlcmU= 1309 +YXBhbmVzZQ== 1310 +IGxhc3Q= 1311 +IGNy 1312 +aWxpdHk= 1313 +IHNpbXBsZQ== 1314 +IGxpZmU= 1315 +LWM= 1316 +IHJlZ2FyZA== 1317 +IGZpbg== 1318 +dWFs 1319 +IG1lYW5z 1320 +IHN0YW5k 1321 +YXRjaA== 1322 +IHNob3J0 1323 +bmVk 1324 +IHNlZW4= 1325 +IGhhcHA= 1326 +LWs= 1327 +IGFnYWluc3Q= 1328 +aGlt 1329 +YW1lZA== 1330 +IHN0b29k 1331 +IGdyYQ== 1332 +IG1vdGhlcg== 1333 +IGZpc2g= 1334 +IHdhdGVy 1335 +YWls 1336 +Y2Vp 1337 +IHJhdGhlcg== 1338 +IGlucw== 1339 +IGZlZWw= 1340 +IGFsc28= 1341 +IG9yZA== 1342 +IGNvbWluZw== 1343 +aWNz 1344 +IGVpdGhlcg== 1345 +bmNl 1346 +ICc= 1347 +IGtpZA== 1348 +IGxhdWdoZWQ= 1349 +bGlrZQ== 1350 +IEFy 1351 +Z3I= 1352 +IEhvdHRh 1353 +IHRhbGs= 1354 +Z2V0aGVy 1355 +IFNpcg== 1356 +IHB1bg== 1357 +UHJv 1358 +YXRz 1359 +bW9zdA== 1360 +IHJlcA== 1361 +IGdp 1362 +aXNm 1363 +YmFibHk= 1364 +YWtlcw== 1365 +IE5vdA== 1366 +bnk= 1367 +IGFwcGVhcg== 1368 +bXA= 1369 +Y2hh 1370 +IGFjdA== 1371 +YmVk 1372 +aWVm 1373 +dWZm 1374 +IGFwbw== 1375 +IG1ldA== 1376 +IHJldHVybmVk 1377 +IHNvdW5k 1378 +dXNpbmVzcw== 1379 +IGxhdWdo 1380 +IGNsZWFy 1381 +IG5lZWQ= 1382 +ZmVzcw== 1383 +ZXN0ZWQ= 1384 +IGludg== 1385 +IGFjY2VwdA== 1386 +dW5kZXI= 1387 +Owo= 1388 +IHN1cnBy 1389 +ZGU= 1390 +IHRyYWlu 1391 +IGhvdGVs 1392 +IHNsZWVw 1393 +IGRy 1394 +IGhvbGQ= 1395 +bG9jaw== 1396 +cHVyYQ== 1397 +IHNwcmluZ3M= 1398 +IC4uLi4uLg== 1399 +IGFncmVlbWVudA== 1400 +IERhcg== 1401 +IHJlc3Q= 1402 +Y2x1ZA== 1403 +YXRvcg== 1404 +YXY= 1405 +IG9yaWc= 1406 +IG9yaWdpbg== 1407 +IGVs 1408 +IG5vcg== 1409 +IHByZXM= 1410 +IHVuZGVyc3RhbmQ= 1411 +IHRha2Vu 1412 +IGxpZ2h0 1413 +ZW5lcg== 1414 +c29tZQ== 1415 +IGJyb3VnaHQ= 1416 +cmFwaA== 1417 +IG1vc3Q= 1418 +b2tl 1419 +LXc= 1420 +IHVudA== 1421 +IGZhdGhlcg== 1422 +IHVzZWQ= 1423 +IGVhdA== 1424 +IHllYXJz 1425 +IFdoaWxl 1426 +IGNoYW4= 1427 +IHN1ZGQ= 1428 +IHN1ZGRlbg== 1429 +IGFwb2xvZw== 1430 +IHNldHQ= 1431 +IHRoaW4= 1432 +IE15 1433 +IHRlbg== 1434 +aW1lcw== 1435 +Zm9y 1436 +b3Vk 1437 +V2hlbg== 1438 +IGRldA== 1439 +IGxpdmU= 1440 +IG9j 1441 +IGZpdmU= 1442 +IGNvbnQ= 1443 +IGhlbHA= 1444 +IHdh 1445 +IHBhc3NlZA== 1446 +IHJ1bg== 1447 +IG1ha2luZw== 1448 +IHN0cmFuZ2U= 1449 +IHRha2luZw== 1450 +IGVhY2g= 1451 +IllvdQ== 1452 +IGFub3RoZXI= 1453 +IlNheQ== 1454 +IlRoZQ== 1455 +YXRlcw== 1456 +IHBsZWFz 1457 +YXNzaG9wcGVycw== 1458 +IG1vbQ== 1459 +IG1vbWVudA== 1460 +ZW50bGU= 1461 +bmdsaXNo 1462 +Q0hB 1463 +IG9yaWdpbmFs 1464 +aW9ucw== 1465 +dXJpbmc= 1466 +IHB1YmxpYw== 1467 +dWN0 1468 +dWNr 1469 +IHF1ZXN0aW9u 1470 +YWk= 1471 +Y3k= 1472 +ZWs= 1473 +IGZsb29y 1474 +IGNhcg== 1475 +b3VzZQ== 1476 +IHNpZGU= 1477 +LXlh 1478 +IGNlcnRhaW4= 1479 +aHlz 1480 +LWQ= 1481 +aWdo 1482 +YWdpbg== 1483 +d2VldA== 1484 +IHBvb3I= 1485 +IGRlY2lk 1486 +dWFsbHk= 1487 +IGJ1c2luZXNz 1488 +cHJv 1489 +cGxhaW4= 1490 +IHN0b3A= 1491 +IQo= 1492 +IEhvdw== 1493 
+IldoYXQ= 1494 +Y2Fu 1495 +IFVu 1496 +cHM= 1497 +dW5k 1498 +LW5pZ2h0 1499 +IG1lZXRpbmc= 1500 +ZWRv 1501 +IHJhaXNl 1502 +R3V0ZW5iZXJn 1503 +IERhcmxpbmc= 1504 +dW1l 1505 +IEVuZ2xpc2g= 1506 +VEVS 1507 +YWRpbmc= 1508 +IHRyYW5zbA== 1509 +IGFibGU= 1510 +c3NpYmxl 1511 +IHNhdGlzZg== 1512 +IHdhbnRlZA== 1513 +IHN1Yg== 1514 +IGNhc2U= 1515 +aWZpYw== 1516 +aXRlcmFyeQ== 1517 +IG1haWQ= 1518 +IGluYw== 1519 +IHBvcw== 1520 +IHBvc2l0aW9u 1521 +IHBhdA== 1522 +dXJlZA== 1523 +b3JyeQ== 1524 +IGFjY291bnQ= 1525 +IGJvdGg= 1526 +IGZyaWU= 1527 +IGZyaWVuZA== 1528 +dGhpcw== 1529 +IGFsd2F5cw== 1530 +IHBhcnRpY3Vs 1531 +V2hhdA== 1532 +IHNtYWxs 1533 +ZW50eQ== 1534 +dXNoZWQ= 1535 +IG1pcw== 1536 +dWxseQ== 1537 +IHJlY2Vp 1538 +WW91 1539 +IHlldA== 1540 +IGdhdmU= 1541 +QnV0 1542 +aGFk 1543 +IGFuc3dlcg== 1544 +IGFicw== 1545 +aWxl 1546 +Y2tldA== 1547 +IG5vb2Q= 1548 +IGNvdXJzZQ== 1549 +IGZvcm0= 1550 +IGV2ZXJ5dGhpbmc= 1551 +ZWN0aW9u 1552 +SWY= 1553 +cGFydA== 1554 +IHNpbmc= 1555 +IHNpdA== 1556 +IHB1cg== 1557 +aXA= 1558 +IGZpc2hpbmc= 1559 +IGVo 1560 +IHBhcg== 1561 +IHRvZ2V0aGVy 1562 +SGU= 1563 +IHdoZQ== 1564 +IHdoZXRoZXI= 1565 +IGJyYQ== 1566 +Illlcw== 1567 +IHB1bmlzaA== 1568 +U2hpcnQ= 1569 +IFllZG8= 1570 +IGZhcmV3 1571 +IGZhcmV3ZWxs 1572 +IGRhbmNl 1573 +IGxlc3M= 1574 +dXJhbA== 1575 +IGRlZg== 1576 +IGF0dGVtcHQ= 1577 +d2Vlbg== 1578 +IHNpZ24= 1579 +IHN5 1580 +ZmVyZW50 1581 +IGxlYXN0 1582 +c2Vy 1583 +b2I= 1584 +bmRpbmc= 1585 +IHNvcnJ5 1586 +IGp1bXBlZA== 1587 +IGphbg== 1588 +IGphbml0b3I= 1589 +aXplZA== 1590 +IHRvd2FyZA== 1591 +IG1vcg== 1592 +YXZpbmc= 1593 +IGJpdA== 1594 +IlRoaXM= 1595 +IHJlbWFyaw== 1596 +IGZ1dA== 1597 +IHdvbmRlcg== 1598 +IGZ1bg== 1599 +VGhlbg== 1600 +IGRlYw== 1601 +IHdob20= 1602 +IGRpZG4= 1603 +IHJlYw== 1604 +YmVj 1605 +Iklm 1606 +IGtuZXc= 1607 +YWZ0ZXI= 1608 +IHRodXM= 1609 +IGlzbg== 1610 +IHNpZ2h0 1611 +bWVk 1612 +W0Y= 1613 +dXNz 1614 +Y2lkZW50 1615 +dGhlbQ== 1616 +IGZpZg== 1617 +IGRyYXc= 1618 +IGhlYXI= 1619 +IHdyaXRpbmc= 1620 +IGdldHRpbmc= 1621 +c2g= 1622 +ZmVyZW5jZQ== 1623 +IHJhaXNlZA== 1624 +dGhleQ== 1625 +YXg= 1626 +IGZpbmU= 1627 +c2Vs 1628 +IE5vYmU= 1629 +IE5vYmVvaw== 1630 +IE5vYmVva2E= 1631 +b3JtYWw= 1632 +IGVC 1633 +aWNlbnNl 1634 +MDA= 1635 +IGJlc3Q= 1636 +d29y 1637 +Zmlj 1638 +dGVyZXN0 1639 +IHJlbWFy 1640 +Ymw= 1641 +YXJ0ZWQ= 1642 +IGRhcms= 1643 +IHlvdW5n 1644 +dXNo 1645 +IGJldA== 1646 +b3V0aA== 1647 +aG91c2U= 1648 +YXVnaHQ= 1649 +IHBoeXM= 1650 +IHN0cm9uZw== 1651 +IGZ1cg== 1652 +IHJvbGw= 1653 +Y292ZQ== 1654 +Y2hpZWY= 1655 +YXdh 1656 +IGZvbGxvd2Vk 1657 +IGZvbmQ= 1658 +IGZ1dHVyZQ== 1659 +aXJk 1660 +ZnVsbHk= 1661 +IGVmZm9ydA== 1662 +QWZ0ZXI= 1663 +b3dhcmQ= 1664 +IHJlYWxseQ== 1665 +IGFtb25n 1666 +IGFyb3VuZA== 1667 +IGNvbXBs 1668 +IGdheg== 1669 +IGJvdw== 1670 +YXRlcg== 1671 +IGluc2lzdA== 1672 +IHR1cm5lZA== 1673 +aGVs 1674 +cmVt 1675 +IGhvdXJz 1676 +IGRlY2lkZWQ= 1677 +eXM= 1678 +IG1vbnRo 1679 +LWE= 1680 +IGFkdg== 1681 +IGJlbGlldmU= 1682 +IHRlYWNoaW5n 1683 +IGVhc3k= 1684 +IGRpcmVjdGlvbg== 1685 +b29rZWQ= 1686 +IHdhcg== 1687 +IHVubGVzcw== 1688 +aGF2ZQ== 1689 +IHNxdWFyZQ== 1690 +dmls 1691 +IHF1aWV0 1692 +IGh1bmc= 1693 +IGdvZXM= 1694 +IHBhaWQ= 1695 +IHNoYWxs 1696 +Ik5v 1697 +IHB1bmlzaG1lbnQ= 1698 +cG9zZQ== 1699 +IHN3ZWV0 1700 +J3Zl 1701 +IldlbGw= 1702 +IGdlbnRsZQ== 1703 +IG5vcm1hbA== 1704 +YWdyYXBo 1705 +Y2hpdmU= 1706 +Y2hhbg== 1707 +IGluY2x1ZA== 1708 +d3c= 1709 +b3Jn 1710 +dGVt 1711 +QVI= 1712 +IFRI 1713 +IGVxdQ== 1714 +IHRvbmU= 1715 +IHBvc3NpYmxl 1716 +IGJlY29t 1717 +IEphcGFuZXNl 1718 +dmVycw== 1719 +IGZvbGxvd2luZw== 1720 +IHBhaW4= 1721 +IHdob2xl 1722 +d3I= 1723 +IHNlcmlvdXM= 1724 +IG5hcg== 1725 +IHRpcmVk 1726 +SW4= 1727 
+IHBsYXk= 1728 +IHByb20= 1729 +IGdhbWU= 1730 +IFNvbWU= 1731 +IGhhcHBlbmVk 1732 +IGN1dA== 1733 +IHR3ZW50eQ== 1734 +IGRvb3I= 1735 +IG1vcm5pbmc= 1736 +aGluZA== 1737 +IGJyZQ== 1738 +IGluc2lkZQ== 1739 +b3Zl 1740 +YWx0aA== 1741 +dWs= 1742 +YXJnZQ== 1743 +YW1i 1744 +IGRhbQ== 1745 +IHdvcnJ5 1746 +YXRpdmU= 1747 +IGV4cGVjdGVk 1748 +IGZhbQ== 1749 +IHByYQ== 1750 +IHBvY2tldA== 1751 +b29rcw== 1752 +Y2hlZA== 1753 +IHNpbA== 1754 +b2w= 1755 +IGZhdg== 1756 +IGVsc2U= 1757 +IGhpZ2g= 1758 +IHJlYWw= 1759 +IGFsb25n 1760 +IG1lZA== 1761 +aGlr 1762 +aGVtYXQ= 1763 +aGVtYXRpY3M= 1764 +IGxpc3Q= 1765 +IHNpY2s= 1766 +b2ludA== 1767 +W0Zvb3Q= 1768 +W0Zvb3Rub3Q= 1769 +W0Zvb3Rub3Rl 1770 +Ll0K 1771 +bmlnaHQ= 1772 +c2Vz 1773 +aW9y 1774 +IHNheXM= 1775 +IG1vdXRo 1776 +aG93 1777 +bWluZw== 1778 +IGNsbw== 1779 +IGN1cg== 1780 +Z2luZw== 1781 +IHN1ZGRlbmx5 1782 +LWFo 1783 +YW1w 1784 +IGJsYWNr 1785 +cm9zcw== 1786 +IGZhYw== 1787 +c2VsdmVz 1788 +aWV3 1789 +aXNzaW9u 1790 +IGNvcHlyaWdodA== 1791 +IHBhcmFncmFwaA== 1792 +IEFyY2hpdmU= 1793 +IGRvbmF0aW9ucw== 1794 +UHJvamVjdA== 1795 +IGNvc3Q= 1796 +Lm9yZw== 1797 +TEk= 1798 +dWNlZA== 1799 +IHN1Yw== 1800 +eWxl 1801 +IGZvcmNl 1802 +am95 1803 +b3VjaA== 1804 +dHI= 1805 +SXQ= 1806 +IHRyYWQ= 1807 +IHByZXNlbnQ= 1808 +IGV4dA== 1809 +YXNlZA== 1810 +cmVkaXQ= 1811 +IGZhdWx0 1812 +aWI= 1813 +LW0= 1814 +dXJk 1815 +IHRyaWVk 1816 +dGltZQ== 1817 +IHByZXQ= 1818 +IHNwZWU= 1819 +b3dlcg== 1820 +IHdvcmRz 1821 +Q0hBUA== 1822 +Q0hBUFRFUg== 1823 +c2Nob29s 1824 +IGFzaw== 1825 +IGRvaW5n 1826 +YXRlbHk= 1827 +IHVudGls 1828 +Ym91dA== 1829 +IHRyZWU= 1830 +Y2FsbA== 1831 +YW1hc2g= 1832 +YW1hc2hpcg== 1833 +YW1hc2hpcm8= 1834 +c3Rl 1835 +IGJlaGluZA== 1836 +b2xk 1837 +IHdhbGw= 1838 +aXRvcnk= 1839 +IHJvbGxlZA== 1840 +IG1vdmU= 1841 +IGFwb2xvZ2l6ZQ== 1842 +IGxhcmdl 1843 +YW1ib28= 1844 +c3U= 1845 +IHNldHRsZWQ= 1846 +Ikhl 1847 +d28= 1848 +IHRoaW5raW5n 1849 +dXNlZA== 1850 +aWZpZWQ= 1851 +IGFsbW9zdA== 1852 +IHRyZQ== 1853 +IHRyZWF0 1854 +IG5vb2RsZQ== 1855 +IG5vdGU= 1856 +IEFsbA== 1857 +IGJlYXQ= 1858 +IG9iamVjdA== 1859 +IHNlZW1z 1860 +IGlkZQ== 1861 +WWVz 1862 +b3dz 1863 +IHJlbWFpbg== 1864 +IGJlZ2lu 1865 +dWdodA== 1866 +bWVudHM= 1867 +IGFsb25l 1868 +c3BlY3Q= 1869 +IG1hdGhlbWF0aWNz 1870 +IHJvdWdo 1871 +IG91dHNpZGU= 1872 +IGNvbWVz 1873 +YmFjaw== 1874 +IHdpbmQ= 1875 +c2Vk 1876 +IHdvdWxkbg== 1877 +ZWVy 1878 +aW51dA== 1879 +ZnJvbQ== 1880 +IHJlcGw= 1881 +IG5hcnJvdw== 1882 +IGluY2lkZW50 1883 +IGFpcg== 1884 +IHNlYQ== 1885 +dHM= 1886 +IHN1cnByaXNlZA== 1887 +IHRlYQ== 1888 +UmVk 1889 +IHRhbGtpbmc= 1890 +IGJvc3M= 1891 +cXVl 1892 +IHBpY3Q= 1893 +aXJ0eQ== 1894 +IGNl 1895 +IGxpbQ== 1896 +IFdoeQ== 1897 +IHBvaW50 1898 +IGxhdw== 1899 +Y2lhdGVk 1900 +IG1vb24= 1901 +aXJjdQ== 1902 +Z290 1903 +IElz 1904 +IGhhbmRz 1905 +IGhvbm9y 1906 +YXV0 1907 +cmdl 1908 +IHN0YXRl 1909 +IExpdGVyYXJ5 1910 +LkY= 1911 +VGhpcw== 1912 +bGluZQ== 1913 +Lmc= 1914 +Lmd1dGVuYmVyZw== 1915 +IE9G 1916 +RU4= 1917 +cmFjdGVy 1918 +IGJlbmU= 1919 +IEV2ZW4= 1920 +b3Vi 1921 +IG1ha2Vz 1922 +IGludGVyZXN0 1923 +b3Bl 1924 +bXM= 1925 +IHJlc3BvbnM= 1926 +IGZvcmU= 1927 +IHNvbWV3aGF0 1928 +IGhvbmVzdA== 1929 +b2Nr 1930 +aXJpdA== 1931 +IGhlbGQ= 1932 +IGFkZGVk 1933 +ZnU= 1934 +YWRlZA== 1935 +YWxz 1936 +YXR0 1937 +dGVybg== 1938 +IHBlcnNvbmFs 1939 +IGFzcw== 1940 +IFdpdGg= 1941 +dGlj 1942 +VG9reW8= 1943 +IHNob3V0 1944 +IHByZXR0eQ== 1945 +dW1i 1946 +IGVhcmx5 1947 +b3BwZWQ= 1948 +IGZ1cnRoZXI= 1949 +IGZyZQ== 1950 +ZXNpZGVz 1951 +IGJhbWJvbw== 1952 +IGly 1953 +bW9yZQ== 1954 +IGxpdmluZw== 1955 +IHJlY2VpdmVk 1956 +IGxpdmVk 1957 +IG1lYW50 1958 +IGNvd2FyZA== 1959 +cG9zaXRpb24= 1960 +IGxvYw== 1961 +aWxlZA== 1962 
+IHRlbmRlcg== 1963 +IGNo 1964 +IEFmdGVy 1965 +Y2Vy 1966 +IGZhdm9y 1967 +d2hv 1968 +IGxpa2Vk 1969 +cmFuY2U= 1970 +IHByaQ== 1971 +a2lzaGE= 1972 +IHN0dWR5 1973 +IG9yZGVy 1974 +IGFmdGVyd2FyZA== 1975 +IGdyZWF0bHk= 1976 +IHVuYWJsZQ== 1977 +Z28= 1978 +IHdhaXQ= 1979 +ZXBpbmc= 1980 +aWRpbmc= 1981 +IGZvcnR5 1982 +IHNreQ== 1983 +IG9mZmljZQ== 1984 +d2lsbA== 1985 +IkQ= 1986 +d2Vs 1987 +IHN0YXRpb24= 1988 +Ym8= 1989 +aG90 1990 +c3VjaA== 1991 +IGxvdWQ= 1992 +IGF3 1993 +bGFuZA== 1994 +Pwo= 1995 +IHJlc3BlY3Q= 1996 +YW5jZXM= 1997 +aWVudA== 1998 +IG91Z2h0 1999 diff --git a/tests/test_utils.py b/tests/test_utils.py index c575134e8f..8c2536a15d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -18,7 +18,7 @@ import torch from torch import nn -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer skip_if_cuda_not_available = unittest.skipIf( not torch.cuda.is_available(), "CUDA is not available" @@ -39,8 +39,8 @@ def torch_version_ge(version: str) -> bool: return version in torch.__version__ or torch.__version__ >= version -# Inherit from tokenizer class to reuse its tokenize_messages method -class DummyTokenizer(Tokenizer): +# Inherit from SentencePieceTokenizer class to reuse its tokenize_messages method +class DummyTokenizer(SentencePieceTokenizer): def __init__(self): self.encodes_whitespace = False diff --git a/tests/torchtune/datasets/test_alpaca_dataset.py b/tests/torchtune/datasets/test_alpaca_dataset.py index 9b9cb56b07..2a05cefd06 100644 --- a/tests/torchtune/datasets/test_alpaca_dataset.py +++ b/tests/torchtune/datasets/test_alpaca_dataset.py @@ -10,9 +10,8 @@ from tests.test_utils import get_assets_path from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX - from torchtune.datasets import alpaca_cleaned_dataset, alpaca_dataset -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer class TestAlpacaDataset: @@ -20,7 +19,7 @@ class TestAlpacaDataset: def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(get_assets_path() / "m.model")) + return SentencePieceTokenizer(str(get_assets_path() / "m.model")) @patch("torchtune.datasets._instruct.load_dataset") def test_label_no_masking(self, load_dataset, tokenizer): diff --git a/tests/torchtune/datasets/test_grammar_dataset.py b/tests/torchtune/datasets/test_grammar_dataset.py index 5fb41d39eb..20c209f004 100644 --- a/tests/torchtune/datasets/test_grammar_dataset.py +++ b/tests/torchtune/datasets/test_grammar_dataset.py @@ -12,7 +12,7 @@ from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX from torchtune.datasets import grammar_dataset -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer class TestGrammarDataset: @@ -20,7 +20,7 @@ class TestGrammarDataset: def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(get_assets_path() / "m.model")) + return SentencePieceTokenizer(str(get_assets_path() / "m.model")) @patch("torchtune.datasets._instruct.load_dataset") def test_label_no_masking(self, load_dataset, tokenizer): diff --git a/tests/torchtune/datasets/test_samsum_dataset.py b/tests/torchtune/datasets/test_samsum_dataset.py index 972b8bbb25..6ec6a52679 100644 
--- a/tests/torchtune/datasets/test_samsum_dataset.py +++ b/tests/torchtune/datasets/test_samsum_dataset.py @@ -12,7 +12,7 @@ from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX from torchtune.datasets import samsum_dataset -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer class TestSamsumDataset: @@ -20,7 +20,7 @@ class TestSamsumDataset: def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(get_assets_path() / "m.model")) + return SentencePieceTokenizer(str(get_assets_path() / "m.model")) @patch("torchtune.datasets._instruct.load_dataset") def test_label_no_masking(self, load_dataset, tokenizer): diff --git a/tests/torchtune/datasets/test_slimorca_dataset.py b/tests/torchtune/datasets/test_slimorca_dataset.py index 725b60d49d..03a8396271 100644 --- a/tests/torchtune/datasets/test_slimorca_dataset.py +++ b/tests/torchtune/datasets/test_slimorca_dataset.py @@ -10,7 +10,7 @@ from tests.test_utils import get_assets_path from torchtune.datasets import slimorca_dataset -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer class TestSlimOrcaDataset: @@ -18,7 +18,7 @@ class TestSlimOrcaDataset: def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(get_assets_path() / "m.model")) + return SentencePieceTokenizer(str(get_assets_path() / "m.model")) @patch("torchtune.datasets._chat.load_dataset") def test_value_error(self, load_dataset, tokenizer): diff --git a/tests/torchtune/models/test_llama3.py b/tests/torchtune/models/test_llama3.py new file mode 100644 index 0000000000..190eaf413e --- /dev/null +++ b/tests/torchtune/models/test_llama3.py @@ -0,0 +1,46 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
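# Editor's sketch (not part of this diff): the dataset-test fixtures above rebuild their
# tokenizer from tests/assets/m.model, a tiny SentencePiece model. Assuming the
# sentencepiece package and some small plain-text corpus (the input path below is
# hypothetical), a comparable asset could be regenerated roughly like this:
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    "--input=/path/to/small_corpus.txt --model_prefix=m --vocab_size=2000"
)
sp = spm.SentencePieceProcessor()
sp.load("m.model")
print(sp.encode("Hello world!"))  # sanity check: a short list of token ids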
+ +import pytest +import torch +from tests.test_utils import fixed_init_model +from torchtune.models.llama3 import llama3 +from torchtune.utils.seed import set_seed + +EMBED_DIM = 128 +NUM_LAYERS = 4 +NUM_HEADS = 16 +NUM_KV_HEADS = 8 +VOCAB_SIZE = 32000 +MAX_SEQ_LEN = 2048 +BSZ = 2 +SEQ_LEN = 100 + + +@pytest.fixture(autouse=True) +def random(): + set_seed(16) + + +class TestLlama3: + @pytest.fixture + def inputs(self): + return torch.randint(0, VOCAB_SIZE, (BSZ, SEQ_LEN)) + + def test_forward(self, inputs): + model = llama3( + vocab_size=VOCAB_SIZE, + num_layers=NUM_LAYERS, + num_heads=NUM_HEADS, + num_kv_heads=NUM_KV_HEADS, + embed_dim=EMBED_DIM, + max_seq_len=MAX_SEQ_LEN, + ) + fixed_init_model(model, min_val=-0.25, max_val=0.5) + actual = model(inputs) + expected = torch.tensor(3.9763) + assert actual.shape == (BSZ, SEQ_LEN, VOCAB_SIZE) + torch.testing.assert_close(actual.mean(), expected, atol=1e-4, rtol=1e-4) diff --git a/tests/torchtune/modules/test_tokenizer.py b/tests/torchtune/modules/tokenizers/test_sentencepiece.py similarity index 96% rename from tests/torchtune/modules/test_tokenizer.py rename to tests/torchtune/modules/tokenizers/test_sentencepiece.py index 5ac4255e01..bc8f61c2a1 100644 --- a/tests/torchtune/modules/test_tokenizer.py +++ b/tests/torchtune/modules/tokenizers/test_sentencepiece.py @@ -8,17 +8,17 @@ import pytest from torchtune.data._types import Message -from torchtune.modules.tokenizer import Tokenizer +from torchtune.modules.tokenizers import SentencePieceTokenizer -ASSETS = Path(__file__).parent.parent.parent / "assets" +ASSETS = Path(__file__).parent.parent.parent.parent / "assets" -class TestTokenizer: +class TestSentencePieceTokenizer: @pytest.fixture def tokenizer(self): # m.model is a pretrained Sentencepiece model using the following command: # spm.SentencePieceTrainer.train('--input= --model_prefix=m --vocab_size=2000') - return Tokenizer.from_file(str(ASSETS / "m.model")) + return SentencePieceTokenizer(str(ASSETS / "m.model")) def test_encode(self, tokenizer): assert tokenizer.encode("Hello world!") == [ diff --git a/tests/torchtune/modules/tokenizers/test_tiktoken.py b/tests/torchtune/modules/tokenizers/test_tiktoken.py new file mode 100644 index 0000000000..8796213e76 --- /dev/null +++ b/tests/torchtune/modules/tokenizers/test_tiktoken.py @@ -0,0 +1,192 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from pathlib import Path + +import pytest +from torchtune.data._types import Message +from torchtune.modules.tokenizers import TikTokenTokenizer + +ASSETS = Path(__file__).parent.parent.parent.parent / "assets" + + +class TestTikTokenTokenizer: + @pytest.fixture + def tokenizer(self): + # Pretrained tiktoken model generated via the script in + # https://gist.github.com/ebsmothers/54b133dd87db6679b14318545aaa2de4 + return TikTokenTokenizer(str(ASSETS / "tiktoken_small.model")) + + @pytest.fixture + def texts(self): + return [ + "I can see the sun. 
But even if I cannot see the sun, I know that it exists.", + "And to know that the sun is there - that is living.", + ] + + @pytest.fixture + def messages(self, texts): + return [ + Message(role="user", content=texts[0], masked=True), + Message(role="assistant", content=texts[1], masked=False), + ] + + @pytest.fixture + def token_ids(self): + return [ + 73, + 503, + 654, + 262, + 376, + 110, + 46, + 690, + 720, + 428, + 270, + 1119, + 654, + 262, + 376, + 110, + 44, + 270, + 686, + 334, + 312, + 522, + 511, + 115, + 46, + ] + + @pytest.fixture + def tokenized_messages(self, token_ids): + return ( + [2000, 2006, 477, 273, 2007, 10, 10] + + token_ids + + [ + 2009, + 2006, + 520, + 511, + 446, + 2007, + 10, + 10, + 65, + 269, + 277, + 686, + 334, + 262, + 376, + 110, + 351, + 443, + 32, + 45, + 334, + 351, + 1955, + 46, + 2009, + 2001, + ], + [ + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + True, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + False, + True, + ], + ) + + def test_encode(self, tokenizer, texts, token_ids): + assert tokenizer.encode(texts[0], add_bos=True, add_eos=True) == [ + tokenizer.bos_id + ] + token_ids + [tokenizer.eos_id] + assert tokenizer.encode(texts[0], add_bos=False, add_eos=False) == token_ids + + def test_decode(self, tokenizer, texts, token_ids): + assert tokenizer.decode(token_ids) == texts[0] + + def test_encode_and_decode(self, tokenizer, texts): + token_ids = tokenizer.encode(texts[0], add_bos=True, add_eos=True) + decoded_text = tokenizer.decode(token_ids) + assert texts[0] == decoded_text + + def test_token_ids(self, tokenizer): + assert tokenizer.bos_id == 2000 + assert tokenizer.eos_id == 2001 + assert tokenizer.pad_id == -1 + assert tokenizer.step_id == 2005 + assert tokenizer.start_header_id == 2006 + assert tokenizer.end_header_id == 2007 + assert tokenizer.eom_id == 2008 + assert tokenizer.eot_id == 2009 + assert tokenizer.python_tag == 2255 + + def test_tokenizer_vocab_size(self, tokenizer): + assert tokenizer.base_vocab_size == 2000 + assert tokenizer.vocab_size == 2256 + + def test_tokenize_messages(self, tokenizer, messages, tokenized_messages): + assert tokenizer.tokenize_messages(messages) == tokenized_messages diff --git a/tests/torchtune/modules/tokenizers/test_utils.py b/tests/torchtune/modules/tokenizers/test_utils.py new file mode 100644 index 0000000000..023f86f3fd --- /dev/null +++ b/tests/torchtune/modules/tokenizers/test_utils.py @@ -0,0 +1,47 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
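# Editor's sketch (not part of this diff): how the (tokens, mask) pair returned by
# TikTokenTokenizer.tokenize_messages (exercised in the test above) is typically turned
# into supervised labels, with masked positions ignored by the loss. The asset path and
# message contents mirror the test fixtures.
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
from torchtune.data._types import Message
from torchtune.modules.tokenizers import TikTokenTokenizer

tokenizer = TikTokenTokenizer("tests/assets/tiktoken_small.model")
messages = [
    Message(role="user", content="I can see the sun.", masked=True),
    Message(role="assistant", content="And to know that the sun is there - that is living.", masked=False),
]
tokens, mask = tokenizer.tokenize_messages(messages)
# Prompt (masked) positions do not contribute to the loss; assistant tokens do.
labels = [CROSS_ENTROPY_IGNORE_IDX if masked else tok for tok, masked in zip(tokens, mask)]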
+ +import pytest +from torchtune.modules.tokenizers._utils import _split_long_repetitions + + +class TestUtils: + def test_split_long_repetitions(self): + normal_str = "Here is a normal string" + ten_spaces = "".join(10 * [" "]) + space_str = ten_spaces.join( + ["Here", "is", "a", "string", "with", "long", "spaces"] + ) + no_space_str = "".join(10 * ["ab"]) + + actual_split = _split_long_repetitions(normal_str, 5) + expected_split = ["Here is a norma", "l strin", "g"] + for actual_substr, expected_substr in zip(actual_split, expected_split): + assert actual_substr == expected_substr + with pytest.raises(StopIteration): + next(actual_split) + + actual_split = _split_long_repetitions(space_str, 9) + expected_split = [ + "Here" + ten_spaces[:-1], + " is" + ten_spaces[:-1], + " a" + ten_spaces[:-1], + " string" + ten_spaces[:-1], + " with" + ten_spaces[:-1], + " long" + ten_spaces[:-1], + " spaces", + ] + for actual_substr, expected_substr in zip(actual_split, expected_split): + assert actual_substr == expected_substr + with pytest.raises(StopIteration): + next(actual_split) + + actual_split = _split_long_repetitions(no_space_str, 4) + expected_split = ["abab"] * 5 + for actual_substr, expected_substr in zip(actual_split, expected_split): + assert actual_substr == expected_substr + with pytest.raises(StopIteration): + next(actual_split) diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index ddab8f45f1..d4b14a7486 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -31,6 +31,10 @@ class Recipe: name="llama2/7B_full_low_memory", file_path="llama2/7B_full_low_memory.yaml", ), + Config( + name="llama3/8B_full_single_device", + file_path="llama3/8B_full_single_device.yaml", + ), Config( name="mistral/7B_full_low_memory", file_path="mistral/7B_full_low_memory.yaml", @@ -44,6 +48,7 @@ class Recipe: configs=[ Config(name="llama2/7B_full", file_path="llama2/7B_full.yaml"), Config(name="llama2/13B_full", file_path="llama2/13B_full.yaml"), + Config(name="llama3/8B_full", file_path="llama3/8B_full.yaml"), Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"), Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"), ], @@ -61,6 +66,14 @@ class Recipe: name="llama2/7B_qlora_single_device", file_path="llama2/7B_qlora_single_device.yaml", ), + Config( + name="llama3/8B_lora_single_device", + file_path="llama3/8B_lora_single_device.yaml", + ), + Config( + name="llama3/8B_qlora_single_device", + file_path="llama3/8B_qlora_single_device.yaml", + ), Config( name="llama2/13B_qlora_single_device", file_path="llama2/13B_qlora_single_device.yaml", @@ -94,6 +107,7 @@ class Recipe: Config(name="llama2/7B_lora", file_path="llama2/7B_lora.yaml"), Config(name="llama2/13B_lora", file_path="llama2/13B_lora.yaml"), Config(name="llama2/70B_lora", file_path="llama2/70B_lora.yaml"), + Config(name="llama3/8B_lora", file_path="llama3/8B_lora.yaml"), Config(name="mistral/7B_lora", file_path="mistral/7B_lora.yaml"), ], supports_distributed=True, diff --git a/torchtune/data/_converters.py b/torchtune/data/_converters.py index 2c6025e8c8..5208220738 100644 --- a/torchtune/data/_converters.py +++ b/torchtune/data/_converters.py @@ -42,10 +42,11 @@ def sharegpt_to_llama2_messages( Returns: List[Message]: a list of messages with "role" and "content" fields. See `torchtune.datasets._types.Message` - and `torchtune.datasets._types.Dialogue` for more details. + for more details. 
""" role_map = {"system": "system", "human": "user", "gpt": "assistant"} conversations = sample["conversations"] + messages = [] for message in conversations: role = role_map[message["from"]] diff --git a/torchtune/data/_types.py b/torchtune/data/_types.py index 4aba199e3c..087cafa008 100644 --- a/torchtune/data/_types.py +++ b/torchtune/data/_types.py @@ -12,6 +12,23 @@ @dataclass class Message: + """ + This dataclass represents individual messages in an instruction or chat dataset. + + Note that the fields ipython and eot are only relevant when tokenizing with tiktoken, + as they inform handling of special tokens in that case. + + Attributes: + role (Role): role of the message writer. Can be "system", "user", "assistant". + content (str): content of the message. + masked (bool): whether the message is masked in the sample. Default: False + ipython (bool): whether the message is an ipython call. Default: False + eot (bool): whether the message corresponds to the end of a turn. Should be true + except in the case of multiple consecutive assistant messages. Default: True + """ + role: Role content: str masked: bool = False + ipython: bool = False + eot: bool = True diff --git a/torchtune/datasets/_alpaca.py b/torchtune/datasets/_alpaca.py index c339086468..52cbce6d52 100644 --- a/torchtune/datasets/_alpaca.py +++ b/torchtune/datasets/_alpaca.py @@ -8,7 +8,7 @@ from torchtune.data import AlpacaInstructTemplate from torchtune.datasets._instruct import InstructDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def alpaca_dataset( diff --git a/torchtune/datasets/_chat.py b/torchtune/datasets/_chat.py index 69c7b2f2e5..3f026aad73 100644 --- a/torchtune/datasets/_chat.py +++ b/torchtune/datasets/_chat.py @@ -18,7 +18,7 @@ sharegpt_to_llama2_messages, validate_messages, ) -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer class ChatDataset(Dataset): diff --git a/torchtune/datasets/_grammar.py b/torchtune/datasets/_grammar.py index e87c261faf..c7b4e05121 100644 --- a/torchtune/datasets/_grammar.py +++ b/torchtune/datasets/_grammar.py @@ -6,7 +6,7 @@ from torchtune.data import GrammarErrorCorrectionTemplate from torchtune.datasets._instruct import InstructDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def grammar_dataset( diff --git a/torchtune/datasets/_instruct.py b/torchtune/datasets/_instruct.py index 53f573c186..46e6ea3bba 100644 --- a/torchtune/datasets/_instruct.py +++ b/torchtune/datasets/_instruct.py @@ -10,15 +10,13 @@ from datasets import load_dataset from torch.utils.data import Dataset from torchtune.config._utils import _get_instruct_template - from torchtune.data import ( CROSS_ENTROPY_IGNORE_IDX, InstructTemplate, Message, validate_messages, ) - -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer class InstructDataset(Dataset): diff --git a/torchtune/datasets/_preference.py b/torchtune/datasets/_preference.py index 199ddfbae8..18871fefaf 100644 --- a/torchtune/datasets/_preference.py +++ b/torchtune/datasets/_preference.py @@ -12,7 +12,7 @@ from torchtune.data import CROSS_ENTROPY_IGNORE_IDX, InstructTemplate, Message -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer class PreferenceDataset(Dataset): diff --git a/torchtune/datasets/_samsum.py b/torchtune/datasets/_samsum.py index ba5561b64a..4fe750178e 100644 --- a/torchtune/datasets/_samsum.py +++ 
b/torchtune/datasets/_samsum.py @@ -6,7 +6,7 @@ from torchtune.data import SummarizeTemplate from torchtune.datasets import InstructDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def samsum_dataset( diff --git a/torchtune/datasets/_slimorca.py b/torchtune/datasets/_slimorca.py index 188aa692ba..dd70456f9f 100644 --- a/torchtune/datasets/_slimorca.py +++ b/torchtune/datasets/_slimorca.py @@ -8,7 +8,7 @@ from torchtune.datasets._chat import ChatDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def slimorca_dataset( diff --git a/torchtune/datasets/_stack_exchanged_paired.py b/torchtune/datasets/_stack_exchanged_paired.py index 5781cb4e55..f37b5d13cb 100644 --- a/torchtune/datasets/_stack_exchanged_paired.py +++ b/torchtune/datasets/_stack_exchanged_paired.py @@ -6,7 +6,7 @@ from torchtune.data import StackExchangedPairedTemplate from torchtune.datasets._preference import PreferenceDataset -from torchtune.modules import Tokenizer +from torchtune.modules.tokenizers import Tokenizer def stack_exchanged_paired_dataset( diff --git a/torchtune/models/gemma/_model_builders.py b/torchtune/models/gemma/_model_builders.py index ea7ad953ed..f598510ac2 100644 --- a/torchtune/models/gemma/_model_builders.py +++ b/torchtune/models/gemma/_model_builders.py @@ -5,7 +5,8 @@ # LICENSE file in the root directory of this source tree. from torchtune.models.gemma._component_builders import gemma -from torchtune.modules import Tokenizer, TransformerDecoder +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import SentencePieceTokenizer """ Model builders build specific instantiations using component builders. For example @@ -35,7 +36,7 @@ def gemma_2b() -> TransformerDecoder: ) -def gemma_tokenizer(path: str) -> Tokenizer: - tokenizer = Tokenizer.from_file(path) +def gemma_tokenizer(path: str) -> SentencePieceTokenizer: + tokenizer = SentencePieceTokenizer(path) tokenizer.pad_id = 0 return tokenizer diff --git a/torchtune/models/llama2/_model_builders.py b/torchtune/models/llama2/_model_builders.py index 839b10580f..15db69e146 100644 --- a/torchtune/models/llama2/_model_builders.py +++ b/torchtune/models/llama2/_model_builders.py @@ -8,7 +8,8 @@ from torchtune.models.llama2._component_builders import llama2, lora_llama2 -from torchtune.modules import Tokenizer, TransformerDecoder +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import SentencePieceTokenizer from torchtune.modules.peft import LORA_ATTN_MODULES @@ -39,8 +40,8 @@ def llama2_7b() -> TransformerDecoder: ) -def llama2_tokenizer(path: str) -> Tokenizer: - tokenizer = Tokenizer.from_file(path) +def llama2_tokenizer(path: str) -> SentencePieceTokenizer: + tokenizer = SentencePieceTokenizer(path) # Original tokenizer has no pad_id, which causes indexing errors when batch training tokenizer.pad_id = 0 return tokenizer diff --git a/torchtune/models/llama3/__init__.py b/torchtune/models/llama3/__init__.py new file mode 100644 index 0000000000..99309b9300 --- /dev/null +++ b/torchtune/models/llama3/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
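# Editor's sketch (not part of this diff): minimal usage of the public builders exported
# by this new llama3 package. These construct randomly initialized modules; checkpoint
# loading is handled elsewhere in torchtune.
from torchtune.models.llama3 import llama3_8b, lora_llama3_8b, qlora_llama3_8b

base_model = llama3_8b()  # full 8B TransformerDecoder
lora_model = lora_llama3_8b(
    lora_attn_modules=["q_proj", "v_proj"],  # apply LoRA only to the Q and V projections
    lora_rank=8,
    lora_alpha=16,
)
qlora_model = qlora_llama3_8b(  # same as lora_llama3_8b with quantize_base=True
    lora_attn_modules=["q_proj", "v_proj"],
)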
+ +from ._component_builders import llama3, lora_llama3 + +from ._model_builders import ( # noqa + llama3_8b, + llama3_tokenizer, + lora_llama3_8b, + qlora_llama3_8b, +) +from ._model_utils import scale_hidden_dim_for_mlp + +__all__ = [ + "llama3", + "llama3_8b", + "llama3_tokenizer", + "lora_llama3", + "lora_llama3_8b", + "qlora_llama3_8b", + "scale_hidden_dim_for_mlp", +] diff --git a/torchtune/models/llama3/_component_builders.py b/torchtune/models/llama3/_component_builders.py new file mode 100644 index 0000000000..0828285645 --- /dev/null +++ b/torchtune/models/llama3/_component_builders.py @@ -0,0 +1,410 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from functools import partial +from typing import List, Literal, Optional + +from torch import nn + +from torchtune.models.llama3._model_utils import scale_hidden_dim_for_mlp + +from torchtune.modules import ( + CausalSelfAttention, + FeedForward, + KVCache, + RMSNorm, + RotaryPositionalEmbeddings, + TransformerDecoder, + TransformerDecoderLayer, +) + +from torchtune.modules.common_utils import reparametrize_as_dtype_state_dict_post_hook + +from torchtune.modules.peft import LORA_ATTN_MODULES, LoRALinear + +""" +Component builders for the Llama3 model and popular variants such as LoRA. + +TorchTune provides composable building blocks. Builder functions help +stitch these building blocks into higher-level components. This design has +two benefits: +- The building blocks themselves are very flexible. For example, ``CausalSelfAttention`` +can take either nn.Linear or nn.LoRALinear for ``q_proj``. +- Builder functions expose a set of configurable params which keep the constructors of +the building blocks simple. +""" + + +# ------------------ Vanilla Llama3 ------------------ + +def llama3( + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + max_seq_len: int, + attn_dropout: float = 0.0, + rope_base: int = 500000.0, + intermediate_dim: Optional[int] = None, + norm_eps: float = 1e-5, +) -> TransformerDecoder: + """ + Build the decoder associated with the Llama3 model. This includes: + - Token embeddings + - num_layers number of TransformerDecoderLayer blocks + - RMS Norm layer applied to the output of the transformer + - Final projection into token space + + Args: + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. If specified, + user should ensure `num_heads` % `num_kv_heads` == 0. Default value is + `None`, in which case this is the same as MHA + embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp` + norm_eps (float): epsilon in RMS norms. + + Returns: + TransformerDecoder: Instantiation of Llama3 model. 
+ """ + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = CausalSelfAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=False), + k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), + v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False), + output_proj=nn.Linear(embed_dim, embed_dim, bias=False), + pos_embeddings=rope, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + hidden_dim = intermediate_dim if intermediate_dim else scale_hidden_dim_for_mlp(embed_dim) + mlp = llama3_mlp(dim=embed_dim, hidden_dim=hidden_dim) + layer = TransformerDecoderLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + output_proj = nn.Linear(embed_dim, vocab_size, bias=False) + return TransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=head_dim, + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + +def llama3_mlp(dim: int, hidden_dim: int) -> FeedForward: + """ + Build the MLP layer associated with the Llama model. + """ + gate_proj = nn.Linear(dim, hidden_dim, bias=False) + down_proj = nn.Linear(hidden_dim, dim, bias=False) + up_proj = nn.Linear(dim, hidden_dim, bias=False) + return FeedForward(gate_proj=gate_proj, down_proj=down_proj, up_proj=up_proj) + + + +# ------------------ LoRA Llama3 ------------------ + + +def lora_llama3( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + *, + # llama3 args + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + max_seq_len: int, + intermediate_dim: Optional[int] = None, + attn_dropout: float = 0.0, + norm_eps: float = 1e-5, + rope_base: float = 500000.0, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + # Quantization args + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Return a version of Llama3 (an instance of :func:`~torchtune.modules.TransformerDecoder`) + with LoRA applied to some of the linear layers in its self-attention modules. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. If specified, + user should ensure `num_heads` % `num_kv_heads` == 0. 
Default value is + `None`, in which case this is the same as MHA + embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp` + norm_eps (float): epsilon in RMS norms. + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base + weights within linear layers LoRA is applied to. The final output linear projection is not + supported for quantization currently. + + Returns: + TransformerDecoder: Instantiation of Llama3 model with LoRA applied to + a subset of the attention projections in each layer. + + """ + + self_attn = lora_llama3_self_attention( + lora_modules=lora_attn_modules, + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + rope_base=rope_base, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + quantize_base=quantize_base, + ) + + hidden_dim = intermediate_dim if intermediate_dim else scale_hidden_dim_for_mlp(embed_dim) + if apply_lora_to_mlp: + mlp = lora_llama3_mlp( + dim=embed_dim, + hidden_dim=hidden_dim, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + quantize_base=quantize_base, + ) + else: + mlp = llama3_mlp(dim=embed_dim, hidden_dim=hidden_dim) + + layer = TransformerDecoderLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + + # TODO: quantize_base is not applied to final output_proj currently. + output_proj = ( + LoRALinear(embed_dim, vocab_size, rank=lora_rank, alpha=lora_alpha) + if apply_lora_to_output + else nn.Linear(embed_dim, vocab_size, bias=False) + ) + model = TransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=(embed_dim // num_heads), + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + + if quantize_base: + # For QLoRA, we reparametrize 4-bit tensors to bf16, and offload to CPU on the fly + # so as to not increase peak memory + model._register_state_dict_hook( + partial(reparametrize_as_dtype_state_dict_post_hook, offload_to_cpu=True) + ) + + return model + + +def lora_llama3_self_attention( + lora_modules: List[LORA_ATTN_MODULES], + *, + # CausalSelfAttention args + embed_dim: int, + num_heads: int, + num_kv_heads: int, + max_seq_len: int, + attn_dropout: float = 0.0, + rope_base: float = 500000.0, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + quantize_base: bool = False, +) -> CausalSelfAttention: + """ + Return an instance of :func:`~torchtune.modules.CausalSelfAttention` with LoRA + applied to a subset of its linear layers + + Args: + lora_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to. Options are ``{"q_proj", "k_proj", "v_proj", + "output_proj"}``. 
+ embed_dim (int): embedding dimension for self-attention + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. If specified, + user should ensure `num_heads` % `num_kv_heads` == 0. Default value is + `None`, in which case this is the same as MHA + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base (bool): Whether to quantize base model parameters for linear layers + LoRA is being applied to. Default is ``False``. + + Returns: + CausalSelfAttention: instantiation of self-attention module with LoRA + applied to a subset of Q, K, V, output projections. + + Raises: + ValueError: If lora_modules arg is an empty list + """ + if not lora_modules: + raise ValueError( + f"Must pass one or more of {LORA_ATTN_MODULES} as lora_modules" + ) + + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + q_proj = ( + LoRALinear( + embed_dim, + num_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + quantize_base=quantize_base, + ) + if "q_proj" in lora_modules + else nn.Linear(embed_dim, num_heads * head_dim, bias=False) + ) + k_proj = ( + LoRALinear( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + quantize_base=quantize_base, + ) + if "k_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) + ) + v_proj = ( + LoRALinear( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + quantize_base=quantize_base, + ) + if "v_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False) + ) + output_proj = ( + LoRALinear( + embed_dim, + embed_dim, + rank=lora_rank, + alpha=lora_alpha, + quantize_base=quantize_base, + ) + if "output_proj" in lora_modules + else nn.Linear(embed_dim, embed_dim, bias=False) + ) + rope = RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = CausalSelfAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=q_proj, + k_proj=k_proj, + v_proj=v_proj, + output_proj=output_proj, + pos_embeddings=rope, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + return self_attn + + +def lora_llama3_mlp( + *, + dim: int, + hidden_dim: int, + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + quantize_base: bool = False, +) -> FeedForward: + gate_proj = LoRALinear( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + down_proj = LoRALinear( + in_dim=hidden_dim, + out_dim=dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + up_proj = LoRALinear( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + return FeedForward( + gate_proj=gate_proj, + down_proj=down_proj, + up_proj=up_proj, + ) diff --git a/torchtune/models/llama3/_model_builders.py b/torchtune/models/llama3/_model_builders.py new file mode 100644 index 0000000000..4286a0f145 --- /dev/null +++ 
b/torchtune/models/llama3/_model_builders.py @@ -0,0 +1,109 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List, Optional +from functools import partial + +from torch import nn + +from torchtune.models.llama3._component_builders import llama3, lora_llama3 +from torchtune.models.llama3._model_utils import scale_hidden_dim_for_mlp + +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import TikTokenTokenizer +from torchtune.modules.peft import LORA_ATTN_MODULES + + +""" +Model builders build specific instantiations using component builders. For example +the llama3_8b model builder uses the llama3 component builder to create the +Llama3 8B model. +""" + + +def llama3_8b() -> TransformerDecoder: + """ + Builder for creating a Llama3 model initialized w/ the default 8b parameter values. + + Returns: + TransformerDecoder: Instantiation of Llama3 8B model + """ + return llama3( + vocab_size=128_256, + num_layers=32, + num_heads=32, + num_kv_heads=8, + embed_dim=4096, + max_seq_len=4096, + intermediate_dim=14336, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=500000.0, + ) + + +def llama3_tokenizer(path: str) -> TikTokenTokenizer: + tiktoken = TikTokenTokenizer(path) + tiktoken.pad_id = 0 + return tiktoken + + +def lora_llama3_8b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + quantize_base: bool = False, +) -> TransformerDecoder: + """ + Builder for creating a Llama3 8B model with LoRA enabled. + + The Llama3 defaults are the same as in :func:`~torchtune.models.llama3.llama3_8b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Llama3 8B model with LoRA applied + """ + return lora_llama3( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=128_256, + num_layers=32, + num_heads=32, + num_kv_heads=8, + embed_dim=4096, + max_seq_len=4096, + intermediate_dim=14336, + attn_dropout=0.0, + norm_eps=1e-5, + rope_base=500000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=0.05, + quantize_base=quantize_base, + ) + +qlora_llama3_8b = partial(lora_llama3_8b, quantize_base=True) + +qlora_llama3_8b.__doc__ = """ +Builder for creating a Llama3 model with QLoRA enabled. Base model weights in linear layers +that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314. +Please see `lora_llama3_8b` for full API arguments. 
+""" diff --git a/torchtune/models/llama3/_model_utils.py b/torchtune/models/llama3/_model_utils.py new file mode 100644 index 0000000000..010c1bcc2f --- /dev/null +++ b/torchtune/models/llama3/_model_utils.py @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +def scale_hidden_dim_for_mlp(dim: int, multiple_of: int = 256) -> int: + """Scale hidden dimension for MLP to keep number of parameters and computation constant. + + Args: + dim (int): Input dimension. + multiple_of (int): Round scaled dimension to nearest multiple of `multiple_of` for clean computation. + + Returns: + Scaled hidden dimension. + """ + # Scale hidden dimension by (2/3)4d for SwiGLU to keep number of + # parameters and computation constant + hidden_dim = 4 * int(2 * dim / 3) + # Round hidden dimension to nearest multiple of `multiple_of` + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + return hidden_dim diff --git a/torchtune/models/mistral/_model_builders.py b/torchtune/models/mistral/_model_builders.py index 3071608eb0..940d1820ae 100644 --- a/torchtune/models/mistral/_model_builders.py +++ b/torchtune/models/mistral/_model_builders.py @@ -7,7 +7,8 @@ from torchtune.models.mistral._component_builders import mistral, lora_mistral -from torchtune.modules import Tokenizer, TransformerDecoder +from torchtune.modules import TransformerDecoder +from torchtune.modules.tokenizers import SentencePieceTokenizer from torchtune.modules.peft import LORA_ATTN_MODULES from functools import partial @@ -40,8 +41,8 @@ def mistral_7b() -> TransformerDecoder: ) -def mistral_tokenizer(path: str) -> Tokenizer: - tokenizer = Tokenizer.from_file(path) +def mistral_tokenizer(path: str) -> SentencePieceTokenizer: + tokenizer = SentencePieceTokenizer(path) # Original tokenizer has no pad_id, which causes indexing errors when batch training tokenizer.pad_id = 0 return tokenizer diff --git a/torchtune/modules/__init__.py b/torchtune/modules/__init__.py index 7d08ea5bd2..46b8e93b0f 100644 --- a/torchtune/modules/__init__.py +++ b/torchtune/modules/__init__.py @@ -11,7 +11,6 @@ from .lr_schedulers import get_cosine_schedule_with_warmup # noqa from .position_embeddings import RotaryPositionalEmbeddings # noqa from .rms_norm import RMSNorm # noqa -from .tokenizer import Tokenizer # noqa from .transformer import TransformerDecoder, TransformerDecoderLayer # noqa __all__ = [ @@ -21,7 +20,6 @@ "KVCache", "RotaryPositionalEmbeddings", "RMSNorm", - "Tokenizer", "TransformerDecoder", "TransformerDecoderLayer", "reparametrize_as_dtype_state_dict_post_hook", diff --git a/torchtune/modules/tokenizers/__init__.py b/torchtune/modules/tokenizers/__init__.py new file mode 100644 index 0000000000..069849bf35 --- /dev/null +++ b/torchtune/modules/tokenizers/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from ._sentencepiece import SentencePieceTokenizer +from ._tiktoken import TikTokenTokenizer +from ._utils import Tokenizer + +__all__ = ["SentencePieceTokenizer", "TikTokenTokenizer", "Tokenizer"] diff --git a/torchtune/modules/tokenizer.py b/torchtune/modules/tokenizers/_sentencepiece.py similarity index 84% rename from torchtune/modules/tokenizer.py rename to torchtune/modules/tokenizers/_sentencepiece.py index 07c0268fb4..94104faa2f 100644 --- a/torchtune/modules/tokenizer.py +++ b/torchtune/modules/tokenizers/_sentencepiece.py @@ -13,37 +13,31 @@ WHITESPACE_CHARS = [" ", "\n", "\t", "\r", "\v"] -class Tokenizer: +class SentencePieceTokenizer: """A wrapper around SentencePieceProcessor. Args: - spm_model (SentencePieceProcessor): The SentencePiece model. - vocab_size (int): The size of the vocabulary. - bos_id (int): The ID of the beginning-of-sentence token. - eos_id (int): The ID of the end-of-sentence token. - pad_id (int): The ID of the padding token. + path (str): Path to pretrained tokenizer file. Example: # Accepts only non-batched input for now - >>> tokenizer = Tokenizer.from_file("/path/to/spm_model") - >>> tokenized_text = tokenizer.encode("Hello world!", add_bos=True, add_eos=True) + >>> tokenizer = SentencePieceTokenizer("/path/to/spm_model") + >>> tokenized_text = SentencePieceTokenizer.encode("Hello world!", add_bos=True, add_eos=True) >>> print(tokenized_text) [1, 31587, 29644, 102, 2] """ def __init__( self, - spm_model: SentencePieceProcessor, - vocab_size: int, - bos_id: int, - eos_id: int, - pad_id: int, + path: str, ): + spm_model = SentencePieceProcessor() + spm_model.load(path) self.spm_model = spm_model - self.vocab_size = vocab_size - self.bos_id = bos_id - self.eos_id = eos_id - self.pad_id = pad_id + self.vocab_size = spm_model.vocab_size() + self.bos_id = spm_model.bos_id() + self.eos_id = spm_model.eos_id() + self.pad_id = spm_model.pad_id() # This is used in tokenize_messages: if the tokenizer does not # encode whitespace, then we can more easily split strings @@ -52,20 +46,6 @@ def __init__( [self.spm_model.encode(c) for c in WHITESPACE_CHARS] ) - @classmethod - def from_file(cls, path: str) -> "Tokenizer": - """Initialize a `Tokenizer` instance from a SentencePiece model file. - - Args: - path (str): The path to the SentencePiece model file. - - Returns: - Tokenizer: A `Tokenizer` instance. - """ - spm = SentencePieceProcessor() - spm.load(path) - return cls(spm, spm.vocab_size(), spm.bos_id(), spm.eos_id(), spm.pad_id()) - def encode( self, text: str, @@ -135,7 +115,7 @@ def tokenize_messages( beginning off the tokenized s2. Example: - >>> tokenizer = Tokenizer.from_file(tokenizer_path) + >>> tokenizer = SentencePieceTokenizer(tokenizer_path) >>> messages = [ Message(role="system", content="system message\n", masked=True), Message(role="user", content="user prompt\n", masked=True), diff --git a/torchtune/modules/tokenizers/_tiktoken.py b/torchtune/modules/tokenizers/_tiktoken.py new file mode 100644 index 0000000000..104cee1353 --- /dev/null +++ b/torchtune/modules/tokenizers/_tiktoken.py @@ -0,0 +1,367 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
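# Editor's sketch (not part of this diff): the rename above also changes construction.
# Old API:  tokenizer = Tokenizer.from_file("/path/to/m.model")
# New API (this PR) loads the SentencePiece file directly in __init__; the path below
# is hypothetical.
from torchtune.modules.tokenizers import SentencePieceTokenizer

tokenizer = SentencePieceTokenizer("/path/to/m.model")
ids = tokenizer.encode("Hello world!", add_bos=True, add_eos=True)
text = tokenizer.decode(ids)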
+ +from typing import Dict, List, Optional, Tuple + +from tiktoken import Encoding +from tiktoken.load import load_tiktoken_bpe +from torchtune.data._types import Message +from torchtune.modules.tokenizers._utils import ( + _split_long_repetitions, + Tokenizer, + truncate, +) + + +CL100K_PATTERN = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" # noqa + +# bos and eos tokens +BEGIN_OF_TEXT = "<|begin_of_text|>" +END_OF_TEXT = "<|end_of_text|>" +# fill-in-the-middle tags +FIM_PREFIX = "<|fim_prefix|>" +FIM_MIDDLE = "<|fim_middle|>" +FIM_SUFFIX = "<|fim_suffix|>" +# start and end header tokens for formatting chat messages +START_HEADER_ID = "<|start_header_id|>" +END_HEADER_ID = "<|end_header_id|>" +STEP_ID = "<|step_id|>" +# different end of message tags +EOM_ID = "<|eom_id|>" +EOT_ID = "<|eot_id|>" +# special token for ipython messages +PYTHON_TAG = "<|python_tag|>" + +ALL_SPECIAL_TOKENS = [ + BEGIN_OF_TEXT, + END_OF_TEXT, + FIM_PREFIX, + FIM_MIDDLE, + FIM_SUFFIX, + STEP_ID, + START_HEADER_ID, + END_HEADER_ID, + EOM_ID, + EOT_ID, + PYTHON_TAG, +] + +PAD_ID = -1 + +# Constants controlling encode logic +MAX_ENCODE_CHARS = 400_000 +MAX_NO_WHITESPACE_CHARS = 25_000 + + +class TikTokenTokenizer(Tokenizer): + """A wrapper around tiktoken Encoding. + + Args: + path (str): Path to pretrained tokenizer checkpoint file. + name (str): Name of the tokenizer (used by tiktoken for identification). + pattern (str): Regex pattern used to for string parsing. + all_special_tokens (Optional[List[str]]): List of all special tokens. First element + must be bos token, second element must be eos token, final element must be + python tag. All elements must be unique. Length must be at most 256. + Default: None (will use ALL_SPECIAL_TOKENS) + bos_token (str): Beginning of sequence token. Defaults to BEGIN_OF_TEXT. + eos_token (str): End of sequence token. Defaults to END_OF_TEXT. + start_header_id (str): Start header token. Defaults to START_HEADER_ID. + end_header_id (str): End header token. Defaults to END_HEADER_ID. + step_id (str): Step token. Defaults to STEP_ID. + eom_id (str): End of message token. Defaults to EOM_ID. + eot_id (str): End of turn token. Defaults to EOT_ID. + python_tag (str): Python tag token. Defaults to PYTHON_TAG. 
+ """ + + def __init__( + self, + path: str, + *, + name: str = "llama3_tiktoken", + pattern: str = CL100K_PATTERN, + all_special_tokens: Optional[List[str]] = None, + bos_token: str = BEGIN_OF_TEXT, + eos_token: str = END_OF_TEXT, + start_header_id: str = START_HEADER_ID, + end_header_id: str = END_HEADER_ID, + step_id: str = STEP_ID, + eom_id: str = EOM_ID, + eot_id: str = EOT_ID, + python_tag: str = PYTHON_TAG, + ): + self.path = path + self.num_reserved_special_tokens = 256 + all_special_tokens = all_special_tokens or ALL_SPECIAL_TOKENS + self._validate_special_tokens( + all_special_tokens=all_special_tokens, + bos_token=bos_token, + eos_token=eos_token, + step_id=step_id, + start_header_id=start_header_id, + end_header_id=end_header_id, + eom_id=eom_id, + eot_id=eot_id, + python_tag=python_tag, + ) + self.all_special_tokens = all_special_tokens + + mergeable_ranks = load_tiktoken_bpe(self.path) + self.base_vocab_size = len(mergeable_ranks) + all_special_tokens_with_ids = self._get_all_special_tokens_with_ids() + self.tt_model = Encoding( + name=name, + pat_str=pattern, + mergeable_ranks=mergeable_ranks, + special_tokens={**all_special_tokens_with_ids}, + ) + + # Encode BOS and EOS, define pad ID + self.bos_id = self._encode_special_token(self.all_special_tokens[0]) + self.eos_id = self._encode_special_token(self.all_special_tokens[1]) + self.pad_id = PAD_ID + + self.vocab_size = self.tt_model.n_vocab + + # Encode extra special tokens + self.step_id = self._encode_special_token(step_id) + self.start_header_id = self._encode_special_token(start_header_id) + self.end_header_id = self._encode_special_token(end_header_id) + self.eom_id = self._encode_special_token(eom_id) + self.eot_id = self._encode_special_token(eot_id) + self.python_tag = self._encode_special_token(python_tag) + + def _validate_special_tokens( + self, + *, + all_special_tokens: List[str], + bos_token: str, + eos_token: str, + step_id: str, + start_header_id: str, + end_header_id: str, + eom_id: str, + eot_id: str, + python_tag: str, + ): + """ + Validate all the special tokens are as expected. Should satisfy: + + (1) bos_token, eos_token, step_id, start_header_id, end_header_id, eom_id, + eot_id, python_tag are all in all_special_tokens, + (2) bos_token should be first, eos_token should be second, python_tag should be last, + (3) all special tokens are unique, and + (4) at most 256 special tokens + """ + for token in [ + bos_token, + eos_token, + step_id, + start_header_id, + end_header_id, + eom_id, + eot_id, + python_tag, + ]: + assert ( + token in all_special_tokens + ), f"{token} missing from all_special_tokens" + assert ( + all_special_tokens[0] == bos_token + ), f"First special token must be bos, got {all_special_tokens[0]}" + assert ( + all_special_tokens[1] == eos_token + ), f"Second special token must be eos, got {all_special_tokens[1]}" + assert ( + all_special_tokens[-1] == python_tag + ), f"Last special token must be python_tag, got {all_special_tokens[-1]}" + assert len(set(all_special_tokens)) == len( + all_special_tokens + ), "Special tokens must be unique." + assert ( + len(all_special_tokens) <= self.num_reserved_special_tokens + ), "The total number of basic and extra special tokens cannot exceed the number of reserved tokens." + + def _get_all_special_tokens_with_ids(self) -> Dict[str, int]: + """ + Returns a dictionary of all special tokens and their corresponding ids to be passed + to tiktoken Encoding. 
+ + There are 256 slots for special tokens, any remaining spaces beyond self.all_special_tokens + will be filled with dummy reserved tokens. Tokens will be added in the order: + (1) all special tokens but python_tag, (2) all reserved tokens, (3) python_tag. + """ + reserved_tokens = [ + f"<|reserved_special_token_{i}|>" + for i in range( + self.num_reserved_special_tokens - len(self.all_special_tokens) + ) + ] + # Python tag special token should come last (validated in __init__) + all_special_tokens = ( + self.all_special_tokens[:-1] + + reserved_tokens + + [self.all_special_tokens[-1]] + ) + + return { + token: self.base_vocab_size + i + for i, token in enumerate(all_special_tokens) + } + + def _encode_special_token(self, token: str) -> int: + """ + Encodes a special token. + + Args: + token (str): The special token to encode. + + Returns: + int: The encoded special token. + """ + return self.tt_model.encode( + token, + allowed_special="all", + disallowed_special=(), + )[0] + + def encode( + self, + text: str, + add_bos: bool, + add_eos: bool, + ) -> List[int]: + """ + Encode a string into a list of token ids. Assumes that the string + contains no special tokens. + + Args: + text (str): The string to encode. + add_bos (bool): Whether to add the beginning of sequence token. + add_eos (bool): Whether to add the end of sequence token. + + Returns: + List[int]: The list of token ids. + """ + substrs: List[str] = [] + tokens = [] + for i in range(0, len(text), MAX_ENCODE_CHARS): + substr = text[i : i + MAX_ENCODE_CHARS] + # See https://github.com/openai/tiktoken/issues/195 + sliced_substr = _split_long_repetitions(substr, MAX_NO_WHITESPACE_CHARS) + substrs.extend(sliced_substr) + for substr in substrs: + # allowed_special and disallowed_special are used by tiktoken to define + # how special tokens are encoded. Our setting here is to encode any + # special token as regular text and prevent tiktoken from raising errors. + # This means we should only call encode on strings not containing special tokens. + tokens.extend( + self.tt_model.encode( + substr, + allowed_special=set(), + disallowed_special=(), + ) + ) + if add_bos: + tokens.insert(0, self.bos_id) + if add_eos: + tokens.append(self.eos_id) + return tokens + + def decode( + self, + token_ids: List[int], + truncate_at_eos: bool = True, + ) -> str: + """ + Decode a list of token ids into a string. + + Args: + token_ids (List[int]): The list of token ids. + truncate_at_eos (bool): Whether to truncate the string at the end of + sequence token. + + Returns: + str: The decoded string. + """ + if truncate_at_eos: + try: + k = token_ids.index(self.eos_id) + except ValueError: + k = None + if k: + token_ids = token_ids[:k] + token_ids = [token_id for token_id in token_ids if token_id != self.bos_id] + return self.tt_model.decode(token_ids) + + def tokenize_message( + self, message: Message, tokenize_header: bool = False + ) -> List[int]: + """ + Tokenize a message into a list of token ids. + + Args: + message (Message): The message to tokenize. + tokenize_header (bool): Whether to prepend a tokenized header to each message. + + Returns: + List[int]: The list of token ids. 
+ """ + if tokenize_header: + tokenized_header = ( + [self.start_header_id] + + self.encode(message.role.strip(), add_bos=False, add_eos=False) + + [self.end_header_id] + + self.encode("\n\n", add_bos=False, add_eos=False) + ) + else: + tokenized_header = [] + tokenized_body = self.encode( + message.content.strip(), add_bos=False, add_eos=False + ) + if message.ipython: + tokenized_body = [self.python_tag] + tokenized_body + tokenized_message = tokenized_header + tokenized_body + if message.eot: + tokenized_message = tokenized_message + [self.eot_id] + else: + tokenized_message = tokenized_message + [self.eom_id] + return tokenized_message + + def tokenize_messages( + self, + messages: List[Message], + max_seq_len: Optional[int] = None, + tokenize_header: bool = True, + ) -> Tuple[List[int], List[bool]]: + """ + Tokenize a list of messages into a list of token ids and masks. + + Args: + messages (List[Message]): The list of messages to tokenize. + max_seq_len (Optional[int]): The maximum sequence length. + tokenize_header (bool): Whether to prepend a tokenized header to each message. + + Returns: + Tuple[List[int], List[bool]]: The list of token ids and the list of masks. + """ + tokens = [self.bos_id] + # bos and eos are always masked + mask = [True] + for message in messages: + tokenized_message = self.tokenize_message( + message, tokenize_header=tokenize_header + ) + tokens = tokens + tokenized_message + mask = mask + ([message.masked] * len(tokenized_message)) + if max_seq_len and len(tokens) >= max_seq_len: + break + tokens = tokens + [self.eos_id] + mask = mask + [True] + if max_seq_len: + tokens = truncate(tokens, max_seq_len, self.eos_id) + mask = truncate(mask, max_seq_len, True) + return tokens, mask diff --git a/torchtune/modules/tokenizers/_utils.py b/torchtune/modules/tokenizers/_utils.py new file mode 100644 index 0000000000..cdfaddfcdf --- /dev/null +++ b/torchtune/modules/tokenizers/_utils.py @@ -0,0 +1,71 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Iterator, List, Protocol, Union + +from torchtune.data._types import Message + + +class Tokenizer(Protocol): + """Abstract tokenizer""" + + bos_id: int + eos_id: int + pad_id: int + + def encode(self, text: str, **kwargs) -> List[int]: + """ + Given a string, return the a list of token ids. + """ + + def decode( + self, token_ids: List[int], add_bos: bool, add_eos: bool, **kwargs + ) -> str: + """ + Given a list of token ids, return the decoded text. + """ + + def tokenize_messages(self, token_ids: List[Message], **kwargs): + """ + Given a list of messages, return a list of tokens for the concatenated + and formatted messages. 
+ """ + pass + + +def truncate( + tokens: List[int], + max_seq_len: int, + eos_id: Union[int, bool], +): + tokens_truncated = tokens[:max_seq_len] + if tokens_truncated[-1] != eos_id: + tokens_truncated[-1] = eos_id + return tokens_truncated + + +def _split_long_repetitions(s: str, max_consecutive_slice_len: int) -> Iterator[str]: + """ + Split the string `s` so that each substring contains no more than `max_consecutive_slice_len` + consecutive whitespaces or consecutive non-whitespaces + """ + current_slice_len = 0 + current_slice_is_space = s[0].isspace() if len(s) > 0 else False + slice_start = 0 + + for i in range(len(s)): + is_now_space = s[i].isspace() + + if current_slice_is_space ^ is_now_space: + current_slice_len = 1 + current_slice_is_space = is_now_space + else: + current_slice_len += 1 + if current_slice_len > max_consecutive_slice_len: + yield s[slice_start:i] + slice_start = i + current_slice_len = 1 + yield s[slice_start:] diff --git a/torchtune/utils/_checkpointing/_checkpointer_utils.py b/torchtune/utils/_checkpointing/_checkpointer_utils.py index ec1f4a47cd..a1391028e0 100644 --- a/torchtune/utils/_checkpointing/_checkpointer_utils.py +++ b/torchtune/utils/_checkpointing/_checkpointer_utils.py @@ -22,6 +22,7 @@ class ModelType(Enum): LLAMA2 = "llama2" MISTRAL = "mistral" GEMMA = "gemma" + LLAMA3 = "llama3" def get_path(input_dir: Path, filename: str, missing_ok: bool = False) -> Path: