
Commit 986f90c

Merge branch 'main' into hsdp

2 parents 0be173f + 952078e


42 files changed (+13336 / -254 lines)

README.md

Lines changed: 87 additions & 78 deletions
Large diffs are not rendered by default.

docs/source/api_ref_modules.rst

Lines changed: 1 addition & 0 deletions
@@ -50,6 +50,7 @@ model specific tokenizers.

    transforms.tokenizers.SentencePieceBaseTokenizer
    transforms.tokenizers.TikTokenBaseTokenizer
+   transforms.tokenizers.HuggingFaceBaseTokenizer
    transforms.tokenizers.ModelTokenizer
    transforms.tokenizers.BaseTokenizer

docs/source/api_ref_training.rst

Lines changed: 1 addition & 0 deletions
@@ -102,6 +102,7 @@ Various logging utilities.

    metric_logging.TensorBoardLogger
    metric_logging.StdoutLogger
    metric_logging.DiskLogger
+   metric_logging.MLFlowLogger

.. _perf_profiling_label:

docs/source/basics/model_transforms.rst

Lines changed: 2 additions & 2 deletions
@@ -32,8 +32,8 @@ These are intended to be drop-in replacements for tokenizers in multimodal datas

    Message(
        role="user",
        content=[
-           {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
-           {"type": "image", "content": Image.new(mode="RGB", size=(224, 224))},
+           {"type": "image", "content": Image.new(mode="RGB", size=(560, 560))},
+           {"type": "image", "content": Image.new(mode="RGB", size=(560, 560))},
            {"type": "text", "content": "What is common in these two images?"},
        ],
    ),

docs/source/basics/multimodal_datasets.rst

Lines changed: 1 addition & 1 deletion
@@ -45,7 +45,7 @@ in the text, ``"<image>"`` for where to place the image tokens. This will get re

    from torchtune.models.llama3_2_vision import llama3_2_vision_transform
    from torchtune.datasets.multimodal import multimodal_chat_dataset

-   model_transform = Llama3VisionTransform(
+   model_transform = llama3_2_vision_transform(
        path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
        prompt_template="torchtune.data.QuestionAnswerTemplate",
        max_seq_len=8192,

docs/source/basics/tokenizers.rst

Lines changed: 24 additions & 0 deletions
@@ -222,6 +222,30 @@ to do the actual encoding and decoding.

    print(sp_tokenizer.encode(text))
    # [1, 6312, 28709, 1526, 2]

+ .. _hf_tokenizers:
+
+ Using Hugging Face tokenizers
+ -----------------------------
+
+ Sometimes tokenizers hosted on Hugging Face do not contain files compatible with one of torchtune's
+ existing tokenizer classes. In this case, we provide :class:`~torchtune.modules.transforms.tokenizers.HuggingFaceBaseTokenizer`
+ to parse the Hugging Face ``tokenizer.json`` file and define the correct ``encode`` and ``decode`` methods to
+ match torchtune's other :class:`~torchtune.modules.transforms.tokenizers.BaseTokenizer` classes. You should also pass the path to
+ either ``tokenizer_config.json`` or ``generation_config.json``, which will allow torchtune to infer BOS and EOS tokens.
+ Continuing with the Mistral example:
+
+ .. code-block:: python
+
+     hf_tokenizer = HuggingFaceBaseTokenizer(
+         tokenizer_json_path="/tmp/Mistral-7B-v0.1/tokenizer.json",
+         tokenizer_config_json_path="/tmp/Mistral-7B-v0.1/tokenizer_config.json",
+     )
+
+     text = "hello world"
+
+     print(hf_tokenizer.encode(text))
+     # [1, 6312, 28709, 1526, 2]
+
.. _model_tokenizers:

Model tokenizers
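
Not part of the diff, but for context: since the new section states that the wrapper implements both ``encode`` and ``decode`` to match torchtune's other ``BaseTokenizer`` classes, a round trip along the following lines should work. The import path is inferred from the class reference above, and the encoded output comment is copied from the example in the diff; treat this as a sketch rather than the documented API usage.

    from torchtune.modules.transforms.tokenizers import HuggingFaceBaseTokenizer

    hf_tokenizer = HuggingFaceBaseTokenizer(
        tokenizer_json_path="/tmp/Mistral-7B-v0.1/tokenizer.json",
        tokenizer_config_json_path="/tmp/Mistral-7B-v0.1/tokenizer_config.json",
    )

    token_ids = hf_tokenizer.encode("hello world")
    print(token_ids)
    # [1, 6312, 28709, 1526, 2]

    # Decoding should recover the original text (modulo special tokens).
    print(hf_tokenizer.decode(token_ids))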

docs/source/deep_dives/checkpointer.rst

Lines changed: 7 additions & 7 deletions
@@ -293,8 +293,8 @@ For more details about each file, please check the End-to-End tutorial mentioned

    │ ├── adapter_model.pt
    │ ├── adapter_model.safetensors
    │ ├── config.json
-   │ ├── ft-model-00001-of-00002.safetensors
-   │ ├── ft-model-00002-of-00002.safetensors
+   │ ├── model-00001-of-00002.safetensors
+   │ ├── model-00002-of-00002.safetensors
    │ ├── generation_config.json
    │ ├── LICENSE.txt
    │ ├── model.safetensors.index.json
@@ -313,8 +313,8 @@ For more details about each file, please check the End-to-End tutorial mentioned

    │ ├── adapter_model.pt
    │ ├── adapter_model.safetensors
    │ ├── config.json
-   │ ├── ft-model-00001-of-00002.safetensors
-   │ ├── ft-model-00002-of-00002.safetensors
+   │ ├── model-00001-of-00002.safetensors
+   │ ├── model-00002-of-00002.safetensors
    │ ├── generation_config.json
    │ ├── LICENSE.txt
    │ ├── model.safetensors.index.json
@@ -394,7 +394,7 @@ you'll need to **update** the following fields in your configs:

**resume_from_checkpoint**: Set it to True;

- **checkpoint_files**: change the path to ``epoch_{YOUR_EPOCH}/ft-model={}-of-{}.safetensors``;
+ **checkpoint_files**: change the path to ``epoch_{YOUR_EPOCH}/model-{}-of-{}.safetensors``;

Notice that we do **not** change our checkpoint_dir or output_dir. Since we are resuming from checkpoint, we know
to look for it in the output_dir.
@@ -405,8 +405,8 @@ to look for it in the output_dir.

    # checkpoint files. Note that you will need to update this
    # section of the config with the intermediate checkpoint files
    checkpoint_files: [
-       epoch_{YOUR_EPOCH}/ft-model-00001-of-00002.safetensors,
-       epoch_{YOUR_EPOCH}/ft-model-00001-of-00002.safetensors,
+       epoch_{YOUR_EPOCH}/model-00001-of-00002.safetensors,
+       epoch_{YOUR_EPOCH}/model-00001-of-00002.safetensors,
    ]

    # set to True if restarting training

docs/source/tutorials/e2e_flow.rst

Lines changed: 7 additions & 7 deletions
@@ -142,8 +142,8 @@ There are 3 types of folders:

    │ ├── adapter_model.pt
    │ ├── adapter_model.safetensors
    │ ├── config.json
-   │ ├── ft-model-00001-of-00002.safetensors
-   │ ├── ft-model-00002-of-00002.safetensors
+   │ ├── model-00001-of-00002.safetensors
+   │ ├── model-00002-of-00002.safetensors
    │ ├── generation_config.json
    │ ├── LICENSE.txt
    │ ├── model.safetensors.index.json
@@ -168,7 +168,7 @@ There are 3 types of folders:

Let's understand the files:

- ``adapter_model.safetensors`` and ``adapter_model.pt`` are your LoRA trained adapter weights. We save a duplicated .pt version of it to facilitate resuming from checkpoint.
- - ``ft-model-{}-of-{}.safetensors`` are your trained full model weights (not adapters). When LoRA finetuning, these are only present if we set ``save_adapter_weights_only=False``. In that case, we merge the merged base model with trained adapters, making inference easier.
+ - ``model-{}-of-{}.safetensors`` are your trained full model weights (not adapters). When LoRA finetuning, these are only present if we set ``save_adapter_weights_only=False``. In that case, we merge the base model with the trained adapters, making inference easier.
- ``adapter_config.json`` is used by Huggingface PEFT when loading an adapter (more on that later);
- ``model.safetensors.index.json`` is used by Hugging Face ``from_pretrained()`` when loading the model weights (more on that later)
- All other files were originally in the checkpoint_dir. They are automatically copied during training. Files over 100MiB and ending on .safetensors, .pth, .pt, .bin are ignored, making it lightweight.
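
For orientation only (not part of this commit): a minimal sketch of how the artifacts listed above are typically consumed downstream, assuming a hypothetical output folder such as ``/tmp/torchtune/llama3_2_lora/epoch_0`` and ``meta-llama/Llama-3.2-3B-Instruct`` as an example base model; substitute your own paths and model.

    from peft import PeftModel
    from transformers import AutoModelForCausalLM

    ckpt_dir = "/tmp/torchtune/llama3_2_lora/epoch_0"  # hypothetical output_dir/epoch_N folder

    # model.safetensors.index.json + model-*.safetensors are what from_pretrained() reads
    merged_model = AutoModelForCausalLM.from_pretrained(ckpt_dir)

    # adapter_config.json + adapter_model.safetensors are what Hugging Face PEFT reads,
    # loading the LoRA adapters on top of the original base model instead
    base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
    adapted_model = PeftModel.from_pretrained(base_model, ckpt_dir)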
@@ -223,8 +223,8 @@ Notice that we are using the merged weights, and not the LoRA adapters.

    _component_: torchtune.training.FullModelHFCheckpointer
    checkpoint_dir: ${output_dir}
    checkpoint_files: [
-       ft-model-00001-of-00002.safetensors,
-       ft-model-00002-of-00002.safetensors,
+       model-00001-of-00002.safetensors,
+       model-00002-of-00002.safetensors,
    ]
    output_dir: ${output_dir}
    model_type: LLAMA3_2
@@ -299,8 +299,8 @@ Let's modify ``custom_generation_config.yaml`` to include the following changes.

    _component_: torchtune.training.FullModelHFCheckpointer
    checkpoint_dir: ${checkpoint_dir}
    checkpoint_files: [
-       ft-model-00001-of-00002.safetensors,
-       ft-model-00002-of-00002.safetensors,
+       model-00001-of-00002.safetensors,
+       model-00002-of-00002.safetensors,
    ]
    output_dir: ${output_dir}
    model_type: LLAMA3_2

docs/source/tutorials/qat_finetune.rst

Lines changed: 85 additions & 46 deletions
@@ -64,32 +64,47 @@ Between these two steps, training can proceed exactly as before.

Applying QAT to Llama3 models
-----------------------------

- We can easily apply the above QAT transformations to Llama3 in torchtune for fine-tuning:
+ We can easily apply the above QAT transformations to Llama3 for fine-tuning,
+ leveraging the APIs in torchao as follows:

.. code-block:: python

-     from torchtune.training.quantization import Int8DynActInt4WeightQATQuantizer
+     import copy
+     import torch
+     from torchao.quantization import quantize_
+     from torchao.quantization.qat import (
+         FakeQuantizeConfig,
+         IntXQuantizationAwareTrainingConfig,
+     )
    from torchtune.models.llama3 import llama3_8b

    model = llama3_8b()
+     original_model = copy.deepcopy(model)
+
+     # Config for int8 dynamic asymmetric per token activations +
+     # int4 symmetric per group weights, only for linear layers
+     activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
+     weight_config = FakeQuantizeConfig(torch.int4, group_size=32)
+     qat_config = IntXQuantizationAwareTrainingConfig(activation_config, weight_config)

-     # Quantizer for int8 dynamic per token activations +
-     # int4 grouped per channel weights, only for linear layers
-     quantizer = Int8DynActInt4WeightQATQuantizer()
+     # Prepare the model for quantization-aware fine-tuning.
+     #
+     # This step inserts "fake quantize" ops that simulate
+     # quantization numerics during fine-tuning without
+     # actually casting the activations/weights to lower-bit
+     # dtypes like in "real" quantization.
+     quantize_(model, qat_config)

-     # Insert "fake quantize" operations into linear layers.
-     # These operations simulate quantization numerics during
-     # fine-tuning without performing any dtype casting
-     prepared_model = quantizer.prepare(model)
+     prepared_model = model

- If we print the model we’ll see that all linear layers have been swapped with
- :code:`Int8DynActInt4WeightQATLinear`, which simulates the numerics of int8
- dynamic per token activations + int4 grouped per channel weights. Now the model
- is ready for fine-tuning.
+ The model is now ready for QAT fine-tuning! If we print the model we’ll see that
+ all linear layers have been swapped with :code:`FakeQuantizedLinear`, which simulates
+ the numerics of int8 dynamic asymmetric per token activations + int4 symmetric
+ per group weights:

.. code-block:: bash

-     >>> print(model.layers[0].attn)
+     >>> original_model.layers[0].attn
    MultiHeadAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
@@ -98,37 +113,71 @@ is ready for fine-tuning.

      (pos_embeddings): RotaryPositionalEmbeddings()
    )

-     >>> print(prepared_model.layers[0].attn)
+ .. code-block:: bash
+
+     >>> prepared_model.layers[0].attn
    MultiHeadAttention(
-       (q_proj): Int8DynActInt4WeightQATLinear(in_features=4096, out_features=4096, bias=False)
-       (k_proj): Int8DynActInt4WeightQATLinear(in_features=4096, out_features=1024, bias=False)
-       (v_proj): Int8DynActInt4WeightQATLinear(in_features=4096, out_features=1024, bias=False)
-       (output_proj): Int8DynActInt4WeightQATLinear(in_features=4096, out_features=4096, bias=False)
+       (q_proj): FakeQuantizedLinear(
+         in_features=4096, out_features=4096, bias=False
+         (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))
+         (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))
+       )
+       (k_proj): FakeQuantizedLinear(
+         in_features=4096, out_features=1024, bias=False
+         (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))
+         (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))
+       )
+       (v_proj): FakeQuantizedLinear(
+         in_features=4096, out_features=1024, bias=False
+         (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))
+         (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))
+       )
+       (output_proj): FakeQuantizedLinear(
+         in_features=4096, out_features=4096, bias=False
+         (activation_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int8, granularity=PerToken(), mapping_type=<MappingType.ASYMMETRIC: 3>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))
+         (weight_fake_quantizer): FakeQuantizer(FakeQuantizeConfig(dtype=torch.int4, granularity=PerGroup(group_size=32), mapping_type=<MappingType.SYMMETRIC: 1>, scale_precision=torch.float32, zero_point_precision=torch.int32, zero_point_domain=<ZeroPointDomain.INT: 1>, is_dynamic=True, range_learning=False))
+       )
      (pos_embeddings): RotaryPositionalEmbeddings()
    )

- After fine-tuning, we can convert the model to get an actual quantized model.
- If we print the converted model, we’ll see that the QAT linears have been
- swapped with `Int8DynActInt4WeightLinear <https://github.com/pytorch/ao/blob/428084356ace4ea94c22a3a9b3d74cff8ee41db3/torchao/quantization/prototype/qat.py#L38>`_, which are the quantized versions
- of the linear layers. This quantized model can then be saved to checkpoint and
- used for inference or generation.
+ After fine-tuning, we can convert the model to get an actual quantized model:

.. code-block:: python

+     from torchao.quantization.qat import (
+         FromIntXQuantizationAwareTrainingConfig,
+     )
+     from torchao.quantization import (
+         Int8DynamicActivationInt4WeightConfig,
+     )
+
    # Fine-tune as before
    train_loop(prepared_model)

-     # Convert fake quantize to actual quantize operations
-     converted_model = quantizer.convert(prepared_model)
+     # Convert the fake quantized model into an actual quantized model
+     #
+     # First, we swap `FakeQuantizedLinear` back to `torch.nn.Linear`
+     # while keeping the QAT fine-tuned weights. Then, we perform standard
+     # post-training quantization (PTQ), which inserts quantized activation
+     # and weight tensor subclasses
+     quantize_(prepared_model, FromIntXQuantizationAwareTrainingConfig())
+     quantize_(prepared_model, Int8DynamicActivationInt4WeightConfig(group_size=32))
+
+     converted_model = prepared_model
+
+ The model is now fully quantized to int8 and int4 and ready for inference
+ or generation. If we print the model now, we will see the linear layers
+ are now swapped back to :code:`torch.nn.Linear`, but with quantized tensor
+ activations and weights:

.. code-block:: bash

-     >>> print(converted_model.layers[0].attn)
+     >>> converted_model.layers[0].attn
    MultiHeadAttention(
-       (q_proj): Int8DynActInt4WeightLinear()
-       (k_proj): Int8DynActInt4WeightLinear()
-       (v_proj): Int8DynActInt4WeightLinear()
-       (output_proj): Int8DynActInt4WeightLinear()
+       (q_proj): Linear(in_features=4096, out_features=4096, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))
+       (k_proj): Linear(in_features=4096, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))
+       (v_proj): Linear(in_features=4096, out_features=1024, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([1024, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))
+       (output_proj): Linear(in_features=4096, out_features=4096, weight=LinearActivationQuantizedTensor(activation=<function _int8_asymm_per_token_quant at 0x7f801ce08790>, weight=AffineQuantizedTensor(shape=torch.Size([4096, 4096]), block_size=(1, 32), device=cpu, _layout=PlainLayout(), tensor_impl_dtype=torch.int8, quant_min=-8, quant_max=7)))
      (pos_embeddings): RotaryPositionalEmbeddings()
    )
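
As an aside (not part of this commit), a quick way to sanity-check the converted model is to push a batch of token ids through it and persist the result. The sketch below assumes the llama3_8b vocabulary size of 128256, that the plain-layout int8/int4 kernels can run a forward pass on the current device, and that the quantized tensor subclasses round-trip through a regular ``state_dict``; the output path is hypothetical.

    import torch

    # A sketch, not part of the diff above: sanity-check and save `converted_model`.
    with torch.no_grad():
        tokens = torch.randint(0, 128_256, (1, 8))   # 128256 = llama3_8b vocab size
        logits = converted_model(tokens)
        print(logits.shape)                          # expected: torch.Size([1, 8, 128256])

    # Assumption: the quantized weights serialize through the regular state dict,
    # so a plain torch.save checkpoint is enough for later reuse.
    torch.save(converted_model.state_dict(), "/tmp/llama3_8b_qat_8da4w.pt")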
@@ -150,23 +199,21 @@ modifications accordingly:

.. code-block:: yaml

-     # Dataset
    dataset:
      _component_: torchtune.datasets.text_completion_dataset
      source: allenai/c4
-       max_seq_len: 8192
      column: text
      name: en
      split: train
-     seed: null
-     shuffle: True

    ...

    epochs: 1
    max_steps_per_epoch: 2000
    fake_quant_after_n_steps: 1000
-     memory_efficient_fsdp_wrap: False
+
+ By default, this uses the :code:`torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer`,
+ which uses the same fake quantization configurations as the example above.

Empirically, we observed that disabling fake quantization for the first N steps
led to better results, presumably because doing so allows the weights to stabilize
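
To make the delayed-fake-quantization idea behind ``fake_quant_after_n_steps`` concrete (this sketch is not part of the commit): the gist is to leave the fake quantizers inactive for the first N optimizer steps and only switch them on afterwards. ``toggle_fake_quant`` below is a hypothetical helper, not a torchao or torchtune API, and it assumes the ``FakeQuantizer`` modules shown earlier expose an ``enabled`` flag.

    # Hypothetical helper, not a torchao/torchtune API: walk the prepared model and
    # flip every fake quantizer on or off (assumes FakeQuantizer has an `enabled` flag).
    def toggle_fake_quant(model, enabled: bool) -> None:
        for module in model.modules():
            for attr in ("activation_fake_quantizer", "weight_fake_quantizer"):
                fake_quantizer = getattr(module, attr, None)
                if fake_quantizer is not None:
                    fake_quantizer.enabled = enabled

    toggle_fake_quant(prepared_model, enabled=False)   # during the first N optimizer steps
    # ... after `fake_quant_after_n_steps` steps of ordinary fine-tuning ...
    toggle_fake_quant(prepared_model, enabled=True)    # fake quantization kicks in from here on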
@@ -213,15 +260,13 @@ copy and make the following modifications to the quantization config:

.. code-block:: yaml

-     # Model arguments
    model:
      _component_: torchtune.models.llama3.llama3_8b

    checkpointer:
      _component_: torchtune.training.FullModelMetaCheckpointer
      checkpoint_dir: <your QAT checkpoint dir>
-       checkpoint_files: [meta_model_0.pt]
-       recipe_checkpoint: null
+       checkpoint_files: [ft-model-00001-of-00001.bin]
      output_dir: <your QAT checkpoint dir>
      model_type: LLAMA3

@@ -259,25 +304,19 @@ integrated in torchtune. First, copy the evaluation config and make the followin

.. code-block:: yaml

-     # Model arguments
    model:
      _component_: torchtune.models.llama3.llama3_8b

    checkpointer:
      _component_: torchtune.training.FullModelTorchTuneCheckpointer
      checkpoint_dir: <your quantized model checkpoint dir>
-       checkpoint_files: [meta_model_0-8da4w.pt]
-       recipe_checkpoint: null
+       checkpoint_files: [ft-model-00001-of-00001-8da4w.bin]
      output_dir: <your quantized model checkpoint dir>
      model_type: LLAMA3

    ...

-     # EleutherAI specific eval args
    tasks: ["hellaswag", "wikitext"]
-     limit: null
-     max_seq_length: 8192
-     batch_size: 8

    quantizer:
      _component_: torchtune.training.quantization.Int8DynActInt4WeightQuantizer
