diff --git a/main/_downloads/315c4c52fb68082a731b192d944e2ede/tutorials_python.zip b/main/_downloads/315c4c52fb68082a731b192d944e2ede/tutorials_python.zip
index 1c885790fc..b45ce5f6ef 100644
Binary files a/main/_downloads/315c4c52fb68082a731b192d944e2ede/tutorials_python.zip and b/main/_downloads/315c4c52fb68082a731b192d944e2ede/tutorials_python.zip differ
diff --git a/main/_downloads/a5659940aa3f8f568547d47752a43172/tutorials_jupyter.zip b/main/_downloads/a5659940aa3f8f568547d47752a43172/tutorials_jupyter.zip
index f250bb8a6b..8294ecadca 100644
Binary files a/main/_downloads/a5659940aa3f8f568547d47752a43172/tutorials_jupyter.zip and b/main/_downloads/a5659940aa3f8f568547d47752a43172/tutorials_jupyter.zip differ
diff --git a/main/_downloads/e148c8862a389bde3e2c2727c00d1f30/template_tutorial.zip b/main/_downloads/e148c8862a389bde3e2c2727c00d1f30/template_tutorial.zip
index ae5b2048f0..fda332ebf4 100644
Binary files a/main/_downloads/e148c8862a389bde3e2c2727c00d1f30/template_tutorial.zip and b/main/_downloads/e148c8862a389bde3e2c2727c00d1f30/template_tutorial.zip differ
diff --git a/main/_sources/tutorials/template_tutorial.rst.txt b/main/_sources/tutorials/template_tutorial.rst.txt
index 7bca92c867..6880e5f046 100644
--- a/main/_sources/tutorials/template_tutorial.rst.txt
+++ b/main/_sources/tutorials/template_tutorial.rst.txt
@@ -67,11 +67,11 @@ Example code (the output below is generated automatically):
.. code-block:: none
- tensor([[0.5060, 0.1671, 0.2317],
- [0.7330, 0.9476, 0.1239],
- [0.3325, 0.8057, 0.8212],
- [0.6673, 0.2430, 0.0813],
- [0.0879, 0.3014, 0.4889]])
+ tensor([[0.0870, 0.9183, 0.7696],
+ [0.3774, 0.1702, 0.2919],
+ [0.2416, 0.8915, 0.9341],
+ [0.7196, 0.4544, 0.8347],
+ [0.1172, 0.4801, 0.8118]])
diff --git a/main/searchindex.js b/main/searchindex.js
index 57f1f4131d..ca4be99395 100644
--- a/main/searchindex.js
+++ b/main/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["api_ref_dtypes", "api_ref_intro", "api_ref_kernel", "api_ref_quantization", "api_ref_sparsity", "contributor_guide", "dtypes", "generated/torchao.dtypes.AffineQuantizedTensor", "generated/torchao.dtypes.BlockSparseLayout", "generated/torchao.dtypes.CutlassInt4PackedLayout", "generated/torchao.dtypes.Float8Layout", "generated/torchao.dtypes.Int4CPULayout", "generated/torchao.dtypes.Layout", "generated/torchao.dtypes.MarlinQQQLayout", "generated/torchao.dtypes.MarlinQQQTensor", "generated/torchao.dtypes.MarlinSparseLayout", "generated/torchao.dtypes.NF4Tensor", "generated/torchao.dtypes.PlainLayout", "generated/torchao.dtypes.SemiSparseLayout", "generated/torchao.dtypes.TensorCoreTiledLayout", "generated/torchao.dtypes.UintxLayout", "generated/torchao.dtypes.to_affine_quantized_floatx", "generated/torchao.dtypes.to_affine_quantized_floatx_static", "generated/torchao.dtypes.to_affine_quantized_fpx", "generated/torchao.dtypes.to_affine_quantized_intx", "generated/torchao.dtypes.to_affine_quantized_intx_static", "generated/torchao.dtypes.to_marlinqqq_quantized_intx", "generated/torchao.dtypes.to_nf4", "generated/torchao.quantization.MappingType", "generated/torchao.quantization.TorchAODType", "generated/torchao.quantization.ZeroPointDomain", "generated/torchao.quantization.autoquant", "generated/torchao.quantization.choose_qparams_affine", "generated/torchao.quantization.choose_qparams_affine_floatx", "generated/torchao.quantization.choose_qparams_affine_with_min_max", "generated/torchao.quantization.choose_qparams_and_quantize_affine_hqq", "generated/torchao.quantization.dequantize_affine", "generated/torchao.quantization.dequantize_affine_floatx", "generated/torchao.quantization.fake_quantize_affine", "generated/torchao.quantization.fake_quantize_affine_cachemask", "generated/torchao.quantization.float8_dynamic_activation_float8_weight", "generated/torchao.quantization.float8_static_activation_float8_weight", "generated/torchao.quantization.float8_weight_only", "generated/torchao.quantization.fpx_weight_only", "generated/torchao.quantization.gemlite_uintx_weight_only", "generated/torchao.quantization.int4_weight_only", "generated/torchao.quantization.int8_dynamic_activation_int4_weight", "generated/torchao.quantization.int8_dynamic_activation_int8_weight", "generated/torchao.quantization.int8_weight_only", "generated/torchao.quantization.int_scaled_matmul", "generated/torchao.quantization.intx_quantization_aware_training", "generated/torchao.quantization.quantize_", "generated/torchao.quantization.quantize_affine", "generated/torchao.quantization.quantize_affine_floatx", "generated/torchao.quantization.safe_int_mm", "generated/torchao.quantization.smooth_fq_linear_to_inference", "generated/torchao.quantization.swap_linear_with_smooth_fq_linear", "generated/torchao.quantization.to_linear_activation_quantized", "generated/torchao.quantization.uintx_weight_only", "generated/torchao.sparsity.PerChannelNormObserver", "generated/torchao.sparsity.WandaSparsifier", "generated/torchao.sparsity.apply_fake_sparsity", "generated/torchao.sparsity.int8_dynamic_activation_int8_semi_sparse_weight", "generated/torchao.sparsity.semi_sparse_weight", "generated/torchao.sparsity.sparsify_", "index", "performant_kernels", "quantization", "quick_start", "serialization", "sg_execution_times", "sparsity", "subclass_advanced", "subclass_basic", "tutorials/index", "tutorials/sg_execution_times", "tutorials/template_tutorial"], "filenames": ["api_ref_dtypes.rst", "api_ref_intro.rst", "api_ref_kernel.rst", "api_ref_quantization.rst", "api_ref_sparsity.rst", "contributor_guide.rst", "dtypes.rst", "generated/torchao.dtypes.AffineQuantizedTensor.rst", "generated/torchao.dtypes.BlockSparseLayout.rst", "generated/torchao.dtypes.CutlassInt4PackedLayout.rst", "generated/torchao.dtypes.Float8Layout.rst", "generated/torchao.dtypes.Int4CPULayout.rst", "generated/torchao.dtypes.Layout.rst", "generated/torchao.dtypes.MarlinQQQLayout.rst", "generated/torchao.dtypes.MarlinQQQTensor.rst", "generated/torchao.dtypes.MarlinSparseLayout.rst", "generated/torchao.dtypes.NF4Tensor.rst", "generated/torchao.dtypes.PlainLayout.rst", "generated/torchao.dtypes.SemiSparseLayout.rst", "generated/torchao.dtypes.TensorCoreTiledLayout.rst", "generated/torchao.dtypes.UintxLayout.rst", "generated/torchao.dtypes.to_affine_quantized_floatx.rst", "generated/torchao.dtypes.to_affine_quantized_floatx_static.rst", "generated/torchao.dtypes.to_affine_quantized_fpx.rst", "generated/torchao.dtypes.to_affine_quantized_intx.rst", "generated/torchao.dtypes.to_affine_quantized_intx_static.rst", "generated/torchao.dtypes.to_marlinqqq_quantized_intx.rst", "generated/torchao.dtypes.to_nf4.rst", "generated/torchao.quantization.MappingType.rst", "generated/torchao.quantization.TorchAODType.rst", "generated/torchao.quantization.ZeroPointDomain.rst", "generated/torchao.quantization.autoquant.rst", "generated/torchao.quantization.choose_qparams_affine.rst", "generated/torchao.quantization.choose_qparams_affine_floatx.rst", "generated/torchao.quantization.choose_qparams_affine_with_min_max.rst", "generated/torchao.quantization.choose_qparams_and_quantize_affine_hqq.rst", "generated/torchao.quantization.dequantize_affine.rst", "generated/torchao.quantization.dequantize_affine_floatx.rst", "generated/torchao.quantization.fake_quantize_affine.rst", "generated/torchao.quantization.fake_quantize_affine_cachemask.rst", "generated/torchao.quantization.float8_dynamic_activation_float8_weight.rst", "generated/torchao.quantization.float8_static_activation_float8_weight.rst", "generated/torchao.quantization.float8_weight_only.rst", "generated/torchao.quantization.fpx_weight_only.rst", "generated/torchao.quantization.gemlite_uintx_weight_only.rst", "generated/torchao.quantization.int4_weight_only.rst", "generated/torchao.quantization.int8_dynamic_activation_int4_weight.rst", "generated/torchao.quantization.int8_dynamic_activation_int8_weight.rst", "generated/torchao.quantization.int8_weight_only.rst", "generated/torchao.quantization.int_scaled_matmul.rst", "generated/torchao.quantization.intx_quantization_aware_training.rst", "generated/torchao.quantization.quantize_.rst", "generated/torchao.quantization.quantize_affine.rst", "generated/torchao.quantization.quantize_affine_floatx.rst", "generated/torchao.quantization.safe_int_mm.rst", "generated/torchao.quantization.smooth_fq_linear_to_inference.rst", "generated/torchao.quantization.swap_linear_with_smooth_fq_linear.rst", "generated/torchao.quantization.to_linear_activation_quantized.rst", "generated/torchao.quantization.uintx_weight_only.rst", "generated/torchao.sparsity.PerChannelNormObserver.rst", "generated/torchao.sparsity.WandaSparsifier.rst", "generated/torchao.sparsity.apply_fake_sparsity.rst", "generated/torchao.sparsity.int8_dynamic_activation_int8_semi_sparse_weight.rst", "generated/torchao.sparsity.semi_sparse_weight.rst", "generated/torchao.sparsity.sparsify_.rst", "index.rst", "performant_kernels.rst", "quantization.rst", "quick_start.rst", "serialization.rst", "sg_execution_times.rst", "sparsity.rst", "subclass_advanced.rst", "subclass_basic.rst", "tutorials/index.rst", "tutorials/sg_execution_times.rst", "tutorials/template_tutorial.rst"], "titles": ["torchao.dtypes", "torchao
API Reference", "torchao.kernel", "torchao.quantization", "torchao.sparsity", "Contributor Guide", "Dtypes", "AffineQuantizedTensor", "BlockSparseLayout", "CutlassInt4PackedLayout", "Float8Layout", "Int4CPULayout", "Layout", "MarlinQQQLayout", "MarlinQQQTensor", "MarlinSparseLayout", "NF4Tensor", "PlainLayout", "SemiSparseLayout", "TensorCoreTiledLayout", "UintxLayout", "to_affine_quantized_floatx", "to_affine_quantized_floatx_static", "to_affine_quantized_fpx", "to_affine_quantized_intx", "to_affine_quantized_intx_static", "to_marlinqqq_quantized_intx", "to_nf4", "MappingType", "TorchAODType", "ZeroPointDomain", "autoquant", "choose_qparams_affine", "choose_qparams_affine_floatx", "choose_qparams_affine_with_min_max", "choose_qparams_and_quantize_affine_hqq", "dequantize_affine", "dequantize_affine_floatx", "fake_quantize_affine", "fake_quantize_affine_cachemask", "float8_dynamic_activation_float8_weight", "float8_static_activation_float8_weight", "float8_weight_only", "fpx_weight_only", "gemlite_uintx_weight_only", "int4_weight_only", "int8_dynamic_activation_int4_weight", "int8_dynamic_activation_int8_weight", "int8_weight_only", "int_scaled_matmul", "intx_quantization_aware_training", "quantize", "quantize_affine", "quantize_affine_floatx", "safe_int_mm", "smooth_fq_linear_to_inference", "swap_linear_with_smooth_fq_linear", "to_linear_activation_quantized", "uintx_weight_only", "PerChannelNormObserver", "WandaSparsifier", "apply_fake_sparsity", "int8_dynamic_activation_int8_semi_sparse_weight", "semi_sparse_weight", "sparsify", "Welcome to the torchao Documentation", "Performant Kernels", "Quantization Overview", "Quick Start Guide", "Serialization", "Computation times", "Sparsity Overview", "Writing Your Own Quantized Tensor (advanced)", "Writing Your Own Quantized Tensor", "<no title>", "Computation times", "Template Tutorial"], "terms": {"thi": [1, 5, 7, 15, 16, 17, 18, 20, 28, 31, 32, 34, 36, 38, 39, 43, 44, 45, 46, 51, 52, 59, 60, 61, 64, 67, 68, 69, 71, 73, 76], "section": [1, 5, 67, 71], "introduc": 1, "dive": 1, "detail": [1, 5, 31, 43, 67, 68, 71, 73], "how": [1, 5, 7, 12, 20, 28, 32, 45, 52, 68, 69, 71, 73], "integr": [1, 5, 69, 71, 73], "pytorch": [1, 5, 7, 11, 14, 29, 65, 68, 71, 73, 76], "optim": [1, 5, 15, 31, 35, 51, 65, 71, 73], "your": [1, 5, 51, 65, 67, 68, 71], "machin": 1, "learn": [1, 45, 68, 71, 76], "model": [1, 31, 44, 46, 50, 51, 55, 56, 60, 61, 64, 68, 71, 73], "dtype": [1, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 51, 52, 58, 64, 65, 68, 69, 73], "quantiz": [1, 5, 7, 9, 10, 11, 13, 14, 15, 16, 18, 19, 21, 24, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 62, 64, 65, 69, 71], "sparsiti": [1, 15, 18, 59, 60, 61, 62, 63, 64, 65, 67, 69], "tba": [2, 6, 66], "For": [5, 32, 43, 67, 68, 69, 71, 73], "new": [5, 7, 51, 67, 73], "case": [5, 31, 54, 71, 73], "exampl": [5, 7, 28, 31, 32, 50, 51, 60, 64, 67, 69, 70, 71, 73, 74, 75, 76], "train": [5, 38, 39, 65, 71, 73], "like": [5, 12, 31, 32, 67, 68, 69, 71, 73], "fp4": 5, "s": [5, 7, 28, 31, 32, 36, 38, 52, 53, 67, 68, 71, 73], "fine": [5, 44, 45, 46, 58, 71], "start": [5, 28, 29, 30, 31, 67, 71, 73], "prototyp": [5, 67], "folder": 5, "you": [5, 51, 60, 67, 68, 69, 71, 73, 76], "could": [5, 67, 73], "also": [5, 31, 51, 67, 68, 69, 71, 73], "take": [5, 16, 51, 59, 64, 67, 71], "look": [5, 7, 67, 71], "affinequantizedtensor": [5, 14, 22, 23, 25, 67, 68, 69, 73], "what": [5, 7, 14, 31, 67, 68, 71, 76], "want": [5, 51, 64, 67, 69, 71, 73], "do": [5, 29, 31, 32, 49, 51, 67, 71, 73], "mostli": [5, 34], "e": [5, 7, 28, 31, 32, 36, 38, 43, 51, 52, 53, 64, 67, 69, 73], "g": [5, 7, 28, 31, 32, 36, 38, 43, 51, 52, 64, 67, 69, 73], "int3": 5, "exact": 5, "same": [5, 7, 32, 34, 36, 38, 39, 40, 52, 54, 64, 67, 71, 73], "affin": [5, 7, 9, 10, 11, 15, 18, 19, 24, 36, 38, 51, 52, 64, 67], "pleas": [5, 7, 14, 43, 45, 65, 67, 71, 73], "feel": [5, 67, 71, 73], "free": [5, 67, 73], "open": [5, 67, 71], "an": [5, 7, 19, 24, 25, 31, 39, 50, 60, 65, 67, 71, 73], "issu": [5, 67, 68, 73], "have": [5, 28, 31, 32, 44, 45, 52, 60, 67, 71, 73], "question": [5, 67, 69, 71, 73], "specif": [5, 12, 15, 17, 18, 60, 67, 68, 69, 71], "more": [5, 31, 39, 43, 44, 45, 46, 58, 67, 68, 71, 73], "refer": [5, 71, 73], "our": [5, 16, 68, 71, 73], "overview": [5, 65, 68], "page": [5, 68], "To": [5, 7, 14, 31, 67, 68, 69, 71], "contribut": [5, 68, 71], "exist": [5, 29, 67, 71, 73], "code": [5, 45, 67, 68, 71, 73, 74, 76], "base": [5, 12, 17, 28, 60, 67, 68, 71, 73], "make": [5, 32, 67, 73], "trainabl": [5, 67, 73], "add": [5, 17, 51, 73, 76], "parallel": [5, 73], "etc": [5, 67], "affine_quantized_tensor": [5, 69], "py": [5, 7, 14, 70, 75, 76], "api": [5, 31, 67, 68, 71, 73], "quant_api": [5, 51, 69], "primit": [5, 7, 14, 73], "op": [5, 7, 14, 31, 32, 38, 39, 45, 51, 71, 73], "slight": [5, 71], "variat": [5, 67], "quant_primit": [5, 7, 14], "autotun": [5, 68], "cpu": [5, 7, 11, 69, 71], "cuda": [5, 7, 35, 51, 68, 69, 71, 73], "mp": 5, "csrc": 5, "mayb": 5, "well": [5, 12, 31, 67, 71], "spars": [5, 8, 15, 18, 60, 67, 71], "marlin": [5, 13, 14, 15, 26], "aqt": 5, "621": 5, "we": [5, 7, 16, 28, 31, 32, 34, 36, 38, 50, 51, 52, 58, 64, 67, 68, 69, 71], "ar": [5, 7, 10, 18, 20, 31, 32, 36, 38, 40, 43, 45, 50, 51, 52, 54, 60, 67, 68, 69, 71], "still": [5, 67, 71], "decid": [5, 67, 71], "split": 5, "can": [5, 19, 28, 31, 40, 44, 51, 52, 67, 68, 69, 71, 73], "implement": [5, 69, 71], "regist": [5, 59, 73], "mai": [5, 34, 67, 69], "need": [5, 32, 40, 59, 60, 67, 68, 69, 71, 73], "defin": [5, 12, 20, 43, 59, 60, 71, 73], "own": [5, 51, 65, 71], "through": [5, 34, 67, 68, 73, 76], "int4": [5, 9, 11, 28, 46, 50, 51, 64, 68, 69], "access": 5, "my_custom_op": 5, "devic": [5, 7, 35, 51, 54, 68, 69, 73], "check": [5, 7, 14, 67, 68, 69, 73], "condit": [5, 67], "__torch_function__": [5, 67, 73], "__torch_dispatch__": [5, 73], "target": [5, 32, 40, 41, 42, 45, 60, 71], "oper": [5, 7, 10, 12, 15, 34], "bfloat16": [5, 16, 38, 51, 52, 67, 68, 69, 71], "activ": [5, 31, 40, 41, 46, 47, 50, 55, 60, 62, 65, 71], "uint4": [5, 45, 51, 67, 68], "weight": [5, 15, 16, 31, 40, 41, 42, 44, 45, 46, 47, 48, 51, 58, 60, 62, 63, 64, 65, 68, 69, 71, 73], "found": [5, 67, 68, 71, 73], "here": [5, 7, 52, 67, 69, 73], "allow": [5, 71, 73], "peopl": [5, 67, 69], "linear": [5, 15, 31, 32, 40, 42, 45, 46, 47, 48, 50, 51, 56, 58, 61, 62, 63, 64, 67, 68, 69, 71, 73], "two": [5, 14, 18, 40, 67, 71, 73], "dispatch_condit": [5, 67], "impl": [5, 7, 67], "actual": [5, 42, 67, 73], "bia": [5, 67, 68, 69, 73], "run": [5, 31, 51, 55, 59, 64, 67, 71, 73, 76], "both": [5, 40, 67, 71, 73], "input_tensor": [5, 16, 67], "weight_tensor": [5, 67], "argument": [5, 7, 19, 31, 36, 51, 67], "register_aqt_quantized_linear_dispatch": 5, "show": [5, 52, 67, 71], "work": [5, 18, 44, 69, 71, 73], "sometim": [5, 71], "ha": [5, 7, 67, 71, 73], "pack": [5, 7, 9, 19, 20, 43, 44, 58, 67], "order": [5, 31, 67, 71, 73], "yield": [5, 71], "And": [5, 16, 40, 67, 73], "abstract": [5, 67], "see": [5, 7, 14, 43, 67, 68, 69, 71, 73], "full": [5, 76], "after": [5, 31, 67, 69, 71], "wrap": [5, 31, 73], "factori": 5, "convert": [5, 7, 14, 16, 21, 24, 26, 27, 51, 53, 63, 64, 67, 71], "from": [5, 7, 16, 17, 22, 23, 25, 32, 34, 36, 38, 43, 46, 50, 51, 52, 64, 67, 68, 69, 70, 71, 73, 75, 76], "float": [5, 7, 14, 16, 24, 26, 27, 28, 30, 31, 32, 34, 35, 36, 38, 39, 43, 45, 51, 52, 53, 56, 60, 64, 67, 69, 73], "point": [5, 7, 14, 26, 28, 30, 32, 36, 38, 43, 45, 52, 53, 64, 67, 68, 69, 71, 73], "my": [5, 71], "to_my_dtyp": 5, "mydtypetensor": 5, "from_float": [5, 73], "level": [5, 60, 67, 71, 73], "reus": [5, 67, 73], "quantize_": [5, 50, 51, 64, 67, 68, 69], "appli": [5, 31, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51, 58, 62, 67, 68, 71], "convers": [5, 7, 51, 67], "filter": [5, 31], "choos": [5, 67, 71, 73], "which": [5, 14, 20, 31, 67, 68, 69, 71], "modul": [5, 28, 29, 30, 31, 50, 51, 55, 56, 59, 60, 64, 68, 69], "should": [5, 7, 32, 36, 38, 44, 52, 59, 60, 67, 71], "algorithm": [5, 45, 58, 71], "onli": [5, 11, 40, 42, 44, 45, 46, 48, 51, 58, 64, 68, 69, 71, 73], "dynam": [5, 40, 46, 47, 64, 73], "quant": [5, 7, 14, 43, 67], "static": [5, 7, 12, 16, 22, 25, 34, 41], "type": [5, 7, 15, 16, 20, 28, 29, 30, 31, 35, 40, 41, 42, 45, 46, 49, 52, 54, 65, 67, 69, 71, 73], "note": [5, 39, 50, 60, 67, 68, 71, 73], "2": [5, 7, 11, 15, 18, 28, 31, 32, 39, 45, 51, 52, 61, 62, 63, 64, 67, 68, 71, 73, 76], "4": [5, 15, 18, 27, 32, 35, 44, 61, 62, 63, 64, 67, 68, 69, 71, 73], "below": [5, 67, 71, 73, 76], "follow": [5, 45, 67, 68, 71, 73], "util": [5, 44, 67, 68, 69, 73], "import": [5, 32, 50, 51, 64, 68, 69, 71, 73, 76], "unwrap_tensor_subclass": [5, 68], "m_unwrap": 5, "m": [5, 51, 53, 64, 68, 69, 73], "In": [5, 67, 68, 71, 73], "compat": [5, 15, 68], "aim": [5, 67, 71], "fullgraph": [5, 68], "true": [5, 7, 24, 31, 32, 34, 35, 50, 51, 55, 64, 68, 69, 73], "first": [5, 16, 31, 49, 60, 67, 73], "remov": [5, 60, 71], "ani": [5, 17, 31, 57, 60, 67, 71, 73], "unnecessari": 5, "graph": 5, "break": 5, "torch_log": 5, "output_cod": 5, "when": [5, 7, 17, 32, 36, 38, 52, 67, 71], "script": [5, 68, 73, 76], "inductor": [5, 31, 51], "python": [5, 67, 71, 74, 76], "mode": [5, 31, 45, 68], "max": [5, 28, 67, 68, 73], "checkout": [5, 7, 14, 65, 67], "doc": [5, 67, 73], "huggingfac": 5, "transform": [5, 7, 67], "deseri": [5, 67], "save_pretrain": 5, "push_to_hub": 5, "from_pretrain": 5, "http": [5, 7, 14, 31, 43, 60, 68, 71], "co": 5, "main": [5, 7, 14, 45, 67, 68, 71, 73], "en": [5, 31], "anoth": [5, 67, 71, 73], "diffus": 5, "github": [5, 7, 14, 43, 68], "com": [5, 7, 14, 43], "sayakpaul": 5, "blob": [5, 7, 14], "infer": [5, 7, 55, 65, 67, 68, 69, 71, 73], "serialization_and_load": 5, "md": 5, "The": [5, 7, 8, 12, 15, 20, 31, 40, 41, 42, 43, 49, 51, 54, 55, 56, 60, 67, 68, 69, 71, 73], "abov": [5, 28, 67, 69, 71, 73], "just": [5, 28, 67, 69, 71, 73], "talk": [5, 67], "about": [5, 45, 67, 68, 69, 71], "basic": [5, 17, 68, 73], "provid": [5, 12, 15, 18, 19, 31, 32, 67, 71, 73], "fsdp": [5, 67], "ll": [5, 28, 32, 67, 73], "put": [5, 64], "developer_api_guid": 5, "cover": [5, 67, 76], "executorch": [5, 46, 51], "torchchat": 5, "todo": [5, 67], "qat": [5, 38, 39, 50], "suit": 5, "out": [5, 18, 28, 31, 60, 67, 68, 71, 73], "differ": [5, 12, 34, 45, 52, 54, 67, 68, 69, 71, 73], "system": 5, "dtensor": [5, 73], "recommend": [5, 31, 51], "copi": [5, 7, 60, 68, 69, 71, 73], "past": [5, 71], "adapt": 5, "now": [5, 43, 46, 67, 68, 71, 73], "befor": [5, 51, 67, 69, 71, 73], "some": [5, 31, 51, 60, 67, 71, 73], "singl": [5, 31, 34, 40, 68, 71], "comput": [5, 15, 19, 42, 59, 60, 71, 73], "intens": 5, "memori": [5, 7, 39, 68, 71, 73], "input": [5, 7, 15, 16, 18, 31, 32, 34, 36, 38, 39, 49, 51, 52, 54, 60, 64, 67, 73], "dimens": [5, 7, 20, 32, 36, 38, 49, 52, 58, 73], "get": [5, 16, 67, 71], "sens": [5, 67, 73], "speedup": [5, 45, 67, 68, 71], "d": [5, 67], "creat": [5, 7, 22, 23, 25, 67, 71, 73], "file": [5, 70, 73, 75], "benchmark_aq": 5, "shape": [5, 7, 14, 31, 49, 54, 68, 73], "A": [5, 7, 20, 31, 34, 39, 59, 71, 73], "quick": [5, 65], "wai": [5, 7, 31, 67, 71, 73], "relev": [5, 45, 67, 76], "chang": [5, 51, 67, 68, 69, 71, 73], "interest": [5, 67, 71, 73], "tutori": [5, 67, 70, 71, 73, 74, 75], "print_op_and_shap": 5, "output": [5, 31, 32, 36, 38, 52, 67, 71, 76], "torch_func": 5, "built": [5, 73], "k": [5, 54, 68, 69, 73], "n": [5, 68, 69, 73], "10": [5, 28, 52], "method": [5, 12, 15, 18, 19, 31, 51, 60, 71, 73], "_c": 5, "tensorbas": 5, "object": [5, 20, 73], "arg": [5, 7, 60, 73], "0": [5, 7, 31, 32, 51, 52, 56, 60, 68, 69, 70, 71, 73, 75, 76], "size": [5, 7, 8, 14, 16, 32, 36, 38, 44, 45, 46, 52, 58, 68, 69, 71, 73], "all": [5, 28, 31, 34, 59, 60, 61, 67, 68, 69, 70, 71, 73, 74], "under": [5, 51], "benchmark_your_kernel": 5, "helper": 5, "right": [5, 67, 71], "1": [5, 15, 20, 28, 29, 30, 31, 32, 35, 45, 51, 52, 58, 60, 67, 68, 69, 70, 71, 73, 75, 76], "either": [5, 7, 32, 36, 38, 40, 52, 60, 71], "one": [5, 31, 34, 40, 59, 67, 71, 73], "probabl": 5, "keep": [5, 15, 60], "futur": [5, 43], "llama": 5, "llama2": 5, "llama3": 5, "sam": 5, "alreadi": [5, 7, 31, 73], "modifi": [5, 51, 60, 67, 71, 73], "friendli": [5, 67], "compar": [5, 39, 45, 60, 67], "techniqu": [5, 69, 71, 73], "repres": [5, 7, 8, 10, 12, 23, 52, 60, 67, 69, 73], "bound": [5, 71], "help": [5, 67], "option": [5, 7, 10, 14, 21, 24, 25, 26, 31, 32, 34, 36, 38, 39, 40, 41, 44, 50, 51, 52, 55, 56, 57, 60, 64, 68], "each": [5, 16, 31, 55, 59, 67, 71, 73], "understand": 5, "profil": 5, "profile_path": 5, "chrome": 5, "trace": [5, 67], "let": [5, 28, 44, 52, 67, 68, 71, 73], "know": [5, 31, 73], "class": [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 28, 29, 30, 31, 59, 60, 67, 68, 69, 73], "torchao": [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 68, 69, 71, 73], "tensor_impl": [7, 14, 67], "aqttensorimpl": [7, 14], "block_siz": [7, 12, 14, 16, 21, 22, 24, 25, 26, 27, 32, 34, 36, 38, 39, 52, 68], "tupl": [7, 14, 16, 21, 22, 24, 25, 26, 32, 34, 35, 36, 38, 39, 40, 41, 52, 60, 73], "int": [7, 8, 14, 16, 19, 20, 21, 22, 24, 25, 26, 27, 32, 33, 34, 35, 36, 37, 38, 39, 43, 44, 45, 51, 52, 53, 60, 68, 73], "quant_min": [7, 14, 24, 25, 26, 28, 32, 34, 36, 38, 39, 52, 67, 68, 73], "union": [7, 14, 32, 36, 38, 39, 40, 41, 51, 52], "none": [7, 10, 14, 21, 24, 25, 26, 28, 29, 30, 31, 32, 34, 36, 38, 39, 40, 41, 44, 45, 48, 50, 51, 52, 55, 56, 57, 60, 64, 73], "quant_max": [7, 14, 24, 25, 26, 28, 32, 34, 36, 38, 39, 52, 67, 68, 73], "zero_point_domain": [7, 14, 24, 25, 26, 32, 34, 36, 38, 39, 45, 51, 52], "zeropointdomain": [7, 14, 24, 25, 26, 32, 34, 36, 38, 39, 45, 52], "stride": [7, 14, 67, 73], "sourc": [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 74, 76], "tensor": [7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 45, 46, 49, 51, 52, 53, 54, 57, 60, 64, 65, 68, 69, 71, 76], "subclass": [7, 14, 31, 51, 59, 64, 68, 69, 71], "mean": [7, 16, 28, 32, 36, 38, 52, 53, 67, 68, 71], "quantized_tensor": 7, "float_tensor": [7, 73], "scale": [7, 12, 15, 22, 25, 28, 30, 32, 34, 36, 37, 38, 39, 41, 49, 52, 53, 55, 56, 67, 71, 73], "zero_point": [7, 12, 25, 30, 32, 34, 36, 38, 39, 45, 52, 67, 71, 73], "happen": [7, 14, 31, 67, 73], "dure": [7, 14, 31, 32, 36, 38, 52, 56, 71, 73], "choose_qparam": [7, 67], "dequant": [7, 14, 16, 36, 45, 67, 73], "ao": [7, 14, 71], "three": [7, 31, 60, 64, 67], "choose_qparams_affin": [7, 34, 45, 67], "quantize_affin": [7, 38, 39, 45, 67], "qand": 7, "dequantize_affin": [7, 38, 39, 45], "extern": 7, "regardless": 7, "intern": [7, 19], "represent": [7, 12, 23, 32, 45, 67, 71], "orient": 7, "field": 7, "serv": [7, 12, 73], "gener": [7, 38, 39, 67, 68, 71, 73, 74, 76], "storag": [7, 15, 67, 71], "data": [7, 8, 12, 15, 20, 34, 40, 41, 42, 45, 65, 67, 69, 71, 73], "store": [7, 15, 16, 20, 59, 67, 71], "plain": 7, "int_data": [7, 73], "format": [7, 15, 16, 43, 44, 53, 67, 71], "depend": [7, 31, 44, 69, 71, 73], "kernel": [7, 9, 11, 15, 19, 43, 44, 45, 51, 68, 71], "granular": [7, 32, 36, 38, 40, 41, 44, 45, 46, 52, 58, 67], "element": [7, 18, 20, 31, 32, 36, 38, 52, 71], "share": [7, 32, 36, 38, 52, 71], "qparam": [7, 32, 36, 38, 52], "us": [7, 10, 11, 12, 15, 16, 17, 20, 22, 25, 28, 31, 32, 34, 36, 38, 40, 41, 45, 46, 50, 51, 52, 58, 60, 65, 67, 68, 69, 71, 73], "per": [7, 32, 36, 38, 42, 45, 46, 47, 48, 52, 58, 60, 62, 67, 68, 71], "torch": [7, 15, 16, 20, 31, 32, 35, 36, 37, 38, 40, 41, 42, 45, 49, 50, 51, 52, 54, 55, 56, 58, 64, 67, 68, 69, 71, 73, 76], "origin": [7, 16, 38, 42, 52, 60, 67, 68, 69, 71], "high": [7, 21, 22, 23, 24, 25, 53, 67, 71, 73], "precis": [7, 21, 22, 23, 24, 25, 42, 53, 67, 73], "minimum": [7, 31, 32, 36, 38, 52], "valu": [7, 16, 28, 29, 30, 31, 32, 36, 38, 39, 45, 52, 55, 60, 67, 71, 73], "specifi": [7, 38, 44, 51, 52, 58, 60, 71], "deriv": [7, 34, 38, 52], "maximum": [7, 32, 36, 38, 52, 55], "domain": [7, 30, 32, 36, 38, 45, 52], "integ": [7, 24, 25, 28, 30, 32, 36, 38, 44, 45, 49, 52, 54], "zero": [7, 18, 32, 36, 38, 45, 52, 60, 71], "ad": [7, 32, 36, 38, 52, 60, 71, 73], "subtract": [7, 16, 32, 36, 38, 52], "unquant": [7, 32, 36, 38, 52], "default": [7, 8, 10, 17, 19, 20, 31, 32, 36, 38, 40, 41, 42, 45, 51, 52, 55, 56, 58, 73], "float32": [7, 36, 37, 38, 52, 53, 69, 71, 73], "given": [7, 14, 27, 32, 71], "return": [7, 14, 15, 16, 31, 39, 49, 50, 51, 54, 55, 56, 64, 67, 68, 69, 73], "classmethod": [7, 14, 73], "from_hp_to_floatx": 7, "input_float": [7, 14, 21, 22, 23, 24, 25, 26, 57], "target_dtyp": [7, 21, 22, 24, 25, 32, 34, 67], "_layout": [7, 14, 21, 22, 23, 24, 25, 26, 67, 68], "layout": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 45, 46, 47, 64, 71], "scale_dtyp": [7, 21, 24, 32, 34], "float8": [7, 10, 21, 22, 40, 41, 42, 67], "from_hp_to_floatx_stat": 7, "paramet": [7, 12, 15, 16, 22, 25, 28, 31, 32, 36, 38, 40, 41, 42, 44, 45, 46, 49, 51, 52, 54, 55, 56, 58, 60, 64, 67, 69, 71, 73], "from_hp_to_fpx": 7, "floatx": [7, 23, 67], "ebit": [7, 23, 33, 37, 43, 53], "mbit": [7, 23, 33, 37, 43, 53], "support": [7, 23, 40, 46, 50, 64, 68, 69, 71, 73], "float1": [7, 23], "float7": [7, 23], "from_hp_to_intx": [7, 14], "mapping_typ": [7, 24, 32, 34, 46], "mappingtyp": [7, 24, 32, 34, 46, 47], "ep": [7, 24, 32, 34], "zero_point_dtyp": [7, 24, 32, 34, 51], "preserve_zero": [7, 24, 32, 34, 45, 51], "bool": [7, 24, 31, 32, 34, 35, 44, 51, 55, 64], "plainlayout": [7, 24, 25, 46, 47], "use_hqq": [7, 24, 45, 58], "fals": [7, 24, 31, 35, 45, 47, 50, 51, 55, 58, 60, 67, 68, 69, 73], "from_hp_to_intx_stat": 7, "kwarg": [7, 59, 60, 61, 73], "perform": [7, 19, 31, 44, 49, 54, 55, 59, 68, 71, 73], "self": [7, 67, 68, 69, 73], "If": [7, 10, 31, 32, 40, 49, 50, 54, 55, 60, 67, 68, 71, 73], "correct": [7, 15], "otherwis": [7, 67], "desir": [7, 31, 38], "call": [7, 31, 38, 39, 51, 59, 67, 68, 69, 71, 73], "non_block": 7, "memory_format": 7, "preserve_format": 7, "tri": [7, 71], "asynchron": 7, "respect": [7, 71], "host": 7, "possibl": [7, 71], "pin": 7, "set": [7, 10, 31, 34, 44, 51, 55, 60, 68, 71], "even": [7, 71], "match": [7, 36, 49, 71], "other": [7, 12, 60, 69, 71, 73, 76], "randn": [7, 68, 69, 73], "initi": [7, 67, 69], "float64": 7, "5044": 7, "0005": 7, "3310": 7, "0584": 7, "cuda0": 7, "blocksiz": 8, "64": [8, 27, 35, 44, 45, 58, 69, 73], "block": [8, 16, 60, 71], "matrix": [8, 10, 40, 41, 49, 54, 60, 68, 71], "variabl": [8, 10, 19, 20, 60, 71], "cutlass": 9, "mm_config": [10, 40, 41], "float8mmconfig": [10, 40, 41], "configur": [10, 40, 41, 67, 68], "multipl": [10, 31, 40, 41, 49, 54, 68, 71, 73], "involv": [10, 71], "tinygemm": [11, 45, 51, 67, 68], "_weight_int4pack_mm_for_cpu": [11, 45], "version": [11, 68, 73], "least": 11, "6": [11, 51, 67, 68, 71], "It": [12, 15, 17, 19, 71, 73], "pre": [12, 15, 19, 68, 71], "process": [12, 15, 17, 19, 20, 31, 56, 67, 71, 76], "post": [12, 19, 73], "addit": [12, 17, 31, 39, 71, 73], "design": [12, 15, 18], "extend": [12, 67, 71], "behavior": 12, "conjunct": 12, "tensorimpl": 12, "custom": [12, 59, 65, 67, 68, 71, 73], "interact": [12, 67], "qqq": [13, 14, 26], "marlinqqq": 14, "inherit": [14, 17, 73], "choose_qparams_and_quantize_affine_qqq": 14, "dequantize_affine_qqq": 14, "handl": [15, 18, 19, 31, 67], "pattern": [15, 18, 67], "ensur": 15, "preprocess": [15, 18], "manag": 15, "pre_process": 15, "1\u00ba": 15, "transpos": [15, 67, 73], "sinc": [15, 59, 67, 69, 71, 73], "layer": [15, 31, 40, 42, 45, 47, 48, 55, 56, 58, 60, 61, 62, 71, 73], "2\u00ba": 15, "inject": 15, "3\u00ba": 15, "again": [15, 16, 71], "becaus": [15, 67, 69, 71, 73], "dim": [15, 73], "tensor_meta": 16, "subclasstensorarg": 16, "n_block": 16, "scaler_block_s": [16, 27], "quantized_scal": 16, "quantization_factor": 16, "scaler_mean": 16, "quantized_data": 16, "nf4": 16, "qlora": 16, "convert_to_norm_float_weight": 16, "normal": [16, 27, 31, 71], "dequantize_scal": 16, "unpack": [16, 53, 67], "doubl": 16, "scaler": 16, "int8": [16, 46, 47, 48, 50, 51, 62, 64, 67, 73], "per_scaler_block": 16, "factor": [16, 49, 56, 71], "inpt_weight": 16, "double_quantize_scal": 16, "achiev": [16, 71, 73], "calcul": [16, 28, 32, 34, 55, 67, 71], "absmax": 16, "find": [16, 71], "posit": 16, "typic": [16, 17, 32, 67, 69], "per_block": 16, "int16": 16, "n_scaler_block": 16, "get_original_weight": 16, "quantize_tensor_nearest": 16, "float16": [16, 35, 38, 52, 71], "nearest": 16, "round": [16, 28, 32, 73], "up": [16, 51, 67, 68, 71], "most": [17, 67, 71], "doe": [17, 45, 67, 71, 73], "metadata": [17, 67, 73], "step": [17, 31, 67, 71], "requir": [17, 19, 32, 67, 71, 73], "semi": [18, 63, 64, 71], "structur": [18, 63, 64, 68, 69, 71, 73], "matric": [18, 71], "where": [18, 28, 34, 53, 58, 67, 71], "everi": [18, 59, 71, 73], "four": 18, "prune": [18, 60], "conform": 18, "inner_k_til": [19, 45, 68], "8": [19, 20, 28, 32, 44, 45, 67, 68], "core": [19, 29, 67], "tile": [19, 67], "fit": [19, 67, 69], "effici": [19, 68, 71], "function": [19, 31, 35, 50, 51, 59, 60, 61, 64, 68, 69, 71, 73], "affect": [19, 71], "matmul": [19, 42, 67, 71, 73], "pack_dim": [20, 58], "uintx": [20, 58, 67], "smaller": [20, 44, 45, 46, 58, 68, 69], "bit": [20, 27, 43, 44, 53, 58, 73], "width": [20, 44], "than": [20, 67, 71, 73], "standard": [20, 67], "byte": [20, 43, 58], "uintxtensor": 20, "determin": [20, 32, 38, 44, 45, 71], "along": [20, 71], "indic": [20, 30, 32, 71], "last": 20, "256": [27, 45], "name": [28, 29, 30, 51, 56, 60, 64, 71, 73], "qualnam": [28, 29, 30], "boundari": [28, 29, 30], "number": [28, 31, 53, 58, 60, 71, 73], "map": [28, 32, 67, 73], "symmetr": [28, 32, 40, 41, 42, 44, 46, 47, 48, 62, 73], "rang": [28, 71], "sai": [28, 52, 67], "3": [28, 31, 32, 52, 67, 68, 71, 76], "5": [28, 56, 60, 68, 71, 76], "7": [28, 32], "symmetric_no_clipping_err": 28, "variant": [28, 34, 73], "smin": 28, "smax": 28, "min_val_neg": [28, 73], "max_val_po": [28, 73], "By": [28, 71], "individu": [28, 71], "less": [28, 32, 71, 73], "error": [28, 31, 73], "neg": 28, "asymmetr": [28, 32, 44, 45, 46, 51, 58, 67, 68], "directli": [28, 34, 67, 71, 73], "placehold": 29, "yet": [29, 46, 73], "enum": 30, "whether": [30, 31, 32, 45, 51, 58, 73], "quantized_v": 30, "float_val": 30, "mid_point": 30, "example_input": [31, 68, 69], "qtensor_class_list": 31, "aqdefaultlinearweight": 31, "aqint8weightonlyquantizedlinearweight": 31, "aqint8weightonlyquantizedlinearweight2": 31, "aqint8dynamicallyquantizedlinearweight": 31, "filter_fn": [31, 51, 64], "interpol": 31, "85": 31, "manual": [31, 51], "set_inductor_config": [31, 51], "supress_autoquant_error": 31, "min_sqnr": 31, "aq_kwarg": 31, "autoquant": 31, "identifi": 31, "fastest": 31, "over": [31, 71], "potenti": [31, 71], "qtensor": 31, "prepar": [31, 55, 60, 67, 71], "search": [31, 71], "whose": 31, "exchang": 31, "autoquantizablelinearweight": 31, "calibr": [31, 34], "user": [31, 67, 68, 71, 73, 76], "seen": 31, "record": [31, 67], "so": [31, 67, 68, 69, 71, 73], "final": [31, 39, 51, 67, 68, 71], "benchmark": [31, 55], "member": 31, "pick": 31, "result": [31, 49, 53, 54, 67, 71], "highli": 31, "complet": 31, "simpli": [31, 71, 73], "had": [31, 73], "compil": [31, 51, 54, 67, 68, 73], "them": [31, 59, 67], "onc": [31, 71], "proce": 31, "combin": [31, 71, 73], "finalize_autoqu": 31, "been": [31, 73], "log": [31, 73], "nn": [31, 50, 51, 55, 56, 64, 67, 68, 69, 71, 73], "forward": [31, 59, 67, 68, 69, 71, 73], "pass": [31, 34, 59, 67, 73], "fulli": [31, 51, 56, 64, 71], "unless": 31, "list": [31, 36, 56, 60, 67, 68, 73], "default_autoquant_class_list": 31, "callabl": [31, 35, 50, 51, 57, 64], "contain": [31, 55, 56, 71, 73], "second": [31, 49, 67, 76], "stop": 31, "wait": [31, 67], "sever": 31, "automat": [31, 51, 73, 76], "config": [31, 50, 51, 60, 71], "suppress": 31, "accept": 31, "signal": 31, "nois": 31, "ration": 31, "wikipedia": 31, "org": [31, 43, 60, 67, 68, 71], "wiki": 31, "noise_ratio": 31, "v": 31, "non": [31, 67, 71, 73], "impact": [31, 44], "caus": 31, "too": 31, "larg": [31, 73], "numer": [31, 71], "resaon": 31, "40": 31, "adjust": 31, "keyword": 31, "usag": [31, 50], "example_input1": 31, "example_input2": 31, "fp32": [32, 36, 73], "bf16": [32, 67, 68, 71], "fp16": [32, 44], "optioanl": 32, "flag": 32, "exactli": [32, 45, 73], "pad": 32, "convolut": 32, "doesn": [32, 71], "t": [32, 60, 67, 68, 71, 73], "itself": [32, 71, 73], "sure": [32, 67], "correspond": [32, 51, 67, 69, 71, 73], "without": [32, 38, 39, 67, 71], "loss": [32, 71], "But": [32, 67, 73], "won": [32, 73], "gurante": 32, "don": [32, 60, 68, 71], "clamp": [32, 73], "request": [32, 36, 52], "min_val": [34, 67, 73], "max_val": [34, 67, 73], "instead": [34, 45, 59, 67, 68, 71, 73], "observ": [34, 59, 71], "obtain": 34, "track": [34, 67], "param": [34, 39, 60], "nbit": 35, "group_siz": [35, 44, 45, 46, 48, 50, 51, 58, 68], "axi": [35, 52], "compute_dtyp": 35, "str": [35, 51, 56, 57, 60, 64, 73], "verbos": 35, "raw_output": 35, "optimize_weight": 35, "optimize_weights_proximal_legaci": 35, "input_dtyp": 36, "output_dtyp": [36, 37, 52], "uint8": [36, 52, 67], "quant_dtyp": [38, 39], "fake": [38, 39, 50], "awar": [38, 39, 60, 71, 73], "equival": [38, 39, 56, 71], "cast": [38, 39], "valid": 38, "fake_quantize_affin": 39, "consum": 39, "outlier": 39, "mask": [39, 60, 71], "intermedi": 39, "activation_dtyp": [40, 41], "float8_e4m3fn": [40, 41, 42, 67], "weight_dtyp": [40, 41, 42], "pertensor": [40, 41], "perrow": [40, 41], "current": [40, 46, 51, 56, 60, 64, 71, 73], "fast": [40, 41, 71], "accumul": [40, 41], "float8_e4m": 41, "channel": [42, 47, 48, 59, 62], "sub": [43, 58, 71], "expon": [43, 53], "mantissa": [43, 53], "fp6_e3_m2": 43, "fp6_e2_m3": 43, "fp6": 43, "llm": 43, "paper": [43, 71, 76], "arxiv": [43, 60, 71], "ab": [43, 60, 71], "2401": 43, "14112": 43, "repo": 43, "usyd": 43, "fsalab": 43, "fp6_llm": 43, "renam": 43, "fpxtensorcoreaqttensorimpl": 43, "experiment": 43, "merg": 43, "to_affine_quantized_floatx": 43, "bit_width": 44, "packing_bitwidth": 44, "32": [44, 45, 46, 50, 51, 64, 68, 69, 73], "contigu": 44, "gemlit": 44, "triton": [44, 67], "its": [44, 71, 73], "associ": 44, "control": [44, 45, 46, 58, 60, 71], "grain": [44, 45, 46, 58, 73], "hardwar": [44, 67, 71], "leav": 44, "best": [44, 71], "choic": [44, 45], "128": [45, 73], "tensorcoretiledlayout": [45, 67, 68], "group": [45, 46, 58, 67, 68], "tensor_core_til": [45, 67], "int4mm": [45, 68], "aten": [45, 67, 73], "_weight_int4pack_mm": [45, 67], "tradit": 45, "chosen": [45, 71], "hqq": [45, 58, 67], "act_mapping_typ": [46, 47], "token": [46, 47, 62], "produc": 46, "backend": [46, 71], "did": 46, "lower": [46, 67, 71], "flow": [46, 71], "marlinqqqlayout": 46, "cutlassint4packedlayout": 46, "weight_only_decod": 47, "b": 49, "scales1": 49, "multipli": [49, 54, 71], "row": [49, 71], "rais": [49, 50, 54, 73], "assertionerror": [49, 54, 73], "expect": [49, 71, 73], "activation_config": 50, "fakequantizeconfig": 50, "weight_config": 50, "per_token": 50, "is_symmetr": 50, "embed": 50, "valueerror": 50, "apply_tensor_subclass": [51, 64, 67], "inplac": [51, 60, 68], "instanc": [51, 59, 64, 69, 73], "qualifi": [51, 56, 64, 71], "move": [51, 67], "speed": [51, 71], "predefin": 51, "execut": [51, 70, 73, 75], "path": [51, 54, 68], "customiz": 51, "int8_dynamic_activation_int4_weight": 51, "int8_dynamic_activation_int8_weight": [51, 64], "mm": [51, 73], "int4_weight_onli": [51, 67, 68, 69], "int8_weight_onli": 51, "sequenti": [51, 64], "1024": [51, 64, 68, 69], "write": [51, 65], "constructor": [51, 73], "to_affine_quantized_intx": [51, 67], "groupwis": 51, "groupsiz": [51, 52], "apply_weight_qu": 51, "lambda": 51, "x": [51, 58, 68, 69, 73, 76], "int32": [51, 67, 68], "15": [51, 68], "1e": 51, "def": [51, 64, 67, 68, 69, 73], "apply_weight_quant_to_linear": 51, "requires_grad": [51, 67, 73], "block0": 51, "submodul": 51, "fqn": [51, 60, 64], "isinst": [51, 64, 71, 73], "tabl": [52, 67, 71], "per_tensor": 52, "per_axi": 52, "per_group": 52, "low": [53, 71, 73], "00seeemm": 53, "fp6_e3m2": 53, "sign": 53, "mat2": 54, "safe": 54, "consid": [54, 67, 71], "cubla": 54, "fallback": 54, "i": [54, 71], "j": 54, "debug_skip_calibr": 55, "smoothquant": [55, 56], "smoothfakedynamicallyquantizedlinear": [55, 56], "debug": 55, "skip_fqn_list": 56, "cur_fqn": 56, "alpha": 56, "replac": [56, 71], "skip": [56, 60, 71], "being": [56, 67, 71], "input_quant_func": [57, 67], "quant_kwarg": 57, "dict": [57, 60, 73], "uint1": [58, 67], "uint7": [58, 67], "l2": [59, 71], "norm": [59, 60, 71], "buffer": 59, "x_orig": 59, "overridden": 59, "although": [59, 73], "recip": 59, "within": [59, 71], "afterward": 59, "former": 59, "care": [59, 69, 71], "hook": [59, 67], "while": [59, 60, 71, 73], "latter": 59, "silent": 59, "ignor": 59, "sparsity_level": [60, 71], "semi_structured_block_s": 60, "wanda": 60, "sparsifi": [60, 65, 69, 71], "propos": 60, "2306": 60, "11695": 60, "product": 60, "magnitud": [60, 71], "parametr": 60, "preserv": [60, 71], "deepcopi": [60, 68, 73], "squash_mask": [60, 71], "params_to_keep": 60, "params_to_keep_per_lay": 60, "squash": 60, "appropri": [60, 67], "sparse_param": 60, "attach": [60, 71], "kei": [60, 71, 76], "save": [60, 68, 69], "string": 60, "xdoctest": 60, "local": [60, 71], "undefin": 60, "hasattr": 60, "submodule1": 60, "linear1": [60, 68, 69, 73], "foo": 60, "bar": 60, "submodule2": 60, "linear42": 60, "baz": 60, "print": [60, 68, 69, 73, 76], "42": 60, "24": 60, "ones": [60, 67], "update_mask": 60, "tensor_nam": 60, "statist": [60, 67, 71], "retriev": 60, "act_per_input": 60, "Then": [60, 73], "metric": 60, "across": [60, 71, 73], "whole": 60, "simul": [61, 67, 71], "dnynam": 62, "moduel": 63, "sparsify_": 64, "essenti": 64, "semi_sparse_weight": 64, "semisparselayout": 64, "sparsemarlinlayout": 64, "sparse_api": 64, "librari": [65, 69], "gradient": [65, 71], "nativ": [65, 73], "readm": [65, 68, 71], "overal": [65, 68], "introduct": [65, 67], "recent": 65, "highlight": [65, 73, 76], "updat": [65, 68, 69, 71], "guid": [65, 67], "contributor": [65, 68], "serial": [65, 67], "advanc": [65, 73], "lai": 67, "stack": 67, "awq": 67, "gptq": 67, "codebookquantizedtensor": 67, "int1": 67, "float3": 67, "compon": [67, 73], "compos": [67, 71, 73], "overload": [67, 71], "term": [67, 71], "extra": 67, "empti": 67, "dev": 67, "discuss": [67, 73], "1833": 67, "No": [67, 69, 71], "matter": [67, 71], "end": [67, 71, 73, 76], "avail": 67, "later": [67, 73], "float3_e2_m0": 67, "float4_e2_m1": 67, "float4_e3_m0": 67, "float5_e2_m2": 67, "float5_e3_m1": 67, "float6_e2_m3": 67, "float6_e3_m2": 67, "float8_e5m2": 67, "float8_e4m3fnuz": 67, "float8_e5m2fnuz": 67, "plan": 67, "float4": 67, "float6": 67, "thei": [67, 71, 73], "becom": 67, "popular": 67, "part": [67, 71, 73], "uint2": 67, "117208": 67, "outsid": 67, "As": 67, "mention": 67, "criteria": 67, "wide": 67, "adopt": 67, "fundament": [67, 71], "until": 67, "evid": 67, "hopefulli": 67, "amen": 67, "haven": 67, "enough": 67, "ont": 67, "revisit": 67, "intx": 67, "connect": 67, "int4tensor": 67, "previou": 67, "between": [67, 71, 73], "preicison": 67, "mainli": 67, "There": [67, 73], "accommod": 67, "choose_qparams_affine_with_min_max": 67, "min": [67, 73], "int_matmul": 67, "int_scaled_matmul": 67, "reli": [67, 71, 73], "On": [67, 68], "top": [67, 73], "glue": 67, "everyth": 67, "togeth": 67, "build": [67, 71, 73], "construct": 67, "low_precision_v": 67, "high_precision_v": 67, "procedur": 67, "veri": [67, 71], "common": [67, 71], "straightforward": 67, "try": [67, 71, 73], "higher": [67, 73], "high_preicsion_v": 67, "especi": [67, 69, 71], "bitwidth": 67, "codebook": 67, "hardcod": 67, "select": 67, "multi": 67, "dimension": [67, 71], "view": [67, 73], "mkldnn": 67, "coo": [67, 71], "sparse_coo": [67, 71], "sparsetensorimpl": 67, "idea": [67, 71], "nice": [67, 71], "concept": [67, 76], "why": [67, 73, 76], "c": [67, 73], "conflict": 67, "properti": 67, "quantized_linear": 67, "semant": 67, "stai": [67, 68, 73], "develop": 67, "tradition": 67, "come": [67, 71, 72], "demonstr": [67, 68, 73], "purpos": [67, 73], "to_affine_quant": 67, "simplic": 67, "explain": 67, "simplest": [67, 71], "form": [67, 71], "easi": 67, "linear_modul": 67, "runtim": 67, "to_linear_activation_quant": 67, "quantized_weight": 67, "activation_and_weight_quant": 67, "encount": 67, "f": [67, 69, 71, 73], "input_qunat_func": 67, "redispatch": 67, "swap": [67, 71], "fx": 67, "symbolic_trac": 67, "prefer": [67, 68, 73], "easier": 67, "further": [67, 73], "modif": 67, "sampl": 67, "figur": [67, 71], "At": [67, 71], "collect": [67, 71], "thing": [67, 69, 71, 73], "address": 67, "stat": 67, "averag": 67, "calculate_qparam": 67, "affinequantizedminmaxobserv": 67, "insert_observer_": 67, "altern": [67, 73], "observedlinear": 67, "dataset": 67, "complic": [67, 71], "next": 67, "done": [67, 73], "manner": 67, "intend": 67, "autoround": 67, "multitensor": 67, "describ": [67, 69, 71, 76], "advis": 67, "focus": [67, 71], "todai": 67, "low_bit_optim": 67, "similar": [67, 71], "quantized_train": 67, "enabl": 67, "progress": 67, "lot": [67, 71], "includ": [67, 73], "walk": [67, 73, 76], "_convert_weight_to_int4pack": 67, "tensorcoretiledaqttensorimpl": 67, "_quantized_linear_op": 67, "goe": 67, "_aqt_qlinear_dispatch_t": 67, "dispatch": 67, "explan": 67, "wint4": 67, "explor": 68, "instal": 68, "latest": 68, "stabl": 68, "releas": 68, "pip": 68, "nightli": 68, "command": 68, "index": [68, 71], "url": 68, "download": [68, 74, 76], "whl": 68, "cu121": 68, "major": 68, "instruct": 68, "entri": 68, "mutat": 68, "insert": 68, "logic": [68, 73], "toi": [68, 73], "toylinearmodel": [68, 69], "__init__": [68, 69, 73], "super": [68, 69, 73], "linear2": [68, 69, 73], "eval": [68, 69], "faster": [68, 71], "model_bf16": 68, "leverag": [68, 73], "mix": 68, "readi": [68, 73], "in_featur": [68, 69, 73], "out_featur": [68, 73], "tensor_impl_dtyp": 68, "verifi": [68, 69, 73], "roughli": [68, 71], "quarter": 68, "os": 68, "tmp": 68, "int4_model": 68, "pt": 68, "bfloat16_model": 68, "int4_model_size_mb": 68, "getsiz": 68, "bfloat16_model_size_mb": 68, "2f": 68, "mb": [68, 69, 70, 75], "25": 68, "00": [68, 70, 75], "much": [68, 71], "torch_version_at_least_2_5": 68, "benchmark_model": 68, "temporari": 68, "workaround": 68, "num_run": 68, "100": [68, 73], "_dynamo": [68, 73], "reset": 68, "bf16_time": 68, "int4_tim": 68, "time": [68, 71, 73, 76], "3f": 68, "ms": 68, "1fx": 68, "a100": 68, "gpu": [68, 76], "80gb": 68, "30": 68, "393": 68, "410": 68, "9x": 68, "simpl": [68, 71, 73], "workflow": [68, 71], "visit": 68, "would": [68, 71, 73], "forget": 68, "good": [68, 73], "tempfil": 69, "get_model_size_in_byt": 69, "batch_siz": 69, "ref": 69, "namedtemporaryfil": 69, "state_dict": 69, "seek": [69, 71], "load": 69, "meta": 69, "m_load": 69, "load_state_dict": 69, "assign": 69, "re": [69, 73], "assert": [69, 73], "equal": [69, 71], "float_weight1": 69, "float_weight2": 69, "quantized_weight1": 69, "quantized_weight2": 69, "go": [69, 73, 76], "techinqu": 69, "reduct": [69, 71, 73], "around": 69, "4x": 69, "0625": 69, "reason": [69, 71], "avoid": [69, 71], "properli": 69, "003": [70, 75, 76], "total": [70, 75, 76], "galleri": [70, 74, 76], "mem": [70, 75], "templat": [70, 74, 75], "tutorials_sourc": 70, "template_tutori": [70, 75, 76], "neural": 71, "network": [71, 73], "reduc": 71, "overhead": 71, "latenc": 71, "carefulli": 71, "signific": 71, "pai": 71, "price": 71, "qualiti": 71, "accuraci": 71, "f1": 71, "problem": [71, 73], "research": [71, 76], "face": 71, "fragment": 71, "rightfulli": 71, "spent": 71, "compress": 71, "place": 71, "dens": 71, "solv": [71, 73], "focu": [71, 73], "realli": 71, "push": 71, "accur": 71, "concret": 71, "hope": 71, "modular": 71, "acceler": 71, "scratch": [71, 76], "minim": 71, "recov": 71, "algorthim": 71, "realiz": 71, "improv": 71, "trade": 71, "off": 71, "degrad": 71, "architectur": 71, "theoret": 71, "gain": 71, "2x": 71, "analog": 71, "fix": 71, "50": 71, "unstructur": 71, "One": [71, 73], "howev": 71, "close": 71, "relat": 71, "mitig": 71, "retrain": 71, "neglig": 71, "area": 71, "agre": 71, "upon": 71, "consensu": 71, "mind": 71, "thought": 71, "separ": 71, "subproblem": 71, "satisfi": 71, "consist": [71, 73], "answer": 71, "independ": 71, "frontend": 71, "arbitrari": 71, "handoff": 71, "piec": 71, "miss": 71, "natur": [71, 73], "present": 71, "clear": 71, "contract": 71, "7x": 71, "advantag": 71, "anticip": 71, "mani": [71, 73], "solut": 71, "third": 71, "parti": 71, "to_sparse_semi_structur": 71, "sparsesemistructuredtensor": 71, "weightnormsparsifi": 71, "half": 71, "subnetwork": 71, "sparse_config": 71, "mod": [71, 73], "named_modul": 71, "append": 71, "tensor_fqn": 71, "sparse_block_shap": 71, "zeros_per_block": 71, "fakespars": 71, "manipul": 71, "dictionari": 71, "paramer": 71, "parameter": 71, "necessari": [71, 73], "ve": 71, "suitabl": 71, "fuse": [71, 73], "0s": 71, "spot": 71, "definit": 71, "academia": 71, "industri": 71, "often": [71, 73], "interchang": 71, "confus": 71, "distinct": 71, "pretrain": 71, "behind": 71, "box": 71, "those": [71, 73], "loos": 71, "speak": 71, "tightli": 71, "coupl": [71, 73], "nvidia": 71, "csc": 71, "fbgemm": 71, "qnnpack": 71, "descript": 71, "coordin": 71, "vector": 71, "locat": 71, "bsr": 71, "sparse_bsr": 71, "except": [71, 73], "scalar": 71, "csr": 71, "sparse_csr": 71, "sparse_csc": 71, "column": 71, "compact": 71, "sparse_matrix": 71, "1d": 71, "indexptr": 71, "\u00bd": 71, "bitmask": 71, "2bit": 71, "unprun": 71, "quit": [71, 73], "must": 71, "successfulli": 71, "These": [71, 73], "broken": 71, "down": 71, "Not": 71, "sensit": 71, "effect": [71, 73], "subsequ": [71, 73], "infinit": 71, "lost": 71, "degre": 71, "analysi": 71, "drop": 71, "give": [71, 73], "curv": 71, "proxi": 71, "aforement": 71, "smallest": 71, "absolut": 71, "vs": 71, "global": [71, 73], "scope": 71, "impli": 71, "pro": 71, "con": 71, "tradeoff": 71, "span": 71, "threshold": 71, "increas": 71, "complex": 71, "constant": [71, 73], "ctr_mobile_fe": 71, "score": 71, "w": 71, "tenosr": 71, "udpat": 71, "cannot": 71, "histori": 71, "regrow": 71, "dw": 71, "via": 71, "backprop": 71, "pat": 71, "unmask": 71, "resid": 71, "backward": 71, "salienc": 71, "lowest": 71, "l1": 71, "commonli": 71, "shown": 71, "abl": [71, 73], "ident": 71, "repeat": 71, "loop": 71, "shot": 71, "movement": 71, "inform": 71, "tune": 71, "2005": 71, "07683": 71, "rank": [71, 73], "wx": 71, "sqx": 71, "q": 71, "usual": 71, "sort": 71, "wise": 71, "reconstruct": 71, "random": 71, "randomli": 71, "remedi": 71, "line": 71, "item": [71, 76], "ultim": 71, "literatur": 71, "vision": 71, "nlp": [71, 76], "iter": 71, "ctr_feed": 71, "na": 71, "multimask": 71, "pyspeech": 71, "fastna": 71, "approach": [71, 73], "knowledg": [71, 76], "distil": 71, "pdf": 71, "2204": 71, "09656": 71, "arrang": 71, "recal": 71, "counterpart": 71, "slower": 71, "suffici": 71, "flexibl": [71, 73], "98": 71, "benefit": [71, 73], "special": 71, "exhibit": 71, "maintain": 71, "penalti": 71, "expens": [71, 73], "dictat": 71, "characterist": 71, "highest": 71, "wouldn": [71, 73], "visual": 71, "fig": 71, "4x4": 71, "benchmak": 71, "soon": 72, "foundat": 73, "extens": 73, "featur": 73, "autograd": 73, "distribut": 73, "express": 73, "interpos": 73, "namespac": 73, "continu": 73, "seamlessli": 73, "obviou": 73, "int8quantizedlinear": 73, "few": 73, "finer": 73, "intercept": 73, "slightli": 73, "contrast": 73, "long": 73, "better": 73, "clunki": 73, "distributedlinear": 73, "duplic": 73, "bypass": 73, "offer": 73, "outer": 73, "inner": 73, "allgath": 73, "bandwidth": 73, "rest": 73, "read": 73, "document": 73, "zoo": 73, "podcast": 73, "edward": 73, "yang": 73, "begin": 73, "int8_symmetric_quant": 73, "fp32_tensor": 73, "127": 73, "amin": 73, "keepdim": 73, "amax": 73, "zeros_lik": 73, "quantizedlinear": 73, "w_int8": 73, "cl": 73, "new_linear": 73, "left": 73, "toymodel": 73, "float_model": 73, "quantized_model": 73, "child": 73, "named_children": 73, "setattr": 73, "drawback": 73, "suppos": 73, "clean": 73, "limit": 73, "eleg": 73, "pretti": 73, "power": 73, "overrid": 73, "almost": 73, "shard": 73, "ragged": 73, "rag": 73, "nestedtensor": 73, "resourc": 73, "who": 73, "link": [73, 76], "googl": 73, "collab": 73, "flopcount": 73, "memorytrack": 73, "With": 73, "bare": 73, "bone": 73, "int8symmetrictensor": 73, "hold": 73, "staticmethod": 73, "disabl": 73, "__new__": 73, "_make_wrapper_subclass": 73, "storage_offset": 73, "ndim": 73, "__tensor_flatten__": 73, "attribut": 73, "pt2": 73, "__tensor_unflatten__": 73, "tensor_data_dict": 73, "extra_metadata": 73, "outer_s": 73, "outer_strid": 73, "undo": 73, "back": 73, "__repr__": 73, "repr": 73, "ahead": 73, "insid": 73, "int8_tensor": 73, "func": 73, "op_implementations_dict": 73, "conveni": 73, "register_op": 73, "_op": 73, "opoverload": 73, "impl_decor": 73, "op_impl": 73, "wrapper": 73, "particular": 73, "largest": 73, "tell": 73, "desugar": 73, "decor": 73, "surfac": 73, "coverag": 73, "though": 73, "brute": 73, "forc": 73, "repeatedli": 73, "loggingtensor": 73, "_python_dispatch": 73, "return_and_correct_alias": 73, "int8_mm": 73, "detach": 73, "int8_view_op": 73, "out_data": 73, "out_scal": 73, "notic": 73, "quickli": 73, "hit": 73, "background": 73, "decomposit": 73, "live": 73, "decomp": 73, "shrink": 73, "author": [73, 76], "pain": 73, "rather": 73, "underli": 73, "worth": 73, "written": 73, "differenti": 73, "nuanc": 73, "longer": 73, "That": 73, "transposit": 73, "got": 73, "propag": 73, "fact": 73, "themselv": 73, "pointwis": 73, "alwai": 73, "were": 73, "might": 73, "unwrap": 73, "dim0": 73, "dim1": 73, "confirm": 73, "quantized_model_module_swap": 73, "quantized_model_subclass": 73, "subclass_param": 73, "no_grad": 73, "out_module_swap": 73, "allclos": 73, "out_compil": 73, "seri": 73, "wa": 73, "tutorials_python": 74, "zip": [74, 76], "jupyt": [74, 76], "notebook": [74, 76], "tutorials_jupyt": 74, "sphinx": [74, 76], "firstnam": 76, "lastnam": 76, "prerequisit": 76, "v2": 76, "topic": 76, "rand": 76, "5060": 76, "1671": 76, "2317": 76, "7330": 76, "9476": 76, "1239": 76, "3325": 76, "8057": 76, "8212": 76, "6673": 76, "2430": 76, "0813": 76, "0879": 76, "3014": 76, "4889": 76, "practic": 76, "test": 76, "summar": 76, "takeawai": 76, "link1": 76, "link2": 76, "minut": 76, "ipynb": 76}, "objects": {"torchao.dtypes": [[7, 0, 1, "", "AffineQuantizedTensor"], [8, 0, 1, "", "BlockSparseLayout"], [9, 0, 1, "", "CutlassInt4PackedLayout"], [10, 0, 1, "", "Float8Layout"], [11, 0, 1, "", "Int4CPULayout"], [12, 0, 1, "", "Layout"], [13, 0, 1, "", "MarlinQQQLayout"], [14, 0, 1, "", "MarlinQQQTensor"], [15, 0, 1, "", "MarlinSparseLayout"], [16, 0, 1, "", "NF4Tensor"], [17, 0, 1, "", "PlainLayout"], [18, 0, 1, "", "SemiSparseLayout"], [19, 0, 1, "", "TensorCoreTiledLayout"], [20, 0, 1, "", "UintxLayout"], [21, 2, 1, "", "to_affine_quantized_floatx"], [22, 2, 1, "", "to_affine_quantized_floatx_static"], [23, 2, 1, "", "to_affine_quantized_fpx"], [24, 2, 1, "", "to_affine_quantized_intx"], [25, 2, 1, "", "to_affine_quantized_intx_static"], [26, 2, 1, "", "to_marlinqqq_quantized_intx"], [27, 2, 1, "", "to_nf4"]], "torchao.dtypes.AffineQuantizedTensor": [[7, 1, 1, "", "dequantize"], [7, 1, 1, "", "from_hp_to_floatx"], [7, 1, 1, "", "from_hp_to_floatx_static"], [7, 1, 1, "", "from_hp_to_fpx"], [7, 1, 1, "", "from_hp_to_intx"], [7, 1, 1, "", "from_hp_to_intx_static"], [7, 1, 1, "", "to"]], "torchao.dtypes.MarlinQQQTensor": [[14, 1, 1, "", "dequantize"], [14, 1, 1, "", "from_hp_to_intx"]], "torchao.dtypes.MarlinSparseLayout": [[15, 1, 1, "", "pre_process"]], "torchao.dtypes.NF4Tensor": [[16, 1, 1, "", "convert_to_norm_float_weight"], [16, 1, 1, "", "dequantize"], [16, 1, 1, "", "dequantize_scalers"], [16, 1, 1, "", "double_quantize_scalers"], [16, 1, 1, "", "get_original_weight"], [16, 1, 1, "", "quantize_tensor_nearest"]], "torchao.quantization": [[28, 0, 1, "", "MappingType"], [29, 0, 1, "", "TorchAODType"], [30, 0, 1, "", "ZeroPointDomain"], [31, 2, 1, "", "autoquant"], [32, 2, 1, "", "choose_qparams_affine"], [33, 2, 1, "", "choose_qparams_affine_floatx"], [34, 2, 1, "", "choose_qparams_affine_with_min_max"], [35, 2, 1, "", "choose_qparams_and_quantize_affine_hqq"], [36, 2, 1, "", "dequantize_affine"], [37, 2, 1, "", "dequantize_affine_floatx"], [38, 2, 1, "", "fake_quantize_affine"], [39, 2, 1, "", "fake_quantize_affine_cachemask"], [40, 2, 1, "", "float8_dynamic_activation_float8_weight"], [41, 2, 1, "", "float8_static_activation_float8_weight"], [42, 2, 1, "", "float8_weight_only"], [43, 2, 1, "", "fpx_weight_only"], [44, 2, 1, "", "gemlite_uintx_weight_only"], [45, 2, 1, "", "int4_weight_only"], [46, 2, 1, "", "int8_dynamic_activation_int4_weight"], [47, 2, 1, "", "int8_dynamic_activation_int8_weight"], [48, 2, 1, "", "int8_weight_only"], [49, 2, 1, "", "int_scaled_matmul"], [50, 2, 1, "", "intx_quantization_aware_training"], [51, 2, 1, "", "quantize_"], [52, 2, 1, "", "quantize_affine"], [53, 2, 1, "", "quantize_affine_floatx"], [54, 2, 1, "", "safe_int_mm"], [55, 2, 1, "", "smooth_fq_linear_to_inference"], [56, 2, 1, "", "swap_linear_with_smooth_fq_linear"], [57, 2, 1, "", "to_linear_activation_quantized"], [58, 2, 1, "", "uintx_weight_only"]], "torchao": [[4, 3, 0, "-", "sparsity"]], "torchao.sparsity": [[59, 0, 1, "", "PerChannelNormObserver"], [60, 0, 1, "", "WandaSparsifier"], [61, 2, 1, "", "apply_fake_sparsity"], [62, 2, 1, "", "int8_dynamic_activation_int8_semi_sparse_weight"], [63, 2, 1, "", "semi_sparse_weight"], [64, 2, 1, "", "sparsify_"]], "torchao.sparsity.PerChannelNormObserver": [[59, 1, 1, "", "forward"]], "torchao.sparsity.WandaSparsifier": [[60, 1, 1, "", "prepare"], [60, 1, 1, "", "squash_mask"], [60, 1, 1, "", "update_mask"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"]}, "titleterms": {"torchao": [0, 1, 2, 3, 4, 5, 65, 67], "dtype": [0, 6, 67], "layout": [0, 5, 12, 67], "tensor": [0, 5, 67, 72, 73], "subclass": [0, 5, 67, 73], "quantiz": [0, 3, 51, 67, 68, 72, 73], "techniqu": 0, "api": [1, 3, 65], "refer": [1, 65], "python": 1, "kernel": [2, 5, 66, 67], "main": 3, "quantize_": 3, "primit": [3, 67], "other": [3, 5, 67], "sparsiti": [4, 71], "contributor": 5, "guid": [5, 68], "gener": 5, "extend": 5, "ad": [5, 67], "effici": [5, 67], "custom": 5, "triton": 5, "hand": 5, "written": 5, "dispatch": 5, "tensorimpl": [5, 67], "flow": [5, 67, 69], "us": 5, "torch": 5, "compil": 5, "perform": [5, 66], "serial": [5, 69], "featur": 5, "support": [5, 67], "function": [5, 67], "compos": 5, "test": 5, "microbenchmark": 5, "model": [5, 67, 69], "benchmark": 5, "eval": 5, "affinequantizedtensor": 7, "blocksparselayout": 8, "cutlassint4packedlayout": 9, "float8layout": 10, "int4cpulayout": 11, "marlinqqqlayout": 13, "marlinqqqtensor": 14, "marlinsparselayout": 15, "nf4tensor": 16, "plainlayout": 17, "semisparselayout": 18, "tensorcoretiledlayout": 19, "uintxlayout": 20, "to_affine_quantized_floatx": 21, "to_affine_quantized_floatx_stat": 22, "to_affine_quantized_fpx": 23, "to_affine_quantized_intx": 24, "to_affine_quantized_intx_stat": 25, "to_marlinqqq_quantized_intx": 26, "to_nf4": 27, "mappingtyp": 28, "torchaodtyp": 29, "zeropointdomain": 30, "autoqu": 31, "choose_qparams_affin": 32, "choose_qparams_affine_floatx": 33, "choose_qparams_affine_with_min_max": 34, "choose_qparams_and_quantize_affine_hqq": 35, "dequantize_affin": 36, "dequantize_affine_floatx": 37, "fake_quantize_affin": 38, "fake_quantize_affine_cachemask": 39, "float8_dynamic_activation_float8_weight": 40, "float8_static_activation_float8_weight": 41, "float8_weight_onli": 42, "fpx_weight_onli": 43, "gemlite_uintx_weight_onli": 44, "int4_weight_onli": 45, "int8_dynamic_activation_int4_weight": 46, "int8_dynamic_activation_int8_weight": 47, "int8_weight_onli": 48, "int_scaled_matmul": 49, "intx_quantization_aware_train": 50, "quantize_affin": 52, "quantize_affine_floatx": 53, "safe_int_mm": 54, "smooth_fq_linear_to_infer": 55, "swap_linear_with_smooth_fq_linear": 56, "to_linear_activation_quant": 57, "uintx_weight_onli": 58, "perchannelnormobserv": 59, "wandasparsifi": 60, "apply_fake_spars": 61, "int8_dynamic_activation_int8_semi_sparse_weight": 62, "semi_sparse_weight": 63, "sparsifi": 64, "welcom": 65, "document": 65, "get": 65, "start": [65, 68], "develop": 65, "note": 65, "tutori": [65, 76], "overview": [67, 71, 76], "basic": 67, "current": 67, "placehold": 67, "pytorch": 67, "implement": [67, 73], "oper": [67, 73], "integr": 67, "nativ": 67, "factori": 67, "op": 67, "deriv": 67, "algorithm": 67, "weight": 67, "onli": 67, "dynam": 67, "activ": 67, "static": 67, "insert": 67, "observ": 67, "how": 67, "defin": 67, "modul": [67, 73], "add": 67, "calibr": 67, "train": 67, "awar": 67, "low": 67, "bit": 67, "optim": [67, 69], "case": 67, "studi": 67, "int4": 67, "work": 67, "dure": 67, "execut": 67, "save": 67, "load": 67, "quick": 68, "first": 68, "exampl": 68, "next": [68, 73], "step": [68, 73, 76], "deseri": 69, "what": [69, 73], "happen": 69, "when": 69, "an": 69, "comput": [70, 75], "time": [70, 75], "goal": 71, "design": 71, "context": 71, "prune": 71, "configur": 71, "criteria": 71, "strategi": 71, "pattern": 71, "write": [72, 73], "your": [72, 73], "own": [72, 73], "advanc": 72, "ar": 73, "swap": 73, "which": 73, "should": 73, "we": 73, "compar": 73, "output": 73, "templat": 76, "option": 76, "addit": 76, "exercis": 76, "conclus": 76, "further": 76, "read": 76}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 6, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx.ext.viewcode": 1, "sphinx": 56}})
\ No newline at end of file
+Search.setIndex({"docnames": ["api_ref_dtypes", "api_ref_intro", "api_ref_kernel", "api_ref_quantization", "api_ref_sparsity", "contributor_guide", "dtypes", "generated/torchao.dtypes.AffineQuantizedTensor", "generated/torchao.dtypes.BlockSparseLayout", "generated/torchao.dtypes.CutlassInt4PackedLayout", "generated/torchao.dtypes.Float8Layout", "generated/torchao.dtypes.Int4CPULayout", "generated/torchao.dtypes.Layout", "generated/torchao.dtypes.MarlinQQQLayout", "generated/torchao.dtypes.MarlinQQQTensor", "generated/torchao.dtypes.MarlinSparseLayout", "generated/torchao.dtypes.NF4Tensor", "generated/torchao.dtypes.PlainLayout", "generated/torchao.dtypes.SemiSparseLayout", "generated/torchao.dtypes.TensorCoreTiledLayout", "generated/torchao.dtypes.UintxLayout", "generated/torchao.dtypes.to_affine_quantized_floatx", "generated/torchao.dtypes.to_affine_quantized_floatx_static", "generated/torchao.dtypes.to_affine_quantized_fpx", "generated/torchao.dtypes.to_affine_quantized_intx", "generated/torchao.dtypes.to_affine_quantized_intx_static", "generated/torchao.dtypes.to_marlinqqq_quantized_intx", "generated/torchao.dtypes.to_nf4", "generated/torchao.quantization.MappingType", "generated/torchao.quantization.TorchAODType", "generated/torchao.quantization.ZeroPointDomain", "generated/torchao.quantization.autoquant", "generated/torchao.quantization.choose_qparams_affine", "generated/torchao.quantization.choose_qparams_affine_floatx", "generated/torchao.quantization.choose_qparams_affine_with_min_max", "generated/torchao.quantization.choose_qparams_and_quantize_affine_hqq", "generated/torchao.quantization.dequantize_affine", "generated/torchao.quantization.dequantize_affine_floatx", "generated/torchao.quantization.fake_quantize_affine", "generated/torchao.quantization.fake_quantize_affine_cachemask", "generated/torchao.quantization.float8_dynamic_activation_float8_weight", "generated/torchao.quantization.float8_static_activation_float8_weight", "generated/torchao.quantization.float8_weight_only", "generated/torchao.quantization.fpx_weight_only", "generated/torchao.quantization.gemlite_uintx_weight_only", "generated/torchao.quantization.int4_weight_only", "generated/torchao.quantization.int8_dynamic_activation_int4_weight", "generated/torchao.quantization.int8_dynamic_activation_int8_weight", "generated/torchao.quantization.int8_weight_only", "generated/torchao.quantization.int_scaled_matmul", "generated/torchao.quantization.intx_quantization_aware_training", "generated/torchao.quantization.quantize_", "generated/torchao.quantization.quantize_affine", "generated/torchao.quantization.quantize_affine_floatx", "generated/torchao.quantization.safe_int_mm", "generated/torchao.quantization.smooth_fq_linear_to_inference", "generated/torchao.quantization.swap_linear_with_smooth_fq_linear", "generated/torchao.quantization.to_linear_activation_quantized", "generated/torchao.quantization.uintx_weight_only", "generated/torchao.sparsity.PerChannelNormObserver", "generated/torchao.sparsity.WandaSparsifier", "generated/torchao.sparsity.apply_fake_sparsity", "generated/torchao.sparsity.int8_dynamic_activation_int8_semi_sparse_weight", "generated/torchao.sparsity.semi_sparse_weight", "generated/torchao.sparsity.sparsify_", "index", "performant_kernels", "quantization", "quick_start", "serialization", "sg_execution_times", "sparsity", "subclass_advanced", "subclass_basic", "tutorials/index", "tutorials/sg_execution_times", "tutorials/template_tutorial"], "filenames": ["api_ref_dtypes.rst", "api_ref_intro.rst", "api_ref_kernel.rst", "api_ref_quantization.rst", "api_ref_sparsity.rst", "contributor_guide.rst", "dtypes.rst", "generated/torchao.dtypes.AffineQuantizedTensor.rst", "generated/torchao.dtypes.BlockSparseLayout.rst", "generated/torchao.dtypes.CutlassInt4PackedLayout.rst", "generated/torchao.dtypes.Float8Layout.rst", "generated/torchao.dtypes.Int4CPULayout.rst", "generated/torchao.dtypes.Layout.rst", "generated/torchao.dtypes.MarlinQQQLayout.rst", "generated/torchao.dtypes.MarlinQQQTensor.rst", "generated/torchao.dtypes.MarlinSparseLayout.rst", "generated/torchao.dtypes.NF4Tensor.rst", "generated/torchao.dtypes.PlainLayout.rst", "generated/torchao.dtypes.SemiSparseLayout.rst", "generated/torchao.dtypes.TensorCoreTiledLayout.rst", "generated/torchao.dtypes.UintxLayout.rst", "generated/torchao.dtypes.to_affine_quantized_floatx.rst", "generated/torchao.dtypes.to_affine_quantized_floatx_static.rst", "generated/torchao.dtypes.to_affine_quantized_fpx.rst", "generated/torchao.dtypes.to_affine_quantized_intx.rst", "generated/torchao.dtypes.to_affine_quantized_intx_static.rst", "generated/torchao.dtypes.to_marlinqqq_quantized_intx.rst", "generated/torchao.dtypes.to_nf4.rst", "generated/torchao.quantization.MappingType.rst", "generated/torchao.quantization.TorchAODType.rst", "generated/torchao.quantization.ZeroPointDomain.rst", "generated/torchao.quantization.autoquant.rst", "generated/torchao.quantization.choose_qparams_affine.rst", "generated/torchao.quantization.choose_qparams_affine_floatx.rst", "generated/torchao.quantization.choose_qparams_affine_with_min_max.rst", "generated/torchao.quantization.choose_qparams_and_quantize_affine_hqq.rst", "generated/torchao.quantization.dequantize_affine.rst", "generated/torchao.quantization.dequantize_affine_floatx.rst", "generated/torchao.quantization.fake_quantize_affine.rst", "generated/torchao.quantization.fake_quantize_affine_cachemask.rst", "generated/torchao.quantization.float8_dynamic_activation_float8_weight.rst", "generated/torchao.quantization.float8_static_activation_float8_weight.rst", "generated/torchao.quantization.float8_weight_only.rst", "generated/torchao.quantization.fpx_weight_only.rst", "generated/torchao.quantization.gemlite_uintx_weight_only.rst", "generated/torchao.quantization.int4_weight_only.rst", "generated/torchao.quantization.int8_dynamic_activation_int4_weight.rst", "generated/torchao.quantization.int8_dynamic_activation_int8_weight.rst", "generated/torchao.quantization.int8_weight_only.rst", "generated/torchao.quantization.int_scaled_matmul.rst", "generated/torchao.quantization.intx_quantization_aware_training.rst", "generated/torchao.quantization.quantize_.rst", "generated/torchao.quantization.quantize_affine.rst", "generated/torchao.quantization.quantize_affine_floatx.rst", "generated/torchao.quantization.safe_int_mm.rst", "generated/torchao.quantization.smooth_fq_linear_to_inference.rst", "generated/torchao.quantization.swap_linear_with_smooth_fq_linear.rst", "generated/torchao.quantization.to_linear_activation_quantized.rst", "generated/torchao.quantization.uintx_weight_only.rst", "generated/torchao.sparsity.PerChannelNormObserver.rst", "generated/torchao.sparsity.WandaSparsifier.rst", "generated/torchao.sparsity.apply_fake_sparsity.rst", "generated/torchao.sparsity.int8_dynamic_activation_int8_semi_sparse_weight.rst", "generated/torchao.sparsity.semi_sparse_weight.rst", "generated/torchao.sparsity.sparsify_.rst", "index.rst", "performant_kernels.rst", "quantization.rst", "quick_start.rst", "serialization.rst", "sg_execution_times.rst", "sparsity.rst", "subclass_advanced.rst", "subclass_basic.rst", "tutorials/index.rst", "tutorials/sg_execution_times.rst", "tutorials/template_tutorial.rst"], "titles": ["torchao.dtypes", "torchao
API Reference", "torchao.kernel", "torchao.quantization", "torchao.sparsity", "Contributor Guide", "Dtypes", "AffineQuantizedTensor", "BlockSparseLayout", "CutlassInt4PackedLayout", "Float8Layout", "Int4CPULayout", "Layout", "MarlinQQQLayout", "MarlinQQQTensor", "MarlinSparseLayout", "NF4Tensor", "PlainLayout", "SemiSparseLayout", "TensorCoreTiledLayout", "UintxLayout", "to_affine_quantized_floatx", "to_affine_quantized_floatx_static", "to_affine_quantized_fpx", "to_affine_quantized_intx", "to_affine_quantized_intx_static", "to_marlinqqq_quantized_intx", "to_nf4", "MappingType", "TorchAODType", "ZeroPointDomain", "autoquant", "choose_qparams_affine", "choose_qparams_affine_floatx", "choose_qparams_affine_with_min_max", "choose_qparams_and_quantize_affine_hqq", "dequantize_affine", "dequantize_affine_floatx", "fake_quantize_affine", "fake_quantize_affine_cachemask", "float8_dynamic_activation_float8_weight", "float8_static_activation_float8_weight", "float8_weight_only", "fpx_weight_only", "gemlite_uintx_weight_only", "int4_weight_only", "int8_dynamic_activation_int4_weight", "int8_dynamic_activation_int8_weight", "int8_weight_only", "int_scaled_matmul", "intx_quantization_aware_training", "quantize", "quantize_affine", "quantize_affine_floatx", "safe_int_mm", "smooth_fq_linear_to_inference", "swap_linear_with_smooth_fq_linear", "to_linear_activation_quantized", "uintx_weight_only", "PerChannelNormObserver", "WandaSparsifier", "apply_fake_sparsity", "int8_dynamic_activation_int8_semi_sparse_weight", "semi_sparse_weight", "sparsify", "Welcome to the torchao Documentation", "Performant Kernels", "Quantization Overview", "Quick Start Guide", "Serialization", "Computation times", "Sparsity Overview", "Writing Your Own Quantized Tensor (advanced)", "Writing Your Own Quantized Tensor", "<no title>", "Computation times", "Template Tutorial"], "terms": {"thi": [1, 5, 7, 15, 16, 17, 18, 20, 28, 31, 32, 34, 36, 38, 39, 43, 44, 45, 46, 51, 52, 59, 60, 61, 64, 67, 68, 69, 71, 73, 76], "section": [1, 5, 67, 71], "introduc": 1, "dive": 1, "detail": [1, 5, 31, 43, 67, 68, 71, 73], "how": [1, 5, 7, 12, 20, 28, 32, 45, 52, 68, 69, 71, 73], "integr": [1, 5, 69, 71, 73], "pytorch": [1, 5, 7, 11, 14, 29, 65, 68, 71, 73, 76], "optim": [1, 5, 15, 31, 35, 51, 65, 71, 73], "your": [1, 5, 51, 65, 67, 68, 71], "machin": 1, "learn": [1, 45, 68, 71, 76], "model": [1, 31, 44, 46, 50, 51, 55, 56, 60, 61, 64, 68, 71, 73], "dtype": [1, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 31, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 51, 52, 58, 64, 65, 68, 69, 73], "quantiz": [1, 5, 7, 9, 10, 11, 13, 14, 15, 16, 18, 19, 21, 24, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 62, 64, 65, 69, 71], "sparsiti": [1, 15, 18, 59, 60, 61, 62, 63, 64, 65, 67, 69], "tba": [2, 6, 66], "For": [5, 32, 43, 67, 68, 69, 71, 73], "new": [5, 7, 51, 67, 73], "case": [5, 31, 54, 71, 73], "exampl": [5, 7, 28, 31, 32, 50, 51, 60, 64, 67, 69, 70, 71, 73, 74, 75, 76], "train": [5, 38, 39, 65, 71, 73], "like": [5, 12, 31, 32, 67, 68, 69, 71, 73], "fp4": 5, "s": [5, 7, 28, 31, 32, 36, 38, 52, 53, 67, 68, 71, 73], "fine": [5, 44, 45, 46, 58, 71], "start": [5, 28, 29, 30, 31, 67, 71, 73], "prototyp": [5, 67], "folder": 5, "you": [5, 51, 60, 67, 68, 69, 71, 73, 76], "could": [5, 67, 73], "also": [5, 31, 51, 67, 68, 69, 71, 73], "take": [5, 16, 51, 59, 64, 67, 71], "look": [5, 7, 67, 71], "affinequantizedtensor": [5, 14, 22, 23, 25, 67, 68, 69, 73], "what": [5, 7, 14, 31, 67, 68, 71, 76], "want": [5, 51, 64, 67, 69, 71, 73], "do": [5, 29, 31, 32, 49, 51, 67, 71, 73], "mostli": [5, 34], "e": [5, 7, 28, 31, 32, 36, 38, 43, 51, 52, 53, 64, 67, 69, 73], "g": [5, 7, 28, 31, 32, 36, 38, 43, 51, 52, 64, 67, 69, 73], "int3": 5, "exact": 5, "same": [5, 7, 32, 34, 36, 38, 39, 40, 52, 54, 64, 67, 71, 73], "affin": [5, 7, 9, 10, 11, 15, 18, 19, 24, 36, 38, 51, 52, 64, 67], "pleas": [5, 7, 14, 43, 45, 65, 67, 71, 73], "feel": [5, 67, 71, 73], "free": [5, 67, 73], "open": [5, 67, 71], "an": [5, 7, 19, 24, 25, 31, 39, 50, 60, 65, 67, 71, 73], "issu": [5, 67, 68, 73], "have": [5, 28, 31, 32, 44, 45, 52, 60, 67, 71, 73], "question": [5, 67, 69, 71, 73], "specif": [5, 12, 15, 17, 18, 60, 67, 68, 69, 71], "more": [5, 31, 39, 43, 44, 45, 46, 58, 67, 68, 71, 73], "refer": [5, 71, 73], "our": [5, 16, 68, 71, 73], "overview": [5, 65, 68], "page": [5, 68], "To": [5, 7, 14, 31, 67, 68, 69, 71], "contribut": [5, 68, 71], "exist": [5, 29, 67, 71, 73], "code": [5, 45, 67, 68, 71, 73, 74, 76], "base": [5, 12, 17, 28, 60, 67, 68, 71, 73], "make": [5, 32, 67, 73], "trainabl": [5, 67, 73], "add": [5, 17, 51, 73, 76], "parallel": [5, 73], "etc": [5, 67], "affine_quantized_tensor": [5, 69], "py": [5, 7, 14, 70, 75, 76], "api": [5, 31, 67, 68, 71, 73], "quant_api": [5, 51, 69], "primit": [5, 7, 14, 73], "op": [5, 7, 14, 31, 32, 38, 39, 45, 51, 71, 73], "slight": [5, 71], "variat": [5, 67], "quant_primit": [5, 7, 14], "autotun": [5, 68], "cpu": [5, 7, 11, 69, 71], "cuda": [5, 7, 35, 51, 68, 69, 71, 73], "mp": 5, "csrc": 5, "mayb": 5, "well": [5, 12, 31, 67, 71], "spars": [5, 8, 15, 18, 60, 67, 71], "marlin": [5, 13, 14, 15, 26], "aqt": 5, "621": 5, "we": [5, 7, 16, 28, 31, 32, 34, 36, 38, 50, 51, 52, 58, 64, 67, 68, 69, 71], "ar": [5, 7, 10, 18, 20, 31, 32, 36, 38, 40, 43, 45, 50, 51, 52, 54, 60, 67, 68, 69, 71], "still": [5, 67, 71], "decid": [5, 67, 71], "split": 5, "can": [5, 19, 28, 31, 40, 44, 51, 52, 67, 68, 69, 71, 73], "implement": [5, 69, 71], "regist": [5, 59, 73], "mai": [5, 34, 67, 69], "need": [5, 32, 40, 59, 60, 67, 68, 69, 71, 73], "defin": [5, 12, 20, 43, 59, 60, 71, 73], "own": [5, 51, 65, 71], "through": [5, 34, 67, 68, 73, 76], "int4": [5, 9, 11, 28, 46, 50, 51, 64, 68, 69], "access": 5, "my_custom_op": 5, "devic": [5, 7, 35, 51, 54, 68, 69, 73], "check": [5, 7, 14, 67, 68, 69, 73], "condit": [5, 67], "__torch_function__": [5, 67, 73], "__torch_dispatch__": [5, 73], "target": [5, 32, 40, 41, 42, 45, 60, 71], "oper": [5, 7, 10, 12, 15, 34], "bfloat16": [5, 16, 38, 51, 52, 67, 68, 69, 71], "activ": [5, 31, 40, 41, 46, 47, 50, 55, 60, 62, 65, 71], "uint4": [5, 45, 51, 67, 68], "weight": [5, 15, 16, 31, 40, 41, 42, 44, 45, 46, 47, 48, 51, 58, 60, 62, 63, 64, 65, 68, 69, 71, 73], "found": [5, 67, 68, 71, 73], "here": [5, 7, 52, 67, 69, 73], "allow": [5, 71, 73], "peopl": [5, 67, 69], "linear": [5, 15, 31, 32, 40, 42, 45, 46, 47, 48, 50, 51, 56, 58, 61, 62, 63, 64, 67, 68, 69, 71, 73], "two": [5, 14, 18, 40, 67, 71, 73], "dispatch_condit": [5, 67], "impl": [5, 7, 67], "actual": [5, 42, 67, 73], "bia": [5, 67, 68, 69, 73], "run": [5, 31, 51, 55, 59, 64, 67, 71, 73, 76], "both": [5, 40, 67, 71, 73], "input_tensor": [5, 16, 67], "weight_tensor": [5, 67], "argument": [5, 7, 19, 31, 36, 51, 67], "register_aqt_quantized_linear_dispatch": 5, "show": [5, 52, 67, 71], "work": [5, 18, 44, 69, 71, 73], "sometim": [5, 71], "ha": [5, 7, 67, 71, 73], "pack": [5, 7, 9, 19, 20, 43, 44, 58, 67], "order": [5, 31, 67, 71, 73], "yield": [5, 71], "And": [5, 16, 40, 67, 73], "abstract": [5, 67], "see": [5, 7, 14, 43, 67, 68, 69, 71, 73], "full": [5, 76], "after": [5, 31, 67, 69, 71], "wrap": [5, 31, 73], "factori": 5, "convert": [5, 7, 14, 16, 21, 24, 26, 27, 51, 53, 63, 64, 67, 71], "from": [5, 7, 16, 17, 22, 23, 25, 32, 34, 36, 38, 43, 46, 50, 51, 52, 64, 67, 68, 69, 70, 71, 73, 75, 76], "float": [5, 7, 14, 16, 24, 26, 27, 28, 30, 31, 32, 34, 35, 36, 38, 39, 43, 45, 51, 52, 53, 56, 60, 64, 67, 69, 73], "point": [5, 7, 14, 26, 28, 30, 32, 36, 38, 43, 45, 52, 53, 64, 67, 68, 69, 71, 73], "my": [5, 71], "to_my_dtyp": 5, "mydtypetensor": 5, "from_float": [5, 73], "level": [5, 60, 67, 71, 73], "reus": [5, 67, 73], "quantize_": [5, 50, 51, 64, 67, 68, 69], "appli": [5, 31, 40, 41, 42, 44, 45, 46, 47, 48, 50, 51, 58, 62, 67, 68, 71], "convers": [5, 7, 51, 67], "filter": [5, 31], "choos": [5, 67, 71, 73], "which": [5, 14, 20, 31, 67, 68, 69, 71], "modul": [5, 28, 29, 30, 31, 50, 51, 55, 56, 59, 60, 64, 68, 69], "should": [5, 7, 32, 36, 38, 44, 52, 59, 60, 67, 71], "algorithm": [5, 45, 58, 71], "onli": [5, 11, 40, 42, 44, 45, 46, 48, 51, 58, 64, 68, 69, 71, 73], "dynam": [5, 40, 46, 47, 64, 73], "quant": [5, 7, 14, 43, 67], "static": [5, 7, 12, 16, 22, 25, 34, 41], "type": [5, 7, 15, 16, 20, 28, 29, 30, 31, 35, 40, 41, 42, 45, 46, 49, 52, 54, 65, 67, 69, 71, 73], "note": [5, 39, 50, 60, 67, 68, 71, 73], "2": [5, 7, 11, 15, 18, 28, 31, 32, 39, 45, 51, 52, 61, 62, 63, 64, 67, 68, 71, 73, 76], "4": [5, 15, 18, 27, 32, 35, 44, 61, 62, 63, 64, 67, 68, 69, 71, 73], "below": [5, 67, 71, 73, 76], "follow": [5, 45, 67, 68, 71, 73], "util": [5, 44, 67, 68, 69, 73], "import": [5, 32, 50, 51, 64, 68, 69, 71, 73, 76], "unwrap_tensor_subclass": [5, 68], "m_unwrap": 5, "m": [5, 51, 53, 64, 68, 69, 73], "In": [5, 67, 68, 71, 73], "compat": [5, 15, 68], "aim": [5, 67, 71], "fullgraph": [5, 68], "true": [5, 7, 24, 31, 32, 34, 35, 50, 51, 55, 64, 68, 69, 73], "first": [5, 16, 31, 49, 60, 67, 73], "remov": [5, 60, 71], "ani": [5, 17, 31, 57, 60, 67, 71, 73], "unnecessari": 5, "graph": 5, "break": 5, "torch_log": 5, "output_cod": 5, "when": [5, 7, 17, 32, 36, 38, 52, 67, 71], "script": [5, 68, 73, 76], "inductor": [5, 31, 51], "python": [5, 67, 71, 74, 76], "mode": [5, 31, 45, 68], "max": [5, 28, 67, 68, 73], "checkout": [5, 7, 14, 65, 67], "doc": [5, 67, 73], "huggingfac": 5, "transform": [5, 7, 67], "deseri": [5, 67], "save_pretrain": 5, "push_to_hub": 5, "from_pretrain": 5, "http": [5, 7, 14, 31, 43, 60, 68, 71], "co": 5, "main": [5, 7, 14, 45, 67, 68, 71, 73], "en": [5, 31], "anoth": [5, 67, 71, 73], "diffus": 5, "github": [5, 7, 14, 43, 68], "com": [5, 7, 14, 43], "sayakpaul": 5, "blob": [5, 7, 14], "infer": [5, 7, 55, 65, 67, 68, 69, 71, 73], "serialization_and_load": 5, "md": 5, "The": [5, 7, 8, 12, 15, 20, 31, 40, 41, 42, 43, 49, 51, 54, 55, 56, 60, 67, 68, 69, 71, 73], "abov": [5, 28, 67, 69, 71, 73], "just": [5, 28, 67, 69, 71, 73], "talk": [5, 67], "about": [5, 45, 67, 68, 69, 71], "basic": [5, 17, 68, 73], "provid": [5, 12, 15, 18, 19, 31, 32, 67, 71, 73], "fsdp": [5, 67], "ll": [5, 28, 32, 67, 73], "put": [5, 64], "developer_api_guid": 5, "cover": [5, 67, 76], "executorch": [5, 46, 51], "torchchat": 5, "todo": [5, 67], "qat": [5, 38, 39, 50], "suit": 5, "out": [5, 18, 28, 31, 60, 67, 68, 71, 73], "differ": [5, 12, 34, 45, 52, 54, 67, 68, 69, 71, 73], "system": 5, "dtensor": [5, 73], "recommend": [5, 31, 51], "copi": [5, 7, 60, 68, 69, 71, 73], "past": [5, 71], "adapt": 5, "now": [5, 43, 46, 67, 68, 71, 73], "befor": [5, 51, 67, 69, 71, 73], "some": [5, 31, 51, 60, 67, 71, 73], "singl": [5, 31, 34, 40, 68, 71], "comput": [5, 15, 19, 42, 59, 60, 71, 73], "intens": 5, "memori": [5, 7, 39, 68, 71, 73], "input": [5, 7, 15, 16, 18, 31, 32, 34, 36, 38, 39, 49, 51, 52, 54, 60, 64, 67, 73], "dimens": [5, 7, 20, 32, 36, 38, 49, 52, 58, 73], "get": [5, 16, 67, 71], "sens": [5, 67, 73], "speedup": [5, 45, 67, 68, 71], "d": [5, 67], "creat": [5, 7, 22, 23, 25, 67, 71, 73], "file": [5, 70, 73, 75], "benchmark_aq": 5, "shape": [5, 7, 14, 31, 49, 54, 68, 73], "A": [5, 7, 20, 31, 34, 39, 59, 71, 73], "quick": [5, 65], "wai": [5, 7, 31, 67, 71, 73], "relev": [5, 45, 67, 76], "chang": [5, 51, 67, 68, 69, 71, 73], "interest": [5, 67, 71, 73], "tutori": [5, 67, 70, 71, 73, 74, 75], "print_op_and_shap": 5, "output": [5, 31, 32, 36, 38, 52, 67, 71, 76], "torch_func": 5, "built": [5, 73], "k": [5, 54, 68, 69, 73], "n": [5, 68, 69, 73], "10": [5, 28, 52], "method": [5, 12, 15, 18, 19, 31, 51, 60, 71, 73], "_c": 5, "tensorbas": 5, "object": [5, 20, 73], "arg": [5, 7, 60, 73], "0": [5, 7, 31, 32, 51, 52, 56, 60, 68, 69, 70, 71, 73, 75, 76], "size": [5, 7, 8, 14, 16, 32, 36, 38, 44, 45, 46, 52, 58, 68, 69, 71, 73], "all": [5, 28, 31, 34, 59, 60, 61, 67, 68, 69, 70, 71, 73, 74], "under": [5, 51], "benchmark_your_kernel": 5, "helper": 5, "right": [5, 67, 71], "1": [5, 15, 20, 28, 29, 30, 31, 32, 35, 45, 51, 52, 58, 60, 67, 68, 69, 70, 71, 73, 75, 76], "either": [5, 7, 32, 36, 38, 40, 52, 60, 71], "one": [5, 31, 34, 40, 59, 67, 71, 73], "probabl": 5, "keep": [5, 15, 60], "futur": [5, 43], "llama": 5, "llama2": 5, "llama3": 5, "sam": 5, "alreadi": [5, 7, 31, 73], "modifi": [5, 51, 60, 67, 71, 73], "friendli": [5, 67], "compar": [5, 39, 45, 60, 67], "techniqu": [5, 69, 71, 73], "repres": [5, 7, 8, 10, 12, 23, 52, 60, 67, 69, 73], "bound": [5, 71], "help": [5, 67], "option": [5, 7, 10, 14, 21, 24, 25, 26, 31, 32, 34, 36, 38, 39, 40, 41, 44, 50, 51, 52, 55, 56, 57, 60, 64, 68], "each": [5, 16, 31, 55, 59, 67, 71, 73], "understand": 5, "profil": 5, "profile_path": 5, "chrome": 5, "trace": [5, 67], "let": [5, 28, 44, 52, 67, 68, 71, 73], "know": [5, 31, 73], "class": [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 28, 29, 30, 31, 59, 60, 67, 68, 69, 73], "torchao": [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 68, 69, 71, 73], "tensor_impl": [7, 14, 67], "aqttensorimpl": [7, 14], "block_siz": [7, 12, 14, 16, 21, 22, 24, 25, 26, 27, 32, 34, 36, 38, 39, 52, 68], "tupl": [7, 14, 16, 21, 22, 24, 25, 26, 32, 34, 35, 36, 38, 39, 40, 41, 52, 60, 73], "int": [7, 8, 14, 16, 19, 20, 21, 22, 24, 25, 26, 27, 32, 33, 34, 35, 36, 37, 38, 39, 43, 44, 45, 51, 52, 53, 60, 68, 73], "quant_min": [7, 14, 24, 25, 26, 28, 32, 34, 36, 38, 39, 52, 67, 68, 73], "union": [7, 14, 32, 36, 38, 39, 40, 41, 51, 52], "none": [7, 10, 14, 21, 24, 25, 26, 28, 29, 30, 31, 32, 34, 36, 38, 39, 40, 41, 44, 45, 48, 50, 51, 52, 55, 56, 57, 60, 64, 73], "quant_max": [7, 14, 24, 25, 26, 28, 32, 34, 36, 38, 39, 52, 67, 68, 73], "zero_point_domain": [7, 14, 24, 25, 26, 32, 34, 36, 38, 39, 45, 51, 52], "zeropointdomain": [7, 14, 24, 25, 26, 32, 34, 36, 38, 39, 45, 52], "stride": [7, 14, 67, 73], "sourc": [7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 61, 62, 63, 64, 74, 76], "tensor": [7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 45, 46, 49, 51, 52, 53, 54, 57, 60, 64, 65, 68, 69, 71, 76], "subclass": [7, 14, 31, 51, 59, 64, 68, 69, 71], "mean": [7, 16, 28, 32, 36, 38, 52, 53, 67, 68, 71], "quantized_tensor": 7, "float_tensor": [7, 73], "scale": [7, 12, 15, 22, 25, 28, 30, 32, 34, 36, 37, 38, 39, 41, 49, 52, 53, 55, 56, 67, 71, 73], "zero_point": [7, 12, 25, 30, 32, 34, 36, 38, 39, 45, 52, 67, 71, 73], "happen": [7, 14, 31, 67, 73], "dure": [7, 14, 31, 32, 36, 38, 52, 56, 71, 73], "choose_qparam": [7, 67], "dequant": [7, 14, 16, 36, 45, 67, 73], "ao": [7, 14, 71], "three": [7, 31, 60, 64, 67], "choose_qparams_affin": [7, 34, 45, 67], "quantize_affin": [7, 38, 39, 45, 67], "qand": 7, "dequantize_affin": [7, 38, 39, 45], "extern": 7, "regardless": 7, "intern": [7, 19], "represent": [7, 12, 23, 32, 45, 67, 71], "orient": 7, "field": 7, "serv": [7, 12, 73], "gener": [7, 38, 39, 67, 68, 71, 73, 74, 76], "storag": [7, 15, 67, 71], "data": [7, 8, 12, 15, 20, 34, 40, 41, 42, 45, 65, 67, 69, 71, 73], "store": [7, 15, 16, 20, 59, 67, 71], "plain": 7, "int_data": [7, 73], "format": [7, 15, 16, 43, 44, 53, 67, 71], "depend": [7, 31, 44, 69, 71, 73], "kernel": [7, 9, 11, 15, 19, 43, 44, 45, 51, 68, 71], "granular": [7, 32, 36, 38, 40, 41, 44, 45, 46, 52, 58, 67], "element": [7, 18, 20, 31, 32, 36, 38, 52, 71], "share": [7, 32, 36, 38, 52, 71], "qparam": [7, 32, 36, 38, 52], "us": [7, 10, 11, 12, 15, 16, 17, 20, 22, 25, 28, 31, 32, 34, 36, 38, 40, 41, 45, 46, 50, 51, 52, 58, 60, 65, 67, 68, 69, 71, 73], "per": [7, 32, 36, 38, 42, 45, 46, 47, 48, 52, 58, 60, 62, 67, 68, 71], "torch": [7, 15, 16, 20, 31, 32, 35, 36, 37, 38, 40, 41, 42, 45, 49, 50, 51, 52, 54, 55, 56, 58, 64, 67, 68, 69, 71, 73, 76], "origin": [7, 16, 38, 42, 52, 60, 67, 68, 69, 71], "high": [7, 21, 22, 23, 24, 25, 53, 67, 71, 73], "precis": [7, 21, 22, 23, 24, 25, 42, 53, 67, 73], "minimum": [7, 31, 32, 36, 38, 52], "valu": [7, 16, 28, 29, 30, 31, 32, 36, 38, 39, 45, 52, 55, 60, 67, 71, 73], "specifi": [7, 38, 44, 51, 52, 58, 60, 71], "deriv": [7, 34, 38, 52], "maximum": [7, 32, 36, 38, 52, 55], "domain": [7, 30, 32, 36, 38, 45, 52], "integ": [7, 24, 25, 28, 30, 32, 36, 38, 44, 45, 49, 52, 54], "zero": [7, 18, 32, 36, 38, 45, 52, 60, 71], "ad": [7, 32, 36, 38, 52, 60, 71, 73], "subtract": [7, 16, 32, 36, 38, 52], "unquant": [7, 32, 36, 38, 52], "default": [7, 8, 10, 17, 19, 20, 31, 32, 36, 38, 40, 41, 42, 45, 51, 52, 55, 56, 58, 73], "float32": [7, 36, 37, 38, 52, 53, 69, 71, 73], "given": [7, 14, 27, 32, 71], "return": [7, 14, 15, 16, 31, 39, 49, 50, 51, 54, 55, 56, 64, 67, 68, 69, 73], "classmethod": [7, 14, 73], "from_hp_to_floatx": 7, "input_float": [7, 14, 21, 22, 23, 24, 25, 26, 57], "target_dtyp": [7, 21, 22, 24, 25, 32, 34, 67], "_layout": [7, 14, 21, 22, 23, 24, 25, 26, 67, 68], "layout": [7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 45, 46, 47, 64, 71], "scale_dtyp": [7, 21, 24, 32, 34], "float8": [7, 10, 21, 22, 40, 41, 42, 67], "from_hp_to_floatx_stat": 7, "paramet": [7, 12, 15, 16, 22, 25, 28, 31, 32, 36, 38, 40, 41, 42, 44, 45, 46, 49, 51, 52, 54, 55, 56, 58, 60, 64, 67, 69, 71, 73], "from_hp_to_fpx": 7, "floatx": [7, 23, 67], "ebit": [7, 23, 33, 37, 43, 53], "mbit": [7, 23, 33, 37, 43, 53], "support": [7, 23, 40, 46, 50, 64, 68, 69, 71, 73], "float1": [7, 23], "float7": [7, 23], "from_hp_to_intx": [7, 14], "mapping_typ": [7, 24, 32, 34, 46], "mappingtyp": [7, 24, 32, 34, 46, 47], "ep": [7, 24, 32, 34], "zero_point_dtyp": [7, 24, 32, 34, 51], "preserve_zero": [7, 24, 32, 34, 45, 51], "bool": [7, 24, 31, 32, 34, 35, 44, 51, 55, 64], "plainlayout": [7, 24, 25, 46, 47], "use_hqq": [7, 24, 45, 58], "fals": [7, 24, 31, 35, 45, 47, 50, 51, 55, 58, 60, 67, 68, 69, 73], "from_hp_to_intx_stat": 7, "kwarg": [7, 59, 60, 61, 73], "perform": [7, 19, 31, 44, 49, 54, 55, 59, 68, 71, 73], "self": [7, 67, 68, 69, 73], "If": [7, 10, 31, 32, 40, 49, 50, 54, 55, 60, 67, 68, 71, 73], "correct": [7, 15], "otherwis": [7, 67], "desir": [7, 31, 38], "call": [7, 31, 38, 39, 51, 59, 67, 68, 69, 71, 73], "non_block": 7, "memory_format": 7, "preserve_format": 7, "tri": [7, 71], "asynchron": 7, "respect": [7, 71], "host": 7, "possibl": [7, 71], "pin": 7, "set": [7, 10, 31, 34, 44, 51, 55, 60, 68, 71], "even": [7, 71], "match": [7, 36, 49, 71], "other": [7, 12, 60, 69, 71, 73, 76], "randn": [7, 68, 69, 73], "initi": [7, 67, 69], "float64": 7, "5044": 7, "0005": 7, "3310": 7, "0584": 7, "cuda0": 7, "blocksiz": 8, "64": [8, 27, 35, 44, 45, 58, 69, 73], "block": [8, 16, 60, 71], "matrix": [8, 10, 40, 41, 49, 54, 60, 68, 71], "variabl": [8, 10, 19, 20, 60, 71], "cutlass": 9, "mm_config": [10, 40, 41], "float8mmconfig": [10, 40, 41], "configur": [10, 40, 41, 67, 68], "multipl": [10, 31, 40, 41, 49, 54, 68, 71, 73], "involv": [10, 71], "tinygemm": [11, 45, 51, 67, 68], "_weight_int4pack_mm_for_cpu": [11, 45], "version": [11, 68, 73], "least": 11, "6": [11, 51, 67, 68, 71], "It": [12, 15, 17, 19, 71, 73], "pre": [12, 15, 19, 68, 71], "process": [12, 15, 17, 19, 20, 31, 56, 67, 71, 76], "post": [12, 19, 73], "addit": [12, 17, 31, 39, 71, 73], "design": [12, 15, 18], "extend": [12, 67, 71], "behavior": 12, "conjunct": 12, "tensorimpl": 12, "custom": [12, 59, 65, 67, 68, 71, 73], "interact": [12, 67], "qqq": [13, 14, 26], "marlinqqq": 14, "inherit": [14, 17, 73], "choose_qparams_and_quantize_affine_qqq": 14, "dequantize_affine_qqq": 14, "handl": [15, 18, 19, 31, 67], "pattern": [15, 18, 67], "ensur": 15, "preprocess": [15, 18], "manag": 15, "pre_process": 15, "1\u00ba": 15, "transpos": [15, 67, 73], "sinc": [15, 59, 67, 69, 71, 73], "layer": [15, 31, 40, 42, 45, 47, 48, 55, 56, 58, 60, 61, 62, 71, 73], "2\u00ba": 15, "inject": 15, "3\u00ba": 15, "again": [15, 16, 71], "becaus": [15, 67, 69, 71, 73], "dim": [15, 73], "tensor_meta": 16, "subclasstensorarg": 16, "n_block": 16, "scaler_block_s": [16, 27], "quantized_scal": 16, "quantization_factor": 16, "scaler_mean": 16, "quantized_data": 16, "nf4": 16, "qlora": 16, "convert_to_norm_float_weight": 16, "normal": [16, 27, 31, 71], "dequantize_scal": 16, "unpack": [16, 53, 67], "doubl": 16, "scaler": 16, "int8": [16, 46, 47, 48, 50, 51, 62, 64, 67, 73], "per_scaler_block": 16, "factor": [16, 49, 56, 71], "inpt_weight": 16, "double_quantize_scal": 16, "achiev": [16, 71, 73], "calcul": [16, 28, 32, 34, 55, 67, 71], "absmax": 16, "find": [16, 71], "posit": 16, "typic": [16, 17, 32, 67, 69], "per_block": 16, "int16": 16, "n_scaler_block": 16, "get_original_weight": 16, "quantize_tensor_nearest": 16, "float16": [16, 35, 38, 52, 71], "nearest": 16, "round": [16, 28, 32, 73], "up": [16, 51, 67, 68, 71], "most": [17, 67, 71], "doe": [17, 45, 67, 71, 73], "metadata": [17, 67, 73], "step": [17, 31, 67, 71], "requir": [17, 19, 32, 67, 71, 73], "semi": [18, 63, 64, 71], "structur": [18, 63, 64, 68, 69, 71, 73], "matric": [18, 71], "where": [18, 28, 34, 53, 58, 67, 71], "everi": [18, 59, 71, 73], "four": 18, "prune": [18, 60], "conform": 18, "inner_k_til": [19, 45, 68], "8": [19, 20, 28, 32, 44, 45, 67, 68], "core": [19, 29, 67], "tile": [19, 67], "fit": [19, 67, 69], "effici": [19, 68, 71], "function": [19, 31, 35, 50, 51, 59, 60, 61, 64, 68, 69, 71, 73], "affect": [19, 71], "matmul": [19, 42, 67, 71, 73], "pack_dim": [20, 58], "uintx": [20, 58, 67], "smaller": [20, 44, 45, 46, 58, 68, 69], "bit": [20, 27, 43, 44, 53, 58, 73], "width": [20, 44], "than": [20, 67, 71, 73], "standard": [20, 67], "byte": [20, 43, 58], "uintxtensor": 20, "determin": [20, 32, 38, 44, 45, 71], "along": [20, 71], "indic": [20, 30, 32, 71], "last": 20, "256": [27, 45], "name": [28, 29, 30, 51, 56, 60, 64, 71, 73], "qualnam": [28, 29, 30], "boundari": [28, 29, 30], "number": [28, 31, 53, 58, 60, 71, 73], "map": [28, 32, 67, 73], "symmetr": [28, 32, 40, 41, 42, 44, 46, 47, 48, 62, 73], "rang": [28, 71], "sai": [28, 52, 67], "3": [28, 31, 32, 52, 67, 68, 71, 76], "5": [28, 56, 60, 68, 71, 76], "7": [28, 32], "symmetric_no_clipping_err": 28, "variant": [28, 34, 73], "smin": 28, "smax": 28, "min_val_neg": [28, 73], "max_val_po": [28, 73], "By": [28, 71], "individu": [28, 71], "less": [28, 32, 71, 73], "error": [28, 31, 73], "neg": 28, "asymmetr": [28, 32, 44, 45, 46, 51, 58, 67, 68], "directli": [28, 34, 67, 71, 73], "placehold": 29, "yet": [29, 46, 73], "enum": 30, "whether": [30, 31, 32, 45, 51, 58, 73], "quantized_v": 30, "float_val": 30, "mid_point": 30, "example_input": [31, 68, 69], "qtensor_class_list": 31, "aqdefaultlinearweight": 31, "aqint8weightonlyquantizedlinearweight": 31, "aqint8weightonlyquantizedlinearweight2": 31, "aqint8dynamicallyquantizedlinearweight": 31, "filter_fn": [31, 51, 64], "interpol": 31, "85": 31, "manual": [31, 51], "set_inductor_config": [31, 51], "supress_autoquant_error": 31, "min_sqnr": 31, "aq_kwarg": 31, "autoquant": 31, "identifi": 31, "fastest": 31, "over": [31, 71], "potenti": [31, 71], "qtensor": 31, "prepar": [31, 55, 60, 67, 71], "search": [31, 71], "whose": 31, "exchang": 31, "autoquantizablelinearweight": 31, "calibr": [31, 34], "user": [31, 67, 68, 71, 73, 76], "seen": 31, "record": [31, 67], "so": [31, 67, 68, 69, 71, 73], "final": [31, 39, 51, 67, 68, 71], "benchmark": [31, 55], "member": 31, "pick": 31, "result": [31, 49, 53, 54, 67, 71], "highli": 31, "complet": 31, "simpli": [31, 71, 73], "had": [31, 73], "compil": [31, 51, 54, 67, 68, 73], "them": [31, 59, 67], "onc": [31, 71], "proce": 31, "combin": [31, 71, 73], "finalize_autoqu": 31, "been": [31, 73], "log": [31, 73], "nn": [31, 50, 51, 55, 56, 64, 67, 68, 69, 71, 73], "forward": [31, 59, 67, 68, 69, 71, 73], "pass": [31, 34, 59, 67, 73], "fulli": [31, 51, 56, 64, 71], "unless": 31, "list": [31, 36, 56, 60, 67, 68, 73], "default_autoquant_class_list": 31, "callabl": [31, 35, 50, 51, 57, 64], "contain": [31, 55, 56, 71, 73], "second": [31, 49, 67, 76], "stop": 31, "wait": [31, 67], "sever": 31, "automat": [31, 51, 73, 76], "config": [31, 50, 51, 60, 71], "suppress": 31, "accept": 31, "signal": 31, "nois": 31, "ration": 31, "wikipedia": 31, "org": [31, 43, 60, 67, 68, 71], "wiki": 31, "noise_ratio": 31, "v": 31, "non": [31, 67, 71, 73], "impact": [31, 44], "caus": 31, "too": 31, "larg": [31, 73], "numer": [31, 71], "resaon": 31, "40": 31, "adjust": 31, "keyword": 31, "usag": [31, 50], "example_input1": 31, "example_input2": 31, "fp32": [32, 36, 73], "bf16": [32, 67, 68, 71], "fp16": [32, 44], "optioanl": 32, "flag": 32, "exactli": [32, 45, 73], "pad": 32, "convolut": 32, "doesn": [32, 71], "t": [32, 60, 67, 68, 71, 73], "itself": [32, 71, 73], "sure": [32, 67], "correspond": [32, 51, 67, 69, 71, 73], "without": [32, 38, 39, 67, 71], "loss": [32, 71], "But": [32, 67, 73], "won": [32, 73], "gurante": 32, "don": [32, 60, 68, 71], "clamp": [32, 73], "request": [32, 36, 52], "min_val": [34, 67, 73], "max_val": [34, 67, 73], "instead": [34, 45, 59, 67, 68, 71, 73], "observ": [34, 59, 71], "obtain": 34, "track": [34, 67], "param": [34, 39, 60], "nbit": 35, "group_siz": [35, 44, 45, 46, 48, 50, 51, 58, 68], "axi": [35, 52], "compute_dtyp": 35, "str": [35, 51, 56, 57, 60, 64, 73], "verbos": 35, "raw_output": 35, "optimize_weight": 35, "optimize_weights_proximal_legaci": 35, "input_dtyp": 36, "output_dtyp": [36, 37, 52], "uint8": [36, 52, 67], "quant_dtyp": [38, 39], "fake": [38, 39, 50], "awar": [38, 39, 60, 71, 73], "equival": [38, 39, 56, 71], "cast": [38, 39], "valid": 38, "fake_quantize_affin": 39, "consum": 39, "outlier": 39, "mask": [39, 60, 71], "intermedi": 39, "activation_dtyp": [40, 41], "float8_e4m3fn": [40, 41, 42, 67], "weight_dtyp": [40, 41, 42], "pertensor": [40, 41], "perrow": [40, 41], "current": [40, 46, 51, 56, 60, 64, 71, 73], "fast": [40, 41, 71], "accumul": [40, 41], "float8_e4m": 41, "channel": [42, 47, 48, 59, 62], "sub": [43, 58, 71], "expon": [43, 53], "mantissa": [43, 53], "fp6_e3_m2": 43, "fp6_e2_m3": 43, "fp6": 43, "llm": 43, "paper": [43, 71, 76], "arxiv": [43, 60, 71], "ab": [43, 60, 71], "2401": 43, "14112": 43, "repo": 43, "usyd": 43, "fsalab": 43, "fp6_llm": 43, "renam": 43, "fpxtensorcoreaqttensorimpl": 43, "experiment": 43, "merg": 43, "to_affine_quantized_floatx": 43, "bit_width": 44, "packing_bitwidth": 44, "32": [44, 45, 46, 50, 51, 64, 68, 69, 73], "contigu": 44, "gemlit": 44, "triton": [44, 67], "its": [44, 71, 73], "associ": 44, "control": [44, 45, 46, 58, 60, 71], "grain": [44, 45, 46, 58, 73], "hardwar": [44, 67, 71], "leav": 44, "best": [44, 71], "choic": [44, 45], "128": [45, 73], "tensorcoretiledlayout": [45, 67, 68], "group": [45, 46, 58, 67, 68], "tensor_core_til": [45, 67], "int4mm": [45, 68], "aten": [45, 67, 73], "_weight_int4pack_mm": [45, 67], "tradit": 45, "chosen": [45, 71], "hqq": [45, 58, 67], "act_mapping_typ": [46, 47], "token": [46, 47, 62], "produc": 46, "backend": [46, 71], "did": 46, "lower": [46, 67, 71], "flow": [46, 71], "marlinqqqlayout": 46, "cutlassint4packedlayout": 46, "weight_only_decod": 47, "b": 49, "scales1": 49, "multipli": [49, 54, 71], "row": [49, 71], "rais": [49, 50, 54, 73], "assertionerror": [49, 54, 73], "expect": [49, 71, 73], "activation_config": 50, "fakequantizeconfig": 50, "weight_config": 50, "per_token": 50, "is_symmetr": 50, "embed": 50, "valueerror": 50, "apply_tensor_subclass": [51, 64, 67], "inplac": [51, 60, 68], "instanc": [51, 59, 64, 69, 73], "qualifi": [51, 56, 64, 71], "move": [51, 67], "speed": [51, 71], "predefin": 51, "execut": [51, 70, 73, 75], "path": [51, 54, 68], "customiz": 51, "int8_dynamic_activation_int4_weight": 51, "int8_dynamic_activation_int8_weight": [51, 64], "mm": [51, 73], "int4_weight_onli": [51, 67, 68, 69], "int8_weight_onli": 51, "sequenti": [51, 64], "1024": [51, 64, 68, 69], "write": [51, 65], "constructor": [51, 73], "to_affine_quantized_intx": [51, 67], "groupwis": 51, "groupsiz": [51, 52], "apply_weight_qu": 51, "lambda": 51, "x": [51, 58, 68, 69, 73, 76], "int32": [51, 67, 68], "15": [51, 68], "1e": 51, "def": [51, 64, 67, 68, 69, 73], "apply_weight_quant_to_linear": 51, "requires_grad": [51, 67, 73], "block0": 51, "submodul": 51, "fqn": [51, 60, 64], "isinst": [51, 64, 71, 73], "tabl": [52, 67, 71], "per_tensor": 52, "per_axi": 52, "per_group": 52, "low": [53, 71, 73], "00seeemm": 53, "fp6_e3m2": 53, "sign": 53, "mat2": 54, "safe": 54, "consid": [54, 67, 71], "cubla": 54, "fallback": 54, "i": [54, 71], "j": 54, "debug_skip_calibr": 55, "smoothquant": [55, 56], "smoothfakedynamicallyquantizedlinear": [55, 56], "debug": 55, "skip_fqn_list": 56, "cur_fqn": 56, "alpha": 56, "replac": [56, 71], "skip": [56, 60, 71], "being": [56, 67, 71], "input_quant_func": [57, 67], "quant_kwarg": 57, "dict": [57, 60, 73], "uint1": [58, 67], "uint7": [58, 67], "l2": [59, 71], "norm": [59, 60, 71], "buffer": 59, "x_orig": 59, "overridden": 59, "although": [59, 73], "recip": 59, "within": [59, 71], "afterward": 59, "former": 59, "care": [59, 69, 71], "hook": [59, 67], "while": [59, 60, 71, 73], "latter": 59, "silent": 59, "ignor": 59, "sparsity_level": [60, 71], "semi_structured_block_s": 60, "wanda": 60, "sparsifi": [60, 65, 69, 71], "propos": 60, "2306": 60, "11695": 60, "product": 60, "magnitud": [60, 71], "parametr": 60, "preserv": [60, 71], "deepcopi": [60, 68, 73], "squash_mask": [60, 71], "params_to_keep": 60, "params_to_keep_per_lay": 60, "squash": 60, "appropri": [60, 67], "sparse_param": 60, "attach": [60, 71], "kei": [60, 71, 76], "save": [60, 68, 69], "string": 60, "xdoctest": 60, "local": [60, 71], "undefin": 60, "hasattr": 60, "submodule1": 60, "linear1": [60, 68, 69, 73], "foo": 60, "bar": 60, "submodule2": 60, "linear42": 60, "baz": 60, "print": [60, 68, 69, 73, 76], "42": 60, "24": 60, "ones": [60, 67], "update_mask": 60, "tensor_nam": 60, "statist": [60, 67, 71], "retriev": 60, "act_per_input": 60, "Then": [60, 73], "metric": 60, "across": [60, 71, 73], "whole": 60, "simul": [61, 67, 71], "dnynam": 62, "moduel": 63, "sparsify_": 64, "essenti": 64, "semi_sparse_weight": 64, "semisparselayout": 64, "sparsemarlinlayout": 64, "sparse_api": 64, "librari": [65, 69], "gradient": [65, 71], "nativ": [65, 73], "readm": [65, 68, 71], "overal": [65, 68], "introduct": [65, 67], "recent": 65, "highlight": [65, 73, 76], "updat": [65, 68, 69, 71], "guid": [65, 67], "contributor": [65, 68], "serial": [65, 67], "advanc": [65, 73], "lai": 67, "stack": 67, "awq": 67, "gptq": 67, "codebookquantizedtensor": 67, "int1": 67, "float3": 67, "compon": [67, 73], "compos": [67, 71, 73], "overload": [67, 71], "term": [67, 71], "extra": 67, "empti": 67, "dev": 67, "discuss": [67, 73], "1833": 67, "No": [67, 69, 71], "matter": [67, 71], "end": [67, 71, 73, 76], "avail": 67, "later": [67, 73], "float3_e2_m0": 67, "float4_e2_m1": 67, "float4_e3_m0": 67, "float5_e2_m2": 67, "float5_e3_m1": 67, "float6_e2_m3": 67, "float6_e3_m2": 67, "float8_e5m2": 67, "float8_e4m3fnuz": 67, "float8_e5m2fnuz": 67, "plan": 67, "float4": 67, "float6": 67, "thei": [67, 71, 73], "becom": 67, "popular": 67, "part": [67, 71, 73], "uint2": 67, "117208": 67, "outsid": 67, "As": 67, "mention": 67, "criteria": 67, "wide": 67, "adopt": 67, "fundament": [67, 71], "until": 67, "evid": 67, "hopefulli": 67, "amen": 67, "haven": 67, "enough": 67, "ont": 67, "revisit": 67, "intx": 67, "connect": 67, "int4tensor": 67, "previou": 67, "between": [67, 71, 73], "preicison": 67, "mainli": 67, "There": [67, 73], "accommod": 67, "choose_qparams_affine_with_min_max": 67, "min": [67, 73], "int_matmul": 67, "int_scaled_matmul": 67, "reli": [67, 71, 73], "On": [67, 68], "top": [67, 73], "glue": 67, "everyth": 67, "togeth": 67, "build": [67, 71, 73], "construct": 67, "low_precision_v": 67, "high_precision_v": 67, "procedur": 67, "veri": [67, 71], "common": [67, 71], "straightforward": 67, "try": [67, 71, 73], "higher": [67, 73], "high_preicsion_v": 67, "especi": [67, 69, 71], "bitwidth": 67, "codebook": 67, "hardcod": 67, "select": 67, "multi": 67, "dimension": [67, 71], "view": [67, 73], "mkldnn": 67, "coo": [67, 71], "sparse_coo": [67, 71], "sparsetensorimpl": 67, "idea": [67, 71], "nice": [67, 71], "concept": [67, 76], "why": [67, 73, 76], "c": [67, 73], "conflict": 67, "properti": 67, "quantized_linear": 67, "semant": 67, "stai": [67, 68, 73], "develop": 67, "tradition": 67, "come": [67, 71, 72], "demonstr": [67, 68, 73], "purpos": [67, 73], "to_affine_quant": 67, "simplic": 67, "explain": 67, "simplest": [67, 71], "form": [67, 71], "easi": 67, "linear_modul": 67, "runtim": 67, "to_linear_activation_quant": 67, "quantized_weight": 67, "activation_and_weight_quant": 67, "encount": 67, "f": [67, 69, 71, 73], "input_qunat_func": 67, "redispatch": 67, "swap": [67, 71], "fx": 67, "symbolic_trac": 67, "prefer": [67, 68, 73], "easier": 67, "further": [67, 73], "modif": 67, "sampl": 67, "figur": [67, 71], "At": [67, 71], "collect": [67, 71], "thing": [67, 69, 71, 73], "address": 67, "stat": 67, "averag": 67, "calculate_qparam": 67, "affinequantizedminmaxobserv": 67, "insert_observer_": 67, "altern": [67, 73], "observedlinear": 67, "dataset": 67, "complic": [67, 71], "next": 67, "done": [67, 73], "manner": 67, "intend": 67, "autoround": 67, "multitensor": 67, "describ": [67, 69, 71, 76], "advis": 67, "focus": [67, 71], "todai": 67, "low_bit_optim": 67, "similar": [67, 71], "quantized_train": 67, "enabl": 67, "progress": 67, "lot": [67, 71], "includ": [67, 73], "walk": [67, 73, 76], "_convert_weight_to_int4pack": 67, "tensorcoretiledaqttensorimpl": 67, "_quantized_linear_op": 67, "goe": 67, "_aqt_qlinear_dispatch_t": 67, "dispatch": 67, "explan": 67, "wint4": 67, "explor": 68, "instal": 68, "latest": 68, "stabl": 68, "releas": 68, "pip": 68, "nightli": 68, "command": 68, "index": [68, 71], "url": 68, "download": [68, 74, 76], "whl": 68, "cu121": 68, "major": 68, "instruct": 68, "entri": 68, "mutat": 68, "insert": 68, "logic": [68, 73], "toi": [68, 73], "toylinearmodel": [68, 69], "__init__": [68, 69, 73], "super": [68, 69, 73], "linear2": [68, 69, 73], "eval": [68, 69], "faster": [68, 71], "model_bf16": 68, "leverag": [68, 73], "mix": 68, "readi": [68, 73], "in_featur": [68, 69, 73], "out_featur": [68, 73], "tensor_impl_dtyp": 68, "verifi": [68, 69, 73], "roughli": [68, 71], "quarter": 68, "os": 68, "tmp": 68, "int4_model": 68, "pt": 68, "bfloat16_model": 68, "int4_model_size_mb": 68, "getsiz": 68, "bfloat16_model_size_mb": 68, "2f": 68, "mb": [68, 69, 70, 75], "25": 68, "00": [68, 70, 75], "much": [68, 71], "torch_version_at_least_2_5": 68, "benchmark_model": 68, "temporari": 68, "workaround": 68, "num_run": 68, "100": [68, 73], "_dynamo": [68, 73], "reset": 68, "bf16_time": 68, "int4_tim": 68, "time": [68, 71, 73, 76], "3f": 68, "ms": 68, "1fx": 68, "a100": 68, "gpu": [68, 76], "80gb": 68, "30": 68, "393": 68, "410": 68, "9x": 68, "simpl": [68, 71, 73], "workflow": [68, 71], "visit": 68, "would": [68, 71, 73], "forget": 68, "good": [68, 73], "tempfil": 69, "get_model_size_in_byt": 69, "batch_siz": 69, "ref": 69, "namedtemporaryfil": 69, "state_dict": 69, "seek": [69, 71], "load": 69, "meta": 69, "m_load": 69, "load_state_dict": 69, "assign": 69, "re": [69, 73], "assert": [69, 73], "equal": [69, 71], "float_weight1": 69, "float_weight2": 69, "quantized_weight1": 69, "quantized_weight2": 69, "go": [69, 73, 76], "techinqu": 69, "reduct": [69, 71, 73], "around": 69, "4x": 69, "0625": 69, "reason": [69, 71], "avoid": [69, 71], "properli": 69, "003": [70, 75, 76], "total": [70, 75, 76], "galleri": [70, 74, 76], "mem": [70, 75], "templat": [70, 74, 75], "tutorials_sourc": 70, "template_tutori": [70, 75, 76], "neural": 71, "network": [71, 73], "reduc": 71, "overhead": 71, "latenc": 71, "carefulli": 71, "signific": 71, "pai": 71, "price": 71, "qualiti": 71, "accuraci": 71, "f1": 71, "problem": [71, 73], "research": [71, 76], "face": 71, "fragment": 71, "rightfulli": 71, "spent": 71, "compress": 71, "place": 71, "dens": 71, "solv": [71, 73], "focu": [71, 73], "realli": 71, "push": 71, "accur": 71, "concret": 71, "hope": 71, "modular": 71, "acceler": 71, "scratch": [71, 76], "minim": 71, "recov": 71, "algorthim": 71, "realiz": 71, "improv": 71, "trade": 71, "off": 71, "degrad": 71, "architectur": 71, "theoret": 71, "gain": 71, "2x": 71, "analog": 71, "fix": 71, "50": 71, "unstructur": 71, "One": [71, 73], "howev": 71, "close": 71, "relat": 71, "mitig": 71, "retrain": 71, "neglig": 71, "area": 71, "agre": 71, "upon": 71, "consensu": 71, "mind": 71, "thought": 71, "separ": 71, "subproblem": 71, "satisfi": 71, "consist": [71, 73], "answer": 71, "independ": 71, "frontend": 71, "arbitrari": 71, "handoff": 71, "piec": 71, "miss": 71, "natur": [71, 73], "present": 71, "clear": 71, "contract": 71, "7x": 71, "advantag": 71, "anticip": 71, "mani": [71, 73], "solut": 71, "third": 71, "parti": 71, "to_sparse_semi_structur": 71, "sparsesemistructuredtensor": 71, "weightnormsparsifi": 71, "half": 71, "subnetwork": 71, "sparse_config": 71, "mod": [71, 73], "named_modul": 71, "append": 71, "tensor_fqn": 71, "sparse_block_shap": 71, "zeros_per_block": 71, "fakespars": 71, "manipul": 71, "dictionari": 71, "paramer": 71, "parameter": 71, "necessari": [71, 73], "ve": 71, "suitabl": 71, "fuse": [71, 73], "0s": 71, "spot": 71, "definit": 71, "academia": 71, "industri": 71, "often": [71, 73], "interchang": 71, "confus": 71, "distinct": 71, "pretrain": 71, "behind": 71, "box": 71, "those": [71, 73], "loos": 71, "speak": 71, "tightli": 71, "coupl": [71, 73], "nvidia": 71, "csc": 71, "fbgemm": 71, "qnnpack": 71, "descript": 71, "coordin": 71, "vector": 71, "locat": 71, "bsr": 71, "sparse_bsr": 71, "except": [71, 73], "scalar": 71, "csr": 71, "sparse_csr": 71, "sparse_csc": 71, "column": 71, "compact": 71, "sparse_matrix": 71, "1d": 71, "indexptr": 71, "\u00bd": 71, "bitmask": 71, "2bit": 71, "unprun": 71, "quit": [71, 73], "must": 71, "successfulli": 71, "These": [71, 73], "broken": 71, "down": 71, "Not": 71, "sensit": 71, "effect": [71, 73], "subsequ": [71, 73], "infinit": 71, "lost": 71, "degre": 71, "analysi": 71, "drop": 71, "give": [71, 73], "curv": 71, "proxi": 71, "aforement": 71, "smallest": 71, "absolut": 71, "vs": 71, "global": [71, 73], "scope": 71, "impli": 71, "pro": 71, "con": 71, "tradeoff": 71, "span": 71, "threshold": 71, "increas": 71, "complex": 71, "constant": [71, 73], "ctr_mobile_fe": 71, "score": 71, "w": 71, "tenosr": 71, "udpat": 71, "cannot": 71, "histori": 71, "regrow": 71, "dw": 71, "via": 71, "backprop": 71, "pat": 71, "unmask": 71, "resid": 71, "backward": 71, "salienc": 71, "lowest": 71, "l1": 71, "commonli": 71, "shown": 71, "abl": [71, 73], "ident": 71, "repeat": 71, "loop": 71, "shot": 71, "movement": 71, "inform": 71, "tune": 71, "2005": 71, "07683": 71, "rank": [71, 73], "wx": 71, "sqx": 71, "q": 71, "usual": 71, "sort": 71, "wise": 71, "reconstruct": 71, "random": 71, "randomli": 71, "remedi": 71, "line": 71, "item": [71, 76], "ultim": 71, "literatur": 71, "vision": 71, "nlp": [71, 76], "iter": 71, "ctr_feed": 71, "na": 71, "multimask": 71, "pyspeech": 71, "fastna": 71, "approach": [71, 73], "knowledg": [71, 76], "distil": 71, "pdf": 71, "2204": 71, "09656": 71, "arrang": 71, "recal": 71, "counterpart": 71, "slower": 71, "suffici": 71, "flexibl": [71, 73], "98": 71, "benefit": [71, 73], "special": 71, "exhibit": 71, "maintain": 71, "penalti": 71, "expens": [71, 73], "dictat": 71, "characterist": 71, "highest": 71, "wouldn": [71, 73], "visual": 71, "fig": 71, "4x4": 71, "benchmak": 71, "soon": 72, "foundat": 73, "extens": 73, "featur": 73, "autograd": 73, "distribut": 73, "express": 73, "interpos": 73, "namespac": 73, "continu": 73, "seamlessli": 73, "obviou": 73, "int8quantizedlinear": 73, "few": 73, "finer": 73, "intercept": 73, "slightli": 73, "contrast": 73, "long": 73, "better": 73, "clunki": 73, "distributedlinear": 73, "duplic": 73, "bypass": 73, "offer": 73, "outer": 73, "inner": 73, "allgath": 73, "bandwidth": 73, "rest": 73, "read": 73, "document": 73, "zoo": 73, "podcast": 73, "edward": 73, "yang": 73, "begin": 73, "int8_symmetric_quant": 73, "fp32_tensor": 73, "127": 73, "amin": 73, "keepdim": 73, "amax": 73, "zeros_lik": 73, "quantizedlinear": 73, "w_int8": 73, "cl": 73, "new_linear": 73, "left": 73, "toymodel": 73, "float_model": 73, "quantized_model": 73, "child": 73, "named_children": 73, "setattr": 73, "drawback": 73, "suppos": 73, "clean": 73, "limit": 73, "eleg": 73, "pretti": 73, "power": 73, "overrid": 73, "almost": 73, "shard": 73, "ragged": 73, "rag": 73, "nestedtensor": 73, "resourc": 73, "who": 73, "link": [73, 76], "googl": 73, "collab": 73, "flopcount": 73, "memorytrack": 73, "With": 73, "bare": 73, "bone": 73, "int8symmetrictensor": 73, "hold": 73, "staticmethod": 73, "disabl": 73, "__new__": 73, "_make_wrapper_subclass": 73, "storage_offset": 73, "ndim": 73, "__tensor_flatten__": 73, "attribut": 73, "pt2": 73, "__tensor_unflatten__": 73, "tensor_data_dict": 73, "extra_metadata": 73, "outer_s": 73, "outer_strid": 73, "undo": 73, "back": 73, "__repr__": 73, "repr": 73, "ahead": 73, "insid": 73, "int8_tensor": 73, "func": 73, "op_implementations_dict": 73, "conveni": 73, "register_op": 73, "_op": 73, "opoverload": 73, "impl_decor": 73, "op_impl": 73, "wrapper": 73, "particular": 73, "largest": 73, "tell": 73, "desugar": 73, "decor": 73, "surfac": 73, "coverag": 73, "though": 73, "brute": 73, "forc": 73, "repeatedli": 73, "loggingtensor": 73, "_python_dispatch": 73, "return_and_correct_alias": 73, "int8_mm": 73, "detach": 73, "int8_view_op": 73, "out_data": 73, "out_scal": 73, "notic": 73, "quickli": 73, "hit": 73, "background": 73, "decomposit": 73, "live": 73, "decomp": 73, "shrink": 73, "author": [73, 76], "pain": 73, "rather": 73, "underli": 73, "worth": 73, "written": 73, "differenti": 73, "nuanc": 73, "longer": 73, "That": 73, "transposit": 73, "got": 73, "propag": 73, "fact": 73, "themselv": 73, "pointwis": 73, "alwai": 73, "were": 73, "might": 73, "unwrap": 73, "dim0": 73, "dim1": 73, "confirm": 73, "quantized_model_module_swap": 73, "quantized_model_subclass": 73, "subclass_param": 73, "no_grad": 73, "out_module_swap": 73, "allclos": 73, "out_compil": 73, "seri": 73, "wa": 73, "tutorials_python": 74, "zip": [74, 76], "jupyt": [74, 76], "notebook": [74, 76], "tutorials_jupyt": 74, "sphinx": [74, 76], "firstnam": 76, "lastnam": 76, "prerequisit": 76, "v2": 76, "topic": 76, "rand": 76, "0870": 76, "9183": 76, "7696": 76, "3774": 76, "1702": 76, "2919": 76, "2416": 76, "8915": 76, "9341": 76, "7196": 76, "4544": 76, "8347": 76, "1172": 76, "4801": 76, "8118": 76, "practic": 76, "test": 76, "summar": 76, "takeawai": 76, "link1": 76, "link2": 76, "minut": 76, "ipynb": 76}, "objects": {"torchao.dtypes": [[7, 0, 1, "", "AffineQuantizedTensor"], [8, 0, 1, "", "BlockSparseLayout"], [9, 0, 1, "", "CutlassInt4PackedLayout"], [10, 0, 1, "", "Float8Layout"], [11, 0, 1, "", "Int4CPULayout"], [12, 0, 1, "", "Layout"], [13, 0, 1, "", "MarlinQQQLayout"], [14, 0, 1, "", "MarlinQQQTensor"], [15, 0, 1, "", "MarlinSparseLayout"], [16, 0, 1, "", "NF4Tensor"], [17, 0, 1, "", "PlainLayout"], [18, 0, 1, "", "SemiSparseLayout"], [19, 0, 1, "", "TensorCoreTiledLayout"], [20, 0, 1, "", "UintxLayout"], [21, 2, 1, "", "to_affine_quantized_floatx"], [22, 2, 1, "", "to_affine_quantized_floatx_static"], [23, 2, 1, "", "to_affine_quantized_fpx"], [24, 2, 1, "", "to_affine_quantized_intx"], [25, 2, 1, "", "to_affine_quantized_intx_static"], [26, 2, 1, "", "to_marlinqqq_quantized_intx"], [27, 2, 1, "", "to_nf4"]], "torchao.dtypes.AffineQuantizedTensor": [[7, 1, 1, "", "dequantize"], [7, 1, 1, "", "from_hp_to_floatx"], [7, 1, 1, "", "from_hp_to_floatx_static"], [7, 1, 1, "", "from_hp_to_fpx"], [7, 1, 1, "", "from_hp_to_intx"], [7, 1, 1, "", "from_hp_to_intx_static"], [7, 1, 1, "", "to"]], "torchao.dtypes.MarlinQQQTensor": [[14, 1, 1, "", "dequantize"], [14, 1, 1, "", "from_hp_to_intx"]], "torchao.dtypes.MarlinSparseLayout": [[15, 1, 1, "", "pre_process"]], "torchao.dtypes.NF4Tensor": [[16, 1, 1, "", "convert_to_norm_float_weight"], [16, 1, 1, "", "dequantize"], [16, 1, 1, "", "dequantize_scalers"], [16, 1, 1, "", "double_quantize_scalers"], [16, 1, 1, "", "get_original_weight"], [16, 1, 1, "", "quantize_tensor_nearest"]], "torchao.quantization": [[28, 0, 1, "", "MappingType"], [29, 0, 1, "", "TorchAODType"], [30, 0, 1, "", "ZeroPointDomain"], [31, 2, 1, "", "autoquant"], [32, 2, 1, "", "choose_qparams_affine"], [33, 2, 1, "", "choose_qparams_affine_floatx"], [34, 2, 1, "", "choose_qparams_affine_with_min_max"], [35, 2, 1, "", "choose_qparams_and_quantize_affine_hqq"], [36, 2, 1, "", "dequantize_affine"], [37, 2, 1, "", "dequantize_affine_floatx"], [38, 2, 1, "", "fake_quantize_affine"], [39, 2, 1, "", "fake_quantize_affine_cachemask"], [40, 2, 1, "", "float8_dynamic_activation_float8_weight"], [41, 2, 1, "", "float8_static_activation_float8_weight"], [42, 2, 1, "", "float8_weight_only"], [43, 2, 1, "", "fpx_weight_only"], [44, 2, 1, "", "gemlite_uintx_weight_only"], [45, 2, 1, "", "int4_weight_only"], [46, 2, 1, "", "int8_dynamic_activation_int4_weight"], [47, 2, 1, "", "int8_dynamic_activation_int8_weight"], [48, 2, 1, "", "int8_weight_only"], [49, 2, 1, "", "int_scaled_matmul"], [50, 2, 1, "", "intx_quantization_aware_training"], [51, 2, 1, "", "quantize_"], [52, 2, 1, "", "quantize_affine"], [53, 2, 1, "", "quantize_affine_floatx"], [54, 2, 1, "", "safe_int_mm"], [55, 2, 1, "", "smooth_fq_linear_to_inference"], [56, 2, 1, "", "swap_linear_with_smooth_fq_linear"], [57, 2, 1, "", "to_linear_activation_quantized"], [58, 2, 1, "", "uintx_weight_only"]], "torchao": [[4, 3, 0, "-", "sparsity"]], "torchao.sparsity": [[59, 0, 1, "", "PerChannelNormObserver"], [60, 0, 1, "", "WandaSparsifier"], [61, 2, 1, "", "apply_fake_sparsity"], [62, 2, 1, "", "int8_dynamic_activation_int8_semi_sparse_weight"], [63, 2, 1, "", "semi_sparse_weight"], [64, 2, 1, "", "sparsify_"]], "torchao.sparsity.PerChannelNormObserver": [[59, 1, 1, "", "forward"]], "torchao.sparsity.WandaSparsifier": [[60, 1, 1, "", "prepare"], [60, 1, 1, "", "squash_mask"], [60, 1, 1, "", "update_mask"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"]}, "titleterms": {"torchao": [0, 1, 2, 3, 4, 5, 65, 67], "dtype": [0, 6, 67], "layout": [0, 5, 12, 67], "tensor": [0, 5, 67, 72, 73], "subclass": [0, 5, 67, 73], "quantiz": [0, 3, 51, 67, 68, 72, 73], "techniqu": 0, "api": [1, 3, 65], "refer": [1, 65], "python": 1, "kernel": [2, 5, 66, 67], "main": 3, "quantize_": 3, "primit": [3, 67], "other": [3, 5, 67], "sparsiti": [4, 71], "contributor": 5, "guid": [5, 68], "gener": 5, "extend": 5, "ad": [5, 67], "effici": [5, 67], "custom": 5, "triton": 5, "hand": 5, "written": 5, "dispatch": 5, "tensorimpl": [5, 67], "flow": [5, 67, 69], "us": 5, "torch": 5, "compil": 5, "perform": [5, 66], "serial": [5, 69], "featur": 5, "support": [5, 67], "function": [5, 67], "compos": 5, "test": 5, "microbenchmark": 5, "model": [5, 67, 69], "benchmark": 5, "eval": 5, "affinequantizedtensor": 7, "blocksparselayout": 8, "cutlassint4packedlayout": 9, "float8layout": 10, "int4cpulayout": 11, "marlinqqqlayout": 13, "marlinqqqtensor": 14, "marlinsparselayout": 15, "nf4tensor": 16, "plainlayout": 17, "semisparselayout": 18, "tensorcoretiledlayout": 19, "uintxlayout": 20, "to_affine_quantized_floatx": 21, "to_affine_quantized_floatx_stat": 22, "to_affine_quantized_fpx": 23, "to_affine_quantized_intx": 24, "to_affine_quantized_intx_stat": 25, "to_marlinqqq_quantized_intx": 26, "to_nf4": 27, "mappingtyp": 28, "torchaodtyp": 29, "zeropointdomain": 30, "autoqu": 31, "choose_qparams_affin": 32, "choose_qparams_affine_floatx": 33, "choose_qparams_affine_with_min_max": 34, "choose_qparams_and_quantize_affine_hqq": 35, "dequantize_affin": 36, "dequantize_affine_floatx": 37, "fake_quantize_affin": 38, "fake_quantize_affine_cachemask": 39, "float8_dynamic_activation_float8_weight": 40, "float8_static_activation_float8_weight": 41, "float8_weight_onli": 42, "fpx_weight_onli": 43, "gemlite_uintx_weight_onli": 44, "int4_weight_onli": 45, "int8_dynamic_activation_int4_weight": 46, "int8_dynamic_activation_int8_weight": 47, "int8_weight_onli": 48, "int_scaled_matmul": 49, "intx_quantization_aware_train": 50, "quantize_affin": 52, "quantize_affine_floatx": 53, "safe_int_mm": 54, "smooth_fq_linear_to_infer": 55, "swap_linear_with_smooth_fq_linear": 56, "to_linear_activation_quant": 57, "uintx_weight_onli": 58, "perchannelnormobserv": 59, "wandasparsifi": 60, "apply_fake_spars": 61, "int8_dynamic_activation_int8_semi_sparse_weight": 62, "semi_sparse_weight": 63, "sparsifi": 64, "welcom": 65, "document": 65, "get": 65, "start": [65, 68], "develop": 65, "note": 65, "tutori": [65, 76], "overview": [67, 71, 76], "basic": 67, "current": 67, "placehold": 67, "pytorch": 67, "implement": [67, 73], "oper": [67, 73], "integr": 67, "nativ": 67, "factori": 67, "op": 67, "deriv": 67, "algorithm": 67, "weight": 67, "onli": 67, "dynam": 67, "activ": 67, "static": 67, "insert": 67, "observ": 67, "how": 67, "defin": 67, "modul": [67, 73], "add": 67, "calibr": 67, "train": 67, "awar": 67, "low": 67, "bit": 67, "optim": [67, 69], "case": 67, "studi": 67, "int4": 67, "work": 67, "dure": 67, "execut": 67, "save": 67, "load": 67, "quick": 68, "first": 68, "exampl": 68, "next": [68, 73], "step": [68, 73, 76], "deseri": 69, "what": [69, 73], "happen": 69, "when": 69, "an": 69, "comput": [70, 75], "time": [70, 75], "goal": 71, "design": 71, "context": 71, "prune": 71, "configur": 71, "criteria": 71, "strategi": 71, "pattern": 71, "write": [72, 73], "your": [72, 73], "own": [72, 73], "advanc": 72, "ar": 73, "swap": 73, "which": 73, "should": 73, "we": 73, "compar": 73, "output": 73, "templat": 76, "option": 76, "addit": 76, "exercis": 76, "conclus": 76, "further": 76, "read": 76}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 6, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx.ext.todo": 2, "sphinx.ext.viewcode": 1, "sphinx": 56}})
\ No newline at end of file
diff --git a/main/tutorials/template_tutorial.html b/main/tutorials/template_tutorial.html
index b62b0bcbc4..934bff2274 100644
--- a/main/tutorials/template_tutorial.html
+++ b/main/tutorials/template_tutorial.html
@@ -443,11 +443,11 @@
tensor([[0.5060, 0.1671, 0.2317],
- [0.7330, 0.9476, 0.1239],
- [0.3325, 0.8057, 0.8212],
- [0.6673, 0.2430, 0.0813],
- [0.0879, 0.3014, 0.4889]])
+tensor([[0.0870, 0.9183, 0.7696],
+ [0.3774, 0.1702, 0.2919],
+ [0.2416, 0.8915, 0.9341],
+ [0.7196, 0.4544, 0.8347],
+ [0.1172, 0.4801, 0.8118]])