pytorch
diff --git a/.github/workflows/1xL4_tests.yml b/.github/workflows/1xL4_tests.yml
Lines changed: 1 addition & 0 deletions
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/benchmarks/benchmark_blockwise_scaled_linear_triton.py b/benchmarks/benchmark_blockwise_scaled_linear_triton.py
Lines changed: 1 addition & 1 deletion
diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py
Lines changed: 2 additions & 2 deletions
diff --git a/test/prototype/test_blockwise_triton.py b/test/kernel/test_blockwise_triton.py
(renamed: test/prototype/test_blockwise_triton.py to test/kernel/test_blockwise_triton.py)
Lines changed: 1 addition & 1 deletion
diff --git a/test/prototype/moe_training/test_fsdp.py b/test/prototype/moe_training/test_fsdp.py
Lines changed: 1 addition & 4 deletions
diff --git a/test/prototype/moe_training/test_fsdp_tp.py b/test/prototype/moe_training/test_fsdp_tp.py
Lines changed: 1 addition & 1 deletion
diff --git a/test/prototype/moe_training/test_tp.py b/test/prototype/moe_training/test_tp.py
Lines changed: 2 additions & 5 deletions
diff --git a/test/prototype/moe_training/test_training.py b/test/prototype/moe_training/test_training.py
Lines changed: 2 additions & 2 deletions
diff --git a/test/prototype/mx_formats/test_mx_tensor.py b/test/prototype/mx_formats/test_mx_tensor.py
Lines changed: 2 additions & 8 deletions
@@ -51,3 +51,4 @@ jobs:
         pytest test/dtypes/test_affine_quantized_float.py --verbose -s
         ./test/float8/test_everything_single_gpu.sh
         python test/quantization/quantize_/workflows/float8/test_float8_tensor.py
+        python test/kernel/test_blockwise_triton.py --verbose -s
@@ -254,7 +254,7 @@ If you believe there's other CUDA kernels we should be taking a closer look at p
 
 TorchAO is integrated into some of the leading open-source libraries including:
 
-* Unsloth for QAT, blog post coming soon!
+* Unsloth now supports QAT: [Read blog](https://docs.unsloth.ai/new/quantization-aware-training-qat) and [guide](https://docs.unsloth.ai/new/quantization-aware-training-qat#qat--lora-finetuning).
 * HuggingFace transformers with a [builtin inference backend](https://huggingface.co/docs/transformers/main/quantization/torchao) and [low bit optimizers](https://github.com/huggingface/transformers/pull/31865)
 * HuggingFace diffusers best practices with `torch.compile` and TorchAO in a standalone repo [diffusers-torchao](https://github.com/huggingface/diffusers/blob/main/docs/source/en/quantization/torchao.md)
 * vLLM for LLM serving: [usage](https://docs.vllm.ai/en/latest/features/quantization/torchao.html), [detailed docs](https://docs.pytorch.org/ao/main/torchao_vllm_integration.html)
 
@@ -13,7 +13,7 @@
     from triton.testing import do_bench
 
     from torchao.float8.float8_utils import compute_error
-    from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+    from torchao.kernel.blockwise_quantization import (
         blockwise_fp8_gemm,
         fp8_blockwise_act_quant,
         fp8_blockwise_weight_quant,
 
@@ -152,7 +152,7 @@ def test_invalid_granularity(self):
     def test_mismatched_granularity(self):
         with pytest.raises(
             ValueError,
-            match="Different granularities for activation and weight are not supported",
+            match="Unsupported granularity types",
         ):
             Float8DynamicActivationFloat8WeightConfig(
                 granularity=(PerTensor(), PerRow())
@@ -165,7 +165,7 @@ def test_unsupported_granularity(self):
         class UnsupportedGranularity:
             pass
 
-        with pytest.raises(ValueError, match="Invalid granularity types"):
+        with pytest.raises(ValueError, match="Unsupported granularity types"):
             Float8DynamicActivationFloat8WeightConfig(
                 granularity=(UnsupportedGranularity(), UnsupportedGranularity()),
             )
 
@@ -11,7 +11,7 @@
 
 triton = pytest.importorskip("triton", reason="Triton required to run this test")
 
-from torchao.prototype.blockwise_fp8_inference.blockwise_quantization import (
+from torchao.kernel.blockwise_quantization import (
     blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
     fp8_blockwise_weight_dequant,
 
@@ -46,8 +46,8 @@
 
 # this test requires torchtitan
 try:
-    from torchtitan.distributed.expert_parallel import set_token_group_alignment_size_m
     from torchtitan.models.moe import MoE, MoEArgs
+    from torchtitan.models.moe.utils import set_token_group_alignment_size_m
 except ImportError:
     pytest.skip(
         "torchtitan not installed, skipping MoE tests.", allow_module_level=True
@@ -62,9 +62,6 @@ def device_mesh_1d() -> DeviceMesh:
     """
     rank = int(os.environ["RANK"])
     world_size = int(os.environ["WORLD_SIZE"])
-    if not dist.is_initialized():
-        dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
     device_mesh = init_device_mesh("cuda", (world_size,))
     torch.manual_seed(1)
     torch.cuda.set_device(rank)
 
@@ -65,9 +65,9 @@
         ExpertTensorParallel,
         NoParallel,
         TensorParallel,
-        set_token_group_alignment_size_m,
     )
     from torchtitan.models.moe import MoE, MoEArgs
+    from torchtitan.models.moe.utils import set_token_group_alignment_size_m
 except ImportError:
     pytest.skip(
         "torchtitan not installed, skipping MoE tests.", allow_module_level=True
 
@@ -58,14 +58,14 @@
 
 # this test requires torchtitan
 try:
+    from torchtitan.distributed import NoParallel
     from torchtitan.distributed.expert_parallel import (
         ExpertParallel,
         ExpertTensorParallel,
-        NoParallel,
         TensorParallel,
-        set_token_group_alignment_size_m,
     )
     from torchtitan.models.moe import MoE, MoEArgs
+    from torchtitan.models.moe.utils import set_token_group_alignment_size_m
 except ImportError:
     pytest.skip(
         "torchtitan not installed, skipping MoE tests.", allow_module_level=True
@@ -80,9 +80,6 @@ def device_mesh_1d() -> DeviceMesh:
     """
     rank = int(os.environ["RANK"])
     world_size = int(os.environ["WORLD_SIZE"])
-    if not dist.is_initialized():
-        dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
     device_mesh = init_device_mesh("cuda", (world_size,))
     torch.manual_seed(1)
     torch.cuda.set_device(rank)
 
@@ -22,10 +22,10 @@
 
 # this test requires torchtitan
 try:
-    from torchtitan.distributed.expert_parallel import (
+    from torchtitan.models.moe import MoE, MoEArgs
+    from torchtitan.models.moe.utils import (
         set_token_group_alignment_size_m,
     )
-    from torchtitan.models.moe import MoE, MoEArgs
 except ImportError:
     pytest.skip(
         "torchtitan not installed, skipping MoE tests.", allow_module_level=True
 
@@ -116,8 +116,6 @@ def test_some_zeros(elem_dtype):
     _test_mx(data, elem_dtype, block_size)
 
 
-# TODO(future PR): fix and reenable this test
-@pytest.mark.skip(reason="does not pass on B200 yet")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 def test_to_mx_rceil():
     # nan
@@ -131,11 +129,7 @@ def test_to_mx_rceil():
         ],
         dtype=torch.uint32,
     ).view(torch.float32)
-    # fmt: on
-    ground_truth_scale = torch.tensor([255], dtype=torch.uint8).view(
-        torch.float8_e8m0fnu
-    )
-    # fmt: off
+
     ground_truth_fp8 = torch.tensor(
         [
         127, 0, 0, 0, 0, 0, 0, 0,
@@ -149,7 +143,7 @@ def test_to_mx_rceil():
     data_mx = MXTensor.to_mx(
         data_hp, torch.float8_e4m3fn, 32, ScaleCalculationMode.RCEIL
     )
-    torch.testing.assert_close(data_mx.scale, ground_truth_scale)
+    assert torch.isnan(data_mx.scale)
     assert torch.isnan(data_mx.qdata[0])
     assert torch.all(data_mx.qdata[1:] == 0)
     # fp32 denorm