Commit df09264

Merge remote-tracking branch 'origin/main' into wengshiy/embeddingbag_krnl

2 parents: 1c2c154 + 9056c46

86 files changed: +1790 -1524 lines changed

.github/scripts/torchao_model_releases/quantize_and_upload.py

Lines changed: 15 additions & 5 deletions
@@ -568,7 +568,7 @@ def _untie_weights_and_save_locally(model_id):
     """
 
 
-def quantize_and_upload(model_id, quant):
+def quantize_and_upload(model_id, quant, push_to_hub):
     _int8_int4_linear_config = Int8DynamicActivationIntxWeightConfig(
         weight_dtype=torch.int4,
         weight_granularity=PerGroup(32),
@@ -657,9 +657,13 @@ def quantize_and_upload(model_id, quant):
     card = ModelCard(content)
 
     # Push to hub
-    quantized_model.push_to_hub(quantized_model_id, safe_serialization=False)
-    tokenizer.push_to_hub(quantized_model_id)
-    card.push_to_hub(quantized_model_id)
+    if push_to_hub:
+        quantized_model.push_to_hub(quantized_model_id, safe_serialization=False)
+        tokenizer.push_to_hub(quantized_model_id)
+        card.push_to_hub(quantized_model_id)
+    else:
+        quantized_model.save_pretrained(quantized_model_id, safe_serialization=False)
+        tokenizer.save_pretrained(quantized_model_id)
 
     # Manual Testing
     prompt = "Hey, are you conscious? Can you talk to me?"
@@ -700,5 +704,11 @@ def quantize_and_upload(model_id, quant):
         type=str,
         help="Quantization method. Options are FP8, INT4, INT8_INT4, AWQ-INT4",
     )
+    parser.add_argument(
+        "--push_to_hub",
+        action="store_true",
+        default=False,
+        help="Flag to indicate whether push to huggingface hub or not",
+    )
     args = parser.parse_args()
-    quantize_and_upload(args.model_id, args.quant)
+    quantize_and_upload(args.model_id, args.quant, args.push_to_hub)
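
For context, the flag plumbing this diff introduces can be exercised in isolation. The sketch below is illustrative only: it stubs out the quantization and uses print statements in place of the Hugging Face push_to_hub()/save_pretrained() calls shown above, and the function and output-id naming are placeholders, not part of the release script.

# Minimal sketch: --push_to_hub is a store_true flag (default False) that
# selects between uploading to the Hub and saving locally.
import argparse


def quantize_and_release(model_id: str, quant: str, push_to_hub: bool) -> None:
    output_id = f"{model_id}-{quant}"  # placeholder naming scheme
    if push_to_hub:
        print(f"would push {output_id} to the Hugging Face Hub")
    else:
        print(f"would save {output_id} locally")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, required=True)
    parser.add_argument("--quant", type=str, default="FP8")
    parser.add_argument("--push_to_hub", action="store_true", default=False)
    args = parser.parse_args()
    quantize_and_release(args.model_id, args.quant, args.push_to_hub)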

.github/scripts/torchao_model_releases/release.sh

Lines changed: 8 additions & 3 deletions
@@ -14,6 +14,7 @@
 
 # Default quantization options
 default_quants=("FP8" "INT4" "INT8-INT4")
+push_to_hub=""
 # Parse arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -29,6 +30,10 @@ while [[ $# -gt 0 ]]; do
        shift
      done
      ;;
+    --push_to_hub)
+      push_to_hub="--push_to_hub"
+      shift
+      ;;
     *)
       echo "Unknown option: $1"
       exit 1
@@ -38,14 +43,14 @@ done
 # Use default quants if none specified
 if [[ -z "$model_id" ]]; then
   echo "Error: --model_id is required"
-  echo "Usage: $0 --model_id <model_id> [--quants <quant1> [quant2 ...]]"
+  echo "Usage: $0 --model_id <model_id> [--quants <quant1> [quant2 ...]] [--push_to_hub]"
   exit 1
 fi
 if [[ ${#quants[@]} -eq 0 ]]; then
   quants=("${default_quants[@]}")
 fi
 # Run the python command for each quantization option
 for quant in "${quants[@]}"; do
-  echo "Running: python quantize_and_upload.py --model_id $model_id --quant $quant"
-  python quantize_and_upload.py --model_id "$model_id" --quant "$quant"
+  echo "Running: python quantize_and_upload.py --model_id $model_id --quant $quant $push_to_hub"
+  python quantize_and_upload.py --model_id "$model_id" --quant "$quant" $push_to_hub
 done

.github/workflows/release_model.yml

Lines changed: 1 addition & 1 deletion
@@ -43,4 +43,4 @@ jobs:
           pip install .
           HF_MODEL_ID=${{ github.event.inputs.hf_model_id }}
           cd .github/scripts/torchao_model_releases
-          ./release.sh --model_id $HF_MODEL_ID
+          ./release.sh --model_id $HF_MODEL_ID --push_to_hub

benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py

Lines changed: 18 additions & 11 deletions
@@ -35,8 +35,8 @@ class ExperimentConfig:
 @dataclass(frozen=True)
 class ExperimentResult:
     bf16_us: float
-    fp8_us: float
-    fp8_speedup: float
+    scaled_us: float
+    scaled_speedup: float
 
 
 @dataclass(frozen=True)
@@ -48,8 +48,8 @@ class Experiment:
 def get_configs() -> List[ExperimentConfig]:
     # Llama4 shapes
     A_shapes = [(16640, 5120)]
-    B_shapes = [(1, 8192, 5120), (16, 8192, 5120), (128, 8192, 5120)]
-    recipes = [MoEScalingType.FP8_ROWWISE]
+    B_shapes = [(16, 8192, 5120)]
+    recipes = [MoEScalingType.MXFP8, MoEScalingType.FP8_ROWWISE]
     high_precision_dtypes = [torch.bfloat16]
     configs = []
     for A_shape, B_shape, recipe, high_precision_dtype in itertools.product(
@@ -93,7 +93,8 @@ def run_experiment(
     # which represents the right operand.
     n_groups = config.B_shape[0]
     Mg = A.shape[0]
-    offs = generate_jagged_offs(n_groups, Mg, multiple_of=16)
+    token_group_alignment_size = 32 if config.recipe == MoEScalingType.MXFP8 else 16
+    offs = generate_jagged_offs(n_groups, Mg, multiple_of=token_group_alignment_size)
 
     labels = torch.ones(
         (A.shape[0], B_t.shape[-1]), device=device, dtype=torch.bfloat16
@@ -107,6 +108,7 @@ def run_experiment(
         offs,
         labels=labels,
         use_compile=args.compile,
+        fullgraph=False,
     )
     if args.profile:
         profile_fwd_bwd(
@@ -116,18 +118,20 @@ def run_experiment(
             offs,
             labels=labels,
            use_compile=args.compile,
+            fullgraph=False,
            profile_name="bf16_profile",
        )
 
     # benchmark scaled grouped mm with dynamic fp8 rowwise quant
-    fp8_us = bench_fwd_bwd_microseconds(
+    scaled_us = bench_fwd_bwd_microseconds(
         _scaled_grouped_mm,
         A,
         B_t,
         offs,
         scaling_type=config.recipe,
         labels=labels,
         use_compile=args.compile,
+        fullgraph=False,
     )
     if args.profile:
         profile_fwd_bwd(
@@ -139,22 +143,24 @@ def run_experiment(
             labels=labels,
             use_compile=args.compile,
             profile_name="scaled_profile",
+            fullgraph=False,
         )
 
     return ExperimentResult(
         bf16_us=round(bf16_us, 3),
-        fp8_us=round(fp8_us, 3),
-        fp8_speedup=round(bf16_us / fp8_us, 3),
+        scaled_us=round(scaled_us, 3),
+        scaled_speedup=round(bf16_us / scaled_us, 3),
     )
 
 
 def print_results(experiments: List[Experiment]):
     headers = [
         "A_shape",
         "B_shape",
+        "recipe",
         "bf16_time_us",
         "scaled_time_us",
-        "fp8_speedup",
+        "scaled_speedup",
     ]
     rows = []
     for experiment in experiments:
@@ -164,9 +170,10 @@ def print_results(experiments: List[Experiment]):
         [
             A_shape,
             B_shape,
+            experiment.config.recipe,
             experiment.result.bf16_us,
-            experiment.result.fp8_us,
-            f"{experiment.result.fp8_speedup}x",
+            experiment.result.scaled_us,
+            f"{experiment.result.scaled_speedup}x",
         ]
     )
     print(tabulate(rows, headers=headers))
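
A note on the alignment change above: the benchmark now sizes each token group to a multiple of 32 for MXFP8 (presumably matching MXFP8's 32-element scaling blocks) versus 16 for FP8 rowwise. The helper below is a standalone illustration of producing aligned group offsets; it is not torchao's generate_jagged_offs, and the even-split strategy is an assumption made for the example.

import torch


def aligned_group_offsets(n_groups: int, total_tokens: int, multiple_of: int) -> torch.Tensor:
    # Split total_tokens into n_groups contiguous groups whose sizes are all
    # multiples of `multiple_of`, returning cumulative end offsets (the format
    # a jagged/grouped GEMM typically expects).
    assert total_tokens % multiple_of == 0
    blocks = total_tokens // multiple_of
    sizes = torch.full((n_groups,), blocks // n_groups, dtype=torch.int64)
    sizes[: blocks % n_groups] += 1  # hand leftover blocks to the first groups
    return torch.cumsum(sizes * multiple_of, dim=0)


offs_rowwise = aligned_group_offsets(16, 16640, multiple_of=16)  # FP8 rowwise alignment
offs_mxfp8 = aligned_group_offsets(16, 16640, multiple_of=32)    # MXFP8 alignment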

scripts/clean_release_notes.py

Lines changed: 2 additions & 2 deletions
@@ -89,6 +89,7 @@
     "topic: performance": "Performance",
     "topic: documentation": "Documentation",
     "topic: for developer": "Developers",
+    "topic: not user facing": "Not User Facing",
 }
 
 
@@ -123,6 +124,7 @@ def clean_release_notes():
         "Performance": [],
         "Documentation": [],
         "Developers": [],
+        "Not User Facing": [],
     }
     with open(input_file, "r") as in_f, open(output_file, "a") as out_f:
         for line in in_f.readlines():
@@ -195,8 +197,6 @@ def get_commit_category(
     pr_number = parse_pr_number(commit_line)
     if pr_number in pr_number_to_label:
         label = pr_number_to_label[pr_number]
-        if label == "topic: not user facing":
-            return None
         if label in GITHUB_LABEL_TO_CATEGORY:
             return GITHUB_LABEL_TO_CATEGORY[label]
     elif any(x in commit_line.lower() for x in ["revert", "version.txt"]):

setup.py

Lines changed: 6 additions & 8 deletions
@@ -433,6 +433,7 @@ def get_extensions():
         extra_link_args.append("/DEBUG")
 
     rocm_sparse_marlin_supported = False
+    rocm_tiled_layout_supported = False
     if use_rocm:
         # naive search for hipblalst.h, if any found contain HIPBLASLT_ORDER_COL16 and VEC_EXT
         found_col16 = False
@@ -488,8 +489,11 @@ def get_extensions():
         # Define ROCm source directories
         rocm_source_dirs = [
             os.path.join(extensions_dir, "rocm", "swizzle"),
-            os.path.join(extensions_dir, "cuda", "tensor_core_tiled_layout"),
         ]
+        if rocm_tiled_layout_supported:
+            rocm_source_dirs.append(
+                os.path.join(extensions_dir, "cuda", "tensor_core_tiled_layout")
+            )
         if rocm_sparse_marlin_supported:
             rocm_source_dirs.extend([os.path.join(extensions_dir, "cuda", "sparse_marlin")])
 
@@ -512,14 +516,8 @@ def get_extensions():
         sources = [s for s in sources if s not in mxfp8_sources_to_exclude]
 
         # TOOD: Remove this and use what CUDA has once we fix all the builds.
+        # TODO: Add support for other ROCm GPUs
         if use_rocm:
-            # Add ROCm GPU architecture check
-            gpu_arch = None
-            if torch.cuda.is_available():
-                gpu_arch = torch.cuda.get_device_properties(0).name
-            if gpu_arch and gpu_arch != "gfx942":
-                print(f"Warning: Unsupported ROCm GPU architecture: {gpu_arch}")
-                print("Currently only gfx942 is supported. Compiling only for gfx942.")
             extra_compile_args["nvcc"].append("--offload-arch=gfx942")
             sources += rocm_sources
         else:

test/dtypes/test_affine_quantized_float.py

Lines changed: 27 additions & 22 deletions
@@ -14,7 +14,8 @@
 import pytest
 import torch
 from torch._inductor.test_case import TestCase as InductorTestCase
-from torch.profiler import ProfilerActivity, profile
+from torch._inductor.utils import run_and_get_code
+from torch.testing import FileCheck
 from torch.testing._internal import common_utils
 
 from torchao.dtypes.floatx.float8_layout import Float8AQTTensorImpl, preprocess_scale
@@ -766,32 +767,36 @@ def test_expected_kernels_on_gpu(self, granularity, float8_config_version):
             config,
         )
 
-        m = torch.compile(m, mode="default")
+        m = torch.compile(m)
         x = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
-
-        # warm up
-        _ = m(x)
-        # capture trace
-        with profile(activities=[ProfilerActivity.CUDA]) as prof:
-            _ = m(x)
-
-        cuda_kernel_events = [x for x in prof.key_averages() if x.cuda_time > 0]
-
-        if granularity == PerTensor():
+        out, code = run_and_get_code(m, x)
+
+        # triton kernel call looks like:
+        # triton_per_fused__scaled_mm__to_copy_abs_amax_clamp_clone_div_expand_permute_transpose_unsqueeze_view_0.run(arg3_1, buf1, buf2, 128, 256, stream=stream0)
+        # scaled_mm call looks like:
+        # extern_kernels._scaled_mm(buf1, reinterpret_tensor(arg0_1, (256, 512), (1, 256), 0), buf2, reinterpret_tensor(arg1_1, (1, 512), (1, 1), 0), arg2_1, out_dtype=torch.bfloat16, use_fast_accum=True, out=buf3)
+        if granularity == PerRow():
+            # one triton kernel for quantizing the activation
+            FileCheck().check("def call(").check_count(".run(", 1, exactly=True).run(
+                code[0]
+            )
+            # one scaled_mm call
+            FileCheck().check("def call(").check_count(
+                "._scaled_mm(", 1, exactly=True
+            ).run(code[0])
+        else:
+            assert granularity == PerTensor(), "unsupported"
+            # three triton kernels for quantizing the activation:
             # kernel 1: x_max_tmp = max(x, ...)
             # kernel 2: x_max = max(x_max_tmp)
             # kernel 3: x_float8 = to_float8(x, x_max)
-            # kernel 4: gemm
-            assert len(cuda_kernel_events) == 4, (
-                f"too many cuda kernels: {cuda_kernel_events}"
-            )
-        else:
-            assert granularity == PerRow()
-            # kernel 1: x_float8 = to_float8(x)
-            # kernel 2: gemm
-            assert len(cuda_kernel_events) == 2, (
-                f"too many cuda kernels: {cuda_kernel_events}"
+            FileCheck().check("def call(").check_count(".run(", 3, exactly=True).run(
+                code[0]
             )
+            # one scaled_mm call
+            FileCheck().check("def call(").check_count(
+                "._scaled_mm(", 1, exactly=True
+            ).run(code[0])
 
 
 common_utils.instantiate_parametrized_tests(TestAffineQuantizedFloat8Compile)
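
The rewritten test above swaps profiler-event counting for inspection of the inductor-generated code. As a rough standalone illustration of that pattern (a toy function in place of the quantized model, and only the generic "def call(" wrapper check, which assumes a working inductor backend on the machine), run_and_get_code returns the compiled output together with the generated source strings, which FileCheck can then assert on:

import torch
from torch._inductor.utils import run_and_get_code
from torch.testing import FileCheck


def f(x):
    return torch.relu(x) + 1


compiled_f = torch.compile(f)
x = torch.randn(32, 32)

# run_and_get_code runs the compiled function and also returns the inductor
# wrapper code it generated, as a list of source strings.
out, code = run_and_get_code(compiled_f, x)

# Assert on the generated code, analogous to how the test above checks triton
# kernel and _scaled_mm call counts.
FileCheck().check("def call(").run(code[0])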

test/dtypes/test_nf4.py

Lines changed: 3 additions & 3 deletions
@@ -43,7 +43,7 @@
     to_nf4,
 )
 from torchao.testing.utils import skip_if_rocm
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_7
+from torchao.utils import torch_version_at_least
 
 bnb_available = False
 
@@ -123,7 +123,7 @@ def test_backward_dtype_match(self, dtype: torch.dtype):
     @unittest.skipIf(not bnb_available, "Need bnb availble")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @unittest.skipIf(
-        TORCH_VERSION_AT_LEAST_2_7, reason="Failing in CI"
+        torch_version_at_least("2.7.0"), reason="Failing in CI"
     )  # TODO: fix this
     @skip_if_rocm("ROCm enablement in progress")
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
@@ -150,7 +150,7 @@ def test_reconstruction_qlora_vs_bnb(self, dtype: torch.dtype):
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
     @skip_if_rocm("ROCm enablement in progress")
     @unittest.skipIf(
-        TORCH_VERSION_AT_LEAST_2_7, reason="Failing in CI"
+        torch_version_at_least("2.7.0"), reason="Failing in CI"
     )  # TODO: fix this
     @parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32])
     def test_nf4_bnb_linear(self, dtype: torch.dtype):

test/integration/test_integration.py

Lines changed: 3 additions & 3 deletions
@@ -76,13 +76,13 @@
 )
 from torchao.testing.utils import skip_if_rocm
 from torchao.utils import (
-    TORCH_VERSION_AT_LEAST_2_7,
     benchmark_model,
     check_cpu_version,
     check_xpu_version,
     is_fbcode,
     is_sm_at_least_89,
     is_sm_at_least_90,
+    torch_version_at_least,
     unwrap_tensor_subclass,
 )
 
@@ -1883,7 +1883,7 @@ def forward(self, x):
         model(x)
 
         api(model)
-        if not TORCH_VERSION_AT_LEAST_2_7:
+        if not torch_version_at_least("2.7.0"):
             unwrap_tensor_subclass(model)
 
         # running model
@@ -1942,7 +1942,7 @@ def forward(self, x):
         model(x)
 
         api(model)
-        if not TORCH_VERSION_AT_LEAST_2_7:
+        if not torch_version_at_least("2.7.0"):
             unwrap_tensor_subclass(model)
 
         # running model

test/integration/test_vllm.py

Lines changed: 2 additions & 2 deletions
@@ -17,9 +17,9 @@
 import torch
 
 from packaging import version
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_8
+from torchao.utils import torch_version_at_least
 
-if not TORCH_VERSION_AT_LEAST_2_8:
+if not torch_version_at_least("2.8.0"):
     pytest.skip("Requires PyTorch 2.8 or higher", allow_module_level=True)
 
 
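
These test changes, together with the test_nf4.py and test_integration.py diffs above, migrate from the fixed TORCH_VERSION_AT_LEAST_* constants to the callable torch_version_at_least("X.Y.Z") helper. For reference only, a check with the same shape can be written with packaging.version (which test_vllm.py already imports); this sketch is not torchao.utils' implementation and the pre-release handling is an assumption:

import torch
from packaging import version


def version_at_least(min_version: str) -> bool:
    # Compare the installed torch version against a minimum, ignoring dev/local
    # suffixes (e.g. "2.8.0a0" or "2.8.0+cu126" both count as 2.8.0 here).
    installed = version.parse(version.parse(torch.__version__).base_version)
    return installed >= version.parse(min_version)


print(version_at_least("2.8.0"))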
