
Commit 02a1bc4

Merge remote-tracking branch 'origin/main' into wengshiy/int8_scaled_embedding_bag

2 parents e846df7 + 0d3217d


47 files changed: +2144, -786 lines

.github/scripts/torchao_model_releases/README.md

Lines changed: 9 additions & 2 deletions

@@ -119,7 +119,7 @@ uv pip install vllm --pre --extra-index-url https://download.pytorch.org/whl/nig
 
 After environment is setup, we can run eval:
 ```
-sh eval.sh --eval_type latency --model_ids Qwen/Qwen3-8B --batch_sizes 1,256
+sh eval.sh --eval_type latency --model_ids Qwen/Qwen3-8B --batch_sizes 1 256
 ```
 
 #### Model Quality Eval
@@ -129,9 +129,16 @@ uv pip install lm-eval
 ```
 After environment is setup, we can run eval:
 ```
-sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag,mmlu
+sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu
 ```
 
+Note: you can pass in `--use_cache` if the eval task failed during the middle of the run
+and you don't want to re-run all evals.
+```
+sh eval.sh --eval_type quality --model_ids Qwen/Qwen3-8B --tasks hellaswag mmlu --use_cache
+```
+
+
 #### Summarize results
 After we have finished all evals for each model, we can summarize the results with:
 ```

.github/scripts/torchao_model_releases/eval.sh

Lines changed: 11 additions & 2 deletions

@@ -9,7 +9,7 @@ set -e
 source eval_env_checks.sh
 
 usage() {
-  echo "Usage: $0 --model_ids <model1> <model2> ... [--eval_type <all|memory|latency|quality>] [--batch_sizes <batch_sizes>] [--tasks <tasks>]"
+  echo "Usage: $0 --model_ids <model1> <model2> ... [--eval_type <all|memory|latency|quality>] [--batch_sizes <batch_sizes>] [--tasks <tasks>] [--use_cache]"
   echo "Defaults:"
   echo " batch_sizes: 1 256"
   echo " tasks: mmlu"
@@ -20,6 +20,7 @@ EVAL_TYPE="all"
 # these will be parsed in the other scripts
 BATCH_SIZES="1 256" # Default for latency eval
 TASKS="mmlu" # Default for quality eval
+USE_CACHE=false # default: do not use cache
 # Parse arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -58,6 +59,10 @@ while [[ $# -gt 0 ]]; do
       TASKS="$1"
       shift
       ;;
+    --use_cache)
+      USE_CACHE=true
+      shift
+      ;;
     *)
       echo "Unknown argument: $1"
       usage
@@ -82,7 +87,11 @@ run_latency() {
 run_quality() {
   check_lm_eval
   local model_id="$1"
-  sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
+  if $USE_CACHE; then
+    sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS --use_cache
+  else
+    sh eval_quality.sh --model_ids "$model_id" --tasks $TASKS
+  fi
 }
 for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
   case "$EVAL_TYPE" in

.github/scripts/torchao_model_releases/eval_quality.sh

Lines changed: 15 additions & 7 deletions

@@ -11,6 +11,7 @@ check_lm_eval
 
 MODEL_ID_ARRAY=()
 TASK_ARRAY=("mmlu") # default can be overwritten by user input
+USE_CACHE=false # default: do not use cache
 # Parse arguments
 while [[ $# -gt 0 ]]; do
   case "$1" in
@@ -29,9 +30,13 @@ while [[ $# -gt 0 ]]; do
         shift
       done
       ;;
+    --use_cache)
+      USE_CACHE=true
+      shift
+      ;;
     *)
       echo "Unknown argument: $1"
-      echo "Usage: $0 --model_id <model_id> [--tasks <tasks> (comma-separated, e.g. mmlu,arc_challenge, default mmlu)]"
+      echo "Usage: $0 --model_id <model_id> [--tasks <tasks> (comma-separated, e.g. mmlu,arc_challenge, default mmlu)] [--use_cache]"
       exit 1
       ;;
   esac
@@ -51,16 +56,19 @@ for MODEL_ID in "${MODEL_ID_ARRAY[@]}"; do
     EVAL_CACHE_DB_PREFIX="/tmp/${SAFE_MODEL_ID}_quality_${TASK}"
     mkdir -p "${EVAL_CACHE_DB_PREFIX}"
     echo "Running model quality (accuracy) evaluation for model $MODEL_ID on task $TASK"
-
-    lm_eval \
+    LM_EVAL_CMD="lm_eval \
       --model hf \
-      --model_args pretrained="$MODEL_ID" \
-      --tasks "$TASK" \
+      --model_args pretrained=\"$MODEL_ID\" \
+      --tasks \"$TASK\" \
       --device cuda:0 \
-      --use_cache "$EVAL_CACHE_DB_PREFIX" \
       --batch_size auto \
-      --output_path "$RESULTS_DIR" > "$OUTPUT_FILE" 2>&1
+      --output_path \"$RESULTS_DIR\""
+
+    if $USE_CACHE; then
+      LM_EVAL_CMD="$LM_EVAL_CMD --use_cache \"$EVAL_CACHE_DB_PREFIX\""
+    fi
 
+    eval "$LM_EVAL_CMD" > "$OUTPUT_FILE" 2>&1
     echo "Quality eval output for task '$TASK' saved to $OUTPUT_FILE"
   done
   echo "======================== Eval Model Quality $MODEL_ID End =================="

benchmarks/benchmark_e2e_fp8_sparse_linear.py

Lines changed: 15 additions & 0 deletions

@@ -40,6 +40,18 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
     input_tensor = torch.randn(num_tokens, hidden_size).to(torch.bfloat16).cuda()
     fp16_time = benchmark_microseconds(ffn_ref, input_tensor)
 
+    # Sparsify-only benchmarks
+    ao_fast_sparsification_time = benchmark_microseconds(
+        torch.ops.torchao.sparse24_sm90_sparsify(
+            input_tensor,
+            "cutlass",
+            "identity",
+            "largest",
+            dtype=torch.float8_e4m3fn,
+        )
+    )
+    cusparselt_time = benchmark_microseconds(torch._cslt_compress, input_tensor)
+
     # bf16
     ffn_clone = (
         nn.Sequential(
@@ -117,7 +129,10 @@ def benchmark(num_tokens, hidden_size=8192, intermediate_size=8192):
         "fp8_c_time (us)": fp8_c_time,
         "fp8_c_sparse_time (us)": fp8_c_sparse_time,
         "fp8_c_activation_sparse_time (us)": fp8_c_activation_sparse_time,
+        "ao_fast_sparsification_time (us)": ao_fast_sparsification_time,
+        "cusparselt_compress_time (us)": cusparselt_time,
         "speedup": fp8_c_time / fp8_c_activation_sparse_time,
+        "sparsify_speedup": cusparselt_time / ao_fast_sparsification_time,
     }
 
 

benchmarks/prototype/moe_training/mxfp8/bench_all_to_all_v.py

Lines changed: 240 additions & 0 deletions (new file)

@@ -0,0 +1,240 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
######################################################################
#
# To run these benchmarks, use the following command:
#
# torchrun --nproc-per-node=8 --local-ranks-filter=0 benchmarks/prototype/moe_training/mxfp8/bench_all_to_all_v.py
#
#######################################################################
import os
import time
from dataclasses import dataclass
from typing import List

import torch
from tabulate import tabulate
from torch import distributed as dist
from torch.distributed._functional_collectives import (
    all_to_all_single_autograd,
)
from tqdm import tqdm

from torchao.prototype.moe_training.kernels.mxfp8.comms import (
    mxfp8_on_device_all_to_all_v,
)

device = torch.device("cuda")


@dataclass(frozen=True)
class ExperimentConfig:
    input_shape: tuple[int]


@dataclass(frozen=True)
class ExperimentResult:
    bf16_us: float
    mxfp8_us: float


@dataclass(frozen=True)
class Experiment:
    config: ExperimentConfig
    result: ExperimentResult


def get_configs() -> List[ExperimentConfig]:
    # (batch_size, seq_len, dim)
    input_shapes = [
        (8, 8192, 5120),
    ]
    configs = []
    for shape in input_shapes:
        configs.append(
            ExperimentConfig(
                input_shape=shape,
            )
        )
    return configs


def run_experiment(config: ExperimentConfig) -> ExperimentResult:
    batch_size, seq_len, dim = config.input_shape
    x = torch.randn(
        (batch_size * seq_len, dim),
        dtype=torch.bfloat16,
        device=device,
    )
    ref_x = x.detach().clone()

    # Max output tokens per rank is worst case where one rank receives all tokens
    input_tokens_per_rank = batch_size * seq_len
    max_output_tokens_per_rank = input_tokens_per_rank * dist.get_world_size()

    def using_bf16(
        input_tensor: torch.Tensor, input_splits: torch.Tensor
    ) -> torch.Tensor:
        # Calculate output splits from input splits
        output_splits = torch.empty_like(input_splits)
        dist.all_to_all_single(output_splits, input_splits)

        # Perform all-to-all
        out = all_to_all_single_autograd(
            input_tensor,
            output_splits.tolist(),
            input_splits.tolist(),
            dist.group.WORLD,
        )
        out = torch.ops._c10d_functional.wait_tensor(out)
        return out

    def using_mxfp8(
        input_tensor: torch.Tensor, input_splits: torch.Tensor
    ) -> torch.Tensor:
        output, output_splits = mxfp8_on_device_all_to_all_v(
            input_tensor,
            input_splits,
            max_output_tokens_per_rank,
            dist.group.WORLD.group_name,
        )
        output = torch.ops._c10d_functional.wait_tensor(output)
        output_splits = torch.ops._c10d_functional.wait_tensor(output_splits)
        return output

    def warmup(func_no_args):
        for _ in range(2):
            func_no_args()

    num_splits = dist.get_world_size()
    input_splits = generate_split_sizes(
        num_splits, input_tokens_per_rank, device=device
    )

    print(
        "Benchmarking using bf16",
        "batch_size",
        batch_size,
        "seq_len",
        seq_len,
        "dim",
        dim,
        "input_tokens_per_rank",
        input_tokens_per_rank,
        "max_output_tokens_per_rank",
        max_output_tokens_per_rank,
    )
    warmup(lambda: using_bf16(ref_x, input_splits))
    start_ns = time.perf_counter()
    using_bf16(ref_x, input_splits)
    end_ns = time.perf_counter()
    bf16_us = (end_ns - start_ns) * 1e6

    print(
        "Benchmarking using_mxfp8",
        "batch_size",
        batch_size,
        "seq_len",
        seq_len,
        "dim",
        dim,
        "input_tokens_per_rank",
        input_tokens_per_rank,
        "max_output_tokens_per_rank",
        max_output_tokens_per_rank,
    )
    warmup(lambda: using_mxfp8(x, input_splits))
    start_ns = time.perf_counter()
    using_mxfp8(x, input_splits)
    end_ns = time.perf_counter()
    mxfp8_us = (end_ns - start_ns) * 1e6

    return ExperimentResult(
        bf16_us=bf16_us,
        mxfp8_us=mxfp8_us,
    )


def print_results(experiments: List[Experiment]):
    headers = [
        "input_shape",
        "num_splits",
        "bf16_us",
        "mxfp8_us",
    ]
    rows = []
    num_splits = dist.get_world_size()
    for experiment in experiments:
        rows.append(
            [
                str(experiment.config.input_shape),
                num_splits,
                experiment.result.bf16_us,
                experiment.result.mxfp8_us,
            ]
        )
    print(tabulate(rows, headers=headers))


def generate_split_sizes(K: int, N: int, device: str = "cuda") -> torch.Tensor:
    """
    Generates a tensor of K random non-negative integers that sum to N.
    Used for testing mxfp8_all_to_all_v implementation.
    """
    if K <= 0:
        raise ValueError("K must be a positive integer.")
    if N < 0:
        raise ValueError("N must be a non-negative integer.")

    if K == 1:
        return torch.tensor([N], dtype=torch.long, device=device)

    # Generate K-1 random "dividers" in the range [0, N].
    dividers = torch.randint(0, N + 1, (K - 1,), device=device)

    # Add 0 and N to the set of dividers to form the boundaries.
    boundaries = torch.cat(
        [torch.tensor([0], device=device), dividers, torch.tensor([N], device=device)]
    )

    # Sort the boundaries to ensure they are in order
    sorted_boundaries = torch.sort(boundaries).values

    # The K integers are the differences between consecutive boundaries (will sum to N)
    result = sorted_boundaries[1:] - sorted_boundaries[:-1]

    return result.to(dtype=torch.int64)


def main():
    torch.random.manual_seed(123)

    # Set up process group
    setup_distributed()

    # Generate experiment configs
    configs = get_configs()
    results = []
    for config in tqdm(configs):
        result = run_experiment(config)
        results.append(Experiment(config=config, result=result))

    # Use Tabulate to print results
    print_results(results)

    # Clean up process group
    dist.destroy_process_group()


def setup_distributed():
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)


if __name__ == "__main__":
    main()
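As the header comment notes, the benchmark is launched with torchrun; a launch sketch, assuming 8 local GPUs (each rank pins one device via `torch.cuda.set_device(rank)`):

```
# torchrun sets the RANK and WORLD_SIZE environment variables that
# setup_distributed() reads before init_process_group("nccl", ...).
# --local-ranks-filter=0 limits console output to rank 0.
torchrun --nproc-per-node=8 --local-ranks-filter=0 \
    benchmarks/prototype/moe_training/mxfp8/bench_all_to_all_v.py
```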
