
Commit 1568f88

add mxfp8 and nvfp4 to Llama eval scripts
Summary:

Adds mxfp8 and nvfp4 to the Llama eval scripts.

Results:

```
// bf16 baseline
with-proxy time python torchao/_models/llama/eval.py --checkpoint_path checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --print_model --tasks wikitext winogrande
wikitext: {'alias': 'wikitext', 'word_perplexity,none': 7.5472105433748435, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.459319739134015, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.5452960145272896, 'bits_per_byte_stderr,none': 'N/A'}
winogrande: {'alias': 'winogrande', 'acc,none': 0.7426992896606156, 'acc_stderr,none': 0.012285989618865697}

// mxfp8 with floor scaling; compile turned off as it seemed stuck in coordinate descent tuning
with-proxy time python torchao/_models/llama/eval.py --checkpoint_path checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --print_model --tasks wikitext winogrande --quantization mxfp8
wikitext: {'alias': 'wikitext', 'word_perplexity,none': 7.609070006132819, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.4615491037668933, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.5474983002838458, 'bits_per_byte_stderr,none': 'N/A'}
winogrande: {'alias': 'winogrande', 'acc,none': 0.7292817679558011, 'acc_stderr,none': 0.012487904760626407}

// mxfp8 with rceil scaling
wikitext: {'alias': 'wikitext', 'word_perplexity,none': 7.605445025927753, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.4614188696390065, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.5473697404554175, 'bits_per_byte_stderr,none': 'N/A'}
winogrande: {'alias': 'winogrande', 'acc,none': 0.7387529597474349, 'acc_stderr,none': 0.012346914863415201}

// nvfp4
wikitext: {'alias': 'wikitext', 'word_perplexity,none': 8.44478255417328, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.4903102070118779, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.5756126578938119, 'bits_per_byte_stderr,none': 'N/A'}
winogrande: {'alias': 'winogrande', 'acc,none': 0.7182320441988951, 'acc_stderr,none': 0.012643326011853038}

// float8 rowwise (for comparison to existing technique)
wikitext: {'alias': 'wikitext', 'word_perplexity,none': 7.618818730886612, 'word_perplexity_stderr,none': 'N/A', 'byte_perplexity,none': 1.4618990946965715, 'byte_perplexity_stderr,none': 'N/A', 'bits_per_byte,none': 0.5478437349532752, 'bits_per_byte_stderr,none': 'N/A'}
winogrande: {'alias': 'winogrande', 'acc,none': 0.7371744277821626, 'acc_stderr,none': 0.01237092252726192}
```

ghstack-source-id: 3a2d8ef
ghstack-comment-id: 3581080988
Pull-Request: #3394
1 parent 16aad7c commit 1568f88
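The commit message compares mxfp8 results under `floor` vs `rceil` scale rounding. As background, here is a heavily simplified sketch of the tradeoff, assuming the usual MX recipe of power-of-two per-block scales derived from the block amax; the function and constant names are illustrative and are not torchao's API:

```python
import torch

F8E4M3_MAX = 448.0  # largest magnitude representable in float8_e4m3fn


def block_scale_exponent(block: torch.Tensor, mode: str) -> torch.Tensor:
    # MX scales are pure powers of two; choose exponent e so that
    # block / 2**e fits into the fp8 element range
    raw = torch.log2(block.abs().amax() / F8E4M3_MAX)
    if mode == "floor":
        return torch.floor(raw)  # may clip the largest values in the block
    if mode == "rceil":
        return torch.ceil(raw)  # never clips, at the cost of some headroom
    raise ValueError(f"unknown mode {mode}")


block = torch.randn(32) * 100  # one 32-element MX block
for mode in ("floor", "rceil"):
    e = block_scale_exponent(block, mode)
    q = torch.clamp(block / 2**e, -F8E4M3_MAX, F8E4M3_MAX).to(torch.float8_e4m3fn)
    dequant = q.to(torch.float32) * 2**e
    print(mode, "max abs error:", (block - dequant).abs().max().item())
```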

File tree

2 files changed: 57 additions, 4 deletions


torchao/_models/llama/eval.py

Lines changed: 35 additions & 3 deletions
```diff
@@ -16,6 +16,10 @@

 import torchao
 from torchao._models.llama.model import prepare_inputs_for_model
+from torchao.prototype.mx_formats.inference_workflow import (
+    MXDynamicActivationMXWeightConfig,
+    NVFP4DynamicActivationNVFP4WeightConfig,
+)
 from torchao.quantization import (
     Float8DynamicActivationFloat8WeightConfig,
     Float8WeightOnlyConfig,
@@ -170,15 +174,43 @@ def run_evaluation(
         quantize_(
             model,
             Float8DynamicActivationFloat8WeightConfig(granularity=granularity),
+            filter_fn=lambda mod, fqn: isinstance(mod, torch.nn.Linear)
+            and fqn != "output",
         )
     if quantization == "float8_a1x128_w128x128":
         config = Float8DynamicActivationFloat8WeightConfig(
             granularity=(PerBlock([1, 128]), PerBlock([128, 128])),
             activation_value_lb=1e-12,
         )
         # TODO(future): all workflows in this file should be skipping quantization
-        # of `lm_head`
+        # of `lm_head`/`output`
         quantize_(model, config)
+    if quantization == "mxfp8":
+        config = MXDynamicActivationMXWeightConfig(
+            activation_dtype=torch.float8_e4m3fn,
+            weight_dtype=torch.float8_e4m3fn,
+        )
+        # TODO(future): all workflows in this file should be skipping quantization
+        # of `lm_head`/`output`
+        quantize_(
+            model,
+            config,
+            filter_fn=lambda mod, fqn: isinstance(mod, torch.nn.Linear)
+            and fqn != "output",
+        )
+    if quantization == "nvfp4":
+        config = NVFP4DynamicActivationNVFP4WeightConfig(
+            use_dynamic_per_tensor_scale=True,
+            use_triton_kernel=True,
+        )
+        # TODO(future): all workflows in this file should be skipping quantization
+        # of `lm_head`/`output`
+        quantize_(
+            model,
+            config,
+            filter_fn=lambda mod, fqn: isinstance(mod, torch.nn.Linear)
+            and fqn != "output",
+        )
     if "autoround" in quantization:
         from transformers import AutoTokenizer

@@ -284,8 +316,8 @@ def run_evaluation(

     if compile:
         # TODO(future PR): clean this up
-        if quantization == "float8_a1x128_w128x128":
-            # we don't need max-autotune for float8 blockwise quant
+        if quantization in ("float8_a1x128_w128x128", "mxfp8", "nvfp4"):
+            # we don't need max-autotune for float8 blockwise, mxfp8, or nvfp4 quant
             model = torch.compile(model)
         else:
             model = torch.compile(model, mode="max-autotune", fullgraph=True)
```
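For readers who want the new recipes outside of the eval script, below is a minimal sketch of the pattern the diff adds: apply a config with `quantize_` to every `nn.Linear` except the final `output` projection. The toy model and its shapes are hypothetical stand-ins for the Llama model, and a GPU with mxfp8/nvfp4 support is assumed:

```python
# Minimal sketch (not part of the commit) of the quantization pattern above.
# ToyModel is hypothetical; supported hardware is assumed.
import torch
import torch.nn as nn

from torchao.prototype.mx_formats.inference_workflow import (
    MXDynamicActivationMXWeightConfig,
    NVFP4DynamicActivationNVFP4WeightConfig,
)
from torchao.quantization import quantize_


class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.ffn = nn.Linear(256, 256, bias=False)
        self.output = nn.Linear(256, 128, bias=False)  # plays the role of `lm_head`

    def forward(self, x):
        return self.output(self.ffn(x))


model = ToyModel().to(torch.bfloat16).cuda()

config = MXDynamicActivationMXWeightConfig(
    activation_dtype=torch.float8_e4m3fn,
    weight_dtype=torch.float8_e4m3fn,
)
# nvfp4 variant, mirroring the second new branch in the diff:
# config = NVFP4DynamicActivationNVFP4WeightConfig(
#     use_dynamic_per_tensor_scale=True,
#     use_triton_kernel=True,
# )

# quantize in place, skipping the output projection as eval.py does
quantize_(
    model,
    config,
    filter_fn=lambda mod, fqn: isinstance(mod, torch.nn.Linear) and fqn != "output",
)

# per the compile hunk, these recipes use plain torch.compile (no max-autotune)
model = torch.compile(model)
```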

torchao/prototype/mx_formats/README.md

Lines changed: 22 additions & 1 deletion
````diff
@@ -223,7 +223,28 @@ To reproduce this on supported hardware, you can run the following command:

 ## inference

-Coming soon!
+Eval results on LLaMa 3.1 8B on common tasks. `mxfp8` and `nvfp4` recipes quantize all linears except `lm_head`.
+
+Note: the accuracy results below are WIP and are not optimized yet.
+
+| recipe | wikitext word_perplexity | winogrande |
+| ------ | -------- | ---------- |
+| bfloat16 (baseline) | 7.5472105433748435 | 0.7426992896606156 |
+| mxfp8 | 7.609070006132819 | 0.7292817679558011 |
+| nvfp4 | 8.44478255417328 | 0.7182320441988951 |
+
+To reproduce:
+
+```bash
+# baseline
+python torchao/_models/llama/eval.py --checkpoint_path checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --print_model --tasks wikitext winogrande
+
+# mxfp8
+python torchao/_models/llama/eval.py --checkpoint_path checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --print_model --tasks wikitext winogrande --quantization mxfp8
+
+# nvfp4
+python torchao/_models/llama/eval.py --checkpoint_path checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --print_model --tasks wikitext winogrande --quantization nvfp4
+```

 # testing
````
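As a quick sanity check that a recipe really skipped `lm_head`/`output` (the eval commands above pass `--print_model` for the same purpose), one can inspect the linear weights after `quantize_`. A small sketch, continuing the hypothetical toy model from the eval.py section and assuming torchao's usual tensor-subclass weight swap:

```python
# quantized linears carry a tensor-subclass weight whose repr differs from
# the plain bfloat16 weight of the skipped `output` linear
for name, mod in model.named_modules():
    if isinstance(mod, torch.nn.Linear):
        print(name, mod.weight)
```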
