From c00b120080f51ebf9f12b602a3dbdd365ce038e3 Mon Sep 17 00:00:00 2001 From: vasiliy Date: Mon, 29 Jul 2024 09:56:20 -0700 Subject: [PATCH] move float8_experimental to torchao/float8 Summary: This PR moves https://github.com/pytorch-labs/float8_experimental to torchao/float8. There are no logic changes here. Here is how to reproduce this PR: * copy float8_experimental/float8_experimental/* to torchao/float8 * copy float8_experimental/test/* to test/float8 * copy float8_experimental/benchmarks/* to benchmarks/float8 * copy the README over and delete sections which no longer apply (license, installation) * replace `float8_experimental` with `torchao.float8` everywhere Test Plan: ``` // run local tests, they pass ./test/float8/test_everything.sh // run every benchmark in `benchmarks/float8`, they still work ``` Reviewers: Subscribers: Tasks: Tags: --- README.md | 6 + benchmarks/float8/bench_linear_float8.py | 307 +++++++++ benchmarks/float8/bench_matmul.py | 139 ++++ benchmarks/float8/bench_multi_gpu.py | 181 ++++++ benchmarks/float8/bench_padding.py | 223 +++++++ benchmarks/float8/profile_linear_float8.py | 447 +++++++++++++ benchmarks/float8/utils.py | 83 +++ test/float8/test_base.py | 723 +++++++++++++++++++++ test/float8/test_compile.py | 329 ++++++++++ test/float8/test_dtensor.py | 327 ++++++++++ test/float8/test_dtensor.sh | 11 + test/float8/test_everything.sh | 21 + test/float8/test_fsdp.py | 212 ++++++ test/float8/test_fsdp.sh | 28 + test/float8/test_fsdp2/fsdp2_common.py | 89 +++ test/float8/test_fsdp2/test_fsdp2.py | 561 ++++++++++++++++ test/float8/test_fsdp_compile.py | 139 ++++ test/float8/test_fsdp_compile.sh | 11 + test/float8/test_inference_flows.py | 245 +++++++ test/float8/test_numerics_integration.py | 174 +++++ torchao/float8/README.md | 159 +++++ torchao/float8/__init__.py | 46 ++ torchao/float8/config.py | 129 ++++ torchao/float8/distributed_utils.py | 113 ++++ torchao/float8/float8_aten_api.py | 49 ++ torchao/float8/float8_linear.py | 438 +++++++++++++ torchao/float8/float8_linear_utils.py | 327 ++++++++++ torchao/float8/float8_ops.py | 363 +++++++++++ torchao/float8/float8_python_api.py | 64 ++ torchao/float8/float8_scaling_utils.py | 216 ++++++ torchao/float8/float8_tensor.py | 363 +++++++++++ torchao/float8/float8_tensor_parallel.py | 235 +++++++ torchao/float8/float8_utils.py | 247 +++++++ torchao/float8/fsdp_utils.py | 388 +++++++++++ torchao/float8/inference.py | 244 +++++++ 35 files changed, 7637 insertions(+) create mode 100644 benchmarks/float8/bench_linear_float8.py create mode 100644 benchmarks/float8/bench_matmul.py create mode 100644 benchmarks/float8/bench_multi_gpu.py create mode 100644 benchmarks/float8/bench_padding.py create mode 100644 benchmarks/float8/profile_linear_float8.py create mode 100644 benchmarks/float8/utils.py create mode 100644 test/float8/test_base.py create mode 100644 test/float8/test_compile.py create mode 100644 test/float8/test_dtensor.py create mode 100755 test/float8/test_dtensor.sh create mode 100755 test/float8/test_everything.sh create mode 100644 test/float8/test_fsdp.py create mode 100755 test/float8/test_fsdp.sh create mode 100644 test/float8/test_fsdp2/fsdp2_common.py create mode 100644 test/float8/test_fsdp2/test_fsdp2.py create mode 100644 test/float8/test_fsdp_compile.py create mode 100755 test/float8/test_fsdp_compile.sh create mode 100644 test/float8/test_inference_flows.py create mode 100644 test/float8/test_numerics_integration.py create mode 100644 torchao/float8/README.md create mode 100644 
torchao/float8/__init__.py create mode 100644 torchao/float8/config.py create mode 100644 torchao/float8/distributed_utils.py create mode 100644 torchao/float8/float8_aten_api.py create mode 100644 torchao/float8/float8_linear.py create mode 100644 torchao/float8/float8_linear_utils.py create mode 100644 torchao/float8/float8_ops.py create mode 100644 torchao/float8/float8_python_api.py create mode 100644 torchao/float8/float8_scaling_utils.py create mode 100644 torchao/float8/float8_tensor.py create mode 100644 torchao/float8/float8_tensor_parallel.py create mode 100644 torchao/float8/float8_utils.py create mode 100644 torchao/float8/fsdp_utils.py create mode 100644 torchao/float8/inference.py diff --git a/README.md b/README.md index e31dc63a8..8905b3f43 100644 --- a/README.md +++ b/README.md @@ -85,6 +85,12 @@ In some cases we rewrote popular GenAI models to be significantly faster in nati ### Training +#### Float8 + +[torchao.float8](torchao/float8) implements training recipes with the scaled float8 dtypes, as laid out in https://arxiv.org/abs/2209.05433. + +#### Sparsity + We've added support for semi-structured 2:4 sparsity with 6% end to end speedups on ViT-L The code change is a 1 liner with the full example available [here](torchao/sparsity/training/) diff --git a/benchmarks/float8/bench_linear_float8.py b/benchmarks/float8/bench_linear_float8.py new file mode 100644 index 000000000..b44d4f5dc --- /dev/null +++ b/benchmarks/float8/bench_linear_float8.py @@ -0,0 +1,307 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +import argparse +import copy +from dataclasses import dataclass +from itertools import product +from pathlib import Path +from typing import Callable, List, Optional, Tuple + +import pandas as pd + +import torch +import torch.utils.benchmark as benchmark +from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType +from torchao.float8.float8_linear import Float8Linear +from torchao.float8.float8_linear_utils import ( + linear_requires_sync, + sync_float8_amax_and_scale_history, +) +from torchao.float8.float8_tensor import ScaledMMConfig +from tqdm import tqdm + +# estimating TOPs for matmuls in fp32, fp16, fp8 +# assuming A * B = C, with A being M * K, B being K * N, C being M * N + +# H100 SXM specs: bottom of https://www.nvidia.com/en-us/data-center/h100/ +h100_peak_flops_float32 = 67e12 +h100_peak_flops_fp16_tc = 1979e12 +h100_peak_tops_float8_tc = 3958e12 + +dtype_to_peak_tops = { + torch.float32: h100_peak_flops_float32, + torch.float16: h100_peak_flops_fp16_tc, + torch.bfloat16: h100_peak_flops_fp16_tc, + torch.float8_e4m3fn: h100_peak_tops_float8_tc, + torch.float8_e5m2: h100_peak_tops_float8_tc, +} + +# prevent splitting columns when printing a data frame +pd.set_option("display.expand_frame_repr", False) +# print the entire data frame +pd_print_full_ctx = pd.option_context( + "display.max_rows", None, "display.max_columns", None +) + + +def benchmark_torch_function_in_microseconds( + func: Callable, + *args, + **kwargs, +) -> float: + t0 = benchmark.Timer( + stmt="func(*args, **kwargs)", + globals={"args": args, "kwargs": kwargs, "func": func}, + ) + return t0.blocked_autorange().median * 1e6 + + +@dataclass +class Experiment: + name: str + shape: Tuple[int, int, int] + ref_time_sec: float + float8_time_sec: float + dtype: torch.dtype + compiled: bool + use_fast_accum: bool + 
scaling_repr: str + + # 3 Times since we are calculating forward backward + @property + def ref_tops_sec(self): + M, K, N = self.shape + return float(3 * (2 * M * K * N)) / self.ref_time_sec + + @property + def ref_pct_top_peak(self): + return self.ref_tops_sec / dtype_to_peak_tops[self.dtype] + + @property + def float8_tops_sec(self): + M, K, N = self.shape + return float(3 * (2 * M * K * N)) / self.float8_time_sec + + @property + def float8_pct_top_peak(self): + return self.float8_tops_sec / dtype_to_peak_tops[torch.float8_e4m3fn] + + +def main( + sweep_path: Optional[Path] = None, + compile: bool = True, + n_limit: Optional[int] = None, + fast_accum_filter: Optional[bool] = None, + shape_name_filter: Optional[str] = None, + scaling_type_input: str = "dynamic", + scaling_type_weight: str = "dynamic", + scaling_type_grad_output: str = "dynamic", +): + device = "cuda" + print(f"Compile is set to | {compile}") + + scaling_type_input = ScalingType(scaling_type_input) + scaling_type_weight = ScalingType(scaling_type_weight) + scaling_type_grad_output = ScalingType(scaling_type_grad_output) + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=scaling_type_input), + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output), + ) + + # LLaMa 2 70B single-node weight shapes + # assumes fused attn.wqkv and ffn.w13 + name_to_shapes_70b = { + "attn.wqkv": (8192, 1280), + "attn.w0": (1024, 8192), + "ffn.w13": (8192, 7168), + "ffn.w2": (3584, 8192), + } + input_bias = False + if fast_accum_filter is not None: + use_fast_accum = [fast_accum_filter] + else: + use_fast_accum = [True, False] + if shape_name_filter is not None: + k = shape_name_filter + name_to_shapes_70b = {k: name_to_shapes_70b[k]} + experiment_list: List[Experiment] = [] + dtype = torch.bfloat16 + for idx, (fast_accum, (name, (K, N))) in enumerate( + tqdm(list(product(use_fast_accum, name_to_shapes_70b.items()))) + ): + if n_limit is not None and idx >= n_limit: + break + linear_ref = torch.nn.Linear(K, N, bias=input_bias).to( + device=device, dtype=dtype + ) + + linear_float8 = Float8Linear.from_float( + copy.deepcopy(linear_ref), + config=config, + ) + scaling_repr = linear_float8.scaling_repr() + + if fast_accum: + linear_float8.forward_config = ScaledMMConfig(False, True, False) + else: + linear_float8.forward_config = ScaledMMConfig(False, False, False) + + bsz, seq_len = 4, 4096 + M = bsz * seq_len + input_tensor = torch.randn(M, K, device=device, dtype=dtype, requires_grad=True) + ref_forw_backward = lambda: linear_ref(input_tensor).sum().backward() + + def float8_forw_backward(): + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(linear_float8) + linear_float8(input_tensor).sum().backward() + + def n_times(n, fn, *args, **kwargs): + def wrapper(*args, **kwargs): + for _ in range(n): + fn(*args, **kwargs) + + return wrapper + + REPEAT_N = 100 + + ref_forw_backward = n_times(REPEAT_N, ref_forw_backward) + float8_forw_backward = n_times(REPEAT_N, float8_forw_backward) + + if compile: + ref_forw_backward = torch.compile(ref_forw_backward) + float8_forw_backward = torch.compile(float8_forw_backward) + + for _ in range(5): + ref_forw_backward() + float8_forw_backward() + + ref_time = ( + benchmark_torch_function_in_microseconds(ref_forw_backward) + * 1e-6 + / REPEAT_N + ) + float8_time = ( + benchmark_torch_function_in_microseconds(float8_forw_backward) + * 1e-6 + / REPEAT_N + ) + experiment = Experiment( + 
name, + (M, K, N), + ref_time, + float8_time, + dtype, + compile, + use_fast_accum=fast_accum, + scaling_repr=scaling_repr, + ) + print(experiment) + print("float8 speedup", experiment.ref_time_sec / experiment.float8_time_sec) + experiment_list.append(experiment) + torch._dynamo.reset() + + headers = [ + "name", + "M", + "K", + "N", + "scaling_repr", + "ref_dtype", + "compiled", + "use_fast_accum", + "ref_time_sec", + "pt_fp8_time_sec", + "ref_tops_sec", + "ref_pct_top_peak", + "pt_fp8_tops_sec", + "pt_fp8_pct_top_peak", + ] + data = [] + for experiment in experiment_list: + data.append( + [ + experiment.name, + experiment.shape[0], + experiment.shape[1], + experiment.shape[2], + experiment.scaling_repr, + experiment.dtype, + experiment.compiled, + experiment.use_fast_accum, + experiment.ref_time_sec, + experiment.float8_time_sec, + experiment.ref_tops_sec, + experiment.ref_pct_top_peak, + experiment.float8_tops_sec, + experiment.float8_pct_top_peak, + ] + ) + + data_pd = pd.DataFrame(data, columns=headers) + data_pd["pt_fp8_speedup"] = data_pd["ref_time_sec"] / data_pd["pt_fp8_time_sec"] + data_pd["shape"] = ( + "(" + + data_pd["M"].astype(str) + + ", " + + data_pd["K"].astype(str) + + ", " + + data_pd["N"].astype(str) + + ")" + ) + + data_pd_simple = data_pd[ + [ + "name", + "shape", + "scaling_repr", + "compiled", + "use_fast_accum", + "ref_time_sec", + "pt_fp8_time_sec", + "pt_fp8_speedup", + ] + ] + with pd_print_full_ctx: + print(data_pd_simple) + + if sweep_path is not None: + sweep_path = sweep_path.with_suffix(".csv") + data_pd.to_csv(sweep_path) + + +def invoke_main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("-o", "--output_path", type=str, required=False) + parser.add_argument("--disable_compile", action="store_true") + parser.add_argument("-n", "--n_limit", type=int, required=False) + parser.add_argument("--fast_accum_filter", type=bool, required=False) + parser.add_argument("--shape_name_filter", type=str, required=False) + parser.add_argument("--scaling_type_input", type=str, required=False) + parser.add_argument("--scaling_type_weight", type=str, required=False) + parser.add_argument("--scaling_type_grad_output", type=str, required=False) + args = parser.parse_args() + output_path = Path(args.output_path) if args.output_path is not None else None + kwargs = {} + if args.scaling_type_input is not None: + kwargs["scaling_type_input"] = args.scaling_type_input + if args.scaling_type_weight is not None: + kwargs["scaling_type_weight"] = args.scaling_type_weight + if args.scaling_type_grad_output is not None: + kwargs["scaling_type_grad_output"] = args.scaling_type_grad_output + main( + output_path, + not args.disable_compile, + args.n_limit, + args.fast_accum_filter, + args.shape_name_filter, + **kwargs, + ) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/benchmarks/float8/bench_matmul.py b/benchmarks/float8/bench_matmul.py new file mode 100644 index 000000000..6220670ee --- /dev/null +++ b/benchmarks/float8/bench_matmul.py @@ -0,0 +1,139 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
+import itertools +from typing import Optional + +import fire +import pandas as pd + +import torch +import torch.nn as nn +import torch.utils.benchmark as benchmark + +# estimating TOPs for matmuls in fp32, fp16, fp8 +# assuming A * B = C, with A being M * K, B being K * N, C being M * N + +# H100 SXM specs: bottom of https://www.nvidia.com/en-us/data-center/h100/ +h100_peak_flops_float32 = 67e12 +h100_peak_flops_fp16_tc = 989e12 +h100_peak_tops_float8_tc = 1979e12 + +dtype_to_peak_tops = { + torch.float32: h100_peak_flops_float32, + torch.float16: h100_peak_flops_fp16_tc, + torch.bfloat16: h100_peak_flops_fp16_tc, + torch.float8_e4m3fn: h100_peak_tops_float8_tc, + torch.float8_e5m2: h100_peak_tops_float8_tc, +} + + +def benchmark_fn_in_sec(f, *args, **kwargs): + # Manual warmup + for _ in range(4): + f(*args, **kwargs) + t0 = benchmark.Timer( + stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f} + ) + measurement = t0.blocked_autorange() + return measurement.mean + + +def do_benchmarks(tops, peak_tops, f, *args, **kwargs): + time_sec = benchmark_fn_in_sec(f, *args, **kwargs) + tops_sec = float(tops) / time_sec + pct_top_peak = tops_sec / peak_tops + return time_sec, tops_sec, pct_top_peak + + +@torch.inference_mode() +def run(n_limit: Optional[int] = None): + device = "cuda" + + # LLaMa 2 70B single-node weight shapes + # assumes fused attn.wqkv and ffn.w13 + # source: https://fburl.com/gsheet/g8onr7rh + name_to_shapes_70b = { + "attn.wqkv": (8192, 1280), + "attn.w0": (1024, 8192), + "ffn.w13": (8192, 7168), + "ffn.w2": (3584, 8192), + } + + headers = ("name", "shape", "dtype", "ref_time_s", "fp8_time_s", "fp8_speedup") + results = [] + + name_to_shapes = name_to_shapes_70b + dtypes = torch.bfloat16, torch.float16 + + for idx, (dtype, (name, (K, N))) in enumerate( + itertools.product(dtypes, name_to_shapes.items()) + ): + if n_limit is not None and idx >= n_limit: + break + + # source: Xiao Sun, these are realistic for LLaMa 70B training + bsz, seq_len = 4, 4096 + + M = bsz * seq_len + print("M, K, N:", M, K, N) + tops = 2 * M * N * K + print(f"tops: {tops:.2E}") + + # raw torch.mm + A = torch.randn(M, K, device=device, dtype=dtype) + m_ref = nn.Sequential(nn.Linear(K, N, dtype=dtype, device=device, bias=False)) + ref_time_sec, ref_tops_sec, ref_pct_top_peak = do_benchmarks( + tops, dtype_to_peak_tops[dtype], m_ref, A + ) + print( + f"{dtype} time_sec {ref_time_sec:.2E}, tops/sec {ref_tops_sec:.2E}, pct_peak {ref_pct_top_peak:.3f}" + ) + + del A + + # raw float8 matmul (upper bound for what we can achieve in eager mode) + # TODO(future): add e5m2 + d1, d2, d3 = torch.float8_e4m3fn, torch.float8_e4m3fn, dtype + A = torch.zeros(M, K, device=device, dtype=d1) + B = torch.zeros(K, N, device=device, dtype=d2).t().contiguous().t() + + def do_matmul(A, B): + scale_a = torch.tensor([1.0], device=device) + scale_b = torch.tensor([1.0], device=device) + return torch._scaled_mm( + A, B, scale_a, scale_b, out_dtype=d3, use_fast_accum=False + ) + + fp8_time_sec, fp8_tops_sec, fp8_pct_top_peak = do_benchmarks( + tops, dtype_to_peak_tops[d1], do_matmul, A, B + ) + print( + f"fp8 time_sec {fp8_time_sec:.2E}, tops/sec {fp8_tops_sec:.2E}, pct_peak {fp8_pct_top_peak:.3f}" + ) + + del A, B + + results.append( + [ + name, + (M, K, N), + dtype, + ref_time_sec, + fp8_time_sec, + ref_time_sec / fp8_time_sec, + ] + ) + + data_pd = pd.DataFrame(results, columns=headers) + print(data_pd) + + +def main() -> None: + fire.Fire(run) + + +if __name__ == "__main__": + main() # pragma: no cover diff
--git a/benchmarks/float8/bench_multi_gpu.py b/benchmarks/float8/bench_multi_gpu.py new file mode 100644 index 000000000..44c758d1b --- /dev/null +++ b/benchmarks/float8/bench_multi_gpu.py @@ -0,0 +1,181 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +import os +from typing import Callable + +import fire + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +import torch.utils.benchmark as benchmark +from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType +from torchao.float8.float8_linear_utils import ( + convert_to_float8_training, + sync_float8_amax_and_scale_history, +) +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + + +torch.manual_seed(0) + +# TODO: Add more shapes for the benchmark +B, M, K, N = 32, 1024, 1024, 1024 +lr = 0.01 + +config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_grad_output=CastConfig(scaling_type=ScalingType.DELAYED), +) + + +def benchmark_torch_function_in_microseconds( + func: Callable, + *args, + **kwargs, +) -> float: + t0 = benchmark.Timer( + stmt="func(*args, **kwargs)", + globals={"args": args, "kwargs": kwargs, "func": func}, + ) + return t0.blocked_autorange().median * 1e6 + + +def setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + + # initialize the process group + dist.init_process_group("nccl", rank=rank, world_size=world_size) + + +def cleanup(): + dist.destroy_process_group() + + +def get_model(K, N, is_fp8, base_dtype=torch.float32): + modules = [ + nn.Linear(K, N, dtype=base_dtype), + nn.ReLU(), + ] + N_LAYERS = 20 + # N linear layers + for _ in range(N_LAYERS - 1): + modules.append(nn.Linear(N, N, dtype=base_dtype)) + modules.append(nn.ReLU()) + m = nn.Sequential(*modules) + if is_fp8: + convert_to_float8_training( + m, + config=config, + ) + return m + + +def fsdp_main(rank, world_size, args): + setup(rank, world_size) + torch.cuda.set_device(rank) + + base_dtype, input_global, compile = args + + # basic distributed data sampling + assert B % world_size == 0 + bsz_local_start = int(rank / world_size * B) + bsz_local_end = int((rank + 1) / world_size * B) + input_tensor = input_global[bsz_local_start:bsz_local_end].to(rank) + + fp8_model = get_model(K, N, is_fp8=True, base_dtype=base_dtype).to(rank) + # Need use_orig_params=True to compile FSDP + fp8_model = FSDP(fp8_model, use_orig_params=True) + fp8_optimizer = torch.optim.SGD(fp8_model.parameters(), lr=lr * world_size) + + # Run one iteration to make compile work, see experiments doc for more context of this issue. 
+ fp8_optimizer.zero_grad() + y_local = fp8_model(input_tensor) + y_local.sum().backward() + fp8_optimizer.step() + sync_float8_amax_and_scale_history(fp8_model) + + sync_float8_func = sync_float8_amax_and_scale_history + if compile: + # TODO: Need to fix issues with compile + fp8_model = torch.compile(fp8_model) + sync_float8_func = torch.compile(sync_float8_amax_and_scale_history) + + def float8_forw_backward(): + fp8_optimizer.zero_grad() + y_local = fp8_model(input_tensor) + y_local.sum().backward() + fp8_optimizer.step() + sync_float8_func(fp8_model) + + ref_model = get_model(K, N, is_fp8=False, base_dtype=base_dtype).to(rank) + ref_optimizer = torch.optim.SGD(ref_model.parameters(), lr=lr * world_size) + if compile: + ref_model = torch.compile(ref_model) + + ref_model = FSDP(ref_model, use_orig_params=True) + + def ref_forw_backward(): + ref_optimizer.zero_grad() + ref_model(input_tensor).sum().backward() + ref_optimizer.step() + + def run_n_iterations(n, fn): + for _ in range(n): + fn() + # make sure training is done on all ranks + dist.barrier() + + # warmup + run_n_iterations(50, ref_forw_backward) + run_n_iterations(50, float8_forw_backward) + + N_ITER = 50 + ref_time = ( + benchmark_torch_function_in_microseconds( + run_n_iterations, N_ITER, ref_forw_backward + ) + * 1e-6 + / N_ITER + ) + float8_time = ( + benchmark_torch_function_in_microseconds( + run_n_iterations, N_ITER, float8_forw_backward + ) + * 1e-6 + / N_ITER + ) + + if rank == 0: + print("ref_time", ref_time) + print("float8_time", float8_time) + print("float8 speedup", ref_time / float8_time) + + cleanup() + + +def run(compile: bool): + base_dtype = torch.bfloat16 + WORLD_SIZE = torch.cuda.device_count() + print(f"{base_dtype = }") + print(f"{compile = }") + print(f"{WORLD_SIZE = }") + + # generate input data + ref_input = torch.randn(B, M, K).cuda().to(base_dtype) + # run fsdp model + args = (base_dtype, ref_input, compile) + mp.spawn(fsdp_main, args=(WORLD_SIZE, args), nprocs=WORLD_SIZE, join=True) + + +# Usage: +# CUDA_VISIBLE_DEVICES=0,1 python benchmarks/bench_multi_gpu.py +if __name__ == "__main__": + fire.Fire(run) diff --git a/benchmarks/float8/bench_padding.py b/benchmarks/float8/bench_padding.py new file mode 100644 index 000000000..977755343 --- /dev/null +++ b/benchmarks/float8/bench_padding.py @@ -0,0 +1,223 @@ +from dataclasses import dataclass +from typing import Optional + +import fire + +import torch +from torchao.float8.float8_tensor import ( + GemmInputRole, + hp_tensor_and_scale_to_float8, + LinearMMConfig, + ScaledMMConfig, +) +from torchao.float8.float8_utils import pad_tensor_for_matmul +from tabulate import tabulate +from torch._inductor.utils import do_bench_using_profiling +from tqdm import tqdm + +# estimating TOPs for matmuls in fp32, fp16, fp8 +# assuming A * B = C, with A being M * K, B being K * N, C being M * N + +# H100 SXM specs: bottom of https://www.nvidia.com/en-us/data-center/h100/ +h100_peak_flops_float32 = 67e12 +h100_peak_flops_fp16_tc = 1979e12 +h100_peak_tops_float8_tc = 3958e12 + +dtype_to_peak_tops = { + torch.float32: h100_peak_flops_float32, + torch.float16: h100_peak_flops_fp16_tc, + torch.bfloat16: h100_peak_flops_fp16_tc, + torch.float8_e4m3fn: h100_peak_tops_float8_tc, + torch.float8_e5m2: h100_peak_tops_float8_tc, +} + + +def benchmark_fn_in_usec(f, *args, **kwargs): + no_args = lambda: f(*args, **kwargs) + time = do_bench_using_profiling(no_args) + return time * 1e3 + + +def get_tops_info(tops, time, peak_tops): + time_sec = time / 1e6 + tops_sec = float(tops) /
time_sec + pct_top_peak = tops_sec / peak_tops + return tops_sec, pct_top_peak + + +def do_fp8_matmul(A, B, fp8_dtype, out_dtype): + scale_a = torch.tensor([1], device="cuda", dtype=torch.float32) + scale_b = torch.tensor([1], device="cuda", dtype=torch.float32) + + a_config = ScaledMMConfig( + emulate=False, use_fast_accum=True, fp8_output=True, pad_inner_dim=True + ) + b_config = ScaledMMConfig( + emulate=False, use_fast_accum=True, fp8_output=True, pad_inner_dim=True + ) + a_config = LinearMMConfig(a_config, a_config, a_config) + b_config = LinearMMConfig(b_config, b_config, b_config) + + a_fp8 = hp_tensor_and_scale_to_float8( + A, + scale_a, + fp8_dtype, + a_config, + GemmInputRole.INPUT, + ) + b_fp8 = hp_tensor_and_scale_to_float8( + B, + scale_b, + fp8_dtype, + b_config, + GemmInputRole.WEIGHT, + ) + + return a_fp8 @ b_fp8 + + +def do_fp8_pad_first_matmul(A, B, fp8_dtype, out_dtype): + # Breaks with compile due to trying to pad on fp8 dtype + # return do_fp8_matmul(A, B, fp8_dtype, out_dtype) + A_pad = pad_tensor_for_matmul(A, dims=1) # mem copy + B_pad = pad_tensor_for_matmul(B, dims=0) # mem copy + + scale_a = torch.tensor([1], device="cuda", dtype=torch.float32) + scale_b = torch.tensor([1], device="cuda", dtype=torch.float32) + + A_pad = A_pad.to(fp8_dtype) # mem copy + B_pad = B_pad.to(fp8_dtype) # mem copy + + B_pad = B_pad.t().contiguous().t() # mem copy + + return torch._scaled_mm( + A_pad, B_pad, scale_a, scale_b, out_dtype=out_dtype, use_fast_accum=True + ) + + +def do_hp_matmul(A, B): + return torch.matmul(A, B) + + +def do_aligned_bf16_matmul(A, B): + A_pad = pad_tensor_for_matmul(A, dims=1) + B_pad = pad_tensor_for_matmul(B, dims=0) + return torch.matmul(A_pad, B_pad) + + +@dataclass +class Experiment_config: + M: int + K: int + N: int + output_dtype: torch.dtype + fp8_dtype: torch.dtype + + def __iter__(self): + return iter((self.M, self.K, self.N, self.output_dtype, self.fp8_dtype)) + + +def gen_configs(): + shapes = shapes = [ + (8193, 2501, 5008), + (65, 253, 4096), + (1023, 1029, 2512), + (4095, 511, 10000), + (2047, 3073, 8192), + (511, 769, 7504), + (127, 4097, 12288), + (32769, 15, 15024), + (9217, 8191, 20480), + (16385, 1025, 25008), + ] + output_dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + return [Experiment_config(*shape, output_dtype, fp8_dtype) for shape in shapes] + + +@torch.no_grad() +def run(compile: bool = False, n_limit: Optional[int] = None): + device = "cuda" + experiments = gen_configs() + results = [] + tops_table = [] + tops_headers = [ + "Shape", + "Ref Dtype", + "Ref Tops", + "Aligned BF16 Tops", + "FP8 Tops", + "Ref % Peak", + "Aligned BF16 % Peak", + "FP8 % Peak", + ] + + for experiment in tqdm(experiments): + M, K, N, output_dtype, fp8_dtype = experiment + tops = 2 * M * N * K + + A_base = torch.rand(M, K, device=device, dtype=output_dtype) + B_base = torch.rand(K, N, device=device, dtype=output_dtype) + + hp_func = torch.compile(do_hp_matmul) if compile else do_hp_matmul + aligned_bf16_func = ( + torch.compile(do_aligned_bf16_matmul) if compile else do_aligned_bf16_matmul + ) + fp8_func = torch.compile(do_fp8_pad_first_matmul) if compile else do_fp8_matmul + + ref_time = benchmark_fn_in_usec(hp_func, A_base, B_base) + aligned_bf16_time = benchmark_fn_in_usec(aligned_bf16_func, A_base, B_base) + fp8_time = benchmark_fn_in_usec( + fp8_func, A_base, B_base, fp8_dtype, output_dtype + ) + + ref_tops_sec, ref_pct_top_peak = get_tops_info( + tops, ref_time, dtype_to_peak_tops[output_dtype] + ) + aligned_bf16_tops_sec, 
aligned_bf16_pct_top_peak = get_tops_info( + tops, aligned_bf16_time, dtype_to_peak_tops[torch.bfloat16] + ) + fp8_tops_sec, fp8_pct_top_peak = get_tops_info( + tops, fp8_time, dtype_to_peak_tops[fp8_dtype] + ) + tops_table.append( + [ + f"({M}x{K}x{N})", + f"{output_dtype}", + f"{ref_tops_sec:.2E}", + f"{aligned_bf16_tops_sec:.2E}", + f"{fp8_tops_sec:.2E}", + f"{ref_pct_top_peak:.3f}", + f"{aligned_bf16_pct_top_peak:.3f}", + f"{fp8_pct_top_peak:.3f}", + ] + ) + results.append( + [ + (M, K, N), + output_dtype, + ref_time, + aligned_bf16_time, + fp8_time, + ref_time / aligned_bf16_time, + ref_time / fp8_time, + ] + ) + + print("TOPs".center(80, "*")) + print(tabulate(tops_table, headers=tops_headers)) + print("Speed Results".center(80, "*")) + headers = [ + "Shape", + "Ref Dtype", + "Ref Time", + "Aligned BF16 Time", + "FP8 Time", + "Aligned BF16 Speedup", + "FP8 Speedup", + ] + print(tabulate(results, headers=headers, tablefmt="grid")) + + +if __name__ == "__main__": + fire.Fire(run) diff --git a/benchmarks/float8/profile_linear_float8.py b/benchmarks/float8/profile_linear_float8.py new file mode 100644 index 000000000..914759849 --- /dev/null +++ b/benchmarks/float8/profile_linear_float8.py @@ -0,0 +1,447 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +import copy +import io +import random +from contextlib import nullcontext, redirect_stdout +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable, Optional + +import fire +import pandas as pd + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType +from torchao.float8.float8_linear_utils import ( + convert_to_float8_training, + linear_requires_sync, + sync_float8_amax_and_scale_history, +) +from torch.profiler import profile, ProfilerActivity, record_function +from utils import ( + kernel_name_to_category, + parse_bw_and_kernel_name, + profiler_output_to_gpu_time_for_key, + profiler_output_to_time_by_kernel_name, +) + +# don't truncate long kernel names +pd.options.display.max_colwidth = 100 +# display 3 trailing decimal points for floats +pd.set_option("display.float_format", "{:.3f}".format) + + +class LNLinear(torch.nn.Module): + def __init__(self, fc_dim1, fc_dim2): + super().__init__() + self.ln = torch.nn.LayerNorm(fc_dim1, elementwise_affine=False) + self.fc = torch.nn.Linear(fc_dim1, fc_dim2, bias=False) + + def forward(self, x): + x = self.ln(x) + x = self.fc(x) + return x + + +# copied from https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/norms.py +class RMSNorm(nn.Module): + """ + Initialize the RMSNorm normalization layer. + + Args: + dim (int): The dimension of the input tensor. + eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6. + + Attributes: + eps (float): A small value added to the denominator for numerical stability. + weight (nn.Parameter): Learnable scaling parameter. 
+ + """ + + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x: torch.Tensor): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x: torch.Tensor): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + def reset_parameters(self): + torch.nn.init.ones_(self.weight) # type: ignore + + +# copied from https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama/model.py +class FeedForward(nn.Module): + """ + FeedForward module + + Args: + dim (int): Input dimension. + hidden_dim (int): Hidden dimension of the feedforward layer. + multiple_of (int): Value to ensure hidden dimension is a multiple of this value. + ffn_dim_multiplier (Optional[float]): Custom multiplier for hidden dimension. Defaults to None. + + Attributes: + w1 (Linear): Linear transformation for the first layer. + w2 (Linear): Linear transformation for the second layer. + w3 (Linear): Linear transformation for the third layer. + + """ + + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ffn_dim_multiplier: Optional[float], + ): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + def init_weights(self, init_std: float): + nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02) + for linear in (self.w2, self.w3): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + + +class NormFFNResidualNorm(nn.Module): + """ + A fragment representing the end of TransformerBlock n and the start + of TransformerBlock n + 1, intended to include the fusions relevant + to float8 gemms in the FFN module in forward and backward. 
+ """ + + def __init__(self, dim, hidden_dim, multiple_of, ffn_dim_multiplier): + super().__init__() + self.ffn_norm = RMSNorm(dim) + self.ffn = FeedForward(dim, hidden_dim, multiple_of, ffn_dim_multiplier) + self.attn_norm = RMSNorm(dim) + + def forward(self, h): + # end of transformer block n + x = self.ffn_norm(h) + x = self.ffn(x) + x = h + x + # start of transformer block n + 1 + x = self.attn_norm(x) + return x + + +@dataclass +class ProfileConfig: + file_path: Optional[str] = None + name: Optional[str] = None + cuda: bool = True + iters: int = 0 + warmup_iters: int = 0 + sync: bool = False + extra_kwargs: dict = field(default_factory=dict) + memory_profile_path: Optional[str] = None + + +def profile_function( + config: ProfileConfig, func: Callable, *args, **kwargs +) -> torch.profiler.profile: + """Profile a torch function and save the result to a file""" + seed = 123 + random.seed(seed) + torch.manual_seed(seed) + + activities = [ProfilerActivity.CPU] + if config.cuda: + activities.append(ProfilerActivity.CUDA) + + if config.warmup_iters >= 0: + for _ in range(config.warmup_iters): + func(*args, **kwargs) + if config.sync: + torch.cuda.synchronize() + name_context = ( + nullcontext() if config.name is None else record_function(config.name) + ) + profile_memory = config.memory_profile_path is not None + with profile( + activities=activities, + profile_memory=profile_memory, + record_shapes=profile_memory, + with_stack=profile_memory, + **config.extra_kwargs, + ) as prof: + for _ in range(config.iters): + with name_context: + func(*args, **kwargs) + if config.sync: + torch.cuda.synchronize() + + if config.file_path is not None: + prof.export_chrome_trace(config.file_path) + + return prof + + +def main( + profile_path_prefix: Path, + compile: bool = True, + scaling_type_input: str = "dynamic", + scaling_type_weight: str = "dynamic", + scaling_type_grad_output: str = "dynamic", + model_type: str = "linear", + dtype_filter: str = "both", +): + assert model_type in ("linear", "ln_linear", "norm_ffn_norm"), "unsupported" + assert dtype_filter in ("both", "float8", "bfloat16") + + scaling_type_input = ScalingType(scaling_type_input) + scaling_type_weight = ScalingType(scaling_type_weight) + scaling_type_grad_output = ScalingType(scaling_type_grad_output) + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=scaling_type_input), + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output), + ) + scaling_repr = "_".join( + [ + s.short_str() + for s in (scaling_type_input, scaling_type_weight, scaling_type_grad_output) + ] + ) + + print(f"Compile is set to | {compile}") + print(f"model_type is set to | {model_type}") + print(f"scaling_repr is set to | {scaling_repr}") + + device = "cuda" + ref_dtype = torch.bfloat16 + if model_type == "ln_linear": + M, K, N = 4 * 4096, 8192, 7168 + m_ref = LNLinear(K, N) + input_tensor = torch.randn( + M, K, device=device, dtype=ref_dtype, requires_grad=True + ) + elif model_type == "norm_ffn_norm": + m_ref = NormFFNResidualNorm( + dim=4096, + hidden_dim=16384, + multiple_of=1024, + ffn_dim_multiplier=1.3, + ) + input_tensor = torch.randn( + 1, 8192, 4096, device=device, dtype=ref_dtype + ).requires_grad_() + else: + M, K, N = 4 * 4096, 8192, 7168 + m_ref = torch.nn.Sequential( + torch.nn.Linear(K, N, bias=False), + ) + input_tensor = torch.randn( + M, K, device=device, dtype=ref_dtype, requires_grad=True + ) + + m_ref = m_ref.to(device).to(ref_dtype) + + 
m_float8 = copy.deepcopy(m_ref) + convert_to_float8_training(m_float8, config=config) + + def ref_forw_backward(x): + out = m_ref(x) + out.sum().backward() + + def float8_forw(x): + out = m_float8(x) + return out + + sync_amax_history = sync_float8_amax_and_scale_history + + def float8_forw_backward_wrapper(x): + # sync_float8_amax_and_scale_history is not full graph torch + # compile friendly, so we add a high level wrapper to allow + # inspection of the fw+bw torch.compile without the scale + # syncing code + # TODO(future): make this better + if linear_requires_sync(config): + with record_function("scale_amax_and_scales"): + sync_amax_history(m_float8) + out = float8_forw(x) + + # out.sum().backward() is also not torch.compile fullgraph + # friendly + with record_function("backward"): + out.sum().backward() + + if compile: + m_ref = torch.compile(m_ref, fullgraph=True) + float8_forw = torch.compile(float8_forw, fullgraph=True) + # Note: it's faster to compile the combination of sync_amax_history wit + # forward because we only look up from dynamo cache once. + # However, compiling the sync function separately makes it more + # convenient to analyze the total time spent on it. + sync_amax_history = torch.compile(sync_amax_history) + + # if the `TORCHINDUCTOR_PROFILE` env var is enabled, parse its output + # to populate triton kernel bandwidth further down in the script + f = io.StringIO() + with redirect_stdout(f): + # warm up + for _ in range(1): + if dtype_filter != "float8": + ref_forw_backward(input_tensor) + if dtype_filter != "bfloat16": + float8_forw_backward_wrapper(input_tensor) + + profile_iters = 5 + ref_times, float8_times = None, None + data = [] + + if dtype_filter != "float8": + # Profile Reference Model + print("profiling ref") + ref_suffix = f"_{model_type}_ref_compile_{compile}.json" + ref_path = profile_path_prefix + ref_suffix + profile_config = ProfileConfig( + ref_path, ref_suffix, iters=profile_iters, warmup_iters=2, sync=True + ) + p = profile_function(profile_config, ref_forw_backward, input_tensor) + print(f"saved {ref_path}") + ref_times = profiler_output_to_time_by_kernel_name(p) + total_time_ms = sum(v for v in ref_times.values()) / 1e3 / profile_iters + for k, v in ref_times.items(): + v_ms = v / 1e3 / profile_iters + data.append( + [ + "0_ref", + k, + kernel_name_to_category(k), + v_ms, + v_ms / total_time_ms, + None, + ] + ) + + if dtype_filter != "bfloat16": + # Profile Float8 Model + print("profiling float8") + float8_suffix = ( + f"_{model_type}_float8_compile_{compile}_{scaling_repr}.json" + ) + float8_path = profile_path_prefix + float8_suffix + profile_config = ProfileConfig( + float8_path, + float8_suffix, + iters=profile_iters, + warmup_iters=2, + sync=True, + ) + p = profile_function( + profile_config, float8_forw_backward_wrapper, input_tensor + ) + print(f"saved {float8_path}") + float8_times = profiler_output_to_time_by_kernel_name(p) + total_time_ms = sum(v for v in float8_times.values()) / 1e3 / profile_iters + for k, v in float8_times.items(): + v_ms = v / 1e3 / profile_iters + data.append( + [ + "1_float8", + k, + kernel_name_to_category(k), + v / 1e3 / profile_iters, + v_ms / total_time_ms, + None, + ] + ) + + # get the time spent per user annotation + sync_time_us = profiler_output_to_gpu_time_for_key( + p, "scale_amax_and_scales" + ) + sync_time_ms = sync_time_us / profile_iters / 1e3 + print(f"Sync time ms: {sync_time_ms}") + + # print the redirected stdout back to regular stdout + print(f.getvalue()) + + # populate the triton kernel 
bandwidth + for line in f.getvalue().split("\n"): + maybe_bw, maybe_kernel_name = parse_bw_and_kernel_name(line) + if maybe_kernel_name is not None: + # O(N) search, but it's ok since lists are small + for datum in data: + if datum[1] == maybe_kernel_name: + datum[-1] = maybe_bw + + df = pd.DataFrame( + data, + columns=[ + "experiment", + "kernel", + "category", + "time_ms", + "pct_gpu_time", + "bw_gpbs", + ], + ) + df.sort_values( + ["experiment", "category", "pct_gpu_time"], + ascending=[True, True, False], + inplace=True, + ) + print("\nSummary of GPU time by CPU kernel\n\n", df) + + # compare gemm and overhead time + df_p = df.pivot_table( + columns=["category"], + index="experiment", + values="time_ms", + aggfunc="sum", + fill_value=0, + margins=True, + ) + # drop last row, which has totals across ref + float8 which does not make sense + df_p = df_p[:-1] + df_p = df_p.transpose() + + if dtype_filter == "both": + df_p["f8_div_ref"] = df_p["1_float8"] / df_p["0_ref"] + df_p["ref_div_f8"] = df_p["0_ref"] / df_p["1_float8"] + + # calculate sync time as pct of total float time + # note: this time is not useful if TORCHINDUCTOR_PROFILE is on + total_float8_ms = df_p.iloc[3]["1_float8"] + sync_approx_ratio = sync_time_ms / total_float8_ms + print( + f"\nFloat8 amax/scale sync approx ratio of total time: {sync_approx_ratio:.3f}" + ) + + print("\nSummary of time (ms) by kernel category\n\n", df_p) + + +def invoke_main() -> None: + # Example usage: python benchmarks/profile_linear_float8.py benchmarks/data/profiles/current_profile --compile=True --linear_type="dynamic" + # You can set TORCHINDUCTOR_PROFILE=1 to also capture triton kernel bandwidth + fire.Fire(main) + + +if __name__ == "__main__": + invoke_main() # pragma: no cover diff --git a/benchmarks/float8/utils.py b/benchmarks/float8/utils.py new file mode 100644 index 000000000..aec19e2cd --- /dev/null +++ b/benchmarks/float8/utils.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +import collections +import re + + +def profiler_output_to_time_by_kernel_name(prof): + """ + Input: a profiler with captured events. + Output: a deduplicated list of GPU time in nanoseconds grouped by CPU kernel name + + Note that if there are user_annotations in the captured events, `torch.profiler` + will include their time in the total GPU time displayed at the bottom of + `key_averages.table()`. The filter below excludes them to prevent double + counting. + """ + key_averages = prof.key_averages() + thresh = 1e-10 + kernel_name_to_gpu_time_us = collections.defaultdict(float) + for e in key_averages: + # manually filter top-level CPU events with attributed CUDA time + # example CPU event row: + # aten::addmm 0.83% 76.554us 0.98% 90.846us 90.846us 1.022ms 31.82% 1.022ms 1.022ms 1 + # and it maps to this CUDA event: + # sm80_xmma_gemm_f32f32_f32f32_f32_tn_n_tilesize256x64... 
0.00% 0.000us 0.00% 0.000us 0.000us 1.022ms 31.82% 1.022ms 1.022ms 1 + if not (e.self_cpu_time_total > thresh and e.self_device_time_total > thresh): + continue + kernel_name_to_gpu_time_us[e.key] = e.self_device_time_total + return kernel_name_to_gpu_time_us + + +def profiler_output_to_gpu_time_for_key(prof, key): + """ + Input: an event name + Output: sum of GPU time of all events with that name in `prof` + + This is useful to get the total time of a user annotation + """ + total = 0 + for e in prof.profiler.function_events: + if e.key == key: + total += e.device_time_total + return total + + +def kernel_name_to_category(k): + # number prefix is for easy sorting + if k in ("aten::mm", "aten::addmm", "aten::_scaled_mm"): + return "0_gemm" + elif ( + # max(abs(tensor)) + ("abs" in k and "max" in k) + or + # casting pointwise to float8 + ("clamp" in k) + or + # things related to scaled_mm + ("scaled_mm" in k) + or + # syncing amaxes and scales + ("roll" in k) + ): + # note: the above filter is approximate and will give false + # positives if model code contains other code to abs/max/clamp + return "1_f8_overhead" + return "2_other" + + +def parse_bw_and_kernel_name(line): + """ + Input: a single line of stdout of TORCHINDUCTOR_PROFILE=1 output, such as + 0.257ms 0.537 GB 2092.43GB/s triton_red_fused_native_layer_norm_0 + Output: the bandwidth value and the kernel name, or None and None + """ + result = re.search(".* ([0-9\.]+)GB/s.*(triton_[a-z_0-9]+)", line) + if result: + return result.group(1), result.group(2) + else: + return None, None diff --git a/test/float8/test_base.py b/test/float8/test_base.py new file mode 100644 index 000000000..0780968aa --- /dev/null +++ b/test/float8/test_base.py @@ -0,0 +1,723 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
+import copy +import io +import itertools +import random +import re +import unittest +import warnings + +import pytest + +import torch +import torch.nn as nn + +from torchao.utils import TORCH_VERSION_AFTER_2_4 + +if not TORCH_VERSION_AFTER_2_4: + pytest.skip("Unsupported PyTorch version", allow_module_level=True) + + +from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType +from torchao.float8.float8_linear import Float8Linear +from torchao.float8.float8_linear_utils import ( + convert_to_float8_training, + linear_requires_sync, + sync_float8_amax_and_scale_history, +) +from torchao.float8.float8_python_api import addmm_float8_unwrapped +from torchao.float8.float8_tensor import ( + Float8Tensor, + GemmInputRole, + hp_tensor_and_scale_to_float8, + LinearMMConfig, + ScaledMMConfig, +) +from torchao.float8.float8_utils import ( + compute_error, + e4m3_dtype, + e5m2_dtype, + fp8_tensor_statistics, + FP8_TYPES, + tensor_to_scale, +) +from torchao.float8.inference import ( + ActivationCasting, + QuantConfig, + quantize_to_float8, +) + +random.seed(0) +torch.manual_seed(0) + +is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0) + + +def bitwise_identical(a: Float8Tensor, b: Float8Tensor) -> bool: + assert torch.all(a._scale == b._scale).item(), "scales are not identical" + assert torch.all(a._data == b._data).item(), "data is not identical" + return True + + +class TestFloat8Tensor(unittest.TestCase): + def test_preserves_dtype(self) -> None: + # hp means high precision, lp means low precision + hp_dtypes = (torch.float32, torch.float16, torch.bfloat16) + lp_dtypes = FP8_TYPES + for hp_dtype, lp_dtype in itertools.product(hp_dtypes, lp_dtypes): + x1_hp = torch.randn(4, 4, dtype=hp_dtype) + x1_s = tensor_to_scale(x1_hp, lp_dtype) + x2_lp = hp_tensor_and_scale_to_float8(x1_hp, x1_s, lp_dtype) + x3_hp = x2_lp.to_original_precision() + self.assertTrue(x3_hp.dtype == hp_dtype) + + def test_differentiable_casts(self) -> None: + lp_dtypes = (e4m3_dtype, e5m2_dtype) + for f8_dtype in lp_dtypes: + x = torch.randn(1).requires_grad_() + grad = torch.randn(1) + x_s = tensor_to_scale(x, f8_dtype) + x_f8 = hp_tensor_and_scale_to_float8(x, x_s, f8_dtype) + x_f8_hp = x_f8.to_original_precision() + x_f8_hp.backward(grad) + # the gradient should be unchanged through both casts + torch.testing.assert_close(grad, x.grad, rtol=0, atol=0) + + def test_split_cat(self): + a = torch.rand(16, 16, dtype=torch.bfloat16) + scale = tensor_to_scale(a, e4m3_dtype) + fp8_a = hp_tensor_and_scale_to_float8(a, scale, e4m3_dtype) + + splits = torch.split(fp8_a, 16) + catted = torch.cat(splits, dim=0) + assert bitwise_identical(fp8_a, catted) + + def test_index_put(self): + a = torch.rand(16, dtype=torch.bfloat16) + scale_a = tensor_to_scale(a, torch.float8_e4m3fn) + fp8_a = hp_tensor_and_scale_to_float8(a, scale_a, torch.float8_e4m3fn) + + index = torch.randint(0, 15, (16,), dtype=torch.long) + + b = torch.rand(16, 16, dtype=torch.bfloat16) + scale_b = tensor_to_scale(b, torch.float8_e4m3fn) + fp8_b = hp_tensor_and_scale_to_float8(b, scale_a, torch.float8_e4m3fn) + fp8_b_bad = hp_tensor_and_scale_to_float8(b, scale_b, torch.float8_e4m3fn) + + with self.assertRaises(AssertionError): + b[index] = fp8_a + fp8_b[index] = a + fp8_b_bad[index] = fp8_a + fp8_b[index] = fp8_a + + def test_copy_(self): + a = torch.rand(16, dtype=torch.bfloat16) + scale_a = tensor_to_scale(a, torch.float8_e4m3fn) + fp8_a = hp_tensor_and_scale_to_float8(a, scale_a, torch.float8_e4m3fn) + + b =
torch.empty(16, dtype=torch.bfloat16) + b.copy_(fp8_a) # Should work + torch.testing.assert_close(b, fp8_a.to_original_precision()) + with self.assertRaises(RuntimeError): + fp8_a.copy_(b) # Should fail + + fp8_b = Float8Tensor( + torch.empty(16, dtype=torch.float8_e4m3fn), + scale_a, + torch.bfloat16, + fp8_a._linear_mm_config, + ) + fp8_b.copy_(fp8_a) + torch.testing.assert_close(fp8_a._data, fp8_b._data) + + def test_weights_only_load(self): + module = nn.Linear(16, 16) + # Save model state dict + buffer = io.BytesIO() + fp8_module = quantize_to_float8( + module, + QuantConfig( + ActivationCasting.DYNAMIC, + ), + ) + + torch.save(fp8_module.state_dict(), buffer) + buffer.seek(0) + _ = torch.load(buffer, weights_only=True) + + +class TestFloat8Linear: + def _test_linear_impl( + self, + x, + m_ref, + config: Float8LinearConfig, + ): + m_fp8 = Float8Linear.from_float( + copy.deepcopy(m_ref), + config, + ) + for _ in range(2): + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(m_fp8) + y_fp8 = m_fp8(x) + y_fp8.sum().backward() + y_ref = m_ref(x) + y_ref.sum().backward() + + assert y_ref.shape == y_fp8.shape + + y_sqnr = compute_error(y_ref, y_fp8) + g_sqnr = compute_error(m_ref.weight.grad, m_fp8.weight.grad) + # verify sqnr is reasonable + assert y_sqnr >= 18.0, f"{y_sqnr} is too low" + assert g_sqnr >= 17.0, f"{g_sqnr} is too low" + if m_ref.bias is not None: + torch.testing.assert_close(m_ref.bias.grad, m_fp8.bias.grad) + + # verify all of the amax buffers got updated + if linear_requires_sync(config): + # only check buffers that are actually used, based on per-tensor + # scaling settings + amax_buffer_names = [] + amax_history_buffer_names = [] + scale_buffer_names = [] + if config.cast_config_input.scaling_type is ScalingType.DELAYED: + amax_buffer_names.append("fp8_amax_input") + amax_history_buffer_names.append("fp8_amax_history_input") + scale_buffer_names.append("fp8_scale_input") + if config.cast_config_weight.scaling_type is ScalingType.DELAYED: + amax_buffer_names.append("fp8_amax_weight") + amax_history_buffer_names.append("fp8_amax_history_weight") + scale_buffer_names.append("fp8_scale_weight") + if config.cast_config_grad_output.scaling_type is ScalingType.DELAYED: + amax_buffer_names.append("fp8_amax_grad_output") + amax_history_buffer_names.append("fp8_amax_history_grad_output") + scale_buffer_names.append("fp8_scale_grad_output") + + # verify all of the amax buffers got updated + max_float8_pos = {torch.finfo(dtype).max for dtype in FP8_TYPES} + for buffer_name in amax_buffer_names: + buffer_value = getattr(m_fp8, buffer_name) + for init_val in max_float8_pos: + assert torch.ne( + buffer_value, torch.tensor(init_val) + ), f"{buffer_name} not filled, current value {buffer_value}" + + # verify all of the amax history buffers got updated + for buffer_name in amax_history_buffer_names: + buffer_value = getattr(m_fp8, buffer_name) + assert torch.max(buffer_value) > 0.0, f"{buffer_name} not filled" + + # verify all of the scale buffers got updated + for buffer_name in scale_buffer_names: + buffer_value = getattr(m_fp8, buffer_name) + assert torch.ne( + buffer_value, torch.tensor(1.0) + ), f"{buffer_name} not filled, current value {buffer_value}" + + # verify initialization flags got updated + assert m_fp8.is_amax_initialized, "Amax was not properly initialized" + + @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True]) + @pytest.mark.parametrize("x_shape", [(16, 16), (2, 16, 16), (3, 2, 16, 16)]) + @pytest.mark.parametrize( + 
"scaling_type_input", [ScalingType.DELAYED, ScalingType.DYNAMIC] + ) + @pytest.mark.parametrize( + "scaling_type_weight", [ScalingType.DELAYED, ScalingType.DYNAMIC] + ) + @pytest.mark.parametrize( + "scaling_type_grad_output", + [ScalingType.DELAYED, ScalingType.DYNAMIC], + ) + @pytest.mark.parametrize("linear_dtype", [torch.bfloat16, torch.float32]) + @pytest.mark.parametrize("linear_bias", [False, True]) + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_linear( + self, + x_shape, + emulate: bool, + scaling_type_input: ScalingType, + scaling_type_weight: ScalingType, + scaling_type_grad_output: ScalingType, + linear_dtype: torch.dtype, + linear_bias: bool, + ): + if not emulate: + if not torch.cuda.is_available(): + warnings.warn("CUDA not available") + pytest.skip() + elif torch.cuda.get_device_capability() < (9, 0): + warnings.warn( + f"CUDA capability {torch.cuda.get_device_capability()} < (9.0)" + ) + pytest.skip() + x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype) + m_ref = nn.Linear(16, 32, bias=linear_bias, device="cuda", dtype=linear_dtype) + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=scaling_type_input), + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output), + emulate=emulate, + ) + self._test_linear_impl( + x, + m_ref, + config, + ) + + @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True]) + @pytest.mark.parametrize( + "linear_dtype", [torch.float16, torch.bfloat16, torch.float32] + ) + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_autocast_outputs( + self, + emulate: bool, + linear_dtype: torch.dtype, + ): + if not emulate: + if not torch.cuda.is_available(): + warnings.warn("CUDA not available") + pytest.skip() + elif torch.cuda.get_device_capability() < (9, 0): + warnings.warn( + f"CUDA capability {torch.cuda.get_device_capability()} < (9.0)" + ) + pytest.skip() + + m_ref = nn.Linear(32, 16, device="cuda", dtype=linear_dtype) + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_grad_output=CastConfig(scaling_type=ScalingType.DELAYED), + emulate=emulate, + ) + m = Float8Linear.from_float(copy.deepcopy(m_ref), config) + + # autocast off + x = torch.randn(16, 32, device="cuda", dtype=linear_dtype) + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(m) + y = m(x) + assert y.dtype == linear_dtype, f"y.dtype is {y.dtype}, expected {linear_dtype}" + + # autocast on + with torch.autocast("cuda"): + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(m) + y = m(x) + assert y.dtype == torch.half, f"y.dtype is {y.dtype}, expected {torch.half}" + + with torch.autocast("cuda", dtype=torch.bfloat16): + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(m) + y = m(x) + assert ( + y.dtype == torch.bfloat16 + ), f"y.dtype is {y.dtype}, expected {torch.bfloat16}" + + @pytest.mark.parametrize( + "linear_dtype", [torch.float16, torch.bfloat16, torch.float32] + ) + @pytest.mark.parametrize("emulate", [True, False] if is_H100 else [True]) + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_type_cast(self, linear_dtype: torch.dtype, emulate: bool): + emulate = ( + not torch.cuda.is_available() or torch.cuda.get_device_capability() < (9, 0) + ) + + m = nn.Linear(32, 16, 
device="cuda", dtype=linear_dtype) + config = Float8LinearConfig(emulate=emulate) + m = Float8Linear.from_float(copy.deepcopy(m), config) + + # Cast the module to dtype + m = m.to(dtype=linear_dtype) + if linear_requires_sync(config): + # Check amax buffer types + for key in [ + "fp8_amax_input", + "fp8_amax_history_input", + "fp8_scale_input", + "fp8_amax_weight", + "fp8_amax_history_weight", + "fp8_scale_weight", + "fp8_amax_grad_output", + "fp8_amax_history_grad_output", + "fp8_scale_grad_output", + ]: + assert ( + m._buffers[key].dtype == torch.float32 + ), f"{key}.dtype is {m._buffers[key].dtype}, expected torch.float32" + + # autocast off + x = torch.randn(16, 32, device="cuda", dtype=linear_dtype) + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(m) + y = m(x) + assert y.dtype == linear_dtype, f"y.dtype is {y.dtype}, expected {linear_dtype}" + + # autocast on + with torch.autocast("cuda"): + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(m) + y = m(x) + assert y.dtype == torch.half, f"y.dtype is {y.dtype}, expected {torch.half}" + + with torch.autocast("cuda", dtype=torch.bfloat16): + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(m) + y = m(x) + assert ( + y.dtype == torch.bfloat16 + ), f"y.dtype is {y.dtype}, expected {torch.bfloat16}" + + def test_repr(self): + m = nn.Linear(32, 16) + config = Float8LinearConfig( + cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED), + emulate=True, + ) + m = Float8Linear.from_float( + copy.deepcopy(m), + config=config, + ) + s = m.__repr__() + assert "i:dyn,w:del,go:dyn" in s + + +class TestScaledMM: + @unittest.skipIf( + not is_H100, + "CUDA not available", + ) + @pytest.mark.parametrize( + "base_dtype", [torch.float16, torch.bfloat16, torch.float32] + ) + @pytest.mark.parametrize("use_fast_accum", [True, False]) + def test_scaled_mm_vs_emulated(self, base_dtype, use_fast_accum): + torch.manual_seed(42) + input_dtype = e4m3_dtype + output_dtype = base_dtype + compare_type = torch.float32 + + a = torch.randn(16, 16, device="cuda", dtype=base_dtype) + b = torch.randn(32, 16, device="cuda", dtype=base_dtype).t() + + a_scale = tensor_to_scale(a, input_dtype).float() + b_scale = tensor_to_scale(b, input_dtype).float() + + a_fp8 = hp_tensor_and_scale_to_float8(a, a_scale, input_dtype) + b_fp8 = hp_tensor_and_scale_to_float8(b, b_scale, input_dtype) + + out_scaled_mm = addmm_float8_unwrapped( + a_fp8._data, + a_fp8._scale, + b_fp8._data, + b_fp8._scale, + output_dtype=output_dtype, + use_fast_accum=use_fast_accum, + ) + out_emulated = torch.ops.aten.mm_float8_emulated( + a_fp8._data, a_fp8._scale, b_fp8._data, b_fp8._scale, output_dtype + ) + + if output_dtype != base_dtype: + out_scaled_mm = out_scaled_mm.to(compare_type) + out_emulated = out_emulated.to(compare_type) + + if base_dtype in {torch.bfloat16, torch.float16}: + atol, rtol = 7e-2, 7e-2 + else: + atol, rtol = 2e-3, 2e-3 + torch.testing.assert_close(out_scaled_mm, out_emulated, atol=atol, rtol=rtol) + + @unittest.skipIf(not is_H100, "CUDA not available") + def test_different_configs_error(self): + x_fp32 = torch.randn(16, 16, device="cuda") + x_scale = torch.tensor(1.0, device="cuda") + fp8_dtype = e4m3_dtype + linear_config_a = LinearMMConfig( + ScaledMMConfig(False, True, False, False), + ScaledMMConfig(False, False, False, False), + ScaledMMConfig(False, False, False, False), + ) + linear_config_b = LinearMMConfig( + ScaledMMConfig(True, True, False, False), + ScaledMMConfig(True, False, False, False), + 
ScaledMMConfig(True, False, False, False), + ) + a = hp_tensor_and_scale_to_float8( + x_fp32, + x_scale, + fp8_dtype, + linear_config_a, + GemmInputRole.INPUT, + ) + b = hp_tensor_and_scale_to_float8( + x_fp32, + x_scale, + fp8_dtype, + linear_config_b, + GemmInputRole.WEIGHT, + ) + with pytest.raises( + AssertionError, + match="linear_mm_config.output mismatch", + ): + a @ b + + @unittest.skipIf( + not is_H100, + "CUDA not available", + ) + @pytest.mark.parametrize( + "base_dtype", [torch.float16, torch.bfloat16, torch.float32] + ) + @pytest.mark.parametrize("use_fast_accum", [True, False]) + def test_pad_inner_dim(self, base_dtype, use_fast_accum): + torch.manual_seed(42) + input_dtype = torch.float8_e4m3fn + compare_type = torch.float32 + + a = torch.randn(16, 41, device="cuda", dtype=base_dtype) + b = torch.randn(41, 128, device="cuda", dtype=base_dtype) + + a_scale = tensor_to_scale(a, input_dtype).float() + b_scale = tensor_to_scale(b, input_dtype).float() + + a_fp8 = hp_tensor_and_scale_to_float8( + a, a_scale, input_dtype, None, GemmInputRole.INPUT + ) + b_fp8 = hp_tensor_and_scale_to_float8( + b, b_scale, input_dtype, None, GemmInputRole.WEIGHT + ) + + with pytest.raises( + RuntimeError, + match=re.escape( + "Expected trailing dimension of mat1 to be divisible by 16 but got mat1 shape: (16x41." + ), + ): + a_fp8 @ b_fp8 + + scaled_mm_config = ScaledMMConfig(False, use_fast_accum, False, True) + pad_config = LinearMMConfig( + scaled_mm_config, scaled_mm_config, scaled_mm_config + ) + + a_fp8 = hp_tensor_and_scale_to_float8( + a, + a_scale, + input_dtype, + pad_config, + GemmInputRole.INPUT, + ) + b_fp8 = hp_tensor_and_scale_to_float8( + b, + b_scale, + input_dtype, + pad_config, + GemmInputRole.WEIGHT, + ) + out_padded = a_fp8 @ b_fp8 + out_padded.to(compare_type) + + emulated_scaled_mm_config = ScaledMMConfig(True, use_fast_accum, False, False) + emulated_config = LinearMMConfig( + emulated_scaled_mm_config, + emulated_scaled_mm_config, + emulated_scaled_mm_config, + ) + a_fp8 = hp_tensor_and_scale_to_float8( + a, + a_scale, + input_dtype, + emulated_config, + GemmInputRole.INPUT, + ) + b_fp8 = hp_tensor_and_scale_to_float8( + b, + b_scale, + input_dtype, + emulated_config, + GemmInputRole.WEIGHT, + ) + out_emualted = a_fp8 @ b_fp8 + out_emualted.to(compare_type) + + if base_dtype in {torch.bfloat16, torch.float16}: + atol, rtol = 7e-2, 7e-2 + else: + atol, rtol = 2e-3, 2e-3 + torch.testing.assert_close(out_padded, out_emualted, atol=atol, rtol=rtol) + + +class TestNumerics: + @pytest.mark.parametrize( + "float8_dtype", + [ + torch.float8_e4m3fn, + torch.float8_e5m2, + torch.float8_e4m3fnuz, + torch.float8_e5m2fnuz, + ], + ) + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_small_amax_float16(self, float8_dtype): + # If we calculate scale naively with FP8_MAX_POS / amax, + # the result may not be representable in fp16. Verify that + # the way we calculate scales actually works for tensors with + # small values. 
+ # + # naive_s = fp8_max_pos / (amax + eps) + # + # failing case: + # + # fp8_max_pos / (amax + eps) >= fp16_max_pos, or + # + # amax + eps >= fp8_max_pos / fp16_max_pos + + float8_max_pos = torch.finfo(float8_dtype).max + FP16_MAX_POS = torch.finfo(torch.float16).max + + target_amax = float8_max_pos / (FP16_MAX_POS + 1e-12) + x = torch.tensor([target_amax], dtype=torch.float16, device="cuda") + scale = tensor_to_scale(x, float8_dtype) + assert not torch.any(torch.isinf(scale)) + + +class TestFloat8LinearUtils(unittest.TestCase): + def test_swap_root_linear(self): + for emulate in [True, False]: + module = nn.Linear(3, 3) + config = Float8LinearConfig(emulate=emulate) + module = convert_to_float8_training(module, config=config) + self.assertIsInstance(module, Float8Linear) + self.assertEqual(module.linear_mm_config.output.emulate, emulate) + self.assertEqual(module.linear_mm_config.output.emulate, emulate) + + def test_swap_root_linear_with_children_raises(self): + for emulate in [True, False]: + module = nn.Linear(3, 3) + module.child = nn.Sequential(nn.Linear(3, 3)) + config = Float8LinearConfig(emulate=emulate) + with self.assertRaisesRegex( + AssertionError, + "Does not support a root nn.Linear with children", + ): + convert_to_float8_training(module, config=config) + + def test_swap_submodule_linears(self): + class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.lin1 = nn.Linear(dim, 4 * dim) + self.lin2 = nn.Linear(4 * dim, dim) + + for emulate in [True, False]: + model = nn.Sequential(MLP(3), nn.Linear(3, 3), MLP(3)) + config = Float8LinearConfig(emulate=emulate) + model = convert_to_float8_training(model, config=config) + self.assertIsInstance(model[0].lin1, Float8Linear) + self.assertIsInstance(model[0].lin2, Float8Linear) + self.assertIsInstance(model[1], Float8Linear) + self.assertIsInstance(model[2].lin1, Float8Linear) + self.assertIsInstance(model[2].lin2, Float8Linear) + + def test_swap_linears_with_filters(self): + class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.lin1 = nn.Linear(dim, 4 * dim) + self.lin2 = nn.Linear(4 * dim, 4 * dim) + + model = nn.Sequential(MLP(8), nn.Linear(32, 32), MLP(40)) + # filter out the linear layers whose shape is smaller than 32 or non-divisible by 16. + + size_limit = 32 + + def module_filter_fn(mod, fqn): + return ( + mod.in_features >= size_limit + and mod.out_features >= size_limit + and mod.in_features % 16 == 0 + and mod.out_features % 16 == 0 + ) + + config = Float8LinearConfig(emulate=True) + model = convert_to_float8_training( + model, + config=config, + module_filter_fn=module_filter_fn, + ) + # in_features=8, out_features=32, 8 is less than 32. + self.assertNotIsInstance(model[0].lin1, Float8Linear) + # in_features=32, out_features=32, + self.assertIsInstance(model[0].lin2, Float8Linear) + # in_features=32, out_features=32, + self.assertIsInstance(model[1], Float8Linear) + # in_features=40, out_features=160, 40 is not divisible by 16. 
+ self.assertNotIsInstance(model[2].lin1, Float8Linear) + # in_features=160, out_features=160, + self.assertIsInstance(model[2].lin2, Float8Linear) + + def test_swap_submodule_linears_with_skip(self): + class MLP(nn.Module): + def __init__(self, dim: int): + super().__init__() + self.lin1 = nn.Linear(dim, 4 * dim) + self.lin2 = nn.Linear(4 * dim, dim) + + model = nn.Sequential(MLP(3), nn.Linear(3, 3), MLP(3)) + module_filter_fn = lambda mod, fqn: fqn not in [ + "0.lin2", + "2.lin1", + ] + config = Float8LinearConfig(emulate=True) + model = convert_to_float8_training( + model, + config=config, + module_filter_fn=module_filter_fn, + ) + self.assertTrue(type(model[0].lin1) is Float8Linear) + self.assertTrue(type(model[0].lin2) is nn.Linear) + self.assertTrue(type(model[1]) is Float8Linear) + self.assertTrue(type(model[2].lin1) is nn.Linear) + self.assertTrue(type(model[2].lin2) is Float8Linear) + + def test_fp8_tensor_statistics(self): + hp_dtypes = (torch.float32, torch.float16, torch.bfloat16) + lp_dtypes = (e4m3_dtype, e5m2_dtype) + for hp_dtype, lp_dtype in itertools.product(hp_dtypes, lp_dtypes): + x1_hp = torch.ones(4, 4, dtype=hp_dtype) + tensor_len = x1_hp.numel() + + # Overflow caused by a too large scaling factor + s_overflow = torch.tensor(1e9) + fp8_overflow = hp_tensor_and_scale_to_float8(x1_hp, s_overflow, lp_dtype) + (zero_cnt, max_cnt) = fp8_tensor_statistics(fp8_overflow, lp_dtype) + self.assertEqual((zero_cnt, max_cnt), (0, tensor_len)) + + # Underflow caused by a too small scaling factor + s_underflow = torch.tensor(1e-9) + fp8_underflow = hp_tensor_and_scale_to_float8(x1_hp, s_underflow, lp_dtype) + (zero_cnt, max_cnt) = fp8_tensor_statistics(fp8_underflow, lp_dtype) + self.assertEqual((zero_cnt, max_cnt), (tensor_len, 0)) + + # Both overflow and underflow + x2_hp = torch.cat((x1_hp * 1e9, x1_hp * 1.0, x1_hp * 1e-9), 0) + fp8_over_underflow = hp_tensor_and_scale_to_float8( + x2_hp, torch.tensor(1.0), lp_dtype + ) + (zero_cnt, max_cnt) = fp8_tensor_statistics(fp8_over_underflow, lp_dtype) + self.assertEqual((zero_cnt, max_cnt), (tensor_len, tensor_len)) + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/float8/test_compile.py b/test/float8/test_compile.py new file mode 100644 index 000000000..1f3ebe169 --- /dev/null +++ b/test/float8/test_compile.py @@ -0,0 +1,329 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
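[Editor's note] The compile tests added below run `Float8Linear` under `torch.compile` with the eager, aot_eager, and inductor backends and compare numerics against a plain `nn.Linear`. A minimal sketch of the pattern exercised by `_test_compile_base` below; the standalone-script framing is illustrative, `emulate=True` is used so it does not require H100-class hardware, and sizes/tolerances mirror the test:

```
import copy

import torch
import torch.nn as nn

from torchao.float8.config import Float8LinearConfig
from torchao.float8.float8_linear import Float8Linear

# swap a single linear for its float8 counterpart, then compile both versions
config = Float8LinearConfig(emulate=True)
m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=torch.bfloat16)
m_fp8 = Float8Linear.from_float(copy.deepcopy(m_ref), config)

# "eager" is the backend the tests pair with emulation; inductor is only
# exercised on real float8 hardware
m_fp8 = torch.compile(m_fp8, backend="eager", fullgraph=True)
m_ref = torch.compile(m_ref, backend="eager", fullgraph=True)

x = torch.randn(16, 16, device="cuda", dtype=torch.bfloat16)
torch.testing.assert_close(m_fp8(x), m_ref(x), atol=9.5e-2, rtol=9.5e-2)
```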
+import copy +import random +import sys +import unittest +from io import StringIO + +import pytest + +from torchao.utils import TORCH_VERSION_AFTER_2_4 + +if not TORCH_VERSION_AFTER_2_4: + pytest.skip("Unsupported PyTorch version", allow_module_level=True) + +import torch +import torch.nn as nn +from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType +from torchao.float8.float8_linear import Float8Linear +from torchao.float8.float8_linear_utils import ( + convert_to_float8_training, + get_float8_layers, + sync_float8_amax_and_scale_history, +) +from torchao.float8.float8_scaling_utils import hp_tensor_to_float8_delayed +from torchao.float8.float8_tensor import LinearMMConfig +from torchao.float8.float8_utils import e4m3_dtype + +from torch._dynamo.test_case import TestCase as DynamoTestCase +from torch._dynamo.testing import CompileCounterWithBackend + +is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0) + + +def _test_compile_base( + backend: str, + fullgraph: bool, + config: Float8LinearConfig, + dtype: torch.dtype, +): + random.seed(0) + torch.manual_seed(0) + x_shape = (16, 16) + linear_dtype = torch.bfloat16 + + x = torch.randn(*x_shape, device="cuda", dtype=linear_dtype) + m_ref = nn.Linear(16, 32, bias=True, device="cuda", dtype=linear_dtype) + + m_fp8 = Float8Linear.from_float( + copy.deepcopy(m_ref), + config, + ) + + m_fp8 = torch.compile(m_fp8, backend=backend, fullgraph=fullgraph) + m_ref = torch.compile(m_ref, backend=backend, fullgraph=fullgraph) + y_fp8 = m_fp8(x) + y_fp8.sum().backward() + y_ref = m_ref(x) + y_ref.sum().backward() + torch.testing.assert_close(y_fp8, y_ref, atol=9.5e-2, rtol=9.5e-2) + torch.testing.assert_close( + m_fp8.weight.grad, m_ref.weight.grad, atol=2e-1, rtol=2e-1 + ) + torch.testing.assert_close(m_fp8.bias.grad, m_ref.bias.grad, atol=8e-2, rtol=8e-2) + + +@pytest.mark.parametrize("fullgraph", [True]) +@pytest.mark.parametrize( + "scaling_type_input", [ScalingType.DELAYED, ScalingType.DYNAMIC] +) +@pytest.mark.parametrize( + "scaling_type_weight", [ScalingType.DELAYED, ScalingType.DYNAMIC] +) +@pytest.mark.parametrize( + "scaling_type_grad_output", [ScalingType.DELAYED, ScalingType.DYNAMIC] +) +@pytest.mark.parametrize("emulate", [False, True] if is_H100 else [True]) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) +@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") +def test_eager_only( + fullgraph, + emulate: bool, + scaling_type_input: ScalingType, + scaling_type_weight: ScalingType, + scaling_type_grad_output: ScalingType, + dtype: torch.dtype, +): + torch._dynamo.reset() + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=scaling_type_input), + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output), + emulate=emulate, + ) + _test_compile_base( + "eager", + fullgraph, + config, + dtype, + ) + + +@pytest.mark.parametrize("fullgraph", [True]) +@pytest.mark.parametrize("emulate", [False, True] if is_H100 else [True]) +@pytest.mark.parametrize( + "scaling_type_input", [ScalingType.DELAYED, ScalingType.DYNAMIC] +) +@pytest.mark.parametrize( + "scaling_type_weight", [ScalingType.DELAYED, ScalingType.DYNAMIC] +) +@pytest.mark.parametrize( + "scaling_type_grad_output", [ScalingType.DELAYED, ScalingType.DYNAMIC] +) +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) +@unittest.skipIf(not 
torch.cuda.is_available(), "CUDA not available") +def test_aot_eager( + fullgraph, + emulate: bool, + scaling_type_input: ScalingType, + scaling_type_weight: ScalingType, + scaling_type_grad_output: ScalingType, + dtype: torch.dtype, +): + torch._dynamo.reset() + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=scaling_type_input), + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output), + emulate=emulate, + ) + _test_compile_base( + "aot_eager", + fullgraph, + config, + dtype, + ) + + +@pytest.mark.parametrize("fullgraph", [True]) +@pytest.mark.parametrize("emulate", [False]) +@pytest.mark.parametrize( + "scaling_type_input", [ScalingType.DELAYED, ScalingType.DYNAMIC] +) +@pytest.mark.parametrize( + "scaling_type_weight", [ScalingType.DELAYED, ScalingType.DYNAMIC] +) +@pytest.mark.parametrize( + "scaling_type_grad_output", [ScalingType.DELAYED, ScalingType.DYNAMIC] +) +@unittest.skipIf(not torch.cuda.is_available() or not is_H100, "CUDA with float8 support not available") +@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) +def test_inductor( + fullgraph, + emulate: bool, + scaling_type_input: ScalingType, + scaling_type_weight: ScalingType, + scaling_type_grad_output: ScalingType, + dtype: torch.dtype, +): + torch._dynamo.reset() + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=scaling_type_input), + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output), + emulate=emulate, + ) + _test_compile_base( + "inductor", + fullgraph, + config, + dtype, + ) + + +class TestGraphBreaks(DynamoTestCase): + class MockLinear(torch.nn.Module): + def __init__(self, graph_break: bool): + super().__init__() + self.register_buffer("fp8_amax_x", torch.tensor(1.0)) + self.register_buffer("fp8_scale_x", torch.tensor(1.0)) + self.graph_break = graph_break + + def forward(self, x): + x_fp8 = hp_tensor_to_float8_delayed( + x, + self.fp8_scale_x, + e4m3_dtype, + self.fp8_amax_x, + LinearMMConfig(), + ) + if self.graph_break: + torch._dynamo.graph_break() + x_hp = x_fp8.to_original_precision() + return x_hp + return x_fp8 + + @unittest.skipIf(not torch.cuda.is_available() or not is_H100, "CUDA with float8 support not available") + def test_float8_with_graph_break_in_the_middle(self): + """Test that having Float8Tensor object at the boundary of a subgraph""" + cnts = CompileCounterWithBackend("inductor") + mod = self.MockLinear(graph_break=True).cuda() + compiled_mod = copy.deepcopy(mod) + compiled_mod = torch.compile(compiled_mod, backend=cnts) + x = torch.randn(16, 16, device="cuda") + y_eager = mod(x) + y_compiled = compiled_mod(x) + self.assertEqual(cnts.frame_count, 2, "Compiled graph should have 2 frames!") + torch.testing.assert_close(y_eager, y_compiled) + + @unittest.skipIf(not torch.cuda.is_available() or not is_H100, "CUDA with float8 support not available") + def test_float8_graph_input(self): + """Test that having Float8Tensor object as a graph input""" + + def to_float(x): + return x.to_original_precision() + + cnts = CompileCounterWithBackend("inductor") + mod = self.MockLinear(graph_break=False).cuda() + x = torch.randn(2, 2, device="cuda") + compiled_to_float = torch.compile(to_float, backend=cnts) + y = mod(x) + y2_eager = to_float(y) + y2_compiled = compiled_to_float(y) + self.assertEqual( + cnts.frame_count, + 1, + "to_float was not compiled into 
1 frame and likely encountered a skip!", + ) + torch.testing.assert_close(y2_eager, y2_compiled) + + @unittest.skipIf(not torch.cuda.is_available() or not is_H100, "CUDA with float8 support not available") + def test_float8_graph_output(self): + """Test that having Float8Tensor object as a graph output works""" + cnts = CompileCounterWithBackend("inductor") + mod = self.MockLinear(graph_break=False).cuda() + compiled_mod = torch.compile(mod, backend=cnts) + x = torch.randn(16, 16, device="cuda") + y_compiled = compiled_mod(x) + + self.assertEqual(cnts.frame_count, 1, "Compiled graph should have 1 frame!") + tensors, ctx = y_compiled.__tensor_flatten__() + for tensor in tensors: + assert not isinstance( + getattr(y_compiled, tensor), torch._subclasses.fake_tensor.FakeTensor + ), "Float8Tensor should not contain any FakeTensors!" + assert isinstance( + y_compiled._orig_dtype, torch.dtype + ), "Float8Tensor._orig_dtype should be a dtype but got {}".format( + type(y_compiled._orig_dtype) + ) + assert isinstance( + y_compiled._linear_mm_config.output.emulate, bool + ), "Float8Tensor._emulate should be a bool but got {}".format( + type(y_compiled._linear_mm_config.output.emulate) + ) + + +@unittest.skipIf(not torch.cuda.is_available() or not is_H100, "CUDA with float8 support not available") +def test_sync_amax_func(): + torch._dynamo.reset() + cnts = CompileCounterWithBackend("inductor") + module = torch.nn.Sequential( + nn.Linear(16, 32, bias=True), nn.ReLU(), nn.Linear(32, 16, bias=True) + ) + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_grad_output=CastConfig(scaling_type=ScalingType.DELAYED), + ) + float8_mod = convert_to_float8_training( + module, + config=config, + ) + compiled_swap_func = torch.compile(sync_float8_amax_and_scale_history, backend=cnts) + compiled_swap_func(float8_mod) + assert cnts.frame_count == 1, "Compiled graph should have 1 frame!" 
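[Editor's note] For reference, the delayed-scaling flow that `test_sync_amax_func` compiles is the usual convert-then-sync training loop. A condensed sketch; the module sizes and the SGD optimizer here are illustrative, while the config, conversion, and sync calls follow the tests in this patch:

```
import torch
import torch.nn as nn

from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType
from torchao.float8.float8_linear_utils import (
    convert_to_float8_training,
    linear_requires_sync,
    sync_float8_amax_and_scale_history,
)

# delayed scaling for input, weight, and grad_output
config = Float8LinearConfig(
    cast_config_input=CastConfig(scaling_type=ScalingType.DELAYED),
    cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED),
    cast_config_grad_output=CastConfig(scaling_type=ScalingType.DELAYED),
)
m = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16)).cuda()
convert_to_float8_training(m, config=config)

# the amax/scale sync compiles into a single graph, which is what the
# frame-count assertion above checks
sync_fn = torch.compile(sync_float8_amax_and_scale_history)

opt = torch.optim.SGD(m.parameters(), lr=0.01)
for _ in range(2):
    opt.zero_grad()
    m(torch.randn(16, 16, device="cuda")).sum().backward()
    if linear_requires_sync(config):
        sync_fn(m)
    opt.step()
```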
+ + +class capture_stderr(list): + """ + Replace sys.stderr with a temporary StringIO + """ + + def __enter__(self): + self.sys_stderr = sys.stderr + self.stringio = StringIO() + sys.stderr = self.stringio + return self + + def __exit__(self, *args): + self.append(str(self.stringio.getvalue())) + del self.stringio + sys.stderr = self.sys_stderr + + +@unittest.skipIf(not torch.cuda.is_available() or not is_H100, "CUDA with float8 support not available") +def test_sync_amax_func_cuda_graph_success(): + torch._dynamo.reset() + with capture_stderr() as stderr: + my_module = nn.Sequential( + nn.Linear(16, 32, bias=True), nn.ReLU(), nn.Linear(32, 16, bias=True) + ).to("cuda") + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_grad_output=CastConfig(scaling_type=ScalingType.DELAYED), + ) + convert_to_float8_training( + my_module, + config=config, + ) + inpt = torch.randn( + 16, 16, device="cuda", dtype=torch.float32, requires_grad=True + ) + sync_func = torch.compile( + sync_float8_amax_and_scale_history, mode="reduce-overhead", fullgraph=True + ) + fp8_layers = get_float8_layers(my_module) + my_module(inpt) + sync_func(my_module, fp8_layers) + + assert "skipping cudagraphs due to mutaton on input" not in stderr[0] + + +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/test/float8/test_dtensor.py b/test/float8/test_dtensor.py new file mode 100644 index 000000000..7a6c9125d --- /dev/null +++ b/test/float8/test_dtensor.py @@ -0,0 +1,327 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +""" +Test numerics of manually defined float16 TP vs float8 TP of toy models + +Note: for now, this does not run in CI. 
+TODO(future): make this run in CI +""" + +import copy +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import pytest + +from torchao.utils import TORCH_VERSION_AFTER_2_4 + +if not TORCH_VERSION_AFTER_2_4: + pytest.skip("Unsupported PyTorch version", allow_module_level=True) + +from torchao.float8 import Float8LinearConfig +from torchao.float8.float8_linear_utils import convert_to_float8_training + +from torchao.float8.float8_scaling_utils import NoopFwToFloat8E5M2BwDynamic +from torchao.float8.float8_tensor import ( + Float8Tensor, + GemmInputRole, + hp_tensor_and_scale_to_float8, + LinearMMConfig, +) +from torchao.float8.float8_tensor_parallel import ( + Float8ColwiseParallel, + Float8RowwiseParallel, + PrepareFloat8ModuleInput, +) +from torchao.float8.float8_utils import e4m3_dtype, tensor_to_scale +from torch.distributed._tensor import distribute_tensor, DTensor, Replicate, Shard +from torch.distributed.device_mesh import DeviceMesh, init_device_mesh +from torch.distributed.tensor.parallel import parallelize_module +from tqdm import tqdm + + +def setup_distributed(): + world_size = int(os.environ.get("WORLD_SIZE", -1)) + device_mesh = init_device_mesh("cuda", (world_size,)) + # seed must be the same in all processes + torch.manual_seed(1) + return device_mesh + + +class FeedForward(nn.Module): + """MLP based model""" + + def __init__(self): + super(FeedForward, self).__init__() + self.w1 = nn.Linear(16, 32, bias=False) + self.w2 = nn.Linear(16, 32, bias=False) + self.out_proj = nn.Linear(32, 16, bias=False) + + def forward(self, x): + return self.out_proj(F.silu(self.w1(x)) * self.w2(x)) + + +class ToyModel(nn.Module): + def __init__(self): + super(ToyModel, self).__init__() + self.ffn = FeedForward() + + def forward(self, x): + return self.ffn(x) + + +def _test_scaled_mm(mesh: DeviceMesh, size=16): + device = mesh.device_type + fp8_dtype = e4m3_dtype + world_size = mesh.size() + + x_fp32 = torch.rand(size, size, device=device) + y_fp32 = torch.eye(size, device=device).t() + + placement_combs = ( + (Shard(0), Replicate()), + (Replicate(), Shard(1)), + (Shard(1), Shard(0)), + ) + expected_dt_out_shape = ( + (size * world_size, size), + (size, size * world_size), + (size, size), + ) + for idx, (lhs_placement, rhs_placement) in enumerate(placement_combs): + x_scale = tensor_to_scale(x_fp32, fp8_dtype).float() + y_scale = tensor_to_scale(y_fp32, fp8_dtype).float() + + x_fp8 = hp_tensor_and_scale_to_float8( + x_fp32, x_scale, fp8_dtype, None, GemmInputRole.INPUT + ) + y_fp8 = hp_tensor_and_scale_to_float8( + y_fp32, y_scale, fp8_dtype, None, GemmInputRole.WEIGHT + ) + + dist_x_fp8 = DTensor.from_local(x_fp8, mesh, [lhs_placement], run_check=False) + dist_y_fp8 = DTensor.from_local(y_fp8, mesh, [rhs_placement], run_check=False) + + assert isinstance(dist_x_fp8.to_local(), Float8Tensor) + assert isinstance(dist_y_fp8.to_local(), Float8Tensor) + assert dist_x_fp8.to_local()._orig_dtype == torch.float32 + out_fp8 = torch.mm(dist_x_fp8, dist_y_fp8) + local_fp8_out = out_fp8.to_local() + assert out_fp8.shape == expected_dt_out_shape[idx], (idx, local_fp8_out.shape) + + # after mm the out dtype should be fp32 + assert local_fp8_out.dtype == torch.float32 + + +def _test_fp8_redistribute(mesh: DeviceMesh, size=16): + device = mesh.device_type + fp8_dtype = e4m3_dtype + world_size = mesh.size() + + x_fp32 = torch.rand(size, size, device=device) + + x_scale = tensor_to_scale(x_fp32, fp8_dtype).float() + + x_fp8 = hp_tensor_and_scale_to_float8(x_fp32, x_scale, 
fp8_dtype) + + dist_x_fp8 = DTensor.from_local(x_fp8, mesh, [Shard(0)], run_check=False) + out_dist = dist_x_fp8.redistribute(placements=[Replicate()]) + assert out_dist.shape == (size * world_size, size) + assert out_dist.placements == (Replicate(),) + out_local = out_dist.to_local() + # after allgather the out shape should be replicate + assert out_local.shape == (size * world_size, size) + from torch.distributed._functional_collectives import AsyncCollectiveTensor + + if isinstance(out_local, AsyncCollectiveTensor): + out_local = out_local.wait() + + assert isinstance(out_local, Float8Tensor) + assert out_local._data.dtype == fp8_dtype + + +def _test_dtensor_cast_to_fp8(mesh: DeviceMesh, size=16): + device = mesh.device_type + fp8_dtype = e4m3_dtype + + x_fp32 = torch.rand(size, size, device=device) + dist_x_fp32 = distribute_tensor(x_fp32, mesh, [Shard(0)]) + + dist_x_scale = tensor_to_scale(dist_x_fp32, fp8_dtype).float() + assert isinstance(dist_x_scale, DTensor) + + dist_x_fp8 = hp_tensor_and_scale_to_float8(dist_x_fp32, dist_x_scale, fp8_dtype) + assert isinstance(dist_x_fp8, DTensor) + + +def _test_dtensor_fp8_autograd(mesh: DeviceMesh, size=16): + device = mesh.device_type + fp8_dtype = e4m3_dtype + + x_fp32 = torch.rand(size, size, device=device, requires_grad=True) + local_weight = torch.rand(2 * size, size, device=device, requires_grad=True) + target = torch.rand(size, 2 * size, device=device) + + dist_x_fp32 = distribute_tensor(x_fp32, mesh, [Shard(0)]) + dist_x_scale = tensor_to_scale(dist_x_fp32, fp8_dtype).float() + + dist_wight_fp32 = distribute_tensor(local_weight, mesh, [Shard(0)]) + dist_weight_scale = tensor_to_scale(dist_wight_fp32, fp8_dtype).float() + dist_target = distribute_tensor(target, mesh, [Shard(0)]) + + dist_x_fp8 = hp_tensor_and_scale_to_float8( + dist_x_fp32, + dist_x_scale, + fp8_dtype, + None, + GemmInputRole.INPUT, + ) + dist_weight_fp8 = hp_tensor_and_scale_to_float8( + dist_wight_fp32, + dist_weight_scale, + fp8_dtype, + None, + GemmInputRole.WEIGHT, + ) + + out = torch.nn.functional.linear(dist_x_fp8, dist_weight_fp8) + out = NoopFwToFloat8E5M2BwDynamic.apply(out, LinearMMConfig()) + assert isinstance(out, DTensor), f"Expected DTensor, got {type(out)}" + loss = torch.sum(torch.abs(out - dist_target)) + loss.backward() + + +def _test_fp8_mlp_tensor_parallelism_base( + mesh: DeviceMesh, size=16, compile: bool = False +): + device = mesh.device_type + # For now, only supports dynamic scaling of `x` and `dL_dY`. + # TODO(future): add support for float8 all-gather with delayed scaling + # for activations and gradients. 
+ config = Float8LinearConfig(emulate=True) + + toy_model = ToyModel().to(device) + toy_model_fp8 = convert_to_float8_training(toy_model, config=config) + + tp_model = copy.deepcopy(toy_model) + tp_model = convert_to_float8_training(tp_model, config=config) + sp_model = copy.deepcopy(toy_model) + sp_model = convert_to_float8_training(sp_model, config=config) + + # vanilla TP + tp_model = parallelize_module( + tp_model, + mesh, + { + "ffn.w1": Float8ColwiseParallel(), + "ffn.w2": Float8ColwiseParallel(), + "ffn.out_proj": Float8RowwiseParallel(), + }, + ) + + # "sequence parallel" mlp computation + sp_model = parallelize_module( + sp_model, + mesh, + { + "ffn": PrepareFloat8ModuleInput( + input_layouts=Shard(1), desired_input_layouts=Replicate() + ), + "ffn.w1": Float8ColwiseParallel(), + "ffn.w2": Float8ColwiseParallel(), + "ffn.out_proj": Float8RowwiseParallel( + output_layouts=Shard(1), use_local_output=False + ), + }, + ) + + # PrepareFloat8ModuleInput with specific submodule fqn + sp_model2 = copy.deepcopy(toy_model) + sp_model2 = convert_to_float8_training(sp_model2, config=config) + + sp_model2 = parallelize_module( + sp_model2, + mesh, + { + "ffn": PrepareFloat8ModuleInput( + input_layouts=Shard(1), + desired_input_layouts=Replicate(), + fwd_config_submodule_fqn="w2", + ), + "ffn.w1": Float8ColwiseParallel(), + "ffn.w2": Float8ColwiseParallel(), + "ffn.out_proj": Float8RowwiseParallel( + output_layouts=Shard(1), use_local_output=False + ), + }, + ) + + if compile: + tp_model = torch.compile(tp_model) + sp_model = torch.compile(sp_model) + sp_model2 = torch.compile(sp_model2) + + x_fp32 = torch.rand(size, size * 2, size, device=device, requires_grad=False) + x_fp32_tp_input = x_fp32.clone() + x_fp32_sp_input = distribute_tensor(x_fp32.clone(), mesh, [Shard(0)]) + + tp_out = tp_model(x_fp32_tp_input) + tp_out.sum().backward() + sp_out = sp_model(x_fp32_sp_input) + sp_out.sum().backward() + global_out = toy_model_fp8(x_fp32) + global_out.sum().backward() + torch.testing.assert_close(tp_out, global_out) + torch.testing.assert_close(sp_out.full_tensor(), global_out) + torch.testing.assert_close(tp_model.ffn.w1.weight.grad, sp_model.ffn.w1.weight.grad) + torch.testing.assert_close( + tp_model.ffn.out_proj.weight.grad, sp_model.ffn.out_proj.weight.grad + ) + + sp_out2 = sp_model2(x_fp32_sp_input) + sp_out2.sum().backward() + torch.testing.assert_close(sp_out2.full_tensor(), global_out) + torch.testing.assert_close( + tp_model.ffn.w1.weight.grad, sp_model2.ffn.w1.weight.grad + ) + torch.testing.assert_close( + tp_model.ffn.out_proj.weight.grad, sp_model2.ffn.out_proj.weight.grad + ) + + +def _test_fp8_mlp_tensor_parallelism_eager(mesh: DeviceMesh, size=16): + _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=False) + + +def _test_fp8_mlp_tensor_parallelism_compile(mesh: DeviceMesh, size=16): + _test_fp8_mlp_tensor_parallelism_base(mesh, size, compile=True) + + +if __name__ == "__main__": + # float8 only works on CUDA H100 so we only test cuda and we follow + # other test files to not use TestCase but instead just add the test + # cases in the main func. 
+ device_mesh = setup_distributed() + tests = [ + _test_scaled_mm, + _test_fp8_redistribute, + _test_dtensor_cast_to_fp8, + _test_dtensor_fp8_autograd, + _test_fp8_mlp_tensor_parallelism_eager, + _test_fp8_mlp_tensor_parallelism_compile, + ] + + for test in tqdm(tests, desc="Running tests"): + try: + test(device_mesh) + except Exception as e: + print(f"Test {test.__name__} failed with error: {e}") + raise e + + torch.distributed.destroy_process_group() diff --git a/test/float8/test_dtensor.sh b/test/float8/test_dtensor.sh new file mode 100755 index 000000000..2e38feffe --- /dev/null +++ b/test/float8/test_dtensor.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# terminate script on first error +set -e + +if python -c 'import torch;print(torch.cuda.is_available())' | grep -q "False"; then + echo "Skipping test_dtensor.sh because no CUDA devices are available." + exit +fi + +NCCL_DEBUG=WARN torchrun --nproc_per_node 2 test/float8/test_dtensor.py diff --git a/test/float8/test_everything.sh b/test/float8/test_everything.sh new file mode 100755 index 000000000..d70833323 --- /dev/null +++ b/test/float8/test_everything.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# terminate script on first error +set -e +IS_ROCM=$(rocm-smi --version || true) + +pytest test/float8/test_base.py +pytest test/float8/test_compile.py +pytest test/float8/test_inference_flows.py +pytest test/float8/test_numerics_integration.py + +# These tests do not work on ROCm yet +if [ -z "$IS_ROCM" ] +then +./test/float8/test_fsdp.sh +./test/float8/test_fsdp_compile.sh +./test/float8/test_dtensor.sh +pytest test/float8/test_fsdp2/test_fsdp2.py +fi + +echo "all tests successful" diff --git a/test/float8/test_fsdp.py b/test/float8/test_fsdp.py new file mode 100644 index 000000000..f30878b33 --- /dev/null +++ b/test/float8/test_fsdp.py @@ -0,0 +1,212 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +""" +Test numerics of bf16 versus float8 with FSDP on. At a high level: +1. start with a reference model, with FSDP on +2. run forward + backward + optim for 2 iterations +3. repeat 2 with float8 enabled (2 iterations needed for delayed scaling) +4. 
compare outputs and state dict between (2) and (3), should be close +""" + +import copy +import os +import pytest +import warnings + +import fire + +from torchao.utils import TORCH_VERSION_AFTER_2_4 + +if not TORCH_VERSION_AFTER_2_4: + pytest.skip("Unsupported PyTorch version", allow_module_level=True) + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType +from torchao.float8.float8_linear_utils import ( + convert_to_float8_training, + linear_requires_sync, + sync_float8_amax_and_scale_history, +) +from torchao.float8.float8_utils import compute_error +from torch.distributed.fsdp import ( + FullStateDictConfig, + FullyShardedDataParallel as FSDP, + StateDictType, +) + +torch.manual_seed(0) + +B, M, K, N = 8, 8, 32, 32 +lr = 0.01 +N_ITER = 2 + + +def setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + + # initialize the process group + dist.init_process_group("nccl", rank=rank, world_size=world_size) + + +def cleanup(): + dist.destroy_process_group() + + +def get_model(K, N, base_dtype=torch.float32): + m = nn.Sequential( + nn.Linear(K, N, dtype=base_dtype), + nn.ReLU(), + nn.Linear(N, N, dtype=base_dtype), + nn.ReLU(), + ) + return m + + +# taken from https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html +# and modified +def fsdp_main(rank, world_size, args): + setup(rank, world_size) + torch.cuda.set_device(rank) + + emulate, base_dtype, compile, use_weight_dynamic_scaling = args + model = get_model(K, N, base_dtype=base_dtype).to(rank) + model_fp8 = copy.deepcopy(model) + + scaling_type_weight = ( + ScalingType.DYNAMIC if use_weight_dynamic_scaling else ScalingType.DELAYED + ) + config = Float8LinearConfig( + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + # TODO(future): delete this arg as it's always False + emulate=False, + ) + + # Note: we only iterate over `scaling_type_weight` because FSDP only interacts + # with weights. + convert_to_float8_training( + model_fp8, + config=config, + ) + + # To compile FSDP, we need use_orig_params to True + model = FSDP(model, use_orig_params=True) + model_fp8 = FSDP(model_fp8, use_orig_params=True) + # TODO: The following line doesn't work. We should fix it. 
+ # model = FSDP(torch.compile(model), use_orig_params=True) + + optimizer = torch.optim.SGD(model.parameters(), lr=lr) + optimizer_fp8 = torch.optim.SGD(model_fp8.parameters(), lr=lr) + + # Note: we need two different inputs to properly measure the impact of + # delayed scaling, before the first input uses dynamic scaling to + # populate the buffers + ref_input_global = [ + torch.randn(B, M, K).cuda().to(base_dtype), + torch.randn(B, M, K).cuda().to(base_dtype), + ] + ref_grad_global = [ + torch.randn(B, M, N).cuda().to(base_dtype), + torch.randn(B, M, N).cuda().to(base_dtype), + ] + ref_input_local = [] + ref_grad_local = [] + + # basic distributed data sampling + assert B % world_size == 0 + bsz_local_start = int(rank / world_size * B) + bsz_local_end = int((rank + 1) / world_size * B) + for idx in range(N_ITER): + ref_input_local.append( + ref_input_global[idx][bsz_local_start:bsz_local_end].to(rank) + ) + ref_grad_local.append( + ref_grad_global[idx][bsz_local_start:bsz_local_end].to(rank) + ) + + sync_float8_func = sync_float8_amax_and_scale_history + if compile: + sync_float8_func = torch.compile(sync_float8_amax_and_scale_history) + + def forward_backward(model, optim, is_fp8, i): + optim.zero_grad() + y_local = model(ref_input_local[i]) + y_local.backward(ref_grad_local[i]) + if is_fp8 and linear_requires_sync(config): + sync_float8_func(model) + optim.step() + return y_local + + for i in range(N_ITER): + # We first run one iteration without compile, as a workaround to compile float8 layer. + # In the first iter, float8 layers go to the branches of "self.is_amax_initialized == False" + # After that, float8 layers go the the branches of "self.is_amax_initialized == True" + # TODO: Need to fix compile to run wihtout this workaround. + if i == 1 and compile: + model = torch.compile(model) + model_fp8 = torch.compile(model_fp8) + y_local = forward_backward(model, optimizer, is_fp8=False, i=i) + y_local_fp8 = forward_backward(model_fp8, optimizer_fp8, is_fp8=True, i=i) + local_sqnr = compute_error(y_local, y_local_fp8) # noqa: F841 + + # get global y + y_global = [ + torch.zeros(*y_local.shape, dtype=base_dtype).to(rank) + for r in range(world_size) + ] + dist.all_gather(y_global, y_local) + y_global = torch.cat(y_global, dim=0) + y_global_fp8 = [ + torch.zeros(*y_local_fp8.shape, dtype=base_dtype).to(rank) + for r in range(world_size) + ] + dist.all_gather(y_global_fp8, y_local_fp8) + y_global_fp8 = torch.cat(y_global_fp8, dim=0) + if rank == 0: + sqnr = compute_error(y_global, y_global_fp8) + assert sqnr > 15.0, f"SQNR of {sqnr} is too low" + + # get global state dict + # https://pytorch.org/tutorials/intermediate/FSDP_adavnced_tutorial.html + dist.barrier() + save_policy = FullStateDictConfig(offload_to_cpu=True, rank0_only=True) + with FSDP.state_dict_type(model, StateDictType.FULL_STATE_DICT, save_policy): + cpu_state = model.state_dict() + with FSDP.state_dict_type(model_fp8, StateDictType.FULL_STATE_DICT, save_policy): + cpu_state_fp8 = model_fp8.state_dict() + if rank == 0: + for k, v1 in cpu_state.items(): + v2 = cpu_state_fp8[k] + v1, v2 = v1.cpu(), v2.cpu() + sqnr = compute_error(v1, v2) + assert sqnr > 15.0, f"SQNR of {sqnr} is too low, k: {k}, v1: {v1}, v2: {v2}" + + cleanup() + + +def run(compile_fsdp: bool = False, use_weight_dynamic_scaling: bool = False): + base_dtype = torch.bfloat16 + + emulate = False + if not torch.cuda.is_available(): + warnings.warn("CUDA not available, running in emulation_mode") + emulate = True + elif torch.cuda.get_device_capability() < (9, 
0): + warnings.warn( + f"CUDA capability {torch.cuda.get_device_capability()} < (9.0), running in emulation mode" + ) + emulate = True + + WORLD_SIZE = torch.cuda.device_count() + args = (emulate, base_dtype, compile_fsdp, use_weight_dynamic_scaling) + mp.spawn(fsdp_main, args=(WORLD_SIZE, args), nprocs=WORLD_SIZE, join=True) + + +if __name__ == "__main__": + fire.Fire(run) diff --git a/test/float8/test_fsdp.sh b/test/float8/test_fsdp.sh new file mode 100755 index 000000000..3ff19d917 --- /dev/null +++ b/test/float8/test_fsdp.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +# terminate script on first error +set -e + +launch() { + echo "launching compile_fsdp $COMPILE, use_weight_dynamic_scaling $USE_WEIGHT_DYNAMIC_SCALING" + + # the NCCL_DEBUG setting is to avoid log spew + # the CUDA_VISIBLE_DEVICES setting is for easy debugging + NCCL_DEBUG=WARN CUDA_VISIBLE_DEVICES=0,1 python test/float8/test_fsdp.py \ + --compile_fsdp $COMPILE --use_weight_dynamic_scaling $USE_WEIGHT_DYNAMIC_SCALING + + echo "✅ All Tests Passed ✅" +} + +if python -c 'import torch;print(torch.cuda.is_available())' | grep -q "False"; then + echo "Skipping test_fsdp.sh because no CUDA devices are available." + exit +fi + +# COMPILE, USE_WEIGHT_DYNAMIC_SCALING +for i in False,False False,True True,False True,True +do + IFS=","; set -- $i; + COMPILE=$1; USE_WEIGHT_DYNAMIC_SCALING=$2 + launch +done diff --git a/test/float8/test_fsdp2/fsdp2_common.py b/test/float8/test_fsdp2/fsdp2_common.py new file mode 100644 index 000000000..333206ba4 --- /dev/null +++ b/test/float8/test_fsdp2/fsdp2_common.py @@ -0,0 +1,89 @@ +import contextlib +from typing import List, Optional + +import torchao.float8.config as config + +import torch +import torch.distributed as dist +import torch.nn as nn +from torchao.float8.config import Float8LinearConfig, ScalingType +from torchao.float8.float8_linear_utils import ( + linear_requires_sync, + sync_float8_amax_and_scale_history, +) +from torchao.float8.fsdp_utils import precompute_float8_dynamic_scale_for_fsdp + + +def check_parity_no_mp( + test_cls, + ref_model: nn.Module, + ref_optim: torch.optim.Optimizer, + fsdp_model: nn.Module, + fsdp_optim: torch.optim.Optimizer, + local_inp: torch.Tensor, + precompute: bool = False, + config: Optional[Float8LinearConfig] = None, + compile_transformer_block: bool = False, +): + # TODO(before land): reorder args and make config not optional + for iter_idx in range(10): + losses: List[torch.Tensor] = [] + for model, optim in ((ref_model, ref_optim), (fsdp_model, fsdp_optim)): + optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + losses.append(model(local_inp).sum()) + losses[-1].backward() + if model is ref_model: + for param in model.parameters(): + dist.all_reduce(param.grad) + param.grad.div_(dist.get_world_size()) + + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(model) + + optim.step() + if ( + model is fsdp_model + and precompute + and config.cast_config_weight.scaling_type is ScalingType.DYNAMIC + ): + precompute_float8_dynamic_scale_for_fsdp(model) + + if compile_transformer_block: + test_cls.assertEqual(losses[0], losses[1], atol=1e-4, rtol=1e-4) + else: + test_cls.assertEqual(losses[0], losses[1]) + + +def check_parity_bf16_mp( + test_cls, + ref_model: nn.Module, + ref_model_bf16: nn.Module, + ref_optim: torch.optim.Optimizer, + fsdp_model: nn.Module, + fsdp_optim: torch.optim.Optimizer, + local_inp: torch.Tensor, +): + for iter_idx in range(10): + losses: List[torch.Tensor] = [] + for model, optim in ( + (ref_model_bf16, ref_optim), + 
(fsdp_model, fsdp_optim), + ): + optim.zero_grad(set_to_none=(iter_idx % 2 == 0)) + losses.append(model(local_inp).sum()) + losses[-1].backward() + if model is ref_model_bf16: + for param_bf16, param_fp32 in zip( + ref_model_bf16.parameters(), ref_model.parameters() + ): + dist.all_reduce(param_bf16.grad) + param_bf16.grad.div_(dist.get_world_size()) + param_fp32.grad = param_bf16.grad.float() + param_bf16.grad = None + # TODO(future): add amax syncing once delayed scaling is supported + optim.step() + for param_fp32, param_bf16 in zip( + ref_model.parameters(), ref_model_bf16.parameters() + ): + param_bf16.detach().copy_(param_fp32) + test_cls.assertEqual(losses[0], losses[1]) diff --git a/test/float8/test_fsdp2/test_fsdp2.py b/test/float8/test_fsdp2/test_fsdp2.py new file mode 100644 index 000000000..7004b3a1c --- /dev/null +++ b/test/float8/test_fsdp2/test_fsdp2.py @@ -0,0 +1,561 @@ +import copy +import itertools +import pytest +import threading +import unittest +from typing import Any, List + +from torchao.utils import TORCH_VERSION_AFTER_2_4 + +if not TORCH_VERSION_AFTER_2_4: + pytest.skip("Unsupported PyTorch version", allow_module_level=True) + + +import torch +import torch._dynamo.testing +import torch.distributed as dist +import torch.nn as nn +from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType +from torchao.float8.float8_linear_utils import convert_to_float8_training +from torchao.float8.fsdp_utils import WeightWithDynamicFloat8CastTensor +from fsdp2_common import check_parity_bf16_mp, check_parity_no_mp +from torch.distributed._composable.fsdp import fully_shard, MixedPrecisionPolicy +from torch.distributed._tensor import DTensor +from torch.testing._internal.common_cuda import TEST_CUDA +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu +from torch.testing._internal.common_fsdp import ( + FSDPTest, + FSDPTestMultiThread, + MLP, + patch_all_gather, +) +from torch.testing._internal.common_utils import run_tests +from torch.testing._internal.distributed._tensor.common_dtensor import ( + ModelArgs, + Transformer, + TransformerBlock, +) + +is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0) +if not is_H100: + pytest.skip("Unsupported CUDA device capability version", allow_module_level=True) + +class TestFloat8Common: + def broadcast_module(self, module: nn.Module) -> None: + # Broadcast for multi-threaded process group tests since seed is per + # process, not per thread + for param in module.parameters(): + dist.broadcast(param, src=0) + + def init_single_module(self) -> nn.Module: + torch.manual_seed(42) + module = nn.Linear(16, 16, device="cuda") + self.broadcast_module(module) + return module + + def init_multi_module(self) -> nn.Module: + torch.manual_seed(42) + module = nn.Sequential(*[MLP(16, device="cuda") for _ in range(3)]) + self.broadcast_module(module) + return module + + def init_transformer(self, weight_tying: bool) -> nn.Module: + torch.manual_seed(42) + args = ModelArgs( + n_layers=3, + dim=768, + n_heads=12, + dropout_p=0.0, + weight_tying=weight_tying, + vocab_size=32, + ) + module = Transformer(args).cuda() + self.broadcast_module(module) + return module + + def get_local_inp(self, dtype: torch.dtype = torch.float32): + torch.manual_seed(42) + global_inp = torch.randn((16 * self.world_size, 16), device="cuda", dtype=dtype) + dist.broadcast(global_inp, src=0) + return global_inp.view(self.world_size, -1)[self.rank].view(16, 16) + + +class TestFloat8MultiProcess(FSDPTest, 
TestFloat8Common): + @property + def world_size(self) -> int: + return min(torch.cuda.device_count(), 2) + + @skip_if_lt_x_gpu(2) + def test_transformer_parity(self): + self.run_subtests( + { + "enable_fsdp_float8_all_gather": [False, True], + "precompute": [False, True], + "scaling_type_weight": [ + ScalingType.DYNAMIC, + ScalingType.DELAYED, + ], + "compile_transformer_block": [False, True], + }, + self._test_transformer_parity, + ) + + def _test_transformer_parity( + self, + enable_fsdp_float8_all_gather: bool, + precompute: bool, + scaling_type_weight: ScalingType, + compile_transformer_block: bool, + ): + if not enable_fsdp_float8_all_gather and precompute: + return + elif scaling_type_weight is ScalingType.DELAYED and precompute: + return + + # NOTE: Weight-tying does not compose with fp8 all-gather because the + # embedding weight and output linear weight are tied but only the + # latter uses fp8 compute. With fp8 all-gather, FSDP would pre-cast to + # fp8 for that tied weight, incorrectly using fp8 for the embedding. + weight_tying = not enable_fsdp_float8_all_gather + module = self.init_transformer(weight_tying=weight_tying).cuda() + ref_module = copy.deepcopy(module) + float8_linear_config1 = Float8LinearConfig( + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + ) + convert_to_float8_training( + ref_module, + config=float8_linear_config1, + ) + if compile_transformer_block: + for layer_id, transformer_block in ref_module.layers.named_children(): + transformer_block = torch.compile(transformer_block, dynamic=False) + ref_module.layers.register_module(layer_id, transformer_block) + float8_linear_config2 = Float8LinearConfig( + enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather, + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + ) + convert_to_float8_training( + module, + config=float8_linear_config2, + ) + for layer_id, transformer_block in module.layers.named_children(): + if compile_transformer_block: + transformer_block = torch.compile(transformer_block, dynamic=False) + fully_shard(transformer_block) + module.layers.register_module(layer_id, transformer_block) + fully_shard(module) + ref_optim = torch.optim.Adam(ref_module.parameters(), lr=1e-2) + optim = torch.optim.Adam(module.parameters(), lr=1e-2, foreach=True) + local_inp = torch.randint( + 0, ref_module.tok_embeddings.weight.size(0), (16, 16), device="cuda" + ) + check_parity_no_mp( + self, + ref_module, + ref_optim, + module, + optim, + local_inp, + precompute, + config=float8_linear_config2, + compile_transformer_block=compile_transformer_block, + ) + + @skip_if_lt_x_gpu(2) + def test_transformer_memory(self): + """Tests peak active memory in the forward and backward passes.""" + for enable_fsdp_float8_all_gather in [False, True]: + self._test_transformer_memory(enable_fsdp_float8_all_gather) + + def _test_transformer_memory(self, enable_fsdp_float8_all_gather: bool): + torch.manual_seed(42) + # Pre-run a linear forward (gemm and bias) and backward (gemm) to + # allocate the cuBLAS workspaces before measuring the memory usage + # since the workspace size can differ between hardwares + lin = torch.nn.Linear(768, 768, device="cuda") + inp = torch.randn(1, 768, device="cuda") + lin(inp).sum().backward() + torch.cuda.empty_cache() + base_mem_mb = self._get_peak_active_memory_mb() + + vocab_size = 32 + model_args = ModelArgs( + vocab_size=vocab_size, + n_layers=3, + dim=768, + n_heads=12, + weight_tying=False, + ) + model = Transformer(model_args) + # Emulate the fp8 matmul to 
bypass the scaled matmul op's divisibility + # requirement to use a smaller activation size + float8_linear_config = Float8LinearConfig( + enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather, + emulate=True, + ) + convert_to_float8_training(model, config=float8_linear_config) + model_unsharded_numel = sum(p.numel() for p in model.parameters()) + model_sharded_numel = (model_unsharded_numel + 1) // 2 + block_lin_weight_numel = 0 + block_other_numel = 0 + for module in model.layers[0].modules(): + for param in module.parameters(recurse=False): + if isinstance(module, nn.Linear): + block_lin_weight_numel += param.numel() + else: + block_other_numel += param.numel() + non_block_numel = round( + sum(p.numel() for p in model.tok_embeddings.parameters()) + + sum(p.numel() for p in model.pos_embeddings.parameters()) + + sum(p.numel() for p in model.norm.parameters()) + + sum(p.numel() for p in model.output.parameters()) + ) + for module in model.modules(): + if isinstance(module, TransformerBlock): + fully_shard(module) + fully_shard(model) + + # Init: Each module is moved to GPU before sharding parameters + peak_mem_mb = self._get_peak_active_memory_mb() + curr_mem_mb = self._get_curr_active_memory_mb() + init_mem_mb = ( + (model_sharded_numel + block_lin_weight_numel + block_other_numel) * 4 / 1e6 + ) + # Allow for some buffer for the peak memory since original parameters + # are not freed until a `fully_shard` call returns + buffer_mb = 4 + self.assertLessEqual(peak_mem_mb - base_mem_mb, init_mem_mb + buffer_mb) + self.assertLessEqual(curr_mem_mb - base_mem_mb, init_mem_mb) + + # Use a small input to minimize activation memory usage + inp = torch.randint(0, vocab_size, (1, 4), device="cuda") + + # Forward: + loss = model(inp) + mem_mb = self._get_peak_active_memory_mb() + # Allow for some buffer for fragmentation/activations (where this + # number is kept much smaller than the actual memory usage, which is on + # the order of 100-200+ MB) + buffer_mb = 16 + if enable_fsdp_float8_all_gather: + # Non-block parameters (fp32), 3x block non-linear-weight + # parameters (fp32) and block linear-weight parameters (fp8) + # (current all-gather, copy-out, and next all-gather), and other + expected_mem_mb = ( + (non_block_numel * 4) + + 3 * (block_lin_weight_numel + block_other_numel * 4) + ) / 1e6 + buffer_mb + else: + # Non-block parameters (fp32), 3x block parameters (fp32) + # (current all-gather, copy-out, and next all-gather), Nx block + # linear-weight parameters (fp8) for N blocks (saved by autograd), + # and other + expected_mem_mb = ( + (non_block_numel + 3 * (block_lin_weight_numel + block_other_numel)) * 4 + + model_args.n_layers * block_lin_weight_numel + ) / 1e6 + buffer_mb + # Sharded parameters + expected_mem_mb += model_sharded_numel * 4 / 1e6 + self.assertLessEqual(mem_mb, expected_mem_mb + base_mem_mb) + + # Backward: + loss.sum().backward() + mem_mb = self._get_peak_active_memory_mb() + if enable_fsdp_float8_all_gather: + # Non-block parameters (fp32), 2x block non-linear weight + # parameters (fp32) and block linear-weight parameters (fp8) + # (current copy-out and next all-gather), 1x block gradients (fp32) + expected_mem_mb = ( + (non_block_numel * 4) + + 2 * (block_lin_weight_numel + block_other_numel * 4) + + 1 * (block_lin_weight_numel + block_other_numel) * 4 + ) / 1e6 + buffer_mb + else: + # Non-block parameters (fp32), 3x block parameters (fp32) (current + # copy-out, next all-gather, current gradients) + expected_mem_mb = ( + non_block_numel + 3 * 
(block_lin_weight_numel + block_other_numel) * 4 + ) * 4 / 1e6 + buffer_mb + # 2x sharded parameters/gradients + expected_mem_mb += 2 * model_sharded_numel * 4 / 1e6 + self.assertLessEqual(mem_mb, expected_mem_mb + base_mem_mb) + + def _get_peak_active_memory_mb(self) -> int: + mem_stats = torch.cuda.memory_stats() + return round(mem_stats["active_bytes.all.peak"] / 1e6) + + def _get_curr_active_memory_mb(self) -> int: + mem_stats = torch.cuda.memory_stats() + return round(mem_stats["active_bytes.all.current"] / 1e6) + + +class TestFloat8MultiThread(FSDPTestMultiThread, TestFloat8Common): + @property + def world_size(self) -> int: + return 2 + + @unittest.skipIf(not TEST_CUDA, "no cuda") + def test_weight_subclass_dynamic(self): + tensor_cls = WeightWithDynamicFloat8CastTensor + # Check for a single FSDP paramter group + module_fp32 = self.init_single_module() + float8_linear_config = Float8LinearConfig( + enable_fsdp_float8_all_gather=True, + emulate=True, + ) + module = convert_to_float8_training( + module_fp32, + config=float8_linear_config, + ) + self.assertIsInstance(module.weight, tensor_cls) + fully_shard(module) + for param_name, param in module.named_parameters(): + self.assertIsInstance(param, DTensor) + if "weight" in param_name: + self.assertIsInstance(param.to_local(), tensor_cls) + + # Check for multiple FSDP paramter groups + module = self.init_multi_module() + module = convert_to_float8_training( + module, + config=float8_linear_config, + ) + for param_name, param in module.named_parameters(): + if "weight" in param_name: + self.assertIsInstance(param, tensor_cls) + for mlp in module: + fully_shard(mlp) + fully_shard(module) + for param_name, param in module.named_parameters(): + self.assertIsInstance(param, DTensor) + if "weight" in param_name: + self.assertIsInstance(param.to_local(), tensor_cls) + + @unittest.skipIf(not TEST_CUDA, "no cuda") + def test_fp8_fp32_all_gather_dynamic_comm_size(self): + """ + Tests that fp8 all-gather with dynamic scaling communicates the + expected number of bytes. 
+ """ + orig_all_gather = dist.all_gather_into_tensor + all_gather_sizes: List[int] = [] + lock = threading.Lock() + + def all_gather(*args: Any, **kwargs: Any): + nonlocal all_gather_sizes + if len(args) > 0: + output = args[0] + elif "output_tensor" in kwargs: + output = kwargs["output_tensor"] + else: + raise AssertionError( + f"Cannot get all-gather output from\nargs: {args}\nkwargs: {kwargs}" + ) + with lock: + all_gather_sizes.append(output.numel() * output.itemsize) + return orig_all_gather(*args, **kwargs) + + def get_expected_all_gather_size(module: nn.Module): + size = 0 + for param_name, param in module.named_parameters(): + bytes_per_numel = 1 if "weight" in param_name else param.itemsize + size += param.numel() * bytes_per_numel + return size + + # - Check for a single FSDP parameter group + module_fp32 = self.init_single_module() + ref_module = copy.deepcopy(module_fp32) + float8_linear_config = Float8LinearConfig( + enable_fsdp_float8_all_gather=True, + ) + module_fp32 = convert_to_float8_training( + module_fp32, config=float8_linear_config + ) + module = module_fp32 + fully_shard(module) + local_inp = self.get_local_inp() + expected_all_gather_size = get_expected_all_gather_size(ref_module) + with patch_all_gather(all_gather): + out = module(local_inp) + # For MPTG, one rank runs all all-gathers, each of the same size + if all_gather_sizes: + self.assertEqual(len(all_gather_sizes), self.world_size) + self.assertEqual( + all_gather_sizes, [expected_all_gather_size] * self.world_size + ) + all_gather_sizes.clear() + # Force-reshard the module to check the backward all-gather + module.reshard() + with patch_all_gather(all_gather): + out.sum().backward() + if all_gather_sizes: + self.assertEqual(len(all_gather_sizes), self.world_size) + self.assertEqual( + all_gather_sizes, [expected_all_gather_size] * self.world_size + ) + all_gather_sizes.clear() + + # - Check for multiple FSDP parameter groups + module = self.init_multi_module() + ref_module = copy.deepcopy(module) + module = convert_to_float8_training(module, config=float8_linear_config) + for submodule in module: + fully_shard(submodule) + fully_shard(module) + expected_all_gather_sizes = ( + get_expected_all_gather_size(submodule) for submodule in module + ) + with patch_all_gather(all_gather): + out = module(local_inp) + if all_gather_sizes: + self.assertEqual(len(all_gather_sizes), self.world_size * len(module)) + self.assertEqual( + all_gather_sizes, + [s for s in expected_all_gather_sizes for _ in range(self.world_size)], + ) + + @unittest.skipIf(not TEST_CUDA, "no cuda") + def test_fp32_fp8_single_module_parity(self): + """ + Tests numeric parity for fp32 parameters with fp8 computation with a + single module/FSDP communication group. 
+ """ + choices = itertools.product( + [False, True], + [ScalingType.DYNAMIC, ScalingType.DELAYED], + ) + for enable_fsdp_float8_all_gather, scaling_type_weight in choices: + float8_linear_config1 = Float8LinearConfig( + enable_fsdp_float8_all_gather=False, + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + ) + float8_linear_config2 = Float8LinearConfig( + enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather, + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + ) + module_fp32 = self.init_single_module() + ref_module = copy.deepcopy(module_fp32) + ref_module = convert_to_float8_training( + ref_module, + config=float8_linear_config1, + ) + ref_module = ref_module.cuda() + module = convert_to_float8_training( + module_fp32, + config=float8_linear_config2, + ) + fully_shard(module) + ref_optim = torch.optim.Adam(ref_module.parameters(), lr=1e-2) + optim = torch.optim.Adam(module.parameters(), lr=1e-2, foreach=True) + local_inp = self.get_local_inp() + check_parity_no_mp( + self, + ref_module, + ref_optim, + module, + optim, + local_inp, + config=float8_linear_config2, + ) + + @unittest.skipIf(not TEST_CUDA, "no cuda") + def test_fp32_fp8_multi_module_parity(self): + """ + Tests numeric parity for fp32 parameters with fp8 computation with + multiple modules/FSDP communication groups. + """ + choices = itertools.product( + [False, True], + [ScalingType.DYNAMIC, ScalingType.DELAYED], + ) + for enable_fsdp_float8_all_gather, scaling_type_weight in choices: + float8_linear_config1 = Float8LinearConfig( + enable_fsdp_float8_all_gather=False, + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + ) + float8_linear_config2 = Float8LinearConfig( + enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather, + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + ) + module = self.init_multi_module().cuda() + ref_module = copy.deepcopy(module) + ref_module = convert_to_float8_training( + ref_module, + config=float8_linear_config1, + ) + module = convert_to_float8_training( + module, + config=float8_linear_config2, + ) + for submodule in module: + fully_shard(submodule) + fully_shard(module) + ref_optim = torch.optim.Adam(ref_module.parameters(), lr=1e-2) + optim = torch.optim.Adam(module.parameters(), lr=1e-2, foreach=True) + local_inp = self.get_local_inp() + check_parity_no_mp( + self, + ref_module, + ref_optim, + module, + optim, + local_inp, + config=float8_linear_config2, + ) + + @unittest.skipIf(not TEST_CUDA, "no cuda") + def test_bf16_mp_fp8_dynamic_multi_parity(self): + """ + Tests numeric parity for fp32 parameters with FSDP's bf16 mixed + precision and fp8 computation with multiple modules/FSDP communication + groups. Parameters are all-gathered in bf16 before being cast to fp8. + """ + # NOTE: We cannot test easily with fp8 all-gather because then the scale + # is computed using the fp32 sharded parameters, not the bf16 unsharded + # parameters, changing the numerics. 
+ module = self.init_multi_module() + ref_module_bf16 = copy.deepcopy(module).to(torch.bfloat16) + float8_config = Float8LinearConfig(emulate=True) + ref_module_bf16 = convert_to_float8_training( + ref_module_bf16, + config=float8_config, + ) + ref_module_fp32 = copy.deepcopy(module).cuda() + module = convert_to_float8_training(module, config=float8_config) + mp_policy = MixedPrecisionPolicy(param_dtype=torch.bfloat16) + for mlp in module: + fully_shard(mlp, mp_policy=mp_policy) + fully_shard(module, mp_policy=mp_policy) + check_parity_bf16_mp( + self, + ref_module_fp32, + ref_module_bf16, + torch.optim.Adam(ref_module_fp32.parameters(), lr=1e-2), + module, + torch.optim.Adam(module.parameters(), lr=1e-2, foreach=True), + self.get_local_inp(torch.bfloat16), + ) + + @unittest.skipIf(not TEST_CUDA, "no cuda") + def test_delayed_scaling_inplace_update(self): + """ + Verify that `WeightWithDelayedFloat8CastTensor` updates buffers inplace + """ + module = self.init_single_module() + float8_linear_config = Float8LinearConfig( + enable_fsdp_float8_all_gather=True, + cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED), + ) + m_fp8 = convert_to_float8_training( + module, + config=float8_linear_config, + ) + + fp8_amax_weight_old = m_fp8.fp8_amax_weight.clone().detach() + dummy_mesh = None + data, scale = m_fp8.weight.fsdp_pre_all_gather(dummy_mesh) + self.assertNotEqual(fp8_amax_weight_old.item(), m_fp8.fp8_amax_weight.item()) + + +if __name__ == "__main__": + run_tests() diff --git a/test/float8/test_fsdp_compile.py b/test/float8/test_fsdp_compile.py new file mode 100644 index 000000000..f4ca160fd --- /dev/null +++ b/test/float8/test_fsdp_compile.py @@ -0,0 +1,139 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
+ +""" +Test autocast + torch.compile + FSDP + Float8Linear +""" + +import os +import warnings + +import fire + +import pytest + +from torchao.utils import TORCH_VERSION_AFTER_2_4 + +if not TORCH_VERSION_AFTER_2_4: + pytest.skip("Unsupported PyTorch version", allow_module_level=True) + +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torch.nn as nn +from torchao.float8 import Float8LinearConfig +from torchao.float8.config import CastConfig, ScalingType +from torchao.float8.float8_linear_utils import ( + convert_to_float8_training, + sync_float8_amax_and_scale_history, +) +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP + +torch.manual_seed(0) + +B, M, K, N = 8, 8, 32, 32 +lr = 0.01 +N_ITER = 1 + + +def setup(rank, world_size): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = "12355" + + # initialize the process group + dist.init_process_group("nccl", rank=rank, world_size=world_size) + + +def cleanup(): + dist.destroy_process_group() + + +def get_model(K, N, is_fp8, emulate, base_dtype=torch.float32): + # composability of torch.compile + FSDP + autocast + Float8Linear + # as fo 2023-12-30 + + # without any changes to the Float8Linear, we get this error: + # https://gist.github.com/vkuzo/3bcb81806cc92f99ac0b9c5fdf287730 + + # if we initialize Float8Linear with is_amax_initialized=True and + # amax_and_scale_synced=True, we get + # https://gist.github.com/vkuzo/ed8e168fd9f7463f1fce34301334ab55 + # to get around this, we can disable amax init + config = Float8LinearConfig( + enable_amax_init=False, + cast_config_input=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED), + cast_config_grad_output=CastConfig(scaling_type=ScalingType.DELAYED), + emulate=emulate, + ) + + m = nn.Sequential( + nn.Linear(K, N, dtype=base_dtype), + nn.ReLU(), + ) + convert_to_float8_training( + m, + config=config, + ) + return m + + +# taken from https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html +# and modified +def fsdp_main(rank, world_size, args): + setup(rank, world_size) + torch.cuda.set_device(rank) + + (emulate,) = args + + # finally, if we remove the usage of self.bias_dtype, then + # things work e2e. Note that FSDP does not support full-graph compile + # regardless of float8. 
+ + model = get_model(K, N, is_fp8=True, emulate=emulate, base_dtype=torch.bfloat16).to( + rank + ) + + # To compile FSDP, we need use_orig_params to True + model = FSDP(model, use_orig_params=True) + + optimizer = torch.optim.SGD(model.parameters(), lr=lr * world_size) + input_local = torch.randn(B, M, K, N, device="cuda") + sync_float8_func = torch.compile(sync_float8_amax_and_scale_history) + + model = torch.compile(model) + + for _iter in range(N_ITER): + optimizer.zero_grad() + with torch.autocast("cuda"): + y_local = model(input_local) + y_local.sum().backward() + sync_float8_func(model) + optimizer.step() + + print("done!") + cleanup() + + +def run(): + emulate = False + if not torch.cuda.is_available(): + warnings.warn("CUDA not available, running in emulation_mode", stacklevel=2) + emulate = True + elif torch.cuda.get_device_capability() < (9, 0): + warnings.warn( + f"CUDA capability {torch.cuda.get_device_capability()} < (9.0), running in emulation mode", + stacklevel=2, + ) + emulate = True + + WORLD_SIZE = torch.cuda.device_count() + args = (emulate,) + mp.spawn(fsdp_main, args=(WORLD_SIZE, args), nprocs=WORLD_SIZE, join=True) + + +if __name__ == "__main__": + fire.Fire(run) diff --git a/test/float8/test_fsdp_compile.sh b/test/float8/test_fsdp_compile.sh new file mode 100755 index 000000000..666136aba --- /dev/null +++ b/test/float8/test_fsdp_compile.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# terminate script on first error +set -e +if python -c 'import torch;print(torch.cuda.is_available())' | grep -q "False"; then + echo "Skipping test_fsdp_compile.sh because no CUDA devices are available." + exit +fi + +# Code to be executed if CUDA devices are available +NCCL_DEBUG=WARN CUDA_VISIBLE_DEVICES=0,1 python test/float8/test_fsdp_compile.py diff --git a/test/float8/test_inference_flows.py b/test/float8/test_inference_flows.py new file mode 100644 index 000000000..c76a43df0 --- /dev/null +++ b/test/float8/test_inference_flows.py @@ -0,0 +1,245 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
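+# The tests below convert trained MLPs to float8 inference linears (dynamic,
+# static, and weight-only activation casting) and check output SQNR against
+# the original model, mostly under torch.compile with the eager and inductor
+# backends.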
+import copy +import io +import random +import unittest + +import pytest + +from torchao.utils import TORCH_VERSION_AFTER_2_4 + +if not TORCH_VERSION_AFTER_2_4: + pytest.skip("Unsupported PyTorch version", allow_module_level=True) + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchao.float8.config import ScalingType +from torchao.float8.float8_linear_utils import convert_to_float8_training +from torchao.float8.float8_tensor import Float8Tensor +from torchao.float8.float8_utils import compute_error +from torchao.float8.inference import ( + ActivationCasting, + Float8InferenceLinear, + QuantConfig, + quantize_to_float8, +) + + +random.seed(0) +torch.manual_seed(0) + +is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0) + + +class FeedForward(nn.Module): + def __init__(self) -> None: + super().__init__() + self.w1 = nn.Linear(4096, 14336, bias=False) + self.w3 = nn.Linear(4096, 14336, bias=False) + self.w2 = nn.Linear(14336, 4096, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + def reset_parameters(self): + for m in self.modules(): + if isinstance(m, nn.Linear): + m.reset_parameters() + + +class TestHPTrainToFP8LinearInference: + def base_test_mlp_transform(self, base_mlp, quantized_mlp, input_tensor): + with torch.no_grad(): + base_output = base_mlp(input_tensor) + transformed_output = quantized_mlp(input_tensor) + + # Compute and check SQNR + sqnr = compute_error(base_output, transformed_output) + assert sqnr.item() > 20, f"SQNR is too low: {sqnr.item()} dB" + + @pytest.mark.parametrize("compile_backend", ["eager", "inductor"]) + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32]) + @unittest.skipIf( + not torch.cuda.is_available() or not is_H100, + "CUDA not available or on non H100 machine", + ) + def test_dynamic_fp8_mlp(self, compile_backend, dtype): + original_mlp = FeedForward().to("cuda", dtype=dtype) + original_mlp.reset_parameters() + + dynamic_fp8_mlp = copy.deepcopy(original_mlp) + + quant_config = QuantConfig(ActivationCasting.DYNAMIC) + quantize_to_float8(dynamic_fp8_mlp, quant_config) + + batch_size = 4 + num_tokens = 1024 + embedding_dim = 4096 + + input_tensor = torch.randn( + batch_size, num_tokens, embedding_dim, device="cuda", dtype=dtype + ) + + # Compile the models + compiled_original_mlp = torch.compile( + original_mlp, backend=compile_backend, fullgraph=True + ) + compiled_dynamic_fp8_mlp = torch.compile( + dynamic_fp8_mlp, backend=compile_backend, fullgraph=True + ) + + self.base_test_mlp_transform( + compiled_original_mlp, compiled_dynamic_fp8_mlp, input_tensor + ) + + @pytest.mark.parametrize("compile_backend", ["eager", "inductor"]) + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32]) + @unittest.skipIf( + not torch.cuda.is_available() or not is_H100, + "CUDA not available or on non H100 machine", + ) + def test_static_fp8_mlp(self, compile_backend, dtype): + original_mlp = FeedForward().to("cuda", dtype=dtype) + original_mlp.reset_parameters() + + static_fp8_mlp = copy.deepcopy(original_mlp) + quant_config = QuantConfig( + ActivationCasting.STATIC, + static_quantization_scale=torch.tensor( + [1.0], device="cuda", dtype=torch.float32 + ), + ) + quantize_to_float8(static_fp8_mlp, quant_config) + + batch_size = 4 + num_tokens = 1024 + embedding_dim = 4096 + + input_tensor = torch.randn( + batch_size, num_tokens, embedding_dim, device="cuda", dtype=dtype + ) + + # Compile the models + 
compiled_original_mlp = torch.compile( + original_mlp, backend=compile_backend, fullgraph=True + ) + compiled_static_fp8_mlp = torch.compile( + static_fp8_mlp, backend=compile_backend, fullgraph=True + ) + + self.base_test_mlp_transform( + compiled_original_mlp, compiled_static_fp8_mlp, input_tensor + ) + + @pytest.mark.parametrize("compile_backend", ["eager", "inductor"]) + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32]) + @unittest.skipIf( + not torch.cuda.is_available() or not is_H100, + "CUDA not available or on non H100 machine", + ) + def test_weight_only_fp8_mlp(self, compile_backend, dtype): + original_mlp = FeedForward().to("cuda", dtype=dtype) + original_mlp.reset_parameters() + + static_fp8_mlp = copy.deepcopy(original_mlp) + quant_config = QuantConfig(ActivationCasting.WEIGHT_ONLY) + quantize_to_float8(static_fp8_mlp, quant_config) + + batch_size = 4 + num_tokens = 1024 + embedding_dim = 4096 + + input_tensor = torch.randn( + batch_size, num_tokens, embedding_dim, device="cuda", dtype=dtype + ) + + # Compile the models + compiled_original_mlp = torch.compile( + original_mlp, backend=compile_backend, fullgraph=True + ) + compiled_static_fp8_mlp = torch.compile( + static_fp8_mlp, backend=compile_backend, fullgraph=True + ) + + self.base_test_mlp_transform( + compiled_original_mlp, compiled_static_fp8_mlp, input_tensor + ) + + +class TestFP8TrainToFP8LinearInference: + def train(self, model: nn.Module, dtype: torch.dtype): + model.train() + optimizer = torch.optim.SGD(model.parameters(), lr=0.001) + criterion = nn.MSELoss() + target_tensor = torch.randn(4, 1024, 4096, device="cuda", dtype=dtype) + for _ in range(10): + input_tensor = torch.randn(4, 1024, 4096, device="cuda", dtype=dtype) + optimizer.zero_grad() + output = model(input_tensor) + loss = criterion(output, target_tensor) + loss.backward() + optimizer.step() + model.eval() + return model + + @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float32]) + @unittest.skipIf( + not torch.cuda.is_available() or not is_H100, + "CUDA not available or on non H100 machine", + ) + def test_fp8_save_and_load(self, dtype: torch.dtype): + # Initialize FP8 model + fp8_mlp = FeedForward().to("cuda", dtype=torch.float32) + fp8_mlp.reset_parameters() + convert_to_float8_training(fp8_mlp) + + # Train the model + self.train(fp8_mlp, dtype) + + # Generate input tensor and original out + input_tensor = torch.randn(4, 1024, 4096, device="cuda", dtype=dtype) + og_out = fp8_mlp(input_tensor) + + # Save model state dict + buffer = io.BytesIO() + torch.save(fp8_mlp.state_dict(), buffer) + + # Reset buffer position to the beginning + buffer.seek(0) + + # Later on you load the model, will be w/ Float8Linear on meta device + with torch.device("meta"): + new_fp8_mlp = FeedForward().to(dtype=dtype) + convert_to_float8_training(new_fp8_mlp) + + # Load the actual data + new_fp8_mlp.load_state_dict( + torch.load(buffer, weights_only=True), strict=True, assign=True + ) + + quant_config = QuantConfig(ActivationCasting.DYNAMIC) + quantize_to_float8(new_fp8_mlp, quant_config) + + fp8_mod_count = 0 + for module in new_fp8_mlp.modules(): + if isinstance(module, Float8InferenceLinear): + assert isinstance(module.weight, Float8Tensor) + assert module.weight.requires_grad is False + fp8_mod_count += 1 + assert fp8_mod_count == 3, "Expected 3 FP8 modules, got {}".format( + fp8_mod_count + ) + + new_out = new_fp8_mlp(input_tensor) + + # Assert exact equality + assert torch.all(og_out == new_out).item() + + +if __name__ == "__main__": + 
pytest.main([__file__]) diff --git a/test/float8/test_numerics_integration.py b/test/float8/test_numerics_integration.py new file mode 100644 index 000000000..fd724b340 --- /dev/null +++ b/test/float8/test_numerics_integration.py @@ -0,0 +1,174 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +# Tests LLaMa FeedForward numerics with float8 + +import copy +from typing import Optional + +import pytest + +from torchao.utils import TORCH_VERSION_AFTER_2_4 + +if not TORCH_VERSION_AFTER_2_4: + pytest.skip("Unsupported PyTorch version", allow_module_level=True) + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchao.float8.config import CastConfig, Float8LinearConfig, ScalingType +from torchao.float8.float8_linear_utils import ( + convert_to_float8_training, + linear_requires_sync, + sync_float8_amax_and_scale_history, +) +from torchao.float8.float8_utils import compute_error, IS_ROCM + +is_H100 = torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0) + + +torch.manual_seed(0) + + +# copied from https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama/model.py +class FeedForward(nn.Module): + """ + FeedForward module + + Args: + dim (int): Input dimension. + hidden_dim (int): Hidden dimension of the feedforward layer. + multiple_of (int): Value to ensure hidden dimension is a multiple of this value. + ffn_dim_multiplier (Optional[float]): Custom multiplier for hidden dimension. Defaults to None. + + Attributes: + w1 (Linear): Linear transformation for the first layer. + w2 (Linear): Linear transformation for the second layer. + w3 (Linear): Linear transformation for the third layer. 
+ + """ + + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ffn_dim_multiplier: Optional[float], + ): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=False) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + def init_weights(self, init_std: float): + nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02) + for linear in (self.w2, self.w3): + nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std) + + +class TestFloat8NumericsIntegrationTest: + @pytest.mark.parametrize( + "scaling_type_input", [ScalingType.DELAYED, ScalingType.DYNAMIC] + ) + @pytest.mark.parametrize( + "scaling_type_weight", [ScalingType.DELAYED, ScalingType.DYNAMIC] + ) + @pytest.mark.parametrize( + "scaling_type_grad_output", + [ScalingType.DELAYED, ScalingType.DYNAMIC], + ) + @pytest.mark.skipif(not is_H100, reason="requires H100 GPU") + @pytest.mark.skipif(IS_ROCM, reason="test doesn't currently work on the ROCm stack") + def test_encoder_fw_bw( + self, + scaling_type_input: ScalingType, + scaling_type_weight: ScalingType, + scaling_type_grad_output: ScalingType, + ): + # TODO(later): maybe add float16 back if it becomes important + data_dtype = torch.bfloat16 + + # LLaMa 3 70B shapes + model_ref = ( + FeedForward( + dim=4096, + hidden_dim=16384, + multiple_of=1024, + ffn_dim_multiplier=1.3, + ) + .cuda() + .to(data_dtype) + ) + + # for now just test the encoder to simplify things + model_fp8 = copy.deepcopy(model_ref) + config = Float8LinearConfig( + cast_config_input=CastConfig(scaling_type=scaling_type_input), + cast_config_weight=CastConfig(scaling_type=scaling_type_weight), + cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output), + ) + convert_to_float8_training( + model_fp8, + config=config, + ) + + lr = 0.01 + optim_ref = torch.optim.SGD(model_ref.parameters(), lr=lr) + optim_fp8 = torch.optim.SGD(model_fp8.parameters(), lr=lr) + + # Note: you need two different inputs to properly test numerics + # of delayed scaling, because the first time around the initialization + # logic of delayed scaling behaves as dynamic scaling + # TODO(future): also make unit tests do this properly + shape = (1, 8192, 4096) + data1 = torch.randn(*shape, device="cuda", dtype=data_dtype) + data2 = torch.randn(*shape, device="cuda", dtype=data_dtype) + + model_ref(data1).sum().backward() + # zero out grads without stepping, since we just want to compare grads + # of the second datum + optim_ref.zero_grad() + model_ref_out = model_ref(data2) + model_ref_out.sum().backward() + + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(model_fp8) + model_fp8(data1).sum().backward() + # zero out grads without stepping, since we just want to compare grads + # of the second datum + optim_fp8.zero_grad() + if linear_requires_sync(config): + sync_float8_amax_and_scale_history(model_fp8) + model_fp8_out = model_fp8(data2) + model_fp8_out.sum().backward() + + out_sqnr = compute_error(model_ref_out, model_fp8_out) + assert out_sqnr > 20.0 + + ref_name_to_grad = { + name: param.grad for name, param in model_ref.named_parameters() + } + + grad_sqnr_threshold = 20.0 + + for name, param in 
model_fp8.named_parameters():
+            ref_grad = ref_name_to_grad[name]
+            cur_grad = param.grad
+            sqnr = compute_error(ref_grad, cur_grad)
+            assert sqnr > grad_sqnr_threshold
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/torchao/float8/README.md b/torchao/float8/README.md
new file mode 100644
index 000000000..abab3b9fa
--- /dev/null
+++ b/torchao/float8/README.md
@@ -0,0 +1,159 @@
+# torchao.float8
+
+This is an early version of a library for accelerating training with float8 in native PyTorch
+according to the recipes laid out in https://arxiv.org/pdf/2209.05433.pdf.
+The codebase strives to stay small, easily hackable, debuggable with native PyTorch tooling,
+and composable with key systems such as autograd, ```torch.compile``` and distributed.
+With ``torch.compile`` on, initial results show
+throughput speedups of up to 1.2x on small scale (8 GPUs) LLaMa pretraining jobs.
+
+:warning: See the [feature tracker](https://github.com/pytorch-labs/torchao.float8/issues/187) for upcoming features.
+
+:warning: Backwards compatibility is not guaranteed at this point. The codebase is in active development and
+will change rapidly.
+
+# Single GPU User API
+
+We provide two per-tensor scaling strategies: dynamic and delayed. See https://arxiv.org/pdf/2209.05433.pdf, Section 4.3 for more details. These strategies are configurable separately for activations (`input`), weights (`weight`) and gradients (`grad_output`).
+
+## float8 linear with dynamic scaling for `input`, `weight` and `grad_output`
+
+This is the most accurate recipe as every tensor is scaled dynamically.
+
+```python
+from torchao.float8 import (
+    convert_to_float8_training,
+    precompute_float8_dynamic_scale_for_fsdp,
+)
+
+# create model
+m = Model(...)
+
+# optional: filter modules from being eligible for float8 conversion
+def module_filter_fn(mod: torch.nn.Module, fqn: str):
+    # don't convert the output module
+    if fqn == "output":
+        return False
+    # don't convert linear modules with weight dimensions not divisible by 16
+    if isinstance(mod, torch.nn.Linear):
+        if mod.in_features % 16 != 0 or mod.out_features % 16 != 0:
+            return False
+    return True
+
+# convert all `torch.nn.Linear` modules to `Float8Linear`
+convert_to_float8_training(m, module_filter_fn=module_filter_fn)
+
+# optional: use FSDP
+model = FSDP(model, use_orig_params=True)
+
+# optional: enable torch.compile for improved performance
+m = torch.compile(m)
+
+# toy training loop
+for _ in range(N_ITER):
+    optimizer.zero_grad()
+    y = m(x)
+    y.sum().backward()
+    optimizer.step()
+
+    # specific to fsdp2 + dynamic scaling, when fp8 all-gather is turned on
+    # this method is optional but is highly recommended for performance
+    # it calculates scales for all parameters in a single all-reduce
+    precompute_float8_dynamic_scale_for_fsdp(model)
+
+```
+
+## float8 linear with delayed scaling
+
+This is theoretically the most performant recipe as it minimizes memory reads.
+
+```python
+from torchao.float8 import (
+    convert_to_float8_training,
+    sync_float8_amax_and_scale_history,
+    ScalingType,
+)
+
+# create model
+m = Model(...)
+
+# optional: configure for compatibility with FSDP. Note that workarounds
+# gated with config.enable_amax_init and
+# config.enable_pre_and_post_forward are needed for
+# autocast + compile + FSDP + float8 to work
+from torchao.float8 import Float8LinearConfig, ScalingType, CastConfig
+config = Float8LinearConfig(
+    enable_amax_init=False,  # only needed for autocast + compile + FSDP + float8 delayed
+    enable_pre_and_post_forward=False,  # only needed for autocast + compile + FSDP + float8 delayed
+    cast_config_input=CastConfig(scaling_type=ScalingType.DELAYED),
+    cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED),
+    cast_config_grad_output=CastConfig(scaling_type=ScalingType.DELAYED),
+)
+
+# convert all `torch.nn.Linear` modules to `Float8Linear`, specifying scaling
+# type
+convert_to_float8_training(
+    m,
+    config=config,
+)
+
+# optional: use FSDP
+model = FSDP(model, use_orig_params=True)
+
+# optional: enable torch.compile for improved performance
+m = torch.compile(m)
+
+# toy training loop
+for _ in range(N_ITER):
+    optimizer.zero_grad()
+    y = m(x)
+    y.sum().backward()
+
+    # specific to float8 with delayed scaling: separate step to sync scales/amaxes
+    # in the future, this may move to a context manager
+    sync_float8_amax_and_scale_history(model)
+
+    optimizer.step()
+```
+
+# Multi GPU User API
+
+We compose with the `DTensor` based [distributed APIs](https://pytorch.org/docs/stable/distributed.tensor.parallel.html),
+such as FSDP, TP and SP. Please see the [torchtitan](https://github.com/pytorch/torchtitan) repository for e2e examples
+on using `torchao.float8` in a distributed setting.
+
+# Testing
+
+```bash
+# run single-GPU unit tests
+pytest test/float8/test_base.py
+
+# run single-GPU compile tests
+pytest test/float8/test_compile.py
+
+# run single-GPU numerics integration tests
+pytest test/float8/test_numerics_integration.py
+
+# run a two-GPU integration test on FSDP
+./test/float8/test_fsdp.sh
+
+# run integration tests on the DTensor TP/SP integration
+./test/float8/test_dtensor.sh
+
+# run integration tests on the FSDP2 integration
+python test/float8/test_fsdp2/test_fsdp2.py
+
+# run all of these tests
+./test/float8/test_everything.sh
+```
+
+# Benchmarking
+
+```bash
+# benchmark the torch._scaled_mm function on LLaMa 2 70B shapes
+./benchmarks/float8/bench_matmul.py
+
+# benchmark fw/bw of `Linear` and `Float8Linear` on LLaMa 2 70B shapes
+# make sure to turn on torch.compile to get the best performance
+./benchmarks/float8/bench_linear_float8.py -o ../tmp/test.txt --compile
+```
diff --git a/torchao/float8/__init__.py b/torchao/float8/__init__.py
new file mode 100644
index 000000000..56c7b28f7
--- /dev/null
+++ b/torchao/float8/__init__.py
@@ -0,0 +1,46 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
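+# A minimal usage sketch of the public API re-exported below (assumes a
+# `torch.nn.Module` named `m` containing `torch.nn.Linear` children; see
+# torchao/float8/README.md for complete recipes):
+#
+#   from torchao.float8 import convert_to_float8_training
+#   convert_to_float8_training(m)  # swaps nn.Linear -> Float8Linear
+#   m = torch.compile(m)           # optional, recommended for performance
+#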
+# Lets define a few top level things here +from torchao.float8.config import ( + CastConfig, + DelayedScalingConfig, + Float8GemmConfig, + Float8LinearConfig, + ScalingType, +) +from torchao.float8.float8_linear import Float8Linear +from torchao.float8.float8_linear_utils import ( + convert_to_float8_training, + linear_requires_sync, + sync_float8_amax_and_scale_history, +) +from torchao.float8.float8_tensor import ( + Float8Tensor, + GemmInputRole, + LinearMMConfig, + ScaledMMConfig, +) +from torchao.float8.fsdp_utils import precompute_float8_dynamic_scale_for_fsdp + +# Needed to load Float8Tensor with weights_only = True +from torch.serialization import add_safe_globals + +add_safe_globals([Float8Tensor, ScaledMMConfig, GemmInputRole, LinearMMConfig]) + +__all__ = [ + # configuration + "DelayedScalingConfig", + "ScalingType", + "Float8GemmConfig", + "Float8LinearConfig", + "CastConfig", + # top level UX + "convert_to_float8_training", + "linear_requires_sync", + "sync_float8_amax_and_scale_history", + "precompute_float8_dynamic_scale_for_fsdp", + # note: Float8Tensor and Float8Linear are not public APIs +] diff --git a/torchao/float8/config.py b/torchao/float8/config.py new file mode 100644 index 000000000..5d1bf9f54 --- /dev/null +++ b/torchao/float8/config.py @@ -0,0 +1,129 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +import enum +from dataclasses import dataclass + + +# TODO(future): consider renaming to ScalingType +class ScalingType(enum.Enum): + DELAYED = "delayed" + DYNAMIC = "dynamic" + + def short_str(self): + if self is ScalingType.DELAYED: + return "del" + else: + assert self is ScalingType.DYNAMIC + return "dyn" + + +@dataclass(frozen=True) +class CastConfig: + """ + Configuration for casting a single tensor to float8 + """ + + scaling_type: ScalingType = ScalingType.DYNAMIC + + +@dataclass(frozen=True) +class DelayedScalingConfig: + """ + Configuration for delayed scaling. + + Note: for now, `history_len` values must be the same for all layers in the + model using delayed scaling. + + TODO(future): serialization for recipes + """ + + # Controls the history length of amax buffers + history_len: int = 16 + + # Controls the way to calculate current scale from amax history + # TODO(future): add other functions as needed, hardcoded or user defined + scale_fn_name: str = "max" + + def __post_init__(self): + assert ( + self.scale_fn_name == "max" + ), f"{self.scale_fn_name} is not implemented yet. Only max is supported for now." + + +@dataclass(frozen=True) +class Float8GemmConfig: + """ + Configuration for a float8 gemm. + """ + + # If True, fast accumulation in lower precision is used. + # Note: this flag is currently a no-op if emulation is turned on. + use_fast_accum: bool = False + + +@dataclass(frozen=True) +class Float8LinearConfig: + """ + Configuration for converting a `torch.nn.Linear` module to float8 + for training. 
+ """ + + # + # Per-tensor configuration for `input`, `weight`, `grad_output` + # + cast_config_input: CastConfig = CastConfig() + cast_config_weight: CastConfig = CastConfig() + cast_config_grad_output: CastConfig = CastConfig() + + # + # Per-gemm configuration for gemms calculating `output`, `grad_input` and + # `grad_weight` + # + gemm_config_output: Float8GemmConfig = Float8GemmConfig(use_fast_accum=True) + gemm_config_grad_input: Float8GemmConfig = Float8GemmConfig() + gemm_config_grad_weight: Float8GemmConfig = Float8GemmConfig() + + # + # Per-linear configuration + # + + # If True, on the first iteration of Float8Linear the amaxes will be + # initialized with the incoming data. As of 2023-12-30, this doesn't work + # with autocast + torch.compile + FSDP. Enabling this option is nice for + # testing, but this is not necessary for real training jobs. + enable_amax_init: bool = True + + # If True, pre-forward and post-forward functions are run. As of 2023-12-30, + # this doesn't work with autocast + torch.compile + FSDP. Enabling this + # option is useful for safety, but not strictly necessary. + enable_pre_and_post_forward: bool = True + + # If True, then uses a tensor subclass for the float8 linear module's weight that + # implements pre/post-all-gather methods to do float8 all-gather with FSDP2. + enable_fsdp_float8_all_gather: bool = False + + # If True, then prior to performing the fp8 scaled mamtmul we will pad the + # inner dimension of a (dim 1) and b (dim 2) with 0s. This is needed for matmuls + # _scaled_mm since it has the strong constraint that for M,N,K N, K must be a multiple of 16. + # This can cause a memory spike however so we keep this off by default. + pad_inner_dim: bool = False + + # If True, emulation is used instead of hardware accelerated gemm + emulate: bool = False + + # Configuration for delayed scaling + # Note: this is actually applied per-tensor, but only using the same + # configuration for all tensors and layers in the model is currently + # supported. If in the future we add support for a more fine grained + # configuration, this field may move to per-tensor configs. + delayed_scaling_config: DelayedScalingConfig = DelayedScalingConfig() + + +# If True, use 'fnuz' float8 types for calculations. +# Currently, ROCm only supports fnuz variants. +# TODO(future PR): move this to Float8LinearConfig +use_fnuz_dtype = False diff --git a/torchao/float8/distributed_utils.py b/torchao/float8/distributed_utils.py new file mode 100644 index 000000000..ef174b073 --- /dev/null +++ b/torchao/float8/distributed_utils.py @@ -0,0 +1,113 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +from typing import Any + +import torch + +from fairscale.nn.model_parallel.initialize import get_model_parallel_group + +# from float8_tensor import Float8Tensor +from torchao.float8.float8_tensor import Float8Tensor + +# additional differentiable distributed primitives for SP which are not in +# the Fairscale codebase + + +def _gather_along_first_dim(input_: torch.Tensor): + # same as https://github.com/facebookresearch/fairscale/blob/main/fairscale/nn/model_parallel/mappings.py#L67, + # but gather along first dim instead of last dim + group = get_model_parallel_group() + + # Bypass the function if we are using only 1 GPU. + if torch.distributed.get_world_size(group=group) == 1: + return input_ + + # Size and dimension. 
+ first_dim = 0 + rank = torch.distributed.get_rank(group=group) + world_size = torch.distributed.get_world_size(group=group) + + # If the input is a float8 tensor, we need to do the transformation on the + # inner tensor and then return a new wrapper. + def _transform(t): + # tensors must be contiguous for all_gather to work + input_contig = t.contiguous() + + tensor_list = [torch.empty_like(input_contig) for _ in range(world_size)] + tensor_list[rank] = input_contig + torch.distributed.all_gather(tensor_list, input_contig, group=group) + + # Note: torch.cat already creates a contiguous tensor. + output = torch.cat(tensor_list, dim=first_dim).contiguous() + return output + + if isinstance(input_, Float8Tensor): + new_data = input_._data + new_data = new_data.view(torch.int8) + new_data = _transform(new_data) + new_data = new_data.view(input_._data.dtype) + output = Float8Tensor(new_data, input_._scale, input_._orig_dtype) + else: + output = _transform(input_) + + return output + + +def _reduce_scatter(ctx: Any, input_: torch.Tensor): + group = get_model_parallel_group() + world_size = torch.distributed.get_world_size(group) + + assert input_.shape[0] % world_size == 0 + output_shape = (input_.shape[0] // world_size, *input_.shape[1:]) + output = torch.empty(*output_shape, device=input_.device, dtype=input_.dtype) + + torch.distributed.reduce_scatter_tensor(output, input_, group=group) + return output + + +def _split_along_first_dim(input_: torch.Tensor): + # this is needed for testing + + # like fairscale.nn.model_parallel.mappings._split, but + # along the first dim instead of last dim + + group = get_model_parallel_group() + local_rank = torch.distributed.get_rank(group) + world_size = torch.distributed.get_world_size(group) + + assert input_.shape[0] % world_size == 0 + input_list = torch.split(input_, input_.shape[0] // world_size) + return input_list[local_rank] + + +class _AllGatherFloat8FwReduceScatterBw(torch.autograd.Function): + @staticmethod + def forward(ctx, input_): + return _gather_along_first_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _reduce_scatter(ctx, grad_output) + + +class _ReduceScatterFwAllGatherFloat8Bw(torch.autograd.Function): + @staticmethod + def forward(ctx, input_): + return _reduce_scatter(ctx, input_) + + @staticmethod + def backward(ctx, grad_output): + return _gather_along_first_dim(grad_output) + + +class _AllGatherFwSplitBw(torch.autograd.Function): + @staticmethod + def forward(ctx, input_): + return _gather_along_first_dim(input_) + + @staticmethod + def backward(ctx, grad_output): + return _split_along_first_dim(grad_output) diff --git a/torchao/float8/float8_aten_api.py b/torchao/float8/float8_aten_api.py new file mode 100644 index 000000000..41d5083d6 --- /dev/null +++ b/torchao/float8/float8_aten_api.py @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +""" +This file defines the aten functions for float8. Today, all of these functions +are emulated. In the future, they should be calling NVIDIA's float8 kernels. 
+""" + +import torch + +from torch.library import Library + + +def mm_float8_emulated( + m1, # input 1 data + s1, # input 1 scale + m2, # input 2 data + s2, # input 2 scale + dtype3, # output dtype +): + # naive implementation: dq -> op -> q + m1_fp32 = m1.float() / s1 + m2_fp32 = m2.float() / s2 + m3_fp32 = torch.mm(m1_fp32, m2_fp32) + + return m3_fp32.to(dtype3) + + +# +# ATen op placeholders +# + +# Register the aten level functions we need. +# These are mostly placeholder and might need to be implemented in c++ as needed +lib = Library("aten", "FRAGMENT") + +lib.define( + "mm_float8_emulated(Tensor m1, Tensor s1, Tensor m2, Tensor s2, ScalarType dtype3) -> Tensor" +) +lib.impl("mm_float8_emulated", mm_float8_emulated, "CPU") +lib.impl("mm_float8_emulated", mm_float8_emulated, "CUDA") + + +@torch.library.impl(lib, "mm_float8_emulated", "Meta") +def _mm_float8_emulated_meta(m1, s1, m2, s2, dtype3): + out = torch.mm(m1.float(), m2.float()).to(dtype3) + return out diff --git a/torchao/float8/float8_linear.py b/torchao/float8/float8_linear.py new file mode 100644 index 000000000..dd85af921 --- /dev/null +++ b/torchao/float8/float8_linear.py @@ -0,0 +1,438 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +""" +A simple module swap UX for a float8 version of `torch.nn.Linear`. +""" + +import dataclasses +import enum + +from typing import Optional + +import torch + +from torchao.float8.config import Float8LinearConfig, ScalingType + +from torchao.float8.float8_scaling_utils import ( + _maybe_initialize_amaxes_scales_for_float8_cast, + hp_tensor_to_float8_delayed, + hp_tensor_to_float8_dynamic, + NoopFwToFloat8E5M2BwDelayed, + NoopFwToFloat8E5M2BwDynamic, +) + +from torchao.float8.float8_tensor import ( + Float8Tensor, + GemmInputRole, + LinearMMConfig, + ScaledMMConfig, +) + +from torchao.float8.float8_utils import e4m3_dtype, e5m2_dtype, tensor_to_amax + +from torchao.float8.fsdp_utils import ( + WeightWithDelayedFloat8CastTensor, + WeightWithDynamicFloat8CastTensor, +) + + +# this code was resurrected from https://github.com/pytorch-labs/torchao.float8/pull/128/files +@torch._dynamo.allow_in_graph +class manual_float8_matmul(torch.autograd.Function): + """ + Like torch.matmul, but with the arguments in float8 + """ + + @staticmethod + def forward( + ctx, + input_fp8, + weight_fp8_t, + ): + ctx.save_for_backward(input_fp8, weight_fp8_t) + # the reshapes are needed in order to make the shapes compatible with + # torch.mm + orig_shape = input_fp8.shape + input_fp8_reshaped = input_fp8.reshape(-1, orig_shape[-1]) + res_bits = torch.mm(input_fp8_reshaped, weight_fp8_t) + res_bits = res_bits.reshape(*orig_shape[:-1], res_bits.shape[-1]) + return res_bits + + @staticmethod + def backward(ctx, grad_output_fp8): + input_fp8, weight_fp8_t = ctx.saved_tensors + + # the reshapes are needed in order to make the shapes compatible with + # torch.mm + grad_output_fp8_orig_shape = grad_output_fp8.shape + grad_output_fp8_reshaped = grad_output_fp8.reshape( + -1, grad_output_fp8_orig_shape[-1] + ) + + # calculate grad_input + grad_input = torch.mm( + grad_output_fp8_reshaped, + weight_fp8_t.t(), + ) + grad_input = grad_input.reshape( + *grad_output_fp8_orig_shape[:-1], grad_input.shape[-1] + ) + + input_fp8_orig_shape = input_fp8.shape + input_fp8_reshaped = input_fp8.reshape(-1, input_fp8_orig_shape[-1]) + + # calculate grad_weight + # Note: 
the variant below is slightly faster on LLaMa 3 8B pretraining + # compared to than calculating `grad_weight_t = input_fp8_t @ grad_output_fp8_reshaped` + grad_weight = torch.mm( + grad_output_fp8_reshaped.t(), + input_fp8_reshaped, + ) + + return grad_input, grad_weight.t() + + +class Float8Linear(torch.nn.Linear): + """ + Note: this is **not** a public API and is only intended to be used + inside of this repository. Please file an issue if you would benefit + from this being a public API. + + A wrapper around a `torch.nn.Linear` module which does fp8 compute, and tracks + scales in way friendly to delayed scaling. + """ + + def __init__(self, *args, **kwargs): + """ + Additional arguments on top of `torch.nn.Linear`'s arguments: + * `config`: Float8LinearConfig + """ + + # Amax scales should always be kept as float32. + self.always_float32_buffers = set() + config = kwargs.pop("config") + emulate = config.emulate + super().__init__(*args, **kwargs) + + # Defines the scaling behavior of input, weight, grad_output + self.scaling_type_input = config.cast_config_input.scaling_type + self.scaling_type_weight = config.cast_config_weight.scaling_type + self.scaling_type_grad_output = config.cast_config_grad_output.scaling_type + # Convenience flag to skip code related to delayed scaling + self.has_any_delayed_scaling = ( + self.scaling_type_input is ScalingType.DELAYED + or self.scaling_type_weight is ScalingType.DELAYED + or self.scaling_type_grad_output is ScalingType.DELAYED + ) + + self.config = config + + self.create_buffers() + + self.linear_mm_config = LinearMMConfig( + # output + ScaledMMConfig( + emulate, + self.config.gemm_config_output.use_fast_accum, + False, + self.config.pad_inner_dim, + ), + # grad_input + ScaledMMConfig( + emulate, + self.config.gemm_config_grad_input.use_fast_accum, + False, + self.config.pad_inner_dim, + ), + # grad_weight + ScaledMMConfig( + emulate, + self.config.gemm_config_grad_weight.use_fast_accum, + False, + self.config.pad_inner_dim, + ), + ) + + # Note: is_amax_initialized is not a buffer to avoid data dependent + # control flow visible to dynamo + # TODO(future PR): add serialization for this flag + self.is_amax_initialized = not self.config.enable_amax_init + + # Syncing of amaxes and scales happens outside of this function. This + # flag is here to enforce that the user does not forget to do this. + self.amax_and_scale_synced = not self.config.enable_amax_init + + # This is needed to properly handle autocast in the amax/scale + # update function for torch.float16 + self.last_seen_input_dtype = None + + # pre_forward and post_forward are currently broken with FSDP + # and torch.compile, this option can disable them + # Note that when using `self.config.enable_pre_and_post_forward = False`, + # it's recommended to also set `self.config.enable_amax_init = False`. + # Otherwise, the amax buffer would never be marked as initialized and + # would be initialized in every iteration. 
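+        # For example (hypothetical user-side config, shown for clarity only),
+        # the two flags are typically disabled together:
+        #   config = Float8LinearConfig(
+        #       enable_amax_init=False,
+        #       enable_pre_and_post_forward=False,
+        #   )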
+ self.enable_pre_and_post_forward = self.config.enable_pre_and_post_forward + + def create_buffers(self): + # Default values for history buffers, see above TODO + history_len = self.config.delayed_scaling_config.history_len + device = self.weight.device + # TODO(future PR): dtype values below don't have the other float8 + # flavors, fix it + default_input = torch.finfo(torch.float8_e4m3fn).max + default_weight = torch.finfo(torch.float8_e4m3fn).max + default_grad_output = torch.finfo(torch.float8_e5m2).max + + # Note: for now, create all the buffers if any are needed, to postpone + # the work to make the scale and amax syncing and history calculation + # handle a heterogeneous setup. We can do that work later if benchmarks + # show it is worth doing. + if self.has_any_delayed_scaling: + self.register_always_float32_buffer( + "fp8_amax_input", torch.tensor([default_input], device=device) + ) + self.register_always_float32_buffer( + "fp8_amax_history_input", torch.zeros(history_len, device=device) + ) + self.register_always_float32_buffer( + "fp8_scale_input", torch.tensor([1.0], device=device) + ) + self.register_always_float32_buffer( + "fp8_amax_weight", torch.tensor([default_weight], device=device) + ) + self.register_always_float32_buffer( + "fp8_amax_history_weight", torch.zeros(history_len, device=device) + ) + self.register_always_float32_buffer( + "fp8_scale_weight", torch.tensor([1.0], device=device) + ) + self.register_always_float32_buffer( + "fp8_amax_grad_output", + torch.tensor([default_grad_output], device=device), + ) + self.register_always_float32_buffer( + "fp8_amax_history_grad_output", torch.zeros(history_len, device=device) + ) + self.register_always_float32_buffer( + "fp8_scale_grad_output", torch.tensor([1.0], device=device) + ) + + def register_always_float32_buffer( + self, name: str, tensor: Optional[torch.Tensor], persistent: bool = True + ) -> None: + self.register_buffer(name=name, tensor=tensor, persistent=persistent) + self.always_float32_buffers.add(name) + + def _apply(self, fn, recurse=True): + ret = super()._apply(fn, recurse) + self.convert_amax_buffer_to_float32() + return ret + + def convert_amax_buffer_to_float32(self): + for key in self.always_float32_buffers: + if self._buffers[key] is not None: + self._buffers[key] = self._buffers[key].to(torch.float32) + + def cast_input_to_float8( + self, input: torch.Tensor, is_amax_initialized: bool + ) -> torch.Tensor: + # Duplicate the autocast logic for F.linear, so that the output + # of our module has the right original precision + if torch.is_autocast_enabled(): + # For now, hardcode to GPU's autocast dtype + # if we need CPU support in the future, we can add it + autocast_dtype = torch.get_autocast_gpu_dtype() + input = input.to(autocast_dtype) + + if self.scaling_type_input is ScalingType.DELAYED: + scale_fn_name = self.config.delayed_scaling_config.scale_fn_name + _maybe_initialize_amaxes_scales_for_float8_cast( + input, + self.fp8_amax_input, + self.fp8_amax_history_input, + self.fp8_scale_input, + scale_fn_name, + e4m3_dtype, + is_amax_initialized, + reduce_amax=True, + ) + input_fp8 = hp_tensor_to_float8_delayed( + input, + self.fp8_scale_input, + e4m3_dtype, + self.fp8_amax_input, + linear_mm_config=self.linear_mm_config, + gemm_input_role=GemmInputRole.INPUT, + ) + else: + assert self.scaling_type_input is ScalingType.DYNAMIC + input_fp8 = hp_tensor_to_float8_dynamic( + input, e4m3_dtype, self.linear_mm_config + ) + return input_fp8 + + def cast_weight_to_float8( + self, weight: torch.Tensor, 
is_amax_initialized: bool + ) -> torch.Tensor: + if self.scaling_type_weight is ScalingType.DELAYED: + if isinstance(self.weight, Float8Tensor): # cast by FSDP + weight_fp8 = self.weight + else: + scale_fn_name = self.config.delayed_scaling_config.scale_fn_name + _maybe_initialize_amaxes_scales_for_float8_cast( + weight, + self.fp8_amax_weight, + self.fp8_amax_history_weight, + self.fp8_scale_weight, + scale_fn_name, + e4m3_dtype, + is_amax_initialized, + reduce_amax=False, + ) + + weight_fp8 = hp_tensor_to_float8_delayed( + weight, + self.fp8_scale_weight, + e4m3_dtype, + self.fp8_amax_weight, + linear_mm_config=self.linear_mm_config, + gemm_input_role=GemmInputRole.WEIGHT, + ) + else: + assert self.scaling_type_weight is ScalingType.DYNAMIC + if isinstance(self.weight, Float8Tensor): # cast by FSDP + weight_fp8 = self.weight + else: + weight_fp8 = hp_tensor_to_float8_dynamic( + self.weight, + e4m3_dtype, + self.linear_mm_config, + gemm_input_role=GemmInputRole.WEIGHT, + ) + return weight_fp8 + + def cast_output_to_float8_in_bw(self, output: torch.Tensor) -> torch.Tensor: + if self.scaling_type_grad_output is ScalingType.DELAYED: + scale_fn_name = self.config.delayed_scaling_config.scale_fn_name + output = NoopFwToFloat8E5M2BwDelayed.apply( + output, + self.fp8_amax_grad_output, + self.fp8_amax_history_grad_output, + self.fp8_scale_grad_output, + scale_fn_name, + self.is_amax_initialized, + self.linear_mm_config, + ) + else: + assert self.scaling_type_grad_output is ScalingType.DYNAMIC + output = NoopFwToFloat8E5M2BwDynamic.apply(output, self.linear_mm_config) + return output + + def float8_pre_forward(self, input): + if not self.enable_pre_and_post_forward: + return + if ( + self.is_amax_initialized + and (not self.amax_and_scale_synced) + and torch.is_grad_enabled() + ): + raise AssertionError( + "amaxes and scales not synced, please call `sync_float8_amax_and_scale_history` before forward" + ) + self.last_seen_input_dtype = input.dtype + + def float8_post_forward(self): + if not self.enable_pre_and_post_forward: + return + # Ensure that calling forward again will fail until the user syncs + # amaxes and scales + self.is_amax_initialized = True + self.amax_and_scale_synced = False + + def forward(self, input: torch.Tensor) -> torch.Tensor: + if self.has_any_delayed_scaling: + self.float8_pre_forward(input) + + input_fp8 = self.cast_input_to_float8(input, self.is_amax_initialized) + weight_fp8 = self.cast_weight_to_float8(self.weight, self.is_amax_initialized) + + output = manual_float8_matmul.apply(input_fp8, weight_fp8.t()) + + # Cast grad_output to float8_e5m2 during backward + output = self.cast_output_to_float8_in_bw(output) + + if self.bias is not None: + output = output + self.bias.to(output.dtype) + + if self.has_any_delayed_scaling: + self.float8_post_forward() + return output + + def scaling_repr(self): + # add scaling settings without using too many characters + # example: "i:del,w:del,go:dyn" + return f"i:{self.scaling_type_input.short_str()},w:{self.scaling_type_weight.short_str()},go:{self.scaling_type_grad_output.short_str()}" + + def extra_repr(self): + s = f'{super().extra_repr()}, scaling="{self.scaling_repr()}"' + return s + + @classmethod + def from_float( + cls, + mod, + config: Optional[Float8LinearConfig] = None, + ): + """ + Create an nn.Linear with fp8 compute from a regular nn.Linear + + Args: + mod (torch.nn.Linear): nn.Linear to convert + config (Optional[Float8LinearConfig]): configuration for conversion to float8 + """ + if config is None: + config = 
Float8LinearConfig() + with torch.device("meta"): + new_mod = cls( + mod.in_features, + mod.out_features, + bias=False, + config=config, + ) + new_mod.weight = mod.weight + new_mod.bias = mod.bias + # need to create buffers again when moving from meta device to + # real device + new_mod.create_buffers() + + # If FSDP float8 all-gather is on, wrap the weight in a float8-aware + # tensor subclass. This must happen last because: + # 1. weight needs to be on the correct device to create the buffers + # 2. buffers need to be already created for the delayed scaling version + # of the weight wrapper to be initialized + if config.enable_fsdp_float8_all_gather: + if config.cast_config_weight.scaling_type is ScalingType.DYNAMIC: + new_mod.weight = torch.nn.Parameter( + WeightWithDynamicFloat8CastTensor( + new_mod.weight, + new_mod.linear_mm_config, + ) + ) + else: + assert config.cast_config_weight.scaling_type is ScalingType.DELAYED + new_mod.weight = torch.nn.Parameter( + WeightWithDelayedFloat8CastTensor( + new_mod.weight, + new_mod.fp8_amax_weight, + new_mod.fp8_amax_history_weight, + new_mod.fp8_scale_weight, + new_mod.linear_mm_config, + new_mod.is_amax_initialized, + ) + ) + + return new_mod diff --git a/torchao/float8/float8_linear_utils.py b/torchao/float8/float8_linear_utils.py new file mode 100644 index 000000000..675ed5ee6 --- /dev/null +++ b/torchao/float8/float8_linear_utils.py @@ -0,0 +1,327 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +import logging +from typing import Callable, List, Optional + +import torch +import torch.distributed as dist +import torch.nn as nn +from torchao.float8.config import Float8LinearConfig, ScalingType +from torchao.float8.float8_linear import Float8Linear + +from torchao.float8.float8_utils import ( + amax_history_to_scale_stack, + e4m3_dtype, + e5m2_dtype, +) +from torch.distributed._functional_collectives import all_reduce, AsyncCollectiveTensor + +log = logging.getLogger(__name__) +log.addHandler(logging.NullHandler()) + + +def linear_requires_sync(config: Float8LinearConfig): + """Returns whether the given linear_type requires sync before forward.""" + return any( + [ + config.cast_config_input.scaling_type is ScalingType.DELAYED, + config.cast_config_weight.scaling_type is ScalingType.DELAYED, + config.cast_config_grad_output.scaling_type is ScalingType.DELAYED, + ] + ) + + +def _update_history_stack( + new_amax: torch.Tensor, amax_history_stack: torch.Tensor +) -> torch.Tensor: + """ + Updates `amax_history` (the last N cur_amax values) inplace with the value + of `new_amax`. + + Args: + new_amax (torch.Tensor): The new amax value to add to the history. (n_amaxes, 1) + amax_history_stack (torch.Tensor): The history of amax values. 
(n_amaxes, history_length) + """ + assert ( + amax_history_stack.dim() == 2 + ), f"Expected amat_history_stack to be 2D, got {amax_history_stack.shape()}" + assert new_amax.size(0) == amax_history_stack.size( + 0 + ), f"Expected new_amax to have the same size as the first dimension of amax_history_stack, got {new_amax.size(0)} and {amax_history_stack.size(0)}" + new_amax_history_stack = torch.roll(amax_history_stack, 1, dims=1) + new_amax_history_stack[:, 0] = new_amax.squeeze(-1) + amax_history_stack.copy_(new_amax_history_stack) + + +def swap_linear_layers( + module: nn.Module, + from_float_func: Callable[[nn.Linear], nn.Linear], + *, + module_filter_fn: Optional[Callable[[nn.Module, str], bool]] = None, +) -> nn.Module: + """ + Generic function to swap linear layers in a module with a new type of linear layer. + + Note: + If applied to a root-level nn.Linear, the module will not be modified in place + and returned instead + + Args: + module: Module to modify. + from_float_func: Function that accepts a linear layer and returns a new type of linear layer. + module_filter_fn: If specified, only the `torch.nn.Linear` subclasses that + that pass the filter function will be swapped. The inputs to the + filter function are the module instance, and the FQN. + + Returns: + nn.Module: The modified module with swapped linear layers. + """ + if isinstance(module, nn.Linear) and ( + module_filter_fn is None or module_filter_fn(module, "") + ): + if len(list(module.children())) > 0: + raise AssertionError( + f"Does not support a root nn.Linear with children: {module}" + ) + return from_float_func( + module, + ) + + root_module = module + + def post_order_traversal( + module: nn.Module, + cur_fqn: Optional[str] = None, + parent_module: Optional[nn.Module] = None, + ): + if cur_fqn is None: + cur_fqn = "" + + for child_module_name, child_module in module.named_children(): + if cur_fqn == "": + new_fqn = child_module_name + else: + new_fqn = f"{cur_fqn}.{child_module_name}" + + post_order_traversal(child_module, new_fqn, module) + + if isinstance(module, nn.Linear) and ( + module_filter_fn is None or module_filter_fn(module, cur_fqn) + ): + assert ( + parent_module is not None + ), f"Linear root module should return early: {module}" + new_linear_module = from_float_func(module) + cur_module_name = cur_fqn.split(".")[-1] + setattr(parent_module, cur_module_name, new_linear_module) + + post_order_traversal(root_module) + return root_module + + +def convert_to_float8_training( + module: nn.Module, + *, + module_filter_fn: Optional[Callable[[nn.Module, str], bool]] = None, + config: Float8LinearConfig = None, +) -> nn.Module: + """ + Swaps `torch.nn.Linear` in `module` with `Float8Linear`. + + Args: + module: Module to modify. + module_filter_fn: If specified, only the `torch.nn.Linear` subclasses that + that pass the filter function will be swapped. The inputs to the + filter function are the module instance and the FQN. + config (Float8LinearConfig): configuration for conversion to float8 + + Returns: + nn.Module: The modified module with swapped linear layers. + """ + if config is None: + config = Float8LinearConfig() + from_float = lambda m: Float8Linear.from_float( + m, + config=config, + ) + return swap_linear_layers( + module, + from_float, + module_filter_fn=module_filter_fn, + ) + + +def get_float8_layers(model: torch.nn.Module): + """Iterates through the model and returns all the Float8Linear layers. + Args: + model (torch.nn.Module): The model to look for Float8Linear layers in. 
+ """ + + # Get all fp8 layers and tensors + fp8_layers = [child for child in model.modules() if isinstance(child, Float8Linear)] + if not torch._dynamo.is_compiling(): + for layer in fp8_layers: + for buf in layer.buffers(): + torch._dynamo.mark_static_address(buf, guard=True) + return fp8_layers + + +@torch.no_grad() +def sync_float8_amax_and_scale_history(model: torch.nn.Module, fp8_layers=None) -> None: + """ + Manages the float8 amax and scale bookkeeping. In detail, it does the + following: + 1. in distributed contexts, syncs amax values across workers for activations and gradients + 2. adds the `amax` values to history + 3. calculates the scales to be used for next iteration + 4. sets the `amax_and_scale_synced` flag on the Float8Linear modules + to signal that they have been synced + + TODO(future): design the UX for this (context manager, etc) + + PERFORMANCE NOTE: + When you can, it is much more efficient to call get_float8_layers once at + the beginning of the training loop and pass the result to this function. + Because of how this interacts with torch.compile + + Args: + model (torch.nn.Module): The model to track amaxes for + fp8_layers (optional): If fp8_layers are provided, fp8_classes are ignored, + and we loop over all fp8_layers to sync and update amax scale histories. + Users can use get_float8_layers to get all fp8 layers. + """ + if fp8_layers is None: + fp8_layers = get_float8_layers(model) + + if len(fp8_layers) == 0: + log.warn( + "Calling sync_float8_amax_and_scale_history on a module with no Float8Linear layers" + ) + return + + def inner_func(): + """Why do we have this inner_function? + + There are two portions of the outer sync_function that cause graph_breaks: + 1. The `get_float8_layers` call can cause graph breaks if the user did not pass + in the fp8_layers. + 2. At the end of syncing all the amaxes and scales we set the attr on the module + signaling that we have synced the amaxes and scales and the next forward can be run. + # TODO Maybe we should remove this safety check to remove the graph break? + + By having this inner function, we can ensure that although the outer function may cause graph breaks + the inner function will not. 
+ """ + # Loop over all fp8 layers and grab the needed tensors + fp8_amax_input_tensor_list = [None] * len(fp8_layers) + fp8_amax_weight_tensor_list = [None] * len(fp8_layers) + fp8_amax_grad_output_tensor_list = [None] * len(fp8_layers) + + fp8_input_amax_history_stack = [None] * len(fp8_layers) + fp8_weight_amax_history_stack = [None] * len(fp8_layers) + fp8_grad_output_amax_history_stack = [None] * len(fp8_layers) + + x_dtypes = set() + scale_fn_recipes = set() + + for idx, child in enumerate(fp8_layers): + fp8_amax_input_tensor_list[idx] = child.fp8_amax_input + fp8_amax_weight_tensor_list[idx] = child.fp8_amax_weight + fp8_amax_grad_output_tensor_list[idx] = child.fp8_amax_grad_output + + fp8_input_amax_history_stack[idx] = child.fp8_amax_history_input + fp8_weight_amax_history_stack[idx] = child.fp8_amax_history_weight + fp8_grad_output_amax_history_stack[idx] = child.fp8_amax_history_grad_output + + x_dtypes.add(child.last_seen_input_dtype) + scale_fn_recipes.add(child.config.delayed_scaling_config.scale_fn_name) + + # TODO This way to get the activation dtype is not ideal + if len(x_dtypes) != 1: + raise ValueError( + f"All layers must have the same last seen input_dtype, got {x_dtypes}" + ) + x_dtype = next(iter(x_dtypes)) + + if len(scale_fn_recipes) != 1: + raise ValueError( + f"All layers must have the same scale_fn recipe, got {scale_fn_recipes}" + ) + scale_fn_recipe = next(iter(scale_fn_recipes)) + + assert ( + len(fp8_amax_input_tensor_list) + == len(fp8_amax_weight_tensor_list) + == len(fp8_amax_grad_output_tensor_list) + ), "Mismatched lengths of amax tensors." + + if dist.is_initialized(): + all_amax_tensors = torch.cat( + fp8_amax_input_tensor_list + + fp8_amax_weight_tensor_list + + fp8_amax_grad_output_tensor_list + ) + all_reduced_amax_tensor = all_reduce( + all_amax_tensors, "MAX", list(range(dist.get_world_size())) + ) + if isinstance(all_reduced_amax_tensor, AsyncCollectiveTensor): + all_reduced_amax_tensor = all_reduced_amax_tensor.wait() + + ( + reduced_fp8_amax_input_tensor, + reduced_fp8_amax_weight_tensor, + reduced_fp8_amax_grad_output_tensor, + ) = torch.split(all_reduced_amax_tensor, len(fp8_amax_input_tensor_list)) + + for idx, child in enumerate(fp8_layers): + child.fp8_amax_input.copy_(reduced_fp8_amax_input_tensor[idx]) + child.fp8_amax_weight.copy_(reduced_fp8_amax_weight_tensor[idx]) + child.fp8_amax_grad_output.copy_( + reduced_fp8_amax_grad_output_tensor[idx] + ) + + # We create two stacked tensor groups, one for the amax history and one for the current scales + fp8_amax_input_tensors = torch.vstack(fp8_amax_input_tensor_list) + fp8_amax_weight_tensors = torch.vstack(fp8_amax_weight_tensor_list) + fp8_amax_grad_output_tensors = torch.vstack(fp8_amax_grad_output_tensor_list) + + fp8_input_amax_history_stack = torch.vstack(fp8_input_amax_history_stack) + fp8_weight_amax_history_stack = torch.vstack(fp8_weight_amax_history_stack) + fp8_grad_output_amax_history_stack = torch.vstack( + fp8_grad_output_amax_history_stack + ) + + # Update the history stacks with the new amax values + _update_history_stack(fp8_amax_input_tensors, fp8_input_amax_history_stack) + _update_history_stack(fp8_amax_weight_tensors, fp8_weight_amax_history_stack) + _update_history_stack( + fp8_amax_grad_output_tensors, fp8_grad_output_amax_history_stack + ) + + # Calculate the new scales from the updated history stacks + new_input_scales = amax_history_to_scale_stack( + fp8_input_amax_history_stack, e4m3_dtype, x_dtype, scale_fn_recipe + ) + new_weight_scales = 
amax_history_to_scale_stack( + fp8_weight_amax_history_stack, e4m3_dtype, x_dtype, scale_fn_recipe + ) + new_grad_output_scales = amax_history_to_scale_stack( + fp8_grad_output_amax_history_stack, e5m2_dtype, x_dtype, scale_fn_recipe + ) + + # Iterate through the layers and update the scales + for idx, child in enumerate(fp8_layers): + child.fp8_scale_input.copy_(new_input_scales[idx]) + child.fp8_scale_weight.copy_(new_weight_scales[idx]) + child.fp8_scale_grad_output.copy_(new_grad_output_scales[idx]) + + # This allows for the compile to succede on the inner func and fail on the graph breaks + # at the beginning and and of syncing + inner_func() + + for child in fp8_layers: + # Set a flag to signal amaxes/scales are ready + child.amax_and_scale_synced = True diff --git a/torchao/float8/float8_ops.py b/torchao/float8/float8_ops.py new file mode 100644 index 000000000..d3c3b405b --- /dev/null +++ b/torchao/float8/float8_ops.py @@ -0,0 +1,363 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +from typing import Any, Dict, Tuple + +import torch + +from torchao.float8.float8_python_api import addmm_float8_unwrapped +from torchao.float8.float8_tensor import choose_scaled_mm_config, Float8Tensor +from torchao.float8.float8_utils import is_row_major, pad_tensor_for_matmul + +from torch.utils._pytree import tree_map + +aten = torch.ops.aten +c10d_functional = torch.ops.c10d_functional +_c10d_functional = torch.ops._c10d_functional +FLOAT8_OPS_TABLE: Dict[Any, Any] = {} + + +def implements(aten_ops): + """Register aten ops to the float8 op table""" + + def decorator(func): + for op in aten_ops: + FLOAT8_OPS_TABLE[op] = func + return func + + return decorator + + +@implements( + [ + aten.view.default, + aten._unsafe_view.default, + aten.t.default, + aten.as_strided.default, + aten.clone.default, + aten.detach.default, + aten.slice.Tensor, + aten.transpose.int, + aten.fill_.Scalar, + ] +) +def float8_desugar_op(aten_op, args, kwargs=None): + new_data = aten_op(args[0]._data, *args[1:], **kwargs) + return Float8Tensor( + new_data, + args[0]._scale, + args[0]._orig_dtype, + args[0]._linear_mm_config, + args[0]._gemm_input_role, + ) + + +@implements([aten.split.Tensor]) +def float8_split(aten_op, args, kwargs=None): + new_data_tensors = aten_op(args[0]._data, *args[1:], **kwargs) + + def make_float8(data): + return Float8Tensor( + data, + args[0]._scale, + args[0]._orig_dtype, + args[0]._linear_mm_config, + args[0]._gemm_input_role, + ) + + out = map(make_float8, new_data_tensors) + return list(out) + + +# Errors cant `cat_cuda float8 e4m3fn` +@implements([aten.cat.default]) +def float8_cat(aten_op, args, kwargs=None): + chunked_tensors: Tuple[Float8Tensor] = args[0] + + orig_dtype = chunked_tensors[0]._orig_dtype + scale = chunked_tensors[0]._scale + mm_config = chunked_tensors[0]._linear_mm_config + fp8_dtype = chunked_tensors[0]._data.dtype + gemm_input_role = chunked_tensors[0]._gemm_input_role + chunk_data = [] + for chunk in chunked_tensors: + assert isinstance( + chunk, Float8Tensor + ), "Expecting all chunks to be of type Float8Tensor" + assert ( + chunk._orig_dtype == orig_dtype + ), "Expecting all chunks to be of the same dtype" + assert ( + chunk._scale is scale + ), "Expecting all chunks to have thee same scale as a result of a split" + assert ( + chunk._linear_mm_config is mm_config + ), "Expecting all chunks to have thee 
same mm config as a result of a split" + assert ( + chunk._data.dtype == fp8_dtype + ), "Expecting all chunks to be of the same dtype as a result of a split" + assert ( + chunk._gemm_input_role is gemm_input_role + ), "Expecting all chunks to have the same gemm_input_role as a result of a split" + chunk_data.append(chunk._data.view(torch.uint8)) + + new_data = aten_op(chunk_data, *args[1:], **kwargs) + new_data = new_data.view(fp8_dtype) + return Float8Tensor(new_data, scale, orig_dtype, mm_config, gemm_input_role) + + +@implements([aten.sum.dim_IntList]) +def float8_cast_up_op(aten_op, args, kwargs=None): + """Be careful with this function, this is a "fallback" op that + casts the output of the op to the original precision. And performs the op. + + We currently need this to support the backward for admmm bias. + "addmm" -> out + "hp_gradBias" <-"sum" <- "identity" <- gradOut <- "hp_gradOut" + """ + + def unwrap(x): + if isinstance(x, Float8Tensor): + return x.to_original_precision() + return x + + new_args = tree_map(unwrap, args) + new_kwargs = tree_map(unwrap, kwargs) + return aten_op(*new_args, **new_kwargs) + + +def preprocess_addmm(a: Float8Tensor, b: Float8Tensor): + a_data = a._data + a_scale = a._scale + b_data = b._data + + scaled_mm_config = choose_scaled_mm_config( + a._gemm_input_role, + a._linear_mm_config, + b._gemm_input_role, + b._linear_mm_config, + ) + + if scaled_mm_config.pad_inner_dim: + assert a._data.size(1) == b._data.size( + 0 + ), f"Inner dims must match for mm, got {a._data.size(1)} and {b._data.size(0)}" + a_data = pad_tensor_for_matmul(a_data, dims=1) + b_data = pad_tensor_for_matmul(b_data, dims=0) + + if not is_row_major(a_data.stride()): + a_data = a_data.contiguous() + if is_row_major(b_data.stride()): + b_data = b_data.t().contiguous().t() + b_scale = b._scale + return a_data, a_scale, b_data, b_scale + + +@implements([aten.mm.default, aten.matmul.default]) +def float8_mm(aten_op, args, kwargs=None): + a = args[0] + b = args[1] + + assert isinstance(a, Float8Tensor) and isinstance( + b, Float8Tensor + ), "Expecting both Float8Tensor for mm inputs but found {} and {}".format( + type(a), type(b) + ) + a_data, a_scale, b_data, b_scale = preprocess_addmm(a, b) + output_dtype = a._orig_dtype + scaled_mm_config = choose_scaled_mm_config( + a._gemm_input_role, + a._linear_mm_config, + b._gemm_input_role, + b._linear_mm_config, + ) + if scaled_mm_config.emulate: + return torch.ops.aten.mm_float8_emulated( + a._data, a._scale, b._data, b._scale, output_dtype + ) + tensor_out = addmm_float8_unwrapped( + a_data, + a_scale, + b_data, + b_scale, + output_dtype, + output_scale=None, + bias=None, + use_fast_accum=scaled_mm_config.use_fast_accum, + ) + return tensor_out + + +@implements([aten.addmm.default]) +def float8_addmm(aten_op, args, kwargs=None): + assert ( + isinstance(args[0], torch.Tensor) + and isinstance(args[1], Float8Tensor) + and isinstance(args[2], Float8Tensor) + ) + bias = args[0] + a = args[1] + b = args[2] + a_data, a_scale, b_data, b_scale = preprocess_addmm(a, b) + output_dtype = a._orig_dtype + assert bias.dtype == output_dtype, "bias dtype must match output dtype" + scaled_mm_config = choose_scaled_mm_config( + a._gemm_input_role, + a._linear_mm_config, + b._gemm_input_role, + b._linear_mm_config, + ) + if scaled_mm_config.emulate: + out = torch.ops.aten.mm_float8_emulated( + a._data, a._scale, b._data, b._scale, output_dtype + ) + return out + bias + tensor_out = addmm_float8_unwrapped( + a_data, + a_scale, + b_data, + b_scale, + output_dtype, + 
output_scale=None, + bias=bias, + use_fast_accum=scaled_mm_config.use_fast_accum, + ) + return tensor_out + + +@implements([aten.is_same_size.default]) +def float8_is_same_size(aten_op, args, kwargs=None): + return args[0].shape == args[1].shape + + +@implements([aten._to_copy.default]) +def autocast_to_copy(aten_op, args, kwargs=None): + """This gets called when running matmul under autocast + when the input is a Float8Tensor, presenting as a fp32 + tensor. + """ + assert isinstance(args[0], Float8Tensor) + assert ( + len(kwargs) == 1 and "dtype" in kwargs + ), "Only support dtype kwarg for autocast" + assert kwargs["dtype"] in { + torch.float16, + torch.bfloat16, + }, "Only support floating point conversion for autocast w/ Float8Tensor" + return Float8Tensor( + args[0]._data, + args[0]._scale, + kwargs["dtype"], + args[0]._linear_mm_config, + args[0]._gemm_input_role, + ) + + +@implements( + [ + c10d_functional.all_gather_into_tensor.default, + _c10d_functional.all_gather_into_tensor.default, + ] +) +def allgather_fp8(aten_op, args, kwargs=None): + """ + override funcol with FP8 handling + """ + fp8_input = args[0] + assert isinstance( + fp8_input, Float8Tensor + ), f"expecting a Float8Tensor for allgather but found {type(fp8_input)}" + + fp8_data = fp8_input._data + fp8_data = fp8_data.contiguous() + fp8_out = aten_op(fp8_data, *args[1:], **kwargs) + return Float8Tensor( + fp8_out, + fp8_input._scale, + fp8_input._orig_dtype, + fp8_input._linear_mm_config, + fp8_input._gemm_input_role, + ) + + +@implements([c10d_functional.wait_tensor.default, _c10d_functional.wait_tensor.default]) +def wait_tensor_fp8(aten_op, args, kwargs=None): + fp8_input = args[0] + assert isinstance(fp8_input, Float8Tensor) + + fp8_data = fp8_input._data + fp8_out = aten_op(fp8_data, *args[1:], **kwargs) + return Float8Tensor( + fp8_out, + fp8_input._scale, + fp8_input._orig_dtype, + fp8_input._linear_mm_config, + fp8_input._gemm_input_role, + ) + + +@implements([aten.index_put_.default]) +def index_put_fp8(aten_op, args, kwargs=None): + fp8_self = args[0] + fp8_values = args[2] + assert isinstance(fp8_self, Float8Tensor) + assert isinstance(fp8_values, Float8Tensor) + assert fp8_self._scale == fp8_values._scale + assert fp8_self.dtype == fp8_values.dtype + assert fp8_self._orig_dtype == fp8_values._orig_dtype + + fp8_data = fp8_self._data + fp8_values_data = fp8_values._data + fp8_out = aten_op(fp8_data, args[1], fp8_values_data, *args[3:], **kwargs) + return Float8Tensor( + fp8_out, + fp8_self._scale, + fp8_self._orig_dtype, + fp8_self._linear_mm_config, + fp8_self._gemm_input_role, + ) + + +@implements([aten.copy_.default]) +def copy_fp8(aten_op, args, kwargs=None): + # For a copy op with Float8Tensors involved, only the following combinations are allowed: + # 1. self is a high precision (hp) tensor, src is a Float8Tensor: + # in this case src is upcasted and unscaled to go into the hp tensor + # 2. 
self and src are Float8Tensors:
+    #      the copy is only allowed if all the Float8Tensor properties are equal (a la torch.cat)
+    # Every other combination is banned as the semantics are not well defined
+
+    self = args[0]
+    src = args[1]
+
+    if not isinstance(self, Float8Tensor) and isinstance(src, Float8Tensor):
+        src_hp = src.to_original_precision()
+        return aten_op(self, src_hp, *args[2:], **kwargs)
+    elif isinstance(self, Float8Tensor) and isinstance(src, Float8Tensor):
+        assert (
+            self._orig_dtype == src._orig_dtype
+        ), "Expecting both Float8Tensors to be of the same dtype"
+        assert (
+            self._scale == src._scale
+        ), "Expecting both Float8Tensors to have the same scale"
+        assert (
+            self._linear_mm_config == src._linear_mm_config
+        ), "Expecting both Float8Tensors to have the same mm config"
+        assert (
+            self._data.dtype == src._data.dtype
+        ), "Expecting both Float8Tensors to be of the same float8 dtype"
+        assert (
+            self._gemm_input_role == src._gemm_input_role
+        ), "Expecting both Float8Tensors to have the same gemm_input_role"
+        fp8_out = aten_op(self._data, src._data, *args[2:], **kwargs)
+        return Float8Tensor(
+            fp8_out,
+            self._scale,
+            self._orig_dtype,
+            self._linear_mm_config,
+            self._gemm_input_role,
+        )
+    else:
+        raise RuntimeError("Unsupported semantics for copy_ in Float8Tensor")
diff --git a/torchao/float8/float8_python_api.py b/torchao/float8/float8_python_api.py
new file mode 100644
index 000000000..16e270574
--- /dev/null
+++ b/torchao/float8/float8_python_api.py
@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+"""
+This file defines the Python functions for float8 which expect inputs
+of class `Float8Tensor`. This is a thin wrapper on top of the aten API
+to simplify the product code.
+"""
+
+from typing import Optional
+
+import torchao.float8.float8_aten_api  # noqa
+
+import torch
+
+
+# [Note] Usage of scales
+# The meaning of scale in this library can be found in the definition of the Float8Tensor
+# cuBLAS defines scale to always mean a multiplicative factor for the respective matrices
+# For a, b going from fp8 -> fp32 we multiply by the inverse of the scale
+# For output going from fp32 -> fp8 we multiply by the scale
+def addmm_float8_unwrapped(
+    a_data: torch.Tensor,
+    a_scale: torch.Tensor,
+    b_data: torch.Tensor,
+    b_scale: torch.Tensor,
+    output_dtype: torch.dtype,
+    output_scale: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    use_fast_accum: bool = False,
+) -> torch.Tensor:
+    """
+    This is the unwrapped version of addmm_float8, which does not take in Float8Tensors
+    as inputs. This is used to standardize the logic between subclassed and non-subclassed
+    versions of the linear module.
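A numeric sketch of the scale convention in the note above (not part of the patch). It assumes an fp8-capable GPU and a PyTorch build whose `torch._scaled_mm` matches the wrapper in this file; shapes are multiples of 16 to satisfy the layout and alignment rules.

```python
# Sketch of the scale convention: hp -> fp8 multiplies by the scale,
# the gemm wrapper passes the inverse scales to torch._scaled_mm.
import torch
from torchao.float8.float8_python_api import addmm_float8_unwrapped
from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated

a = torch.randn(32, 64, device="cuda")
b = torch.randn(64, 16, device="cuda")

a_scale = tensor_to_scale(a, torch.float8_e4m3fn)
b_scale = tensor_to_scale(b, torch.float8_e4m3fn)
a_fp8 = to_fp8_saturated(a * a_scale, torch.float8_e4m3fn)
# _scaled_mm expects the second operand in column-major layout
b_fp8 = to_fp8_saturated(b * b_scale, torch.float8_e4m3fn).t().contiguous().t()

out = addmm_float8_unwrapped(
    a_fp8, a_scale, b_fp8, b_scale, output_dtype=torch.bfloat16
)
print((out - (a @ b).to(torch.bfloat16)).abs().max())  # small quantization error
```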
+ """ + a_inverse_scale = a_scale.reciprocal() + b_inverse_scale = b_scale.reciprocal() + if output_dtype == torch.float32 and bias is not None: + # Bias is not supported by _scaled_mm when output is fp32 + output = torch._scaled_mm( + a_data, + b_data, + scale_a=a_inverse_scale, + scale_b=b_inverse_scale, + scale_result=output_scale, + out_dtype=output_dtype, + use_fast_accum=use_fast_accum, + ) + output += bias + return output + output = torch._scaled_mm( + a_data, + b_data, + scale_a=a_inverse_scale, + scale_b=b_inverse_scale, + bias=bias, + scale_result=output_scale, + out_dtype=output_dtype, + use_fast_accum=use_fast_accum, + ) + return output diff --git a/torchao/float8/float8_scaling_utils.py b/torchao/float8/float8_scaling_utils.py new file mode 100644 index 000000000..bbf140eff --- /dev/null +++ b/torchao/float8/float8_scaling_utils.py @@ -0,0 +1,216 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +""" +Utilities for scaling high precision tensors to float8. +""" + +from typing import Optional + +import torch + +from torchao.float8.float8_tensor import ( + Float8Tensor, + GemmInputRole, + hp_tensor_and_scale_to_float8, + LinearMMConfig, + ScaledMMConfig, + tensor_already_casted_to_fp8, +) + +from torchao.float8.float8_utils import ( + amax_history_to_scale, + e4m3_dtype, + e5m2_dtype, + tensor_to_amax, + tensor_to_scale, +) + + +def hp_tensor_to_float8_dynamic( + hp_tensor: torch.Tensor, + float8_dtype: torch.dtype, + linear_mm_config: LinearMMConfig, + reduce_amax: bool = False, + gemm_input_role: GemmInputRole = GemmInputRole.INPUT, +) -> Float8Tensor: + """ + Given a high precision tensor `hp_tensor`, + scales `hp_tensor` dynamically and returns a `Float8Tensor` of the result. + + Args: + hp_tensor: the tensor to convert + float8_dtype: the float8 dtype to use + linear_mm_config: Defines the configuration for the scaled_mm for + the 3 fwd/bwd gemms of linear + reduce_amax: whether to reduce the max(abs(hp_tensor)) value across distributed ranks + gemm_input_role: Defines the role of this tensor (input, weight or grad_output) in + the 3 fwd/bwd gemms of linear + """ + if tensor_already_casted_to_fp8(hp_tensor): + return hp_tensor + scale = tensor_to_scale(hp_tensor, float8_dtype, reduce_amax) + return hp_tensor_and_scale_to_float8( + hp_tensor, + scale, + float8_dtype, + linear_mm_config, + gemm_input_role, + ) + + +def hp_tensor_to_float8_delayed( + hp_tensor: torch.Tensor, + s: torch.Tensor, + float8_dtype: torch.dtype, + amax_buffer: torch.Tensor, + linear_mm_config: Optional[LinearMMConfig] = None, + gemm_input_role: Optional[GemmInputRole] = GemmInputRole.INPUT, +) -> Float8Tensor: + """ + Given a high precision tensor `hp_tensor` and relevant metadata, scales it using + delayed scaling and returns a `Float8Tensor` of the result. Specifically: + 1. calculates max(abs(hp_tensor)) and stores the result in `amax_buffer`, inplace + 2. 
scales `hp_tensor` by `s` and returns the result wrapped in Float8Tensor + + Args: + hp_tensor: the tensor to convert + s: the scale to use to convert the tensor + float8_dtype: the float8 dtype to use + amax_buffer: the buffer to modify inplace with max(abs(hp_tensor)) + linear_mm_config: Defines the configuration for the scaled_mm for + the 3 fwd/bwd gemms of linear + gemm_input_role: Defines the role of this tensor (input, weight or grad_output) in + the 3 fwd/bwd gemms of linear + """ + amax_buffer.fill_(tensor_to_amax(hp_tensor)) + return hp_tensor_and_scale_to_float8( + hp_tensor, + s, + float8_dtype, + linear_mm_config, + gemm_input_role, + ) + + +def _maybe_initialize_amaxes_scales_for_float8_cast( + x, + cur_amax, + amax_history, + scale, + scale_fn_name, + float8_dtype, + is_initialized, + reduce_amax, +): + """ + If x is about to be cast to `float8` and the amax buffers are not initialized, + initializes them inplace. + """ + if is_initialized: + return + with torch.no_grad(): + # Note: we need to enable distributed reduction here in order + # to match numerics between single GPU and multi GPU code for + # activations and gradients + new_amax = tensor_to_amax(x, reduce_amax=reduce_amax) + cur_amax.fill_(new_amax) + amax_history[0] = new_amax + new_scale = amax_history_to_scale( + amax_history, float8_dtype, x.dtype, scale_fn_name + ) + scale.copy_(new_scale) + + +@torch._dynamo.allow_in_graph +class NoopFwToFloat8E5M2BwDelayed(torch.autograd.Function): + """ + Forward: no-op + Backward: convert to float8_e5m2 with delayed scaling, initialize if needed + """ + + @staticmethod + def forward( + ctx, + tensor, + fp8_amax_grad_output, + fp8_amax_history_grad_output, + fp8_scale_grad_output, + scale_fn_name, + is_amax_initialized, + linear_mm_config: LinearMMConfig, + ): + ctx.save_for_backward( + fp8_amax_grad_output, fp8_amax_history_grad_output, fp8_scale_grad_output + ) + ctx.scale_fn_name = scale_fn_name + ctx.is_amax_initialized = is_amax_initialized + ctx.linear_mm_config = linear_mm_config + return tensor + + @staticmethod + def backward(ctx, go): + ( + fp8_amax_grad_output, + fp8_amax_history_grad_output, + fp8_scale_grad_output, + ) = ctx.saved_tensors + scale_fn_name = ctx.scale_fn_name + is_amax_initialized = ctx.is_amax_initialized + + _maybe_initialize_amaxes_scales_for_float8_cast( + go, + fp8_amax_grad_output, + fp8_amax_history_grad_output, + fp8_scale_grad_output, + scale_fn_name, + e5m2_dtype, + is_amax_initialized, + reduce_amax=True, + ) + + fp8_amax_grad_output.fill_(tensor_to_amax(go)) + + res = hp_tensor_and_scale_to_float8( + go, + fp8_scale_grad_output, + e5m2_dtype, + ctx.linear_mm_config, + GemmInputRole.GRAD_OUTPUT, + ) + empty_grads = None, None, None, None, None, None + return res, *empty_grads + + +@torch._dynamo.allow_in_graph +class NoopFwToFloat8E5M2BwDynamic(torch.autograd.Function): + """ + Forward: no-op + Backward: convert to float8_e5m2 with dynamic scaling + """ + + @staticmethod + def forward( + ctx, + tensor, + linear_mm_config: LinearMMConfig, + ): + ctx.linear_mm_config = linear_mm_config + return tensor + + @staticmethod + def backward(ctx, gradY): + if tensor_already_casted_to_fp8(gradY): + return gradY, None + gradY_scale = tensor_to_scale(gradY, e5m2_dtype) + fp8_tensor = hp_tensor_and_scale_to_float8( + gradY, + gradY_scale, + e5m2_dtype, + ctx.linear_mm_config, + GemmInputRole.GRAD_OUTPUT, + ) + return fp8_tensor, None diff --git a/torchao/float8/float8_tensor.py b/torchao/float8/float8_tensor.py new file mode 100644 index 
000000000..a858408fe --- /dev/null +++ b/torchao/float8/float8_tensor.py @@ -0,0 +1,363 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +import enum +from collections import namedtuple +from typing import Dict, Optional + +import torch + +import torch.distributed._functional_collectives as funcol +from torchao.float8.float8_utils import ( + e4m3_dtype, + tensor_to_amax, + to_fp8_saturated, +) +from torch.distributed._tensor import DTensor + +aten = torch.ops.aten + +# +# A note on configuration of float8 logic in a linear +# TODO(future): move all the configs to separate file +# TODO(future): change this to input/weight/grad_output notation, +# can be separate PR because none of this is user facing +# +# There are three gemms in a forward + backward of a Linear layer: +# +# 1. input @ weight_t = output (forward pass) +# 2. grad_output @ weight = grad_input (backward pass) +# 3. input_t @ grad_output = grad_weight (backward pass) +# +# In the formulas above, there are: +# A. six input tensors (input, input_t, weight, weight_t, grad_output, grad_output_t). +# - Note that grad_output_t is implied because of memory format requirements +# of float8 gemms +# B. three output tensors (output, grad_input, grad_weight) +# +# We want each input tensor, gemm, and output tensor to be configurable. +# The state of this configuration today is: +# +# i. pairs of input tensors (non-t and t variants) have their scaling +# configurable via the scaling_type_* arguments to Float8Linear +# ii. each gemm + output is configurable via ScaledMMConfig, which is not user facing +# iii. LinearMMConfig is a container for the three ScaledMMConfig objects needed +# to configure all three gemms, also not user facing + + +# ScaledMMConfig is a namedtuple that defines the configuration for the scaled_mm in the forward and backward pass. +# emulate: whether to emulate the matmuls in fp32 +# use_fast_accum: whether to use the fast-accumulation option for scaled_mm +# fp8_output: whether to output the result of the scaled_mm in fp8 +# pad_inner_dim: whether to pad the inner dimension of a and b with 0s. This is needed for matmuls not aligned to 16. +ScaledMMConfig = namedtuple( + "ScaledMMConfig", + ["emulate", "use_fast_accum", "fp8_output", "pad_inner_dim"], + defaults=[False, False, False, False], +) + +# The object below is not user facing and exists for convenience, +# to allow Float8Tensor to use +# the right config based on which gemm from gemms with outputs +# `output`, `grad_input`, `grad_weight` is +# being called. +LinearMMConfig = namedtuple( + "LinearMMConfig", + ["output", "grad_input", "grad_weight"], + defaults=[ + ScaledMMConfig(False, True, False, False), + ScaledMMConfig(False, False, False, False), + ScaledMMConfig(False, False, False, False), + ], +) + + +class GemmInputRole(enum.Enum): + """ + Given a Float8Tensor, the enum below describes the expected role of this + tensor in the three gemms present in the fw + bw pass of a Linear layer. + This is used to choose the right config for a float8 gemm when the + gemm is performed. 
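A pure-Python sketch of how the role-based config selection described in the note above resolves for the three gemms of a linear layer (not part of the patch; no GPU needed, only the namedtuples and enum defined in this file).

```python
# Sketch: which ScaledMMConfig is chosen for each of the three gemms of a
# linear layer, based on the GemmInputRole of the two operands.
from torchao.float8.float8_tensor import (
    GemmInputRole,
    LinearMMConfig,
    ScaledMMConfig,
    choose_scaled_mm_config,
)

cfg = LinearMMConfig(
    output=ScaledMMConfig(use_fast_accum=True),
    grad_input=ScaledMMConfig(),
    grad_weight=ScaledMMConfig(),
)

# output gemm (input, weight)           -> cfg.output
assert choose_scaled_mm_config(GemmInputRole.INPUT, cfg, GemmInputRole.WEIGHT, cfg) is cfg.output
# grad_input gemm (grad_output, weight) -> cfg.grad_input
assert choose_scaled_mm_config(GemmInputRole.GRAD_OUTPUT, cfg, GemmInputRole.WEIGHT, cfg) is cfg.grad_input
# grad_weight gemm (grad_output, input) -> cfg.grad_weight
assert choose_scaled_mm_config(GemmInputRole.GRAD_OUTPUT, cfg, GemmInputRole.INPUT, cfg) is cfg.grad_weight
```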
+ """ + + INPUT = "input" + WEIGHT = "weight" + GRAD_OUTPUT = "grad_output" + + +# choose which scaled_mm_config to use based on gemm inputs +def choose_scaled_mm_config( + a_role: GemmInputRole, + a_linear_mm_config: LinearMMConfig, + b_role: GemmInputRole, + b_linear_mm_config: LinearMMConfig, +): + if a_role is GemmInputRole.INPUT and b_role is GemmInputRole.WEIGHT: + assert ( + a_linear_mm_config.output == b_linear_mm_config.output + ), f"linear_mm_config.output mismatch: {a_linear_mm_config.output} vs {b_linear_mm_config.output}" + return a_linear_mm_config.output + elif a_role is GemmInputRole.GRAD_OUTPUT and b_role is GemmInputRole.WEIGHT: + assert ( + a_linear_mm_config.grad_input == b_linear_mm_config.grad_input + ), f"linear_mm_config.grad_input mismatch: {a_linear_mm_config.grad_input} vs {b_linear_mm_config.grad_input}" + return a_linear_mm_config.grad_input + elif a_role is GemmInputRole.GRAD_OUTPUT and b_role is GemmInputRole.INPUT: + assert ( + a_linear_mm_config.grad_weight == b_linear_mm_config.grad_weight + ), f"linear_mm_config.grad_weight mismatch: {a_linear_mm_config.grad_weight} vs {b_linear_mm_config.grad_weight}" + return a_linear_mm_config.grad_weight + else: + raise AssertionError(f"unexpected a_role {a_role} and b_role {b_role}") + + +def tensor_already_casted_to_fp8(tensor: torch.Tensor) -> bool: + """ + Check if the tensor is already casted to fp8 + """ + if isinstance(tensor, Float8Tensor): + return True + elif isinstance(tensor, DTensor): + # TODO: shall we stick to public API and directly use tensor.to_local() here? + return tensor_already_casted_to_fp8(tensor._local_tensor) + elif isinstance(tensor, funcol.AsyncCollectiveTensor): + return tensor_already_casted_to_fp8(tensor.elem) + + return False + + +@torch._dynamo.allow_in_graph +class _ToFloat8ConstrFunc(torch.autograd.Function): + """ + A differentiable conversion to fp8. + * forward: convert from high precision to float8 + * backward: pass the gradient without changes + """ + + @staticmethod + def forward( + ctx, + tensor: torch.Tensor, + scale: torch.Tensor, + float8_dtype=e4m3_dtype, + linear_mm_config: Optional[LinearMMConfig] = None, + gemm_input_role: Optional[GemmInputRole] = GemmInputRole.INPUT, + ): + """ + This function will apply the scaling, and then convert to a Float8Tensor + + Note: + We will call this function with a DTensor subclass. Ideally this would be an aten OP + that DTensor could overload to ensure proper semantics. There are some techincal issues + with that composing with FakeTensor, so we special case here. 
+ + DTensor Invariant: DTensor must always be the outer most tensor subclass + """ + tensor_scaled = tensor * scale + bits_fp8 = to_fp8_saturated(tensor_scaled, float8_dtype) + + if isinstance(bits_fp8, DTensor): + assert isinstance( + scale, DTensor + ), "Expected Float8 scale to be a DTensor if bits_fp8 is a DTensor" + bits_mesh = bits_fp8.device_mesh + bits_placements = bits_fp8.placements + local_bits = bits_fp8.to_local() + local_scale = scale.to_local() + inner_float8_tensor = Float8Tensor( + local_bits, + local_scale, + tensor.dtype, + linear_mm_config=linear_mm_config, + gemm_input_role=gemm_input_role, + ) + return DTensor.from_local( + inner_float8_tensor, + bits_mesh, + bits_placements, + run_check=False, + shape=bits_fp8.size(), + stride=bits_fp8.stride(), + ) + + return Float8Tensor( + bits_fp8, + scale, + tensor.dtype, + linear_mm_config=linear_mm_config, + gemm_input_role=gemm_input_role, + ) + + @staticmethod + def backward(ctx, g): + return g, None, None, None, None, None + + +@torch._dynamo.allow_in_graph +class _FromFloat8ConstrFunc(torch.autograd.Function): + """ + A differentiable conversion from fp8. + * forward: convert from float8 to high precision + * backward: pass the gradient without changes + """ + + @staticmethod + def forward(ctx, tensor): + return tensor._data.to(tensor._orig_dtype) / tensor._scale + + @staticmethod + def backward(ctx, g): + return g, None, None + + +def hp_tensor_and_scale_to_float8( + hp_tensor: torch.Tensor, + s: torch.Tensor, + float8_dtype=e4m3_dtype, + linear_mm_config: Optional[LinearMMConfig] = None, + gemm_input_role: Optional[GemmInputRole] = GemmInputRole.INPUT, +): + """ + Given a high precision tensor `hp_tensor` and a precalculated scale `s`, + scales `hp_tensor` by `s` and returns a `Float8Tensor` of the result. + + Autograd-aware, the derivative is pass-through. + DTensor-aware, if the input is a DTensor the output will be DTensor(Float8Tensor). + + Args: + hp_tensor: the tensor to convert + s: the scale to use to convert the tensor + float8_dtype: the float8 dtype to use + linear_mm_config: Defines the configuration for the scaled_mm for + the 3 fwd/bwd gemms of linear + gemm_input_role: Defines the role of this tensor (input, weight or grad_output) in + the 3 fwd/bwd gemms of linear + """ + return _ToFloat8ConstrFunc.apply( + hp_tensor, s, float8_dtype, linear_mm_config, gemm_input_role + ) + + +class Float8Tensor(torch.Tensor): + """ + Note: this is **not** a public API and is only intended to be used + inside of this repository. Please file an issue if you would benefit + from this being a public API. + + A Python-only Float8 tensor subclass. Contains: + * `_data`: the underlying e4m3 or e5m2 data + * `_scale`: the scale used to scale the original fp32 tensor. We multiply + by scale to go from fp32 range to fp8 range, and divide by scale to go + from fp8 range to fp32 range. + * `_orig_dtype`: the original dtype of the tensor used to create this + tensor. + * `_emulate`: if true using fp32 emulation for the matmuls, helpful + if you don't have access to h100 hardware. + + Intended usage of this abstraction: + 1. to bundle raw data + fp8 metadata together for easy passing through + Python PyTorch systems. + 2. Float8-aware user code can use the private fields on these tensors + to call into float8 operations. + 3. Float8-agnostic user code can use these tensors as is - they will + convert to original precision in `__torch_dispatch__`. 
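A minimal roundtrip sketch for the tensor subclass described above (not part of the patch). It only exercises the cast helpers, so it runs on CPU with a recent PyTorch that supports the float8 dtypes; no float8 matmul is performed.

```python
# Sketch: cast a bf16 tensor to Float8Tensor and back, and inspect the
# private fields documented above.
import torch
from torchao.float8.float8_tensor import hp_tensor_and_scale_to_float8
from torchao.float8.float8_utils import tensor_to_scale

x = torch.randn(8, 8, dtype=torch.bfloat16)
scale = tensor_to_scale(x, torch.float8_e4m3fn)
x_fp8 = hp_tensor_and_scale_to_float8(x, scale, torch.float8_e4m3fn)

print(x_fp8._data.dtype)    # torch.float8_e4m3fn
print(x_fp8._orig_dtype)    # torch.bfloat16
x_back = x_fp8.to_original_precision()
print((x - x_back).abs().max())  # small quantization error
```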
+ """ + + _data: torch.Tensor + _scale: torch.Tensor + _orig_dtype: torch.dtype + _linear_mm_config: LinearMMConfig + __slots__ = ["_data", "_scale", "_orig_dtype", "_linear_mm_config"] + + def __new__( + cls, + data: torch.Tensor, + scale: torch.Tensor, + orig_dtype: torch.dtype, + linear_mm_config: Optional[LinearMMConfig], + gemm_input_role: Optional[GemmInputRole] = GemmInputRole.INPUT, + ): + assert ( + scale.numel() == 1 + ), "Scale should contain a single value, but got: {} elements".format( + scale.numel() + ) + + self = torch.Tensor._make_wrapper_subclass( + cls, + data.size(), + strides=data.stride(), + storage_offset=data.storage_offset(), + dtype=orig_dtype, + layout=data.layout, + requires_grad=data.requires_grad, + device=data.device, + ) + self._data = data + self._scale = scale + self._orig_dtype = orig_dtype + self._linear_mm_config = ( + linear_mm_config if linear_mm_config is not None else LinearMMConfig() + ) + self._gemm_input_role = gemm_input_role + + return self + + def __repr__(self): + return f"Float8Tensor(dtype={self._data.dtype}, scale={self._scale}, linear_mm_config={self._linear_mm_config}\ngemm_input_role={self._gemm_input_role}\nas_orig_prec={self.to_original_precision()}" + + def __tensor_flatten__(self): + ctx = { + "_orig_dtype": self._orig_dtype, + "_linear_mm_config": self._linear_mm_config, + "_gemm_input_role": self._gemm_input_role, + } + return ["_data", "_scale"], ctx + + @staticmethod + def __tensor_unflatten__(inner_tensors: Dict, metadata, outer_size, outer_stride): + assert len(inner_tensors) == 2 + return Float8Tensor( + inner_tensors["_data"], + inner_tensors["_scale"], + metadata["_orig_dtype"], + metadata["_linear_mm_config"], + metadata["_gemm_input_role"], + ) + + def to_original_precision(self): + return _FromFloat8ConstrFunc.apply(self) + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs=None): + # 1. tracing through __torch_function__ logic is not supported yet in + # PT2.0, so we explicitly disallow it here for callsites from user code. + # 2. We do need to handle a couple of ops in order for + # TorchDynamo tracing to succeed. + + # Lazy import to avoid circular dependency + from torchao.float8.float8_ops import FLOAT8_OPS_TABLE + + # All ops in the FLOAT8_OPS_TABLE expect Float8Tensor as inputs + # And don't support mixed tensor subclasses. 
This will trigger the handler for + # the next type in the dispatch list + def allowed_subclasses(type): + return ( + issubclass(cls, type) + or issubclass(torch._subclasses.fake_tensor.FakeTensor, type) + or issubclass( + torch._subclasses.functional_tensor.FunctionalTensor, type + ) + ) + + if not all(allowed_subclasses(t) for t in types): + return NotImplemented + + if func in FLOAT8_OPS_TABLE: + return FLOAT8_OPS_TABLE[func](func, args, kwargs) + raise NotImplementedError(f"attempting to run {func}, this is not supported") + + # Do not force the Float8Tensor type on the returned tensor + __torch_function__ = torch._C._disabled_torch_function_impl diff --git a/torchao/float8/float8_tensor_parallel.py b/torchao/float8/float8_tensor_parallel.py new file mode 100644 index 000000000..affec2a76 --- /dev/null +++ b/torchao/float8/float8_tensor_parallel.py @@ -0,0 +1,235 @@ +import torch +import torch.nn as nn +from torchao.float8.config import ScalingType +from torchao.float8.float8_scaling_utils import ( + hp_tensor_to_float8_dynamic, + NoopFwToFloat8E5M2BwDynamic, +) +from torchao.float8.float8_tensor import GemmInputRole +from torchao.float8.float8_utils import e4m3_dtype +from torch.distributed._tensor import DTensor +from torch.distributed.device_mesh import DeviceMesh +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, +) + +# subclass the ColwiseParallel and RowwiseParallel classes +# to add the float8 support +# The parameter sharding stays the same as the core +# ColwiseParallel and RowwiseParallel, the only difference +# here is that in input/output handling we do casting after +# creating the DTensor. + +# NOTE: This only works and tested with the dynamic scaling + + +def _float8_linear_supports_float8_allgather(m): + # TODO(future): add support for delayed scaling for activations + # and gradients + return ( + m.scaling_type_input == ScalingType.DYNAMIC + and m.scaling_type_grad_output == ScalingType.DYNAMIC + ) + + +class Float8ColwiseParallel(ColwiseParallel): + @staticmethod + def _prepare_input_fn( + input_layouts, desired_input_layouts, mod, inputs, device_mesh + ): + # annotate module input placements/sharding with input_layouts + input_tensor = inputs[0] + if not isinstance(input_tensor, DTensor): + input_tensor = DTensor.from_local( + input_tensor, device_mesh, input_layouts, run_check=False + ) + + input_tensor = hp_tensor_to_float8_dynamic( + input_tensor, + e4m3_dtype, + mod.linear_mm_config, + gemm_input_role=GemmInputRole.INPUT, + ) # DTensor(Float8Tensor) + + # transform the input layouts to the desired layouts of ColwiseParallel + if input_layouts != desired_input_layouts: + input_tensor = input_tensor.redistribute( + placements=desired_input_layouts, async_op=True + ) + return input_tensor + + @staticmethod + def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh): + # outputs is a shard on last dimension DTensor, i.e. 
Shard(-1) + if outputs.placements != output_layouts: + outputs = outputs.redistribute( + placements=output_layouts, async_op=True + ) # DTensor(torch.Tensor) + + # fwd noop bwd cast to DTensor(Float8Tensor) + outputs = NoopFwToFloat8E5M2BwDynamic.apply(outputs, mod.linear_mm_config) + + # back to local tensor + return outputs.to_local() if use_local_output else outputs + + def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: + from torchao.float8.float8_linear import Float8Linear + + if not isinstance(module, Float8Linear): + raise ValueError( + f"Expecting module to be Float8Linear but found {type(module)}" + ) + elif isinstance( + module, Float8Linear + ) and not _float8_linear_supports_float8_allgather(module): + raise AssertionError("unsupported") + + return super()._apply(module, device_mesh) + + +class Float8RowwiseParallel(RowwiseParallel): + @staticmethod + def _prepare_input_fn( + input_layouts, desired_input_layouts, mod, inputs, device_mesh + ): + input_tensor = inputs[0] + if not isinstance(input_tensor, DTensor): + input_tensor = DTensor.from_local( + input_tensor, device_mesh, input_layouts, run_check=False + ) + + input_tensor = hp_tensor_to_float8_dynamic( + input_tensor, + e4m3_dtype, + mod.linear_mm_config, + gemm_input_role=GemmInputRole.INPUT, + ) # DTensor(Float8Tensor) + + if input_layouts != desired_input_layouts: + input_tensor = input_tensor.redistribute( + placements=desired_input_layouts, async_op=True + ) + return input_tensor + + @staticmethod + def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh): + # Rowwise sharding produces partial output, depending on output layouts: + # 1. to replicate -> allreduce + # 2. to shard -> reduce_scatter + if outputs.placements != output_layouts: + outputs = outputs.redistribute(placements=output_layouts, async_op=True) + + # fwd noop bwd cast to DTensor(Float8Tensor) + outputs = NoopFwToFloat8E5M2BwDynamic.apply(outputs, mod.linear_mm_config) + + # back to local tensor if use_local_output is True + return outputs.to_local() if use_local_output else outputs + + def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: + from torchao.float8.float8_linear import Float8Linear + + if not isinstance(module, Float8Linear): + raise ValueError( + f"Expecting module to be Float8Linear but found {type(module)}" + ) + elif isinstance( + module, Float8Linear + ) and not _float8_linear_supports_float8_allgather(module): + raise AssertionError("unsupported") + + return super()._apply(module, device_mesh) + + +class PrepareFloat8ModuleInput(PrepareModuleInput): + # subclass the PrepareModuleInput classes to implement fp8 specific logic, the only difference is that + # after we prepare the input DTensor, we cast the input to DTensor(Float8Tensor) + # This is to ensure the float8 cast happens before the all-gather (i.e. Shard -> Replicate) + # so that if there are multiple float8 users of the input activation, we perform fp8 allgather + # only once. + # FP8 Args: + # float8_dtype (torch.dtype, optional): control what float8 dtype to cast to when prepare the module input, + # we currently only support torch.float8_e4m3fn. default: torch.float8_e4m3fn + # fwd_config_submodule_fqn (str, optional): the fqn of the submodule that contains the forward config used + # for the float8 cast. If not specified, we will search for the Float8Linear in the submodules + # and use the forward config from that module, in this case all module's forward config must be + # the same. 
+ + def __init__( + self, + *, + input_layouts=None, + desired_input_layouts=None, + input_kwarg_layouts=None, + desired_input_kwarg_layouts=None, + use_local_output=False, + float8_dtype=torch.float8_e4m3fn, + fwd_config_submodule_fqn=None, + ): + super().__init__( + input_layouts=input_layouts, + desired_input_layouts=desired_input_layouts, + input_kwarg_layouts=input_kwarg_layouts, + desired_input_kwarg_layouts=desired_input_kwarg_layouts, + use_local_output=use_local_output, + ) + + # fp8 specific fields + self.float8_dtype = float8_dtype + self.linear_mm_config = None + self.fwd_config_submodule_fqn = fwd_config_submodule_fqn + + if self.float8_dtype != torch.float8_e4m3fn: + raise NotImplementedError( + "PrepareFloat8ModuleInput only support casting to float8_e4m3fn for now" + ) + + def _prepare_input_arg(self, input, mesh, input_layout, desired_layout): + if input_layout is not None: + if isinstance(input, DTensor): + # TODO: re-enable the check once we fix the compile path + # assert inp.placements[0] == input_layout + dt_inp = input + else: + assert isinstance( + input, torch.Tensor + ), "expecting input to be a torch.Tensor!" + dt_inp = DTensor.from_local( + input, mesh, (input_layout,), run_check=False + ) + + dt_inp = hp_tensor_to_float8_dynamic( + dt_inp, + e4m3_dtype, + self.linear_mm_config, + gemm_input_role=GemmInputRole.INPUT, + ) # DTensor(Float8Tensor) + if desired_layout is not None and input_layout != desired_layout: + dt_inp = dt_inp.redistribute(placements=(desired_layout,)) + + return dt_inp.to_local() if self.use_local_output else dt_inp + else: + return input + + def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module: + from torchao.float8.float8_linear import Float8Linear + + if self.fwd_config_submodule_fqn is not None: + fwd_linear = module.get_submodule(self.fwd_config_submodule_fqn) + assert isinstance(fwd_linear, Float8Linear) + self.linear_mm_config = fwd_linear.linear_mm_config + else: + # search for ScaledMM configs for all the submodules and make sure they are the same + for mod in module.modules(): + if isinstance(mod, Float8Linear): + if self.linear_mm_config is None: + self.linear_mm_config = mod.linear_mm_config + else: + assert ( + self.linear_mm_config == mod.linear_mm_config + ), "All the Float8Linear modules should have same linear_mm_config!" + + assert self.linear_mm_config is not None + super()._apply(module, device_mesh) + return module diff --git a/torchao/float8/float8_utils.py b/torchao/float8/float8_utils.py new file mode 100644 index 000000000..1d6c69d17 --- /dev/null +++ b/torchao/float8/float8_utils.py @@ -0,0 +1,247 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
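A hypothetical tensor-parallel plan using the float8-aware parallel styles above (not part of the patch). It assumes an initialized process group, a 1-D device mesh, and a `model` whose `ffn.w1`/`ffn.w2`/`ffn.out` submodules are already `Float8Linear` with dynamic scaling; all module names are illustrative.

```python
# Sketch: parallelize a feed-forward block with float8-aware TP styles so the
# activation all-gather happens in fp8 and only once.
from torch.distributed._tensor import Replicate, Shard
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import parallelize_module
from torchao.float8.float8_tensor_parallel import (
    Float8ColwiseParallel,
    Float8RowwiseParallel,
    PrepareFloat8ModuleInput,
)

mesh = init_device_mesh("cuda", (8,))
plan = {
    # cast to Float8Tensor before the all-gather (Shard -> Replicate)
    "ffn": PrepareFloat8ModuleInput(
        input_layouts=Shard(1), desired_input_layouts=Replicate()
    ),
    "ffn.w1": Float8ColwiseParallel(),
    "ffn.w2": Float8ColwiseParallel(),
    "ffn.out": Float8RowwiseParallel(output_layouts=Shard(1)),
}
model = parallelize_module(model, mesh, plan)  # `model` assumed to exist
```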
+ +from typing import Iterable, Literal, Tuple, Union + +import torchao.float8.config as config + +import torch +import torch.distributed as dist + +# Helpful visualizer for debugging (only supports fp32): +# https://www.h-schmidt.net/FloatConverter/IEEE754.html + +# avoid division by zero when calculating scale +# TODO: align this value with NVIDIA's assumptions (current value is a guess) +EPS = 1e-12 + +IS_ROCM = torch.cuda.is_available() and torch.version.hip is not None +FP8_TYPES = { + torch.float8_e4m3fn, + torch.float8_e5m2, + torch.float8_e4m3fnuz, + torch.float8_e5m2fnuz, +} + + +# User defined type for using the individual F8 type based on config +e4m3_dtype = torch.float8_e4m3fn if not config.use_fnuz_dtype else torch.float8_e4m3fnuz +e5m2_dtype = torch.float8_e5m2 if not config.use_fnuz_dtype else torch.float8_e5m2fnuz + + +@torch.no_grad() +def amax_to_scale( + amax: torch.Tensor, float8_dtype: torch.dtype, orig_dtype: torch.dtype +): + """Converts the amax value of a tensor to the fp8 scale. + Args: + amax: The amax value of the tensor. + float8_dtype: The float8 dtype. + orig_dtype: The original dtype of the tensor. + """ + scale = torch.empty_like(amax, dtype=torch.float32) + if float8_dtype in FP8_TYPES: + res = torch.finfo(float8_dtype).max / torch.clamp(amax, min=EPS) + else: + raise ValueError(f"Unsupported float8_dtype: {float8_dtype}") + + # Ensure that the scale is representable in float16, + # this helps when amax is small. We are assuming that we don't need + # to care about this for float32/bfloat16. + if orig_dtype is torch.float16: + res = torch.clamp(res, max=torch.finfo(torch.float16).max) + scale.copy_(res) + return scale + + +@torch.no_grad() +def amax_history_to_scale( + amax_history: torch.Tensor, + float8_dtype: torch.Tensor, + orig_dtype: torch.dtype, + history_to_scale_fn_type: Literal["max"], +): + """Takes in a history of amax values and returns a scale tensor. + Args: + amax_history: A tensor containing the history of amax values. + float8_dtype: The float8 dtype. + orig_dtype: The original dtype of the tensor. + history_to_scale_fn_type: The type of function to use to convert the history to a scale. + """ + if history_to_scale_fn_type == "max": + amax = torch.max(amax_history) + return amax_to_scale(amax, float8_dtype, orig_dtype) + raise NotImplementedError() + + +@torch.no_grad() +def amax_history_to_scale_stack( + amax_history: torch.Tensor, + float8_dtype: torch.dtype, + orig_dtype: torch.dtype, + history_to_scale_fn_type: Literal["max"], +) -> torch.Tensor: + """Takes in a stack of amax_history tensors and returns a scale tensor. + Args: + amax_history: A 2D tensor containing a stack of amax histories. + float8_dtype: The float8 dtype. + orig_dtype: The original dtype of the tensor. + history_to_scale_fn_type: The type of function to use to convert the history to a scale. + """ + if history_to_scale_fn_type == "max": + amax_stack = torch.max(amax_history, dim=1).values + return amax_to_scale(amax_stack, float8_dtype, orig_dtype) + raise NotImplementedError( + f"Invalid history_to_scale_fn_type, only 'max' is supported. Got: {history_to_scale_fn_type}" + ) + + +@torch.no_grad() +def tensor_to_amax(x: torch.Tensor, reduce_amax: bool = False) -> torch.Tensor: + amax = torch.max(torch.abs(x)) + + # If the user asked for distributed reduction, do it. + # If the user did not ask for it, assume that it will + # happen elsewhere. 
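A quick numeric sketch of the amax-to-scale convention and the saturating cast defined in this file (not part of the patch; CPU-only, values chosen for illustration).

```python
# Sketch: the scale maps the observed amax to the fp8 dtype's max value;
# values outside the observed range saturate rather than overflow.
import torch
from torchao.float8.float8_utils import amax_to_scale, to_fp8_saturated

amax = torch.tensor(3.0)
scale = amax_to_scale(amax, torch.float8_e4m3fn, torch.float32)
print(scale)  # ~448 / 3 = ~149.33

x = torch.tensor([3.0, -1.5, 10.0])  # 10.0 is outside the observed amax
x_fp8 = to_fp8_saturated(x * scale, torch.float8_e4m3fn)
# 10.0 saturates to the fp8 max, i.e. ~3.0 after unscaling
print(x_fp8.to(torch.float32) / scale)
```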
+ if reduce_amax and dist.is_initialized(): + dist.all_reduce(amax, op=dist.ReduceOp.MAX) + + return amax + + +@torch.no_grad() +def tensor_to_scale( + x: torch.Tensor, float8_dtype: torch.dtype, reduce_amax: bool = False +) -> torch.Tensor: + amax = tensor_to_amax(x, reduce_amax=reduce_amax) + return amax_to_scale(amax, float8_dtype, x.dtype) + + +def to_fp8_saturated(x: torch.Tensor, float8_dtype: torch.dtype): + """Converts a tensor to a saturated fp8 tensor. + + Note: + The default behavior in PyTorch for casting to `float8_e4m3fn` + and `e5m2` is to not saturate. In this context, we should saturate. + A common case where we want to saturate is when the history of a + tensor has a maximum value of `amax1`, and the current amax value + is `amax2`, where `amax1 < amax2`. This is common when using delayed + scaling. + """ + if float8_dtype in FP8_TYPES: + max_value = torch.finfo(float8_dtype).max + x = x.clamp(min=-max_value, max=max_value) + return x.to(float8_dtype) + else: + raise ValueError(f"Unsupported float8_dtype: {float8_dtype}") + + +def compute_error(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + """Computes the error between two tensors in dB. + + For more details see: + https://en.wikipedia.org/wiki/Signal-to-noise_ratio + + Args: + x: The original tensor. + y: The tensor to compare to the original tensor. + """ + Ps = torch.norm(x) + Pn = torch.norm(x - y) + return 20 * torch.log10(Ps / Pn) + + +def fp8_tensor_statistics( + tensor: torch.Tensor, float8_dtype=e4m3_dtype +) -> Tuple[int, ...]: + """Calculate FP8 tensor stats + + Args: + tensor: The tensor to calculate stats for. + float8_dtype: The float8 dtype. + + Returns: + A tuple containing the number of zeros and the number of max values. + """ + if float8_dtype in FP8_TYPES: + FP8_MAX = torch.finfo(float8_dtype).max + else: + raise ValueError(f"Unsupported float8_dtype: {float8_dtype}") + tensor_orig_type = tensor._data.to(dtype=tensor._orig_dtype) + num_max = (torch.abs(tensor_orig_type) == FP8_MAX).sum().item() + num_zero = (tensor_orig_type == 0).sum().item() + return (num_zero, num_max) + + +def is_row_major(stride): + assert len(stride) == 2, "is_row_major only supports 2D tensors" + return stride[0] > stride[1] and stride[1] == 1 + + +def _get_min_alignment(size: int, alignment_value: int) -> int: + """ + Returns the minimum alignment value that is greater than or equal to the given size. + + Args: + size: The size of the data to be aligned. + alignment_value: The alignment value to be used. + + Returns: + int: The minimum alignment value that is greater than or equal to the given size. + + Usage: + ``` + >>> _get_min_alignment(10, 8) + 16 + ``` + """ + if size % alignment_value == 0: + return size + return (1 + (size // alignment_value)) * alignment_value + + +def pad_tensor_for_matmul( + tensor: torch.Tensor, dims: Union[int, Iterable[int]] +) -> torch.Tensor: + """ + Pads a 2D tensor with zeros to ensure that its dimensions are multiples of 16, which is required `torch._scaled_mm` + + Args: + tensor: The tensor to pad. + both: Whether to pad both dimensions or just the second dimension. + + Returns: + torch.Tensor: The padded tensor. 
+ + Usage: + ``` + >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=0).shape + torch.Size([16, 10]) + >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=1).shape + torch.Size([10, 16]) + >>> pad_tensor_for_matmul(torch.randn((10, 10)), dims=(0, 1)).shape + torch.Size([16, 16]) + ``` + """ + assert tensor.dim() == 2 + dim1, dim2 = tensor.shape + + if isinstance(dims, int): + dims = (dims,) + + # Calculate aligned dimensions based on the specified dims + dim1_aligned = _get_min_alignment(dim1, 16) if 0 in dims else dim1 + dim2_aligned = _get_min_alignment(dim2, 16) if 1 in dims else dim2 + + # Check if padding is needed for either dimension + if dim1 == dim1_aligned and dim2 == dim2_aligned: + return tensor + + # Calculate padding values for both dimensions + pad_dim1 = dim1_aligned - dim1 + pad_dim2 = dim2_aligned - dim2 + + return torch.nn.functional.pad(tensor, (0, pad_dim2, 0, pad_dim1)) diff --git a/torchao/float8/fsdp_utils.py b/torchao/float8/fsdp_utils.py new file mode 100644 index 000000000..5f53f5d82 --- /dev/null +++ b/torchao/float8/fsdp_utils.py @@ -0,0 +1,388 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Any, List, Optional, Tuple + +import torch +import torch.nn as nn +import torch.utils._pytree as pytree +from torchao.float8.float8_scaling_utils import ( + hp_tensor_to_float8_delayed, + hp_tensor_to_float8_dynamic, +) + +from torchao.float8.float8_tensor import ( + Float8Tensor, + GemmInputRole, + hp_tensor_and_scale_to_float8, + LinearMMConfig, +) + +from torchao.float8.float8_utils import e4m3_dtype, EPS +from torch._prims_common import suggest_memory_format + + +@torch.no_grad() +def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None: + """ + Calculate scale dynamically for all float8 parameters. + This should be run after the optimizer step. It performs a single all-reduce to compute the + scales for all float8 weights. 
+ Example usage: + model(input).sum().backward() + optim.step() + precompute_float8_dynamic_scale_for_fsdp(model) + """ + from torchao.float8.config import ScalingType + from torchao.float8.float8_linear import Float8Linear + from torch.distributed._tensor import DTensor + + if any( + isinstance(m, Float8Linear) and m.scaling_type_weight is ScalingType.DELAYED + for m in module.modules() + ): + raise NotImplementedError("Only supports delayed scaling") + float8_linears: List[Float8Linear] = [ + m + for m in module.modules() + if isinstance(m, Float8Linear) + and isinstance(m.weight, DTensor) + and isinstance(m.weight._local_tensor, WeightWithDynamicFloat8CastTensor) + ] + weights: List[DTensor] = [float8_linear.weight for float8_linear in float8_linears] + + if not weights: + return + + # inf-norm is equivalent to max(abs(w)) + max_weights = torch._foreach_norm(weights, ord=math.inf) # Partial + amax_tensor = torch.stack(max_weights) # Partial + # clamp is dispatched through DTensor + # it will issue a single all-reduce + amax_tensor = torch.clamp(amax_tensor, EPS) # Replicate + scale_tensor = torch.finfo(torch.float8_e4m3fn).max / amax_tensor # Replicate + if amax_tensor.dtype is torch.float16: + scale_tensor = torch.clamp(scale_tensor, max=torch.finfo(torch.float16).max) + local_scale_tensor = scale_tensor.to_local() + for i, float8_linear in enumerate(float8_linears): + float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i] + + +# FSDP pads its local tensor on dim-0. The subclass should be preserved such +# that the padded local tensor (and any transformations like copying to GPU) +# is of the subclass as well. +_ops_to_preserve_subclass = { + torch.ops.aten.empty_like.default, + torch.ops.aten.new_zeros.default, + torch.ops.aten.slice.Tensor, + torch.ops.aten.copy_.default, + torch.ops.aten.view.default, + torch.ops.aten.as_strided.default, + torch.ops.aten._to_copy.default, + torch.ops.aten._pin_memory.default, +} + + +class WeightWithDynamicFloat8CastTensor(torch.Tensor): + @staticmethod + def __new__( + cls, + tensor: torch.Tensor, + linear_mm_config: LinearMMConfig, + precomputed_scale: Optional[torch.Tensor] = None, + ): + return torch.Tensor._make_wrapper_subclass( + cls, + tensor.size(), + strides=tensor.stride(), + storage_offset=tensor.storage_offset(), + memory_format=suggest_memory_format(tensor), + dtype=tensor.dtype, + layout=tensor.layout, + device=tensor.device, + pin_memory=tensor.is_pinned(), + requires_grad=tensor.requires_grad, + ) + + def __init__( + self, + tensor: torch.Tensor, + linear_mm_config: LinearMMConfig, + precomputed_scale: Optional[torch.Tensor] = None, + ): + self._tensor = tensor + self._linear_mm_config = linear_mm_config + # for dynamic scaling + # `precompute_float8_dynamic_scale_for_fsdp` calculates scales + # for all float8 parameters after optimizer step + self._precomputed_scale = precomputed_scale + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs=None): + if func == torch.ops.aten.detach.default: + return WeightWithDynamicFloat8CastTensor( + args[0]._tensor, args[0]._linear_mm_config + ) + mm_config: Optional[LinearMMConfig] = None + + def unwrap(t): + nonlocal mm_config + if mm_config is None: + mm_config = t._linear_mm_config + else: + assert t._linear_mm_config == mm_config + return t._tensor + + args, kwargs = pytree.tree_map_only( + WeightWithDynamicFloat8CastTensor, unwrap, (args, kwargs or {}) + ) + out = func(*args, **kwargs) + if func not in _ops_to_preserve_subclass: + return out + return 
pytree.tree_map_only( + torch.Tensor, lambda x: WeightWithDynamicFloat8CastTensor(x, mm_config), out + ) + + def __tensor_flatten__(self): + if self._precomputed_scale: + return ["_tensor", "_precomputed_scale"], self._linear_mm_config + else: + return ["_tensor"], self._linear_mm_config + + @staticmethod + def __tensor_unflatten__(inner_tensors, flatten_spec, outer_size, outer_stride): + mm_config = flatten_spec + return WeightWithDynamicFloat8CastTensor( + inner_tensors["_tensor"], + mm_config, + getattr(inner_tensors, "_precomputed_scale", None), + ) + + def __repr__(self): + return f"WeightWithDynamicFloat8CastTensor(tensor={self._tensor}, linear_mm_config={self._linear_mm_config})" + + def fsdp_pre_all_gather(self, mesh): + if self._precomputed_scale is not None: + float8_tensor = hp_tensor_and_scale_to_float8( + self._tensor, + self._precomputed_scale, + torch.float8_e4m3fn, + self._linear_mm_config, + GemmInputRole.WEIGHT, + ) + else: + float8_tensor = hp_tensor_to_float8_dynamic( + self._tensor, + e4m3_dtype, + self._linear_mm_config, + reduce_amax=True, + gemm_input_role=GemmInputRole.WEIGHT, + ) + return (float8_tensor._data,), (float8_tensor._scale,) + + def fsdp_post_all_gather( + self, + all_gather_outputs: Tuple[torch.Tensor, ...], + metadata: Any, + param_dtype: torch.dtype, + *, + out: Optional[torch.Tensor] = None, + ): + (data,) = all_gather_outputs + (scale,) = metadata + if out is not None: + assert isinstance(out, Float8Tensor), f"{type(out)}" + out._scale = scale + return + return Float8Tensor( + data, + scale, + param_dtype, + self._linear_mm_config, + gemm_input_role=GemmInputRole.WEIGHT, + ), (data,) + + +class WeightWithDelayedFloat8CastTensor(torch.Tensor): + @staticmethod + def __new__( + cls, + tensor: torch.Tensor, + amax_buffer: torch.Tensor, + amax_history_buffer: torch.Tensor, + scale_buffer: torch.Tensor, + linear_mm_config: LinearMMConfig, + is_amax_initialized: bool, + ): + return torch.Tensor._make_wrapper_subclass( + cls, + tensor.size(), + strides=tensor.stride(), + storage_offset=tensor.storage_offset(), + memory_format=suggest_memory_format(tensor), + dtype=tensor.dtype, + layout=tensor.layout, + device=tensor.device, + pin_memory=tensor.is_pinned(), + requires_grad=tensor.requires_grad, + ) + + def __init__( + self, + tensor: torch.Tensor, + amax_buffer: torch.Tensor, + amax_history_buffer: torch.Tensor, + scale_buffer: torch.Tensor, + linear_mm_config: LinearMMConfig, + is_amax_initialized: bool, + ): + self._tensor = tensor + self._amax_buffer = amax_buffer + self._amax_history_buffer = amax_history_buffer + self._scale_buffer = scale_buffer + self._linear_mm_config = linear_mm_config + + # Note: is_amax_initialized is not a buffer to avoid data dependent + # control flow visible to dynamo + # TODO(future PR): add serialization for this flag + self.is_amax_initialized = is_amax_initialized + + @classmethod + def __torch_dispatch__(cls, func, types, args, kwargs=None): + if func == torch.ops.aten.detach.default: + return WeightWithDelayedFloat8CastTensor( + args[0]._tensor, + args[0]._amax_buffer, + args[0]._amax_history_buffer, + args[0]._scale_buffer, + args[0]._linear_mm_config, + args[0].is_amax_initialized, + ) + mm_config: Optional[LinearMMConfig] = None + amax_buffer: Optional[torch.Tensor] = None + amax_history_buffer: Optional[torch.Tensor] = None + scale_buffer: Optional[torch.Tensor] = None + is_amax_initialized: Optional[bool] = None + + def unwrap(t): + nonlocal mm_config + if mm_config is None: + mm_config = t._linear_mm_config + 
else: + assert t._linear_mm_config == mm_config + nonlocal amax_buffer + if amax_buffer is None: + amax_buffer = t._amax_buffer + nonlocal amax_history_buffer + if amax_history_buffer is None: + amax_history_buffer = t._amax_history_buffer + nonlocal scale_buffer + if scale_buffer is None: + scale_buffer = t._scale_buffer + nonlocal is_amax_initialized + if is_amax_initialized is None: + is_amax_initialized = t.is_amax_initialized + return t._tensor + + args, kwargs = pytree.tree_map_only( + WeightWithDelayedFloat8CastTensor, unwrap, (args, kwargs or {}) + ) + out = func(*args, **kwargs) + if func not in _ops_to_preserve_subclass: + return out + return pytree.tree_map_only( + torch.Tensor, + lambda x: WeightWithDelayedFloat8CastTensor( + x, + amax_buffer, + amax_history_buffer, + scale_buffer, + mm_config, + is_amax_initialized, + ), + out, + ) + + def __tensor_flatten__(self): + return ( + [ + "_tensor", + "_amax_buffer", + "_amax_history_buffer", + "_scale_buffer", + ], + { + "mm_config": self._linear_mm_config, + "is_amax_initialized": self.is_amax_initialized, + }, + ) + + @staticmethod + def __tensor_unflatten__(inner_tensors, metadata, outer_size, outer_stride): + return WeightWithDelayedFloat8CastTensor( + inner_tensors["_tensor"], + inner_tensors["_amax_buffer"], + inner_tensors["_amax_history_buffer"], + inner_tensors["_scale_buffer"], + metadata["mm_config"], + metadata["is_amax_initialized"], + ) + + def __repr__(self): + return f"WeightWithDelayedFloat8CastTensor(tensor={self._tensor}, amax_buffer={self._amax_buffer}, scale_buffer={self._scale_buffer}, mm_config={self._linear_mm_config})" + + def fsdp_pre_all_gather(self, mesh): + # initialize if needed + # TODO(before land): ensure settings are consistent between Float8Linear and here + if not self.is_amax_initialized: + from torchao.float8.float8_linear import ( + _maybe_initialize_amaxes_scales_for_float8_cast, + ) + + _maybe_initialize_amaxes_scales_for_float8_cast( + self._tensor, + self._amax_buffer, + self._amax_history_buffer, + self._scale_buffer, + "max", # TODO(before land): read this from parent + e4m3_dtype, + self.is_amax_initialized, + reduce_amax=True, + ) + self.is_amax_initialized = True + + float8_tensor = hp_tensor_to_float8_delayed( + self._tensor, + self._scale_buffer, + e4m3_dtype, + self._amax_buffer, + self._linear_mm_config, + GemmInputRole.WEIGHT, + ) + return (float8_tensor._data,), (float8_tensor._scale,) + + def fsdp_post_all_gather( + self, + all_gather_outputs: Tuple[torch.Tensor, ...], + metadata: Any, + param_dtype: torch.dtype, + *, + out: Optional[torch.Tensor] = None, + ): + (data,) = all_gather_outputs + (scale,) = metadata + if out is not None: + assert isinstance(out, Float8Tensor), f"{type(out)}" + out._scale = scale + return + return Float8Tensor( + data, + scale, + param_dtype, + self._linear_mm_config, + gemm_input_role=GemmInputRole.WEIGHT, + ), (data,) diff --git a/torchao/float8/inference.py b/torchao/float8/inference.py new file mode 100644 index 000000000..f5c504503 --- /dev/null +++ b/torchao/float8/inference.py @@ -0,0 +1,244 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
+""" +Defines an nn module designed to be used during inference +""" + +from dataclasses import dataclass + +from enum import auto, Enum +from typing import Callable, List, Optional + +import torch +import torch.nn as nn +from torchao.float8.float8_linear_utils import swap_linear_layers + +from torchao.float8.float8_tensor import ( + Float8Tensor, + GemmInputRole, + hp_tensor_and_scale_to_float8, + LinearMMConfig, + ScaledMMConfig, + tensor_already_casted_to_fp8, +) +from torchao.float8.float8_utils import e4m3_dtype, tensor_to_scale + + +class ActivationCasting(Enum): + """Types of quantization to perform on the activations + + WEIGHT_ONLY: Only quantize the weight, no activation casting, weight will be dequantized in the forward pass + STATIC: Activation is quantized during model initialization with a static scale + DYNAMIC: Activation is quantized during forward pass with a dynamic scale calculated from the input activation + """ + + # TODO: A better name would be NONE, we should unify this with torchao + WEIGHT_ONLY = auto() + DYNAMIC = auto() + STATIC = auto() + + +@dataclass(frozen=True) +class QuantConfig: + """Defines the configuration for the quantization to fp8 of a linear module + + Args: + activation_casting: The type of quantization to perform on the activations + static_quantization_scale: The scale of the input to this linear module, used for static quantization only + """ + + activation_casting: ActivationCasting + static_quantization_scale: Optional[torch.Tensor] = None + + # If True, then prior to performing the fp8 scaled mamtmul we will pad the + # inner dimension of a (dim 1) and b (dim 2) with 0s. This is needed for matmuls + # _scaled_mm since it has the strong constraint that for M,N,K N, K must be a multiple of 16. + # This can cause a memory spike however so we keep this off by default. + pad_inner_dim = False + + def __post_init__(self): + if self.activation_casting == ActivationCasting.STATIC: + assert isinstance( + self.static_quantization_scale, torch.Tensor + ), "When activation_casting is 'static', activation_scale must be a tensor." 
+
+
+class Float8InferenceLinear(torch.nn.Linear):
+    """
+    This is a wrapper around torch.nn.Linear that supports FP8 inference.
+    Supported forms of inference:
+        - FP8 inference with high precision matmul - weight only
+        - FP8 inference with fp8 matmul and dynamic activation casting
+        - FP8 inference with fp8 matmul and static activation casting
+    """
+
+    def __init__(
+        self,
+        # FP8 specific arguments
+        quant_config: QuantConfig,
+        linear_mm_config: LinearMMConfig,
+        # nn.Linear arguments
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        device: Optional[torch.device] = None,
+        dtype: Optional[torch.dtype] = None,
+    ) -> None:
+        # Construct the superclass; this will create dummy weights and biases
+        super().__init__(in_features, out_features, bias, device, dtype)
+        self.linear_mm_config = linear_mm_config
+        self.activation_casting = quant_config.activation_casting
+        if self.activation_casting == ActivationCasting.STATIC:
+            self.register_buffer(
+                "static_quantization_scale", quant_config.static_quantization_scale
+            )
+        else:
+            self.static_quantization_scale = None
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if self.activation_casting == ActivationCasting.WEIGHT_ONLY:
+            return torch.nn.functional.linear(
+                input, self.weight.to_original_precision(), self.bias
+            )
+
+        x_fp8 = cast_to_float8_e4m3_inference(
+            input,
+            self.linear_mm_config,
+            static_quantization_scale=self.static_quantization_scale,
+        )
+        return torch.nn.functional.linear(x_fp8, self.weight, self.bias)
+
+    # Builder functions for Float8InferenceLinear
+    def quantize_weight(self, dtype: torch.dtype = e4m3_dtype) -> None:
+        """This function converts the weight to a Float8Tensor and sets its requires_grad to False.
+
+        Args:
+            dtype: The dtype to quantize the weight to. Default is e4m3_dtype.
+
+        Note:
+            This function is typically called during inference to quantize the weight once since
+            the weight is not updated during inference.
+
+        """
+        assert not isinstance(
+            self.weight, Float8Tensor
+        ), "Weight has already been quantized, cannot quantize again."
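+        # A single per-tensor scale is derived from the weight's current amax;
+        # the high-precision weight is then cast to float8 once and frozen
+        # (requires_grad=False), since it is not updated during inference.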
+        scale = tensor_to_scale(self.weight, dtype)
+        quantized_weight = hp_tensor_and_scale_to_float8(
+            self.weight,
+            scale,
+            dtype,
+            self.linear_mm_config,
+            GemmInputRole.WEIGHT,
+        )
+        self.weight = nn.Parameter(quantized_weight)
+        self.weight.requires_grad = False
+
+    def set_weight_and_bias(
+        self, weight: torch.nn.Parameter, bias: Optional[torch.nn.Parameter]
+    ):
+        self.weight = weight
+        self.bias = bias
+
+    @classmethod
+    def from_float(
+        cls, module: nn.Module, quant_config: QuantConfig, use_fast_accum: bool
+    ) -> "Float8InferenceLinear":
+        """
+        Create an nn.Linear with fp8 compute from another nn.Linear
+
+        Args:
+            module (torch.nn.Linear): nn.Linear to convert
+            quant_config (QuantConfig): Configuration for the weight and activation casting
+            use_fast_accum (bool): Whether to enable fast accumulation for the fp8 matmul
+        """
+        forward_config = ScaledMMConfig(
+            False, use_fast_accum, pad_inner_dim=quant_config.pad_inner_dim
+        )
+        linear_mm_config = LinearMMConfig(
+            forward_config, forward_config, forward_config
+        )
+        linear = cls(
+            quant_config,
+            linear_mm_config,
+            module.in_features,
+            module.out_features,
+            False,
+            device=torch.device("meta"),
+        )
+        linear.set_weight_and_bias(module.weight, module.bias)
+        linear.quantize_weight()
+        return linear
+
+
+def cast_to_float8_e4m3_inference(
+    inpt_tensor: torch.Tensor,
+    linear_mm_config: LinearMMConfig,
+    reduce_amax: bool = False,
+    static_quantization_scale: Optional[torch.Tensor] = None,
+) -> Float8Tensor:
+    """Casts an input tensor to Float8 (e4m3fn).
+
+    Args:
+        inpt_tensor: The input tensor to be cast.
+        linear_mm_config: Configuration settings for the matrix multiplication.
+        reduce_amax: Whether to reduce the amax (absolute maximum) among the local distributed group.
+        static_quantization_scale: Optional tensor specifying the scale for the activation. Default is None.
+
+    Returns:
+        Float8Tensor: The input tensor cast to Float8 (e4m3fn) format.
+
+    Note:
+        If the input tensor is already in Float8 format, it is returned as is without re-casting.
+    """
+    if tensor_already_casted_to_fp8(inpt_tensor):
+        return inpt_tensor
+    scale = (
+        static_quantization_scale
+        if static_quantization_scale is not None
+        else tensor_to_scale(inpt_tensor, e4m3_dtype, reduce_amax)
+    )
+    return hp_tensor_and_scale_to_float8(
+        inpt_tensor,
+        scale,
+        e4m3_dtype,
+        linear_mm_config,
+        GemmInputRole.INPUT,
+    )
+
+
+def quantize_to_float8(
+    module: nn.Module,
+    quant_config: QuantConfig,
+    *,
+    module_filter_fn: Optional[Callable[[nn.Module, str], bool]] = None,
+    use_fast_accum: bool = True,
+) -> nn.Module:
+    """
+    Converts torch.nn.Linear layers in the given module to Float8InferenceLinear.
+
+    Note:
+        If applied to a root-level nn.Linear, the module is not modified in place;
+        the converted module is returned instead.
+
+    Args:
+        module (nn.Module): The module to modify.
+        quant_config (QuantConfig): Quantization configuration for Float8 conversion.
+        module_filter_fn: If specified, only the `torch.nn.Linear` modules that
+            pass the filter function will be swapped. The inputs to the
+            filter function are the module instance and the FQN.
+        use_fast_accum: Whether to enable fast accumulation for the Float8InferenceLinear. Defaults to True.
+
+    Returns:
+        nn.Module: The modified module with applicable Linear layers converted to Float8.
+
+    Raises:
+        AssertionError: If a root-level nn.Linear with children is encountered.
+    """
+    return swap_linear_layers(
+        module,
+        lambda m: Float8InferenceLinear.from_float(m, quant_config, use_fast_accum),
+        module_filter_fn=module_filter_fn,
+    )
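
As a usage sketch for reviewers (not part of the patch): the inference entry point above can be exercised roughly as follows, assuming a float8-capable GPU; the toy model, shapes, and dtype are illustrative only.

```
import torch
import torch.nn as nn

from torchao.float8.inference import (
    ActivationCasting,
    QuantConfig,
    quantize_to_float8,
)

# Toy two-layer MLP; inner dims are multiples of 16 to satisfy _scaled_mm.
model = nn.Sequential(
    nn.Linear(1024, 4096, bias=False),
    nn.GELU(),
    nn.Linear(4096, 1024, bias=False),
).to(device="cuda", dtype=torch.bfloat16)

# Dynamic activation casting: weights are quantized once at conversion time,
# activation scales are computed on the fly in each forward pass.
model = quantize_to_float8(model, QuantConfig(ActivationCasting.DYNAMIC))

with torch.inference_mode():
    out = model(torch.randn(32, 1024, device="cuda", dtype=torch.bfloat16))
```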