Commit 624e014

Update
[ghstack-poisoned]

2 parents cabf5b4 + 4820195

11 files changed: +258 -161 lines

.github/workflows/integration_test_4gpu.yaml

Lines changed: 1 addition & 1 deletion
@@ -39,6 +39,6 @@ jobs:
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
         python -m pip install --pre torchdata --index-url https://download.pytorch.org/whl/nightly/
-        python -m pip install git+https://github.com/pytorch-labs/float8_experimental.git
+        USE_CPP=0 python -m pip install git+https://github.com/pytorch/ao.git
         mkdir artifacts-to-be-uploaded
         python ./test_runner.py artifacts-to-be-uploaded --ngpu 4

estimation.py

Lines changed: 17 additions & 11 deletions
@@ -14,17 +14,19 @@
 from torch._subclasses.fake_tensor import FakeTensorMode
 from torch.distributed import destroy_process_group
 from torch.distributed._tools.fsdp2_mem_tracker import FSDPMemTracker
-from torch.distributed.tensor.parallel import loss_parallel
 from torch.testing._internal.distributed.fake_pg import FakeStore

 from torchtitan.config_manager import JobConfig
 from torchtitan.datasets import create_tokenizer
-from torchtitan.float8_linear import build_fp8_linear
+from torchtitan.float8_linear import (
+    maybe_build_fp8_linear,
+    maybe_precompute_fp8_dynamic_scale_for_fsdp,
+)
 from torchtitan.logging_utils import init_logger, logger
 from torchtitan.lr_scheduling import get_lr_schedulers
 from torchtitan.models import model_name_to_cls, model_name_to_tokenizer, models_config
 from torchtitan.parallelisms import models_parallelize_fns, ParallelDims
-from train import build_optimizers
+from train import build_optimizers, get_train_context


 def estimate_memory(job_config: JobConfig):

@@ -61,9 +63,10 @@ def estimate_memory(job_config: JobConfig):
         logger.info("Compiled RMSNorm is not supported yet. Switching to RMSNorm.")
         job_config.model.norm_type = "rmsnorm"

-    if job_config.training.compile:
+    if job_config.training.compile or job_config.experimental.enable_compiled_autograd:
         logger.info("Compile mode is not supported yet. Switching to eager mode.")
         job_config.training.compile = False
+        job_config.experimental.enable_compiled_autograd = False

     parallel_dims = ParallelDims(
         dp=job_config.training.data_parallel_degree,

@@ -97,9 +100,9 @@ def estimate_memory(job_config: JobConfig):
     tokenizer_type = model_name_to_tokenizer[model_name]
     tokenizer = create_tokenizer(tokenizer_type, job_config.model.tokenizer_path)

-    # loss_parallel enables dispatching to efficient loss operators
-    loss_parallel_ctx = (
-        loss_parallel if parallel_dims.loss_parallel_enabled else contextlib.nullcontext
+    train_context = get_train_context(
+        parallel_dims.loss_parallel_enabled,
+        job_config.experimental.enable_compiled_autograd,
     )

     # loss fn can be shared by pipeline-parallel or non-pp execution

@@ -125,9 +128,8 @@ def loss_fn(pred, labels):
     with torch.device("meta"):
         whole_model = model_cls.from_model_args(model_config)

-    # apply fp8 linear module swap
-    if job_config.training.enable_fp8_linear:
-        build_fp8_linear(whole_model, job_config, parallel_dims.dp_enabled)
+    # swap to Float8Linear based on fp8 config
+    maybe_build_fp8_linear(whole_model, job_config, parallel_dims.dp_enabled)

     # apply PT-D DP/TP parallelisms and activation checkpointing
     model_parts = [whole_model]

@@ -172,7 +174,7 @@ def loss_fn(pred, labels):
    for iter_idx in range(2):
        input_ids, labels = batch
        # train step
-       with loss_parallel_ctx():
+       with train_context():
            pred = whole_model(input_ids)
            loss = loss_fn(pred, labels)
            del pred

@@ -186,6 +188,10 @@ def loss_fn(pred, labels):
        # optimizer step
        optimizers.step()
        lr_schedulers.step()
+       # when the fp8 config is on,
+       # calculate float8 dynamic amax/scale for all parameters for FSDP2;
+       # it issues a single all-reduce for all parameters at once for better performance
+       maybe_precompute_fp8_dynamic_scale_for_fsdp(whole_model, job_config)
        optimizers.zero_grad()
        print(f"Peak Memory at iter: {iter_idx}")
        fsdp_memtracker.display_snapshot("peak", units="MiB", tabulate=True)
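
For context, the `train_context()` used above comes from `get_train_context` in train.py, whose hunk is not part of this view. Below is a minimal sketch of what such a factory can look like, assuming only the two-argument signature visible at the call site; the compiled-autograd toggle is left as a placeholder because its exact API is not shown in this diff.

import contextlib

from torch.distributed.tensor.parallel import loss_parallel


def get_train_context(enable_loss_parallel: bool, enable_compiled_autograd: bool):
    """Sketch: build a context manager that stacks the optional training contexts."""

    @contextlib.contextmanager
    def context():
        with contextlib.ExitStack() as stack:
            if enable_loss_parallel:
                # dispatch to efficient sharded loss operators under tensor parallelism
                stack.enter_context(loss_parallel())
            if enable_compiled_autograd:
                # placeholder: enter the compiled-autograd context here
                pass
            yield

    return context

With this shape, `with train_context():` behaves like the old `loss_parallel_ctx()` when only loss parallelism is enabled, and the extra toggle slots in without another nested `with`.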

test_runner.py

Lines changed: 17 additions & 33 deletions
@@ -46,6 +46,21 @@ def build_test_list():
     """
     integration_tests_flavors = defaultdict(list)
     integration_tests_flavors["debug_model.toml"] = [
+        OverrideDefinitions(
+            [
+                [
+                    "--checkpoint.enable_checkpoint",
+                    "--experimental.pipeline_parallel_degree 4",
+                    "--experimental.pipeline_parallel_split_points layers.1,layers.2,layers.3,layers.4,layers.5,layers.6,layers.7",
+                    "--experimental.pipeline_parallel_schedule flexible_interleaved_1f1b",
+                    "--model.norm_type rmsnorm",  # fused_rmsnorm throws cuda context error with pp
+                ],
+            ],
+            "PP looped flexible 1f1b test",
+            "pp_looped_flexible_1f1b",
+            requires_seed_checkpoint=True,
+            ngpu=4,
+        ),
         OverrideDefinitions(
             [
                 [

@@ -273,39 +288,6 @@ def build_test_list():
             "fsdp2_mem_tracker",
             ngpu=4,
         ),
-        OverrideDefinitions(
-            [
-                [
-                    <<<<<<< HEAD
-                    "--training.enable_float8_linear",
-                ]
-            ],
-            "FSDP2 with original dtype",
-            "float8_fsdp2_orig_all_gather",
-            ngpu=4,
-        ),
-        OverrideDefinitions(
-            [
-                [
-                    "--training.enable_float8_linear",
-                    "--training.enable_fsdp_float8_all_gather",
-                ]
-            ],
-            "FSDP2 with float8 all-gather",
-            "fsdp2_float8_all_gather",
-            ngpu=4,
-        ),
-        OverrideDefinitions(
-            [
-                [
-                    "--training.enable_float8_linear",
-                    "--training.enable_fsdp_float8_all_gather",
-                    "--training.precompute_float8_dynamic_scale_for_fsdp",
-                ]
-            ],
-            "FSDP2 with float8 all-gather and precomputed dynamic scales",
-            "fsdp2_float8_all_gather_precompute_dynamic_scales",
-        ),
         OverrideDefinitions(
             [
                 [

@@ -347,6 +329,8 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):

     for override_arg in test_flavor.override_args:
         cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_llama_train.sh"
+        if test_name == "fsdp2_mem_tracker":
+            cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_memory_estimation.sh"
         cmd += " " + dump_folder_arg
         cmd += " " + model_flavor_arg
         if override_arg:
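
For reference, here is a small illustrative snippet of the command that run_test assembles for the new pipeline-parallel flavor, mirroring the f-string logic above; the config path and rank list are assumed values, not taken from this diff.

# Sketch: how run_test expands the new PP test entry into a launch command.
full_path = "./train_configs/debug_model.toml"  # assumed location of the debug config
ngpu, all_ranks = 4, "0,1,2,3"                  # assumed LOG_RANK formatting

override_args = [
    "--checkpoint.enable_checkpoint",
    "--experimental.pipeline_parallel_degree 4",
    "--experimental.pipeline_parallel_split_points layers.1,layers.2,layers.3,layers.4,layers.5,layers.6,layers.7",
    "--experimental.pipeline_parallel_schedule flexible_interleaved_1f1b",
    "--model.norm_type rmsnorm",
]

cmd = f"CONFIG_FILE={full_path} NGPU={ngpu} LOG_RANK={all_ranks} ./run_llama_train.sh"
cmd += " " + " ".join(override_args)
print(cmd)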

torchtitan/config_manager.py

Lines changed: 23 additions & 5 deletions
@@ -275,7 +275,7 @@ def __init__(self):
         self.parser.add_argument(
             "--experimental.pipeline_parallel_schedule",
             type=str,
-            choices=["1f1b", "gpipe", "interleaved_1f1b"],
+            choices=["1f1b", "gpipe", "interleaved_1f1b", "flexible_interleaved_1f1b"],
             default="1f1b",
             help="""
                 Specify the Pipeline Parallel schedule to use.

@@ -358,10 +358,9 @@ def __init__(self):
             "--training.enable_float8_linear",
             action="store_true",
             help="""
-                If true, swaps `torch.nn.Linear` with `Float8Linear` with
-                default settings (dynamic scaling).
-                This feature requires you to install 'float8_experimental' which can be found
-                here: https://github.com/pytorch-labs/float8_experimental
+                If true, swaps `torch.nn.Linear` with `Float8Linear`.
+                This feature requires you to install 'torchao' which can be found
+                here: https://github.com/pytorch/ao
             """,
         )
         self.parser.add_argument(

@@ -376,6 +375,25 @@ def __init__(self):
             default=False,
             help="Whether precompute float8 scales dynamically for FSDP",
         )
+        self.parser.add_argument(
+            "--training.float8_scaling_type_input",
+            type=str,
+            default="dynamic",
+            help="float8 scaling for input, dynamic (default) or delayed",
+            choices=["dynamic", "delayed"],
+        )
+        self.parser.add_argument(
+            "--training.float8_scaling_type_weight",
+            type=str,
+            default="dynamic",
+            help="float8 scaling for weight, dynamic (default) or delayed",
+        )
+        self.parser.add_argument(
+            "--training.float8_scaling_type_grad_output",
+            type=str,
+            default="dynamic",
+            help="float8 scaling for grad_output, dynamic (default) or delayed",
+        )
         self.parser.add_argument(
             "--training.gc_freq",
             type=int,
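
A small hypothetical example of exercising the new scaling-type knobs through JobConfig; it assumes JobConfig.parse_args accepts an explicit argument list and that dotted flags surface as nested attributes, as they do elsewhere in torchtitan.

from torchtitan.config_manager import JobConfig

# Sketch: float8 linear with delayed scaling for weights, dynamic elsewhere.
config = JobConfig()
config.parse_args(
    [
        "--training.enable_float8_linear",
        "--training.float8_scaling_type_input", "dynamic",
        "--training.float8_scaling_type_weight", "delayed",
        "--training.float8_scaling_type_grad_output", "dynamic",
    ]
)
assert config.training.float8_scaling_type_weight == "delayed"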

torchtitan/float8_linear.py

Lines changed: 61 additions & 28 deletions
@@ -4,15 +4,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-# [Note] Getting the 'float8_experimental' package:
-# This script requires the 'float8_experimental' package to function correctly.
+# [Note] Getting the 'torchao' package:
+# This script requires the 'torchao' package to function correctly.
 # Please ensure you have this package installed from the appropriate repository.
-# You can obtain it from https://github.com/pytorch-labs/float8_experimental.
-# Either clone and run `pip install .` or run `pip install git+https://github.com/pytorch-labs/float8_experimental.git`
+# You can obtain it from https://github.com/pytorch/ao by following the
+# installation instructions.

 # Note: Performance
 # Float8 experimental is intended to be ran under `torch.compile`` for competitive performance
-import contextlib
 import functools
 from typing import Optional

@@ -24,20 +23,6 @@
 from torchtitan.logging_utils import logger


-@contextlib.contextmanager
-def set_enable_fsdp_float8_all_gather(enable_fsdp_fp8_all_gather: bool):
-    import float8_experimental.config as config
-
-    prev = config.enable_fsdp_fp8_all_gather
-    torch.distributed.barrier()
-    config.enable_fsdp_fp8_all_gather = enable_fsdp_fp8_all_gather
-    try:
-        yield
-    finally:
-        torch.distributed.barrier()
-        config.enable_fsdp_fp8_all_gather = prev
-
-
 @functools.lru_cache(None)
 def is_sm90_or_later():
     # Float8 is only supported on H100+ GPUs

@@ -63,25 +48,42 @@ def maybe_build_fp8_linear(
        )
        return
    try:
-        from float8_experimental.float8_linear import TensorScalingType
-        from float8_experimental.float8_linear_utils import (
-            swap_linear_with_float8_linear,
+        from torchao.float8 import (
+            CastConfig,
+            convert_to_float8_training,
+            Float8LinearConfig,
+            ScalingType,
        )

        # Mutates the model inplace replacing instances of torch.nn.Linear with Float8Linear
        enable_fsdp_float8_all_gather = (
            job_config.training.enable_fsdp_float8_all_gather and dp_enabled
        )
-        with set_enable_fsdp_float8_all_gather(enable_fsdp_float8_all_gather):
-            swap_linear_with_float8_linear(
-                model, scaling_type_w=TensorScalingType.DYNAMIC
-            )
+        scaling_type_input = ScalingType(job_config.training.float8_scaling_type_input)
+        scaling_type_weight = ScalingType(
+            job_config.training.float8_scaling_type_weight
+        )
+        scaling_type_grad_output = ScalingType(
+            job_config.training.float8_scaling_type_grad_output
+        )
+        float8_config = Float8LinearConfig(
+            enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
+            cast_config_input=CastConfig(scaling_type=scaling_type_input),
+            cast_config_weight=CastConfig(scaling_type=scaling_type_weight),
+            cast_config_grad_output=CastConfig(scaling_type=scaling_type_grad_output),
+            enable_pre_and_post_forward=False,
+        )
+        convert_to_float8_training(
+            model,
+            config=float8_config,
+            module_filter_fn=lambda mod, fqn: fqn != "output",
+        )
        logger.info(
            f"Swapped to Float8Linear layers with {enable_fsdp_float8_all_gather=}"
        )
    except ImportError as exc:
        raise ImportError(
-            "float8_experimental is not installed. Please install it to use fp8 linear layers."
+            "torchao is not installed. Please install it to use fp8 linear layers."
        ) from exc
@@ -100,6 +102,37 @@ def maybe_precompute_fp8_dynamic_scale_for_fsdp(
            "Skipped precomputing fp8 scales because SM90 or later is not available",
        )
        return
-    from float8_experimental.fsdp_utils import precompute_float8_dynamic_scale_for_fsdp
+    from torchao.float8 import precompute_float8_dynamic_scale_for_fsdp

     precompute_float8_dynamic_scale_for_fsdp(model)
+
+
+_sync_float8_amax_and_scale_history = None
+
+
+def maybe_sync_float8_amax_and_scale_history(model: nn.Module, job_config: JobConfig):
+    if not (
+        job_config.training.enable_float8_linear
+        and (
+            job_config.training.float8_scaling_type_input == "delayed"
+            or job_config.training.float8_scaling_type_weight == "delayed"
+            or job_config.training.float8_scaling_type_grad_output == "delayed"
+        )
+    ):
+        return
+
+    from torchao.float8 import sync_float8_amax_and_scale_history
+
+    # TODO(future): see if precalculating the modules to sync over is going to
+    # meaningfully help performance
+
+    global _sync_float8_amax_and_scale_history
+    if _sync_float8_amax_and_scale_history is None:
+        if job_config.training.compile:
+            _sync_float8_amax_and_scale_history = torch.compile(
+                sync_float8_amax_and_scale_history
+            )
+        else:
+            _sync_float8_amax_and_scale_history = sync_float8_amax_and_scale_history
+
+    _sync_float8_amax_and_scale_history(model)
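
The new maybe_sync_float8_amax_and_scale_history helper is intended to be called from the training loop (the train.py hunk is not shown in this section). Below is a rough sketch of the intended ordering around one optimizer step, following torchao's delayed-scaling recipe; the surrounding loop and objects are illustrative, not part of this commit.

from torchtitan.float8_linear import (
    maybe_precompute_fp8_dynamic_scale_for_fsdp,
    maybe_sync_float8_amax_and_scale_history,
)


def train_step(model, optimizer, lr_scheduler, loss_fn, batch, job_config):
    """Illustrative ordering of the float8 hooks around one optimizer step."""
    input_ids, labels = batch
    optimizer.zero_grad()
    loss = loss_fn(model(input_ids), labels)
    loss.backward()

    # Delayed scaling: sync amax/scale history across ranks after backward and
    # before the optimizer step (a no-op unless a delayed scaling type is set).
    maybe_sync_float8_amax_and_scale_history(model, job_config)

    optimizer.step()
    lr_scheduler.step()

    # Dynamic scaling with FSDP2 float8 all-gather: precompute next step's
    # scales with a single all-reduce (a no-op unless the flags are enabled).
    maybe_precompute_fp8_dynamic_scale_for_fsdp(model, job_config)
    return loss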

torchtitan/lr_scheduling.py

Lines changed: 15 additions & 15 deletions
@@ -4,43 +4,43 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import functools
+
 from torch.optim.lr_scheduler import LambdaLR
 from torchtitan.config_manager import JobConfig

-# global states for scheduling
-# these are needed as LambdaLR does not support argument passing
-_warmup_steps = 200
-_decay_steps = 0
-

-def linear_warmup_linear_decay(current_step: int) -> float:
+def linear_warmup_linear_decay(
+    warmup_steps: int, decay_steps: int, current_step: int
+) -> float:
     """Computes linear warmup followed by linear decay.
     Per LambdaLR requirement, this is accomplished by returning
     a multiplicative factor to adjust the learning rate to
     create the desired schedule.
     """
-    if current_step < _warmup_steps:
+    if current_step < warmup_steps:
         # linear warmup
         # 0-indexed step, hence + 1 adjustments
         current_step += 1
-        curr_adjustment = float(current_step / (_warmup_steps + 1))
+        curr_adjustment = float(current_step / (warmup_steps + 1))

     else:
         # linear decay
-        normalized_step = _decay_steps - (current_step - _warmup_steps)
-        curr_adjustment = 1 - (_decay_steps - normalized_step) / _decay_steps
+        normalized_step = decay_steps - (current_step - warmup_steps)
+        curr_adjustment = 1 - (decay_steps - normalized_step) / decay_steps

     return curr_adjustment


 def get_lr_schedulers(optimizers, job_config: JobConfig):
     def _get_lr_scheduler(optimizer):
         """Build a linear warmup and linear decay scheduler"""
-        global _warmup_steps, _decay_steps
-        _warmup_steps = int(job_config.training.warmup_steps)
-        _decay_steps = float(max(1, job_config.training.steps - _warmup_steps))
-
-        warmup_scheduler = LambdaLR(optimizer, lr_lambda=linear_warmup_linear_decay)
+        warmup_steps = int(job_config.training.warmup_steps)
+        decay_steps = float(max(1, job_config.training.steps - warmup_steps))
+        lr_lambda = functools.partial(
+            linear_warmup_linear_decay, warmup_steps, decay_steps
+        )
+        warmup_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)
         return warmup_scheduler

 class SchedulersContainer:
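
To make the refactor concrete, here is a minimal standalone example of the functools.partial pattern that replaces the old module-level globals; the toy model and step counts are placeholders.

import functools

import torch
from torch.optim.lr_scheduler import LambdaLR

from torchtitan.lr_scheduling import linear_warmup_linear_decay

# Toy setup: 10 warmup steps followed by 90 decay steps.
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
warmup_steps, decay_steps = 10, 90

# Bind the schedule parameters up front; LambdaLR only passes the step index.
lr_lambda = functools.partial(linear_warmup_linear_decay, warmup_steps, decay_steps)
scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda)

for _ in range(100):
    optimizer.step()
    scheduler.step()

print(optimizer.param_groups[0]["lr"])  # 0.0 at the end of the schedule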
