
Commit 4a2de42

clean up float8 configs in torchtitan (#466)
Summary:

1. standardizes on `float8` instead of `fp8` for config names
2. removes usage of non-public objects such as `Float8Linear`

Test Plan:

```
with-proxy NGPU=1 CUDA_VISIBLE_DEVICES=7 CONFIG_FILE="./train_configs/debug_model.toml" ./run_llama_train.sh --training.compile --training.enable_float8_linear
```

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent: 183390e

File tree: 10 files changed, +33 −41 lines

test_runner.py
Lines changed: 10 additions & 10 deletions

@@ -276,34 +276,34 @@ def build_test_list():
         OverrideDefinitions(
             [
                 [
-                    "--training.enable_fp8_linear",
+                    "--training.enable_float8_linear",
                 ]
             ],
             "FSDP2 with original dtype",
-            "fp8_fsdp2_orig_all_gather",
+            "float8_fsdp2_orig_all_gather",
             ngpu=4,
         ),
         OverrideDefinitions(
             [
                 [
-                    "--training.enable_fp8_linear",
-                    "--training.enable_fsdp_fp8_all_gather",
+                    "--training.enable_float8_linear",
+                    "--training.enable_fsdp_float8_all_gather",
                 ]
             ],
-            "FSDP2 with fp8 all-gather",
-            "fsdp2_fp8_all_gather",
+            "FSDP2 with float8 all-gather",
+            "fsdp2_float8_all_gather",
             ngpu=4,
         ),
         OverrideDefinitions(
             [
                 [
-                    "--training.enable_fp8_linear",
-                    "--training.enable_fsdp_fp8_all_gather",
+                    "--training.enable_float8_linear",
+                    "--training.enable_fsdp_float8_all_gather",
                     "--training.precompute_float8_dynamic_scale_for_fsdp",
                 ]
             ],
-            "FSDP2 with fp8 all-gather and precomputed dynamic scales",
-            "fsdp2_fp8_all_gather_precompute_dynamic_scales",
+            "FSDP2 with float8 all-gather and precomputed dynamic scales",
+            "fsdp2_float8_all_gather_precompute_dynamic_scales",
             ngpu=4,
         ),
     ]
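For readability, here is one of the renamed test entries assembled from the hunk above. The `OverrideDefinitions` class below is a minimal stand-in, not torchtitan's real descriptor from `test_runner.py`; its field names beyond what the diff shows are assumptions.

```python
from dataclasses import dataclass
from typing import List

# Minimal stand-in for torchtitan's OverrideDefinitions, just enough to show
# how the renamed flags group into a test entry; the real class may carry
# more fields than sketched here.
@dataclass
class OverrideDefinitions:
    override_args: List[List[str]]
    test_descr: str
    test_name: str
    ngpu: int = 4

entry = OverrideDefinitions(
    [
        [
            "--training.enable_float8_linear",
            "--training.enable_fsdp_float8_all_gather",
        ]
    ],
    "FSDP2 with float8 all-gather",
    "fsdp2_float8_all_gather",
    ngpu=4,
)
print(entry.test_name)  # fsdp2_float8_all_gather
```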

torchtitan/config_manager.py
Lines changed: 4 additions & 4 deletions

@@ -338,7 +338,7 @@ def __init__(self):
             help="Whether to compile the model",
         )
         self.parser.add_argument(
-            "--training.enable_fp8_linear",
+            "--training.enable_float8_linear",
             action="store_true",
             help="""
                 If true, swaps `torch.nn.Linear` with `Float8Linear` with
@@ -348,16 +348,16 @@ def __init__(self):
             """,
         )
         self.parser.add_argument(
-            "--training.enable_fsdp_fp8_all_gather",
+            "--training.enable_fsdp_float8_all_gather",
             action="store_true",
             default=False,
-            help="Whether enable fp8 all-gather in FSDP",
+            help="Whether enable float8 all-gather in FSDP",
         )
         self.parser.add_argument(
             "--training.precompute_float8_dynamic_scale_for_fsdp",
             action="store_true",
             default=False,
-            help="Whether precompute fp8 scales dynamically for FSDP",
+            help="Whether precompute float8 scales dynamically for FSDP",
         )
         self.parser.add_argument(
             "--training.gc_freq",

torchtitan/float8_linear.py
Lines changed: 9 additions & 9 deletions

@@ -25,7 +25,7 @@


 @contextlib.contextmanager
-def set_enable_fsdp_fp8_all_gather(enable_fsdp_fp8_all_gather: bool):
+def set_enable_fsdp_float8_all_gather(enable_fsdp_fp8_all_gather: bool):
     import float8_experimental.config as config

     prev = config.enable_fsdp_fp8_all_gather
@@ -53,8 +53,8 @@ def maybe_build_fp8_linear(

     This will mutate the model inplace.
     """
-    enable_fp8_linear = job_config.training.enable_fp8_linear
-    if not enable_fp8_linear:
+    enable_float8_linear = job_config.training.enable_float8_linear
+    if not enable_float8_linear:
         return
     if not is_sm90_or_later():
         warning_once(
@@ -69,15 +69,15 @@ def maybe_build_fp8_linear(
         )

         # Mutates the model inplace replacing instances of torch.nn.Linear with Float8Linear
-        enable_fsdp_fp8_all_gather = (
-            job_config.training.enable_fsdp_fp8_all_gather and dp_enabled
+        enable_fsdp_float8_all_gather = (
+            job_config.training.enable_fsdp_float8_all_gather and dp_enabled
         )
-        with set_enable_fsdp_fp8_all_gather(enable_fsdp_fp8_all_gather):
+        with set_enable_fsdp_float8_all_gather(enable_fsdp_float8_all_gather):
             swap_linear_with_float8_linear(
                 model, scaling_type_w=TensorScalingType.DYNAMIC
             )
         logger.info(
-            f"Swapped to Float8Linear layers with {enable_fsdp_fp8_all_gather=}"
+            f"Swapped to Float8Linear layers with {enable_fsdp_float8_all_gather=}"
         )
     except ImportError as exc:
         raise ImportError(
@@ -89,8 +89,8 @@ def maybe_precompute_fp8_dynamic_scale_for_fsdp(
     model: nn.Module, job_config: JobConfig
 ):
     if not (
-        job_config.training.enable_fp8_linear
-        and job_config.training.enable_fsdp_fp8_all_gather
+        job_config.training.enable_float8_linear
+        and job_config.training.enable_fsdp_float8_all_gather
         and job_config.training.precompute_float8_dynamic_scale_for_fsdp
     ):
         return
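The renamed context manager follows a common save/set/restore pattern for a module-level config flag. Below is a generic, self-contained sketch of that pattern; `_FakeConfig` stands in for `float8_experimental.config`, which the hunk above shows exposes an `enable_fsdp_fp8_all_gather` attribute, and the restore-on-exit shape is an assumption based on the visible `prev = ...` line.

```python
import contextlib

# Stand-in for float8_experimental.config so the sketch runs on its own.
class _FakeConfig:
    enable_fsdp_fp8_all_gather = False

@contextlib.contextmanager
def set_enable_fsdp_float8_all_gather(enable: bool):
    # Save the previous value, flip the flag, and restore it on exit even if
    # the body raises -- the general shape of the context manager above.
    prev = _FakeConfig.enable_fsdp_fp8_all_gather
    _FakeConfig.enable_fsdp_fp8_all_gather = enable
    try:
        yield
    finally:
        _FakeConfig.enable_fsdp_fp8_all_gather = prev

with set_enable_fsdp_float8_all_gather(True):
    assert _FakeConfig.enable_fsdp_fp8_all_gather is True
assert _FakeConfig.enable_fsdp_fp8_all_gather is False
```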

torchtitan/parallelisms/parallelize_llama.py
Lines changed: 4 additions & 12 deletions

@@ -123,18 +123,10 @@ def get_tp_parallel_strategy(

     This function handles the special case of using float8 with tensor parallelism.
     """
-    if job_config.training.enable_fp8_linear:
-        from float8_experimental.float8_linear import Float8Linear, TensorScalingType
-
-        if any(
-            isinstance(m, Float8Linear)
-            and m.scaling_type_w is TensorScalingType.DELAYED
-            for m in model.modules()
-        ):
-            raise NotImplementedError(
-                "1D TP fp8 all-gather only supports dynamic scaling"
-            )
-
+    if job_config.training.enable_float8_linear:
+        # TODO(future PR): once float8 configuration supports delayed
+        # scaling, add a check here to enforce supported float8 all-gather
+        # configurations
         from float8_experimental.float8_tensor_parallel import (
             Float8ColwiseParallel,
             Float8RowwiseParallel,
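For context, choosing between float8-aware and standard tensor-parallel styles typically looks like the sketch below. This is an illustrative helper, not torchtitan's actual `get_tp_parallel_strategy` (whose full signature and return value are not shown in this hunk); the float8 import mirrors the one added above, and the fallback classes are PyTorch's standard TP styles.

```python
def pick_tp_parallel_styles(enable_float8_linear: bool):
    # Illustrative only: use float8-aware colwise/rowwise parallel styles when
    # float8 is enabled, otherwise PyTorch's standard ones. The real function
    # in parallelize_llama.py reads this from job_config and may differ.
    if enable_float8_linear:
        from float8_experimental.float8_tensor_parallel import (
            Float8ColwiseParallel,
            Float8RowwiseParallel,
        )
        return Float8RowwiseParallel, Float8ColwiseParallel
    from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel
    return RowwiseParallel, ColwiseParallel
```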

train_configs/debug_model.toml
Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 10
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-enable_fp8_linear = false
+enable_float8_linear = false
 compile = false
 dataset = "c4_mini" # supported datasets: c4_mini (45K), c4 (177M)
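The same one-line rename applies to each train_configs/*.toml file below. As a quick check, the sketch reads the renamed key back with the standard library; the `[training]` table name is inferred from the `--training.*` CLI flags above and is an assumption, as is the surrounding excerpt. `tomllib` requires Python 3.11+.

```python
import tomllib  # stdlib TOML parser, Python 3.11+

# Assumed excerpt of a train_configs/*.toml after the rename.
cfg_text = """
[training]
tensor_parallel_degree = 1
enable_float8_linear = false
compile = false
"""

cfg = tomllib.loads(cfg_text)
print(cfg["training"]["enable_float8_linear"])  # False
```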

train_configs/llama2_13b.toml
Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-enable_fp8_linear = false
+enable_float8_linear = false
 compile = false
 dataset = "c4"

train_configs/llama2_70b.toml
Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 8 # 8-way TP
-enable_fp8_linear = false
+enable_float8_linear = false
 compile = false
 dataset = "c4"

train_configs/llama2_7b.toml
Lines changed: 1 addition & 1 deletion

@@ -32,7 +32,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 1 # dp-only would be sufficient for 7B
-enable_fp8_linear = false
+enable_float8_linear = false
 compile = false
 dataset = "c4"

train_configs/llama3_70b.toml
Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 8 # 8-way TP
-enable_fp8_linear = false
+enable_float8_linear = false
 compile = false
 dataset = "c4"

train_configs/llama3_8b.toml
Lines changed: 1 addition & 1 deletion

@@ -33,7 +33,7 @@ max_norm = 1.0 # grad norm clipping
 steps = 1000
 data_parallel_degree = -1
 tensor_parallel_degree = 1
-enable_fp8_linear = false
+enable_float8_linear = false
 compile = false
 dataset = "c4"
