
Commit dc2c211

ufmt fixes.

1 parent a45e7eb commit dc2c211

File tree

8 files changed: +17 −15 lines changed

scripts/generate/test_generate.py

Lines changed: 4 additions & 2 deletions
@@ -25,7 +25,7 @@
     RowwiseParallel,
 )
 from torchtitan.components.metrics import build_device_memory_monitor
-from torchtitan.config import ConfigManager
+from torchtitan.config import ConfigManager, Debug as DebugConfig
 from torchtitan.distributed import ParallelDims, utils as dist_utils
 from torchtitan.protocols.train_spec import get_train_spec
 from torchtitan.tools import utils
@@ -133,7 +133,9 @@ def test_generate(
     # sequences would require https://github.com/pytorch/torchtitan/pull/686
     apply_tp_minus_sp(model, parallel_dims.world_mesh["tp"])
 
-    dist_utils.set_determinism(world_mesh, device, seed, deterministic)
+    debug_config = DebugConfig()
+    debug_config.deterministic = deterministic
+    dist_utils.set_determinism(world_mesh, device, debug_config, seed)
 
     # materalize model
     model.to_empty(device=device_type)
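Note: after this change the test builds a Debug config object and passes it to set_determinism instead of a bare boolean. A minimal caller-side sketch, assuming DebugConfig exposes a deterministic field (as in the hunk above) and that world_mesh, device, and seed stand in for the objects built earlier in the test:

from torchtitan.config import Debug as DebugConfig
from torchtitan.distributed import utils as dist_utils

# world_mesh, device, and seed are placeholders for the test's earlier setup.
debug_config = DebugConfig()
debug_config.deterministic = True  # mirrors the `deterministic` test parameter
dist_utils.set_determinism(world_mesh, device, debug_config, seed)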

torchtitan/config/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -16,6 +16,7 @@
     ActivationCheckpoint,
     Checkpoint,
     Comm,
+    Debug,
     FaultTolerance,
     Job,
     JobConfig,
@@ -28,7 +29,6 @@
     Quantize,
     Training,
     Validation,
-    Debug
 )
 from .manager import ConfigManager
 
@@ -50,5 +50,5 @@
     "Profiling",
     "Training",
     "Validation",
-    "Debug"
+    "Debug",
 ]

torchtitan/config/job_config.py

Lines changed: 1 addition & 0 deletions
@@ -905,6 +905,7 @@ class Debug:
     moe_force_load_balance: bool = False
     """If True, we force each experts to get the same amount of tokens via round-robin. This option is for debugging usage only."""
 
+
 @dataclass
 class JobConfig:
     """

torchtitan/distributed/activation_checkpoint.py

Lines changed: 3 additions & 5 deletions
@@ -17,7 +17,6 @@
 )
 
 from torchtitan.config.job_config import ActivationCheckpoint as ACConfig
-from torchtitan.config.job_config import Debug as DebugConfig
 from torchtitan.tools.logging import logger, warn_once
 
 
@@ -43,7 +42,7 @@ def _apply_layer_sac(module: nn.Module, ac_config: ACConfig) -> nn.Module:
             preserve_rng_state=ac_config.preserve_rng_state,
             determinism_check=ac_config.determinism_check,
             early_stop=ac_config.early_stop,
-            debug=ac_config.debug
+            debug=ac_config.debug,
         )
     else:
         return module
@@ -133,7 +132,7 @@ def selective_checkpointing_context_fn():
         preserve_rng_state=ac_config.preserve_rng_state,
         determinism_check=ac_config.determinism_check,
         early_stop=ac_config.early_stop,
-        debug=ac_config.debug
+        debug=ac_config.debug,
     )
 
 
@@ -152,7 +151,7 @@ def _apply_full_ac(module: nn.Module, ac_config: ACConfig) -> nn.Module:
         preserve_rng_state=ac_config.preserve_rng_state,
         determinism_check=ac_config.determinism_check,
         early_stop=ac_config.early_stop,
-        debug=ac_config.debug
+        debug=ac_config.debug,
     )
 
 
@@ -198,7 +197,6 @@ def _apply_op_sac_to_transformer_block_with_flex(
         ),
     )
 
-
     def wrap_submodule(name: str, full_ac: bool = False) -> None:
         submodule = getattr(module, name)
         if full_ac:

torchtitan/distributed/utils.py

Lines changed: 4 additions & 3 deletions
@@ -17,8 +17,7 @@
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.tensor import DTensor
 
-from torchtitan.config import Comm as CommConfig, TORCH_DTYPE_MAP
-from torchtitan.config import Debug as DebugConfig
+from torchtitan.config import Comm as CommConfig, Debug as DebugConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed.parallel_dims import ParallelDims
 from torchtitan.tools.logging import logger
 from torchtitan.tools.utils import device_module, device_type
@@ -100,7 +99,9 @@ def set_determinism(
     if debug_config.deterministic:
         logger.info("Deterministic algorithm enabled (expect perf degradation).")
         torch.use_deterministic_algorithms(True)
-        torch.use_deterministic_algorithms(True, warn_only=debug_config.deterministic_warn_only)
+        torch.use_deterministic_algorithms(
+            True, warn_only=debug_config.deterministic_warn_only
+        )
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
         # env var for deterministic CuBLAS
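The only behavioral knob here is warn_only; this commit merely reflows the call to satisfy the formatter. As a reminder of the underlying PyTorch semantics (plain PyTorch, independent of torchtitan):

import torch

# Strict mode: an op without a deterministic implementation raises a RuntimeError.
torch.use_deterministic_algorithms(True)

# Warn-only mode: the same op emits a UserWarning and falls back to its nondeterministic kernel.
torch.use_deterministic_algorithms(True, warn_only=True)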

torchtitan/experiments/forge/job_config.py

Lines changed: 1 addition & 1 deletion
@@ -12,6 +12,7 @@
     Checkpoint,
     Comm,
     Compile,
+    Debug,
     Job,
     LRScheduler,
     MemoryEstimation,
@@ -20,7 +21,6 @@
     Parallelism,
     Quantize,
     Training,
-    Debug,
 )
 
 
torchtitan/models/attention.py

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def __init__(self) -> None:
             SDPBackend.CUDNN_ATTENTION,
             SDPBackend.FLASH_ATTENTION,
             SDPBackend.EFFICIENT_ATTENTION,
-            SDPBackend.MATH
+            SDPBackend.MATH,
         ]
 
     def forward(

torchtitan/models/qwen3/model/args.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ def update_from_config(self, job_config: JobConfig, **kwargs) -> None:
         self.max_seq_len = seq_len
 
         self.moe_args._debug_force_load_balance = (
-            job_config.training.debug_moe_force_load_balance
+            job_config.debug.moe_force_load_balance
        )
 
     def get_nparams_and_flops(
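This hunk tracks the flag's move from the Training section to the new Debug section of JobConfig. A small sketch of the new access path, assuming JobConfig can be default-constructed from its dataclass defaults:

from torchtitan.config import JobConfig

job_config = JobConfig()
# Old (removed): job_config.training.debug_moe_force_load_balance
# New: the flag lives on the Debug dataclass and defaults to False.
force_balance = job_config.debug.moe_force_load_balance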
