Commit 4caa379

Deduplicate TorchTitan main function (#1995)
Stack from [ghstack](https://github.com/ezyang/ghstack/tree/0.12.0) (oldest at bottom):
* #2002
* #2001
* __->__ #1995

People are creating different train.py files and duplicating the `main` function, but in reality they just want to use different Trainer subclasses. This PR creates a main() in torchtitan/train.py to deduplicate the code.
1 parent 157d30d commit 4caa379
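
For context, the pattern this change enables downstream: a custom train.py now only defines a Trainer subclass and hands it to the shared entry point, instead of copying the config-parsing and teardown boilerplate. The sketch below is illustrative only; MyTrainer and its close() override are hypothetical names, not part of this commit.

    from torchtitan.train import main, Trainer


    class MyTrainer(Trainer):
        def close(self) -> None:
            # Hypothetical hook: release any resources this trainer added,
            # then let the base class finish its own cleanup.
            super().close()


    if __name__ == "__main__":
        # main() handles logger init, config parsing, optional seed-checkpoint
        # creation, training, trainer.close(), and process-group teardown.
        main(MyTrainer)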

File tree

4 files changed: +30 -93 lines changed

torchtitan/experiments/forge/example_train.py

Lines changed: 5 additions & 19 deletions
@@ -7,7 +7,7 @@
 import importlib
 import time
 from datetime import timedelta
-from typing import Any, Iterable, Optional
+from typing import Any, Iterable

 import torch
 from torch.distributed.elastic.multiprocessing.errors import record
@@ -17,15 +17,16 @@
 from torchtitan.components.metrics import build_metrics_processor
 from torchtitan.components.tokenizer import build_hf_tokenizer
 from torchtitan.components.validate import build_validator
-from torchtitan.config import ConfigManager, JobConfig
+from torchtitan.config import JobConfig
 from torchtitan.distributed import utils as dist_utils
 from torchtitan.hf_datasets.text_datasets import build_text_dataloader
 from torchtitan.tools import utils
-from torchtitan.tools.logging import init_logger, logger
+from torchtitan.tools.logging import logger
 from torchtitan.tools.profiling import (
     maybe_enable_memory_snapshot,
     maybe_enable_profiling,
 )
+from torchtitan.train import main

 from .engine import ForgeEngine

@@ -350,19 +351,4 @@ def close(self) -> None:


 if __name__ == "__main__":
-    init_logger()
-    config_manager = ConfigManager()
-    config = config_manager.parse_args()
-    trainer: Optional[Trainer] = None
-
-    try:
-        trainer = Trainer(config)
-        trainer.train()
-    except Exception:
-        if trainer:
-            trainer.close()
-        raise
-    else:
-        trainer.close()
-        torch.distributed.destroy_process_group()
-        logger.info("Process group destroyed.")
+    main(Trainer)

torchtitan/experiments/torchcomms/train.py

Lines changed: 9 additions & 38 deletions
@@ -4,15 +4,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-import os
-from typing import Optional
-
-import torch
-
-from torchtitan.config import ConfigManager
 from torchtitan.distributed import ParallelDims
-from torchtitan.tools.logging import init_logger, logger
-from torchtitan.train import Trainer
+from torchtitan.train import main, Trainer

 from .parallel_dims import TorchCommsParallelDims

@@ -32,35 +25,13 @@ def _create_parallel_dims(self, parallelism_config, world_size) -> ParallelDims:
             world_size=world_size,
         )

+    def close(self) -> None:
+        # Call finalize on all comms after training and before destroying process group.
+        if hasattr(self, "parallel_dims"):
+            for comm in self.parallel_dims.comms:
+                comm.finalize()
+        super().close()

-if __name__ == "__main__":
-    init_logger()
-    config_manager = ConfigManager()
-    config = config_manager.parse_args()
-    trainer: Optional[TorchCommsTrainer] = None
-
-    try:
-        trainer = TorchCommsTrainer(config)

-        if config.checkpoint.create_seed_checkpoint:
-            assert (
-                int(os.environ["WORLD_SIZE"]) == 1
-            ), "Must create seed checkpoint using a single device, to disable sharding."
-            assert (
-                config.checkpoint.enable
-            ), "Must enable checkpointing when creating a seed checkpoint."
-            trainer.checkpointer.save(curr_step=0, last_step=True)
-            logger.info("Created seed checkpoint")
-        else:
-            trainer.train()
-        # Call finalize on all comms after training and before destroying process group.
-        for comm in trainer.parallel_dims.comms:
-            comm.finalize()
-    except Exception:
-        if trainer:
-            trainer.close()
-        raise
-    else:
-        trainer.close()
-        torch.distributed.destroy_process_group()
-        logger.info("Process group destroyed")
+if __name__ == "__main__":
+    main(TorchCommsTrainer)
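
A note on the design choice in this file: previously comm.finalize() ran only on the successful path of the old __main__ block, after train(). Folding it into an overridden close() lets the shared main() in torchtitan/train.py (the last file in this diff) trigger comm finalization on both the success and error paths, since main() calls trainer.close() in either case.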

torchtitan/models/flux/train.py

Lines changed: 3 additions & 32 deletions
@@ -4,12 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-import os
-from typing import Optional
-
 import torch

-from torchtitan.config import ConfigManager, JobConfig, TORCH_DTYPE_MAP
+from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import utils as dist_utils

 from torchtitan.models.flux.infra.parallelize import parallelize_encoders
@@ -20,8 +17,7 @@
     pack_latents,
     preprocess_data,
 )
-from torchtitan.tools.logging import init_logger, logger
-from torchtitan.train import Trainer
+from torchtitan.train import main, Trainer


 class FluxTrainer(Trainer):
@@ -175,29 +171,4 @@ def forward_backward_step(


 if __name__ == "__main__":
-    init_logger()
-    config_manager = ConfigManager()
-    config = config_manager.parse_args()
-    trainer: Optional[FluxTrainer] = None
-
-    try:
-        trainer = FluxTrainer(config)
-        if config.checkpoint.create_seed_checkpoint:
-            assert (
-                int(os.environ["WORLD_SIZE"]) == 1
-            ), "Must create seed checkpoint using a single device, to disable sharding."
-            assert (
-                config.checkpoint.enable
-            ), "Must enable checkpointing when creating a seed checkpoint."
-            trainer.checkpointer.save(curr_step=0, last_step=True)
-            logger.info("Created seed checkpoint")
-        else:
-            trainer.train()
-    except Exception:
-        if trainer:
-            trainer.close()
-        raise
-    else:
-        trainer.close()
-        torch.distributed.destroy_process_group()
-        logger.info("Process group destroyed.")
+    main(FluxTrainer)

torchtitan/train.py

Lines changed: 13 additions & 4 deletions
@@ -8,7 +8,7 @@
 import os
 import time
 from datetime import timedelta
-from typing import Any, Generator, Iterable, Optional
+from typing import Any, Generator, Iterable

 import torch

@@ -703,14 +703,19 @@ def close(self) -> None:
         self.metrics_processor.close()


-if __name__ == "__main__":
+def main(trainer_class: type[Trainer]) -> None:
+    """Main entry point for training with a specified trainer class.
+
+    Args:
+        trainer_class: The trainer class to instantiate (e.g., Trainer, FluxTrainer, TorchCommsTrainer)
+    """
     init_logger()
     config_manager = ConfigManager()
     config = config_manager.parse_args()
-    trainer: Optional[Trainer] = None
+    trainer: Trainer | None = None

     try:
-        trainer = Trainer(config)
+        trainer = trainer_class(config)

         if config.checkpoint.create_seed_checkpoint:
             assert (
@@ -731,3 +736,7 @@ def close(self) -> None:
         trainer.close()
     torch.distributed.destroy_process_group()
     logger.info("Process group destroyed")
+
+
+if __name__ == "__main__":
+    main(Trainer)
