From 19fd5e8e7a17811038c5308719c86e0f1be5c18a Mon Sep 17 00:00:00 2001
From: Will Constable
Date: Mon, 20 May 2024 11:17:28 -0700
Subject: [PATCH] Add a 3-stage PP config

Pipelining is unique in that there is no need to stick to power-of-2
numbers of stages, and there may be reasons an odd number is optimal
depending on how you divide up your cluster. Anyway, I use this for
validation of the 1f1b schedule in a setup that is slightly more
complicated than 2 stages but simpler than 4 stages.

Seems to run fine, as long as the batch size is divisible by the PP
degree (e.g. `--training.batch_size 12` with 3 stages).

ghstack-source-id: b6fd58261ae7d38107cf81718bfbb8bfd40acba5
Pull Request resolved: https://github.com/pytorch/torchtitan/pull/345
---
 test_runner.py                               | 15 +++++++++++++++
 torchtitan/models/llama/__init__.py          |  4 ++--
 torchtitan/parallelisms/parallelize_llama.py |  6 ++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/test_runner.py b/test_runner.py
index 12085917..b17bc948 100755
--- a/test_runner.py
+++ b/test_runner.py
@@ -40,6 +40,21 @@ def build_test_list(args):
     """
     integration_tests_flavors = defaultdict(list)
     integration_tests_flavors["debug_model.toml"] = [
+        OverrideDefinitions(
+            [
+                [
+                    "--checkpoint.enable_checkpoint",
+                    f"--job.dump_folder {args.output_dir}/pp_1f1b_3stage/",
+                    "--experimental.pipeline_parallel_degree 3",
+                    "--experimental.pipeline_parallel_split_points layers.1, layers.2",
+                    "--experimental.pipeline_parallel_schedule 1f1b",
+                    "--training.data_parallel_degree 1",
+                ],
+            ],
+            "PP 1D test 1f1b with 3 PP stages",
+            requires_seed_checkpoint=True,
+            ngpu=3,
+        ),
         OverrideDefinitions(
             [
                 [
diff --git a/torchtitan/models/llama/__init__.py b/torchtitan/models/llama/__init__.py
index 2393d92f..3c08a065 100644
--- a/torchtitan/models/llama/__init__.py
+++ b/torchtitan/models/llama/__init__.py
@@ -12,7 +12,7 @@
 __all__ = ["Transformer"]
 
 llama2_configs = {
-    "debugmodel": ModelArgs(dim=256, n_layers=2, n_heads=16),
+    "debugmodel": ModelArgs(dim=256, n_layers=3, n_heads=16),
     "271M": ModelArgs(dim=1024, n_layers=16, n_heads=8),
     "1B": ModelArgs(dim=2048, n_layers=18, n_heads=16),
     "7B": ModelArgs(dim=4096, n_layers=32, n_heads=32),
@@ -29,7 +29,7 @@
 }
 
 llama3_configs = {
-    "debugmodel": ModelArgs(dim=256, n_layers=2, n_heads=16, rope_theta=500000),
+    "debugmodel": ModelArgs(dim=256, n_layers=3, n_heads=16, rope_theta=500000),
     "8B": ModelArgs(
         dim=4096,
         n_layers=32,
diff --git a/torchtitan/parallelisms/parallelize_llama.py b/torchtitan/parallelisms/parallelize_llama.py
index 909cd8d3..c7466c5f 100644
--- a/torchtitan/parallelisms/parallelize_llama.py
+++ b/torchtitan/parallelisms/parallelize_llama.py
@@ -205,6 +205,12 @@ def pipeline_llama_manual(
 
     logger.info(f"PP rank {pp_rank} is using this model chunk\n{model}")
 
+    # TODO, support this? or just guard against it inside the lib
+    if job_config.training.batch_size % parallel_dims.pp != 0:
+        raise ValueError(
+            f"batch_size {job_config.training.batch_size} not divisible by pp dim, currently unsupported"
+        )
+
     # TODO(whc) once ManualPipelineStage supports lazy shape inference, we can leave model on meta device longer and
     # get rid of the input shape hardcoded here. For now, it should not be a big deal since we only materialize the
     # layers of the model that map to this stage, not the whole model.
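
Note (not part of the patch): a minimal sketch of the batch-splitting
assumption behind the new guard in pipeline_llama_manual. It assumes, as
the divisibility check above implies, that the 1f1b schedule splits the
global batch into one equal-sized microbatch per pipeline stage; the
helper name split_into_microbatches is hypothetical and used only for
illustration.

    def split_into_microbatches(batch_size: int, pp_degree: int) -> list[int]:
        # Mirrors the guard added in pipeline_llama_manual: the schedule needs
        # equal-sized microbatches, one per pipeline stage, so an indivisible
        # batch size would leave a ragged microbatch.
        if batch_size % pp_degree != 0:
            raise ValueError(
                f"batch_size {batch_size} not divisible by pp degree {pp_degree}"
            )
        microbatch_size = batch_size // pp_degree
        return [microbatch_size] * pp_degree

    # With the values from the test: --training.batch_size 12 and 3 PP stages.
    print(split_into_microbatches(12, 3))  # [4, 4, 4]
    # A batch size of 10 with 3 stages would raise, matching the new ValueError.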