Skip to content

Commit 65250eb

Browse files
committed
Update base for Update on "introduce batch sharding strategy"
(Split out the large PR from #46) Introduce the batch sharding strategy: ```python from torch.distributed.tensor._op_schema import RuntimeSchemaInfo from autoparallel.dtensor_util.utils import batch_shard_strategy from autoparallel.dtensor_util import strategy_pool # create strategy with input tensor 1 replicated, input tensor 2 sharded on dim 0. Output tensor sharded on dim 0: custom_shard_strategy = functools.partial(batch_shard_strategy, input_shard_dim=[None, 0], output_shard_dim=[0]) # register the strategy: strategy_pool.register_op_strategy(new_op)(custom_shard_strategy) ``` For details, check the function description in autoparallel/dtensor_util/utils.py and the example usage in tests/test_dtensor.py. [ghstack-poisoned]
1 parent 88b5db9 commit 65250eb

File tree

5 files changed

+89
-120
lines changed

5 files changed

+89
-120
lines changed

autoparallel/dtensor_util/__init__.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,17 @@
33
# This source code is licensed under the BSD license found in the
44
# LICENSE file in the root directory of this source tree.
55

6+
# functions to expose
7+
from .utils import (
8+
get_op_strategy,
9+
op_strategy_context,
10+
replicate_op_strategy,
11+
with_implicit_strategies,
12+
)
613

7-
from . import utils
8-
9-
strategy_pool = utils.StrategyPool()
14+
__all__ = [
15+
"replicate_op_strategy",
16+
"get_op_strategy",
17+
"with_implicit_strategies",
18+
"op_strategy_context",
19+
]

autoparallel/dtensor_util/utils.py

Lines changed: 59 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -3,23 +3,25 @@
33
# This source code is licensed under the BSD license found in the
44
# LICENSE file in the root directory of this source tree.
55

6-
76
import logging
87
from contextlib import ExitStack, contextmanager
9-
from typing import Callable, TypeVar
108

119
import torch
1210
from torch.distributed.tensor import DTensor
13-
from torch.distributed.tensor._op_schema import OpSchema, OutputSharding, StrategyType
11+
from torch.distributed.tensor._op_schema import OpSchema, StrategyType
1412
from torch.distributed.tensor._ops.utils import register_op_strategy
15-
from typing_extensions import ParamSpec
1613

1714
logger = logging.getLogger(__name__)
1815

1916
aten = torch.ops.aten
2017

21-
_T = TypeVar("_T")
22-
_P = ParamSpec("_P")
18+
# reference to existing sharding_propagator DTensor upstream
19+
propagator = DTensor._op_dispatcher.sharding_propagator
20+
21+
enable_implicit_replication = False
22+
_current_stack = None
23+
24+
replicate_op_strategy = torch.distributed.tensor._ops.utils.replicate_op_strategy
2325

2426

2527
# TODO: remove and refer to
@@ -50,76 +52,57 @@ def op_strategy_context(op_overload, strategy_func, schema_info=None):
5052
propagator.propagate_op_sharding.cache.cache_clear()
5153

5254

53-
# -------------define universal op strategy-------------
54-
replicate_op_strategy = torch.distributed.tensor._ops.utils.replicate_op_strategy
55-
55+
def get_op_strategy(op: torch._ops.OpOverload, op_schema: OpSchema) -> StrategyType:
56+
global enable_implicit_replication, _current_stack
5657

57-
class StrategyPool:
58-
def __init__(self) -> None:
59-
# reference to existing strategy from the DTensor upstream
60-
self.op_strategy_funcs: dict[
61-
torch._ops.OpOverload, Callable[[OpSchema], StrategyType]
62-
] = DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs
63-
# reference to existing rules
64-
self.op_to_rules: dict[
65-
torch._ops.OpOverload, Callable[[OpSchema], OutputSharding]
66-
] = DTensor._op_dispatcher.sharding_propagator.op_to_rules
67-
# we probably don't need to care about existing op_to_schema_info for AP
68-
self.op_to_schema_info = (
69-
DTensor._op_dispatcher.sharding_propagator.op_to_schema_info
70-
)
71-
self.enable_implicit_replication: bool = False
72-
self._current_stack = None
73-
74-
def get_op_strategy(
75-
self, op: torch._ops.OpOverload, op_schema: OpSchema
76-
) -> StrategyType:
77-
if op not in self.op_strategy_funcs:
78-
if not self.enable_implicit_replication:
79-
raise NotImplementedError(
80-
f"Operator {op} does not have a sharding strategy registered."
58+
if op not in propagator.op_strategy_funcs:
59+
if not enable_implicit_replication:
60+
raise NotImplementedError(
61+
f"Operator {op} does not have a sharding strategy registered."
62+
)
63+
else:
64+
# Use the current stack if available
65+
if _current_stack is not None:
66+
_current_stack.enter_context(
67+
op_strategy_context(op, replicate_op_strategy)
8168
)
8269
else:
83-
# Use the instance's current stack if available
84-
if self._current_stack is not None:
85-
self._current_stack.enter_context(
86-
op_strategy_context(op, replicate_op_strategy)
87-
)
88-
else:
89-
# No stack available, just register permanently
90-
register_op_strategy(op)(replicate_op_strategy)
91-
logger.warning(
92-
f"implicitly registering `{op}` with `{replicate_op_strategy.__name__}`"
93-
)
94-
return self.op_strategy_funcs[op](op_schema)
95-
96-
@contextmanager
97-
def with_implicit_strategies(self):
98-
"""Context manager to enable implicit replication and clean up strategies."""
99-
# Create a fresh ExitStack for this context
100-
with ExitStack() as local_stack:
101-
# Store the stack as an instance attribute
102-
old_stack = self._current_stack
103-
self._current_stack = local_stack
104-
105-
# Enable implicit replication
106-
old_value = self.enable_implicit_replication
107-
self.enable_implicit_replication = True
108-
try:
109-
yield
110-
finally:
111-
# Restore the original values
112-
self._current_stack = old_stack
113-
self.enable_implicit_replication = old_value
114-
115-
# TODO: automatically generate redistribute cost for strategies. There exists a
116-
# `fill_missing_redistribute_cost` in autoparallel/utils.py, which is a hack
117-
# to generate redistribute cost given input specs, and only tested on
118-
# certain ops. We can potentially make an improvement.
119-
def fill_missing_redistribute_cost(
120-
self, op: torch._ops.OpOverload, op_schema: OpSchema
121-
):
122-
"""
123-
Fill missing redistribute cost for strategies.
124-
"""
125-
...
70+
# No stack available, just register permanently
71+
register_op_strategy(op)(replicate_op_strategy)
72+
logger.warning(
73+
f"implicitly registering `{op}` with `{replicate_op_strategy.__name__}`"
74+
)
75+
return propagator.op_strategy_funcs[op](op_schema)
76+
77+
78+
@contextmanager
79+
def with_implicit_strategies():
80+
"""Context manager to enable implicit replication and clean up strategies."""
81+
global enable_implicit_replication, _current_stack
82+
83+
# Create a fresh ExitStack for this context
84+
with ExitStack() as local_stack:
85+
# Store the stack as a global variable
86+
old_stack = _current_stack
87+
_current_stack = local_stack
88+
89+
# Enable implicit replication
90+
old_value = enable_implicit_replication
91+
enable_implicit_replication = True
92+
try:
93+
yield
94+
finally:
95+
# Restore the original values
96+
_current_stack = old_stack
97+
enable_implicit_replication = old_value
98+
99+
100+
# TODO: automatically generate redistribute cost for strategies. There exists a
101+
# `fill_missing_redistribute_cost` in autoparallel/utils.py, which is a hack
102+
# to generate redistribute cost given input specs, and only tested on
103+
# certain ops. We can potentially make an improvement.
104+
def fill_missing_redistribute_cost(op: torch._ops.OpOverload, op_schema: OpSchema):
105+
"""
106+
Fill missing redistribute cost for strategies.
107+
"""
108+
...

autoparallel/propagation_rules.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from torch.distributed.tensor._ops.utils import generate_redistribute_costs
3737
from torch.distributed.tensor.placement_types import Replicate, Shard
3838

39-
from .dtensor_util import strategy_pool
39+
from .dtensor_util import get_op_strategy
4040

4141
# TODO: move this to PyTorch
4242
dim_maps[torch.t] = lambda input: dim_transpose(input.ndim, -2, -1)
@@ -584,7 +584,7 @@ def index_rule(mesh, op_schema):
584584
@register_opschema_rule(torch.ops.aten._scaled_dot_product_efficient_attention.default)
585585
def sdpa_rule(mesh, op_schema):
586586
op = torch.ops.aten._scaled_dot_product_efficient_attention.default
587-
out_strat = strategy_pool.get_op_strategy(op, op_schema)
587+
out_strat = get_op_strategy(op, op_schema)
588588
# remove wrong context-parallel strategy
589589
# https://github.com/pytorch/pytorch/pull/131351#discussion_r1716164659
590590
new_strats = []
@@ -611,7 +611,7 @@ def sdpa_rule(mesh, op_schema):
611611
@register_opschema_rule(torch.ops.aten.reshape.default)
612612
def reshape_rule(mesh, op_schema):
613613
op = torch.ops.aten.reshape.default
614-
out_strat = strategy_pool.get_op_strategy(op, op_schema)
614+
out_strat = get_op_strategy(op, op_schema)
615615
if mesh.ndim == 1:
616616
# remove duplicate strategy
617617
# TODO: hack, fixme
@@ -637,7 +637,7 @@ def expand_rule(mesh, op_schema_):
637637
]
638638
if len(expand_dim) != 1:
639639
assert len(expand_dim) == 0
640-
return strategy_pool.get_op_strategy(op, op_schema)
640+
return get_op_strategy(op, op_schema)
641641
assert len(expand_dim) == 1, f"{expand_dim}"
642642
expand_dim = expand_dim[0]
643643
to_remove = []
@@ -651,7 +651,7 @@ def expand_rule(mesh, op_schema_):
651651
removed = []
652652
for i in reversed(to_remove):
653653
removed.append(input_strat.strategies.pop(i))
654-
out_strat = strategy_pool.get_op_strategy(op, op_schema)
654+
out_strat = get_op_strategy(op, op_schema)
655655
for i, ss in enumerate(out_strat.strategies):
656656
for remov in to_remove:
657657
ss.redistribute_cost[0].insert(remov, math.inf)

autoparallel/utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from torch.distributed.tensor._ops.utils import generate_redistribute_costs
1111
from torch.utils._pytree import tree_flatten, tree_map_only
1212

13-
from .dtensor_util import strategy_pool
13+
from .dtensor_util import get_op_strategy
1414
from .propagation_rules import _op_partial_rules, _op_rules, remove_invalid_configs
1515

1616

@@ -111,7 +111,7 @@ def get_placement_options(mesh, op, specs, user_args, user_kwargs):
111111
if op in _op_partial_rules:
112112
out_strat = _op_partial_rules[op](mesh, op_schema)
113113
else:
114-
out_strat = strategy_pool.get_op_strategy(op, op_schema)
114+
out_strat = get_op_strategy(op, op_schema)
115115

116116
propagate_tensor_meta(op, user_args, user_kwargs, out_strat)
117117
fill_missing_redistribute_cost(op, specs, out_strat)

tests/test_dtensor.py

Lines changed: 10 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33
# This source code is licensed under the BSD license found in the
44
# LICENSE file in the root directory of this source tree.
55

6-
from contextlib import contextmanager
7-
86
import numpy as np
97
import torch
10-
from torch.distributed.tensor import DTensor, Shard, distribute_tensor, init_device_mesh
8+
from torch.distributed.device_mesh import init_device_mesh
9+
from torch.distributed.tensor import DTensor, Shard, distribute_tensor
1110
from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
1211
from torch.distributed.tensor._op_schema import (
1312
OpInfo,
@@ -24,7 +23,9 @@
2423
with_comms,
2524
)
2625

27-
from autoparallel.dtensor_util import strategy_pool
26+
from autoparallel.dtensor_util import get_op_strategy, with_implicit_strategies
27+
28+
propagator = DTensor._op_dispatcher.sharding_propagator
2829

2930
aten = torch.ops.aten
3031

@@ -94,31 +95,6 @@ def _fw_tuple(x, y, z):
9495
)
9596

9697

97-
@contextmanager
98-
def op_strategy_context(op_overload, strategy_func, schema_info=None):
99-
"""
100-
Context manager for setting and clearing op strategies in unit tests.
101-
Args:
102-
op_overload: The operator overload to set or clear the strategy for.
103-
strategy_func: The strategy function to set for the operator overload.
104-
schema_info: Optional schema information for the operator overload.
105-
Yields:
106-
None
107-
"""
108-
try:
109-
# register the op strategy
110-
strategy_pool.register_op_strategy(op_overload, schema_info=schema_info)(
111-
strategy_func
112-
)
113-
yield
114-
finally:
115-
# clear this op strategy cache
116-
if op_overload in strategy_pool.op_strategy_funcs:
117-
del strategy_pool.op_strategy_funcs[op_overload]
118-
if op_overload in strategy_pool.op_to_schema_info:
119-
del strategy_pool.op_to_schema_info[op_overload]
120-
121-
12298
# Overwrite upstream `_op_dispatcher.sharding_propagator` with customized
12399
# sharding_propagator. This is for testing purpose under eager mode and
124100
# AutoParallel won't use the propagate function. The main changes are 1) Skip
@@ -132,9 +108,9 @@ class CustomShardingPropagator(
132108
def __init__(self):
133109
super().__init__()
134110
self.propagate_op_sharding.cache.cache_clear()
135-
self.op_to_rules = strategy_pool.op_to_rules
136-
self.op_strategy_funcs = strategy_pool.op_strategy_funcs
137-
self.op_to_schema_info = strategy_pool.op_to_schema_info
111+
self.op_to_rules = propagator.op_to_rules
112+
self.op_strategy_funcs = propagator.op_strategy_funcs
113+
self.op_to_schema_info = propagator.op_to_schema_info
138114

139115
def propagate(self, op_info: OpInfo) -> None:
140116
op_info.output_sharding = self.propagate_op_sharding_non_cached(op_info.schema)
@@ -199,7 +175,7 @@ def propagate_op_sharding_non_cached(self, op_schema: OpSchema) -> OutputShardin
199175
strategy_schema.schema_info = op_schema.schema_info
200176

201177
# assign implicit strategy if enabled
202-
strategy_pool.get_op_strategy(strategy_schema.op, strategy_schema)
178+
get_op_strategy(strategy_schema.op, strategy_schema)
203179

204180
# run sharding strategy propagation/generation
205181
op_strategy = self.op_strategy_funcs[op_schema.op](strategy_schema)
@@ -383,7 +359,7 @@ def test_implicit_registration(self):
383359
self._test_op_on_dtensor(test_op, input_x_dt, input_y_dt)
384360

385361
# 2. test_op strategy implicitly registered under context manager
386-
with strategy_pool.with_implicit_strategies():
362+
with with_implicit_strategies():
387363
self._test_op_on_dtensor(test_op, input_x_dt, input_y_dt)
388364

389365
# 3. remove registration after exiting the context manager

0 commit comments

Comments
 (0)