
Commit fbec8ab

introduce batch sharding strategy
ghstack-source-id: 6753840 Pull Request resolved: #50
1 parent f6a0212 commit fbec8ab

File tree

2 files changed: +251 -31 lines changed


autoparallel/dtensor_util/utils.py

Lines changed: 141 additions & 4 deletions
@@ -3,15 +3,33 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.

-
+import itertools
 import logging
 from contextlib import ExitStack, contextmanager
-from typing import Callable, TypeVar
+from typing import Callable, Optional, TypeVar

 import torch
 from torch.distributed.tensor import DTensor
-from torch.distributed.tensor._op_schema import OpSchema, OutputSharding, StrategyType
-from torch.distributed.tensor._ops.utils import register_op_strategy
+from torch.distributed.tensor._dtensor_spec import DTensorSpec
+from torch.distributed.tensor._op_schema import (
+    OpSchema,
+    OpSpec,
+    OpStrategy,
+    OutputSharding,
+    StrategyType,
+)
+from torch.distributed.tensor._ops.utils import (
+    generate_redistribute_costs,
+    is_tensor_shardable,
+    register_op_strategy,
+)
+from torch.distributed.tensor.placement_types import Placement, Replicate, Shard
+
+try:
+    from torch.utils._cxx_pytree import tree_leaves
+except ImportError:
+    from torch.utils._pytree import tree_leaves  # type: ignore[no-redef]
+
 from typing_extensions import ParamSpec

 logger = logging.getLogger(__name__)
@@ -54,6 +72,125 @@ def op_strategy_context(op_overload, strategy_func, schema_info=None):
 replicate_op_strategy = torch.distributed.tensor._ops.utils.replicate_op_strategy


+def batch_shard_strategy(
+    op_schema: OpSchema,
+    input_shard_dim: list[Optional[int]],
+    output_shard_dim: list[Optional[int]],
+    enable_shard_batch_dim_over_multiple_axis: bool = False,
+) -> OpStrategy:
+    """
+    Shard the input tensors over the specified dimensions. The strategy maps the
+    batch dim of the input/output tensors to the same device mesh axis (or the
+    same set of device axes). Every input must either have one specified batch
+    dim or no batch dim. If an input has no batch dim, the strategy assumes the
+    tensor will be broadcast to the batch dim and processed by the operator.
+    For inputs with a specified batch dim, users need to make sure the batch dim
+    sizes are all the same. The output should always have a batch dim.
+
+    Args:
+        op_schema (OpSchema): the op schema.
+
+        input_shard_dim (list[Optional[int]]): the list of shard dimensions to
+            consider for each input tensor argument. Use `None` if an input arg
+            has no batch dim. If an arg is List[Tensor], we flatten it first and
+            then match it against input_shard_dim. Since the dim is not tied to
+            a specific device mesh axis, it can be mapped to any combination of
+            device axes. Example 1: input tensors A[1024, 64, 8] and
+            B[1024, 64, 16] with input_shard_dim = [1, 1] can shard A's dim 1
+            over device axis X and B's dim 1 over device axis X, where X can be
+            any device axis. The output follows the same sharding as the input.
+            Example 2: input tensors A[64, 8], B[64, 16, 1024], C[64, 8] with
+            input_shard_dim = [None, 2, None] will replicate A and C over all
+            device axes and only shard B's dim 2 over the device mesh. Assuming
+            the device mesh has 3 axes, tensor B's placement can be (Shard(2),
+            Shard(2), Replicate()), (Shard(2), Replicate(), Shard(2)), or
+            (Replicate(), Shard(2), Shard(2)).
+
+        output_shard_dim (list[Optional[int]]): the list of shard dimensions to
+            consider for each output tensor argument. Use `None` if an output
+            arg has no batch dim. For example, if the output is a single tensor
+            sharded on dim 0, pass in [0].
+
+        enable_shard_batch_dim_over_multiple_axis (bool): if True, the strategy
+            will also try to map the batch dim to multiple device axes. Default
+            is False.
+
+    Note: It is the user's responsibility to make sure the sharded tensor for
+    processing is correct in shape.
+    """
+    output_type = [str(ret.type) for ret in op_schema.op._schema.returns]
+    # TODO(zpcore): Confirm whether view ops can be handled properly or not.
+    # Prevent handling view ops until confirmed.
+    if op_schema.op.is_view:
+        raise RuntimeError(
+            "fallback strategy is unable to handle view ops until confirmed"
+        )
+    if "List[Tensor]" in output_type:
+        raise RuntimeError(
+            "fallback strategy is unable to handle ops with List[Tensor] output "
+            "because size of the list may depend on the op's input value"
+        )
+    inputs_strategy = tree_leaves(op_schema.args_strategy)
+    assert len(inputs_strategy) == len(input_shard_dim)
+    output_strategy = OpStrategy([])
+    mesh = inputs_strategy[0].mesh
+    device_axis = list(range(mesh.ndim))
+    use_how_many_axis = (
+        [i + 1 for i in range(mesh.ndim)]
+        if enable_shard_batch_dim_over_multiple_axis
+        else [1]
+    )
+    # number of device axes to shard the batch dim over
+    for num_axis in use_how_many_axis:
+        device_combinations = list(itertools.combinations(device_axis, num_axis))
+        # e.g., if num_axis == 2, device_combinations = [(0, 1), (0, 2), (1, 2),
+        # ...]. One feasible strategy is then to shard the batch dim over both
+        # axes (0, 1). We check all combinations in device_combinations below.
+        for comb in device_combinations:
+            input_specs_list: list[DTensorSpec] = []
+            output_specs_list: list[DTensorSpec] = []
+            is_shardable = True
+            for op_strategy, dim in zip(inputs_strategy, input_shard_dim):
+                # create a new list of placements for this shard-dim option
+                new_placements: list[Placement] = [Replicate()] * mesh.ndim
+                for axis in comb:
+                    new_placements[axis] = (
+                        Shard(dim) if dim is not None else Replicate()
+                    )
+                tensor_meta = op_strategy.strategies[0].output_spec.tensor_meta
+                new_input_spec = DTensorSpec(
+                    mesh,
+                    tuple(new_placements),
+                    tensor_meta=tensor_meta,
+                )
+                if not is_tensor_shardable(tensor_meta.shape, new_input_spec):
+                    is_shardable = False
+                    break
+                input_specs_list.append(new_input_spec)
+            if not is_shardable:
+                continue
+            for dim in output_shard_dim:
+                new_placements = [Replicate()] * mesh.ndim
+                for axis in comb:
+                    new_placements[axis] = (
+                        Shard(dim) if dim is not None else Replicate()
+                    )
+                output_spec = DTensorSpec(
+                    mesh,
+                    tuple(new_placements),
+                )
+                output_specs_list.append(output_spec)
+
+            output_specs = (
+                output_specs_list[0]
+                if len(output_specs_list) == 1
+                else tuple(output_specs_list)
+            )
+            input_specs = input_specs_list
+            redistribute_cost = [
+                generate_redistribute_costs(strat, input_specs_list[i])
+                for i, strat in enumerate(inputs_strategy)
+            ]
+            output_strategy.strategies.append(
+                OpSpec(output_specs, input_specs, redistribute_cost)  # type: ignore
+            )
+    return output_strategy
+
+
 class StrategyPool:
     def __init__(self) -> None:
         # reference to existing strategy from the DTensor upstream
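The core of batch_shard_strategy is the placement enumeration: for every combination of mesh axes that may carry the batch dim, each input with a batch dim gets Shard(batch_dim) on those axes and Replicate() everywhere else. Below is a minimal standalone sketch of that enumeration (not taken from the diff), assuming a 2-axis mesh, batch dim 0 for every input, and enable_shard_batch_dim_over_multiple_axis=True:

import itertools

from torch.distributed.tensor import Replicate, Shard

mesh_ndim = 2  # assumed: a 2-axis device mesh
batch_dim = 0  # assumed: all inputs batch on dim 0
device_axes = list(range(mesh_ndim))

# with enable_shard_batch_dim_over_multiple_axis=True the batch dim may be
# carried by 1..mesh_ndim axes; otherwise only single-axis combinations are tried
for num_axes in range(1, mesh_ndim + 1):
    for comb in itertools.combinations(device_axes, num_axes):
        placements = [Replicate()] * mesh_ndim
        for axis in comb:
            placements[axis] = Shard(batch_dim)
        print(comb, tuple(placements))
# (0,) (Shard(dim=0), Replicate())
# (1,) (Replicate(), Shard(dim=0))
# (0, 1) (Shard(dim=0), Shard(dim=0))

Each combination that passes the is_tensor_shardable check becomes one OpSpec, so the batch dim always lands on the same mesh axes for every input and output tensor involved.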

tests/test_dtensor.py

Lines changed: 110 additions & 27 deletions
@@ -3,11 +3,17 @@
 # This source code is licensed under the BSD license found in the
 # LICENSE file in the root directory of this source tree.

-from contextlib import contextmanager
+import functools

 import numpy as np
 import torch
-from torch.distributed.tensor import DTensor, Shard, distribute_tensor, init_device_mesh
+from torch.distributed.tensor import (
+    DTensor,
+    Replicate,
+    Shard,
+    distribute_tensor,
+    init_device_mesh,
+)
 from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
 from torch.distributed.tensor._op_schema import (
     OpInfo,
@@ -16,6 +22,7 @@
     OpStrategy,
     OutputSharding,
     OutputSpecType,
+    RuntimeSchemaInfo,
     TupleStrategy,
 )
 from torch.testing._internal.common_utils import run_tests
@@ -25,6 +32,7 @@
 )

 from autoparallel.dtensor_util import strategy_pool
+from autoparallel.dtensor_util.utils import batch_shard_strategy, op_strategy_context

 aten = torch.ops.aten

@@ -94,31 +102,6 @@ def _fw_tuple(x, y, z):
     )


-@contextmanager
-def op_strategy_context(op_overload, strategy_func, schema_info=None):
-    """
-    Context manager for setting and clearing op strategies in unit tests.
-    Args:
-        op_overload: The operator overload to set or clear the strategy for.
-        strategy_func: The strategy function to set for the operator overload.
-        schema_info: Optional schema information for the operator overload.
-    Yields:
-        None
-    """
-    try:
-        # register the op strategy
-        strategy_pool.register_op_strategy(op_overload, schema_info=schema_info)(
-            strategy_func
-        )
-        yield
-    finally:
-        # clear this op strategy cache
-        if op_overload in strategy_pool.op_strategy_funcs:
-            del strategy_pool.op_strategy_funcs[op_overload]
-        if op_overload in strategy_pool.op_to_schema_info:
-            del strategy_pool.op_to_schema_info[op_overload]
-
-
 # Overwrite upstream `_op_dispatcher.sharding_propagator` with customized
 # sharding_propagator. This is for testing purpose under eager mode and
 # AutoParallel won't use the propagate function. The main changes are 1) Skip
@@ -394,5 +377,105 @@ def test_implicit_registration(self):
         self._test_op_on_dtensor(test_op, input_x_dt, input_y_dt)


+class DimShardingTest(DTensorTestBase):
+    @with_comms
+    def test_simple_batch_sharding(self):
+        # both input tensors batch on dim 0
+        mesh = init_device_mesh(self.device_type, (2, self.world_size // 2))
+        test_op = torch.ops.mylib.numpy_sin.default
+
+        # 1. strategy that will try to shard dim 0 onto a single device axis.
+        shard_first_dim_strategy = functools.partial(
+            batch_shard_strategy, input_shard_dim=[0, 0], output_shard_dim=[0]
+        )
+        with op_strategy_context(test_op, shard_first_dim_strategy):
+            # dim 0 is the batch dim. Here we shard 16 over one device axis
+            input_x = torch.randn([16, 1, 4], device=self.device_type)
+            input_y = torch.randn([16, 1, 4], device=self.device_type)
+            # any sharding below should work
+            input_x_dt = distribute_tensor(input_x, mesh, [Shard(1), Replicate()])
+            input_y_dt = distribute_tensor(input_y, mesh, [Replicate(), Shard(0)])
+            self._test_op_on_dtensor(test_op, input_x_dt, input_y_dt)
+
+        # 2. strategy that will try to shard dim 0 over multiple device axes.
+        shard_first_dim_to_multiple_devices_strategy = functools.partial(
+            batch_shard_strategy,
+            input_shard_dim=[0, 0],
+            output_shard_dim=[0],
+            enable_shard_batch_dim_over_multiple_axis=True,
+        )
+        with op_strategy_context(test_op, shard_first_dim_to_multiple_devices_strategy):
+            # dim 0 is the batch dim. Here we potentially shard 16 over multiple device axes
+            input_x = torch.randn([16, 1, 4], device=self.device_type)
+            input_y = torch.randn([16, 1, 4], device=self.device_type)
+            # any sharding below should work
+            input_x_dt = distribute_tensor(input_x, mesh, [Shard(1), Replicate()])
+            input_y_dt = distribute_tensor(input_y, mesh, [Replicate(), Shard(0)])
+            self._test_op_on_dtensor(test_op, input_x_dt, input_y_dt)
+
+    @with_comms
+    def test_broadcast_batch_sharding(self):
+        # Not recommended: users need to make sure the op supports broadcasted
+        # inputs first. If not supported, unsqueeze the inputs first to match
+        # each other's dimensions and use the example in the
+        # test_simple_batch_sharding test.
+        mesh = init_device_mesh(self.device_type, (2, self.world_size // 2))
+        test_op = torch.ops.mylib.numpy_sin.default
+        shard_on_first_dim_strategy = functools.partial(
+            batch_shard_strategy, input_shard_dim=[None, 0], output_shard_dim=[0]
+        )
+        with op_strategy_context(test_op, shard_on_first_dim_strategy):
+            input_x = torch.randn([1, 4], device=self.device_type)
+            # input_y's dim of size 16 is the batch dim
+            input_y = torch.randn([16, 1, 4], device=self.device_type)
+            # any sharding below should work as long as the tensor dims are shardable
+            input_x_dt = distribute_tensor(input_x, mesh, [Shard(1), Replicate()])
+            input_y_dt = distribute_tensor(input_y, mesh, [Replicate(), Shard(0)])
+
+            output_dt = test_op(input_x_dt, input_y_dt)
+
+            # split the batch dim to test correctness
+            input_y_chunks = torch.chunk(input_y, 4, dim=0)
+            output = torch.cat(
+                [test_op(input_x, input_y_part) for input_y_part in input_y_chunks]
+            )
+            self.assertEqual(output_dt.full_tensor(), output)
+
+            # or we can test directly since the op supports broadcasting.
+            self._test_op_on_dtensor(test_op, input_x_dt, input_y_dt)
+
+    @with_comms
+    def test_simple_tuple_batch_sharding(self):
+        # both input tensors batch on dim 0
+        mesh = init_device_mesh(self.device_type, (2, self.world_size // 2))
+        test_op = torch.ops.mylib.numpy_tuple_sin.default
+
+        # strategy that may shard dim 0 over one or more device axes.
+        shard_first_dim_to_multiple_devices_strategy = functools.partial(
+            batch_shard_strategy,
+            input_shard_dim=[0, 0, 0, 0, 0],  # flatten input_y
+            output_shard_dim=[0],
+            enable_shard_batch_dim_over_multiple_axis=True,
+        )
+        with op_strategy_context(
+            test_op,
+            shard_first_dim_to_multiple_devices_strategy,
+            schema_info=RuntimeSchemaInfo(needs_pytree=True),
+        ):
+            # dim 0 is the batch dim. Here we potentially shard 16 over one or more device axes
+            input_x = torch.randn([16, 8, 4], device=self.device_type)
+            input_y = [
+                torch.randn([16, 8, 4], device=self.device_type) for _ in range(3)
+            ]
+            input_z = torch.randn([16, 8, 4], device=self.device_type)
+            # any sharding below should work as long as the tensor dims are shardable
+            input_x_dt = distribute_tensor(input_x, mesh, [Shard(0), Shard(1)])
+            input_y_dt = [
+                distribute_tensor(i, mesh, [Shard(1), Shard(1)]) for i in input_y
+            ]
+            input_z_dt = distribute_tensor(input_z, mesh, [Shard(1), Shard(0)])
+            self._test_op_on_dtensor(test_op, input_x_dt, input_y_dt, input_z_dt)
+
+
 if __name__ == "__main__":
     run_tests()
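In test_simple_tuple_batch_sharding, input_shard_dim has five entries even though the op takes three arguments: with RuntimeSchemaInfo(needs_pytree=True) the strategy sees the pytree-flattened argument list, so the List[Tensor] argument contributes one leaf per tensor. A small illustration of that flattening, using placeholder strings instead of tensors:

# tree_leaves flattens nested containers the same way batch_shard_strategy
# flattens op_schema.args_strategy before matching it against input_shard_dim
try:
    from torch.utils._cxx_pytree import tree_leaves
except ImportError:
    from torch.utils._pytree import tree_leaves

# stand-ins for input_x, input_y (a list of 3 tensors), and input_z
args = ("x", ["y0", "y1", "y2"], "z")
print(tree_leaves(args))  # ['x', 'y0', 'y1', 'y2', 'z'] -> 5 leaves
# hence input_shard_dim=[0, 0, 0, 0, 0]: one batch-dim entry per flattened tensor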
