
Commit 78ed7e0

Support of explicit fallback
[ghstack-poisoned]
1 parent b53ad10 commit 78ed7e0

File tree

6 files changed: +536 additions, -26 deletions

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


from . import utils

strategy_pool = utils.StrategyPool()

autoparallel/dtensor_util/utils.py

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.


import logging
from contextlib import contextmanager
from typing import Callable, TypeVar

import torch
from torch.distributed.tensor import DTensor
from torch.distributed.tensor._op_schema import (
    OpSchema,
    OutputSharding,
    RuntimeSchemaInfo,
    StrategyType,
)
from typing_extensions import ParamSpec

logger = logging.getLogger(__name__)

aten = torch.ops.aten

_T = TypeVar("_T")
_P = ParamSpec("_P")


# -------------define universal op strategy-------------
replicate_op_strategy = torch.distributed.tensor._ops.utils.replicate_op_strategy


class StrategyPool:
    def __init__(self) -> None:
        # reference to existing strategies from the DTensor upstream
        self.op_strategy_funcs: dict[
            torch._ops.OpOverload, Callable[[OpSchema], StrategyType]
        ] = DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs
        # reference to existing rules
        self.op_to_rules: dict[
            torch._ops.OpOverload, Callable[[OpSchema], OutputSharding]
        ] = DTensor._op_dispatcher.sharding_propagator.op_to_rules
        # we probably don't need to care about existing op_to_schema_info for AP
        self.op_to_schema_info = (
            DTensor._op_dispatcher.sharding_propagator.op_to_schema_info
        )

        self.enable_implicit_replication: bool = False
        self.implicit_strategy_op_tracker: list[torch._ops.OpOverload] = []

    def get_op_strategy(
        self, op: torch._ops.OpOverload, op_schema: OpSchema
    ) -> StrategyType:
        if op not in self.op_strategy_funcs:
            if not self.enable_implicit_replication:
                raise NotImplementedError(
                    f"Operator {op} does not have a sharding strategy registered."
                )
            else:
                self.implicit_strategy_op_tracker.append(op)
                logger.warning(
                    f"implicitly registering sharding strategy for op {op.name()} using {replicate_op_strategy.__name__}"
                )
                self.register_op_strategy(op)(replicate_op_strategy)
        return self.op_strategy_funcs[op](op_schema)

    def register_op_strategy(
        self,
        op: torch._ops.OpOverload,
        schema_info=RuntimeSchemaInfo(needs_pytree=True),
    ) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
        # pyre-fixme[2]: Parameter must be annotated.
        # always enable pytree, as dispatching overhead is not a concern in AP.
        def wrapper(impl):
            if isinstance(op, list):
                overloads = op
            else:
                overloads = [op]

            for overload in overloads:
                self.op_strategy_funcs[overload] = impl
                self.op_to_schema_info[overload] = schema_info
            return impl

        return wrapper

    @contextmanager
    def replicate_for_unsupported_operators(self):
        """
        Context manager for setting and clearing the implicit replication strategy.
        """
        try:
            if self.enable_implicit_replication:
                raise RuntimeError(
                    "Implicit strategy is already enabled. Cannot enable it again."
                )
            self.enable_implicit_replication = True
            yield
        finally:
            self.enable_implicit_replication = False
            op_to_remove = self.implicit_strategy_op_tracker
            for op_overload in op_to_remove:
                if op_overload in self.op_strategy_funcs:
                    del self.op_strategy_funcs[op_overload]
                if op_overload in self.op_to_schema_info:
                    del self.op_to_schema_info[op_overload]
            self.implicit_strategy_op_tracker.clear()

    # TODO: automatically generate redistribute costs for strategies. There is a
    # `fill_missing_redistribute_cost` in autoparallel/utils.py, which is a hack
    # to generate redistribute costs given input specs and is only tested on
    # certain ops. We can potentially improve it.
    def fill_missing_redistribute_cost(
        self, op: torch._ops.OpOverload, op_schema: OpSchema
    ):
        """
        Fill missing redistribute cost for strategies.
        """
        ...
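For reference, a minimal usage sketch of the pool introduced above (not part of the commit; torch.ops.aten.polar.default is only an illustrative operator and may already have an upstream strategy):

import torch

from autoparallel.dtensor_util import strategy_pool
from autoparallel.dtensor_util.utils import replicate_op_strategy

# Explicit fallback: register a strategy for one op overload (a list of overloads
# is also accepted by the decorator).
strategy_pool.register_op_strategy(torch.ops.aten.polar.default)(replicate_op_strategy)

# Implicit fallback: inside the context manager, get_op_strategy() registers
# replicate_op_strategy for any op that has no strategy (logging a warning) and
# removes those implicit registrations again when the block exits.
with strategy_pool.replicate_for_unsupported_operators():
    ...  # run sharding propagation / the autoparallel solver here

# Outside the context manager, an unregistered op raises NotImplementedError.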

autoparallel/propagation_rules.py

Lines changed: 6 additions & 20 deletions
@@ -36,6 +36,8 @@
 from torch.distributed.tensor._ops.utils import generate_redistribute_costs
 from torch.distributed.tensor.placement_types import Replicate, Shard

+from .dtensor_util import strategy_pool
+
 # TODO: move this to PyTorch
 dim_maps[torch.t] = lambda input: dim_transpose(input.ndim, -2, -1)

@@ -582,11 +584,7 @@ def index_rule(mesh, op_schema):
 @register_opschema_rule(torch.ops.aten._scaled_dot_product_efficient_attention.default)
 def sdpa_rule(mesh, op_schema):
     op = torch.ops.aten._scaled_dot_product_efficient_attention.default
-    out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-        op
-    ](
-        op_schema
-    )
+    out_strat = strategy_pool.get_op_strategy(op, op_schema)
     # remove wrong context-parallel strategy
     # https://github.com/pytorch/pytorch/pull/131351#discussion_r1716164659
     new_strats = []
@@ -613,11 +611,7 @@ def sdpa_rule(mesh, op_schema):
 @register_opschema_rule(torch.ops.aten.reshape.default)
 def reshape_rule(mesh, op_schema):
     op = torch.ops.aten.reshape.default
-    out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-        op
-    ](
-        op_schema
-    )
+    out_strat = strategy_pool.get_op_strategy(op, op_schema)
     if mesh.ndim == 1:
         # remove duplicate strategy
         # TODO: hack, fixme
@@ -643,11 +637,7 @@ def expand_rule(mesh, op_schema_):
     ]
     if len(expand_dim) != 1:
         assert len(expand_dim) == 0
-        return torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-            op
-        ](
-            op_schema
-        )
+        return strategy_pool.get_op_strategy(op, op_schema)
     assert len(expand_dim) == 1, f"{expand_dim}"
     expand_dim = expand_dim[0]
     to_remove = []
@@ -661,11 +651,7 @@ def expand_rule(mesh, op_schema_):
     removed = []
     for i in reversed(to_remove):
         removed.append(input_strat.strategies.pop(i))
-    out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-        op
-    ](
-        op_schema
-    )
+    out_strat = strategy_pool.get_op_strategy(op, op_schema)
     for i, ss in enumerate(out_strat.strategies):
         for remov in to_remove:
             ss.redistribute_cost[0].insert(remov, math.inf)

autoparallel/utils.py

Lines changed: 2 additions & 5 deletions
@@ -10,6 +10,7 @@
 from torch.distributed.tensor._ops.utils import generate_redistribute_costs
 from torch.utils._pytree import tree_flatten, tree_map_only

+from .dtensor_util import strategy_pool
 from .propagation_rules import _op_partial_rules, _op_rules, remove_invalid_configs


@@ -110,11 +111,7 @@ def get_placement_options(mesh, op, specs, user_args, user_kwargs):
     if op in _op_partial_rules:
         out_strat = _op_partial_rules[op](mesh, op_schema)
     else:
-        out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-            op
-        ](
-            op_schema
-        )
+        out_strat = strategy_pool.get_op_strategy(op, op_schema)

     propagate_tensor_meta(op, user_args, user_kwargs, out_strat)
     fill_missing_redistribute_cost(op, specs, out_strat)
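
Note the behavioral change for get_placement_options: the removed code indexed op_strategy_funcs directly, so an operator without a registered strategy surfaced as a bare KeyError, whereas strategy_pool.get_op_strategy raises a descriptive NotImplementedError, or falls back to replication while replicate_for_unsupported_operators() is active. A sketch of how a caller could check this up front (aten.polar.default is again just an illustrative operator, not part of this commit):

import torch

from autoparallel.dtensor_util import strategy_pool

op = torch.ops.aten.polar.default  # hypothetical example op
if op not in strategy_pool.op_strategy_funcs:
    # get_op_strategy(op, ...) would raise NotImplementedError here unless
    # replicate_for_unsupported_operators() is active
    print(f"no sharding strategy registered for {op}")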

requirements-test.txt

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,8 @@
 torch >= 2.7.0
 numpy
 pulp
-pytest
+pytest >= 8.1
+expecttest

 black == 22.3.0
 flake8 == 6.1.0
