Commit 8df62c4

Support of implicit fallback (#49)
* Support of explicit fallback

* Update on "Support of implicit fallback" (split out of the large PR #46)

Support the implicit replication fallback strategy.

How to use the implicit replication fallback:

```python
from autoparallel.dtensor_util import strategy_pool

with strategy_pool.replicate_for_unsupported_operators():
    ...  # missing ops will use the replicated strategy if possible
```

Note: StrategyPool now reuses `_op_dispatcher.sharding_propagator.op_strategy_funcs` / `op_to_rules` / `op_to_schema_info` by reference.
1 parent b53ad10 commit 8df62c4

File tree

6 files changed: +511 -26 lines
autoparallel/dtensor_util/__init__.py

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+# functions to expose
+from .utils import (
+    get_op_strategy,
+    op_strategy_context,
+    replicate_op_strategy,
+    with_implicit_strategies,
+)
+
+__all__ = [
+    "replicate_op_strategy",
+    "get_op_strategy",
+    "with_implicit_strategies",
+    "op_strategy_context",
+]
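
Since these names are listed in `__all__`, downstream code can import the helpers directly from the package root. A minimal sketch of the import surface this `__init__.py` exposes:

```python
# Helpers re-exported by autoparallel/dtensor_util/__init__.py
from autoparallel.dtensor_util import (
    get_op_strategy,
    op_strategy_context,
    replicate_op_strategy,
    with_implicit_strategies,
)
```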

autoparallel/dtensor_util/utils.py

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import logging
+from contextlib import ExitStack, contextmanager
+
+import torch
+from torch.distributed.tensor import DTensor
+from torch.distributed.tensor._op_schema import OpSchema, StrategyType
+from torch.distributed.tensor._ops.utils import register_op_strategy
+
+logger = logging.getLogger(__name__)
+
+aten = torch.ops.aten
+
+# reference to existing sharding_propagator DTensor upstream
+propagator = DTensor._op_dispatcher.sharding_propagator
+
+enable_implicit_replication = False
+_current_stack = None
+
+replicate_op_strategy = torch.distributed.tensor._ops.utils.replicate_op_strategy
+
+
+# TODO: remove and refer to
+# https://github.com/pytorch/pytorch/blob/9c107606629de6383f55e3b48b42e594d23407b1/test/distributed/tensor/test_op_strategy.py#L446
+# once the function is moved outside of the test folder in upstream
+@contextmanager
+def op_strategy_context(op_overload, strategy_func, schema_info=None):
+    """
+    Context manager for setting and clearing op strategies.
+    Args:
+        op_overload: The operator overload to set or clear the strategy for.
+        strategy_func: The strategy function to set for the operator overload.
+        schema_info: Optional schema information for the operator overload.
+    Yields:
+        None
+    """
+    propagator = DTensor._op_dispatcher.sharding_propagator
+    try:
+        # register the op strategy
+        register_op_strategy(op_overload, schema_info=schema_info)(strategy_func)
+        yield
+    finally:
+        # clear this op strategy cache
+        if op_overload in propagator.op_strategy_funcs:
+            del propagator.op_strategy_funcs[op_overload]
+        if op_overload in propagator.op_to_schema_info:
+            del propagator.op_to_schema_info[op_overload]
+        propagator.propagate_op_sharding.cache.cache_clear()
+
+
+def get_op_strategy(op: torch._ops.OpOverload, op_schema: OpSchema) -> StrategyType:
+    global enable_implicit_replication, _current_stack
+
+    if op not in propagator.op_strategy_funcs:
+        if not enable_implicit_replication:
+            raise NotImplementedError(
+                f"Operator {op} does not have a sharding strategy registered."
+            )
+        else:
+            # Use the current stack if available
+            if _current_stack is not None:
+                _current_stack.enter_context(
+                    op_strategy_context(op, replicate_op_strategy)
+                )
+            else:
+                # No stack available, just register permanently
+                register_op_strategy(op)(replicate_op_strategy)
+            logger.warning(
+                f"implicitly registering `{op}` with `{replicate_op_strategy.__name__}`"
+            )
+    return propagator.op_strategy_funcs[op](op_schema)
+
+
+@contextmanager
+def with_implicit_strategies():
+    """Context manager to enable implicit replication and clean up strategies."""
+    global enable_implicit_replication, _current_stack
+
+    # Create a fresh ExitStack for this context
+    with ExitStack() as local_stack:
+        # Store the stack as a global variable
+        old_stack = _current_stack
+        _current_stack = local_stack
+
+        # Enable implicit replication
+        old_value = enable_implicit_replication
+        enable_implicit_replication = True
+        try:
+            yield
+        finally:
+            # Restore the original values
+            _current_stack = old_stack
+            enable_implicit_replication = old_value
+
+
+# TODO: automatic generate redistribute cost for strategies. There exists a
+# `fill_missing_redistribute_cost` in autoparallel/utils.py, which is a hack
+# to generate redistribute cost given input specs, and only tested on
+# certain ops. We can potentially make an improvement.
+def fill_missing_redistribute_cost(op: torch._ops.OpOverload, op_schema: OpSchema):
+    """
+    Fill missing redistribute cost for strategies.
+    """
+    ...
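
Taken together, `with_implicit_strategies` flips the module-level flag and installs an `ExitStack`, and `get_op_strategy` then registers `replicate_op_strategy` on demand for any operator that has no strategy in the propagator. A minimal usage sketch based on the code above; the `aten.polar` overload and the pre-built `op_schema` are illustrative assumptions, since autoparallel normally constructs the `OpSchema` during tracing:

```python
import torch
from autoparallel.dtensor_util import get_op_strategy, with_implicit_strategies

op = torch.ops.aten.polar.default  # assumption: an op without a registered sharding strategy
op_schema = ...  # assumption: an OpSchema built elsewhere by autoparallel's tracing

# Outside the context manager, get_op_strategy raises NotImplementedError for
# operators that have no registered strategy.
with with_implicit_strategies():
    # Inside, the missing op is registered with replicate_op_strategy through the
    # ExitStack, and the registration is undone when the block exits.
    out_strat = get_op_strategy(op, op_schema)
```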

autoparallel/propagation_rules.py

Lines changed: 6 additions & 20 deletions
@@ -36,6 +36,8 @@
 from torch.distributed.tensor._ops.utils import generate_redistribute_costs
 from torch.distributed.tensor.placement_types import Replicate, Shard

+from .dtensor_util import get_op_strategy
+
 # TODO: move this to PyTorch
 dim_maps[torch.t] = lambda input: dim_transpose(input.ndim, -2, -1)

@@ -582,11 +584,7 @@ def index_rule(mesh, op_schema):
 @register_opschema_rule(torch.ops.aten._scaled_dot_product_efficient_attention.default)
 def sdpa_rule(mesh, op_schema):
     op = torch.ops.aten._scaled_dot_product_efficient_attention.default
-    out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-        op
-    ](
-        op_schema
-    )
+    out_strat = get_op_strategy(op, op_schema)
     # remove wrong context-parallel strategy
     # https://github.com/pytorch/pytorch/pull/131351#discussion_r1716164659
     new_strats = []
@@ -613,11 +611,7 @@ def sdpa_rule(mesh, op_schema):
 @register_opschema_rule(torch.ops.aten.reshape.default)
 def reshape_rule(mesh, op_schema):
     op = torch.ops.aten.reshape.default
-    out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-        op
-    ](
-        op_schema
-    )
+    out_strat = get_op_strategy(op, op_schema)
     if mesh.ndim == 1:
         # remove duplicate strategy
         # TODO: hack, fixme
@@ -643,11 +637,7 @@ def expand_rule(mesh, op_schema_):
     ]
     if len(expand_dim) != 1:
         assert len(expand_dim) == 0
-        return torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-            op
-        ](
-            op_schema
-        )
+        return get_op_strategy(op, op_schema)
     assert len(expand_dim) == 1, f"{expand_dim}"
     expand_dim = expand_dim[0]
     to_remove = []
@@ -661,11 +651,7 @@ def expand_rule(mesh, op_schema_):
     removed = []
     for i in reversed(to_remove):
         removed.append(input_strat.strategies.pop(i))
-    out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-        op
-    ](
-        op_schema
-    )
+    out_strat = get_op_strategy(op, op_schema)
     for i, ss in enumerate(out_strat.strategies):
         for remov in to_remove:
             ss.redistribute_cost[0].insert(remov, math.inf)
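
With these rules now routing through `get_op_strategy` instead of indexing `op_strategy_funcs` directly, the implicit-replication fallback (when enabled) applies to them as well. For temporarily swapping a single strategy, `op_strategy_context` can also be used on its own; a small sketch, where the `aten.polar` overload is again an illustrative assumption rather than something this commit registers:

```python
import torch
from autoparallel.dtensor_util import op_strategy_context, replicate_op_strategy

# Temporarily register a replicate-only strategy for one operator; the context
# manager removes the registration and clears the sharding-propagation cache on exit.
with op_strategy_context(torch.ops.aten.polar.default, replicate_op_strategy):
    ...  # sharding propagation for aten.polar falls back to replication here
```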

autoparallel/utils.py

Lines changed: 2 additions & 5 deletions
@@ -10,6 +10,7 @@
 from torch.distributed.tensor._ops.utils import generate_redistribute_costs
 from torch.utils._pytree import tree_flatten, tree_map_only

+from .dtensor_util import get_op_strategy
 from .propagation_rules import _op_partial_rules, _op_rules, remove_invalid_configs


@@ -110,11 +111,7 @@ def get_placement_options(mesh, op, specs, user_args, user_kwargs):
     if op in _op_partial_rules:
         out_strat = _op_partial_rules[op](mesh, op_schema)
     else:
-        out_strat = torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
-            op
-        ](
-            op_schema
-        )
+        out_strat = get_op_strategy(op, op_schema)

     propagate_tensor_meta(op, user_args, user_kwargs, out_strat)
     fill_missing_redistribute_cost(op, specs, out_strat)

requirements-test.txt

Lines changed: 2 additions & 1 deletion
@@ -1,7 +1,8 @@
 torch >= 2.7.0
 numpy
 pulp
-pytest
+pytest >= 8.1
+expecttest

 black == 22.3.0
 flake8 == 6.1.0
