
Commit ed3312a

Miscellaneous improvements for MLA (#7)
With those improvements, AMAIA's implementation of MLA seems to work
1 parent b51cd9a commit ed3312a

3 files changed: +29, -9 lines

autoparallel/compute_estimation.py (+16, -5)

@@ -4,7 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import torch
-from torch.utils._pytree import tree_map_only
+from torch.utils._pytree import tree_flatten, tree_map_only
 from torch.utils.flop_counter import FlopCounterMode
 
 
@@ -59,12 +59,20 @@ def estimate_strategy_runtime_cost(node, strategy):
 
     args = tree_map_only(torch.fx.Node, lambda x: x.meta["val"], node.args)
     kwargs = tree_map_only(torch.fx.Node, lambda x: x.meta["val"], node.kwargs)
-    fake_mode = next(
+
+    fake_modes = [
         arg.fake_mode
-        for arg in args
+        for arg in tree_flatten(args)[0]
         if isinstance(arg, torch._subclasses.fake_tensor.FakeTensor)
-    )
-    assert len(kwargs) == 0
+    ]
+    if len(fake_modes) == 0:
+        return 0
+
+    assert all(fm == fake_modes[0] for fm in fake_modes)
+    fake_mode = fake_modes[0]
+    if len(kwargs) > 0:
+        for k, v in kwargs.items():
+            assert not isinstance(v, torch.Tensor), f"{node} {v}"
     args_shapes = tuple(_get_sharded_shape(spec) for spec in strategy.input_specs)
 
     counter = 0
@@ -87,6 +95,9 @@ def estimate_strategy_runtime_cost(node, strategy):
     # TODO: fix this
     dtype = strategy.input_specs[0].tensor_meta.dtype
 
+    # TODO: better handle this case
+    if dtype.is_complex:
+        return 0
     # TODO: use PyTorch's version once it's giving correct results
     gpu_flops = _get_device_tflops(dtype) * 10**12
 
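Not part of the commit, but a quick illustration of why switching from `for arg in args` to `tree_flatten(args)[0]` matters: `node.args` can nest tensors inside lists or tuples, and only a pytree flatten sees those nested leaves. A minimal sketch using ordinary tensors in place of FakeTensors:

import torch
from torch.utils._pytree import tree_flatten

# args as they might look for an op that takes a list of tensors:
# some leaves sit one level down inside a list.
args = (torch.randn(2, 2), [torch.randn(3), torch.randn(4)], 1.0)

# Iterating the top level misses the nested tensors ...
top_level = [a for a in args if isinstance(a, torch.Tensor)]
print(len(top_level))  # 1

# ... while tree_flatten yields every leaf, however deeply nested.
leaves, _ = tree_flatten(args)
nested_aware = [a for a in leaves if isinstance(a, torch.Tensor)]
print(len(nested_aware))  # 3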

autoparallel/propagation_rules.py (+8, -1)

@@ -678,7 +678,14 @@ def expand_rule(mesh, op_schema_):
         for i, (s1, s2) in enumerate(zip(orig_shape, dest_shape))
         if s1 == 1 and s2 != s1
     ]
-    assert len(expand_dim) == 1
+    if len(expand_dim) != 1:
+        assert len(expand_dim) == 0
+        return torch.distributed.tensor.DTensor._op_dispatcher.sharding_propagator.op_strategy_funcs[
+            op
+        ](
+            op_schema
+        )
+    assert len(expand_dim) == 1, f"{expand_dim}"
     expand_dim = expand_dim[0]
     to_remove = []
     for i, ss in enumerate(input_strat.strategies):
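For context rather than as part of the commit: `expand_dim` collects the dimensions that `aten.expand` actually broadcasts (a size of 1 growing to something larger), and the new branch covers the case where nothing is broadcast by deferring to DTensor's own registered strategy for the op. A small worked example of the detection logic, assuming the comprehension collects the index `i` (the line just above the hunk is not shown):

# One broadcast dim: (1, 4) expanded to (8, 4) -- the case the rule already handled.
orig_shape, dest_shape = (1, 4), (8, 4)
expand_dim = [
    i
    for i, (s1, s2) in enumerate(zip(orig_shape, dest_shape))
    if s1 == 1 and s2 != s1
]
print(expand_dim)  # [0]

# No broadcast at all: expand to the same shape -- the new fallback case.
orig_shape, dest_shape = (2, 4), (2, 4)
expand_dim = [
    i
    for i, (s1, s2) in enumerate(zip(orig_shape, dest_shape))
    if s1 == 1 and s2 != s1
]
print(expand_dim)  # []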

autoparallel/utils.py (+5, -3)

@@ -43,12 +43,14 @@ def propagate_tensor_meta(op, user_args, out_strat):
         else:
             assert tm is None
         if strat.input_specs is None:
-            assert op in {
+            supported_ops = {
                 torch.ops.prims.convert_element_type.default,
+                torch.ops.aten.clone.default,
                 torch.ops.aten.slice.Tensor,
-            }, (
+            }
+            assert op in supported_ops, (
                 f"{op} strategy doesn't have input_specs, only harcoded "
-                "prims.convert_element_type.default and aten.slice.Tensor for now"
+                "{supported_ops} for now"
             )
             strat.input_specs = (strat.output_specs,)
         assert strat.redistribute_cost is None
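Again for illustration only: for these pass-through ops the strategy's missing input spec is filled in by reusing its output spec. A toy sketch with a hypothetical `ToyStrategy` stand-in and a placeholder placement string, not the real strategy objects:

import torch

# Hypothetical stand-in for an op strategy whose input_specs were not populated.
class ToyStrategy:
    def __init__(self, output_specs):
        self.output_specs = output_specs
        self.input_specs = None

supported_ops = {
    torch.ops.prims.convert_element_type.default,
    torch.ops.aten.clone.default,
    torch.ops.aten.slice.Tensor,
}

strat = ToyStrategy(output_specs="Shard(0)")  # placeholder placement spec
op = torch.ops.aten.clone.default
if strat.input_specs is None:
    # Only "pass-through" ops are handled: the input placement is assumed
    # to mirror the output placement.
    assert op in supported_ops, f"{op} strategy doesn't have input_specs"
    strat.input_specs = (strat.output_specs,)
print(strat.input_specs)  # ('Shard(0)',)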
