
[WIP] FSDP support for MoE training #2357


Closed
wants to merge 5 commits
1 change: 1 addition & 0 deletions torchao/prototype/moe_training/scaled_grouped_mm.py
@@ -35,6 +35,7 @@ def _scaled_grouped_mm(
        offs (int32 torch.Tensor): The offsets to use to mark the starting index of each group along dim0 of the A tensor.
        out_dtype (Optional[torch.dtype]): The dtype of the output tensor. Currently only torch.bfloat16 is supported.
    """
    print("$$$ SCALED GROUPED MM")
    return _Float8GroupedMM.apply(
        A,
        B_t,
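For orientation, a hypothetical call into this entry point could look like the sketch below; the shapes, device, offset values, and keyword names are illustrative assumptions inferred from the docstring above, not code from this PR:

import torch
from torchao.prototype.moe_training import _scaled_grouped_mm

# Assumed setup: 4 expert groups over 128 tokens, bf16 inputs on a CUDA device.
A = torch.randn(128, 512, dtype=torch.bfloat16, device="cuda")        # 2D input tensor
B_t = torch.randn(4, 512, 1024, dtype=torch.bfloat16, device="cuda")  # 3D grouped weight tensor (shape illustrative)
offs = torch.tensor([32, 64, 96, 128], dtype=torch.int32, device="cuda")  # per-group offsets along dim0 of A

out = _scaled_grouped_mm(A, B_t, offs=offs, out_dtype=torch.bfloat16)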
30 changes: 28 additions & 2 deletions torchao/prototype/moe_training/tensor.py
@@ -1,8 +1,8 @@
import torch
from torch.utils._pytree import tree_map

from torchao.prototype.moe_training import _scaled_grouped_mm


class ScaledGroupedMMTensor(torch.Tensor):
"""
ScaledGroupedMMTensor is a simple tensor subclass that wraps a regular tensor
@@ -16,6 +16,9 @@ class ScaledGroupedMMTensor(torch.Tensor):
    def __init__(self, data: torch.Tensor):
        self._data = data

    def __repr__(self):
        return f"ScaledGroupedMMTensor(data={self._data})"

    @classmethod
    def __torch_function__(cls, func, types, args, kwargs={}):
        if func.__name__ == cls.grouped_mm_func_name:
@@ -32,4 +35,27 @@ def __torch_function__(cls, func, types, args, kwargs={}):
            has_offs = kwargs.get(cls.offs_arg_name) is not None
            if A_is_2d and B_is_3d and has_offs:
                return _scaled_grouped_mm(*args, **kwargs)
        return super().__torch_function__(func, types, args, kwargs)

        # Disable torch_function by hand because we don't want
        # the wrapping behavior of the super() impl, go directly to
        # torch_dispatch for the rest of the ops.
        with torch._C.DisableTorchFunctionSubclass():
            return func(*args, **kwargs)


    @classmethod
    def __torch_dispatch__(cls, func, types, args, kwargs={}):
        unwrap = lambda x: x._data if isinstance(x, cls) else x
        wrap = lambda x: cls(x) if isinstance(x, torch.Tensor) else x
        unwrapped_args, unwrapped_kwargs = tree_map(unwrap, (args, kwargs))

        # special case: for ops with out=.. specified, we want the output tensor to be a subclass.
        if 'out' in unwrapped_kwargs:
            unwrapped_kwargs['out'] = tree_map(wrap, unwrapped_kwargs['out'])

        with torch._C.DisableTorchFunctionSubclass():
            output = func(*unwrapped_args, **unwrapped_kwargs)
        wrapped_output = tree_map(wrap, output)
        print("func", func.__name__)
        print(wrapped_output)
        return wrapped_output
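The change above follows the usual wrapper-subclass pattern: __torch_function__ intercepts the grouped-mm call and routes it to _scaled_grouped_mm, and everything else falls through to __torch_dispatch__, which unwraps to plain tensors, runs the op, and re-wraps the result so the subclass type is preserved across arbitrary aten ops. A self-contained sketch of that unwrap/run/re-wrap pattern is below; _ToyWrapper, its _make_wrapper_subclass constructor, and the chunk example are assumptions for illustration, not the PR's implementation:

import torch
from torch.utils._pytree import tree_map

class _ToyWrapper(torch.Tensor):
    # Illustrative stand-in for ScaledGroupedMMTensor, not the PR's class.
    @staticmethod
    def __new__(cls, data: torch.Tensor):
        return torch.Tensor._make_wrapper_subclass(
            cls, data.shape, dtype=data.dtype, device=data.device
        )

    def __init__(self, data: torch.Tensor):
        self._data = data

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        unwrap = lambda x: x._data if isinstance(x, cls) else x
        wrap = lambda x: cls(x) if isinstance(x, torch.Tensor) else x
        args, kwargs = tree_map(unwrap, (args, kwargs or {}))
        out = func(*args, **kwargs)  # run the aten op on plain tensors
        return tree_map(wrap, out)   # re-wrap so the subclass survives the op

w = _ToyWrapper(torch.randn(8, 16))
halves = w.chunk(2, dim=0)           # an ordinary op hits __torch_dispatch__
print(type(halves[0]).__name__)      # _ToyWrapper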