[INTERPRETER] Support generic reduce (#3412)

Jokeren · web-flow · commit 5ee38fe270cd · 2024-03-20T00:41:54.000-04:00
This is a simple and general version that matches the underlying triton
implementation when reducing within a single thread. `numpy.ufunc` is not
used because its reduce op only supports binary inputs, but triton's
`combine_fn` can have an arbitrary number of inputs.
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
@@ -1635,6 +1635,7 @@ def kernel(X, Z1, Z2, N: tl.constexpr):
     np.testing.assert_equal(to_numpy(z2_ref), to_numpy(z2))
 
 
+@pytest.mark.interpreter
 def test_split_to_scalar(device):
 
     @triton.jit
@@ -2602,6 +2603,7 @@ def test_chain_reduce(M, N, src_layout, op, device, first_axis):
     np.testing.assert_allclose(z_ref, z_tri.cpu().numpy(), rtol=0.01, atol=1e-3)
 
 
+@pytest.mark.interpreter
 def test_generic_reduction(device):
 
     @triton.jit
diff --git a/python/triton/language/core.py b/python/triton/language/core.py
@@ -1968,7 +1968,7 @@ def _reduce_with_indices(input, axis, combine_fn, keep_dims=False, _builder=None
 # -----------------------
 
 
-def _add_scan_docstr(name: str, return_indices_arg: str = None, tie_break_arg: str = None) -> Callable[[T], T]:
+def _add_scan_docstr(name: str) -> Callable[[T], T]:
 
     def _decorator(func: T) -> T:
         docstr = """
diff --git a/python/triton/runtime/interpreter.py b/python/triton/runtime/interpreter.py
@@ -484,29 +484,89 @@ def _get_transpose(self):
     tensor.T = property(_get_transpose)
 
 
-def _patch_lang_core(lang, builder):
-    for name, member in inspect.getmembers(lang):
-        if tl.core.is_builtin(member):
-            _patch_attr(lang, name, member, builder)
-    # reduce is better off with a separate patch due to how
-    # the builder currently interfaces with custom functions
+def _patch_reduce_scan(lang):
 
-    def _new_reduce(input, axis, combine_fn, **kwargs):
-        if axis is not None and axis >= len(input.shape):
-            raise ValueError(f"axis {axis} out of bounds for shape {input.shape}")
+    def _check_axis(tensor, axis):
+        if axis is not None and axis >= len(tensor.shape):
+            raise ValueError(f"axis {axis} out of bounds for shape {tensor.shape}")
 
-        def _to_tensor(ret, dtype):
-            if ret.shape:
-                ret_type = tl.block_type(dtype, ret.shape)
+    def _to_tensor(ret, dtype):
+        if hasattr(ret, "shape") and ret.shape:
+            ret_type = tl.block_type(dtype, ret.shape)
+        else:
+            ret = np.array([ret], dtype=_get_np_dtype(dtype))
+            ret_type = dtype
+        return tl.core.tensor(TensorHandle(ret, dtype), ret_type)
+
+    def _generic_reduce(input, axis, combine_fn, keep_dims):
+
+        def _check_axis_and_unravel(input, axis):
+            ret = []
+            if not isinstance(input, tuple):
+                input = (input, )
+            for data in input:
+                if isinstance(data, tl.core.tensor):
+                    if axis is not None:
+                        _check_axis(data, axis)
+                        ret.append(data)
+                    else:
+                        axis = 0
+                        ret.append(_to_tensor(data.handle.data.flatten(), data.dtype))
+            return tuple(ret), axis
+
+        original_input = input
+        original_axis = axis
+        input, axis = _check_axis_and_unravel(input, axis)
+        input_data = []
+        output_data = []
+        input_shape = None
+        output_shape = None
+        for arg in input:
+            if isinstance(arg, tl.core.tensor):
+                input_shape = arg.handle.data.shape
+                input_data.append(arg.handle.data)
+                output_shape = input_shape[0:axis] + input_shape[axis + 1:]
+                output_data.append(np.zeros(output_shape, dtype=arg.handle.data.dtype))
+        if not input_shape:
+            raise ValueError("no tensors found in input")
+        # Reduce on axis
+        for i in range(len(input_data[0])):
+            # Recover input_index from i using input_shape
+            input_index = np.unravel_index(i, input_shape)
+            output_index = input_index[0:axis] + input_index[axis + 1:]
+            input_tuple = tuple(d[input_index] for d in input_data)
+            if input_index[axis] == 0:
+                # First element
+                for j in range(len(output_data)):
+                    output_data[j][output_index] = input_tuple[j]
             else:
-                ret = np.array([ret], dtype=_get_np_dtype(dtype))
-                ret_type = dtype
-            return tl.core.tensor(TensorHandle(ret, dtype), ret_type)
-
-        def _min_max(input, val_reduce_op, idx_reduce_op=None, axis=None, return_indices_tie_break_left=True,
-                     keepdims=False):
-            if return_indices_tie_break_left is False:
-                raise NotImplementedError("return_indices_tie_break_left=False not supported in interpreter mode")
+                acc_tuple = tuple(o[output_index] for o in output_data)
+                acc_tuple = combine_fn.fn(*acc_tuple, *input_tuple)
+                for j in range(len(output_data)):
+                    output_data[j][output_index] = acc_tuple[j]
+        # Pack output
+        ret = []
+        for data in output_data:
+            if keep_dims:
+                if original_axis is not None:
+                    data = np.expand_dims(data, axis)
+                else:
+                    input_shape = original_input[0].handle.data.shape
+                    for _ in range(len(input_shape)):
+                        data = np.expand_dims(data, 0)
+
+            elif original_axis is None:
+                # Take a scalar
+                data = data.item()
+            ret.append(_to_tensor(data, input[0].dtype))
+        return ret[0] if len(ret) == 1 else tuple(ret)
+
+    def _new_reduce(input, axis, combine_fn, keep_dims=False, **kwargs):
+
+        def _min_max(input, val_reduce_op, idx_reduce_op=None, axis=None, keepdims=False):
+            # If input is a tuple, it must be (val, index), and we only take val
+            input = input[0] if isinstance(input, tuple) else input
+            _check_axis(input, axis)
             val = None
             idx = None
             if val_reduce_op:
@@ -523,53 +583,42 @@ def _min_max(input, val_reduce_op, idx_reduce_op=None, axis=None, return_indices
                 raise ValueError("val_reduce_op and idx_reduce_op are both None")
 
         def _sum(input, axis=None, keepdims=False):
+            _check_axis(input, axis)
             return _to_tensor(np.sum(input.handle.data, axis=axis, keepdims=keepdims), input.dtype)
 
-        keep_dims = kwargs.get("keep_dims", False)
-        return_indices = kwargs.get("return_indices", False)
-        return_indices_tile_break_left = kwargs.get("return_indices_tile_break_left", True)
-        fn = combine_fn.fn.__name__
         mapping = {
-            "_elementwise_min":  #
-            functools.partial(_min_max, val_reduce_op=np.min, idx_reduce_op=np.argmin if return_indices else None,
-                              return_indices_tie_break_left=return_indices_tile_break_left),  #
-            "_elementwise_max":  #
-            functools.partial(_min_max, val_reduce_op=np.max, idx_reduce_op=np.argmax if return_indices else None,
-                              return_indices_tie_break_left=return_indices_tile_break_left),  #
-            "_argmin_combine":  #
-            functools.partial(_min_max, val_reduce_op=None, idx_reduce_op=np.argmin,
-                              return_indices_tie_break_left=return_indices_tile_break_left),  #
-            "_argmax_combine":  #
-            functools.partial(_min_max, val_reduce_op=None, idx_reduce_op=np.argmax,
-                              return_indices_tie_break_left=return_indices_tile_break_left),  #
-            "_sum_combine": _sum
+            tl.standard._argmin_combine_tie_break_left:  #
+            functools.partial(_min_max, val_reduce_op=np.min, idx_reduce_op=np.argmin),  #
+            tl.standard._argmax_combine_tie_break_left:  #
+            functools.partial(_min_max, val_reduce_op=np.max, idx_reduce_op=np.argmax),  #
+            tl.standard._elementwise_max: functools.partial(_min_max, val_reduce_op=np.max, idx_reduce_op=None),  #
+            tl.standard._elementwise_min: functools.partial(_min_max, val_reduce_op=np.min, idx_reduce_op=None),  #
+            tl.standard._sum_combine: _sum,  #
         }
-        if fn not in mapping:
-            raise ValueError(f"fn {fn} not supported")
-        return mapping[fn](input, axis=axis, keepdims=keep_dims)
+        if combine_fn not in mapping:
+            # Fall back to the slow mode
+            return _generic_reduce(input, axis, combine_fn, keep_dims)
+        return mapping[combine_fn](input, axis=axis, keepdims=keep_dims)
 
     def _new_scan(input, axis, combine_fn, **kwargs):
-        fn = combine_fn.fn.__name__
         mapping = {
-            "_sum_combine": np.cumsum,
+            tl.standard._sum_combine: np.cumsum,
         }
-        ret = mapping[fn](input.handle.data, axis=axis)
+        ret = mapping[combine_fn](input.handle.data, axis=axis)
         ret_type = tl.block_type(input.dtype, ret.shape)
         return tl.core.tensor(TensorHandle(ret, input.dtype), ret_type)
 
-    def _new_reduce_scan_wrapper(mode, input, axis=None, **kwargs):
-        impl_fn = _new_scan if mode.startswith("cum") else _new_reduce
-        mode = mode[3:] if mode.startswith("cum") else mode
-        combine_fn = {
-            "min": tl.standard._elementwise_min,
-            "max": tl.standard._elementwise_max,
-            "sum": tl.standard._sum_combine,
-            "argmin": tl.standard._argmin_combine,
-            "argmax": tl.standard._argmax_combine,
-        }
-        if mode not in combine_fn:
-            raise ValueError(f"mode {mode} not supported")
-        return impl_fn(input, axis, combine_fn[mode], **kwargs)
+    tl.reduce = _new_reduce
+    tl.associative_scan = _new_scan
+    # FIXME(Keren): This is a workaround because some core functions use core.reduce but not tl.reduce
+    tl.core.reduce = _new_reduce
+    tl.core.associative_scan = _new_scan
+
+
+def _patch_lang_core(lang, builder):
+    for name, member in inspect.getmembers(lang):
+        if tl.core.is_builtin(member):
+            _patch_attr(lang, name, member, builder)
 
     def _new_to_ir(self, builder):
         # We need to specify signedness for integer types in the numpy mode
@@ -611,14 +660,6 @@ def _new_to_ir(self, builder):
             return builder.get_double_ty()
         raise ValueError(f'fail to convert {self} to ir type')
 
-    lang.reduce = _new_reduce
-    lang.min = functools.partial(_new_reduce_scan_wrapper, "min")
-    lang.max = functools.partial(_new_reduce_scan_wrapper, "max")
-    lang.sum = functools.partial(_new_reduce_scan_wrapper, "sum")
-    lang.argmin = functools.partial(_new_reduce_scan_wrapper, "argmin")
-    lang.argmax = functools.partial(_new_reduce_scan_wrapper, "argmax")
-    lang.cumsum = functools.partial(_new_reduce_scan_wrapper, "cumsum")
-
     # can't just map lang.static_range to `range`, because `tl.static_range`
     # can get `step` passed by keyword
     def _new_range(arg1, arg2=None, step=None, **kwargs):
@@ -638,6 +679,8 @@ def _new_static_assert(cond, msg=""):
     lang.static_assert = _new_static_assert
     lang.dtype.to_ir = _new_to_ir
 
+    _patch_reduce_scan(lang)
+
 
 def _patch_lang_math(lang):
     mapping = {