Commit e33f3fc

superbobry authored and Google-ML-Automation committed
[pallas:mosaic_gpu] Added support for reductions to the WG lowering
Note that

* we have no easy way of testing multi-reductions at the moment;
* `reduce_max` assumes the WGMMA_ROW layout, which is not currently supported by the dialect lowering AFAICT.

PiperOrigin-RevId: 736138554
1 parent d89835a commit e33f3fc
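
For context, here is a minimal sketch of the user-level code this lowering path targets. It mirrors the test added in this commit; the import aliases and the GPUCompilerParams/thread_semantics spelling are copied from that test rather than from separately documented API.

import functools
import jax
import jax.numpy as jnp
from jax.experimental import pallas as pl
from jax.experimental.pallas import mosaic_gpu as plgpu


@functools.partial(
    pl.pallas_call,
    out_shape=jax.ShapeDtypeStruct((128,), jnp.float32),
    compiler_params=plgpu.GPUCompilerParams(
        thread_semantics=plgpu.ThreadSemantics.Warpgroup
    ),
)
def sum_kernel(x_ref, o_ref):
  # jnp.sum lowers through lax.reduce_sum_p, which now has a Warpgroup rule.
  o_ref[...] = jnp.broadcast_to(jnp.sum(x_ref[...]), o_ref.shape)


x = jnp.arange(128, dtype=jnp.float32)
result = sum_kernel(x)  # every element equals jnp.sum(x)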

5 files changed: +124 -9 lines changed

jax/_src/pallas/mosaic_gpu/lowering.py (+54)

@@ -1543,6 +1543,60 @@ def _reduce_max_lowering_rule(ctx: LoweringRuleContext, x, *, axes):
       raise NotImplementedError(f"Unsupported layout {x.layout}")


+def _reduce_lowering_rule_wg(
+    kind: vector_dialect.CombiningKind,
+    acc: object,
+    ctx: LoweringRuleContext,
+    x,
+    *,
+    axes,
+) -> ir.OpView:
+  [x_aval] = ctx.avals_in
+  [out_aval] = ctx.avals_out
+  x = _ensure_ir_value(x, x_aval.dtype)
+  out_type = mgpu_utils.dtype_to_ir_type(out_aval.dtype)
+  if not out_aval.shape:
+    # Special-case: reducing to a scalar.
+    if x_aval.ndim != 1:
+      # TODO(slebedev): Flatten to 1D, since vector.reduction only supports
+      # 1D inputs.
+      raise NotImplementedError("Only 1D inputs are supported")
+    return vector_dialect.ReductionOp(out_type, kind, x)
+  acc = vector_dialect.splat(
+      ir.VectorType.get(out_aval.shape, out_type),
+      _ensure_ir_value(acc, out_aval.dtype),
+  )
+  return vector_dialect.MultiDimReductionOp(kind, x, acc, axes)
+
+
+@register_lowering_rule(lax.reduce_sum_p, mgpu.ThreadSemantics.Warpgroup)
+def _reduce_sum_lowering_rule_wg(ctx: LoweringRuleContext, x, *, axes):
+  op = _reduce_lowering_rule_wg(
+      vector_dialect.CombiningKind.ADD, 0, ctx, x, axes=axes
+  )
+  op.attributes["offset"] = ir.IntegerAttr.get(
+      ir.IntegerType.get_signless(32), ctx.module_ctx.smem_used_bytes
+  )
+  return op.result
+
+
+@register_lowering_rule(lax.reduce_max_p, mgpu.ThreadSemantics.Warpgroup)
+def _reduce_max_lowering_rule_wg(ctx: LoweringRuleContext, x, *, axes):
+  [x_aval] = ctx.avals_in
+  if jnp.issubdtype(x_aval.dtype, jnp.floating):
+    kind = vector_dialect.CombiningKind.MAXIMUMF
+    acc = float("-inf")
elif jnp.issubdtype(x_aval.dtype, jnp.signedinteger):
1590+
kind = vector_dialect.CombiningKind.MAXSI
1591+
acc = np.iinfo(x_aval.dtype).max
1592+
elif jnp.issubdtype(x_aval.dtype, jnp.unsignedinteger):
1593+
kind = vector_dialect.CombiningKind.MAXUI
1594+
acc = np.iinfo(x_aval.dtype).max
1595+
else:
1596+
raise NotImplementedError(f"Unsupported dtype {x_aval.dtype}")
1597+
return _reduce_lowering_rule_wg(kind, acc, ctx, x, axes=axes).result
1598+
1599+
15461600
@register_lowering_rule(lax.axis_index_p, mgpu.ThreadSemantics.Lane)
15471601
def _axis_index_rule(ctx: LoweringRuleContext, *, axis_name: Hashable):
15481602
i32 = ir.IntegerType.get_signless(32)
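
The rule above picks between two vector ops: a reduction to a scalar (empty output shape) maps to vector.reduction, while a reduction that keeps some axes maps to vector.multi_reduction seeded with a splatted identity accumulator (0 for add, -inf or the dtype minimum for max). A reference-semantics sketch in plain jax.numpy, written for this note rather than taken from the commit:

import jax.numpy as jnp

x = jnp.arange(6, dtype=jnp.float32).reshape(2, 3)

# Empty output shape: the whole array collapses to one scalar, which is what
# vector.reduction expresses (and why the rule insists on a 1D input for now).
full_sum = jnp.sum(x)                # shape ()

# Non-empty output shape: reduce the selected axes into an accumulator that
# starts at the identity element, mirroring vector.multi_reduction with a
# splatted acc.
acc = jnp.zeros((3,), jnp.float32)   # splat of the identity for "add"
row_sum = acc + jnp.sum(x, axis=0)   # shape (3,)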

jax/experimental/mosaic/gpu/dialect_lowering.py (+36 -6)

@@ -320,6 +320,34 @@ def _vector_splat_op_lowering_rule(
   return [_fragmented_array_to_ir(fragmented_array, out_vec_ty)]


+@_register_lowering(vector.ReductionOp)
+def _vector_reduction_op_lowering_rule(
+    ctx: LoweringContext, op: vector.ReductionOp
+) -> Sequence[ir.Value]:
+  del ctx  # Unused.
+  [layout] = inference_utils.in_layouts(op)
+  () = inference_utils.out_layouts(op)
+  element_type = ir.VectorType(op.vector.type).element_type
+  is_signed = False if ir.IntegerType.isinstance(element_type) else None
+  a = _fragmented_array_from_ir(op.vector, layout, is_signed)
+  match str(op.kind):
+    case "#vector.kind<add>":
+      smem = ir.Attribute.parse("#gpu.address_space<workgroup>")
+      scratch = _slice_smem(
+          ir.MemRefType.get([4], element_type, memory_space=smem),
+          arith.constant(None, op.attributes["offset"]),
+      )
+      result = a.reduce_sum(scratch)
+    case (
+        "#vector.kind<maxsi>" | "#vector.kind<maxui>" | "#vector.kind<maximumf>"
+    ):
+      # TODO(slebedev): Implement this and remove the raise below.
+      raise NotImplementedError(f"Unsupported reduction kind: {op.kind}")
+    case _:
+      raise NotImplementedError(f"Unsupported reduction kind: {op.kind}")
+  return [_fragmented_array_to_ir(result, op.result.type)]
+
+
 def memref_layout_to_swizzle_and_transforms(
     layout: ir.Attribute,
 ) -> tuple[mgpu.SwizzlingMode, tuple[launch_context.MemRefTransform, ...]]:

@@ -713,16 +741,17 @@ def _mgpu_slice_smem_op_lowering_rule(
     ctx: LoweringContext, op: SliceSMEMOp
 ) -> Sequence[ir.Value]:
   del ctx
+  return [_slice_smem(op.result.type, op.offset)]
+
+
+def _slice_smem(result: ir.Type, offset: ir.Value):
   i8 = ir.IntegerType.get_signless(8)
   smem = ir.Attribute.parse("#gpu.address_space<workgroup>")
-
   smem_base = gpu.dynamic_shared_memory(
       ir.MemRefType.get((utils.DYNAMIC,), i8, memory_space=smem)
   )
-
-  offset = arith.index_cast(ir.IndexType.get(), op.offset)
-
-  return [memref.view(op.result.type, smem_base, offset, [])]
+  offset = arith.index_cast(ir.IndexType.get(), offset)
+  return memref.view(result, smem_base, offset, [])


 @_register_lowering(scf.ForOp)

@@ -866,7 +895,8 @@ def _should_lower(op: ir.OpView) -> bool:


 def lower_mgpu_dialect(
-    module: ir.Module, launch_context: launch_context.LaunchContext | None
+    module: ir.Module,
+    launch_context: launch_context.LaunchContext | None,
 ):
   # TODO(apaszke,bchetioui): Make sure the layouts match.
   # TODO(bchetioui): rethink this API. It doesn't make sense to pass in a full
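
In the vector.ReductionOp rule above, the add case hands FragmentedArray.reduce_sum a 4-element SMEM scratch sliced at the byte offset stored in the op's "offset" attribute. My reading (an assumption, not spelled out in the commit) is that the scratch holds one partial sum per warp of the warpgroup; a NumPy sketch of that two-stage reduction:

import numpy as np

NUM_WARPS = 4  # assumed: one scratch slot per warp in a 128-thread warpgroup

x = np.arange(128, dtype=np.float32)
scratch = np.zeros(NUM_WARPS, dtype=np.float32)

# Stage 1: each warp reduces the elements it owns and publishes one partial.
for w, chunk in enumerate(np.split(x, NUM_WARPS)):
  scratch[w] = chunk.sum()

# Stage 2: every thread reads the staged partials back and combines them.
total = scratch.sum()
assert total == x.sum()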

jax/experimental/mosaic/gpu/fragmented_array.py (+1 -1)

@@ -1389,7 +1389,7 @@ def reduce_sum(self, scratch: ir.Value | None = None):
     if isinstance(self.layout, WGSplatFragLayout):
       [reg] = self.registers.flat
       if ir.FloatType.isinstance(self.mlir_dtype):
-        op = arith.mulf
+        op = mulf
       elif ir.IntegerType.isinstance(self.mlir_dtype):
         op = arith.muli
       else:

jax/experimental/mosaic/gpu/layout_inference.py (+6)

@@ -336,6 +336,12 @@ def _infer_splat_op_layout(splat_op: vector.SplatOp) -> OptionalLayouts:

   return [], [layout]

+@partial(_add_layout_inference_rule, vector.ReductionOp)
+def _infer_reduction_op_layout(op: vector.ReductionOp) -> OptionalLayouts:
+  if layout := inference_utils.value_layout(op.vector):
+    return [layout], []
+  return None
+

 @partial(_add_layout_inference_rule, mgpu.WGMMAOp)
 def _infer_wgmma_op_layout(wgmma_op: mgpu.WGMMAOp) -> OptionalLayouts:

tests/pallas/mosaic_gpu_test.py (+27 -2)

@@ -184,6 +184,23 @@ def kernel(x_ref, y_ref, o_ref):
     y = jnp.flip(x).reshape(1, 256)
     np.testing.assert_array_equal(kernel(x, y), x + y[0])

+  @parameterized.product(
+      shape=[(128,)], thread_semantics=[*plgpu.ThreadSemantics]
+  )
+  def test_reduce_sum(self, shape, thread_semantics):
+    @functools.partial(
+        pl.pallas_call,
+        out_shape=jax.ShapeDtypeStruct(shape, jnp.float32),
+        compiler_params=plgpu.GPUCompilerParams(
+            thread_semantics=thread_semantics
+        ),
+    )
+    def kernel(x_ref, o_ref):
+      o_ref[...] = jnp.broadcast_to(_sum_same_dtype(x_ref[...]), o_ref.shape)
+
+    x = jnp.arange(math.prod(shape)).reshape(shape).astype(jnp.float32)
+    np.testing.assert_array_equal(kernel(x), jnp.sum(x))
+
   def test_reshape(self):
     shape1, shape2 = (128,), (2, 16, 4)


@@ -200,10 +217,14 @@ def kernel(x_ref, out_ref):
     x = jnp.arange(math.prod(shape1)).astype(jnp.float32)
     np.testing.assert_array_equal(kernel(x), x.reshape(shape2))

-  def test_add_xy_indexed(self):
+  @parameterized.product(thread_semantics=[*plgpu.ThreadSemantics])
+  def test_add_xy_indexed(self, thread_semantics):
     @functools.partial(
         pl.pallas_call,
         out_shape=jax.ShapeDtypeStruct([128], jnp.float32),
+        compiler_params=plgpu.GPUCompilerParams(
+            thread_semantics=thread_semantics
+        ),
     )
     def kernel(x_ref, y_ref, o_ref):
       idx = _sum_same_dtype(y_ref[...])

@@ -1078,10 +1099,14 @@ def kernel(x_ref, o_ref):

     self.assertIn("acc % 2", output())

-  def test_cond_returning_array(self):
+  @parameterized.parameters([*plgpu.ThreadSemantics])
+  def test_cond_returning_array(self, thread_semantics):
     @functools.partial(
         pl.pallas_call,
         out_shape=jax.ShapeDtypeStruct([256], jnp.int32),
+        compiler_params=plgpu.GPUCompilerParams(
+            thread_semantics=thread_semantics
+        ),
     )
     def kernel(x_ref, o_ref):
       acc = _sum_same_dtype(x_ref[...])
