[SME] Add scalable fp16->fp32 dense schedule (#16981)

This commit extends the functionality of the SME dense and matmul schedules to support operations with fp16 inputs and an fp32 output, where `transpose_a=False` and `transpose_b=True`. For convenience, it also adds a utility called `get_vscale_expr`, which creates the correct multiplier for `vscale` given a data type, reflecting ideas from an early design of the [SVE](apache/tvm-rfcs#104) RFC.
apache · May 28, 2024 · 430e02f · 430e02f
1 parent 20d8c53
commit 430e02f
Show file tree

Hide file tree

Showing 13 changed files with 442 additions and 106 deletions.
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -21,7 +21,6 @@
 # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
 import re
 
-import tvm
 from tvm import relay, topi, tir
 from tvm.tir.schedule.analysis import has_block
 
@@ -684,9 +683,9 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
 
     if (
         target.features.has_sme
-        and data.dtype in ["float32"]
-        and weight.dtype in ["float32"]
-        and out_type.dtype in ["float32"]
+        and data.dtype in ["float32", "float16"]
+        and weight.dtype == data.dtype
+        and out_type.dtype == "float32"
         # The schedule uses tensorization which does not work when the
         # reduction axis has unit iters. See
         # https://github.com/apache/tvm/issues/16566
@@ -724,10 +723,12 @@ def matmul_strategy_arm_cpu(attrs, inputs, out_type, target):
 
     if (
         target.features.has_sme
-        and data.dtype in ["float32"]
-        and weight.dtype in ["float32"]
-        and out_type.dtype in ["float32"]
-        and not (attrs.transpose_a or attrs.transpose_b)
+        and data.dtype in ["float32", "float16"]
+        and weight.dtype == data.dtype
+        and out_type.dtype == "float32"
+        and not attrs.transpose_a
+        and not (data.dtype == "float16" and not attrs.transpose_b)
+        and not (data.dtype == "float32" and attrs.transpose_b)
         and len(data.shape) == 2
         # The schedule uses tensorization which does not work when the
         # reduction axis has unit iters. See
@@ -796,9 +797,13 @@ def arm_cpu_tir_strategy(sch: tir.Schedule) -> bool:
     """
     Strategy for arm_cpu STIR schedules.
     """
-    current_target = tvm.target.Target.current()
+    matmul_block = None
+    if has_block(sch, "T_matmul_NN"):
+        matmul_block = sch.get_block("T_matmul_NN")
+    elif has_block(sch, "T_matmul_NT"):
+        matmul_block = sch.get_block("T_matmul_NT")
 
-    if current_target.features.has_sme and has_block(sch, "matmul_sme_gemm"):
+    if matmul_block and sch.get(matmul_block).annotations.get("schedule_type", "") == "sme":
         topi.arm_cpu.matmul.tir_schedule_matmul_sme(sch)
         return True
 

diff --git a/python/tvm/testing/aot.py b/python/tvm/testing/aot.py
@@ -45,6 +45,8 @@
     "uint16": "uint16_t",
     "int32": "int32_t",
     "uint32": "uint32_t",
+    # See: https://gcc.gnu.org/onlinedocs/gcc/Half-Precision.html
+    "float16": "_Float16",
     "float32": "float",
 }
 

diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py
@@ -88,7 +88,7 @@
 from .op import q_multiply_shift, q_multiply_shift_per_axis, shift_left, shift_right
 from .op import TVMBackendAllocWorkspace, TVMBackendFreeWorkspace
 from .op import start_profile_intrinsic, end_profile_intrinsic
-from .op import vscale, get_active_lane_mask
+from .op import vscale, get_active_lane_mask, get_vscale_expr
 from .generic import add, subtract, multiply
 
 from .schedule import StmtSRef, BlockScope, ScheduleState, Schedule, ScheduleError

diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py
@@ -16,7 +16,7 @@
 # under the License.
 # pylint: disable=redefined-builtin, invalid-name
 """Operators used in TIR expression."""
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import tvm._ffi
 from tvm.ir import Array, Op, PrimExpr
@@ -3370,6 +3370,22 @@ def get_active_lane_mask(dtype, base, limit):
     return call_intrin(dtype, "tir.get_active_lane_mask", base, limit)
 
 
+def get_vscale_expr(dtype: Union[str, tvm.DataType], min_size: int = 128) -> PrimExpr:
+    """
+    Create a datatype dependent scalable expression.
+
+    The expression is ``(min_size // dtype.bits) * vscale()``: the number of
+    whole ``dtype`` elements that fit in ``min_size`` bits, multiplied by the
+    ``vscale`` intrinsic.
+
+    Parameters
+    ----------
+    dtype : Union[str, tvm.DataType]
+        Element data type. A string is converted to ``tvm.DataType`` first.
+        Only the ``bits`` attribute of the data type is consulted.
+    min_size : int
+        The minimum size of the scalable vector in bits.
+
+    Returns
+    -------
+    expr : PrimExpr
+        The per-dtype element count multiplied by ``vscale()``.
+    """
+    # Normalise string dtypes so that the `.bits` attribute is available below.
+    if isinstance(dtype, str):
+        dtype = tvm.DataType(dtype)
+    # Integer (floor) division: a dtype wider than `min_size` yields a zero multiplier.
+    return min_size // dtype.bits * vscale()
+
+
 # pylint: disable=unnecessary-lambda
 sum = comm_reducer(lambda x, y: x + y, lambda t: const(0, dtype=t), name="sum")
 min = comm_reducer(lambda x, y: _ffi_api._OpMin(x, y, None), max_value, name="min")  # type: ignore