From ce8060962fbac86cca578b068fcdf2f3addebe9c Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Mon, 7 Jun 2021 16:51:56 +0300 Subject: [PATCH] [Metal] Reduce number of threads for reduction layers Reduced the default number of threads in reduction kernels for Metal. Default code generation produced a thread block of the following size: 32x32x1. With this size, the number of threads per threadgroup was equal to 1024 (32 * 32 * 1). Sometimes the device doesn't have enough resources, and in this case we will get an exception that the block size is greater than the value of maxTotalThreadsPerThreadgroup. To prevent such a situation, we decrease the default number of threads. With this fix, every model should work with the default codegen, and auto-tuning or auto-scheduling will select the optimal number of threads. --- include/tvm/topi/cuda/reduction.h | 2 +- python/tvm/topi/cuda/reduction.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/topi/cuda/reduction.h b/include/tvm/topi/cuda/reduction.h index 7160419422a6..51f35ed8dc25 100644 --- a/include/tvm/topi/cuda/reduction.h +++ b/include/tvm/topi/cuda/reduction.h @@ -70,7 +70,7 @@ Schedule ScheduleReduce(const Target& target, Operation op, Schedule sch, if (out_stage->op.as()->axis.size() > 0) { all_reduce = false; num_thread = 32; - if (target->kind->name == "opencl") { + if (target->kind->name == "opencl" || target->kind->name == "metal") { // Without this, CL_INVALID_WORK_GROUP_SIZE occurs with python tests. // Don't know why. 
num_thread = 16; diff --git a/python/tvm/topi/cuda/reduction.py b/python/tvm/topi/cuda/reduction.py index ceab71640533..b9d02d9c81d8 100644 --- a/python/tvm/topi/cuda/reduction.py +++ b/python/tvm/topi/cuda/reduction.py @@ -37,7 +37,7 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): all_reduce = False num_thread = 32 target = tvm.target.Target.current() - if target and target.kind.name == "opencl": + if target and (target.kind.name == "opencl" or target.kind.name == "metal"): # without it, CL_INVALID_WORK_GROUP_SIZE occurred when running test_topi_reduce.py # don't know why num_thread = 16