From ce8060962fbac86cca578b068fcdf2f3addebe9c Mon Sep 17 00:00:00 2001 From: Egor Churaev Date: Mon, 7 Jun 2021 16:51:56 +0300 Subject: [PATCH] [Metal] Reduce number of threads for reduction layers Reduced the default number of threads in reduction kernels for Metal. Default code generation produced a thread block of the following size: 32x32x1. With this size, the number of threads per threadgroup was equal to 1024 (32 * 32 * 1). Sometimes the device doesn't have enough resources, and in this case we will get an exception that the block size is greater than the value of maxTotalThreadsPerThreadgroup. To prevent such a situation, we decrease the default number of threads. With this fix, every model should work with the default codegen, and auto-tuning or auto-scheduling will select the optimal number of threads. --- include/tvm/topi/cuda/reduction.h | 2 +- python/tvm/topi/cuda/reduction.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/tvm/topi/cuda/reduction.h b/include/tvm/topi/cuda/reduction.h index 7160419422a6..51f35ed8dc25 100644 --- a/include/tvm/topi/cuda/reduction.h +++ b/include/tvm/topi/cuda/reduction.h @@ -70,7 +70,7 @@ Schedule ScheduleReduce(const Target& target, Operation op, Schedule sch, if (out_stage->op.as()->axis.size() > 0) { all_reduce = false; num_thread = 32; - if (target->kind->name == "opencl") { + if (target->kind->name == "opencl" || target->kind->name == "metal") { // Without this, CL_INVALID_WORK_GROUP_SIZE occurs with python tests. // Don't know why. 
num_thread = 16; diff --git a/python/tvm/topi/cuda/reduction.py b/python/tvm/topi/cuda/reduction.py index ceab71640533..b9d02d9c81d8 100644 --- a/python/tvm/topi/cuda/reduction.py +++ b/python/tvm/topi/cuda/reduction.py @@ -37,7 +37,7 @@ def _schedule_reduce(op, sch, is_idx_reduce=False): all_reduce = False num_thread = 32 target = tvm.target.Target.current() - if target and target.kind.name == "opencl": + if target and (target.kind.name == "opencl" or target.kind.name == "metal"): # without it, CL_INVALID_WORK_GROUP_SIZE occurred when running test_topi_reduce.py # don't know why num_thread = 16