diff --git a/cinn/hlir/op/contrib/argmax_test.cc b/cinn/hlir/op/contrib/argmax_test.cc index 419fb081c1..3b42e73f1f 100644 --- a/cinn/hlir/op/contrib/argmax_test.cc +++ b/cinn/hlir/op/contrib/argmax_test.cc @@ -94,7 +94,7 @@ void TestGenerateCodeCpu_Argmax_Keep(void* _args, int32_t num_args) for (int32_t j = 0; j < 3; j += 1) { for (int32_t k = 0; k < 28; k += 1) { for (int32_t a = 0; a < 28; a += 1) { - test_argmax_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_find_int_nd(_test_argmax_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784); + test_argmax_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_next_smallest_int32(_test_argmax_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784); }; }; }; diff --git a/cinn/hlir/op/contrib/argmin_test.cc b/cinn/hlir/op/contrib/argmin_test.cc index 98e9bdb8d5..bfe053f101 100644 --- a/cinn/hlir/op/contrib/argmin_test.cc +++ b/cinn/hlir/op/contrib/argmin_test.cc @@ -93,7 +93,7 @@ void TestGenerateCodeCpu_Argmin_Keep(void* _args, int32_t num_args) for (int32_t j = 0; j < 3; j += 1) { for (int32_t k = 0; k < 28; k += 1) { for (int32_t a = 0; a < 28; a += 1) { - test_argmin_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_find_int_nd(_test_argmin_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784); + test_argmin_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_next_smallest_int32(_test_argmin_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784); }; }; }; diff --git a/cinn/hlir/op/contrib/sort.cc b/cinn/hlir/op/contrib/sort.cc index 927949f13b..e8471bcb4a 100644 --- a/cinn/hlir/op/contrib/sort.cc +++ b/cinn/hlir/op/contrib/sort.cc @@ -56,9 +56,9 @@ std::vector ArgSort(const ir::Tensor &A, std::string find_func_name; std::string index_func_name; if (target.arch == common::Target::Arch::NVGPU) { - find_func_name.assign("cinn_cuda_find_int_nd"); + find_func_name.assign("cinn_nvgpu_next_smallest_int32"); } else if (target.arch == common::Target::Arch::X86) { - find_func_name.assign("cinn_host_find_int_nd"); + find_func_name.assign("cinn_host_next_smallest_int32"); } else { LOG(FATAL) << "ArgSort only supports X86 and NVGPU ! Please Check.\n"; } diff --git a/cinn/hlir/op/contrib/sort_test.cc b/cinn/hlir/op/contrib/sort_test.cc index 860eef32e3..e5f990ba11 100644 --- a/cinn/hlir/op/contrib/sort_test.cc +++ b/cinn/hlir/op/contrib/sort_test.cc @@ -112,7 +112,7 @@ void TestGenerateCodeCpu_Sort(void* _args, int32_t num_args) }; for (int32_t i = 0; i < 4; i += 1) { for (int32_t j = 0; j < 28; j += 1) { - test_sort_out_index[((28 * i) + j)] = cinn_host_find_int_nd(_test_sort_out_index_temp, 28, j, (28 * i), 1); + test_sort_out_index[((28 * i) + j)] = cinn_host_next_smallest_int32(_test_sort_out_index_temp, 28, j, (28 * i), 1); }; }; for (int32_t i = 0; i < 4; i += 1) { diff --git a/cinn/runtime/cpu/host_intrinsics.cc b/cinn/runtime/cpu/host_intrinsics.cc index 2ce67f7b86..a9d1cd0e38 100644 --- a/cinn/runtime/cpu/host_intrinsics.cc +++ b/cinn/runtime/cpu/host_intrinsics.cc @@ -64,6 +64,20 @@ inline int cinn_host_find_float_nd(const cinn_buffer_t* buf, int size, float num #undef __cinn_host_find_kernel +inline int cinn_host_next_smallest_int32(cinn_buffer_t* buf, int size, int num, int begin, int stride) { + int id = -1; + for (int i = begin; i < begin + size * stride; i += stride) { + if (id == -1 || reinterpret_cast(buf->memory)[i] < reinterpret_cast(buf->memory)[id]) { + id = i; + } + } + if (id != -1) { + reinterpret_cast(buf->memory)[id] = 2147483647; + return (id - begin) / stride; + } + return -1; +} + #define CINN_HOST_LT_NUM(TYPE_SUFFIX, TYPE) \ inline int cinn_host_lt_num_##TYPE_SUFFIX( \ const cinn_buffer_t* buf, const int size, const TYPE num, const int offset, const int stride) { \ @@ -349,6 +363,15 @@ CINN_REGISTER_HELPER(host_intrinsics) { .AddInputType() .End(); + REGISTER_EXTERN_FUNC_HELPER(cinn_host_next_smallest_int32, host_target) + .SetRetType() + .AddInputType() + .AddInputType() + .AddInputType() + .AddInputType() + .AddInputType() + .End(); + #define _REGISTER_CINN_HOST_LT_NUM(TYPE_SUFFIX, TYPE) \ REGISTER_EXTERN_FUNC_HELPER(cinn_host_lt_num_##TYPE_SUFFIX, host_target) \ .SetRetType() \ diff --git a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh index 7d0b75e476..2024f2badc 100644 --- a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh +++ b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh @@ -1,7 +1,12 @@ /** * \file This file contains all the intrinsics available to be used in CUDA code generated by CodeGen. */ + extern "C" { + +#define CINN_INT32_MAX 2147483647 +#define CINN_INT32_MIN -2147483648 + // *************************************************************** // // bool unary and binary operator #define FN_BOOL(func) cinn_nvgpu_##func##_bool @@ -364,8 +369,8 @@ __device__ inline float16 FN_FP16(pow)(float16 a, float16 b) { #define EXPAND_REDUCE_INT32_MARCO(MARCO, ...) \ MARCO(sum_int32, 0, int, ##__VA_ARGS__) \ MARCO(prod_int32, 1, int, ##__VA_ARGS__) \ - MARCO(max_int32, -2147483648, int, ##__VA_ARGS__) \ - MARCO(min_int32, 2147483647, int, ##__VA_ARGS__) + MARCO(max_int32, CINN_INT32_MIN, int, ##__VA_ARGS__) \ + MARCO(min_int32, CINN_INT32_MAX, int, ##__VA_ARGS__) __device__ inline int cinn_sum_int32(const int left, const int right) { return left + right; } __device__ inline int cinn_prod_int32(const int left, const int right) { return left * right; } @@ -634,6 +639,20 @@ __device__ inline int cinn_cuda_find_float_nd(const float *buf, int size, float #undef __cinn_cuda_find_kernel +__device__ inline int cinn_nvgpu_next_smallest_int32(int *buf, int size, int num, int begin, int stride) { + int id = -1; + for (int i = begin; i < begin + size * stride; i += stride) { + if (id == -1 || buf[i] < buf[id]) { + id = i; + } + } + if (id != -1) { + buf[id] = CINN_INT32_MAX; + return (id - begin) / stride; + } + return -1; +} + #define __cinn_cuda_find_from_kernel(buf, size, num, begin) \ do { \ for (int i = begin; i < size; ++i) { \ @@ -812,6 +831,8 @@ __device__ int cinn_cuda_resize_bicubic(const int *buf, // *************************************************************** // // end of macro undef +#undef CINN_INT32_MAX +#undef CINN_INT32_MIN #undef FN_BOOL #undef FN_UINT8 #undef FN_INT8 diff --git a/cinn/runtime/cuda/cuda_intrinsics.cc b/cinn/runtime/cuda/cuda_intrinsics.cc index bdb1005ebd..40bce455ff 100644 --- a/cinn/runtime/cuda/cuda_intrinsics.cc +++ b/cinn/runtime/cuda/cuda_intrinsics.cc @@ -300,6 +300,15 @@ CINN_REGISTER_HELPER(cuda_intrinsics) { .AddInputType() .End(); + REGISTER_FACKED_EXTERN_FUNC_HELPER(cinn_nvgpu_next_smallest_int32, target) + .SetRetType() + .AddInputType() + .AddInputType() + .AddInputType() + .AddInputType() + .AddInputType() + .End(); + #define _REGISTER_CINN_NVGPU_LT_NUM(TYPE_SUFFIX, TYPE) \ REGISTER_FACKED_EXTERN_FUNC_HELPER(cinn_nvgpu_lt_num_##TYPE_SUFFIX, target) \ .SetRetType() \ diff --git a/python/tests/ops/op_test_helper.py b/python/tests/ops/op_test_helper.py index f9796f6026..eb8c99889a 100644 --- a/python/tests/ops/op_test_helper.py +++ b/python/tests/ops/op_test_helper.py @@ -18,6 +18,9 @@ import unittest import re +from unittest import suite +from typing import Union, List + parser = argparse.ArgumentParser(description="Argparse for op test helper") parser.add_argument( "--case", @@ -104,3 +107,17 @@ def run(self): res = runner.run(test_suite) if not res.wasSuccessful(): sys.exit(not res.wasSuccessful()) + + +def run_test(test_class: Union[suite.TestSuite, List[suite.TestSuite]]): + test_suite = unittest.TestSuite() + test_loader = unittest.TestLoader() + if isinstance(test_class, type): + test_suite.addTests(test_loader.loadTestsFromTestCase(test_class)) + else: + for cls in test_class: + test_suite.addTests(test_loader.loadTestsFromTestCase(cls)) + runner = unittest.TextTestRunner() + res = runner.run(test_suite) + if not res.wasSuccessful(): + sys.exit(not res.wasSuccessful()) diff --git a/python/tests/ops/test_sort_op.py b/python/tests/ops/test_sort_op.py index 9230fcc4fb..7708d58e86 100644 --- a/python/tests/ops/test_sort_op.py +++ b/python/tests/ops/test_sort_op.py @@ -14,45 +14,41 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest -import numpy as np -from op_test import OpTest, OpTestTool import paddle -import paddle.nn.functional as F -import cinn from cinn.frontend import * from cinn.common import * +from op_test import OpTest +from op_test_helper import TestCaseHelper, run_test -@OpTestTool.skip_if(not is_compiled_with_cuda(), - "x86 test will be skipped due to timeout.") class TestSortOp(OpTest): def setUp(self): - self.init_case() + print(f"\nRunning {self.__class__.__name__}: {self.case}") + self.inputs = {} + self.prepare_inputs() - def init_case(self): + def prepare_inputs(self): self.inputs = { - "x1": np.random.random([ - 2, - 4, - ]).astype("float32") + "x": self.random(self.case["shape"], self.case["dtype"]) } - self.axis = 1 - self.descending = False + self.axis = self.case["axis"] + self.descending = self.case["descending"] def build_paddle_program(self, target): - x1 = paddle.to_tensor(self.inputs["x1"], stop_gradient=True) + x1 = paddle.to_tensor(self.inputs["x"], stop_gradient=True) out = paddle.sort(x1, self.axis, self.descending) self.paddle_outputs = [out] def build_cinn_program(self, target): - builder = NetBuilder("sum") - x1 = builder.create_input(Float(32), self.inputs["x1"].shape, "x1") + builder = NetBuilder("sort") + x1 = builder.create_input( + self.nptype2cinntype(self.inputs["x"].dtype), + self.inputs["x"].shape, "x") out = builder.sort(x1, self.axis, not self.descending) prog = builder.build() forward_res = self.get_cinn_output(prog, target, [x1], - [self.inputs["x1"]], [out]) + [self.inputs["x"]], [out]) self.cinn_outputs = forward_res @@ -60,41 +56,173 @@ def test_check_results(self): self.check_outputs_and_grads() -class TestSortCase1(TestSortOp): - def init_case(self): - self.inputs = { - "x1": np.random.random([ - 2, - 4, - ]).astype("float32") - } +class TestSortOpDumpicateElement(TestSortOp): + def setUp(self): + self.inputs = {} + self.prepare_inputs() + + def prepare_inputs(self): + self.inputs = {"x": self.random([128], "int64", -10, 10)} self.axis = 0 self.descending = False -class TestSortCase2(TestSortOp): - def init_case(self): - self.inputs = { - "x1": np.random.random([ - 2, - 4, - ]).astype("float32") - } +# This test case will cause CINN to allocate a large amount of GPU memory, nearly 10 GB. +class TestSortOpLargeCudaMemoryOccupation(TestSortOp): + def setUp(self): + self.inputs = {} + self.prepare_inputs() + + def prepare_inputs(self): + self.inputs = {"x": self.random([8192], "float64")} self.axis = 0 - self.descending = True + self.descending = False -class TestSortCase3(TestSortOp): - def init_case(self): - self.inputs = { - "x1": np.random.random([ - 2, - 4, - ]).astype("float32") - } - self.axis = 1 - self.descending = True +class TestSortOpShapeTest(TestCaseHelper): + def init_attrs(self): + self.class_name = "TestSortOpShapeTest" + self.cls = TestSortOp + self.inputs = [ + { + "shape": [512], + }, + { + "shape": [1024], + }, + { + "shape": [1200], + }, + { + "shape": [64, 16], + }, + { + "shape": [4, 32, 8], + }, + { + "shape": [16, 8, 4, 2], + }, + { + "shape": [2, 8, 4, 2, 5], + }, + { + "shape": [4, 8, 1, 2, 16], + }, + { + "shape": [1], + }, + { + "shape": [1, 1, 1, 1], + }, + { + "shape": [1, 1, 1, 1, 1], + }, + # TODO: known issue cinn/hlir/op/contrib/sort.cc:201 + # the array will exceed the cuda kernel stack size limit + # { + # "shape": [32768], + # }, + # { + # "shape": [65536], + # }, + # { + # "shape": [131072], + # }, + ] + self.dtypes = [{"dtype": "float32"}] + self.attrs = [{"axis": 0, "descending": False}] + + +class TestSortOpDtypeTest(TestCaseHelper): + def init_attrs(self): + self.class_name = "TestSortOpDtypeTest" + self.cls = TestSortOp + self.inputs = [ + { + "shape": [1024], + }, + { + "shape": [64, 16], + }, + { + "shape": [4, 32, 8], + }, + { + "shape": [16, 8, 4, 2], + }, + ] + self.dtypes = [ + { + "dtype": "float32" + }, + { + "dtype": "float64" + }, + { + "dtype": "int32" + }, + { + "dtype": "int64" + }, + ] + self.attrs = [{"axis": 0, "descending": False}] + + +class TestSortOpAxisTest(TestCaseHelper): + def init_attrs(self): + self.class_name = "TestSortOpAttrsTest" + self.cls = TestSortOp + self.inputs = [ + { + "shape": [16, 8, 4, 2], + }, + ] + self.dtypes = [{"dtype": "float32"}] + self.attrs = [{ + "axis": 0, + "descending": False + }, { + "axis": 1, + "descending": False + }, { + "axis": 2, + "descending": False + }, { + "axis": 3, + "descending": False + }] + + +class TestSortOpDescedingTest(TestSortOpShapeTest): + def init_attrs(self): + self.class_name = "TestSortOpDescedingTest" + self.cls = TestSortOp + self.inputs = [ + { + "shape": [16, 8, 4, 2], + }, + ] + self.dtypes = [{"dtype": "float32"}] + self.attrs = [{ + "axis": 0, + "descending": True + }, { + "axis": 1, + "descending": True + }, { + "axis": 2, + "descending": True + }, { + "axis": 3, + "descending": True + }] if __name__ == "__main__": - unittest.main() + run_test(TestSortOpDumpicateElement) + # run_test(TestSortOpLargeCudaMemoryOccupation) + + TestSortOpShapeTest().run() + TestSortOpDtypeTest().run() + TestSortOpAxisTest().run() + TestSortOpDescedingTest().run()