PaddlePaddle · thisjiang · May 24, 2023 · May 9, 2023 · May 9, 2023 · May 10, 2023
diff --git a/cinn/hlir/op/contrib/argmax_test.cc b/cinn/hlir/op/contrib/argmax_test.cc
@@ -94,7 +94,7 @@ void TestGenerateCodeCpu_Argmax_Keep(void* _args, int32_t num_args)
     for (int32_t j = 0; j < 3; j += 1) {
       for (int32_t k = 0; k < 28; k += 1) {
         for (int32_t a = 0; a < 28; a += 1) {
-          test_argmax_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_find_int_nd(_test_argmax_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784);
+          test_argmax_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_next_smallest_int32(_test_argmax_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784);
         };
       };
     };

diff --git a/cinn/hlir/op/contrib/argmin_test.cc b/cinn/hlir/op/contrib/argmin_test.cc
@@ -93,7 +93,7 @@ void TestGenerateCodeCpu_Argmin_Keep(void* _args, int32_t num_args)
     for (int32_t j = 0; j < 3; j += 1) {
       for (int32_t k = 0; k < 28; k += 1) {
         for (int32_t a = 0; a < 28; a += 1) {
-          test_argmin_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_find_int_nd(_test_argmin_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784);
+          test_argmin_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_next_smallest_int32(_test_argmin_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784);
         };
       };
     };

diff --git a/cinn/hlir/op/contrib/sort.cc b/cinn/hlir/op/contrib/sort.cc
@@ -56,9 +56,9 @@ std::vector<ir::Tensor> ArgSort(const ir::Tensor &A,
   std::string find_func_name;
   std::string index_func_name;
   if (target.arch == common::Target::Arch::NVGPU) {
-    find_func_name.assign("cinn_cuda_find_int_nd");
+    find_func_name.assign("cinn_nvgpu_next_smallest_int32");
   } else if (target.arch == common::Target::Arch::X86) {
-    find_func_name.assign("cinn_host_find_int_nd");
+    find_func_name.assign("cinn_host_next_smallest_int32");
   } else {
     LOG(FATAL) << "ArgSort only supports X86 and NVGPU ! Please Check.\n";
   }

diff --git a/cinn/hlir/op/contrib/sort_test.cc b/cinn/hlir/op/contrib/sort_test.cc
@@ -112,7 +112,7 @@ void TestGenerateCodeCpu_Sort(void* _args, int32_t num_args)
   };
   for (int32_t i = 0; i < 4; i += 1) {
     for (int32_t j = 0; j < 28; j += 1) {
-      test_sort_out_index[((28 * i) + j)] = cinn_host_find_int_nd(_test_sort_out_index_temp, 28, j, (28 * i), 1);
+      test_sort_out_index[((28 * i) + j)] = cinn_host_next_smallest_int32(_test_sort_out_index_temp, 28, j, (28 * i), 1);
     };
   };
   for (int32_t i = 0; i < 4; i += 1) {

diff --git a/cinn/runtime/cpu/host_intrinsics.cc b/cinn/runtime/cpu/host_intrinsics.cc
@@ -64,6 +64,20 @@ inline int cinn_host_find_float_nd(const cinn_buffer_t* buf, int size, float num
 
 #undef __cinn_host_find_kernel
 
+inline int cinn_host_next_smallest_int32(cinn_buffer_t* buf, int size, int num, int begin, int stride) {
+  int id = -1;
+  for (int i = begin; i < begin + size * stride; i += stride) {
+    if (id == -1 || reinterpret_cast<int*>(buf->memory)[i] < reinterpret_cast<int*>(buf->memory)[id]) {
+      id = i;
+    }
+  }
+  if (id != -1) {
+    reinterpret_cast<int*>(buf->memory)[id] = 2147483647;
+    return (id - begin) / stride;
+  }
+  return -1;
+}
+
 #define CINN_HOST_LT_NUM(TYPE_SUFFIX, TYPE)                                                           \
   inline int cinn_host_lt_num_##TYPE_SUFFIX(                                                          \
       const cinn_buffer_t* buf, const int size, const TYPE num, const int offset, const int stride) { \
@@ -349,6 +363,15 @@ CINN_REGISTER_HELPER(host_intrinsics) {
       .AddInputType<int>()
       .End();
 
+  REGISTER_EXTERN_FUNC_HELPER(cinn_host_next_smallest_int32, host_target)
+      .SetRetType<int>()
+      .AddInputType<cinn_buffer_t*>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .End();
+
 #define _REGISTER_CINN_HOST_LT_NUM(TYPE_SUFFIX, TYPE)                      \
   REGISTER_EXTERN_FUNC_HELPER(cinn_host_lt_num_##TYPE_SUFFIX, host_target) \
       .SetRetType<int>()                                                   \

diff --git a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
@@ -1,7 +1,12 @@
 /**
  * \file This file contains all the intrinsics available to be used in CUDA code generated by CodeGen.
  */
+
 extern "C" {
+
+#define CINN_INT32_MAX 2147483647
+#define CINN_INT32_MIN -2147483648
+
 // *************************************************************** //
 // bool unary and binary operator
 #define FN_BOOL(func) cinn_nvgpu_##func##_bool
@@ -292,8 +297,8 @@ __device__ inline float16 FN_FP16(pow)(float16 a, float16 b) {
 #define EXPAND_REDUCE_INT32_MARCO(MARCO, ...)       \
   MARCO(sum_int32, 0, int, ##__VA_ARGS__)           \
   MARCO(prod_int32, 1, int, ##__VA_ARGS__)          \
-  MARCO(max_int32, -2147483648, int, ##__VA_ARGS__) \
-  MARCO(min_int32, 2147483647, int, ##__VA_ARGS__)
+  MARCO(max_int32, CINN_INT32_MIN, int, ##__VA_ARGS__) \
+  MARCO(min_int32, CINN_INT32_MAX, int, ##__VA_ARGS__)
 
 __device__ inline int cinn_sum_int32(const int left, const int right) { return left + right; }
 __device__ inline int cinn_prod_int32(const int left, const int right) { return left * right; }
@@ -542,6 +547,20 @@ __device__ inline int cinn_cuda_find_float_nd(const float *buf, int size, float
 
 #undef __cinn_cuda_find_kernel
 
+__device__ inline int cinn_nvgpu_next_smallest_int32(int *buf, int size, int num, int begin, int stride) {
+  int id = -1;
+  for (int i = begin; i < begin + size * stride; i += stride) {
+    if (id == -1 || buf[i] < buf[id]) {
+      id = i;
+    }
+  }
+  if (id != -1) {
+    buf[id] = CINN_INT32_MAX;
+    return (id - begin) / stride;
+  }
+  return -1;
+}
+
 #define __cinn_cuda_find_from_kernel(buf, size, num, begin) \
   do {                                                      \
     for (int i = begin; i < size; ++i) {                    \
@@ -720,6 +739,8 @@ __device__ int cinn_cuda_resize_bicubic(const int *buf,
 
 // *************************************************************** //
 // end of macro undef
+#undef CINN_INT32_MAX
+#undef CINN_INT32_MIN
 #undef FN_BOOL
 #undef FN_UINT8
 #undef FN_INT8

diff --git a/cinn/runtime/cuda/cuda_intrinsics.cc b/cinn/runtime/cuda/cuda_intrinsics.cc
@@ -300,6 +300,15 @@ CINN_REGISTER_HELPER(cuda_intrinsics) {
       .AddInputType<int>()
       .End();
 
+  REGISTER_FACKED_EXTERN_FUNC_HELPER(cinn_nvgpu_next_smallest_int32, target)
+      .SetRetType<int>()
+      .AddInputType<cinn_buffer_t *>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .End();
+
 #define _REGISTER_CINN_NVGPU_LT_NUM(TYPE_SUFFIX, TYPE)                        \
   REGISTER_FACKED_EXTERN_FUNC_HELPER(cinn_nvgpu_lt_num_##TYPE_SUFFIX, target) \
       .SetRetType<int>()                                                      \

diff --git a/python/tests/ops/op_test_helper.py b/python/tests/ops/op_test_helper.py
@@ -18,6 +18,9 @@
 import unittest
 import re
 
+from unittest import suite
+from typing import Union, List
+
 parser = argparse.ArgumentParser(description="Argparse for op test helper")
 parser.add_argument(
     "--case",
@@ -104,3 +107,17 @@ def run(self):
         res = runner.run(test_suite)
         if not res.wasSuccessful():
             sys.exit(not res.wasSuccessful())
+
+
+def run_test(test_class: Union[suite.TestSuite, List[suite.TestSuite]]):
+    test_suite = unittest.TestSuite()
+    test_loader = unittest.TestLoader()
+    if isinstance(test_class, type):
+        test_suite.addTests(test_loader.loadTestsFromTestCase(test_class))
+    else:
+        for cls in test_class:
+            test_suite.addTests(test_loader.loadTestsFromTestCase(cls))
+    runner = unittest.TextTestRunner()
+    res = runner.run(test_suite)
+    if not res.wasSuccessful():
+        sys.exit(not res.wasSuccessful())