diff --git a/cinn/hlir/op/contrib/argmax_test.cc b/cinn/hlir/op/contrib/argmax_test.cc
index 419fb081c1..3b42e73f1f 100644
--- a/cinn/hlir/op/contrib/argmax_test.cc
+++ b/cinn/hlir/op/contrib/argmax_test.cc
@@ -94,7 +94,7 @@ void TestGenerateCodeCpu_Argmax_Keep(void* _args, int32_t num_args)
     for (int32_t j = 0; j < 3; j += 1) {
       for (int32_t k = 0; k < 28; k += 1) {
         for (int32_t a = 0; a < 28; a += 1) {
-          test_argmax_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_find_int_nd(_test_argmax_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784);
+          test_argmax_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_next_smallest_int32(_test_argmax_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784);
         };
       };
     };
diff --git a/cinn/hlir/op/contrib/argmin_test.cc b/cinn/hlir/op/contrib/argmin_test.cc
index 98e9bdb8d5..bfe053f101 100644
--- a/cinn/hlir/op/contrib/argmin_test.cc
+++ b/cinn/hlir/op/contrib/argmin_test.cc
@@ -93,7 +93,7 @@ void TestGenerateCodeCpu_Argmin_Keep(void* _args, int32_t num_args)
     for (int32_t j = 0; j < 3; j += 1) {
       for (int32_t k = 0; k < 28; k += 1) {
         for (int32_t a = 0; a < 28; a += 1) {
-          test_argmin_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_find_int_nd(_test_argmin_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784);
+          test_argmin_in_index[((2352 * i) + ((784 * j) + ((28 * k) + a)))] = cinn_host_next_smallest_int32(_test_argmin_in_index_temp, 3, j, ((2352 * i) + ((28 * k) + a)), 784);
         };
       };
     };
diff --git a/cinn/hlir/op/contrib/sort.cc b/cinn/hlir/op/contrib/sort.cc
index 927949f13b..e8471bcb4a 100644
--- a/cinn/hlir/op/contrib/sort.cc
+++ b/cinn/hlir/op/contrib/sort.cc
@@ -56,9 +56,9 @@ std::vector<ir::Tensor> ArgSort(const ir::Tensor &A,
   std::string find_func_name;
   std::string index_func_name;
   if (target.arch == common::Target::Arch::NVGPU) {
-    find_func_name.assign("cinn_cuda_find_int_nd");
+    find_func_name.assign("cinn_nvgpu_next_smallest_int32");
   } else if (target.arch == common::Target::Arch::X86) {
-    find_func_name.assign("cinn_host_find_int_nd");
+    find_func_name.assign("cinn_host_next_smallest_int32");
   } else {
     LOG(FATAL) << "ArgSort only supports X86 and NVGPU ! Please Check.\n";
   }
diff --git a/cinn/hlir/op/contrib/sort_test.cc b/cinn/hlir/op/contrib/sort_test.cc
index 860eef32e3..e5f990ba11 100644
--- a/cinn/hlir/op/contrib/sort_test.cc
+++ b/cinn/hlir/op/contrib/sort_test.cc
@@ -112,7 +112,7 @@ void TestGenerateCodeCpu_Sort(void* _args, int32_t num_args)
   };
   for (int32_t i = 0; i < 4; i += 1) {
     for (int32_t j = 0; j < 28; j += 1) {
-      test_sort_out_index[((28 * i) + j)] = cinn_host_find_int_nd(_test_sort_out_index_temp, 28, j, (28 * i), 1);
+      test_sort_out_index[((28 * i) + j)] = cinn_host_next_smallest_int32(_test_sort_out_index_temp, 28, j, (28 * i), 1);
     };
   };
   for (int32_t i = 0; i < 4; i += 1) {
diff --git a/cinn/runtime/cpu/host_intrinsics.cc b/cinn/runtime/cpu/host_intrinsics.cc
index 2ce67f7b86..a9d1cd0e38 100644
--- a/cinn/runtime/cpu/host_intrinsics.cc
+++ b/cinn/runtime/cpu/host_intrinsics.cc
@@ -64,6 +64,20 @@ inline int cinn_host_find_float_nd(const cinn_buffer_t* buf, int size, float num
 
 #undef __cinn_host_find_kernel
 
+inline int cinn_host_next_smallest_int32(cinn_buffer_t* buf, int size, int num, int begin, int stride) {
+  int id = -1;
+  for (int i = begin; i < begin + size * stride; i += stride) {
+    if (id == -1 || reinterpret_cast<int*>(buf->memory)[i] < reinterpret_cast<int*>(buf->memory)[id]) {
+      id = i;
+    }
+  }
+  if (id != -1) {
+    reinterpret_cast<int*>(buf->memory)[id] = 2147483647;
+    return (id - begin) / stride;
+  }
+  return -1;
+}
+
 #define CINN_HOST_LT_NUM(TYPE_SUFFIX, TYPE)                                                           \
   inline int cinn_host_lt_num_##TYPE_SUFFIX(                                                          \
       const cinn_buffer_t* buf, const int size, const TYPE num, const int offset, const int stride) { \
@@ -349,6 +363,15 @@ CINN_REGISTER_HELPER(host_intrinsics) {
       .AddInputType<int>()
       .End();
 
+  REGISTER_EXTERN_FUNC_HELPER(cinn_host_next_smallest_int32, host_target)
+      .SetRetType<int>()
+      .AddInputType<cinn_buffer_t*>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .End();
+
 #define _REGISTER_CINN_HOST_LT_NUM(TYPE_SUFFIX, TYPE)                      \
   REGISTER_EXTERN_FUNC_HELPER(cinn_host_lt_num_##TYPE_SUFFIX, host_target) \
       .SetRetType<int>()                                                   \
diff --git a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
index 7d0b75e476..2024f2badc 100644
--- a/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
+++ b/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh
@@ -1,7 +1,12 @@
 /**
  * \file This file contains all the intrinsics available to be used in CUDA code generated by CodeGen.
  */
+
 extern "C" {
+
+#define CINN_INT32_MAX 2147483647
+#define CINN_INT32_MIN -2147483648
+
 // *************************************************************** //
 // bool unary and binary operator
 #define FN_BOOL(func) cinn_nvgpu_##func##_bool
@@ -364,8 +369,8 @@ __device__ inline float16 FN_FP16(pow)(float16 a, float16 b) {
 #define EXPAND_REDUCE_INT32_MARCO(MARCO, ...)       \
   MARCO(sum_int32, 0, int, ##__VA_ARGS__)           \
   MARCO(prod_int32, 1, int, ##__VA_ARGS__)          \
-  MARCO(max_int32, -2147483648, int, ##__VA_ARGS__) \
-  MARCO(min_int32, 2147483647, int, ##__VA_ARGS__)
+  MARCO(max_int32, CINN_INT32_MIN, int, ##__VA_ARGS__) \
+  MARCO(min_int32, CINN_INT32_MAX, int, ##__VA_ARGS__)
 
 __device__ inline int cinn_sum_int32(const int left, const int right) { return left + right; }
 __device__ inline int cinn_prod_int32(const int left, const int right) { return left * right; }
@@ -634,6 +639,20 @@ __device__ inline int cinn_cuda_find_float_nd(const float *buf, int size, float
 
 #undef __cinn_cuda_find_kernel
 
+__device__ inline int cinn_nvgpu_next_smallest_int32(int *buf, int size, int num, int begin, int stride) {
+  int id = -1;
+  for (int i = begin; i < begin + size * stride; i += stride) {
+    if (id == -1 || buf[i] < buf[id]) {
+      id = i;
+    }
+  }
+  if (id != -1) {
+    buf[id] = CINN_INT32_MAX;
+    return (id - begin) / stride;
+  }
+  return -1;
+}
+
 #define __cinn_cuda_find_from_kernel(buf, size, num, begin) \
   do {                                                      \
     for (int i = begin; i < size; ++i) {                    \
@@ -812,6 +831,8 @@ __device__ int cinn_cuda_resize_bicubic(const int *buf,
 
 // *************************************************************** //
 // end of macro undef
+#undef CINN_INT32_MAX
+#undef CINN_INT32_MIN
 #undef FN_BOOL
 #undef FN_UINT8
 #undef FN_INT8
diff --git a/cinn/runtime/cuda/cuda_intrinsics.cc b/cinn/runtime/cuda/cuda_intrinsics.cc
index bdb1005ebd..40bce455ff 100644
--- a/cinn/runtime/cuda/cuda_intrinsics.cc
+++ b/cinn/runtime/cuda/cuda_intrinsics.cc
@@ -300,6 +300,15 @@ CINN_REGISTER_HELPER(cuda_intrinsics) {
       .AddInputType<int>()
       .End();
 
+  REGISTER_FACKED_EXTERN_FUNC_HELPER(cinn_nvgpu_next_smallest_int32, target)
+      .SetRetType<int>()
+      .AddInputType<cinn_buffer_t *>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .AddInputType<int>()
+      .End();
+
 #define _REGISTER_CINN_NVGPU_LT_NUM(TYPE_SUFFIX, TYPE)                        \
   REGISTER_FACKED_EXTERN_FUNC_HELPER(cinn_nvgpu_lt_num_##TYPE_SUFFIX, target) \
       .SetRetType<int>()                                                      \
diff --git a/python/tests/ops/op_test_helper.py b/python/tests/ops/op_test_helper.py
index f9796f6026..eb8c99889a 100644
--- a/python/tests/ops/op_test_helper.py
+++ b/python/tests/ops/op_test_helper.py
@@ -18,6 +18,9 @@
 import unittest
 import re
 
+from unittest import suite
+from typing import Union, List
+
 parser = argparse.ArgumentParser(description="Argparse for op test helper")
 parser.add_argument(
     "--case",
@@ -104,3 +107,17 @@ def run(self):
         res = runner.run(test_suite)
         if not res.wasSuccessful():
             sys.exit(not res.wasSuccessful())
+
+
+def run_test(test_class: Union[suite.TestSuite, List[suite.TestSuite]]):
+    test_suite = unittest.TestSuite()
+    test_loader = unittest.TestLoader()
+    if isinstance(test_class, type):
+        test_suite.addTests(test_loader.loadTestsFromTestCase(test_class))
+    else:
+        for cls in test_class:
+            test_suite.addTests(test_loader.loadTestsFromTestCase(cls))
+    runner = unittest.TextTestRunner()
+    res = runner.run(test_suite)
+    if not res.wasSuccessful():
+        sys.exit(not res.wasSuccessful())
diff --git a/python/tests/ops/test_sort_op.py b/python/tests/ops/test_sort_op.py
index 9230fcc4fb..7708d58e86 100644
--- a/python/tests/ops/test_sort_op.py
+++ b/python/tests/ops/test_sort_op.py
@@ -14,45 +14,41 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
-import numpy as np
-from op_test import OpTest, OpTestTool
 import paddle
-import paddle.nn.functional as F
-import cinn
 from cinn.frontend import *
 from cinn.common import *
+from op_test import OpTest
+from op_test_helper import TestCaseHelper, run_test
 
 
-@OpTestTool.skip_if(not is_compiled_with_cuda(),
-                    "x86 test will be skipped due to timeout.")
 class TestSortOp(OpTest):
     def setUp(self):
-        self.init_case()
+        print(f"\nRunning {self.__class__.__name__}: {self.case}")
+        self.inputs = {}
+        self.prepare_inputs()
 
-    def init_case(self):
+    def prepare_inputs(self):
         self.inputs = {
-            "x1": np.random.random([
-                2,
-                4,
-            ]).astype("float32")
+            "x": self.random(self.case["shape"], self.case["dtype"])
         }
-        self.axis = 1
-        self.descending = False
+        self.axis = self.case["axis"]
+        self.descending = self.case["descending"]
 
     def build_paddle_program(self, target):
-        x1 = paddle.to_tensor(self.inputs["x1"], stop_gradient=True)
+        x1 = paddle.to_tensor(self.inputs["x"], stop_gradient=True)
         out = paddle.sort(x1, self.axis, self.descending)
 
         self.paddle_outputs = [out]
 
     def build_cinn_program(self, target):
-        builder = NetBuilder("sum")
-        x1 = builder.create_input(Float(32), self.inputs["x1"].shape, "x1")
+        builder = NetBuilder("sort")
+        x1 = builder.create_input(
+            self.nptype2cinntype(self.inputs["x"].dtype),
+            self.inputs["x"].shape, "x")
         out = builder.sort(x1, self.axis, not self.descending)
         prog = builder.build()
         forward_res = self.get_cinn_output(prog, target, [x1],
-                                           [self.inputs["x1"]], [out])
+                                           [self.inputs["x"]], [out])
 
         self.cinn_outputs = forward_res
 
@@ -60,41 +56,173 @@ def test_check_results(self):
         self.check_outputs_and_grads()
 
 
-class TestSortCase1(TestSortOp):
-    def init_case(self):
-        self.inputs = {
-            "x1": np.random.random([
-                2,
-                4,
-            ]).astype("float32")
-        }
+class TestSortOpDumpicateElement(TestSortOp):
+    def setUp(self):
+        self.inputs = {}
+        self.prepare_inputs()
+
+    def prepare_inputs(self):
+        self.inputs = {"x": self.random([128], "int64", -10, 10)}
         self.axis = 0
         self.descending = False
 
 
-class TestSortCase2(TestSortOp):
-    def init_case(self):
-        self.inputs = {
-            "x1": np.random.random([
-                2,
-                4,
-            ]).astype("float32")
-        }
+# This test case will cause CINN to allocate a large amount of GPU memory, nearly 10 GB.
+class TestSortOpLargeCudaMemoryOccupation(TestSortOp):
+    def setUp(self):
+        self.inputs = {}
+        self.prepare_inputs()
+
+    def prepare_inputs(self):
+        self.inputs = {"x": self.random([8192], "float64")}
         self.axis = 0
-        self.descending = True
+        self.descending = False
 
 
-class TestSortCase3(TestSortOp):
-    def init_case(self):
-        self.inputs = {
-            "x1": np.random.random([
-                2,
-                4,
-            ]).astype("float32")
-        }
-        self.axis = 1
-        self.descending = True
+class TestSortOpShapeTest(TestCaseHelper):
+    def init_attrs(self):
+        self.class_name = "TestSortOpShapeTest"
+        self.cls = TestSortOp
+        self.inputs = [
+            {
+                "shape": [512],
+            },
+            {
+                "shape": [1024],
+            },
+            {
+                "shape": [1200],
+            },
+            {
+                "shape": [64, 16],
+            },
+            {
+                "shape": [4, 32, 8],
+            },
+            {
+                "shape": [16, 8, 4, 2],
+            },
+            {
+                "shape": [2, 8, 4, 2, 5],
+            },
+            {
+                "shape": [4, 8, 1, 2, 16],
+            },
+            {
+                "shape": [1],
+            },
+            {
+                "shape": [1, 1, 1, 1],
+            },
+            {
+                "shape": [1, 1, 1, 1, 1],
+            },
+            # TODO: known issue cinn/hlir/op/contrib/sort.cc:201
+            # the array will exceed the cuda kernel stack size limit
+            # {
+            #     "shape": [32768],
+            # },
+            # {
+            #     "shape": [65536],
+            # },
+            # {
+            #     "shape": [131072],
+            # },
+        ]
+        self.dtypes = [{"dtype": "float32"}]
+        self.attrs = [{"axis": 0, "descending": False}]
+
+
+class TestSortOpDtypeTest(TestCaseHelper):
+    def init_attrs(self):
+        self.class_name = "TestSortOpDtypeTest"
+        self.cls = TestSortOp
+        self.inputs = [
+            {
+                "shape": [1024],
+            },
+            {
+                "shape": [64, 16],
+            },
+            {
+                "shape": [4, 32, 8],
+            },
+            {
+                "shape": [16, 8, 4, 2],
+            },
+        ]
+        self.dtypes = [
+            {
+                "dtype": "float32"
+            },
+            {
+                "dtype": "float64"
+            },
+            {
+                "dtype": "int32"
+            },
+            {
+                "dtype": "int64"
+            },
+        ]
+        self.attrs = [{"axis": 0, "descending": False}]
+
+
+class TestSortOpAxisTest(TestCaseHelper):
+    def init_attrs(self):
+        self.class_name = "TestSortOpAttrsTest"
+        self.cls = TestSortOp
+        self.inputs = [
+            {
+                "shape": [16, 8, 4, 2],
+            },
+        ]
+        self.dtypes = [{"dtype": "float32"}]
+        self.attrs = [{
+            "axis": 0,
+            "descending": False
+        }, {
+            "axis": 1,
+            "descending": False
+        }, {
+            "axis": 2,
+            "descending": False
+        }, {
+            "axis": 3,
+            "descending": False
+        }]
+
+
+class TestSortOpDescedingTest(TestSortOpShapeTest):
+    def init_attrs(self):
+        self.class_name = "TestSortOpDescedingTest"
+        self.cls = TestSortOp
+        self.inputs = [
+            {
+                "shape": [16, 8, 4, 2],
+            },
+        ]
+        self.dtypes = [{"dtype": "float32"}]
+        self.attrs = [{
+            "axis": 0,
+            "descending": True
+        }, {
+            "axis": 1,
+            "descending": True
+        }, {
+            "axis": 2,
+            "descending": True
+        }, {
+            "axis": 3,
+            "descending": True
+        }]
 
 
 if __name__ == "__main__":
-    unittest.main()
+    run_test(TestSortOpDumpicateElement)
+    # run_test(TestSortOpLargeCudaMemoryOccupation)
+
+    TestSortOpShapeTest().run()
+    TestSortOpDtypeTest().run()
+    TestSortOpAxisTest().run()
+    TestSortOpDescedingTest().run()