From 7c2d1af8bf87b6bbeb26a179e4723068ae1e8d5f Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Fri, 4 Feb 2022 16:43:04 +0800
Subject: [PATCH 01/17] first debug

---
 oneflow/user/kernels/pooling_kernel_util.h | 112 ++++++++++++++-------
 1 file changed, 73 insertions(+), 39 deletions(-)
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index e5a67944eb8..6ad5233515c 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -205,53 +205,87 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
     const int32_t dilation_w) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    int64_t n, c, h, w;
-    index_helper.OffsetToNdIndex(num, n, c, h, w);
-
-    const int64_t start_idx = (n * n_channel + c) * x_width * x_height;
-    int64_t hstart = h * stride_h - padding_h;
-    int64_t wstart = w * stride_w - padding_w;
-    const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
+    // int64_t n, c, h, w;
+    // index_helper.OffsetToNdIndex(num, n, c, h, w);
+
+    // const int64_t start_idx = (n * n_channel + c) * x_width * x_height;
+    // int64_t hstart = h * stride_h - padding_h;
+    // int64_t wstart = w * stride_w - padding_w;
+    // const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
+    //                          ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
+    //                          : x_height;
+    // const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
+    //                          ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
+    //                          : x_width;
+
+    // while (hstart < 0) { hstart += dilation_h; }
+    // while (wstart < 0) { wstart += dilation_w; }
+
+    // /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
+    // int64_t max_index = hstart * x_width + wstart;
+    // int64_t src_idx = 0;
+
+    // /* equal to -std::numeric_limits<T>::infinity(); */
+    // T max_value = detail::numeric_limits<T>::lower_bound();
+
+    // for (int64_t i = hstart; i < hend; i += dilation_h) {
+    //   for (int64_t j = wstart; j < wend; j += dilation_w) {
+    //     const int64_t window_idx = i * x_width + j;
+    //     const int64_t search_idx = start_idx + window_idx;
+    //     T val = src[search_idx];
+    //     /* NOTE:
+    //     std::isnan(val) only supports a few data types, see:
+    //     https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile,
+    //     the following exception will be throw:
+
+    //     new_kernel_util.cu:24] Check failed: cudaMemcpyAsync(dst, src, sz, cudaMemcpyDefault,
+    //     ctx->cuda_stream() ) : unspecified launch failure (719)
+
+    //     but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown!
+    //     */
+    //     if (val > max_value || detail::numerics<T>::isnan(val)) {
+    //       max_value = val;
+    //       max_index = window_idx;
+    //       src_idx = search_idx;
+    //     }
+    //   }
+    // }
+
+    int pw = num % kernel_size_w;
+    int ph = (num / kernel_size_w) % kernel_size_h;
+    int c = (num / kernel_size_w / kernel_size_h) % 64;
+    int n = num / kernel_size_w / kernel_size_h / 64;
+    int hstart = ph * stride_h - padding_h;
+    int wstart = pw * stride_w - padding_w;
+    // int hend = min(hstart + (kernel_size_h - 1) * dilation_h + 1, x_height);
+    // int wend = min(wstart + (kernel_size_w - 1) * dilation_w + 1, x_width);
+
+    int hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
                              ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
                              : x_height;
-    const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
+    int wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
                              ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
                              : x_width;
 
-    while (hstart < 0) { hstart += dilation_h; }
-    while (wstart < 0) { wstart += dilation_w; }
-
-    /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
-    int64_t max_index = hstart * x_width + wstart;
-    int64_t src_idx = 0;
-
-    /* equal to -std::numeric_limits<T>::infinity(); */
-    T max_value = detail::numeric_limits<T>::lower_bound();
-
-    for (int64_t i = hstart; i < hend; i += dilation_h) {
-      for (int64_t j = wstart; j < wend; j += dilation_w) {
-        const int64_t window_idx = i * x_width + j;
-        const int64_t search_idx = start_idx + window_idx;
-        T val = src[search_idx];
-        /* NOTE:
-        std::isnan(val) only supports a few data types, see:
-        https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile,
-        the following exception will be throw:
-
-        new_kernel_util.cu:24] Check failed: cudaMemcpyAsync(dst, src, sz, cudaMemcpyDefault,
-        ctx->cuda_stream() ) : unspecified launch failure (719)
-
-        but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown!
-        */
-        if (val > max_value || detail::numerics<T>::isnan(val)) {
-          max_value = val;
-          max_index = window_idx;
-          src_idx = search_idx;
+    while(hstart < 0)
+      hstart += dilation_h;
+    while(wstart < 0)
+      wstart += dilation_w;
+    
+    T maxval = detail::numeric_limits<T>::lower_bound(); // -Infinity
+    int maxidx = hstart * x_width + wstart;
+    const T* btm_data = src + (n * 64 + c) * x_height * x_width;
+    for (int h = hstart; h < hend; h += dilation_h) {
+      for (int w = wstart; w < wend; w += dilation_w) {
+        T val = btm_data[h * x_width + w];
+        if (val > maxval|| detail::numerics<T>::isnan(val)) {
+          maxidx = h * x_width + w;
+          maxval = static_cast<T>(val);
         }
       }
     }
-    dest[num] = src[src_idx];
-    indice_ptr[num] = max_index;
+    dest[num] = maxval;
+    indice_ptr[num] = maxidx;
   }
 }
 

From f2375f7f58672f0d8f2780ba74cc91cafbfc31a2 Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Mon, 7 Feb 2022 13:45:38 +0800
Subject: [PATCH 02/17] fix maxpool

---
 oneflow/user/kernels/pooling_kernel.cpp    | 211 +++++++++------
 oneflow/user/kernels/pooling_kernel.cu     | 185 ++++++-------
 oneflow/user/kernels/pooling_kernel_util.h | 293 +++++++++------------
 3 files changed, 353 insertions(+), 336 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index edb9226cabc..ea726328a13 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -42,9 +42,9 @@ std::shared_ptr<PoolingOpKernelCache> CreateOpKernelCache(user_op::KernelCacheCo
 
 namespace {
 
-template<typename T>
-void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                  int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+template<typename T, typename IDX>
+void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                  IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                   const int32_t padding_h, const int32_t padding_w,
                                   const int64_t n_batch, const int64_t n_channel,
                                   const int64_t x_height, const int64_t x_width,
@@ -52,33 +52,33 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<int64_t, 4>& index_h
                                   const int32_t kernel_size_h, const int32_t kernel_size_w,
                                   const int32_t stride_h, const int32_t stride_w,
                                   const int32_t dilation_h, const int32_t dilation_w) {
-  int64_t n = 0, h = 0, w = 0, c = 0;
-  for (int64_t num = 0; num < elem_num; ++num) {
+  IDX n = 0, h = 0, w = 0, c = 0;
+  for (IDX num = 0; num < elem_num; ++num) {
     index_helper.OffsetToNdIndex(num, n, h, w, c);
 
-    const int64_t x_start_idx = n * x_height * x_width * n_channel;
-    const int64_t y_start_idx = n * y_height * y_width * n_channel;
-    int64_t hstart = h * stride_h - padding_h;
-    int64_t wstart = w * stride_w - padding_w;
-    const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
+    const IDX x_start_idx = n * x_height * x_width * n_channel;
+    const IDX y_start_idx = n * y_height * y_width * n_channel;
+    IDX hstart = h * stride_h - padding_h;
+    IDX wstart = w * stride_w - padding_w;
+    const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
                              ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
                              : x_height;
-    const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
+    const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
                              ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
                              : x_width;
 
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
     /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
-    int64_t max_index = hstart * x_width + wstart;
-    int64_t src_idx = 0;
+    IDX max_index = hstart * x_width + wstart;
+    IDX src_idx = 0;
     /* equal to -std::numeric_limits<T>::infinity(); */
     T max_value = detail::numeric_limits<T>::lower_bound();
 
-    for (int64_t i = hstart; i < hend; i += dilation_h) {
-      for (int64_t j = wstart; j < wend; j += dilation_w) {
-        const int64_t window_idx = i * x_width * n_channel + j * n_channel + c;
-        const int64_t search_idx = x_start_idx + window_idx;
+    for (IDX i = hstart; i < hend; i += dilation_h) {
+      for (IDX j = wstart; j < wend; j += dilation_w) {
+        const IDX window_idx = i * x_width * n_channel + j * n_channel + c;
+        const IDX search_idx = x_start_idx + window_idx;
         T val = src[search_idx];
         if (val > max_value || detail::numerics<T>::isnan(val)) {
           max_value = val;
@@ -87,7 +87,7 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<int64_t, 4>& index_h
         }
       }
     }
-    const int64_t out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c;
+    const IDX out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c;
     dest[out_idx] = src[src_idx];
     indice_ptr[out_idx] = max_index;
   }
@@ -95,57 +95,57 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<int64_t, 4>& index_h
 
 }  // namespace
 
-template<typename T>
-struct PoolingKernelUtil<DeviceType::kCPU, T> {
+template<typename T, typename IDX>
+struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool1dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<int64_t, 3>& index_helper,
-                               const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
-    Maxpool1dForwardCompute<T>(index_helper, elem_num, src, dest, indice_ptr,
+    Maxpool1dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                params_3d.padding()[2], params_3d.num_batch(),
                                params_3d.num_channel(), params_3d.GetXShape5D().At(4),
-                               params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[2],
+                               params_3d.pooling_size_3d()[2],
                                params_3d.stride_3d()[2], params_3d.dilation_3d()[2]);
   }
 
   static void Maxpool1dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<int64_t, 3>& index_helper,
-                                const int64_t elem_num, const T* src, T* dest,
+                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                                const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
-    Maxpool1dBackwardCompute<T>(index_helper, elem_num, src, dest, indice_ptr,
+    Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                 params_3d.num_batch(), params_3d.num_channel(),
                                 params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4));
   }
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
-                                     const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                     const int64_t elem_num, const T* src, T* dest,
+                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                     const IDX elem_num, const T* src, T* dest,
                                      int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
-    Maxpool2dForwardComputeCFirst<T>(
+    Maxpool2dForwardComputeCFirst<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
-        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3),
-        params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[1],
-        params_3d.pooling_size_3d()[2], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
+        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), 
+        params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], 
+        params_3d.stride_3d()[1], params_3d.stride_3d()[2],
         params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
   }
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
-                                      const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                      const int64_t elem_num, const T* src, T* dest,
+                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                      const IDX elem_num, const T* src, T* dest,
                                       const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
-    Maxpool2dBackwardComputeCFirst<T>(index_helper, elem_num, src, dest, indice_ptr,
+    Maxpool2dBackwardComputeCFirst<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                       params_3d.num_batch(), params_3d.num_channel(),
                                       params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
                                       params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
   }
 
   static void Maxpool2dForwardCLast(ep::Stream* stream,
-                                    const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                    const int64_t elem_num, const T* src, T* dest,
+                                    const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                    const IDX elem_num, const T* src, T* dest,
                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
-    Maxpool2dForwardComputeCLast<T>(
+    Maxpool2dForwardComputeCLast<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
         params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3),
@@ -155,36 +155,36 @@ struct PoolingKernelUtil<DeviceType::kCPU, T> {
   }
 
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
-                                     const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                     const int64_t elem_num, const T* src, T* dest,
+                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                     const IDX elem_num, const T* src, T* dest,
                                      const int64_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d) {
-    Maxpool2dBackwardComputeCLast<T>(index_helper, elem_num, src, dest, indice_ptr,
+    Maxpool2dBackwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                      params_3d.num_batch(), params_3d.num_channel(),
                                      params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
                                      params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
   }
 
   static void Maxpool3dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<int64_t, 5>& index_helper,
-                               const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
-    Maxpool3dForwardCompute<T>(
+    Maxpool3dForwardCompute<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0],
         params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(),
-        params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
-        params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3),
-        params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[0],
-        params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[0],
-        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[0],
+        params_3d.num_channel(), 
+        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), 
+        params_3d.pooling_size_3d()[0], params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], 
+        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], 
+        params_3d.dilation_3d()[0],
         params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
   }
 
   static void Maxpool3dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<int64_t, 5> index_helper,
-                                const int64_t elem_num, const T* src, T* dest,
+                                const NdIndexOffsetHelper<IDX, 5> index_helper,
+                                const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
-    Maxpool3dBackwardCompute<T>(index_helper, elem_num, src, dest, indice_ptr,
+    Maxpool3dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                 params_3d.num_batch(), params_3d.num_channel(),
                                 params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3),
                                 params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2),
@@ -221,11 +221,17 @@ class MaxPool1dKernel final : public user_op::OpKernel {
 
     DimVector y_vector;
     y->shape().ToDimVector(&y_vector);
-    NdIndexOffsetHelper<int64_t, 3> index_helper(y_vector.data());
 
-    PoolingKernelUtil<device_type, T>::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src,
+    if(elem_num < GetMaxVal<int32_t>()){
+      NdIndexOffsetHelper<int32_t, 3> index_helper(y_vector.data());
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src,
                                                         dest, indice_ptr, params_3d);
-  };
+    }else{
+      NdIndexOffsetHelper<int64_t, 3> index_helper(y_vector.data());
+      PoolingKernelUtil<device_type, T, int64_t>::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src,
+                                                        dest, indice_ptr, params_3d);
+      }
+    }
 };
 
 template<DeviceType device_type, typename T>
@@ -256,13 +262,19 @@ class MaxPool1dGradKernel final : public user_op::OpKernel {
     T* dest = dx->mut_dptr<T>();
     DimVector dy_vector;
     dy->shape().ToDimVector(&dy_vector);
-    NdIndexOffsetHelper<int64_t, 3> index_helper(dy_vector.data());
-
     size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type());
     Memset<device_type>(ctx->stream(), dest, 0, out_bytes_size);
 
-    PoolingKernelUtil<device_type, T>::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src,
+    if(elem_num < GetMaxVal<int32_t>()){
+      NdIndexOffsetHelper<int32_t, 3> index_helper(dy_vector.data());
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src,
+                                                         dest, indice_ptr, params_3d);
+    }else{
+      NdIndexOffsetHelper<int64_t, 3> index_helper(dy_vector.data());
+      PoolingKernelUtil<device_type, T, int64_t>::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src,
                                                          dest, indice_ptr, params_3d);
+    }
+
   };
 };
 
@@ -295,16 +307,29 @@ class MaxPool2dKernel final : public user_op::OpKernel {
 
     DimVector y_vector;
     y->shape().ToDimVector(&y_vector);
-    NdIndexOffsetHelper<int64_t, 4> index_helper(y_vector.data());
     const std::string& data_format = ctx->Attr<std::string>("data_format");
-    if (data_format == "channels_first") {
-      PoolingKernelUtil<device_type, T>::Maxpool2dForwardCFirst(
+    if(elem_num < GetMaxVal<int32_t>()){
+      NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
+      if (data_format == "channels_first") {
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
           ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-    } else if (data_format == "channels_last") {
-      PoolingKernelUtil<device_type, T>::Maxpool2dForwardCLast(
+      } else if (data_format == "channels_last") {
+        PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCLast(
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      } else {
+        UNIMPLEMENTED() << "Unsupported data_format";
+      }
+    }else{
+      NdIndexOffsetHelper<int64_t, 4> index_helper(y_vector.data());
+      if (data_format == "channels_first") {
+      PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCFirst(
           ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-    } else {
-      UNIMPLEMENTED() << "Unsupported data_format";
+      } else if (data_format == "channels_last") {
+        PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCLast(
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      } else {
+        UNIMPLEMENTED() << "Unsupported data_format";
+      }
     }
   };
 };
@@ -337,20 +362,33 @@ class MaxPool2dGradKernel final : public user_op::OpKernel {
     T* dest = dx->mut_dptr<T>();
     DimVector dy_vector;
     dy->shape().ToDimVector(&dy_vector);
-    NdIndexOffsetHelper<int64_t, 4> index_helper(dy_vector.data());
 
     size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type());
     Memset<device_type>(ctx->stream(), dest, 0, out_bytes_size);
 
     const std::string& data_format = ctx->Attr<std::string>("data_format");
-    if (data_format == "channels_first") {
-      PoolingKernelUtil<device_type, T>::Maxpool2dBackwardCFirst(
-          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-    } else if (data_format == "channels_last") {
-      PoolingKernelUtil<device_type, T>::Maxpool2dBackwardCLast(
-          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-    } else {
-      UNIMPLEMENTED() << "Unsupported data_format";
+    if(elem_num < GetMaxVal<int32_t>()){
+      NdIndexOffsetHelper<int32_t, 4> index_helper(dy_vector.data());
+      if (data_format == "channels_first") {
+        PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dBackwardCFirst(
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      } else if (data_format == "channels_last") {
+        PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dBackwardCLast(
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      } else {
+        UNIMPLEMENTED() << "Unsupported data_format";
+      }
+    }else{
+      NdIndexOffsetHelper<int64_t, 4> index_helper(dy_vector.data());
+      if (data_format == "channels_first") {
+        PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dBackwardCFirst(
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      } else if (data_format == "channels_last") {
+        PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dBackwardCLast(
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      } else {
+        UNIMPLEMENTED() << "Unsupported data_format";
+      }
     }
   };
 };
@@ -384,10 +422,15 @@ class MaxPool3dKernel final : public user_op::OpKernel {
 
     DimVector y_vector;
     y->shape().ToDimVector(&y_vector);
-    NdIndexOffsetHelper<int64_t, 5> index_helper(y_vector.data());
-
-    PoolingKernelUtil<device_type, T>::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src,
+    if(elem_num < GetMaxVal<int32_t>()){
+      NdIndexOffsetHelper<int32_t, 5> index_helper(y_vector.data());
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src,
+                                                        dest, indice_ptr, params_3d);
+    }else{
+      NdIndexOffsetHelper<int64_t, 5> index_helper(y_vector.data());
+      PoolingKernelUtil<device_type, T, int64_t>::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src,
                                                         dest, indice_ptr, params_3d);
+    }
   };
 };
 
@@ -420,13 +463,19 @@ class MaxPool3dGradKernel final : public user_op::OpKernel {
 
     DimVector dy_vector;
     dy->shape().ToDimVector(&dy_vector);
-    NdIndexOffsetHelper<int64_t, 5> index_helper(dy_vector.data());
 
     size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type());
     Memset<device_type>(ctx->stream(), dest, 0, out_bytes_size);
 
-    PoolingKernelUtil<device_type, T>::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src,
+    if(elem_num < GetMaxVal<int32_t>()){
+      NdIndexOffsetHelper<int32_t, 5> index_helper(dy_vector.data());
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src,
                                                          dest, indice_ptr, params_3d);
+    }else {
+      NdIndexOffsetHelper<int64_t, 5> index_helper(dy_vector.data());
+      PoolingKernelUtil<device_type, T, int64_t>::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src,
+                                                         dest, indice_ptr, params_3d);
+    }
   };
 };
 
@@ -469,6 +518,6 @@ REGISTER_POOLING_WITH_DEVICE(DeviceType::kCUDA)
 #endif
 
 OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_POOLING_KERNEL_UTIL, (DeviceType::kCPU),
-                                 POOLING_DATA_TYPE_CPU_SEQ);
+                                 POOLING_DATA_TYPE_CPU_SEQ, POOLING_IDX_DATA_TYPE_SEQ);
 
 }  // namespace oneflow
diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu
index 124f984d0aa..1432ab2e783 100644
--- a/oneflow/user/kernels/pooling_kernel.cu
+++ b/oneflow/user/kernels/pooling_kernel.cu
@@ -32,41 +32,41 @@ int GetNumBlocks(int64_t elem_cnt) {
   return num_blocks;
 }
 
-template<typename T>
+template<typename T, typename IDX>
 __device__ __inline__ void Maxpool2dForwardComputeCLast(
-    const NdIndexOffsetHelper<int64_t, 4>& index_helper, int64_t elem_num, const T* src, T* dest,
+    const NdIndexOffsetHelper<IDX, 4>& index_helper, IDX elem_num, const T* src, T* dest,
     int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch,
     const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height,
     const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
     const int32_t dilation_w) {
-  int64_t n, h, w, c;
+  IDX n, h, w, c;
   CUDA_1D_KERNEL_LOOP(num, elem_num) {
     index_helper.OffsetToNdIndex(num, n, h, w, c);
 
-    const int64_t x_start_idx = n * n_channel * x_width * x_height;
-    const int64_t y_start_idx = n * n_channel * y_height * y_width;
-    int64_t hstart = h * stride_h - padding_h;
-    int64_t wstart = w * stride_w - padding_w;
-    const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
+    const IDX x_start_idx = n * n_channel * x_width * x_height;
+    const IDX y_start_idx = n * n_channel * y_height * y_width;
+    IDX hstart = h * stride_h - padding_h;
+    IDX wstart = w * stride_w - padding_w;
+    const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
                              ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
                              : x_height;
-    const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
+    const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
                              ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
                              : x_width;
 
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
     /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
-    int64_t max_index = hstart * x_width + wstart;
-    int64_t src_idx = 0;
+    IDX max_index = hstart * x_width + wstart;
+    IDX src_idx = 0;
     /* equal to -std::numeric_limits<T>::infinity(); */
     T max_value = detail::numeric_limits<T>::lower_bound();
 
-    for (int64_t i = hstart; i < hend; i++) {
-      for (int64_t j = wstart; j < wend; j++) {
-        const int64_t window_idx = i * x_width * n_channel + j * n_channel + c;
-        const int64_t search_idx = x_start_idx + window_idx;
+    for (IDX i = hstart; i < hend; i++) {
+      for (IDX j = wstart; j < wend; j++) {
+        const IDX window_idx = i * x_width * n_channel + j * n_channel + c;
+        const IDX search_idx = x_start_idx + window_idx;
         T val = src[search_idx];
         if (val > max_value || detail::numerics<T>::isnan(val)) {
           max_value = val;
@@ -75,7 +75,7 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast(
         }
       }
     }
-    const int64_t out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c;
+    const IDX out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c;
     dest[out_idx] = src[src_idx];
     indice_ptr[out_idx] = max_index;
   }
@@ -83,156 +83,156 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast(
 
 }  // namespace
 
-template<typename T>
+template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper<int64_t, 3> index_helper,
-                                int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+    void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper<IDX, 3> index_helper,
+                                IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                 int32_t padding_l, int64_t n_batch, int64_t n_channel,
-                                int64_t x_length, int64_t y_length, int32_t kernel_size_l,
+                                int64_t x_length, int32_t kernel_size_l,
                                 int32_t stride_l, int32_t dilation_l) {
-  Maxpool1dForwardCompute<T>(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch,
-                             n_channel, x_length, y_length, kernel_size_l, stride_l, dilation_l);
+  Maxpool1dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch,
+                             n_channel, x_length, kernel_size_l, stride_l, dilation_l);
 };
 
-template<typename T>
+template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper<int64_t, 4> index_helper,
-                                      int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+    void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper<IDX, 4> index_helper,
+                                      IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                       int32_t padding_h, int32_t padding_w, int64_t n_batch,
                                       int64_t n_channel, int64_t x_height, int64_t x_width,
-                                      int64_t y_height, int64_t y_width, int32_t kernel_size_h,
-                                      int32_t kernel_size_w, int32_t stride_h, int32_t stride_w,
+                                      int32_t kernel_size_h, int32_t kernel_size_w, 
+                                      int32_t stride_h, int32_t stride_w,
                                       int32_t dilation_h, int32_t dilation_w) {
-  Maxpool2dForwardComputeCFirst<T>(index_helper, elem_num, src, dest, indice_ptr, padding_h,
-                                   padding_w, n_batch, n_channel, x_height, x_width, y_height,
-                                   y_width, kernel_size_h, kernel_size_w, stride_h, stride_w,
+  Maxpool2dForwardComputeCFirst<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_h,
+                                   padding_w, n_batch, n_channel, x_height, x_width, 
+                                   kernel_size_h, kernel_size_w, stride_h, stride_w,
                                    dilation_h, dilation_w);
 };
 
-template<typename T>
+template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper<int64_t, 4> index_helper,
-                                     int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+    void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
+                                     IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                      int32_t padding_h, int32_t padding_w, int64_t n_batch,
                                      int64_t n_channel, int64_t x_height, int64_t x_width,
                                      int64_t y_height, int64_t y_width, int32_t kernel_size_h,
                                      int32_t kernel_size_w, int32_t stride_h, int32_t stride_w,
                                      int32_t dilation_h, int32_t dilation_w) {
-  Maxpool2dForwardComputeCLast<T>(index_helper, elem_num, src, dest, indice_ptr, padding_h,
+  Maxpool2dForwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_h,
                                   padding_w, n_batch, n_channel, x_height, x_width, y_height,
                                   y_width, kernel_size_h, kernel_size_w, stride_h, stride_w,
                                   dilation_h, dilation_w);
 };
 
-template<typename T>
+template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper<int64_t, 5> index_helper,
-                                int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+    void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper<IDX, 5> index_helper,
+                                IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                 int32_t padding_t, int32_t padding_h, int32_t padding_w,
                                 int64_t n_batch, int64_t n_channel, int64_t x_time,
-                                int64_t x_height, int64_t x_width, int64_t y_time, int64_t y_height,
-                                int64_t y_width, int32_t kernel_size_t, int32_t kernel_size_h,
+                                int64_t x_height, int64_t x_width, 
+                                int32_t kernel_size_t, int32_t kernel_size_h,
                                 int32_t kernel_size_w, int32_t stride_t, int32_t stride_h,
                                 int32_t stride_w, int32_t dilation_t, int32_t dilation_h,
                                 int32_t dilation_w) {
-  Maxpool3dForwardCompute<T>(index_helper, elem_num, src, dest, indice_ptr, padding_t, padding_h,
-                             padding_w, n_batch, n_channel, x_time, x_height, x_width, y_time,
-                             y_height, y_width, kernel_size_t, kernel_size_h, kernel_size_w,
+  Maxpool3dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_t, padding_h,
+                             padding_w, n_batch, n_channel, x_time, x_height, x_width, 
+                             kernel_size_t, kernel_size_h, kernel_size_w,
                              stride_t, stride_h, stride_w, dilation_t, dilation_h, dilation_w);
 };
 
-template<typename T>
+template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper<int64_t, 3> index_helper,
-                                 const int64_t elem_num, const T* src, T* dest,
+    void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper,
+                                 const IDX elem_num, const T* src, T* dest,
                                  const int64_t* indice_ptr, const int64_t n_batch,
                                  const int64_t n_channel, const int64_t src_length,
                                  const int64_t dst_length) {
-  Maxpool1dBackwardCompute<T>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
+  Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
                               src_length, dst_length);
 };
 
-template<typename T>
+template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper<int64_t, 4> index_helper,
-                                       const int64_t elem_num, const T* src, T* dest,
+    void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper<IDX, 4> index_helper,
+                                       const IDX elem_num, const T* src, T* dest,
                                        const int64_t* indice_ptr, const int64_t n_batch,
                                        const int64_t n_channel, const int64_t src_height,
                                        const int64_t src_width, const int64_t dst_height,
                                        const int64_t dst_width) {
-  Maxpool2dBackwardComputeCFirst<T>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
+  Maxpool2dBackwardComputeCFirst<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
                                     n_channel, src_height, src_width, dst_height, dst_width);
 };
 
-template<typename T>
+template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper<int64_t, 4> index_helper,
-                                      const int64_t elem_num, const T* src, T* dest,
+    void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
+                                      const IDX elem_num, const T* src, T* dest,
                                       const int64_t* indice_ptr, const int64_t n_batch,
                                       const int64_t n_channel, const int64_t src_height,
                                       const int64_t src_width, const int64_t dst_height,
                                       const int64_t dst_width) {
-  Maxpool2dBackwardComputeCLast<T>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
+  Maxpool2dBackwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
                                    n_channel, src_height, src_width, dst_height, dst_width);
 };
 
-template<typename T>
+template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper<int64_t, 5> index_helper,
-                                 const int64_t elem_num, const T* src, T* dest,
+    void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper<IDX, 5> index_helper,
+                                 const IDX elem_num, const T* src, T* dest,
                                  const int64_t* indice_ptr, const int64_t n_batch,
                                  const int64_t n_channel, const int64_t src_time,
                                  const int64_t src_height, const int64_t src_width,
                                  const int64_t dst_time, const int64_t dst_height,
                                  const int64_t dst_width) {
-  Maxpool3dBackwardCompute<T>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
+  Maxpool3dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
                               src_time, src_height, src_width, dst_time, dst_height, dst_width);
 };
 
-template<typename T>
-struct PoolingKernelUtil<DeviceType::kCUDA, T> {
+template<typename T, typename IDX>
+struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool1dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<int64_t, 3>& index_helper,
-                               const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
-    DoCUDAMaxPool1dForward<T><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
+    DoCUDAMaxPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2],
         params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4),
-        params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2],
+        params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2],
         params_3d.dilation_3d()[2]);
   }
 
   static void Maxpool1dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<int64_t, 3>& index_helper,
-                                const int64_t elem_num, const T* src, T* dest,
+                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                                const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
-    DoCUDAMaxPool1dBackward<T><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
+    DoCUDAMaxPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                  stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
         params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4));
   }
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
-                                     const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                     const int64_t elem_num, const T* src, T* dest,
+                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                     const IDX elem_num, const T* src, T* dest,
                                      int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
-    DoCUDAMaxPool2dForwardCFirst<T><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
+    DoCUDAMaxPool2dForwardCFirst<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                       stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
-        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3),
-        params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[1],
-        params_3d.pooling_size_3d()[2], params_3d.stride_3d()[1], params_3d.stride_3d()[2],
+        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), 
+        params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], 
+        params_3d.stride_3d()[1], params_3d.stride_3d()[2],
         params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
   }
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
-                                      const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                      const int64_t elem_num, const T* src, T* dest,
+                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                      const IDX elem_num, const T* src, T* dest,
                                       const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
-    DoCUDAMaxPool2dBackwardCFirst<T><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
+    DoCUDAMaxPool2dBackwardCFirst<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                        stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
         params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
@@ -240,10 +240,10 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T> {
   }
 
   static void Maxpool2dForwardCLast(ep::Stream* stream,
-                                    const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                    const int64_t elem_num, const T* src, T* dest,
+                                    const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                    const IDX elem_num, const T* src, T* dest,
                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
-    DoCUDAMaxPool2dForwardCLast<T><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
+    DoCUDAMaxPool2dForwardCLast<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
@@ -254,11 +254,11 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T> {
   }
 
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
-                                     const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                     const int64_t elem_num, const T* src, T* dest,
+                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                     const IDX elem_num, const T* src, T* dest,
                                      const int64_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d) {
-    DoCUDAMaxPool2dBackwardCLast<T><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
+    DoCUDAMaxPool2dBackwardCLast<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                       stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
         params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
@@ -266,26 +266,27 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T> {
   }
 
   static void Maxpool3dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<int64_t, 5>& index_helper,
-                               const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
-    DoCUDAMaxPool3dForward<T><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
+    DoCUDAMaxPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0],
         params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(),
-        params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
-        params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3),
-        params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[0],
+        params_3d.num_channel(), 
+        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
+        params_3d.GetXShape5D().At(4), 
+        params_3d.pooling_size_3d()[0],
         params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[0],
         params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[0],
         params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
   }
 
   static void Maxpool3dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<int64_t, 5>& index_helper,
-                                const int64_t elem_num, const T* src, T* dest,
+                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                                const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
-    DoCUDAMaxPool3dBackward<T><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
+    DoCUDAMaxPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                  stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
         params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3),
@@ -295,7 +296,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T> {
 };
 
 OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_POOLING_KERNEL_UTIL, (DeviceType::kCUDA),
-                                 POOLING_DATA_TYPE_CUDA_SEQ);
+                                 POOLING_DATA_TYPE_CUDA_SEQ, POOLING_IDX_DATA_TYPE_SEQ);
 
 }  // namespace oneflow
 #endif  // WITH_CUDA
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 6ad5233515c..4c7c542896d 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -33,6 +33,10 @@ namespace oneflow {
   OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat)   \
   OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble)
 
+#define POOLING_IDX_DATA_TYPE_SEQ                     \
+  OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \
+  OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64)
+
 #define POOLING_DATA_TYPE_CPU_SEQ POOLING_DATA_TYPE_SEQ
 
 #define POOLING_DATA_TYPE_CUDA_SEQ POOLING_DATA_TYPE_SEQ
@@ -88,81 +92,80 @@ class MaxPoolingParams3D {
   int64_t channel_num_;
 };
 
-template<DeviceType device_type, typename T>
+template<DeviceType device_type, typename T, typename IDX>
 struct PoolingKernelUtil {
   static void Maxpool1dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<int64_t, 3>& index_helper,
-                               const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
   static void Maxpool1dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<int64_t, 3>& index_helper,
-                                const int64_t elem_num, const T* src, T* dest,
+                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                                const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
-                                     const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                     const int64_t elem_num, const T* src, T* dest,
+                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                     const IDX elem_num, const T* src, T* dest,
                                      int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
-                                      const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                      const int64_t elem_num, const T* src, T* dest,
+                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                      const IDX elem_num, const T* src, T* dest,
                                       const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dForwardCLast(ep::Stream* stream,
-                                    const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                    const int64_t elem_num, const T* src, T* dest,
+                                    const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                    const IDX elem_num, const T* src, T* dest,
                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
-                                     const NdIndexOffsetHelper<int64_t, 4>& index_helper,
-                                     const int64_t elem_num, const T* src, T* dest,
+                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                     const IDX elem_num, const T* src, T* dest,
                                      const int64_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d);
 
   static void Maxpool3dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<int64_t, 5>& index_helper,
-                               const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
   static void Maxpool3dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<int64_t, 5>& index_helper,
-                                const int64_t elem_num, const T* src, T* dest,
+                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                                const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 };
 
-template<typename T>
-OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<int64_t, 3> index_helper,
-                                            int64_t elem_num, const T* src, T* dest,
+template<typename T, typename IDX>
+OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
+                                            IDX elem_num, const T* src, T* dest,
                                             int64_t* indice_ptr, const int32_t padding_l,
                                             const int64_t n_batch, const int64_t n_channel,
-                                            const int64_t x_length, const int64_t y_length,
-                                            const int32_t kernel_size_l, const int32_t stride_l,
-                                            const int32_t dilation_l) {
+                                            const int64_t x_length, const int32_t kernel_size_l, 
+                                            const int32_t stride_l, const int32_t dilation_l) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    int64_t n, c, l;
+    IDX n, c, l;
     index_helper.OffsetToNdIndex(num, n, c, l);
 
     // n, c, l->index = n*c*l + c* l
-    const int64_t start_idx = (n * n_channel + c) * x_length;
-    int64_t lstart = l * stride_l - padding_l;
-    const int64_t lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length
+    const IDX start_idx = (n * n_channel + c) * x_length;
+    IDX lstart = l * stride_l - padding_l;
+    const IDX lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length
                              ? (lstart + (kernel_size_l - 1) * dilation_l + 1)
                              : x_length;
 
     while (lstart < 0) { lstart += dilation_l; }
 
     /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
-    int64_t max_index = lstart;
-    int64_t src_idx = 0;
+    IDX max_index = lstart;
+    IDX src_idx = 0;
 
     /* equal to -std::numeric_limits<T>::infinity(); */
     T max_value = detail::numeric_limits<T>::lower_bound();
 
-    for (int64_t idx = lstart; idx < lend; idx += dilation_l) {
-      const int64_t search_idx = start_idx + idx;
+    for (IDX idx = lstart; idx < lend; idx += dilation_l) {
+      const IDX search_idx = start_idx + idx;
       T val = src[search_idx];
       if (val > max_value || detail::numerics<T>::isnan(val)) {
         max_value = val;
@@ -175,20 +178,20 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<int64_t, 3
   }
 }
 
-template<typename T>
-OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<int64_t, 3> index_helper,
-                                             const int64_t elem_num, const T* src, T* dest,
+template<typename T, typename IDX>
+OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
+                                             const IDX elem_num, const T* src, T* dest,
                                              const int64_t* indice_ptr, const int64_t n_batch,
                                              const int64_t n_channel, const int64_t src_length,
                                              const int64_t dst_length) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    int64_t n, c, l;
+    IDX n, c, l;
     index_helper.OffsetToNdIndex(num, n, c, l);
 
-    const int64_t src_start = (n * n_channel + c) * src_length;
-    const int64_t dst_start = (n * n_channel + c) * dst_length;
-    const int64_t index = src_start + l;
-    const int64_t max_index = dst_start + indice_ptr[index];
+    const IDX src_start = (n * n_channel + c) * src_length;
+    const IDX dst_start = (n * n_channel + c) * dst_length;
+    const IDX index = src_start + l;
+    const IDX max_index = dst_start + indice_ptr[index];
     if (max_index != -1) {
       /* update gradient, equals to dest[max_index] += src[index]; */
       DeviceAdd<T>::Invoke(src + index, dest + max_index);
@@ -196,114 +199,78 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<int64_t,
   }
 }
 
-template<typename T>
+template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
-    const NdIndexOffsetHelper<int64_t, 4> index_helper, int64_t elem_num, const T* src, T* dest,
+    const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num, const T* src, T* dest,
     int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch,
-    const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height,
-    const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w,
+    const int64_t n_channel, const int64_t x_height, const int64_t x_width,
+    const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
     const int32_t dilation_w) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    // int64_t n, c, h, w;
-    // index_helper.OffsetToNdIndex(num, n, c, h, w);
-
-    // const int64_t start_idx = (n * n_channel + c) * x_width * x_height;
-    // int64_t hstart = h * stride_h - padding_h;
-    // int64_t wstart = w * stride_w - padding_w;
-    // const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
-    //                          ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
-    //                          : x_height;
-    // const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
-    //                          ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
-    //                          : x_width;
-
-    // while (hstart < 0) { hstart += dilation_h; }
-    // while (wstart < 0) { wstart += dilation_w; }
-
-    // /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
-    // int64_t max_index = hstart * x_width + wstart;
-    // int64_t src_idx = 0;
-
-    // /* equal to -std::numeric_limits<T>::infinity(); */
-    // T max_value = detail::numeric_limits<T>::lower_bound();
-
-    // for (int64_t i = hstart; i < hend; i += dilation_h) {
-    //   for (int64_t j = wstart; j < wend; j += dilation_w) {
-    //     const int64_t window_idx = i * x_width + j;
-    //     const int64_t search_idx = start_idx + window_idx;
-    //     T val = src[search_idx];
-    //     /* NOTE:
-    //     std::isnan(val) only supports a few data types, see:
-    //     https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile,
-    //     the following exception will be throw:
-
-    //     new_kernel_util.cu:24] Check failed: cudaMemcpyAsync(dst, src, sz, cudaMemcpyDefault,
-    //     ctx->cuda_stream() ) : unspecified launch failure (719)
-
-    //     but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown!
-    //     */
-    //     if (val > max_value || detail::numerics<T>::isnan(val)) {
-    //       max_value = val;
-    //       max_index = window_idx;
-    //       src_idx = search_idx;
-    //     }
-    //   }
-    // }
-
-    int pw = num % kernel_size_w;
-    int ph = (num / kernel_size_w) % kernel_size_h;
-    int c = (num / kernel_size_w / kernel_size_h) % 64;
-    int n = num / kernel_size_w / kernel_size_h / 64;
-    int hstart = ph * stride_h - padding_h;
-    int wstart = pw * stride_w - padding_w;
-    // int hend = min(hstart + (kernel_size_h - 1) * dilation_h + 1, x_height);
-    // int wend = min(wstart + (kernel_size_w - 1) * dilation_w + 1, x_width);
-
-    int hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
+    IDX n, c, h, w;
+    index_helper.OffsetToNdIndex(num, n, c, h, w);
+
+    const IDX start_idx = (n * n_channel + c) * x_width * x_height;
+    IDX hstart = h * stride_h - padding_h;
+    IDX wstart = w * stride_w - padding_w;
+    const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
                              ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
                              : x_height;
-    int wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
+    const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
                              ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
                              : x_width;
 
-    while(hstart < 0)
-      hstart += dilation_h;
-    while(wstart < 0)
-      wstart += dilation_w;
-    
-    T maxval = detail::numeric_limits<T>::lower_bound(); // -Infinity
-    int maxidx = hstart * x_width + wstart;
-    const T* btm_data = src + (n * 64 + c) * x_height * x_width;
-    for (int h = hstart; h < hend; h += dilation_h) {
-      for (int w = wstart; w < wend; w += dilation_w) {
-        T val = btm_data[h * x_width + w];
-        if (val > maxval|| detail::numerics<T>::isnan(val)) {
-          maxidx = h * x_width + w;
-          maxval = static_cast<T>(val);
+    while (hstart < 0) { hstart += dilation_h; }
+    while (wstart < 0) { wstart += dilation_w; }
+
+    /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
+    IDX max_index = hstart * x_width + wstart;
+    IDX src_idx = 0;
+
+    /* equal to -std::numeric_limits<T>::infinity(); */
+    T max_value = detail::numeric_limits<T>::lower_bound();
+
+    for (IDX i = hstart; i < hend; i += dilation_h) {
+      for (IDX j = wstart; j < wend; j += dilation_w) {
+        const IDX window_idx = i * x_width + j;
+        const IDX search_idx = start_idx + window_idx;
+        T val = src[search_idx];
+        /* NOTE:
+        std::isnan(val) only supports a few data types, see:
+        https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile,
+        the following exception will be throw:
+
+        new_kernel_util.cu:24] Check failed: cudaMemcpyAsync(dst, src, sz, cudaMemcpyDefault,
+        ctx->cuda_stream() ) : unspecified launch failure (719)
+
+        but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown!
+        */
+        if (val > max_value || detail::numerics<T>::isnan(val)) {
+          max_value = val;
+          max_index = window_idx;
+          src_idx = search_idx;
         }
       }
     }
-    dest[num] = maxval;
-    indice_ptr[num] = maxidx;
   }
 }
 
-template<typename T>
+template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
-    const NdIndexOffsetHelper<int64_t, 4> index_helper, const int64_t elem_num, const T* src,
+    const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
     T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel,
     const int64_t src_height, const int64_t src_width, const int64_t dst_height,
     const int64_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    int64_t n, c, h, w;
+    IDX n, c, h, w;
     index_helper.OffsetToNdIndex(num, n, c, h, w);
 
-    const int64_t src_start = (n * n_channel + c) * src_height * src_width;
-    const int64_t dst_start = (n * n_channel + c) * dst_height * dst_width;
-    const int64_t index = src_start + h * src_width + w;
+    const IDX src_start = (n * n_channel + c) * src_height * src_width;
+    const IDX dst_start = (n * n_channel + c) * dst_height * dst_width;
+    const IDX index = src_start + h * src_width + w;
 
-    const int64_t max_index = dst_start + indice_ptr[index];
+    const IDX max_index = dst_start + indice_ptr[index];
     if (max_index != -1) {
       /* update gradient, equals to dest[max_index] += src[index]; */
       DeviceAdd<T>::Invoke(src + index, dest + max_index);
@@ -311,19 +278,19 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
   }
 }
 
-template<typename T>
+template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(
-    const NdIndexOffsetHelper<int64_t, 4> index_helper, const int64_t elem_num, const T* src,
+    const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
     T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel,
     const int64_t src_height, const int64_t src_width, const int64_t dst_height,
     const int64_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    int64_t n, c, h, w;
+    IDX n, c, h, w;
     index_helper.OffsetToNdIndex(num, n, c, h, w);
-    const int64_t src_start = n * src_height * src_width * n_channel;
-    const int64_t dst_start = n * dst_height * dst_width * n_channel;
-    const int64_t index = src_start + h * src_width + w;
-    const int64_t max_index = dst_start + indice_ptr[index];
+    const IDX src_start = n * src_height * src_width * n_channel;
+    const IDX dst_start = n * dst_height * dst_width * n_channel;
+    const IDX index = src_start + h * src_width + w;
+    const IDX max_index = dst_start + indice_ptr[index];
     if (max_index != -1) {
       /* update gradient, equals to dest[max_index] += src[index]; */
       DeviceAdd<T>::Invoke(src + index, dest + max_index);
@@ -331,45 +298,45 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(
   }
 }
 
-template<typename T>
+template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool3dForwardCompute(
-    const NdIndexOffsetHelper<int64_t, 5> index_helper, int64_t elem_num, const T* src, T* dest,
+    const NdIndexOffsetHelper<IDX, 5> index_helper, IDX elem_num, const T* src, T* dest,
     int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
-    const int64_t n_batch, const int64_t n_channel, const int64_t x_time, const int64_t x_height,
-    const int64_t x_width, const int64_t y_time, const int64_t y_height, const int64_t y_width,
+    const int64_t n_batch, const int64_t n_channel, 
+    const int64_t x_time, const int64_t x_height, const int64_t x_width, 
     const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_t, const int32_t stride_h, const int32_t stride_w,
     const int32_t dilation_t, const int32_t dilation_h, const int32_t dilation_w) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    int64_t n, c, t, h, w;
+    IDX n, c, t, h, w;
     index_helper.OffsetToNdIndex(num, n, c, t, h, w);
 
-    int64_t xstart = n * n_channel * x_time * x_width * x_height;
-    int64_t start_idx = xstart + c * x_time * x_width * x_height;
-    int64_t tstart = t * stride_t - padding_t;
-    int64_t hstart = h * stride_h - padding_h;
-    int64_t wstart = w * stride_w - padding_w;
+    IDX xstart = n * n_channel * x_time * x_width * x_height;
+    IDX start_idx = xstart + c * x_time * x_width * x_height;
+    IDX tstart = t * stride_t - padding_t;
+    IDX hstart = h * stride_h - padding_h;
+    IDX wstart = w * stride_w - padding_w;
 
-    const int64_t t1 = tstart + (kernel_size_t - 1) * dilation_t + 1;
-    const int64_t t2 = hstart + (kernel_size_h - 1) * dilation_h + 1;
-    const int64_t t3 = wstart + (kernel_size_w - 1) * dilation_w + 1;
-    const int64_t tend = t1 <= x_time ? t1 : x_time;
-    const int64_t hend = t2 <= x_height ? t2 : x_height;
-    const int64_t wend = t3 <= x_width ? t3 : x_width;
+    const IDX t1 = tstart + (kernel_size_t - 1) * dilation_t + 1;
+    const IDX t2 = hstart + (kernel_size_h - 1) * dilation_h + 1;
+    const IDX t3 = wstart + (kernel_size_w - 1) * dilation_w + 1;
+    const IDX tend = t1 <= x_time ? t1 : x_time;
+    const IDX hend = t2 <= x_height ? t2 : x_height;
+    const IDX wend = t3 <= x_width ? t3 : x_width;
 
     while (tstart < 0) { tstart += dilation_t; }
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
 
-    int64_t max_index = tstart * x_height * x_width + hstart * x_width + wstart;
-    int64_t src_idx = 0;
+    IDX max_index = tstart * x_height * x_width + hstart * x_width + wstart;
+    IDX src_idx = 0;
 
     T max_value = detail::numeric_limits<T>::lower_bound();
-    for (int64_t zi = tstart; zi < tend; zi += dilation_t) {
-      for (int64_t i = hstart; i < hend; i += dilation_h) {
-        for (int64_t j = wstart; j < wend; j += dilation_w) {
-          const int64_t window_idx = zi * x_height * x_width + i * x_width + j;
-          const int64_t search_idx = start_idx + window_idx;
+    for (IDX zi = tstart; zi < tend; zi += dilation_t) {
+      for (IDX i = hstart; i < hend; i += dilation_h) {
+        for (IDX j = wstart; j < wend; j += dilation_w) {
+          const IDX window_idx = zi * x_height * x_width + i * x_width + j;
+          const IDX search_idx = start_idx + window_idx;
           T val = src[search_idx];
           if (val > max_value || detail::numerics<T>::isnan(val)) {
             max_value = val;
@@ -386,29 +353,29 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute(
   }
 }
 
-template<typename T>
-OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper<int64_t, 5> index_helper,
-                                             const int64_t elem_num, const T* src, T* dest,
+template<typename T, typename IDX>
+OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper<IDX, 5> index_helper,
+                                             const IDX elem_num, const T* src, T* dest,
                                              const int64_t* indice_ptr, const int64_t n_batch,
                                              const int64_t n_channel, const int64_t src_time,
                                              const int64_t src_height, const int64_t src_width,
                                              const int64_t dst_time, const int64_t dst_height,
                                              const int64_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    int64_t n, c, t, h, w;
+    IDX n, c, t, h, w;
     index_helper.OffsetToNdIndex(num, n, c, t, h, w);
 
-    const int64_t src_start = (n * n_channel + c) * src_time * src_height * src_width;
-    const int64_t dst_start = (n * n_channel + c) * dst_time * dst_height * dst_width;
-    const int64_t index = src_start + t * src_height * src_width + h * src_width + w;
-    const int64_t max_index = dst_start + indice_ptr[index];
+    const IDX src_start = (n * n_channel + c) * src_time * src_height * src_width;
+    const IDX dst_start = (n * n_channel + c) * dst_time * dst_height * dst_width;
+    const IDX index = src_start + t * src_height * src_width + h * src_width + w;
+    const IDX max_index = dst_start + indice_ptr[index];
 
     if (max_index != -1) { DeviceAdd<T>::Invoke(src + index, dest + max_index); }
   }
 }
 
-#define INSTANTIATE_POOLING_KERNEL_UTIL(device_type_v, dtype_pair) \
-  template struct PoolingKernelUtil<device_type_v, OF_PP_PAIR_FIRST(dtype_pair)>;
+#define INSTANTIATE_POOLING_KERNEL_UTIL(device_type_v, dtype_pair, index_dtype_pair) \
+  template struct PoolingKernelUtil<device_type_v, OF_PP_PAIR_FIRST(dtype_pair), OF_PP_PAIR_FIRST(index_dtype_pair)>;
 
 }  // namespace oneflow
 

From 79ff4dbce54aa0637ad5557f477c6e776f2d012d Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Mon, 7 Feb 2022 14:02:35 +0800
Subject: [PATCH 03/17] fix bug

---
 oneflow/user/kernels/pooling_kernel_util.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 4c7c542896d..f620c80d77b 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -253,6 +253,8 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
         }
       }
     }
+    dest[num] = src[src_idx];
+    indice_ptr[num] = max_index;
   }
 }
 

From 14c9fd6dedb278a73118363bc220a19dec769abd Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Mon, 7 Feb 2022 14:02:48 +0800
Subject: [PATCH 04/17] remove redundant code

---
 oneflow/user/ops/pooling_op.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/oneflow/user/ops/pooling_op.cpp b/oneflow/user/ops/pooling_op.cpp
index c46b85188d3..ffb7d85e1ba 100644
--- a/oneflow/user/ops/pooling_op.cpp
+++ b/oneflow/user/ops/pooling_op.cpp
@@ -113,7 +113,6 @@ Maybe<void> AvgPoolForwardGetSbpFn(user_op::SbpContext* ctx) {
 
 Maybe<void> MaxPoolBackwardGetSbpFn(user_op::SbpContext* ctx) {
   const user_op::TensorDesc& tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0);
-  const std::vector<int32_t>& padding = ctx->Attr<std::vector<int32_t>>("padding");
   FOR_RANGE(int64_t, i, 0, std::min(2, (int)tensor.shape().NumAxes())) {
     ctx->NewBuilder()
         .Split(user_op::OpArg("x", 0), i)

From 4f5e6f44e29a7737b8a1168853c6b0e18f4d7765 Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Mon, 7 Feb 2022 20:20:49 +0800
Subject: [PATCH 05/17] remove redundant read

---
 oneflow/user/kernels/pooling_kernel_util.h | 30 +++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index f620c80d77b..b8e134d9a9b 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -54,6 +54,13 @@ struct DeviceAdd {
   };
 };
 
+#ifdef WITH_CUDA 
+  template<typename T>
+  __device__ inline T device_min(T a, T b) {
+    return a <= b ? a : b;
+  }
+#endif
+
 class MaxPoolingParams3D {
  public:
   MaxPoolingParams3D(const int32_t dim, const ShapeView& x_shape, const std::string& data_format,
@@ -214,19 +221,35 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     const IDX start_idx = (n * n_channel + c) * x_width * x_height;
     IDX hstart = h * stride_h - padding_h;
     IDX wstart = w * stride_w - padding_w;
+    /*
     const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
                              ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
                              : x_height;
     const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
                              ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
                              : x_width;
+    */
+
+    #ifdef WITH_CUDA
+    const IDX hend = device_min<T>((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
+    const IDX wend = device_min<T>((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height);
+    // const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
+    //                          ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
+    //                          : x_height;
+    // const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
+    //                          ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
+    //                          : x_width;
+    #else
+    const IDX hend = std::min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
+    const IDX wend = std::min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height);
+    #endif
 
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
 
     /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
     IDX max_index = hstart * x_width + wstart;
-    IDX src_idx = 0;
+    // IDX src_idx = 0;
 
     /* equal to -std::numeric_limits<T>::infinity(); */
     T max_value = detail::numeric_limits<T>::lower_bound();
@@ -249,11 +272,12 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
         if (val > max_value || detail::numerics<T>::isnan(val)) {
           max_value = val;
           max_index = window_idx;
-          src_idx = search_idx;
+          // src_idx = search_idx;
         }
       }
     }
-    dest[num] = src[src_idx];
+    // dest[num] = src[src_idx];
+    dest[num] = max_value; 
     indice_ptr[num] = max_index;
   }
 }

From d9fb335c4efa0980172828f81a4008a4fe5e0226 Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Mon, 7 Feb 2022 20:32:15 +0800
Subject: [PATCH 06/17] remove redundant data_ptr offset

---
 oneflow/user/kernels/pooling_kernel_util.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index b8e134d9a9b..f062c72b110 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -254,11 +254,13 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     /* equal to -std::numeric_limits<T>::infinity(); */
     T max_value = detail::numeric_limits<T>::lower_bound();
 
+    const T* btm_data = src + start_idx; 
     for (IDX i = hstart; i < hend; i += dilation_h) {
       for (IDX j = wstart; j < wend; j += dilation_w) {
         const IDX window_idx = i * x_width + j;
-        const IDX search_idx = start_idx + window_idx;
-        T val = src[search_idx];
+        // const IDX search_idx = start_idx + window_idx;
+        // T val = src[search_idx];
+        T val = btm_data[window_idx]; 
         /* NOTE:
         std::isnan(val) only supports a few data types, see:
         https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile,

From 3b79ba8e43f3e887bdef029ec3f9a561930fc70d Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Tue, 8 Feb 2022 09:27:47 +0800
Subject: [PATCH 07/17] use int32 to describe x shape

---
 oneflow/user/kernels/pooling_kernel.cpp    |  6 +--
 oneflow/user/kernels/pooling_kernel_util.h | 48 +++++++++++-----------
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index ea726328a13..eba32c13151 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -46,9 +46,9 @@ template<typename T, typename IDX>
 void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                   IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                   const int32_t padding_h, const int32_t padding_w,
-                                  const int64_t n_batch, const int64_t n_channel,
-                                  const int64_t x_height, const int64_t x_width,
-                                  const int64_t y_height, const int64_t y_width,
+                                  const int32_t n_batch, const int32_t n_channel,
+                                  const int32_t x_height, const int32_t x_width,
+                                  const int32_t y_height, const int32_t y_width,
                                   const int32_t kernel_size_h, const int32_t kernel_size_w,
                                   const int32_t stride_h, const int32_t stride_w,
                                   const int32_t dilation_h, const int32_t dilation_w) {
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index f062c72b110..3b68b5a5c1e 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -76,8 +76,8 @@ class MaxPoolingParams3D {
   const std::vector<int32_t>& dilation_3d() const { return dilation_3d_; }
   const bool& return_indices() const { return return_indices_; }
   const bool& ceil_mode() const { return ceil_mode_; }
-  const int64_t& num_batch() const { return batch_num_; }
-  const int64_t& num_channel() const { return channel_num_; }
+  const int32_t& num_batch() const { return batch_num_; }
+  const int32_t& num_channel() const { return channel_num_; }
 
   void Reset(const ShapeView& x_shape);
   Shape GetYShape() const;
@@ -95,8 +95,8 @@ class MaxPoolingParams3D {
   std::vector<int32_t> dilation_3d_;
   bool return_indices_;
   bool ceil_mode_;
-  int64_t batch_num_;
-  int64_t channel_num_;
+  int32_t batch_num_;
+  int32_t channel_num_;
 };
 
 template<DeviceType device_type, typename T, typename IDX>
@@ -148,8 +148,8 @@ template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                             IDX elem_num, const T* src, T* dest,
                                             int64_t* indice_ptr, const int32_t padding_l,
-                                            const int64_t n_batch, const int64_t n_channel,
-                                            const int64_t x_length, const int32_t kernel_size_l, 
+                                            const int32_t n_batch, const int32_t n_channel,
+                                            const int32_t x_length, const int32_t kernel_size_l, 
                                             const int32_t stride_l, const int32_t dilation_l) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n, c, l;
@@ -188,9 +188,9 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 3> in
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                              const IDX elem_num, const T* src, T* dest,
-                                             const int64_t* indice_ptr, const int64_t n_batch,
-                                             const int64_t n_channel, const int64_t src_length,
-                                             const int64_t dst_length) {
+                                             const int64_t* indice_ptr, const int32_t n_batch,
+                                             const int32_t n_channel, const int32_t src_length,
+                                             const int32_t dst_length) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n, c, l;
     index_helper.OffsetToNdIndex(num, n, c, l);
@@ -209,8 +209,8 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 3> i
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num, const T* src, T* dest,
-    int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch,
-    const int64_t n_channel, const int64_t x_height, const int64_t x_width,
+    int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch,
+    const int32_t n_channel, const int32_t x_height, const int32_t x_width,
     const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
     const int32_t dilation_w) {
@@ -287,9 +287,9 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
     const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
-    T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel,
-    const int64_t src_height, const int64_t src_width, const int64_t dst_height,
-    const int64_t dst_width) {
+    T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
+    const int32_t src_height, const int32_t src_width, const int32_t dst_height,
+    const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n, c, h, w;
     index_helper.OffsetToNdIndex(num, n, c, h, w);
@@ -309,9 +309,9 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(
     const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
-    T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel,
-    const int64_t src_height, const int64_t src_width, const int64_t dst_height,
-    const int64_t dst_width) {
+    T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
+    const int32_t src_height, const int32_t src_width, const int32_t dst_height,
+    const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n, c, h, w;
     index_helper.OffsetToNdIndex(num, n, c, h, w);
@@ -330,8 +330,8 @@ template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool3dForwardCompute(
     const NdIndexOffsetHelper<IDX, 5> index_helper, IDX elem_num, const T* src, T* dest,
     int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
-    const int64_t n_batch, const int64_t n_channel, 
-    const int64_t x_time, const int64_t x_height, const int64_t x_width, 
+    const int32_t n_batch, const int32_t n_channel, 
+    const int32_t x_time, const int32_t x_height, const int32_t x_width, 
     const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_t, const int32_t stride_h, const int32_t stride_w,
     const int32_t dilation_t, const int32_t dilation_h, const int32_t dilation_w) {
@@ -384,11 +384,11 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper<IDX, 5> index_helper,
                                              const IDX elem_num, const T* src, T* dest,
-                                             const int64_t* indice_ptr, const int64_t n_batch,
-                                             const int64_t n_channel, const int64_t src_time,
-                                             const int64_t src_height, const int64_t src_width,
-                                             const int64_t dst_time, const int64_t dst_height,
-                                             const int64_t dst_width) {
+                                             const int64_t* indice_ptr, const int32_t n_batch,
+                                             const int32_t n_channel, const int32_t src_time,
+                                             const int32_t src_height, const int32_t src_width,
+                                             const int32_t dst_time, const int32_t dst_height,
+                                             const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n, c, t, h, w;
     index_helper.OffsetToNdIndex(num, n, c, t, h, w);

From 4f8bba56384322a6916c6892b41b40b23725a767 Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Tue, 8 Feb 2022 09:50:50 +0800
Subject: [PATCH 08/17] Fix cuda input params for maxpool2d

---
 oneflow/user/kernels/pooling_kernel.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu
index 1432ab2e783..84373a0cd26 100644
--- a/oneflow/user/kernels/pooling_kernel.cu
+++ b/oneflow/user/kernels/pooling_kernel.cu
@@ -98,8 +98,8 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                       IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
-                                      int32_t padding_h, int32_t padding_w, int64_t n_batch,
-                                      int64_t n_channel, int64_t x_height, int64_t x_width,
+                                      int32_t padding_h, int32_t padding_w, int32_t n_batch,
+                                      int32_t n_channel, int32_t x_height, int32_t x_width,
                                       int32_t kernel_size_h, int32_t kernel_size_w, 
                                       int32_t stride_h, int32_t stride_w,
                                       int32_t dilation_h, int32_t dilation_w) {

From 6b6a8e3ffd8eb7c068911310842d09fb98779a7a Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Tue, 8 Feb 2022 10:23:51 +0800
Subject: [PATCH 09/17] just for debug

---
 oneflow/user/kernels/pooling_kernel.cpp    | 57 +++++++++++++---------
 oneflow/user/kernels/pooling_kernel_util.h | 23 ++++-----
 2 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index eba32c13151..62f6a03fca8 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -300,7 +300,9 @@ class MaxPool2dKernel final : public user_op::OpKernel {
     const auto* pooling_cache = dynamic_cast<const PoolingOpKernelCache*>(cache);
     const MaxPoolingParams3D& params_3d = pooling_cache->GetParams3D();
 
-    const int64_t elem_num = y->shape().elem_cnt();
+    // const int64_t elem_num = y->shape().elem_cnt();
+    const int32_t elem_num = y->shape().elem_cnt();
+
     const T* src = x->dptr<T>();
     T* dest = y->mut_dptr<T>();
     int64_t* indice_ptr = indice->mut_dptr<int64_t>();
@@ -308,28 +310,39 @@ class MaxPool2dKernel final : public user_op::OpKernel {
     DimVector y_vector;
     y->shape().ToDimVector(&y_vector);
     const std::string& data_format = ctx->Attr<std::string>("data_format");
-    if(elem_num < GetMaxVal<int32_t>()){
-      NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
-      if (data_format == "channels_first") {
-      PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
+    // if(elem_num < GetMaxVal<int32_t>()){
+    //   NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
+    //   if (data_format == "channels_first") {
+    //   PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
+    //       ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+    //   } else if (data_format == "channels_last") {
+    //     PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCLast(
+    //         ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+    //   } else {
+    //     UNIMPLEMENTED() << "Unsupported data_format";
+    //   }
+    // }else{
+    //   NdIndexOffsetHelper<int64_t, 4> index_helper(y_vector.data());
+    //   if (data_format == "channels_first") {
+    //   PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCFirst(
+    //       ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+    //   } else if (data_format == "channels_last") {
+    //     PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCLast(
+    //         ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+    //   } else {
+    //     UNIMPLEMENTED() << "Unsupported data_format";
+    //   }
+    // }
+
+    NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
+    if (data_format == "channels_first") {
+    PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
+        ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+    } else if (data_format == "channels_last") {
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCLast(
           ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-      } else if (data_format == "channels_last") {
-        PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCLast(
-            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-      } else {
-        UNIMPLEMENTED() << "Unsupported data_format";
-      }
-    }else{
-      NdIndexOffsetHelper<int64_t, 4> index_helper(y_vector.data());
-      if (data_format == "channels_first") {
-      PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCFirst(
-          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-      } else if (data_format == "channels_last") {
-        PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCLast(
-            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-      } else {
-        UNIMPLEMENTED() << "Unsupported data_format";
-      }
+    } else {
+      UNIMPLEMENTED() << "Unsupported data_format";
     }
   };
 };
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 3b68b5a5c1e..918fcfcb8cb 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -218,9 +218,9 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     IDX n, c, h, w;
     index_helper.OffsetToNdIndex(num, n, c, h, w);
 
-    const IDX start_idx = (n * n_channel + c) * x_width * x_height;
-    IDX hstart = h * stride_h - padding_h;
-    IDX wstart = w * stride_w - padding_w;
+    const int32_t start_idx = (n * n_channel + c) * x_width * x_height;
+    int32_t hstart = h * stride_h - padding_h;
+    int32_t wstart = w * stride_w - padding_w;
     /*
     const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
                              ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
@@ -231,8 +231,9 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     */
 
     #ifdef WITH_CUDA
-    const IDX hend = device_min<T>((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
-    const IDX wend = device_min<T>((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height);
+    const int32_t hend = device_min<T>((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
+    const int32_t wend = device_min<T>((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height);
+    
     // const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
     //                          ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
     //                          : x_height;
@@ -240,24 +241,24 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     //                          ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
     //                          : x_width;
     #else
-    const IDX hend = std::min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
-    const IDX wend = std::min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height);
+    const int32_t hend = std::min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
+    const int32_t wend = std::min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height);
     #endif
 
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
 
     /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
-    IDX max_index = hstart * x_width + wstart;
+    int32_t max_index = hstart * x_width + wstart;
     // IDX src_idx = 0;
 
     /* equal to -std::numeric_limits<T>::infinity(); */
     T max_value = detail::numeric_limits<T>::lower_bound();
 
     const T* btm_data = src + start_idx; 
-    for (IDX i = hstart; i < hend; i += dilation_h) {
-      for (IDX j = wstart; j < wend; j += dilation_w) {
-        const IDX window_idx = i * x_width + j;
+    for (int32_t i = hstart; i < hend; i += dilation_h) {
+      for (int32_t j = wstart; j < wend; j += dilation_w) {
+        const int32_t window_idx = i * x_width + j;
         // const IDX search_idx = start_idx + window_idx;
         // T val = src[search_idx];
         T val = btm_data[window_idx]; 

From 385c962e97a6386bd614e3641fcad5b4dce82daa Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Tue, 8 Feb 2022 11:36:55 +0800
Subject: [PATCH 10/17] just for profile

---
 oneflow/user/kernels/pooling_kernel.cpp    |  4 +--
 oneflow/user/kernels/pooling_kernel_util.h | 42 +++++++++++-----------
 2 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index 62f6a03fca8..49070b51f9a 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -300,8 +300,8 @@ class MaxPool2dKernel final : public user_op::OpKernel {
     const auto* pooling_cache = dynamic_cast<const PoolingOpKernelCache*>(cache);
     const MaxPoolingParams3D& params_3d = pooling_cache->GetParams3D();
 
-    // const int64_t elem_num = y->shape().elem_cnt();
-    const int32_t elem_num = y->shape().elem_cnt();
+    const int64_t elem_num = y->shape().elem_cnt();
+    // const int32_t elem_num = y->shape().elem_cnt();
 
     const T* src = x->dptr<T>();
     T* dest = y->mut_dptr<T>();
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 918fcfcb8cb..5e548fb4224 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -55,10 +55,11 @@ struct DeviceAdd {
 };
 
 #ifdef WITH_CUDA 
-  template<typename T>
-  __device__ inline T device_min(T a, T b) {
-    return a <= b ? a : b;
-  }
+ 
+OF_DEVICE_FUNC int32_t device_min(int32_t a, int32_t b) {
+  return a <= b ? a : b;
+}
+
 #endif
 
 class MaxPoolingParams3D {
@@ -217,8 +218,6 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n, c, h, w;
     index_helper.OffsetToNdIndex(num, n, c, h, w);
-
-    const int32_t start_idx = (n * n_channel + c) * x_width * x_height;
     int32_t hstart = h * stride_h - padding_h;
     int32_t wstart = w * stride_w - padding_w;
     /*
@@ -229,11 +228,9 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
                              ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
                              : x_width;
     */
-
     #ifdef WITH_CUDA
-    const int32_t hend = device_min<T>((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
-    const int32_t wend = device_min<T>((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height);
-    
+    const int32_t hend = device_min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
+    const int32_t wend = device_min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_width);
     // const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
     //                          ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
     //                          : x_height;
@@ -247,21 +244,18 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
 
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
-
+    /* equal to -std::numeric_limits<T>::infinity(); */
+    T max_value = -10000;
     /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
     int32_t max_index = hstart * x_width + wstart;
-    // IDX src_idx = 0;
-
-    /* equal to -std::numeric_limits<T>::infinity(); */
-    T max_value = detail::numeric_limits<T>::lower_bound();
-
-    const T* btm_data = src + start_idx; 
+    const T* btm_data = src + (n * n_channel + c) * x_width * x_height;
     for (int32_t i = hstart; i < hend; i += dilation_h) {
       for (int32_t j = wstart; j < wend; j += dilation_w) {
-        const int32_t window_idx = i * x_width + j;
+        // const int32_t window_idx = i * x_width + j;
         // const IDX search_idx = start_idx + window_idx;
         // T val = src[search_idx];
-        T val = btm_data[window_idx]; 
+        // T val = btm_data[window_idx]; 
+        T val = btm_data[i * x_width + j]; 
         /* NOTE:
         std::isnan(val) only supports a few data types, see:
         https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile,
@@ -273,14 +267,18 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
         but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown!
         */
         if (val > max_value || detail::numerics<T>::isnan(val)) {
-          max_value = val;
-          max_index = window_idx;
+          max_index = i * x_width + j;
+          // max_value = val;
+          max_value = static_cast<T>(val);
+
+          // max_index = window_idx;
           // src_idx = search_idx;
         }
       }
     }
     // dest[num] = src[src_idx];
-    dest[num] = max_value; 
+    // dest[num] = max_value;
+    dest[num] = static_cast<T>(max_value); 
     indice_ptr[num] = max_index;
   }
 }

From 1c39ee27f4d10f13be9e32003c67ddcaec2707aa Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Tue, 8 Feb 2022 17:05:16 +0800
Subject: [PATCH 11/17] reduce div

---
 oneflow/user/kernels/pooling_kernel.cpp    | 17 +++++---
 oneflow/user/kernels/pooling_kernel.cu     |  4 +-
 oneflow/user/kernels/pooling_kernel_util.h | 47 +++++-----------------
 3 files changed, 25 insertions(+), 43 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index 49070b51f9a..0eac427f710 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -118,7 +118,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   }
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
-                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                     const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
                                      int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool2dForwardComputeCFirst<T, IDX>(
@@ -307,8 +307,8 @@ class MaxPool2dKernel final : public user_op::OpKernel {
     T* dest = y->mut_dptr<T>();
     int64_t* indice_ptr = indice->mut_dptr<int64_t>();
 
-    DimVector y_vector;
-    y->shape().ToDimVector(&y_vector);
+
+
     const std::string& data_format = ctx->Attr<std::string>("data_format");
     // if(elem_num < GetMaxVal<int32_t>()){
     //   NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
@@ -334,11 +334,18 @@ class MaxPool2dKernel final : public user_op::OpKernel {
     //   }
     // }
 
-    NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
     if (data_format == "channels_first") {
-    PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
+      DimVector y_vector(3);
+      y_vector.at(0) = y->shape().At(0) * y->shape().At(1); 
+      y_vector.at(1) = y->shape().At(2); 
+      y_vector.at(2) = y->shape().At(3); 
+      NdIndexOffsetHelper<int32_t, 3> index_helper(y_vector.data());
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
         ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
     } else if (data_format == "channels_last") {
+      DimVector y_vector;
+      y->shape().ToDimVector(&y_vector);
+      NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
       PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCLast(
           ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
     } else {
diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu
index 84373a0cd26..8ac9bfaacfa 100644
--- a/oneflow/user/kernels/pooling_kernel.cu
+++ b/oneflow/user/kernels/pooling_kernel.cu
@@ -96,7 +96,7 @@ __launch_bounds__(kBlockSize) __global__
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper<IDX, 4> index_helper,
+    void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                       IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                       int32_t padding_h, int32_t padding_w, int32_t n_batch,
                                       int32_t n_channel, int32_t x_height, int32_t x_width,
@@ -214,7 +214,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   }
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
-                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                     const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
                                      int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dForwardCFirst<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 5e548fb4224..2e43d1ff033 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -113,7 +113,7 @@ struct PoolingKernelUtil {
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
-                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                     const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
                                      int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
@@ -209,53 +209,34 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 3> i
 
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
-    const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num, const T* src, T* dest,
+    const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num, const T* src, T* dest,
     int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch,
     const int32_t n_channel, const int32_t x_height, const int32_t x_width,
     const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
     const int32_t dilation_w) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    IDX n, c, h, w;
-    index_helper.OffsetToNdIndex(num, n, c, h, w);
+    IDX n_c, h, w;
+    index_helper.OffsetToNdIndex(num, n_c, h, w);
     int32_t hstart = h * stride_h - padding_h;
     int32_t wstart = w * stride_w - padding_w;
-    /*
     const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
                              ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
                              : x_height;
     const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
                              ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
                              : x_width;
-    */
-    #ifdef WITH_CUDA
-    const int32_t hend = device_min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
-    const int32_t wend = device_min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_width);
-    // const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
-    //                          ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
-    //                          : x_height;
-    // const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
-    //                          ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
-    //                          : x_width;
-    #else
-    const int32_t hend = std::min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height);
-    const int32_t wend = std::min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height);
-    #endif
-
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
     /* equal to -std::numeric_limits<T>::infinity(); */
-    T max_value = -10000;
+    T max_value = detail::numeric_limits<T>::lower_bound();
     /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
     int32_t max_index = hstart * x_width + wstart;
-    const T* btm_data = src + (n * n_channel + c) * x_width * x_height;
+    const T* btm_data = src + n_c * x_width * x_height;
     for (int32_t i = hstart; i < hend; i += dilation_h) {
       for (int32_t j = wstart; j < wend; j += dilation_w) {
-        // const int32_t window_idx = i * x_width + j;
-        // const IDX search_idx = start_idx + window_idx;
-        // T val = src[search_idx];
-        // T val = btm_data[window_idx]; 
-        T val = btm_data[i * x_width + j]; 
+        const int32_t window_idx = i * x_width + j;
+        T val = btm_data[window_idx]; 
         /* NOTE:
         std::isnan(val) only supports a few data types, see:
         https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile,
@@ -267,18 +248,12 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
         but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown!
         */
         if (val > max_value || detail::numerics<T>::isnan(val)) {
-          max_index = i * x_width + j;
-          // max_value = val;
-          max_value = static_cast<T>(val);
-
-          // max_index = window_idx;
-          // src_idx = search_idx;
+          max_index = window_idx;
+          max_value = val;
         }
       }
     }
-    // dest[num] = src[src_idx];
-    // dest[num] = max_value;
-    dest[num] = static_cast<T>(max_value); 
+    dest[num] = max_value;
     indice_ptr[num] = max_index;
   }
 }

From 5495891220108df1f98f2184c75a120ce3c5d8b3 Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Tue, 8 Feb 2022 17:45:13 +0800
Subject: [PATCH 12/17] use int32_t indice

---
 oneflow/user/kernels/pooling_kernel.cpp    | 79 +++++++++-------------
 oneflow/user/kernels/pooling_kernel.cu     | 34 +++++-----
 oneflow/user/kernels/pooling_kernel_util.h | 30 ++++----
 oneflow/user/ops/pooling_op.cpp            |  2 +-
 4 files changed, 66 insertions(+), 79 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index 0eac427f710..65aae1fe506 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -44,7 +44,7 @@ namespace {
 
 template<typename T, typename IDX>
 void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<IDX, 4>& index_helper,
-                                  IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                  IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                   const int32_t padding_h, const int32_t padding_w,
                                   const int32_t n_batch, const int32_t n_channel,
                                   const int32_t x_height, const int32_t x_width,
@@ -99,7 +99,7 @@ template<typename T, typename IDX>
 struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool1dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     Maxpool1dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                params_3d.padding()[2], params_3d.num_batch(),
@@ -111,7 +111,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool1dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                 params_3d.num_batch(), params_3d.num_channel(),
                                 params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4));
@@ -120,7 +120,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                     int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool2dForwardComputeCFirst<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
@@ -133,7 +133,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
                                       const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                       const IDX elem_num, const T* src, T* dest,
-                                      const int64_t* indice_ptr,
+                                      const int32_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
     Maxpool2dBackwardComputeCFirst<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                       params_3d.num_batch(), params_3d.num_channel(),
@@ -144,7 +144,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool2dForwardCLast(ep::Stream* stream,
                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                     const IDX elem_num, const T* src, T* dest,
-                                    int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                    int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool2dForwardComputeCLast<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
@@ -157,7 +157,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     const int64_t* indice_ptr,
+                                     const int32_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d) {
     Maxpool2dBackwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                      params_3d.num_batch(), params_3d.num_channel(),
@@ -167,7 +167,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
 
   static void Maxpool3dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     Maxpool3dForwardCompute<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0],
@@ -183,7 +183,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool3dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 5> index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool3dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                 params_3d.num_batch(), params_3d.num_channel(),
                                 params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3),
@@ -217,7 +217,7 @@ class MaxPool1dKernel final : public user_op::OpKernel {
     const int64_t elem_num = y->shape().elem_cnt();
     const T* src = x->dptr<T>();
     T* dest = y->mut_dptr<T>();
-    int64_t* indice_ptr = indice->mut_dptr<int64_t>();
+    int32_t* indice_ptr = indice->mut_dptr<int32_t>();
 
     DimVector y_vector;
     y->shape().ToDimVector(&y_vector);
@@ -258,7 +258,7 @@ class MaxPool1dGradKernel final : public user_op::OpKernel {
 
     const int64_t elem_num = dy->shape().elem_cnt();
     const T* src = dy->dptr<T>();
-    const int64_t* indice_ptr = indice->dptr<int64_t>();
+    const int32_t* indice_ptr = indice->dptr<int32_t>();
     T* dest = dx->mut_dptr<T>();
     DimVector dy_vector;
     dy->shape().ToDimVector(&dy_vector);
@@ -305,49 +305,36 @@ class MaxPool2dKernel final : public user_op::OpKernel {
 
     const T* src = x->dptr<T>();
     T* dest = y->mut_dptr<T>();
-    int64_t* indice_ptr = indice->mut_dptr<int64_t>();
-
-
+    int32_t* indice_ptr = indice->mut_dptr<int32_t>();
 
     const std::string& data_format = ctx->Attr<std::string>("data_format");
-    // if(elem_num < GetMaxVal<int32_t>()){
-    //   NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
-    //   if (data_format == "channels_first") {
-    //   PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
-    //       ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-    //   } else if (data_format == "channels_last") {
-    //     PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCLast(
-    //         ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-    //   } else {
-    //     UNIMPLEMENTED() << "Unsupported data_format";
-    //   }
-    // }else{
-    //   NdIndexOffsetHelper<int64_t, 4> index_helper(y_vector.data());
-    //   if (data_format == "channels_first") {
-    //   PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCFirst(
-    //       ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-    //   } else if (data_format == "channels_last") {
-    //     PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCLast(
-    //         ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-    //   } else {
-    //     UNIMPLEMENTED() << "Unsupported data_format";
-    //   }
-    // }
-
     if (data_format == "channels_first") {
       DimVector y_vector(3);
       y_vector.at(0) = y->shape().At(0) * y->shape().At(1); 
       y_vector.at(1) = y->shape().At(2); 
       y_vector.at(2) = y->shape().At(3); 
-      NdIndexOffsetHelper<int32_t, 3> index_helper(y_vector.data());
-      PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
-        ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      if(elem_num < GetMaxVal<int32_t>()){
+        NdIndexOffsetHelper<int32_t, 3> index_helper(y_vector.data());
+        PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      }
+      else{
+        NdIndexOffsetHelper<int64_t, 3> index_helper(y_vector.data());
+        PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCFirst(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      }
     } else if (data_format == "channels_last") {
       DimVector y_vector;
       y->shape().ToDimVector(&y_vector);
-      NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
-      PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCLast(
+      if(elem_num < GetMaxVal<int32_t>()){
+        NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
+        PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCLast(
           ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      }else{
+        NdIndexOffsetHelper<int64_t, 4> index_helper(y_vector.data());
+        PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCLast(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      }
     } else {
       UNIMPLEMENTED() << "Unsupported data_format";
     }
@@ -378,7 +365,7 @@ class MaxPool2dGradKernel final : public user_op::OpKernel {
 
     const int64_t elem_num = dy->shape().elem_cnt();
     const T* src = dy->dptr<T>();
-    const int64_t* indice_ptr = indice->dptr<int64_t>();
+    const int32_t* indice_ptr = indice->dptr<int32_t>();
     T* dest = dx->mut_dptr<T>();
     DimVector dy_vector;
     dy->shape().ToDimVector(&dy_vector);
@@ -438,7 +425,7 @@ class MaxPool3dKernel final : public user_op::OpKernel {
     const int64_t elem_num = y->shape().elem_cnt();
     const T* src = x->dptr<T>();
     T* dest = y->mut_dptr<T>();
-    int64_t* indice_ptr = indice->mut_dptr<int64_t>();
+    int32_t* indice_ptr = indice->mut_dptr<int32_t>();
 
     DimVector y_vector;
     y->shape().ToDimVector(&y_vector);
@@ -478,7 +465,7 @@ class MaxPool3dGradKernel final : public user_op::OpKernel {
 
     const int64_t elem_num = dy->shape().elem_cnt();
     const T* src = dy->dptr<T>();
-    const int64_t* indice_ptr = indice->dptr<int64_t>();
+    const int32_t* indice_ptr = indice->dptr<int32_t>();
     T* dest = dx->mut_dptr<T>();
 
     DimVector dy_vector;
diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu
index 8ac9bfaacfa..f8a590d00cc 100644
--- a/oneflow/user/kernels/pooling_kernel.cu
+++ b/oneflow/user/kernels/pooling_kernel.cu
@@ -35,7 +35,7 @@ int GetNumBlocks(int64_t elem_cnt) {
 template<typename T, typename IDX>
 __device__ __inline__ void Maxpool2dForwardComputeCLast(
     const NdIndexOffsetHelper<IDX, 4>& index_helper, IDX elem_num, const T* src, T* dest,
-    int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch,
+    int32_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch,
     const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height,
     const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
@@ -86,7 +86,7 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast(
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper<IDX, 3> index_helper,
-                                IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                 int32_t padding_l, int64_t n_batch, int64_t n_channel,
                                 int64_t x_length, int32_t kernel_size_l,
                                 int32_t stride_l, int32_t dilation_l) {
@@ -97,7 +97,7 @@ __launch_bounds__(kBlockSize) __global__
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper<IDX, 3> index_helper,
-                                      IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                      IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                       int32_t padding_h, int32_t padding_w, int32_t n_batch,
                                       int32_t n_channel, int32_t x_height, int32_t x_width,
                                       int32_t kernel_size_h, int32_t kernel_size_w, 
@@ -112,7 +112,7 @@ __launch_bounds__(kBlockSize) __global__
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
-                                     IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                     IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                      int32_t padding_h, int32_t padding_w, int64_t n_batch,
                                      int64_t n_channel, int64_t x_height, int64_t x_width,
                                      int64_t y_height, int64_t y_width, int32_t kernel_size_h,
@@ -127,7 +127,7 @@ __launch_bounds__(kBlockSize) __global__
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper<IDX, 5> index_helper,
-                                IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                 int32_t padding_t, int32_t padding_h, int32_t padding_w,
                                 int64_t n_batch, int64_t n_channel, int64_t x_time,
                                 int64_t x_height, int64_t x_width, 
@@ -145,7 +145,7 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                  const IDX elem_num, const T* src, T* dest,
-                                 const int64_t* indice_ptr, const int64_t n_batch,
+                                 const int32_t* indice_ptr, const int64_t n_batch,
                                  const int64_t n_channel, const int64_t src_length,
                                  const int64_t dst_length) {
   Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
@@ -156,7 +156,7 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                        const IDX elem_num, const T* src, T* dest,
-                                       const int64_t* indice_ptr, const int64_t n_batch,
+                                       const int32_t* indice_ptr, const int64_t n_batch,
                                        const int64_t n_channel, const int64_t src_height,
                                        const int64_t src_width, const int64_t dst_height,
                                        const int64_t dst_width) {
@@ -168,7 +168,7 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                       const IDX elem_num, const T* src, T* dest,
-                                      const int64_t* indice_ptr, const int64_t n_batch,
+                                      const int32_t* indice_ptr, const int64_t n_batch,
                                       const int64_t n_channel, const int64_t src_height,
                                       const int64_t src_width, const int64_t dst_height,
                                       const int64_t dst_width) {
@@ -180,7 +180,7 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper<IDX, 5> index_helper,
                                  const IDX elem_num, const T* src, T* dest,
-                                 const int64_t* indice_ptr, const int64_t n_batch,
+                                 const int32_t* indice_ptr, const int64_t n_batch,
                                  const int64_t n_channel, const int64_t src_time,
                                  const int64_t src_height, const int64_t src_width,
                                  const int64_t dst_time, const int64_t dst_height,
@@ -193,7 +193,7 @@ template<typename T, typename IDX>
 struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool1dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
@@ -206,7 +206,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool1dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                  stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
@@ -216,7 +216,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                     int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dForwardCFirst<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                       stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
@@ -230,7 +230,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
                                       const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                       const IDX elem_num, const T* src, T* dest,
-                                      const int64_t* indice_ptr,
+                                      const int32_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dBackwardCFirst<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                        stream->As<ep::CudaStream>()->cuda_stream()>>>(
@@ -242,7 +242,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool2dForwardCLast(ep::Stream* stream,
                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                     const IDX elem_num, const T* src, T* dest,
-                                    int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                    int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dForwardCLast<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
@@ -256,7 +256,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     const int64_t* indice_ptr,
+                                     const int32_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dBackwardCLast<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                       stream->As<ep::CudaStream>()->cuda_stream()>>>(
@@ -267,7 +267,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
 
   static void Maxpool3dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
@@ -285,7 +285,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool3dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 5>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                  stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 2e43d1ff033..208dda34e45 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -104,51 +104,51 @@ template<DeviceType device_type, typename T, typename IDX>
 struct PoolingKernelUtil {
   static void Maxpool1dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
   static void Maxpool1dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                     int32_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
                                       const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                       const IDX elem_num, const T* src, T* dest,
-                                      const int64_t* indice_ptr,
+                                      const int32_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dForwardCLast(ep::Stream* stream,
                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                     const IDX elem_num, const T* src, T* dest,
-                                    int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                    int32_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     const int64_t* indice_ptr,
+                                     const int32_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d);
 
   static void Maxpool3dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
   static void Maxpool3dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 5>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 };
 
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                             IDX elem_num, const T* src, T* dest,
-                                            int64_t* indice_ptr, const int32_t padding_l,
+                                            int32_t* indice_ptr, const int32_t padding_l,
                                             const int32_t n_batch, const int32_t n_channel,
                                             const int32_t x_length, const int32_t kernel_size_l, 
                                             const int32_t stride_l, const int32_t dilation_l) {
@@ -189,7 +189,7 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 3> in
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                              const IDX elem_num, const T* src, T* dest,
-                                             const int64_t* indice_ptr, const int32_t n_batch,
+                                             const int32_t* indice_ptr, const int32_t n_batch,
                                              const int32_t n_channel, const int32_t src_length,
                                              const int32_t dst_length) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
@@ -210,7 +210,7 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 3> i
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num, const T* src, T* dest,
-    int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch,
+    int32_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch,
     const int32_t n_channel, const int32_t x_height, const int32_t x_width,
     const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
@@ -261,7 +261,7 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
     const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
-    T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
+    T* dest, const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
     const int32_t src_height, const int32_t src_width, const int32_t dst_height,
     const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
@@ -283,7 +283,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(
     const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
-    T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
+    T* dest, const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
     const int32_t src_height, const int32_t src_width, const int32_t dst_height,
     const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
@@ -303,7 +303,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool3dForwardCompute(
     const NdIndexOffsetHelper<IDX, 5> index_helper, IDX elem_num, const T* src, T* dest,
-    int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
+    int32_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
     const int32_t n_batch, const int32_t n_channel, 
     const int32_t x_time, const int32_t x_height, const int32_t x_width, 
     const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w,
@@ -358,7 +358,7 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper<IDX, 5> index_helper,
                                              const IDX elem_num, const T* src, T* dest,
-                                             const int64_t* indice_ptr, const int32_t n_batch,
+                                             const int32_t* indice_ptr, const int32_t n_batch,
                                              const int32_t n_channel, const int32_t src_time,
                                              const int32_t src_height, const int32_t src_width,
                                              const int32_t dst_time, const int32_t dst_height,
diff --git a/oneflow/user/ops/pooling_op.cpp b/oneflow/user/ops/pooling_op.cpp
index ffb7d85e1ba..31289e5da9f 100644
--- a/oneflow/user/ops/pooling_op.cpp
+++ b/oneflow/user/ops/pooling_op.cpp
@@ -56,7 +56,7 @@ TensorDescInferFn MaxPoolMakeForwardTensorDescInferFn(const int32_t dim) {
     *indice_desc = *ctx->OutputTensorDesc("y", 0);
     *indice_desc->mut_shape() = *y_desc->mut_shape();
     DataType* dtype = indice_desc->mut_data_type();
-    *dtype = kInt64;
+    *dtype = kInt32;
     return Maybe<void>::Ok();
   };
 }

From f62ac8fefb1d7197f6c68ea51b746c9d5e6cbf57 Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Tue, 8 Feb 2022 17:58:12 +0800
Subject: [PATCH 13/17] revert back to use int64_t

---
 oneflow/user/kernels/pooling_kernel.cpp    | 30 +++++++++----------
 oneflow/user/kernels/pooling_kernel.cu     | 34 +++++++++++-----------
 oneflow/user/kernels/pooling_kernel_util.h | 30 +++++++++----------
 oneflow/user/ops/pooling_op.cpp            |  2 +-
 4 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index 65aae1fe506..4220bd26fbf 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -44,7 +44,7 @@ namespace {
 
 template<typename T, typename IDX>
 void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<IDX, 4>& index_helper,
-                                  IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                                  IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                   const int32_t padding_h, const int32_t padding_w,
                                   const int32_t n_batch, const int32_t n_channel,
                                   const int32_t x_height, const int32_t x_width,
@@ -99,7 +99,7 @@ template<typename T, typename IDX>
 struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool1dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     Maxpool1dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                params_3d.padding()[2], params_3d.num_batch(),
@@ -111,7 +111,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool1dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                 params_3d.num_batch(), params_3d.num_channel(),
                                 params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4));
@@ -120,7 +120,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool2dForwardComputeCFirst<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
@@ -133,7 +133,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
                                       const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                       const IDX elem_num, const T* src, T* dest,
-                                      const int32_t* indice_ptr,
+                                      const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
     Maxpool2dBackwardComputeCFirst<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                       params_3d.num_batch(), params_3d.num_channel(),
@@ -144,7 +144,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool2dForwardCLast(ep::Stream* stream,
                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                     const IDX elem_num, const T* src, T* dest,
-                                    int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                    int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool2dForwardComputeCLast<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
@@ -157,7 +157,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     const int32_t* indice_ptr,
+                                     const int64_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d) {
     Maxpool2dBackwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                      params_3d.num_batch(), params_3d.num_channel(),
@@ -167,7 +167,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
 
   static void Maxpool3dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     Maxpool3dForwardCompute<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0],
@@ -183,7 +183,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool3dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 5> index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool3dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
                                 params_3d.num_batch(), params_3d.num_channel(),
                                 params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3),
@@ -217,7 +217,7 @@ class MaxPool1dKernel final : public user_op::OpKernel {
     const int64_t elem_num = y->shape().elem_cnt();
     const T* src = x->dptr<T>();
     T* dest = y->mut_dptr<T>();
-    int32_t* indice_ptr = indice->mut_dptr<int32_t>();
+    int64_t* indice_ptr = indice->mut_dptr<int64_t>();
 
     DimVector y_vector;
     y->shape().ToDimVector(&y_vector);
@@ -258,7 +258,7 @@ class MaxPool1dGradKernel final : public user_op::OpKernel {
 
     const int64_t elem_num = dy->shape().elem_cnt();
     const T* src = dy->dptr<T>();
-    const int32_t* indice_ptr = indice->dptr<int32_t>();
+    const int64_t* indice_ptr = indice->dptr<int64_t>();
     T* dest = dx->mut_dptr<T>();
     DimVector dy_vector;
     dy->shape().ToDimVector(&dy_vector);
@@ -305,7 +305,7 @@ class MaxPool2dKernel final : public user_op::OpKernel {
 
     const T* src = x->dptr<T>();
     T* dest = y->mut_dptr<T>();
-    int32_t* indice_ptr = indice->mut_dptr<int32_t>();
+    int64_t* indice_ptr = indice->mut_dptr<int64_t>();
 
     const std::string& data_format = ctx->Attr<std::string>("data_format");
     if (data_format == "channels_first") {
@@ -365,7 +365,7 @@ class MaxPool2dGradKernel final : public user_op::OpKernel {
 
     const int64_t elem_num = dy->shape().elem_cnt();
     const T* src = dy->dptr<T>();
-    const int32_t* indice_ptr = indice->dptr<int32_t>();
+    const int64_t* indice_ptr = indice->dptr<int64_t>();
     T* dest = dx->mut_dptr<T>();
     DimVector dy_vector;
     dy->shape().ToDimVector(&dy_vector);
@@ -425,7 +425,7 @@ class MaxPool3dKernel final : public user_op::OpKernel {
     const int64_t elem_num = y->shape().elem_cnt();
     const T* src = x->dptr<T>();
     T* dest = y->mut_dptr<T>();
-    int32_t* indice_ptr = indice->mut_dptr<int32_t>();
+    int64_t* indice_ptr = indice->mut_dptr<int64_t>();
 
     DimVector y_vector;
     y->shape().ToDimVector(&y_vector);
@@ -465,7 +465,7 @@ class MaxPool3dGradKernel final : public user_op::OpKernel {
 
     const int64_t elem_num = dy->shape().elem_cnt();
     const T* src = dy->dptr<T>();
-    const int32_t* indice_ptr = indice->dptr<int32_t>();
+    const int64_t* indice_ptr = indice->dptr<int64_t>();
     T* dest = dx->mut_dptr<T>();
 
     DimVector dy_vector;
diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu
index f8a590d00cc..8ac9bfaacfa 100644
--- a/oneflow/user/kernels/pooling_kernel.cu
+++ b/oneflow/user/kernels/pooling_kernel.cu
@@ -35,7 +35,7 @@ int GetNumBlocks(int64_t elem_cnt) {
 template<typename T, typename IDX>
 __device__ __inline__ void Maxpool2dForwardComputeCLast(
     const NdIndexOffsetHelper<IDX, 4>& index_helper, IDX elem_num, const T* src, T* dest,
-    int32_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch,
+    int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch,
     const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height,
     const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
@@ -86,7 +86,7 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast(
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper<IDX, 3> index_helper,
-                                IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                                IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                 int32_t padding_l, int64_t n_batch, int64_t n_channel,
                                 int64_t x_length, int32_t kernel_size_l,
                                 int32_t stride_l, int32_t dilation_l) {
@@ -97,7 +97,7 @@ __launch_bounds__(kBlockSize) __global__
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper<IDX, 3> index_helper,
-                                      IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                                      IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                       int32_t padding_h, int32_t padding_w, int32_t n_batch,
                                       int32_t n_channel, int32_t x_height, int32_t x_width,
                                       int32_t kernel_size_h, int32_t kernel_size_w, 
@@ -112,7 +112,7 @@ __launch_bounds__(kBlockSize) __global__
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
-                                     IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                                     IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                      int32_t padding_h, int32_t padding_w, int64_t n_batch,
                                      int64_t n_channel, int64_t x_height, int64_t x_width,
                                      int64_t y_height, int64_t y_width, int32_t kernel_size_h,
@@ -127,7 +127,7 @@ __launch_bounds__(kBlockSize) __global__
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper<IDX, 5> index_helper,
-                                IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                                IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                 int32_t padding_t, int32_t padding_h, int32_t padding_w,
                                 int64_t n_batch, int64_t n_channel, int64_t x_time,
                                 int64_t x_height, int64_t x_width, 
@@ -145,7 +145,7 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                  const IDX elem_num, const T* src, T* dest,
-                                 const int32_t* indice_ptr, const int64_t n_batch,
+                                 const int64_t* indice_ptr, const int64_t n_batch,
                                  const int64_t n_channel, const int64_t src_length,
                                  const int64_t dst_length) {
   Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
@@ -156,7 +156,7 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                        const IDX elem_num, const T* src, T* dest,
-                                       const int32_t* indice_ptr, const int64_t n_batch,
+                                       const int64_t* indice_ptr, const int64_t n_batch,
                                        const int64_t n_channel, const int64_t src_height,
                                        const int64_t src_width, const int64_t dst_height,
                                        const int64_t dst_width) {
@@ -168,7 +168,7 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                       const IDX elem_num, const T* src, T* dest,
-                                      const int32_t* indice_ptr, const int64_t n_batch,
+                                      const int64_t* indice_ptr, const int64_t n_batch,
                                       const int64_t n_channel, const int64_t src_height,
                                       const int64_t src_width, const int64_t dst_height,
                                       const int64_t dst_width) {
@@ -180,7 +180,7 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper<IDX, 5> index_helper,
                                  const IDX elem_num, const T* src, T* dest,
-                                 const int32_t* indice_ptr, const int64_t n_batch,
+                                 const int64_t* indice_ptr, const int64_t n_batch,
                                  const int64_t n_channel, const int64_t src_time,
                                  const int64_t src_height, const int64_t src_width,
                                  const int64_t dst_time, const int64_t dst_height,
@@ -193,7 +193,7 @@ template<typename T, typename IDX>
 struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool1dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
@@ -206,7 +206,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool1dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                  stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
@@ -216,7 +216,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dForwardCFirst<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                       stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
@@ -230,7 +230,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
                                       const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                       const IDX elem_num, const T* src, T* dest,
-                                      const int32_t* indice_ptr,
+                                      const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dBackwardCFirst<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                        stream->As<ep::CudaStream>()->cuda_stream()>>>(
@@ -242,7 +242,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool2dForwardCLast(ep::Stream* stream,
                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                     const IDX elem_num, const T* src, T* dest,
-                                    int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                    int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dForwardCLast<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
@@ -256,7 +256,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     const int32_t* indice_ptr,
+                                     const int64_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dBackwardCLast<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                       stream->As<ep::CudaStream>()->cuda_stream()>>>(
@@ -267,7 +267,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
 
   static void Maxpool3dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
@@ -285,7 +285,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool3dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 5>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
                                  stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 208dda34e45..2e43d1ff033 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -104,51 +104,51 @@ template<DeviceType device_type, typename T, typename IDX>
 struct PoolingKernelUtil {
   static void Maxpool1dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
   static void Maxpool1dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     int32_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
                                       const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                       const IDX elem_num, const T* src, T* dest,
-                                      const int32_t* indice_ptr,
+                                      const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dForwardCLast(ep::Stream* stream,
                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                     const IDX elem_num, const T* src, T* dest,
-                                    int32_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                    int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                      const IDX elem_num, const T* src, T* dest,
-                                     const int32_t* indice_ptr,
+                                     const int64_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d);
 
   static void Maxpool3dForward(ep::Stream* stream,
                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
-                               const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr,
+                               const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
   static void Maxpool3dBackward(ep::Stream* stream,
                                 const NdIndexOffsetHelper<IDX, 5>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
-                                const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 };
 
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                             IDX elem_num, const T* src, T* dest,
-                                            int32_t* indice_ptr, const int32_t padding_l,
+                                            int64_t* indice_ptr, const int32_t padding_l,
                                             const int32_t n_batch, const int32_t n_channel,
                                             const int32_t x_length, const int32_t kernel_size_l, 
                                             const int32_t stride_l, const int32_t dilation_l) {
@@ -189,7 +189,7 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 3> in
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                              const IDX elem_num, const T* src, T* dest,
-                                             const int32_t* indice_ptr, const int32_t n_batch,
+                                             const int64_t* indice_ptr, const int32_t n_batch,
                                              const int32_t n_channel, const int32_t src_length,
                                              const int32_t dst_length) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
@@ -210,7 +210,7 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 3> i
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num, const T* src, T* dest,
-    int32_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch,
+    int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch,
     const int32_t n_channel, const int32_t x_height, const int32_t x_width,
     const int32_t kernel_size_h, const int32_t kernel_size_w,
     const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
@@ -261,7 +261,7 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
     const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
-    T* dest, const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
+    T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
     const int32_t src_height, const int32_t src_width, const int32_t dst_height,
     const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
@@ -283,7 +283,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(
     const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
-    T* dest, const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
+    T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
     const int32_t src_height, const int32_t src_width, const int32_t dst_height,
     const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
@@ -303,7 +303,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool3dForwardCompute(
     const NdIndexOffsetHelper<IDX, 5> index_helper, IDX elem_num, const T* src, T* dest,
-    int32_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
+    int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
     const int32_t n_batch, const int32_t n_channel, 
     const int32_t x_time, const int32_t x_height, const int32_t x_width, 
     const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w,
@@ -358,7 +358,7 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute(
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper<IDX, 5> index_helper,
                                              const IDX elem_num, const T* src, T* dest,
-                                             const int32_t* indice_ptr, const int32_t n_batch,
+                                             const int64_t* indice_ptr, const int32_t n_batch,
                                              const int32_t n_channel, const int32_t src_time,
                                              const int32_t src_height, const int32_t src_width,
                                              const int32_t dst_time, const int32_t dst_height,
diff --git a/oneflow/user/ops/pooling_op.cpp b/oneflow/user/ops/pooling_op.cpp
index 31289e5da9f..ffb7d85e1ba 100644
--- a/oneflow/user/ops/pooling_op.cpp
+++ b/oneflow/user/ops/pooling_op.cpp
@@ -56,7 +56,7 @@ TensorDescInferFn MaxPoolMakeForwardTensorDescInferFn(const int32_t dim) {
     *indice_desc = *ctx->OutputTensorDesc("y", 0);
     *indice_desc->mut_shape() = *y_desc->mut_shape();
     DataType* dtype = indice_desc->mut_data_type();
-    *dtype = kInt32;
+    *dtype = kInt64;
     return Maybe<void>::Ok();
   };
 }

From 1d8843f6da2d1822fabc7f6f63bccb1e785e9c75 Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Wed, 9 Feb 2022 09:08:50 +0800
Subject: [PATCH 14/17] fix maxpool1d 3d

---
 oneflow/user/kernels/pooling_kernel.cpp    | 26 +++++++++++++---------
 oneflow/user/kernels/pooling_kernel.cu     |  8 +++----
 oneflow/user/kernels/pooling_kernel_util.h | 22 +++++++++---------
 3 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index 4220bd26fbf..41236654dcc 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -98,7 +98,7 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<IDX, 4>& index_helpe
 template<typename T, typename IDX>
 struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   static void Maxpool1dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                               const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     Maxpool1dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
@@ -166,7 +166,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   }
 
   static void Maxpool3dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                               const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     Maxpool3dForwardCompute<T, IDX>(
@@ -219,15 +219,15 @@ class MaxPool1dKernel final : public user_op::OpKernel {
     T* dest = y->mut_dptr<T>();
     int64_t* indice_ptr = indice->mut_dptr<int64_t>();
 
-    DimVector y_vector;
-    y->shape().ToDimVector(&y_vector);
-
+    DimVector y_vector(2);
+    y_vector.at(0) = y->shape().At(0) * y->shape().At(1); 
+    y_vector.at(1) = y->shape().At(2); 
     if(elem_num < GetMaxVal<int32_t>()){
-      NdIndexOffsetHelper<int32_t, 3> index_helper(y_vector.data());
+      NdIndexOffsetHelper<int32_t, 2> index_helper(y_vector.data());
       PoolingKernelUtil<device_type, T, int32_t>::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src,
                                                         dest, indice_ptr, params_3d);
     }else{
-      NdIndexOffsetHelper<int64_t, 3> index_helper(y_vector.data());
+      NdIndexOffsetHelper<int64_t, 2> index_helper(y_vector.data());
       PoolingKernelUtil<device_type, T, int64_t>::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src,
                                                         dest, indice_ptr, params_3d);
       }
@@ -427,14 +427,18 @@ class MaxPool3dKernel final : public user_op::OpKernel {
     T* dest = y->mut_dptr<T>();
     int64_t* indice_ptr = indice->mut_dptr<int64_t>();
 
-    DimVector y_vector;
-    y->shape().ToDimVector(&y_vector);
+    DimVector y_vector(4);
+    y_vector.at(0) = y->shape().At(0) * y->shape().At(1); 
+    y_vector.at(1) = y->shape().At(2); 
+    y_vector.at(2) = y->shape().At(3); 
+    y_vector.at(3) = y->shape().At(4); 
+
     if(elem_num < GetMaxVal<int32_t>()){
-      NdIndexOffsetHelper<int32_t, 5> index_helper(y_vector.data());
+      NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
       PoolingKernelUtil<device_type, T, int32_t>::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src,
                                                         dest, indice_ptr, params_3d);
     }else{
-      NdIndexOffsetHelper<int64_t, 5> index_helper(y_vector.data());
+      NdIndexOffsetHelper<int64_t, 4> index_helper(y_vector.data());
       PoolingKernelUtil<device_type, T, int64_t>::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src,
                                                         dest, indice_ptr, params_3d);
     }
diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu
index 8ac9bfaacfa..ef3b8228328 100644
--- a/oneflow/user/kernels/pooling_kernel.cu
+++ b/oneflow/user/kernels/pooling_kernel.cu
@@ -85,7 +85,7 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast(
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper<IDX, 3> index_helper,
+    void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper,
                                 IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                 int32_t padding_l, int64_t n_batch, int64_t n_channel,
                                 int64_t x_length, int32_t kernel_size_l,
@@ -126,7 +126,7 @@ __launch_bounds__(kBlockSize) __global__
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper<IDX, 5> index_helper,
+    void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                 IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                 int32_t padding_t, int32_t padding_h, int32_t padding_w,
                                 int64_t n_batch, int64_t n_channel, int64_t x_time,
@@ -192,7 +192,7 @@ __launch_bounds__(kBlockSize) __global__
 template<typename T, typename IDX>
 struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   static void Maxpool1dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                               const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
@@ -266,7 +266,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   }
 
   static void Maxpool3dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                               const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 2e43d1ff033..8a7cfd557a2 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -103,7 +103,7 @@ class MaxPoolingParams3D {
 template<DeviceType device_type, typename T, typename IDX>
 struct PoolingKernelUtil {
   static void Maxpool1dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                               const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
@@ -135,7 +135,7 @@ struct PoolingKernelUtil {
                                      const MaxPoolingParams3D& params_3d);
 
   static void Maxpool3dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                               const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
@@ -146,18 +146,17 @@ struct PoolingKernelUtil {
 };
 
 template<typename T, typename IDX>
-OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
+OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 2> index_helper,
                                             IDX elem_num, const T* src, T* dest,
                                             int64_t* indice_ptr, const int32_t padding_l,
                                             const int32_t n_batch, const int32_t n_channel,
                                             const int32_t x_length, const int32_t kernel_size_l, 
                                             const int32_t stride_l, const int32_t dilation_l) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    IDX n, c, l;
-    index_helper.OffsetToNdIndex(num, n, c, l);
+    IDX n_c, l;
+    index_helper.OffsetToNdIndex(num, n_c, l);
 
-    // n, c, l->index = n*c*l + c* l
-    const IDX start_idx = (n * n_channel + c) * x_length;
+    const IDX start_idx = n_c * x_length;
     IDX lstart = l * stride_l - padding_l;
     const IDX lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length
                              ? (lstart + (kernel_size_l - 1) * dilation_l + 1)
@@ -302,7 +301,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(
 
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool3dForwardCompute(
-    const NdIndexOffsetHelper<IDX, 5> index_helper, IDX elem_num, const T* src, T* dest,
+    const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num, const T* src, T* dest,
     int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
     const int32_t n_batch, const int32_t n_channel, 
     const int32_t x_time, const int32_t x_height, const int32_t x_width, 
@@ -310,11 +309,10 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute(
     const int32_t stride_t, const int32_t stride_h, const int32_t stride_w,
     const int32_t dilation_t, const int32_t dilation_h, const int32_t dilation_w) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    IDX n, c, t, h, w;
-    index_helper.OffsetToNdIndex(num, n, c, t, h, w);
+    IDX n_c, t, h, w;
+    index_helper.OffsetToNdIndex(num, n_c, t, h, w);
 
-    IDX xstart = n * n_channel * x_time * x_width * x_height;
-    IDX start_idx = xstart + c * x_time * x_width * x_height;
+    IDX start_idx = n_c * x_time * x_width * x_height;
     IDX tstart = t * stride_t - padding_t;
     IDX hstart = h * stride_h - padding_h;
     IDX wstart = w * stride_w - padding_w;

From 36a6fc85c1163e1cf2347aab597f44276e78663b Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Wed, 9 Feb 2022 10:37:18 +0800
Subject: [PATCH 15/17] optimize backward

---
 oneflow/user/kernels/pooling_kernel.cpp    | 65 ++++++++++---------
 oneflow/user/kernels/pooling_kernel.cu     | 58 ++++++++---------
 oneflow/user/kernels/pooling_kernel_util.h | 73 ++++++++++------------
 3 files changed, 99 insertions(+), 97 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index 41236654dcc..132ce9f41e4 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -109,7 +109,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   }
 
   static void Maxpool1dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                                const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
@@ -131,7 +131,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   }
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
-                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                       const IDX elem_num, const T* src, T* dest,
                                       const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
@@ -181,7 +181,7 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
   }
 
   static void Maxpool3dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 5> index_helper,
+                                const NdIndexOffsetHelper<IDX, 4> index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool3dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
@@ -260,17 +260,18 @@ class MaxPool1dGradKernel final : public user_op::OpKernel {
     const T* src = dy->dptr<T>();
     const int64_t* indice_ptr = indice->dptr<int64_t>();
     T* dest = dx->mut_dptr<T>();
-    DimVector dy_vector;
-    dy->shape().ToDimVector(&dy_vector);
+    DimVector dy_vector(2);
+    dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); 
+    dy_vector.at(1) = dy->shape().At(2); 
     size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type());
     Memset<device_type>(ctx->stream(), dest, 0, out_bytes_size);
 
     if(elem_num < GetMaxVal<int32_t>()){
-      NdIndexOffsetHelper<int32_t, 3> index_helper(dy_vector.data());
+      NdIndexOffsetHelper<int32_t, 2> index_helper(dy_vector.data());
       PoolingKernelUtil<device_type, T, int32_t>::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src,
                                                          dest, indice_ptr, params_3d);
     }else{
-      NdIndexOffsetHelper<int64_t, 3> index_helper(dy_vector.data());
+      NdIndexOffsetHelper<int64_t, 2> index_helper(dy_vector.data());
       PoolingKernelUtil<device_type, T, int64_t>::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src,
                                                          dest, indice_ptr, params_3d);
     }
@@ -367,35 +368,40 @@ class MaxPool2dGradKernel final : public user_op::OpKernel {
     const T* src = dy->dptr<T>();
     const int64_t* indice_ptr = indice->dptr<int64_t>();
     T* dest = dx->mut_dptr<T>();
-    DimVector dy_vector;
-    dy->shape().ToDimVector(&dy_vector);
 
     size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type());
     Memset<device_type>(ctx->stream(), dest, 0, out_bytes_size);
 
     const std::string& data_format = ctx->Attr<std::string>("data_format");
-    if(elem_num < GetMaxVal<int32_t>()){
-      NdIndexOffsetHelper<int32_t, 4> index_helper(dy_vector.data());
-      if (data_format == "channels_first") {
+      
+    if (data_format == "channels_first") {
+      DimVector dy_vector(3);
+      dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); 
+      dy_vector.at(1) = dy->shape().At(2);
+      dy_vector.at(2) = dy->shape().At(3);
+      if(elem_num < GetMaxVal<int32_t>()){
+        NdIndexOffsetHelper<int32_t, 3> index_helper(dy_vector.data());
         PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dBackwardCFirst(
             ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-      } else if (data_format == "channels_last") {
-        PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dBackwardCLast(
-            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
       } else {
-        UNIMPLEMENTED() << "Unsupported data_format";
-      }
-    }else{
-      NdIndexOffsetHelper<int64_t, 4> index_helper(dy_vector.data());
-      if (data_format == "channels_first") {
+        NdIndexOffsetHelper<int64_t, 3> index_helper(dy_vector.data());
         PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dBackwardCFirst(
             ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-      } else if (data_format == "channels_last") {
-        PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dBackwardCLast(
-            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      }
+    } else if (data_format == "channels_last") {
+      DimVector dy_vector;
+      dy->shape().ToDimVector(&dy_vector);
+      if(elem_num < GetMaxVal<int32_t>()){
+        NdIndexOffsetHelper<int32_t, 4> index_helper(dy_vector.data());
+        PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dBackwardCLast(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
       } else {
-        UNIMPLEMENTED() << "Unsupported data_format";
+        NdIndexOffsetHelper<int64_t, 4> index_helper(dy_vector.data());
+        PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dBackwardCLast(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
       }
+    } else {
+      UNIMPLEMENTED() << "Unsupported data_format";
     }
   };
 };
@@ -472,18 +478,21 @@ class MaxPool3dGradKernel final : public user_op::OpKernel {
     const int64_t* indice_ptr = indice->dptr<int64_t>();
     T* dest = dx->mut_dptr<T>();
 
-    DimVector dy_vector;
-    dy->shape().ToDimVector(&dy_vector);
+    DimVector dy_vector(4);
+    dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); 
+    dy_vector.at(1) = dy->shape().At(2);
+    dy_vector.at(2) = dy->shape().At(3); 
+    dy_vector.at(3) = dy->shape().At(4); 
 
     size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type());
     Memset<device_type>(ctx->stream(), dest, 0, out_bytes_size);
 
     if(elem_num < GetMaxVal<int32_t>()){
-      NdIndexOffsetHelper<int32_t, 5> index_helper(dy_vector.data());
+      NdIndexOffsetHelper<int32_t, 4> index_helper(dy_vector.data());
       PoolingKernelUtil<device_type, T, int32_t>::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src,
                                                          dest, indice_ptr, params_3d);
     }else {
-      NdIndexOffsetHelper<int64_t, 5> index_helper(dy_vector.data());
+      NdIndexOffsetHelper<int64_t, 4> index_helper(dy_vector.data());
       PoolingKernelUtil<device_type, T, int64_t>::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src,
                                                          dest, indice_ptr, params_3d);
     }
diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu
index ef3b8228328..097ff903ec3 100644
--- a/oneflow/user/kernels/pooling_kernel.cu
+++ b/oneflow/user/kernels/pooling_kernel.cu
@@ -87,8 +87,8 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper,
                                 IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
-                                int32_t padding_l, int64_t n_batch, int64_t n_channel,
-                                int64_t x_length, int32_t kernel_size_l,
+                                int32_t padding_l, int32_t n_batch, int32_t n_channel,
+                                int32_t x_length, int32_t kernel_size_l,
                                 int32_t stride_l, int32_t dilation_l) {
   Maxpool1dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch,
                              n_channel, x_length, kernel_size_l, stride_l, dilation_l);
@@ -113,9 +113,9 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                      IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
-                                     int32_t padding_h, int32_t padding_w, int64_t n_batch,
-                                     int64_t n_channel, int64_t x_height, int64_t x_width,
-                                     int64_t y_height, int64_t y_width, int32_t kernel_size_h,
+                                     int32_t padding_h, int32_t padding_w, int32_t n_batch,
+                                     int32_t n_channel, int32_t x_height, int32_t x_width,
+                                     int32_t y_height, int32_t y_width, int32_t kernel_size_h,
                                      int32_t kernel_size_w, int32_t stride_h, int32_t stride_w,
                                      int32_t dilation_h, int32_t dilation_w) {
   Maxpool2dForwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_h,
@@ -129,8 +129,8 @@ __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                 IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                 int32_t padding_t, int32_t padding_h, int32_t padding_w,
-                                int64_t n_batch, int64_t n_channel, int64_t x_time,
-                                int64_t x_height, int64_t x_width, 
+                                int32_t n_batch, int32_t n_channel, int32_t x_time,
+                                int32_t x_height, int32_t x_width, 
                                 int32_t kernel_size_t, int32_t kernel_size_h,
                                 int32_t kernel_size_w, int32_t stride_t, int32_t stride_h,
                                 int32_t stride_w, int32_t dilation_t, int32_t dilation_h,
@@ -143,23 +143,23 @@ __launch_bounds__(kBlockSize) __global__
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper<IDX, 3> index_helper,
+  void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper<IDX, 2> index_helper,
                                  const IDX elem_num, const T* src, T* dest,
-                                 const int64_t* indice_ptr, const int64_t n_batch,
-                                 const int64_t n_channel, const int64_t src_length,
-                                 const int64_t dst_length) {
+                                 const int64_t* indice_ptr, const int32_t n_batch,
+                                 const int32_t n_channel, const int32_t src_length,
+                                 const int32_t dst_length) {
   Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
                               src_length, dst_length);
 };
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper<IDX, 4> index_helper,
+    void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper<IDX, 3> index_helper,
                                        const IDX elem_num, const T* src, T* dest,
-                                       const int64_t* indice_ptr, const int64_t n_batch,
-                                       const int64_t n_channel, const int64_t src_height,
-                                       const int64_t src_width, const int64_t dst_height,
-                                       const int64_t dst_width) {
+                                       const int64_t* indice_ptr, const int32_t n_batch,
+                                       const int32_t n_channel, const int32_t src_height,
+                                       const int32_t src_width, const int32_t dst_height,
+                                       const int32_t dst_width) {
   Maxpool2dBackwardComputeCFirst<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
                                     n_channel, src_height, src_width, dst_height, dst_width);
 };
@@ -168,23 +168,23 @@ template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
     void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                       const IDX elem_num, const T* src, T* dest,
-                                      const int64_t* indice_ptr, const int64_t n_batch,
-                                      const int64_t n_channel, const int64_t src_height,
-                                      const int64_t src_width, const int64_t dst_height,
-                                      const int64_t dst_width) {
+                                      const int64_t* indice_ptr, const int32_t n_batch,
+                                      const int32_t n_channel, const int32_t src_height,
+                                      const int32_t src_width, const int32_t dst_height,
+                                      const int32_t dst_width) {
   Maxpool2dBackwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
                                    n_channel, src_height, src_width, dst_height, dst_width);
 };
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper<IDX, 5> index_helper,
+    void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                  const IDX elem_num, const T* src, T* dest,
-                                 const int64_t* indice_ptr, const int64_t n_batch,
-                                 const int64_t n_channel, const int64_t src_time,
-                                 const int64_t src_height, const int64_t src_width,
-                                 const int64_t dst_time, const int64_t dst_height,
-                                 const int64_t dst_width) {
+                                 const int64_t* indice_ptr, const int32_t n_batch,
+                                 const int32_t n_channel, const int32_t src_time,
+                                 const int32_t src_height, const int32_t src_width,
+                                 const int32_t dst_time, const int32_t dst_height,
+                                 const int32_t dst_width) {
   Maxpool3dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
                               src_time, src_height, src_width, dst_time, dst_height, dst_width);
 };
@@ -204,7 +204,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   }
 
   static void Maxpool1dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                                const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
@@ -228,7 +228,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   }
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
-                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                       const IDX elem_num, const T* src, T* dest,
                                       const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
@@ -283,7 +283,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
   }
 
   static void Maxpool3dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                                const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 8a7cfd557a2..dc99137d439 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -108,7 +108,7 @@ struct PoolingKernelUtil {
                                const MaxPoolingParams3D& params_3d);
 
   static void Maxpool1dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 3>& index_helper,
+                                const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
@@ -118,7 +118,7 @@ struct PoolingKernelUtil {
                                      int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
-                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
+                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
                                       const IDX elem_num, const T* src, T* dest,
                                       const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d);
@@ -140,7 +140,7 @@ struct PoolingKernelUtil {
                                const MaxPoolingParams3D& params_3d);
 
   static void Maxpool3dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 5>& index_helper,
+                                const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 };
@@ -156,7 +156,6 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 2> in
     IDX n_c, l;
     index_helper.OffsetToNdIndex(num, n_c, l);
 
-    const IDX start_idx = n_c * x_length;
     IDX lstart = l * stride_l - padding_l;
     const IDX lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length
                              ? (lstart + (kernel_size_l - 1) * dilation_l + 1)
@@ -166,37 +165,35 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 2> in
 
     /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
     IDX max_index = lstart;
-    IDX src_idx = 0;
 
     /* equal to -std::numeric_limits<T>::infinity(); */
     T max_value = detail::numeric_limits<T>::lower_bound();
-
+    const T* data = src + n_c * x_length;
     for (IDX idx = lstart; idx < lend; idx += dilation_l) {
-      const IDX search_idx = start_idx + idx;
-      T val = src[search_idx];
+      const IDX window_idx = idx;
+      T val = data[window_idx];
       if (val > max_value || detail::numerics<T>::isnan(val)) {
         max_value = val;
         max_index = idx;
-        src_idx = search_idx;
       }
     }
-    dest[num] = src[src_idx];
+    dest[num] = max_value;
     indice_ptr[num] = max_index;
   }
 }
 
 template<typename T, typename IDX>
-OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 3> index_helper,
+OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper<IDX, 2> index_helper,
                                              const IDX elem_num, const T* src, T* dest,
                                              const int64_t* indice_ptr, const int32_t n_batch,
                                              const int32_t n_channel, const int32_t src_length,
                                              const int32_t dst_length) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    IDX n, c, l;
-    index_helper.OffsetToNdIndex(num, n, c, l);
+    IDX n_c, l;
+    index_helper.OffsetToNdIndex(num, n_c, l);
 
-    const IDX src_start = (n * n_channel + c) * src_length;
-    const IDX dst_start = (n * n_channel + c) * dst_length;
+    const IDX src_start = n_c * src_length;
+    const IDX dst_start = n_c * dst_length;
     const IDX index = src_start + l;
     const IDX max_index = dst_start + indice_ptr[index];
     if (max_index != -1) {
@@ -217,8 +214,8 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n_c, h, w;
     index_helper.OffsetToNdIndex(num, n_c, h, w);
-    int32_t hstart = h * stride_h - padding_h;
-    int32_t wstart = w * stride_w - padding_w;
+    IDX hstart = h * stride_h - padding_h;
+    IDX wstart = w * stride_w - padding_w;
     const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
                              ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
                              : x_height;
@@ -230,12 +227,12 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     /* equal to -std::numeric_limits<T>::infinity(); */
     T max_value = detail::numeric_limits<T>::lower_bound();
     /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */
-    int32_t max_index = hstart * x_width + wstart;
-    const T* btm_data = src + n_c * x_width * x_height;
-    for (int32_t i = hstart; i < hend; i += dilation_h) {
-      for (int32_t j = wstart; j < wend; j += dilation_w) {
-        const int32_t window_idx = i * x_width + j;
-        T val = btm_data[window_idx]; 
+    IDX max_index = hstart * x_width + wstart;
+    const T* data = src + n_c * x_width * x_height;
+    for (IDX i = hstart; i < hend; i += dilation_h) {
+      for (IDX j = wstart; j < wend; j += dilation_w) {
+        const IDX window_idx = i * x_width + j;
+        T val = data[window_idx]; 
         /* NOTE:
         std::isnan(val) only supports a few data types, see:
         https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile,
@@ -259,16 +256,16 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
 
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
-    const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
+    const NdIndexOffsetHelper<IDX, 3> index_helper, const IDX elem_num, const T* src,
     T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
     const int32_t src_height, const int32_t src_width, const int32_t dst_height,
     const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    IDX n, c, h, w;
-    index_helper.OffsetToNdIndex(num, n, c, h, w);
+    IDX n_c, h, w;
+    index_helper.OffsetToNdIndex(num, n_c, h, w);
 
-    const IDX src_start = (n * n_channel + c) * src_height * src_width;
-    const IDX dst_start = (n * n_channel + c) * dst_height * dst_width;
+    const IDX src_start = n_c * src_height * src_width;
+    const IDX dst_start = n_c * dst_height * dst_width;
     const IDX index = src_start + h * src_width + w;
 
     const IDX max_index = dst_start + indice_ptr[index];
@@ -312,7 +309,6 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute(
     IDX n_c, t, h, w;
     index_helper.OffsetToNdIndex(num, n_c, t, h, w);
 
-    IDX start_idx = n_c * x_time * x_width * x_height;
     IDX tstart = t * stride_t - padding_t;
     IDX hstart = h * stride_h - padding_h;
     IDX wstart = w * stride_w - padding_w;
@@ -329,24 +325,21 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute(
     while (wstart < 0) { wstart += dilation_w; }
 
     IDX max_index = tstart * x_height * x_width + hstart * x_width + wstart;
-    IDX src_idx = 0;
-
+    const T* data = src + n_c * x_time * x_width * x_height;
     T max_value = detail::numeric_limits<T>::lower_bound();
     for (IDX zi = tstart; zi < tend; zi += dilation_t) {
       for (IDX i = hstart; i < hend; i += dilation_h) {
         for (IDX j = wstart; j < wend; j += dilation_w) {
           const IDX window_idx = zi * x_height * x_width + i * x_width + j;
-          const IDX search_idx = start_idx + window_idx;
-          T val = src[search_idx];
+          T val = data[window_idx];
           if (val > max_value || detail::numerics<T>::isnan(val)) {
             max_value = val;
             max_index = window_idx;
-            src_idx = search_idx;
           }
         }
       }
       /* set output to local max */
-      dest[num] = src[src_idx];
+      dest[num] = max_value;
       /* store location of max */
       indice_ptr[num] = max_index;
     }
@@ -354,7 +347,7 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute(
 }
 
 template<typename T, typename IDX>
-OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper<IDX, 5> index_helper,
+OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper<IDX, 4> index_helper,
                                              const IDX elem_num, const T* src, T* dest,
                                              const int64_t* indice_ptr, const int32_t n_batch,
                                              const int32_t n_channel, const int32_t src_time,
@@ -362,11 +355,11 @@ OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper<IDX, 5> i
                                              const int32_t dst_time, const int32_t dst_height,
                                              const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
-    IDX n, c, t, h, w;
-    index_helper.OffsetToNdIndex(num, n, c, t, h, w);
+    IDX n_c, t, h, w;
+    index_helper.OffsetToNdIndex(num, n_c, t, h, w);
 
-    const IDX src_start = (n * n_channel + c) * src_time * src_height * src_width;
-    const IDX dst_start = (n * n_channel + c) * dst_time * dst_height * dst_width;
+    const IDX src_start = n_c * src_time * src_height * src_width;
+    const IDX dst_start = n_c * dst_time * dst_height * dst_width;
     const IDX index = src_start + t * src_height * src_width + h * src_width + w;
     const IDX max_index = dst_start + indice_ptr[index];
 

From 97cbec0c2e45fbdaf2f7b4ba50779c5ca6e02ffc Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Wed, 9 Feb 2022 11:31:55 +0800
Subject: [PATCH 16/17] fix all optimize. TODO: NHWC

---
 oneflow/user/kernels/pooling_kernel.cpp    | 193 ++++++++++-----------
 oneflow/user/kernels/pooling_kernel.cu     | 163 ++++++++---------
 oneflow/user/kernels/pooling_kernel_util.h |  79 ++++-----
 3 files changed, 204 insertions(+), 231 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index 132ce9f41e4..12d979ec536 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -43,8 +43,8 @@ std::shared_ptr<PoolingOpKernelCache> CreateOpKernelCache(user_op::KernelCacheCo
 namespace {
 
 template<typename T, typename IDX>
-void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<IDX, 4>& index_helper,
-                                  IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<IDX, 4>& index_helper, IDX elem_num,
+                                  const T* src, T* dest, int64_t* indice_ptr,
                                   const int32_t padding_h, const int32_t padding_w,
                                   const int32_t n_batch, const int32_t n_channel,
                                   const int32_t x_height, const int32_t x_width,
@@ -61,11 +61,11 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<IDX, 4>& index_helpe
     IDX hstart = h * stride_h - padding_h;
     IDX wstart = w * stride_w - padding_w;
     const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
-                             ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
-                             : x_height;
+                         ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
+                         : x_height;
     const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
-                             ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
-                             : x_width;
+                         ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
+                         : x_width;
 
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
@@ -97,37 +97,33 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper<IDX, 4>& index_helpe
 
 template<typename T, typename IDX>
 struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
-  static void Maxpool1dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 2>& index_helper,
+  static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
-    Maxpool1dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
-                               params_3d.padding()[2], params_3d.num_batch(),
-                               params_3d.num_channel(), params_3d.GetXShape5D().At(4),
-                               params_3d.pooling_size_3d()[2],
-                               params_3d.stride_3d()[2], params_3d.dilation_3d()[2]);
+    Maxpool1dForwardCompute<T, IDX>(
+        index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2],
+        params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4),
+        params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]);
   }
 
-  static void Maxpool1dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 2>& index_helper,
+  static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
-                                params_3d.num_batch(), params_3d.num_channel(),
-                                params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4));
+                                     params_3d.num_batch(), params_3d.num_channel(),
+                                     params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4));
   }
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
-                                     const IDX elem_num, const T* src, T* dest,
-                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                     const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                     const MaxPoolingParams3D& params_3d) {
     Maxpool2dForwardComputeCFirst<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
-        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), 
-        params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], 
-        params_3d.stride_3d()[1], params_3d.stride_3d()[2],
-        params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
+        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
+        params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[1],
+        params_3d.stride_3d()[2], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
   }
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
@@ -135,16 +131,16 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
                                       const IDX elem_num, const T* src, T* dest,
                                       const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
-    Maxpool2dBackwardComputeCFirst<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
-                                      params_3d.num_batch(), params_3d.num_channel(),
-                                      params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
-                                      params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
+    Maxpool2dBackwardComputeCFirst<T, IDX>(
+        index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
+        params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
+        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
   }
 
   static void Maxpool2dForwardCLast(ep::Stream* stream,
                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
-                                    const IDX elem_num, const T* src, T* dest,
-                                    int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                    const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                    const MaxPoolingParams3D& params_3d) {
     Maxpool2dForwardComputeCLast<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
@@ -159,36 +155,33 @@ struct PoolingKernelUtil<DeviceType::kCPU, T, IDX> {
                                      const IDX elem_num, const T* src, T* dest,
                                      const int64_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d) {
-    Maxpool2dBackwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
-                                     params_3d.num_batch(), params_3d.num_channel(),
-                                     params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
-                                     params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
+    Maxpool2dBackwardComputeCLast<T, IDX>(
+        index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
+        params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
+        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
   }
 
-  static void Maxpool3dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 4>& index_helper,
+  static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     Maxpool3dForwardCompute<T, IDX>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0],
         params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(),
-        params_3d.num_channel(), 
-        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), 
-        params_3d.pooling_size_3d()[0], params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], 
-        params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], 
-        params_3d.dilation_3d()[0],
+        params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
+        params_3d.GetXShape5D().At(4), params_3d.pooling_size_3d()[0],
+        params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[0],
+        params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[0],
         params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
   }
 
-  static void Maxpool3dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 4> index_helper,
+  static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4> index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     Maxpool3dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr,
-                                params_3d.num_batch(), params_3d.num_channel(),
-                                params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3),
-                                params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2),
-                                params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
+                                     params_3d.num_batch(), params_3d.num_channel(),
+                                     params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3),
+                                     params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2),
+                                     params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
   }
 };
 
@@ -220,18 +213,18 @@ class MaxPool1dKernel final : public user_op::OpKernel {
     int64_t* indice_ptr = indice->mut_dptr<int64_t>();
 
     DimVector y_vector(2);
-    y_vector.at(0) = y->shape().At(0) * y->shape().At(1); 
-    y_vector.at(1) = y->shape().At(2); 
-    if(elem_num < GetMaxVal<int32_t>()){
+    y_vector.at(0) = y->shape().At(0) * y->shape().At(1);
+    y_vector.at(1) = y->shape().At(2);
+    if (elem_num < GetMaxVal<int32_t>()) {
       NdIndexOffsetHelper<int32_t, 2> index_helper(y_vector.data());
-      PoolingKernelUtil<device_type, T, int32_t>::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src,
-                                                        dest, indice_ptr, params_3d);
-    }else{
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool1dForward(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+    } else {
       NdIndexOffsetHelper<int64_t, 2> index_helper(y_vector.data());
-      PoolingKernelUtil<device_type, T, int64_t>::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src,
-                                                        dest, indice_ptr, params_3d);
-      }
+      PoolingKernelUtil<device_type, T, int64_t>::Maxpool1dForward(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
     }
+  }
 };
 
 template<DeviceType device_type, typename T>
@@ -261,21 +254,20 @@ class MaxPool1dGradKernel final : public user_op::OpKernel {
     const int64_t* indice_ptr = indice->dptr<int64_t>();
     T* dest = dx->mut_dptr<T>();
     DimVector dy_vector(2);
-    dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); 
-    dy_vector.at(1) = dy->shape().At(2); 
+    dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1);
+    dy_vector.at(1) = dy->shape().At(2);
     size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type());
     Memset<device_type>(ctx->stream(), dest, 0, out_bytes_size);
 
-    if(elem_num < GetMaxVal<int32_t>()){
+    if (elem_num < GetMaxVal<int32_t>()) {
       NdIndexOffsetHelper<int32_t, 2> index_helper(dy_vector.data());
-      PoolingKernelUtil<device_type, T, int32_t>::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src,
-                                                         dest, indice_ptr, params_3d);
-    }else{
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool1dBackward(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+    } else {
       NdIndexOffsetHelper<int64_t, 2> index_helper(dy_vector.data());
-      PoolingKernelUtil<device_type, T, int64_t>::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src,
-                                                         dest, indice_ptr, params_3d);
+      PoolingKernelUtil<device_type, T, int64_t>::Maxpool1dBackward(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
     }
-
   };
 };
 
@@ -311,30 +303,29 @@ class MaxPool2dKernel final : public user_op::OpKernel {
     const std::string& data_format = ctx->Attr<std::string>("data_format");
     if (data_format == "channels_first") {
       DimVector y_vector(3);
-      y_vector.at(0) = y->shape().At(0) * y->shape().At(1); 
-      y_vector.at(1) = y->shape().At(2); 
-      y_vector.at(2) = y->shape().At(3); 
-      if(elem_num < GetMaxVal<int32_t>()){
+      y_vector.at(0) = y->shape().At(0) * y->shape().At(1);
+      y_vector.at(1) = y->shape().At(2);
+      y_vector.at(2) = y->shape().At(3);
+      if (elem_num < GetMaxVal<int32_t>()) {
         NdIndexOffsetHelper<int32_t, 3> index_helper(y_vector.data());
         PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCFirst(
-          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-      }
-      else{
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      } else {
         NdIndexOffsetHelper<int64_t, 3> index_helper(y_vector.data());
         PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCFirst(
-          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
       }
     } else if (data_format == "channels_last") {
       DimVector y_vector;
       y->shape().ToDimVector(&y_vector);
-      if(elem_num < GetMaxVal<int32_t>()){
+      if (elem_num < GetMaxVal<int32_t>()) {
         NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
         PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dForwardCLast(
-          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
-      }else{
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+      } else {
         NdIndexOffsetHelper<int64_t, 4> index_helper(y_vector.data());
         PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dForwardCLast(
-          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
       }
     } else {
       UNIMPLEMENTED() << "Unsupported data_format";
@@ -373,13 +364,13 @@ class MaxPool2dGradKernel final : public user_op::OpKernel {
     Memset<device_type>(ctx->stream(), dest, 0, out_bytes_size);
 
     const std::string& data_format = ctx->Attr<std::string>("data_format");
-      
+
     if (data_format == "channels_first") {
       DimVector dy_vector(3);
-      dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); 
+      dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1);
       dy_vector.at(1) = dy->shape().At(2);
       dy_vector.at(2) = dy->shape().At(3);
-      if(elem_num < GetMaxVal<int32_t>()){
+      if (elem_num < GetMaxVal<int32_t>()) {
         NdIndexOffsetHelper<int32_t, 3> index_helper(dy_vector.data());
         PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dBackwardCFirst(
             ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
@@ -391,14 +382,14 @@ class MaxPool2dGradKernel final : public user_op::OpKernel {
     } else if (data_format == "channels_last") {
       DimVector dy_vector;
       dy->shape().ToDimVector(&dy_vector);
-      if(elem_num < GetMaxVal<int32_t>()){
+      if (elem_num < GetMaxVal<int32_t>()) {
         NdIndexOffsetHelper<int32_t, 4> index_helper(dy_vector.data());
         PoolingKernelUtil<device_type, T, int32_t>::Maxpool2dBackwardCLast(
-          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
       } else {
         NdIndexOffsetHelper<int64_t, 4> index_helper(dy_vector.data());
         PoolingKernelUtil<device_type, T, int64_t>::Maxpool2dBackwardCLast(
-          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+            ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
       }
     } else {
       UNIMPLEMENTED() << "Unsupported data_format";
@@ -434,19 +425,19 @@ class MaxPool3dKernel final : public user_op::OpKernel {
     int64_t* indice_ptr = indice->mut_dptr<int64_t>();
 
     DimVector y_vector(4);
-    y_vector.at(0) = y->shape().At(0) * y->shape().At(1); 
-    y_vector.at(1) = y->shape().At(2); 
-    y_vector.at(2) = y->shape().At(3); 
-    y_vector.at(3) = y->shape().At(4); 
+    y_vector.at(0) = y->shape().At(0) * y->shape().At(1);
+    y_vector.at(1) = y->shape().At(2);
+    y_vector.at(2) = y->shape().At(3);
+    y_vector.at(3) = y->shape().At(4);
 
-    if(elem_num < GetMaxVal<int32_t>()){
+    if (elem_num < GetMaxVal<int32_t>()) {
       NdIndexOffsetHelper<int32_t, 4> index_helper(y_vector.data());
-      PoolingKernelUtil<device_type, T, int32_t>::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src,
-                                                        dest, indice_ptr, params_3d);
-    }else{
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool3dForward(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+    } else {
       NdIndexOffsetHelper<int64_t, 4> index_helper(y_vector.data());
-      PoolingKernelUtil<device_type, T, int64_t>::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src,
-                                                        dest, indice_ptr, params_3d);
+      PoolingKernelUtil<device_type, T, int64_t>::Maxpool3dForward(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
     }
   };
 };
@@ -479,22 +470,22 @@ class MaxPool3dGradKernel final : public user_op::OpKernel {
     T* dest = dx->mut_dptr<T>();
 
     DimVector dy_vector(4);
-    dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); 
+    dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1);
     dy_vector.at(1) = dy->shape().At(2);
-    dy_vector.at(2) = dy->shape().At(3); 
-    dy_vector.at(3) = dy->shape().At(4); 
+    dy_vector.at(2) = dy->shape().At(3);
+    dy_vector.at(3) = dy->shape().At(4);
 
     size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type());
     Memset<device_type>(ctx->stream(), dest, 0, out_bytes_size);
 
-    if(elem_num < GetMaxVal<int32_t>()){
+    if (elem_num < GetMaxVal<int32_t>()) {
       NdIndexOffsetHelper<int32_t, 4> index_helper(dy_vector.data());
-      PoolingKernelUtil<device_type, T, int32_t>::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src,
-                                                         dest, indice_ptr, params_3d);
-    }else {
+      PoolingKernelUtil<device_type, T, int32_t>::Maxpool3dBackward(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
+    } else {
       NdIndexOffsetHelper<int64_t, 4> index_helper(dy_vector.data());
-      PoolingKernelUtil<device_type, T, int64_t>::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src,
-                                                         dest, indice_ptr, params_3d);
+      PoolingKernelUtil<device_type, T, int64_t>::Maxpool3dBackward(
+          ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d);
     }
   };
 };
diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu
index 097ff903ec3..dc47abd2744 100644
--- a/oneflow/user/kernels/pooling_kernel.cu
+++ b/oneflow/user/kernels/pooling_kernel.cu
@@ -49,11 +49,11 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast(
     IDX hstart = h * stride_h - padding_h;
     IDX wstart = w * stride_w - padding_w;
     const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
-                             ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
-                             : x_height;
+                         ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
+                         : x_height;
     const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
-                             ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
-                             : x_width;
+                         ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
+                         : x_width;
 
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
@@ -85,71 +85,66 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast(
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper,
-                                IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
-                                int32_t padding_l, int32_t n_batch, int32_t n_channel,
-                                int32_t x_length, int32_t kernel_size_l,
-                                int32_t stride_l, int32_t dilation_l) {
+    void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper<IDX, 2> index_helper, IDX elem_num,
+                                const T* src, T* dest, int64_t* indice_ptr, int32_t padding_l,
+                                int32_t n_batch, int32_t n_channel, int32_t x_length,
+                                int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) {
   Maxpool1dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch,
-                             n_channel, x_length, kernel_size_l, stride_l, dilation_l);
+                                  n_channel, x_length, kernel_size_l, stride_l, dilation_l);
 };
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper<IDX, 3> index_helper,
-                                      IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
-                                      int32_t padding_h, int32_t padding_w, int32_t n_batch,
-                                      int32_t n_channel, int32_t x_height, int32_t x_width,
-                                      int32_t kernel_size_h, int32_t kernel_size_w, 
-                                      int32_t stride_h, int32_t stride_w,
+    void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num,
+                                      const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h,
+                                      int32_t padding_w, int32_t n_batch, int32_t n_channel,
+                                      int32_t x_height, int32_t x_width, int32_t kernel_size_h,
+                                      int32_t kernel_size_w, int32_t stride_h, int32_t stride_w,
                                       int32_t dilation_h, int32_t dilation_w) {
-  Maxpool2dForwardComputeCFirst<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_h,
-                                   padding_w, n_batch, n_channel, x_height, x_width, 
-                                   kernel_size_h, kernel_size_w, stride_h, stride_w,
-                                   dilation_h, dilation_w);
+  Maxpool2dForwardComputeCFirst<T, IDX>(
+      index_helper, elem_num, src, dest, indice_ptr, padding_h, padding_w, n_batch, n_channel,
+      x_height, x_width, kernel_size_h, kernel_size_w, stride_h, stride_w, dilation_h, dilation_w);
 };
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
-                                     IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
-                                     int32_t padding_h, int32_t padding_w, int32_t n_batch,
-                                     int32_t n_channel, int32_t x_height, int32_t x_width,
-                                     int32_t y_height, int32_t y_width, int32_t kernel_size_h,
-                                     int32_t kernel_size_w, int32_t stride_h, int32_t stride_w,
-                                     int32_t dilation_h, int32_t dilation_w) {
+    void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
+                                     const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h,
+                                     int32_t padding_w, int32_t n_batch, int32_t n_channel,
+                                     int32_t x_height, int32_t x_width, int32_t y_height,
+                                     int32_t y_width, int32_t kernel_size_h, int32_t kernel_size_w,
+                                     int32_t stride_h, int32_t stride_w, int32_t dilation_h,
+                                     int32_t dilation_w) {
   Maxpool2dForwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_h,
-                                  padding_w, n_batch, n_channel, x_height, x_width, y_height,
-                                  y_width, kernel_size_h, kernel_size_w, stride_h, stride_w,
-                                  dilation_h, dilation_w);
+                                       padding_w, n_batch, n_channel, x_height, x_width, y_height,
+                                       y_width, kernel_size_h, kernel_size_w, stride_h, stride_w,
+                                       dilation_h, dilation_w);
 };
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper,
-                                IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
-                                int32_t padding_t, int32_t padding_h, int32_t padding_w,
-                                int32_t n_batch, int32_t n_channel, int32_t x_time,
-                                int32_t x_height, int32_t x_width, 
-                                int32_t kernel_size_t, int32_t kernel_size_h,
+    void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num,
+                                const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t,
+                                int32_t padding_h, int32_t padding_w, int32_t n_batch,
+                                int32_t n_channel, int32_t x_time, int32_t x_height,
+                                int32_t x_width, int32_t kernel_size_t, int32_t kernel_size_h,
                                 int32_t kernel_size_w, int32_t stride_t, int32_t stride_h,
                                 int32_t stride_w, int32_t dilation_t, int32_t dilation_h,
                                 int32_t dilation_w) {
-  Maxpool3dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_t, padding_h,
-                             padding_w, n_batch, n_channel, x_time, x_height, x_width, 
-                             kernel_size_t, kernel_size_h, kernel_size_w,
-                             stride_t, stride_h, stride_w, dilation_t, dilation_h, dilation_w);
+  Maxpool3dForwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, padding_t,
+                                  padding_h, padding_w, n_batch, n_channel, x_time, x_height,
+                                  x_width, kernel_size_t, kernel_size_h, kernel_size_w, stride_t,
+                                  stride_h, stride_w, dilation_t, dilation_h, dilation_w);
 };
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-  void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper<IDX, 2> index_helper,
-                                 const IDX elem_num, const T* src, T* dest,
-                                 const int64_t* indice_ptr, const int32_t n_batch,
-                                 const int32_t n_channel, const int32_t src_length,
-                                 const int32_t dst_length) {
-  Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
-                              src_length, dst_length);
+    void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper<IDX, 2> index_helper, const IDX elem_num,
+                                 const T* src, T* dest, const int64_t* indice_ptr,
+                                 const int32_t n_batch, const int32_t n_channel,
+                                 const int32_t src_length, const int32_t dst_length) {
+  Maxpool1dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
+                                   n_channel, src_length, dst_length);
 };
 
 template<typename T, typename IDX>
@@ -161,7 +156,7 @@ __launch_bounds__(kBlockSize) __global__
                                        const int32_t src_width, const int32_t dst_height,
                                        const int32_t dst_width) {
   Maxpool2dBackwardComputeCFirst<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
-                                    n_channel, src_height, src_width, dst_height, dst_width);
+                                         n_channel, src_height, src_width, dst_height, dst_width);
 };
 
 template<typename T, typename IDX>
@@ -173,58 +168,54 @@ __launch_bounds__(kBlockSize) __global__
                                       const int32_t src_width, const int32_t dst_height,
                                       const int32_t dst_width) {
   Maxpool2dBackwardComputeCLast<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
-                                   n_channel, src_height, src_width, dst_height, dst_width);
+                                        n_channel, src_height, src_width, dst_height, dst_width);
 };
 
 template<typename T, typename IDX>
 __launch_bounds__(kBlockSize) __global__
-    void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper<IDX, 4> index_helper,
-                                 const IDX elem_num, const T* src, T* dest,
-                                 const int64_t* indice_ptr, const int32_t n_batch,
-                                 const int32_t n_channel, const int32_t src_time,
-                                 const int32_t src_height, const int32_t src_width,
-                                 const int32_t dst_time, const int32_t dst_height,
-                                 const int32_t dst_width) {
-  Maxpool3dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel,
-                              src_time, src_height, src_width, dst_time, dst_height, dst_width);
+    void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num,
+                                 const T* src, T* dest, const int64_t* indice_ptr,
+                                 const int32_t n_batch, const int32_t n_channel,
+                                 const int32_t src_time, const int32_t src_height,
+                                 const int32_t src_width, const int32_t dst_time,
+                                 const int32_t dst_height, const int32_t dst_width) {
+  Maxpool3dBackwardCompute<T, IDX>(index_helper, elem_num, src, dest, indice_ptr, n_batch,
+                                   n_channel, src_time, src_height, src_width, dst_time, dst_height,
+                                   dst_width);
 };
 
 template<typename T, typename IDX>
 struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
-  static void Maxpool1dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 2>& index_helper,
+  static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool1dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
-                                stream->As<ep::CudaStream>()->cuda_stream()>>>(
+                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2],
         params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4),
-        params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2],
-        params_3d.dilation_3d()[2]);
+        params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]);
   }
 
-  static void Maxpool1dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 2>& index_helper,
+  static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool1dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
-                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
+                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
         params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4));
   }
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
-                                     const IDX elem_num, const T* src, T* dest,
-                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                     const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                     const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dForwardCFirst<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
-                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
+                                           stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
-        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), 
-        params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], 
-        params_3d.stride_3d()[1], params_3d.stride_3d()[2],
-        params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
+        params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4),
+        params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[1],
+        params_3d.stride_3d()[2], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
   }
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
@@ -233,7 +224,7 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
                                       const int64_t* indice_ptr,
                                       const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dBackwardCFirst<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
-                                       stream->As<ep::CudaStream>()->cuda_stream()>>>(
+                                            stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
         params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
         params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
@@ -241,10 +232,10 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
 
   static void Maxpool2dForwardCLast(ep::Stream* stream,
                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
-                                    const IDX elem_num, const T* src, T* dest,
-                                    int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
+                                    const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                    const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dForwardCLast<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
-                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
+                                          stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1],
         params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(),
         params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3),
@@ -259,35 +250,31 @@ struct PoolingKernelUtil<DeviceType::kCUDA, T, IDX> {
                                      const int64_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool2dBackwardCLast<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
-                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
+                                           stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
         params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4),
         params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4));
   }
 
-  static void Maxpool3dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 4>& index_helper,
+  static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool3dForward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
-                                stream->As<ep::CudaStream>()->cuda_stream()>>>(
+                                     stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0],
         params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(),
-        params_3d.num_channel(), 
-        params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
-        params_3d.GetXShape5D().At(4), 
-        params_3d.pooling_size_3d()[0],
+        params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
+        params_3d.GetXShape5D().At(4), params_3d.pooling_size_3d()[0],
         params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[0],
         params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[0],
         params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]);
   }
 
-  static void Maxpool3dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 4>& index_helper,
+  static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) {
     DoCUDAMaxPool3dBackward<T, IDX><<<GetNumBlocks(elem_num), GetMinThreadNum(elem_num), 0,
-                                 stream->As<ep::CudaStream>()->cuda_stream()>>>(
+                                      stream->As<ep::CudaStream>()->cuda_stream()>>>(
         index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(),
         params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3),
         params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3),
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index dc99137d439..533641156a3 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -33,7 +33,7 @@ namespace oneflow {
   OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat)   \
   OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble)
 
-#define POOLING_IDX_DATA_TYPE_SEQ                     \
+#define POOLING_IDX_DATA_TYPE_SEQ                 \
   OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \
   OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64)
 
@@ -54,11 +54,9 @@ struct DeviceAdd {
   };
 };
 
-#ifdef WITH_CUDA 
- 
-OF_DEVICE_FUNC int32_t device_min(int32_t a, int32_t b) {
-  return a <= b ? a : b;
-}
+#ifdef WITH_CUDA
+
+OF_DEVICE_FUNC int32_t device_min(int32_t a, int32_t b) { return a <= b ? a : b; }
 
 #endif
 
@@ -102,20 +100,18 @@ class MaxPoolingParams3D {
 
 template<DeviceType device_type, typename T, typename IDX>
 struct PoolingKernelUtil {
-  static void Maxpool1dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 2>& index_helper,
+  static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
-  static void Maxpool1dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 2>& index_helper,
+  static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 2>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dForwardCFirst(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 3>& index_helper,
-                                     const IDX elem_num, const T* src, T* dest,
-                                     int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                     const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                     const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dBackwardCFirst(ep::Stream* stream,
                                       const NdIndexOffsetHelper<IDX, 3>& index_helper,
@@ -125,8 +121,8 @@ struct PoolingKernelUtil {
 
   static void Maxpool2dForwardCLast(ep::Stream* stream,
                                     const NdIndexOffsetHelper<IDX, 4>& index_helper,
-                                    const IDX elem_num, const T* src, T* dest,
-                                    int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
+                                    const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
+                                    const MaxPoolingParams3D& params_3d);
 
   static void Maxpool2dBackwardCLast(ep::Stream* stream,
                                      const NdIndexOffsetHelper<IDX, 4>& index_helper,
@@ -134,13 +130,11 @@ struct PoolingKernelUtil {
                                      const int64_t* indice_ptr,
                                      const MaxPoolingParams3D& params_3d);
 
-  static void Maxpool3dForward(ep::Stream* stream,
-                               const NdIndexOffsetHelper<IDX, 4>& index_helper,
+  static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr,
                                const MaxPoolingParams3D& params_3d);
 
-  static void Maxpool3dBackward(ep::Stream* stream,
-                                const NdIndexOffsetHelper<IDX, 4>& index_helper,
+  static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper<IDX, 4>& index_helper,
                                 const IDX elem_num, const T* src, T* dest,
                                 const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d);
 };
@@ -150,7 +144,7 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 2> in
                                             IDX elem_num, const T* src, T* dest,
                                             int64_t* indice_ptr, const int32_t padding_l,
                                             const int32_t n_batch, const int32_t n_channel,
-                                            const int32_t x_length, const int32_t kernel_size_l, 
+                                            const int32_t x_length, const int32_t kernel_size_l,
                                             const int32_t stride_l, const int32_t dilation_l) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n_c, l;
@@ -158,8 +152,8 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper<IDX, 2> in
 
     IDX lstart = l * stride_l - padding_l;
     const IDX lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length
-                             ? (lstart + (kernel_size_l - 1) * dilation_l + 1)
-                             : x_length;
+                         ? (lstart + (kernel_size_l - 1) * dilation_l + 1)
+                         : x_length;
 
     while (lstart < 0) { lstart += dilation_l; }
 
@@ -208,20 +202,19 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     const NdIndexOffsetHelper<IDX, 3> index_helper, IDX elem_num, const T* src, T* dest,
     int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch,
     const int32_t n_channel, const int32_t x_height, const int32_t x_width,
-    const int32_t kernel_size_h, const int32_t kernel_size_w,
-    const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h,
-    const int32_t dilation_w) {
+    const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h,
+    const int32_t stride_w, const int32_t dilation_h, const int32_t dilation_w) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n_c, h, w;
     index_helper.OffsetToNdIndex(num, n_c, h, w);
     IDX hstart = h * stride_h - padding_h;
     IDX wstart = w * stride_w - padding_w;
     const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height
-                             ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
-                             : x_height;
+                         ? (hstart + (kernel_size_h - 1) * dilation_h + 1)
+                         : x_height;
     const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width
-                             ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
-                             : x_width;
+                         ? (wstart + (kernel_size_w - 1) * dilation_w + 1)
+                         : x_width;
     while (hstart < 0) { hstart += dilation_h; }
     while (wstart < 0) { wstart += dilation_w; }
     /* equal to -std::numeric_limits<T>::infinity(); */
@@ -232,7 +225,7 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
     for (IDX i = hstart; i < hend; i += dilation_h) {
       for (IDX j = wstart; j < wend; j += dilation_w) {
         const IDX window_idx = i * x_width + j;
-        T val = data[window_idx]; 
+        T val = data[window_idx];
         /* NOTE:
         std::isnan(val) only supports a few data types, see:
         https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile,
@@ -256,8 +249,8 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst(
 
 template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
-    const NdIndexOffsetHelper<IDX, 3> index_helper, const IDX elem_num, const T* src,
-    T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
+    const NdIndexOffsetHelper<IDX, 3> index_helper, const IDX elem_num, const T* src, T* dest,
+    const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
     const int32_t src_height, const int32_t src_width, const int32_t dst_height,
     const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
@@ -277,11 +270,12 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst(
 }
 
 template<typename T, typename IDX>
-OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(
-    const NdIndexOffsetHelper<IDX, 4> index_helper, const IDX elem_num, const T* src,
-    T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel,
-    const int32_t src_height, const int32_t src_width, const int32_t dst_height,
-    const int32_t dst_width) {
+OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(const NdIndexOffsetHelper<IDX, 4> index_helper,
+                                                  const IDX elem_num, const T* src, T* dest,
+                                                  const int64_t* indice_ptr, const int32_t n_batch,
+                                                  const int32_t n_channel, const int32_t src_height,
+                                                  const int32_t src_width, const int32_t dst_height,
+                                                  const int32_t dst_width) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n, c, h, w;
     index_helper.OffsetToNdIndex(num, n, c, h, w);
@@ -300,11 +294,11 @@ template<typename T, typename IDX>
 OF_DEVICE_FUNC void Maxpool3dForwardCompute(
     const NdIndexOffsetHelper<IDX, 4> index_helper, IDX elem_num, const T* src, T* dest,
     int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w,
-    const int32_t n_batch, const int32_t n_channel, 
-    const int32_t x_time, const int32_t x_height, const int32_t x_width, 
-    const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w,
-    const int32_t stride_t, const int32_t stride_h, const int32_t stride_w,
-    const int32_t dilation_t, const int32_t dilation_h, const int32_t dilation_w) {
+    const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height,
+    const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h,
+    const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h,
+    const int32_t stride_w, const int32_t dilation_t, const int32_t dilation_h,
+    const int32_t dilation_w) {
   XPU_1D_KERNEL_LOOP(num, elem_num) {
     IDX n_c, t, h, w;
     index_helper.OffsetToNdIndex(num, n_c, t, h, w);
@@ -368,7 +362,8 @@ OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper<IDX, 4> i
 }
 
 #define INSTANTIATE_POOLING_KERNEL_UTIL(device_type_v, dtype_pair, index_dtype_pair) \
-  template struct PoolingKernelUtil<device_type_v, OF_PP_PAIR_FIRST(dtype_pair), OF_PP_PAIR_FIRST(index_dtype_pair)>;
+  template struct PoolingKernelUtil<device_type_v, OF_PP_PAIR_FIRST(dtype_pair),     \
+                                    OF_PP_PAIR_FIRST(index_dtype_pair)>;
 
 }  // namespace oneflow
 

From cc31a1454a6c806b4e259e8e4dc831097f129e4d Mon Sep 17 00:00:00 2001
From: MARD1NO <359521840@qq.com>
Date: Mon, 21 Feb 2022 13:48:13 +0800
Subject: [PATCH 17/17] fix comment

---
 oneflow/user/kernels/pooling_kernel.cpp    | 1 -
 oneflow/user/kernels/pooling_kernel_util.h | 6 ------
 2 files changed, 7 deletions(-)

diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp
index 12d979ec536..7371f805b98 100644
--- a/oneflow/user/kernels/pooling_kernel.cpp
+++ b/oneflow/user/kernels/pooling_kernel.cpp
@@ -294,7 +294,6 @@ class MaxPool2dKernel final : public user_op::OpKernel {
     const MaxPoolingParams3D& params_3d = pooling_cache->GetParams3D();
 
     const int64_t elem_num = y->shape().elem_cnt();
-    // const int32_t elem_num = y->shape().elem_cnt();
 
     const T* src = x->dptr<T>();
     T* dest = y->mut_dptr<T>();
diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h
index 533641156a3..defb5d37013 100644
--- a/oneflow/user/kernels/pooling_kernel_util.h
+++ b/oneflow/user/kernels/pooling_kernel_util.h
@@ -54,12 +54,6 @@ struct DeviceAdd {
   };
 };
 
-#ifdef WITH_CUDA
-
-OF_DEVICE_FUNC int32_t device_min(int32_t a, int32_t b) { return a <= b ? a : b; }
-
-#endif
-
 class MaxPoolingParams3D {
  public:
   MaxPoolingParams3D(const int32_t dim, const ShapeView& x_shape, const std::string& data_format,