From 7c2d1af8bf87b6bbeb26a179e4723068ae1e8d5f Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Fri, 4 Feb 2022 16:43:04 +0800 Subject: [PATCH 01/17] first debug --- oneflow/user/kernels/pooling_kernel_util.h | 112 ++++++++++++++------- 1 file changed, 73 insertions(+), 39 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index e5a67944eb8..6ad5233515c 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -205,53 +205,87 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, const int32_t dilation_w) { XPU_1D_KERNEL_LOOP(num, elem_num) { - int64_t n, c, h, w; - index_helper.OffsetToNdIndex(num, n, c, h, w); - - const int64_t start_idx = (n * n_channel + c) * x_width * x_height; - int64_t hstart = h * stride_h - padding_h; - int64_t wstart = w * stride_w - padding_w; - const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height + // int64_t n, c, h, w; + // index_helper.OffsetToNdIndex(num, n, c, h, w); + + // const int64_t start_idx = (n * n_channel + c) * x_width * x_height; + // int64_t hstart = h * stride_h - padding_h; + // int64_t wstart = w * stride_w - padding_w; + // const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height + // ? (hstart + (kernel_size_h - 1) * dilation_h + 1) + // : x_height; + // const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width + // ? (wstart + (kernel_size_w - 1) * dilation_w + 1) + // : x_width; + + // while (hstart < 0) { hstart += dilation_h; } + // while (wstart < 0) { wstart += dilation_w; } + + // /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ + // int64_t max_index = hstart * x_width + wstart; + // int64_t src_idx = 0; + + // /* equal to -std::numeric_limits::infinity(); */ + // T max_value = detail::numeric_limits::lower_bound(); + + // for (int64_t i = hstart; i < hend; i += dilation_h) { + // for (int64_t j = wstart; j < wend; j += dilation_w) { + // const int64_t window_idx = i * x_width + j; + // const int64_t search_idx = start_idx + window_idx; + // T val = src[search_idx]; + // /* NOTE: + // std::isnan(val) only supports a few data types, see: + // https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile, + // the following exception will be throw: + + // new_kernel_util.cu:24] Check failed: cudaMemcpyAsync(dst, src, sz, cudaMemcpyDefault, + // ctx->cuda_stream() ) : unspecified launch failure (719) + + // but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown! + // */ + // if (val > max_value || detail::numerics::isnan(val)) { + // max_value = val; + // max_index = window_idx; + // src_idx = search_idx; + // } + // } + // } + + int pw = num % kernel_size_w; + int ph = (num / kernel_size_w) % kernel_size_h; + int c = (num / kernel_size_w / kernel_size_h) % 64; + int n = num / kernel_size_w / kernel_size_h / 64; + int hstart = ph * stride_h - padding_h; + int wstart = pw * stride_w - padding_w; + // int hend = min(hstart + (kernel_size_h - 1) * dilation_h + 1, x_height); + // int wend = min(wstart + (kernel_size_w - 1) * dilation_w + 1, x_width); + + int hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height ? (hstart + (kernel_size_h - 1) * dilation_h + 1) : x_height; - const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width + int wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width ? (wstart + (kernel_size_w - 1) * dilation_w + 1) : x_width; - while (hstart < 0) { hstart += dilation_h; } - while (wstart < 0) { wstart += dilation_w; } - - /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ - int64_t max_index = hstart * x_width + wstart; - int64_t src_idx = 0; - - /* equal to -std::numeric_limits::infinity(); */ - T max_value = detail::numeric_limits::lower_bound(); - - for (int64_t i = hstart; i < hend; i += dilation_h) { - for (int64_t j = wstart; j < wend; j += dilation_w) { - const int64_t window_idx = i * x_width + j; - const int64_t search_idx = start_idx + window_idx; - T val = src[search_idx]; - /* NOTE: - std::isnan(val) only supports a few data types, see: - https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile, - the following exception will be throw: - - new_kernel_util.cu:24] Check failed: cudaMemcpyAsync(dst, src, sz, cudaMemcpyDefault, - ctx->cuda_stream() ) : unspecified launch failure (719) - - but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown! - */ - if (val > max_value || detail::numerics::isnan(val)) { - max_value = val; - max_index = window_idx; - src_idx = search_idx; + while(hstart < 0) + hstart += dilation_h; + while(wstart < 0) + wstart += dilation_w; + + T maxval = detail::numeric_limits::lower_bound(); // -Infinity + int maxidx = hstart * x_width + wstart; + const T* btm_data = src + (n * 64 + c) * x_height * x_width; + for (int h = hstart; h < hend; h += dilation_h) { + for (int w = wstart; w < wend; w += dilation_w) { + T val = btm_data[h * x_width + w]; + if (val > maxval|| detail::numerics::isnan(val)) { + maxidx = h * x_width + w; + maxval = static_cast(val); } } } - dest[num] = src[src_idx]; - indice_ptr[num] = max_index; + dest[num] = maxval; + indice_ptr[num] = maxidx; } } From f2375f7f58672f0d8f2780ba74cc91cafbfc31a2 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 7 Feb 2022 13:45:38 +0800 Subject: [PATCH 02/17] fix maxpool --- oneflow/user/kernels/pooling_kernel.cpp | 211 +++++++++------ oneflow/user/kernels/pooling_kernel.cu | 185 ++++++------- oneflow/user/kernels/pooling_kernel_util.h | 293 +++++++++------------ 3 files changed, 353 insertions(+), 336 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index edb9226cabc..ea726328a13 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -42,9 +42,9 @@ std::shared_ptr CreateOpKernelCache(user_op::KernelCacheCo namespace { -template -void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helper, - int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, +template +void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helper, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, const int64_t n_channel, const int64_t x_height, const int64_t x_width, @@ -52,33 +52,33 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_h const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, const int32_t dilation_w) { - int64_t n = 0, h = 0, w = 0, c = 0; - for (int64_t num = 0; num < elem_num; ++num) { + IDX n = 0, h = 0, w = 0, c = 0; + for (IDX num = 0; num < elem_num; ++num) { index_helper.OffsetToNdIndex(num, n, h, w, c); - const int64_t x_start_idx = n * x_height * x_width * n_channel; - const int64_t y_start_idx = n * y_height * y_width * n_channel; - int64_t hstart = h * stride_h - padding_h; - int64_t wstart = w * stride_w - padding_w; - const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height + const IDX x_start_idx = n * x_height * x_width * n_channel; + const IDX y_start_idx = n * y_height * y_width * n_channel; + IDX hstart = h * stride_h - padding_h; + IDX wstart = w * stride_w - padding_w; + const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height ? (hstart + (kernel_size_h - 1) * dilation_h + 1) : x_height; - const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width + const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width ? (wstart + (kernel_size_w - 1) * dilation_w + 1) : x_width; while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ - int64_t max_index = hstart * x_width + wstart; - int64_t src_idx = 0; + IDX max_index = hstart * x_width + wstart; + IDX src_idx = 0; /* equal to -std::numeric_limits::infinity(); */ T max_value = detail::numeric_limits::lower_bound(); - for (int64_t i = hstart; i < hend; i += dilation_h) { - for (int64_t j = wstart; j < wend; j += dilation_w) { - const int64_t window_idx = i * x_width * n_channel + j * n_channel + c; - const int64_t search_idx = x_start_idx + window_idx; + for (IDX i = hstart; i < hend; i += dilation_h) { + for (IDX j = wstart; j < wend; j += dilation_w) { + const IDX window_idx = i * x_width * n_channel + j * n_channel + c; + const IDX search_idx = x_start_idx + window_idx; T val = src[search_idx]; if (val > max_value || detail::numerics::isnan(val)) { max_value = val; @@ -87,7 +87,7 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_h } } } - const int64_t out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c; + const IDX out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c; dest[out_idx] = src[src_idx]; indice_ptr[out_idx] = max_index; } @@ -95,57 +95,57 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_h } // namespace -template -struct PoolingKernelUtil { +template +struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, + Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4), - params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[2], + params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]); } static void Maxpool1dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, + Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); } static void Maxpool2dForwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool2dForwardComputeCFirst( + Maxpool2dForwardComputeCFirst( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3), - params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[1], - params_3d.pooling_size_3d()[2], params_3d.stride_3d()[1], params_3d.stride_3d()[2], + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), + params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], + params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); } static void Maxpool2dBackwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, + Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); } static void Maxpool2dForwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool2dForwardComputeCLast( + Maxpool2dForwardComputeCLast( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3), @@ -155,36 +155,36 @@ struct PoolingKernelUtil { } static void Maxpool2dBackwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, + Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); } static void Maxpool3dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool3dForwardCompute( + Maxpool3dForwardCompute( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), - params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[0], - params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[0], - params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[0], + params_3d.num_channel(), + params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), + params_3d.pooling_size_3d()[0], params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], + params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], + params_3d.dilation_3d()[0], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); } static void Maxpool3dBackward(ep::Stream* stream, - const NdIndexOffsetHelper index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, + Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2), @@ -221,11 +221,17 @@ class MaxPool1dKernel final : public user_op::OpKernel { DimVector y_vector; y->shape().ToDimVector(&y_vector); - NdIndexOffsetHelper index_helper(y_vector.data()); - PoolingKernelUtil::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src, + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(y_vector.data()); + PoolingKernelUtil::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - }; + }else{ + NdIndexOffsetHelper index_helper(y_vector.data()); + PoolingKernelUtil::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src, + dest, indice_ptr, params_3d); + } + } }; template @@ -256,13 +262,19 @@ class MaxPool1dGradKernel final : public user_op::OpKernel { T* dest = dx->mut_dptr(); DimVector dy_vector; dy->shape().ToDimVector(&dy_vector); - NdIndexOffsetHelper index_helper(dy_vector.data()); - size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); - PoolingKernelUtil::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src, + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(dy_vector.data()); + PoolingKernelUtil::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src, + dest, indice_ptr, params_3d); + }else{ + NdIndexOffsetHelper index_helper(dy_vector.data()); + PoolingKernelUtil::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } + }; }; @@ -295,16 +307,29 @@ class MaxPool2dKernel final : public user_op::OpKernel { DimVector y_vector; y->shape().ToDimVector(&y_vector); - NdIndexOffsetHelper index_helper(y_vector.data()); const std::string& data_format = ctx->Attr("data_format"); - if (data_format == "channels_first") { - PoolingKernelUtil::Maxpool2dForwardCFirst( + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(y_vector.data()); + if (data_format == "channels_first") { + PoolingKernelUtil::Maxpool2dForwardCFirst( ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else if (data_format == "channels_last") { - PoolingKernelUtil::Maxpool2dForwardCLast( + } else if (data_format == "channels_last") { + PoolingKernelUtil::Maxpool2dForwardCLast( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { + UNIMPLEMENTED() << "Unsupported data_format"; + } + }else{ + NdIndexOffsetHelper index_helper(y_vector.data()); + if (data_format == "channels_first") { + PoolingKernelUtil::Maxpool2dForwardCFirst( ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else { - UNIMPLEMENTED() << "Unsupported data_format"; + } else if (data_format == "channels_last") { + PoolingKernelUtil::Maxpool2dForwardCLast( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { + UNIMPLEMENTED() << "Unsupported data_format"; + } } }; }; @@ -337,20 +362,33 @@ class MaxPool2dGradKernel final : public user_op::OpKernel { T* dest = dx->mut_dptr(); DimVector dy_vector; dy->shape().ToDimVector(&dy_vector); - NdIndexOffsetHelper index_helper(dy_vector.data()); size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); const std::string& data_format = ctx->Attr("data_format"); - if (data_format == "channels_first") { - PoolingKernelUtil::Maxpool2dBackwardCFirst( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else if (data_format == "channels_last") { - PoolingKernelUtil::Maxpool2dBackwardCLast( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else { - UNIMPLEMENTED() << "Unsupported data_format"; + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(dy_vector.data()); + if (data_format == "channels_first") { + PoolingKernelUtil::Maxpool2dBackwardCFirst( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else if (data_format == "channels_last") { + PoolingKernelUtil::Maxpool2dBackwardCLast( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { + UNIMPLEMENTED() << "Unsupported data_format"; + } + }else{ + NdIndexOffsetHelper index_helper(dy_vector.data()); + if (data_format == "channels_first") { + PoolingKernelUtil::Maxpool2dBackwardCFirst( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else if (data_format == "channels_last") { + PoolingKernelUtil::Maxpool2dBackwardCLast( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { + UNIMPLEMENTED() << "Unsupported data_format"; + } } }; }; @@ -384,10 +422,15 @@ class MaxPool3dKernel final : public user_op::OpKernel { DimVector y_vector; y->shape().ToDimVector(&y_vector); - NdIndexOffsetHelper index_helper(y_vector.data()); - - PoolingKernelUtil::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src, + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(y_vector.data()); + PoolingKernelUtil::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src, + dest, indice_ptr, params_3d); + }else{ + NdIndexOffsetHelper index_helper(y_vector.data()); + PoolingKernelUtil::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } }; }; @@ -420,13 +463,19 @@ class MaxPool3dGradKernel final : public user_op::OpKernel { DimVector dy_vector; dy->shape().ToDimVector(&dy_vector); - NdIndexOffsetHelper index_helper(dy_vector.data()); size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); - PoolingKernelUtil::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src, + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(dy_vector.data()); + PoolingKernelUtil::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + }else { + NdIndexOffsetHelper index_helper(dy_vector.data()); + PoolingKernelUtil::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src, + dest, indice_ptr, params_3d); + } }; }; @@ -469,6 +518,6 @@ REGISTER_POOLING_WITH_DEVICE(DeviceType::kCUDA) #endif OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_POOLING_KERNEL_UTIL, (DeviceType::kCPU), - POOLING_DATA_TYPE_CPU_SEQ); + POOLING_DATA_TYPE_CPU_SEQ, POOLING_IDX_DATA_TYPE_SEQ); } // namespace oneflow diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu index 124f984d0aa..1432ab2e783 100644 --- a/oneflow/user/kernels/pooling_kernel.cu +++ b/oneflow/user/kernels/pooling_kernel.cu @@ -32,41 +32,41 @@ int GetNumBlocks(int64_t elem_cnt) { return num_blocks; } -template +template __device__ __inline__ void Maxpool2dForwardComputeCLast( - const NdIndexOffsetHelper& index_helper, int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height, const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, const int32_t dilation_w) { - int64_t n, h, w, c; + IDX n, h, w, c; CUDA_1D_KERNEL_LOOP(num, elem_num) { index_helper.OffsetToNdIndex(num, n, h, w, c); - const int64_t x_start_idx = n * n_channel * x_width * x_height; - const int64_t y_start_idx = n * n_channel * y_height * y_width; - int64_t hstart = h * stride_h - padding_h; - int64_t wstart = w * stride_w - padding_w; - const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height + const IDX x_start_idx = n * n_channel * x_width * x_height; + const IDX y_start_idx = n * n_channel * y_height * y_width; + IDX hstart = h * stride_h - padding_h; + IDX wstart = w * stride_w - padding_w; + const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height ? (hstart + (kernel_size_h - 1) * dilation_h + 1) : x_height; - const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width + const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width ? (wstart + (kernel_size_w - 1) * dilation_w + 1) : x_width; while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ - int64_t max_index = hstart * x_width + wstart; - int64_t src_idx = 0; + IDX max_index = hstart * x_width + wstart; + IDX src_idx = 0; /* equal to -std::numeric_limits::infinity(); */ T max_value = detail::numeric_limits::lower_bound(); - for (int64_t i = hstart; i < hend; i++) { - for (int64_t j = wstart; j < wend; j++) { - const int64_t window_idx = i * x_width * n_channel + j * n_channel + c; - const int64_t search_idx = x_start_idx + window_idx; + for (IDX i = hstart; i < hend; i++) { + for (IDX j = wstart; j < wend; j++) { + const IDX window_idx = i * x_width * n_channel + j * n_channel + c; + const IDX search_idx = x_start_idx + window_idx; T val = src[search_idx]; if (val > max_value || detail::numerics::isnan(val)) { max_value = val; @@ -75,7 +75,7 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast( } } } - const int64_t out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c; + const IDX out_idx = y_start_idx + h * y_width * n_channel + w * n_channel + c; dest[out_idx] = src[src_idx]; indice_ptr[out_idx] = max_index; } @@ -83,156 +83,156 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast( } // namespace -template +template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, - int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_l, int64_t n_batch, int64_t n_channel, - int64_t x_length, int64_t y_length, int32_t kernel_size_l, + int64_t x_length, int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) { - Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch, - n_channel, x_length, y_length, kernel_size_l, stride_l, dilation_l); + Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch, + n_channel, x_length, kernel_size_l, stride_l, dilation_l); }; -template +template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, - int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, int32_t padding_w, int64_t n_batch, int64_t n_channel, int64_t x_height, int64_t x_width, - int64_t y_height, int64_t y_width, int32_t kernel_size_h, - int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, + int32_t kernel_size_h, int32_t kernel_size_w, + int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w) { - Maxpool2dForwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, padding_h, - padding_w, n_batch, n_channel, x_height, x_width, y_height, - y_width, kernel_size_h, kernel_size_w, stride_h, stride_w, + Maxpool2dForwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, padding_h, + padding_w, n_batch, n_channel, x_height, x_width, + kernel_size_h, kernel_size_w, stride_h, stride_w, dilation_h, dilation_w); }; -template +template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, - int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, int32_t padding_w, int64_t n_batch, int64_t n_channel, int64_t x_height, int64_t x_width, int64_t y_height, int64_t y_width, int32_t kernel_size_h, int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w) { - Maxpool2dForwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, padding_h, + Maxpool2dForwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, padding_h, padding_w, n_batch, n_channel, x_height, x_width, y_height, y_width, kernel_size_h, kernel_size_w, stride_h, stride_w, dilation_h, dilation_w); }; -template +template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, - int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t, int32_t padding_h, int32_t padding_w, int64_t n_batch, int64_t n_channel, int64_t x_time, - int64_t x_height, int64_t x_width, int64_t y_time, int64_t y_height, - int64_t y_width, int32_t kernel_size_t, int32_t kernel_size_h, + int64_t x_height, int64_t x_width, + int32_t kernel_size_t, int32_t kernel_size_h, int32_t kernel_size_w, int32_t stride_t, int32_t stride_h, int32_t stride_w, int32_t dilation_t, int32_t dilation_h, int32_t dilation_w) { - Maxpool3dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_t, padding_h, - padding_w, n_batch, n_channel, x_time, x_height, x_width, y_time, - y_height, y_width, kernel_size_t, kernel_size_h, kernel_size_w, + Maxpool3dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_t, padding_h, + padding_w, n_batch, n_channel, x_time, x_height, x_width, + kernel_size_t, kernel_size_h, kernel_size_w, stride_t, stride_h, stride_w, dilation_t, dilation_h, dilation_w); }; -template +template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, - const int64_t elem_num, const T* src, T* dest, + void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_length, const int64_t dst_length) { - Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, + Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, src_length, dst_length); }; -template +template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, - const int64_t elem_num, const T* src, T* dest, + void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_height, const int64_t src_width, const int64_t dst_height, const int64_t dst_width) { - Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, n_batch, + Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, src_height, src_width, dst_height, dst_width); }; -template +template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper index_helper, - const int64_t elem_num, const T* src, T* dest, + void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_height, const int64_t src_width, const int64_t dst_height, const int64_t dst_width) { - Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, n_batch, + Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, src_height, src_width, dst_height, dst_width); }; -template +template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, - const int64_t elem_num, const T* src, T* dest, + void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_time, const int64_t src_height, const int64_t src_width, const int64_t dst_time, const int64_t dst_height, const int64_t dst_width) { - Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, + Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, src_time, src_height, src_width, dst_time, dst_height, dst_width); }; -template -struct PoolingKernelUtil { +template +struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - DoCUDAMaxPool1dForward<<<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4), - params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2], + params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]); } static void Maxpool1dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - DoCUDAMaxPool1dBackward<<<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); } static void Maxpool2dForwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - DoCUDAMaxPool2dForwardCFirst<<<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3), - params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[1], - params_3d.pooling_size_3d()[2], params_3d.stride_3d()[1], params_3d.stride_3d()[2], + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), + params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], + params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); } static void Maxpool2dBackwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - DoCUDAMaxPool2dBackwardCFirst<<<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), @@ -240,10 +240,10 @@ struct PoolingKernelUtil { } static void Maxpool2dForwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - DoCUDAMaxPool2dForwardCLast<<<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), @@ -254,11 +254,11 @@ struct PoolingKernelUtil { } static void Maxpool2dBackwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - DoCUDAMaxPool2dBackwardCLast<<<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), @@ -266,26 +266,27 @@ struct PoolingKernelUtil { } static void Maxpool3dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - DoCUDAMaxPool3dForward<<<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), - params_3d.GetYShape5D().At(4), params_3d.pooling_size_3d()[0], + params_3d.num_channel(), + params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4), + params_3d.pooling_size_3d()[0], params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[0], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); } static void Maxpool3dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - DoCUDAMaxPool3dBackward<<<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), @@ -295,7 +296,7 @@ struct PoolingKernelUtil { }; OF_PP_SEQ_PRODUCT_FOR_EACH_TUPLE(INSTANTIATE_POOLING_KERNEL_UTIL, (DeviceType::kCUDA), - POOLING_DATA_TYPE_CUDA_SEQ); + POOLING_DATA_TYPE_CUDA_SEQ, POOLING_IDX_DATA_TYPE_SEQ); } // namespace oneflow #endif // WITH_CUDA diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index 6ad5233515c..4c7c542896d 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -33,6 +33,10 @@ namespace oneflow { OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) \ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble) +#define POOLING_IDX_DATA_TYPE_SEQ \ + OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ + OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) + #define POOLING_DATA_TYPE_CPU_SEQ POOLING_DATA_TYPE_SEQ #define POOLING_DATA_TYPE_CUDA_SEQ POOLING_DATA_TYPE_SEQ @@ -88,81 +92,80 @@ class MaxPoolingParams3D { int64_t channel_num_; }; -template +template struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool1dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dForwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dBackwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dForwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dBackwardCLast(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool3dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, int64_t* indice_ptr, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool3dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, - const int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper& index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); }; -template -OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper index_helper, - int64_t elem_num, const T* src, T* dest, +template +OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper index_helper, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_l, const int64_t n_batch, const int64_t n_channel, - const int64_t x_length, const int64_t y_length, - const int32_t kernel_size_l, const int32_t stride_l, - const int32_t dilation_l) { + const int64_t x_length, const int32_t kernel_size_l, + const int32_t stride_l, const int32_t dilation_l) { XPU_1D_KERNEL_LOOP(num, elem_num) { - int64_t n, c, l; + IDX n, c, l; index_helper.OffsetToNdIndex(num, n, c, l); // n, c, l->index = n*c*l + c* l - const int64_t start_idx = (n * n_channel + c) * x_length; - int64_t lstart = l * stride_l - padding_l; - const int64_t lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length + const IDX start_idx = (n * n_channel + c) * x_length; + IDX lstart = l * stride_l - padding_l; + const IDX lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length ? (lstart + (kernel_size_l - 1) * dilation_l + 1) : x_length; while (lstart < 0) { lstart += dilation_l; } /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ - int64_t max_index = lstart; - int64_t src_idx = 0; + IDX max_index = lstart; + IDX src_idx = 0; /* equal to -std::numeric_limits::infinity(); */ T max_value = detail::numeric_limits::lower_bound(); - for (int64_t idx = lstart; idx < lend; idx += dilation_l) { - const int64_t search_idx = start_idx + idx; + for (IDX idx = lstart; idx < lend; idx += dilation_l) { + const IDX search_idx = start_idx + idx; T val = src[search_idx]; if (val > max_value || detail::numerics::isnan(val)) { max_value = val; @@ -175,20 +178,20 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper -OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper index_helper, - const int64_t elem_num, const T* src, T* dest, +template +OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_length, const int64_t dst_length) { XPU_1D_KERNEL_LOOP(num, elem_num) { - int64_t n, c, l; + IDX n, c, l; index_helper.OffsetToNdIndex(num, n, c, l); - const int64_t src_start = (n * n_channel + c) * src_length; - const int64_t dst_start = (n * n_channel + c) * dst_length; - const int64_t index = src_start + l; - const int64_t max_index = dst_start + indice_ptr[index]; + const IDX src_start = (n * n_channel + c) * src_length; + const IDX dst_start = (n * n_channel + c) * dst_length; + const IDX index = src_start + l; + const IDX max_index = dst_start + indice_ptr[index]; if (max_index != -1) { /* update gradient, equals to dest[max_index] += src[index]; */ DeviceAdd::Invoke(src + index, dest + max_index); @@ -196,114 +199,78 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper +template OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( - const NdIndexOffsetHelper index_helper, int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, - const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height, - const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w, + const int64_t n_channel, const int64_t x_height, const int64_t x_width, + const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, const int32_t dilation_w) { XPU_1D_KERNEL_LOOP(num, elem_num) { - // int64_t n, c, h, w; - // index_helper.OffsetToNdIndex(num, n, c, h, w); - - // const int64_t start_idx = (n * n_channel + c) * x_width * x_height; - // int64_t hstart = h * stride_h - padding_h; - // int64_t wstart = w * stride_w - padding_w; - // const int64_t hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height - // ? (hstart + (kernel_size_h - 1) * dilation_h + 1) - // : x_height; - // const int64_t wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width - // ? (wstart + (kernel_size_w - 1) * dilation_w + 1) - // : x_width; - - // while (hstart < 0) { hstart += dilation_h; } - // while (wstart < 0) { wstart += dilation_w; } - - // /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ - // int64_t max_index = hstart * x_width + wstart; - // int64_t src_idx = 0; - - // /* equal to -std::numeric_limits::infinity(); */ - // T max_value = detail::numeric_limits::lower_bound(); - - // for (int64_t i = hstart; i < hend; i += dilation_h) { - // for (int64_t j = wstart; j < wend; j += dilation_w) { - // const int64_t window_idx = i * x_width + j; - // const int64_t search_idx = start_idx + window_idx; - // T val = src[search_idx]; - // /* NOTE: - // std::isnan(val) only supports a few data types, see: - // https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile, - // the following exception will be throw: - - // new_kernel_util.cu:24] Check failed: cudaMemcpyAsync(dst, src, sz, cudaMemcpyDefault, - // ctx->cuda_stream() ) : unspecified launch failure (719) - - // but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown! - // */ - // if (val > max_value || detail::numerics::isnan(val)) { - // max_value = val; - // max_index = window_idx; - // src_idx = search_idx; - // } - // } - // } - - int pw = num % kernel_size_w; - int ph = (num / kernel_size_w) % kernel_size_h; - int c = (num / kernel_size_w / kernel_size_h) % 64; - int n = num / kernel_size_w / kernel_size_h / 64; - int hstart = ph * stride_h - padding_h; - int wstart = pw * stride_w - padding_w; - // int hend = min(hstart + (kernel_size_h - 1) * dilation_h + 1, x_height); - // int wend = min(wstart + (kernel_size_w - 1) * dilation_w + 1, x_width); - - int hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height + IDX n, c, h, w; + index_helper.OffsetToNdIndex(num, n, c, h, w); + + const IDX start_idx = (n * n_channel + c) * x_width * x_height; + IDX hstart = h * stride_h - padding_h; + IDX wstart = w * stride_w - padding_w; + const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height ? (hstart + (kernel_size_h - 1) * dilation_h + 1) : x_height; - int wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width + const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width ? (wstart + (kernel_size_w - 1) * dilation_w + 1) : x_width; - while(hstart < 0) - hstart += dilation_h; - while(wstart < 0) - wstart += dilation_w; - - T maxval = detail::numeric_limits::lower_bound(); // -Infinity - int maxidx = hstart * x_width + wstart; - const T* btm_data = src + (n * 64 + c) * x_height * x_width; - for (int h = hstart; h < hend; h += dilation_h) { - for (int w = wstart; w < wend; w += dilation_w) { - T val = btm_data[h * x_width + w]; - if (val > maxval|| detail::numerics::isnan(val)) { - maxidx = h * x_width + w; - maxval = static_cast(val); + while (hstart < 0) { hstart += dilation_h; } + while (wstart < 0) { wstart += dilation_w; } + + /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ + IDX max_index = hstart * x_width + wstart; + IDX src_idx = 0; + + /* equal to -std::numeric_limits::infinity(); */ + T max_value = detail::numeric_limits::lower_bound(); + + for (IDX i = hstart; i < hend; i += dilation_h) { + for (IDX j = wstart; j < wend; j += dilation_w) { + const IDX window_idx = i * x_width + j; + const IDX search_idx = start_idx + window_idx; + T val = src[search_idx]; + /* NOTE: + std::isnan(val) only supports a few data types, see: + https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile, + the following exception will be throw: + + new_kernel_util.cu:24] Check failed: cudaMemcpyAsync(dst, src, sz, cudaMemcpyDefault, + ctx->cuda_stream() ) : unspecified launch failure (719) + + but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown! + */ + if (val > max_value || detail::numerics::isnan(val)) { + max_value = val; + max_index = window_idx; + src_idx = search_idx; } } } - dest[num] = maxval; - indice_ptr[num] = maxidx; } } -template +template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( - const NdIndexOffsetHelper index_helper, const int64_t elem_num, const T* src, + const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_height, const int64_t src_width, const int64_t dst_height, const int64_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { - int64_t n, c, h, w; + IDX n, c, h, w; index_helper.OffsetToNdIndex(num, n, c, h, w); - const int64_t src_start = (n * n_channel + c) * src_height * src_width; - const int64_t dst_start = (n * n_channel + c) * dst_height * dst_width; - const int64_t index = src_start + h * src_width + w; + const IDX src_start = (n * n_channel + c) * src_height * src_width; + const IDX dst_start = (n * n_channel + c) * dst_height * dst_width; + const IDX index = src_start + h * src_width + w; - const int64_t max_index = dst_start + indice_ptr[index]; + const IDX max_index = dst_start + indice_ptr[index]; if (max_index != -1) { /* update gradient, equals to dest[max_index] += src[index]; */ DeviceAdd::Invoke(src + index, dest + max_index); @@ -311,19 +278,19 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( } } -template +template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast( - const NdIndexOffsetHelper index_helper, const int64_t elem_num, const T* src, + const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_height, const int64_t src_width, const int64_t dst_height, const int64_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { - int64_t n, c, h, w; + IDX n, c, h, w; index_helper.OffsetToNdIndex(num, n, c, h, w); - const int64_t src_start = n * src_height * src_width * n_channel; - const int64_t dst_start = n * dst_height * dst_width * n_channel; - const int64_t index = src_start + h * src_width + w; - const int64_t max_index = dst_start + indice_ptr[index]; + const IDX src_start = n * src_height * src_width * n_channel; + const IDX dst_start = n * dst_height * dst_width * n_channel; + const IDX index = src_start + h * src_width + w; + const IDX max_index = dst_start + indice_ptr[index]; if (max_index != -1) { /* update gradient, equals to dest[max_index] += src[index]; */ DeviceAdd::Invoke(src + index, dest + max_index); @@ -331,45 +298,45 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast( } } -template +template OF_DEVICE_FUNC void Maxpool3dForwardCompute( - const NdIndexOffsetHelper index_helper, int64_t elem_num, const T* src, T* dest, + const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, - const int64_t n_batch, const int64_t n_channel, const int64_t x_time, const int64_t x_height, - const int64_t x_width, const int64_t y_time, const int64_t y_height, const int64_t y_width, + const int64_t n_batch, const int64_t n_channel, + const int64_t x_time, const int64_t x_height, const int64_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_t, const int32_t dilation_h, const int32_t dilation_w) { XPU_1D_KERNEL_LOOP(num, elem_num) { - int64_t n, c, t, h, w; + IDX n, c, t, h, w; index_helper.OffsetToNdIndex(num, n, c, t, h, w); - int64_t xstart = n * n_channel * x_time * x_width * x_height; - int64_t start_idx = xstart + c * x_time * x_width * x_height; - int64_t tstart = t * stride_t - padding_t; - int64_t hstart = h * stride_h - padding_h; - int64_t wstart = w * stride_w - padding_w; + IDX xstart = n * n_channel * x_time * x_width * x_height; + IDX start_idx = xstart + c * x_time * x_width * x_height; + IDX tstart = t * stride_t - padding_t; + IDX hstart = h * stride_h - padding_h; + IDX wstart = w * stride_w - padding_w; - const int64_t t1 = tstart + (kernel_size_t - 1) * dilation_t + 1; - const int64_t t2 = hstart + (kernel_size_h - 1) * dilation_h + 1; - const int64_t t3 = wstart + (kernel_size_w - 1) * dilation_w + 1; - const int64_t tend = t1 <= x_time ? t1 : x_time; - const int64_t hend = t2 <= x_height ? t2 : x_height; - const int64_t wend = t3 <= x_width ? t3 : x_width; + const IDX t1 = tstart + (kernel_size_t - 1) * dilation_t + 1; + const IDX t2 = hstart + (kernel_size_h - 1) * dilation_h + 1; + const IDX t3 = wstart + (kernel_size_w - 1) * dilation_w + 1; + const IDX tend = t1 <= x_time ? t1 : x_time; + const IDX hend = t2 <= x_height ? t2 : x_height; + const IDX wend = t3 <= x_width ? t3 : x_width; while (tstart < 0) { tstart += dilation_t; } while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } - int64_t max_index = tstart * x_height * x_width + hstart * x_width + wstart; - int64_t src_idx = 0; + IDX max_index = tstart * x_height * x_width + hstart * x_width + wstart; + IDX src_idx = 0; T max_value = detail::numeric_limits::lower_bound(); - for (int64_t zi = tstart; zi < tend; zi += dilation_t) { - for (int64_t i = hstart; i < hend; i += dilation_h) { - for (int64_t j = wstart; j < wend; j += dilation_w) { - const int64_t window_idx = zi * x_height * x_width + i * x_width + j; - const int64_t search_idx = start_idx + window_idx; + for (IDX zi = tstart; zi < tend; zi += dilation_t) { + for (IDX i = hstart; i < hend; i += dilation_h) { + for (IDX j = wstart; j < wend; j += dilation_w) { + const IDX window_idx = zi * x_height * x_width + i * x_width + j; + const IDX search_idx = start_idx + window_idx; T val = src[search_idx]; if (val > max_value || detail::numerics::isnan(val)) { max_value = val; @@ -386,29 +353,29 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute( } } -template -OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper index_helper, - const int64_t elem_num, const T* src, T* dest, +template +OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_time, const int64_t src_height, const int64_t src_width, const int64_t dst_time, const int64_t dst_height, const int64_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { - int64_t n, c, t, h, w; + IDX n, c, t, h, w; index_helper.OffsetToNdIndex(num, n, c, t, h, w); - const int64_t src_start = (n * n_channel + c) * src_time * src_height * src_width; - const int64_t dst_start = (n * n_channel + c) * dst_time * dst_height * dst_width; - const int64_t index = src_start + t * src_height * src_width + h * src_width + w; - const int64_t max_index = dst_start + indice_ptr[index]; + const IDX src_start = (n * n_channel + c) * src_time * src_height * src_width; + const IDX dst_start = (n * n_channel + c) * dst_time * dst_height * dst_width; + const IDX index = src_start + t * src_height * src_width + h * src_width + w; + const IDX max_index = dst_start + indice_ptr[index]; if (max_index != -1) { DeviceAdd::Invoke(src + index, dest + max_index); } } } -#define INSTANTIATE_POOLING_KERNEL_UTIL(device_type_v, dtype_pair) \ - template struct PoolingKernelUtil; +#define INSTANTIATE_POOLING_KERNEL_UTIL(device_type_v, dtype_pair, index_dtype_pair) \ + template struct PoolingKernelUtil; } // namespace oneflow From 79ff4dbce54aa0637ad5557f477c6e776f2d012d Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 7 Feb 2022 14:02:35 +0800 Subject: [PATCH 03/17] fix bug --- oneflow/user/kernels/pooling_kernel_util.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index 4c7c542896d..f620c80d77b 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -253,6 +253,8 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( } } } + dest[num] = src[src_idx]; + indice_ptr[num] = max_index; } } From 14c9fd6dedb278a73118363bc220a19dec769abd Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 7 Feb 2022 14:02:48 +0800 Subject: [PATCH 04/17] remove redundant code --- oneflow/user/ops/pooling_op.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/oneflow/user/ops/pooling_op.cpp b/oneflow/user/ops/pooling_op.cpp index c46b85188d3..ffb7d85e1ba 100644 --- a/oneflow/user/ops/pooling_op.cpp +++ b/oneflow/user/ops/pooling_op.cpp @@ -113,7 +113,6 @@ Maybe AvgPoolForwardGetSbpFn(user_op::SbpContext* ctx) { Maybe MaxPoolBackwardGetSbpFn(user_op::SbpContext* ctx) { const user_op::TensorDesc& tensor = ctx->LogicalTensorDesc4InputArgNameAndIndex("x", 0); - const std::vector& padding = ctx->Attr>("padding"); FOR_RANGE(int64_t, i, 0, std::min(2, (int)tensor.shape().NumAxes())) { ctx->NewBuilder() .Split(user_op::OpArg("x", 0), i) From 4f5e6f44e29a7737b8a1168853c6b0e18f4d7765 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 7 Feb 2022 20:20:49 +0800 Subject: [PATCH 05/17] remove redundant read --- oneflow/user/kernels/pooling_kernel_util.h | 30 +++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index f620c80d77b..b8e134d9a9b 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -54,6 +54,13 @@ struct DeviceAdd { }; }; +#ifdef WITH_CUDA + template + __device__ inline T device_min(T a, T b) { + return a <= b ? a : b; + } +#endif + class MaxPoolingParams3D { public: MaxPoolingParams3D(const int32_t dim, const ShapeView& x_shape, const std::string& data_format, @@ -214,19 +221,35 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( const IDX start_idx = (n * n_channel + c) * x_width * x_height; IDX hstart = h * stride_h - padding_h; IDX wstart = w * stride_w - padding_w; + /* const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height ? (hstart + (kernel_size_h - 1) * dilation_h + 1) : x_height; const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width ? (wstart + (kernel_size_w - 1) * dilation_w + 1) : x_width; + */ + + #ifdef WITH_CUDA + const IDX hend = device_min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); + const IDX wend = device_min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height); + // const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height + // ? (hstart + (kernel_size_h - 1) * dilation_h + 1) + // : x_height; + // const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width + // ? (wstart + (kernel_size_w - 1) * dilation_w + 1) + // : x_width; + #else + const IDX hend = std::min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); + const IDX wend = std::min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height); + #endif while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ IDX max_index = hstart * x_width + wstart; - IDX src_idx = 0; + // IDX src_idx = 0; /* equal to -std::numeric_limits::infinity(); */ T max_value = detail::numeric_limits::lower_bound(); @@ -249,11 +272,12 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( if (val > max_value || detail::numerics::isnan(val)) { max_value = val; max_index = window_idx; - src_idx = search_idx; + // src_idx = search_idx; } } } - dest[num] = src[src_idx]; + // dest[num] = src[src_idx]; + dest[num] = max_value; indice_ptr[num] = max_index; } } From d9fb335c4efa0980172828f81a4008a4fe5e0226 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 7 Feb 2022 20:32:15 +0800 Subject: [PATCH 06/17] remove redundant data_ptr offset --- oneflow/user/kernels/pooling_kernel_util.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index b8e134d9a9b..f062c72b110 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -254,11 +254,13 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( /* equal to -std::numeric_limits::infinity(); */ T max_value = detail::numeric_limits::lower_bound(); + const T* btm_data = src + start_idx; for (IDX i = hstart; i < hend; i += dilation_h) { for (IDX j = wstart; j < wend; j += dilation_w) { const IDX window_idx = i * x_width + j; - const IDX search_idx = start_idx + window_idx; - T val = src[search_idx]; + // const IDX search_idx = start_idx + window_idx; + // T val = src[search_idx]; + T val = btm_data[window_idx]; /* NOTE: std::isnan(val) only supports a few data types, see: https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile, From 3b79ba8e43f3e887bdef029ec3f9a561930fc70d Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 8 Feb 2022 09:27:47 +0800 Subject: [PATCH 07/17] use int32 to describe x shape --- oneflow/user/kernels/pooling_kernel.cpp | 6 +-- oneflow/user/kernels/pooling_kernel_util.h | 48 +++++++++++----------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index ea726328a13..eba32c13151 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -46,9 +46,9 @@ template void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, - const int64_t n_batch, const int64_t n_channel, - const int64_t x_height, const int64_t x_width, - const int64_t y_height, const int64_t y_width, + const int32_t n_batch, const int32_t n_channel, + const int32_t x_height, const int32_t x_width, + const int32_t y_height, const int32_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, const int32_t dilation_w) { diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index f062c72b110..3b68b5a5c1e 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -76,8 +76,8 @@ class MaxPoolingParams3D { const std::vector& dilation_3d() const { return dilation_3d_; } const bool& return_indices() const { return return_indices_; } const bool& ceil_mode() const { return ceil_mode_; } - const int64_t& num_batch() const { return batch_num_; } - const int64_t& num_channel() const { return channel_num_; } + const int32_t& num_batch() const { return batch_num_; } + const int32_t& num_channel() const { return channel_num_; } void Reset(const ShapeView& x_shape); Shape GetYShape() const; @@ -95,8 +95,8 @@ class MaxPoolingParams3D { std::vector dilation_3d_; bool return_indices_; bool ceil_mode_; - int64_t batch_num_; - int64_t channel_num_; + int32_t batch_num_; + int32_t channel_num_; }; template @@ -148,8 +148,8 @@ template OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_l, - const int64_t n_batch, const int64_t n_channel, - const int64_t x_length, const int32_t kernel_size_l, + const int32_t n_batch, const int32_t n_channel, + const int32_t x_length, const int32_t kernel_size_l, const int32_t stride_l, const int32_t dilation_l) { XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n, c, l; @@ -188,9 +188,9 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper in template OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, - const int64_t n_channel, const int64_t src_length, - const int64_t dst_length) { + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_length, + const int32_t dst_length) { XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n, c, l; index_helper.OffsetToNdIndex(num, n, c, l); @@ -209,8 +209,8 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper i template OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, - const int64_t n_channel, const int64_t x_height, const int64_t x_width, + int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, + const int32_t n_channel, const int32_t x_height, const int32_t x_width, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, const int32_t dilation_w) { @@ -287,9 +287,9 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, - T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, - const int64_t src_height, const int64_t src_width, const int64_t dst_height, - const int64_t dst_width) { + T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, + const int32_t src_height, const int32_t src_width, const int32_t dst_height, + const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n, c, h, w; index_helper.OffsetToNdIndex(num, n, c, h, w); @@ -309,9 +309,9 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast( const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, - T* dest, const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, - const int64_t src_height, const int64_t src_width, const int64_t dst_height, - const int64_t dst_width) { + T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, + const int32_t src_height, const int32_t src_width, const int32_t dst_height, + const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n, c, h, w; index_helper.OffsetToNdIndex(num, n, c, h, w); @@ -330,8 +330,8 @@ template OF_DEVICE_FUNC void Maxpool3dForwardCompute( const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, - const int64_t n_batch, const int64_t n_channel, - const int64_t x_time, const int64_t x_height, const int64_t x_width, + const int32_t n_batch, const int32_t n_channel, + const int32_t x_time, const int32_t x_height, const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_t, const int32_t dilation_h, const int32_t dilation_w) { @@ -384,11 +384,11 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute( template OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, - const int64_t n_channel, const int64_t src_time, - const int64_t src_height, const int64_t src_width, - const int64_t dst_time, const int64_t dst_height, - const int64_t dst_width) { + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_time, + const int32_t src_height, const int32_t src_width, + const int32_t dst_time, const int32_t dst_height, + const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n, c, t, h, w; index_helper.OffsetToNdIndex(num, n, c, t, h, w); From 4f8bba56384322a6916c6892b41b40b23725a767 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 8 Feb 2022 09:50:50 +0800 Subject: [PATCH 08/17] Fix cuda input params for maxpool2d --- oneflow/user/kernels/pooling_kernel.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu index 1432ab2e783..84373a0cd26 100644 --- a/oneflow/user/kernels/pooling_kernel.cu +++ b/oneflow/user/kernels/pooling_kernel.cu @@ -98,8 +98,8 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - int32_t padding_h, int32_t padding_w, int64_t n_batch, - int64_t n_channel, int64_t x_height, int64_t x_width, + int32_t padding_h, int32_t padding_w, int32_t n_batch, + int32_t n_channel, int32_t x_height, int32_t x_width, int32_t kernel_size_h, int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w) { From 6b6a8e3ffd8eb7c068911310842d09fb98779a7a Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 8 Feb 2022 10:23:51 +0800 Subject: [PATCH 09/17] just for debug --- oneflow/user/kernels/pooling_kernel.cpp | 57 +++++++++++++--------- oneflow/user/kernels/pooling_kernel_util.h | 23 ++++----- 2 files changed, 47 insertions(+), 33 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index eba32c13151..62f6a03fca8 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -300,7 +300,9 @@ class MaxPool2dKernel final : public user_op::OpKernel { const auto* pooling_cache = dynamic_cast(cache); const MaxPoolingParams3D& params_3d = pooling_cache->GetParams3D(); - const int64_t elem_num = y->shape().elem_cnt(); + // const int64_t elem_num = y->shape().elem_cnt(); + const int32_t elem_num = y->shape().elem_cnt(); + const T* src = x->dptr(); T* dest = y->mut_dptr(); int64_t* indice_ptr = indice->mut_dptr(); @@ -308,28 +310,39 @@ class MaxPool2dKernel final : public user_op::OpKernel { DimVector y_vector; y->shape().ToDimVector(&y_vector); const std::string& data_format = ctx->Attr("data_format"); - if(elem_num < GetMaxVal()){ - NdIndexOffsetHelper index_helper(y_vector.data()); - if (data_format == "channels_first") { - PoolingKernelUtil::Maxpool2dForwardCFirst( + // if(elem_num < GetMaxVal()){ + // NdIndexOffsetHelper index_helper(y_vector.data()); + // if (data_format == "channels_first") { + // PoolingKernelUtil::Maxpool2dForwardCFirst( + // ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + // } else if (data_format == "channels_last") { + // PoolingKernelUtil::Maxpool2dForwardCLast( + // ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + // } else { + // UNIMPLEMENTED() << "Unsupported data_format"; + // } + // }else{ + // NdIndexOffsetHelper index_helper(y_vector.data()); + // if (data_format == "channels_first") { + // PoolingKernelUtil::Maxpool2dForwardCFirst( + // ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + // } else if (data_format == "channels_last") { + // PoolingKernelUtil::Maxpool2dForwardCLast( + // ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + // } else { + // UNIMPLEMENTED() << "Unsupported data_format"; + // } + // } + + NdIndexOffsetHelper index_helper(y_vector.data()); + if (data_format == "channels_first") { + PoolingKernelUtil::Maxpool2dForwardCFirst( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else if (data_format == "channels_last") { + PoolingKernelUtil::Maxpool2dForwardCLast( ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else if (data_format == "channels_last") { - PoolingKernelUtil::Maxpool2dForwardCLast( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else { - UNIMPLEMENTED() << "Unsupported data_format"; - } - }else{ - NdIndexOffsetHelper index_helper(y_vector.data()); - if (data_format == "channels_first") { - PoolingKernelUtil::Maxpool2dForwardCFirst( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else if (data_format == "channels_last") { - PoolingKernelUtil::Maxpool2dForwardCLast( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else { - UNIMPLEMENTED() << "Unsupported data_format"; - } + } else { + UNIMPLEMENTED() << "Unsupported data_format"; } }; }; diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index 3b68b5a5c1e..918fcfcb8cb 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -218,9 +218,9 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( IDX n, c, h, w; index_helper.OffsetToNdIndex(num, n, c, h, w); - const IDX start_idx = (n * n_channel + c) * x_width * x_height; - IDX hstart = h * stride_h - padding_h; - IDX wstart = w * stride_w - padding_w; + const int32_t start_idx = (n * n_channel + c) * x_width * x_height; + int32_t hstart = h * stride_h - padding_h; + int32_t wstart = w * stride_w - padding_w; /* const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height ? (hstart + (kernel_size_h - 1) * dilation_h + 1) @@ -231,8 +231,9 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( */ #ifdef WITH_CUDA - const IDX hend = device_min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); - const IDX wend = device_min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height); + const int32_t hend = device_min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); + const int32_t wend = device_min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height); + // const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height // ? (hstart + (kernel_size_h - 1) * dilation_h + 1) // : x_height; @@ -240,24 +241,24 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( // ? (wstart + (kernel_size_w - 1) * dilation_w + 1) // : x_width; #else - const IDX hend = std::min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); - const IDX wend = std::min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height); + const int32_t hend = std::min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); + const int32_t wend = std::min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height); #endif while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ - IDX max_index = hstart * x_width + wstart; + int32_t max_index = hstart * x_width + wstart; // IDX src_idx = 0; /* equal to -std::numeric_limits::infinity(); */ T max_value = detail::numeric_limits::lower_bound(); const T* btm_data = src + start_idx; - for (IDX i = hstart; i < hend; i += dilation_h) { - for (IDX j = wstart; j < wend; j += dilation_w) { - const IDX window_idx = i * x_width + j; + for (int32_t i = hstart; i < hend; i += dilation_h) { + for (int32_t j = wstart; j < wend; j += dilation_w) { + const int32_t window_idx = i * x_width + j; // const IDX search_idx = start_idx + window_idx; // T val = src[search_idx]; T val = btm_data[window_idx]; From 385c962e97a6386bd614e3641fcad5b4dce82daa Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 8 Feb 2022 11:36:55 +0800 Subject: [PATCH 10/17] just for profile --- oneflow/user/kernels/pooling_kernel.cpp | 4 +-- oneflow/user/kernels/pooling_kernel_util.h | 42 +++++++++++----------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index 62f6a03fca8..49070b51f9a 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -300,8 +300,8 @@ class MaxPool2dKernel final : public user_op::OpKernel { const auto* pooling_cache = dynamic_cast(cache); const MaxPoolingParams3D& params_3d = pooling_cache->GetParams3D(); - // const int64_t elem_num = y->shape().elem_cnt(); - const int32_t elem_num = y->shape().elem_cnt(); + const int64_t elem_num = y->shape().elem_cnt(); + // const int32_t elem_num = y->shape().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index 918fcfcb8cb..5e548fb4224 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -55,10 +55,11 @@ struct DeviceAdd { }; #ifdef WITH_CUDA - template - __device__ inline T device_min(T a, T b) { - return a <= b ? a : b; - } + +OF_DEVICE_FUNC int32_t device_min(int32_t a, int32_t b) { + return a <= b ? a : b; +} + #endif class MaxPoolingParams3D { @@ -217,8 +218,6 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n, c, h, w; index_helper.OffsetToNdIndex(num, n, c, h, w); - - const int32_t start_idx = (n * n_channel + c) * x_width * x_height; int32_t hstart = h * stride_h - padding_h; int32_t wstart = w * stride_w - padding_w; /* @@ -229,11 +228,9 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( ? (wstart + (kernel_size_w - 1) * dilation_w + 1) : x_width; */ - #ifdef WITH_CUDA - const int32_t hend = device_min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); - const int32_t wend = device_min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height); - + const int32_t hend = device_min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); + const int32_t wend = device_min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_width); // const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height // ? (hstart + (kernel_size_h - 1) * dilation_h + 1) // : x_height; @@ -247,21 +244,18 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } - + /* equal to -std::numeric_limits::infinity(); */ + T max_value = -10000; /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ int32_t max_index = hstart * x_width + wstart; - // IDX src_idx = 0; - - /* equal to -std::numeric_limits::infinity(); */ - T max_value = detail::numeric_limits::lower_bound(); - - const T* btm_data = src + start_idx; + const T* btm_data = src + (n * n_channel + c) * x_width * x_height; for (int32_t i = hstart; i < hend; i += dilation_h) { for (int32_t j = wstart; j < wend; j += dilation_w) { - const int32_t window_idx = i * x_width + j; + // const int32_t window_idx = i * x_width + j; // const IDX search_idx = start_idx + window_idx; // T val = src[search_idx]; - T val = btm_data[window_idx]; + // T val = btm_data[window_idx]; + T val = btm_data[i * x_width + j]; /* NOTE: std::isnan(val) only supports a few data types, see: https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile, @@ -273,14 +267,18 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown! */ if (val > max_value || detail::numerics::isnan(val)) { - max_value = val; - max_index = window_idx; + max_index = i * x_width + j; + // max_value = val; + max_value = static_cast(val); + + // max_index = window_idx; // src_idx = search_idx; } } } // dest[num] = src[src_idx]; - dest[num] = max_value; + // dest[num] = max_value; + dest[num] = static_cast(max_value); indice_ptr[num] = max_index; } } From 1c39ee27f4d10f13be9e32003c67ddcaec2707aa Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 8 Feb 2022 17:05:16 +0800 Subject: [PATCH 11/17] reduce div --- oneflow/user/kernels/pooling_kernel.cpp | 17 +++++--- oneflow/user/kernels/pooling_kernel.cu | 4 +- oneflow/user/kernels/pooling_kernel_util.h | 47 +++++----------------- 3 files changed, 25 insertions(+), 43 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index 49070b51f9a..0eac427f710 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -118,7 +118,7 @@ struct PoolingKernelUtil { } static void Maxpool2dForwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool2dForwardComputeCFirst( @@ -307,8 +307,8 @@ class MaxPool2dKernel final : public user_op::OpKernel { T* dest = y->mut_dptr(); int64_t* indice_ptr = indice->mut_dptr(); - DimVector y_vector; - y->shape().ToDimVector(&y_vector); + + const std::string& data_format = ctx->Attr("data_format"); // if(elem_num < GetMaxVal()){ // NdIndexOffsetHelper index_helper(y_vector.data()); @@ -334,11 +334,18 @@ class MaxPool2dKernel final : public user_op::OpKernel { // } // } - NdIndexOffsetHelper index_helper(y_vector.data()); if (data_format == "channels_first") { - PoolingKernelUtil::Maxpool2dForwardCFirst( + DimVector y_vector(3); + y_vector.at(0) = y->shape().At(0) * y->shape().At(1); + y_vector.at(1) = y->shape().At(2); + y_vector.at(2) = y->shape().At(3); + NdIndexOffsetHelper index_helper(y_vector.data()); + PoolingKernelUtil::Maxpool2dForwardCFirst( ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } else if (data_format == "channels_last") { + DimVector y_vector; + y->shape().ToDimVector(&y_vector); + NdIndexOffsetHelper index_helper(y_vector.data()); PoolingKernelUtil::Maxpool2dForwardCLast( ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } else { diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu index 84373a0cd26..8ac9bfaacfa 100644 --- a/oneflow/user/kernels/pooling_kernel.cu +++ b/oneflow/user/kernels/pooling_kernel.cu @@ -96,7 +96,7 @@ __launch_bounds__(kBlockSize) __global__ template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, + void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, int32_t padding_w, int32_t n_batch, int32_t n_channel, int32_t x_height, int32_t x_width, @@ -214,7 +214,7 @@ struct PoolingKernelUtil { } static void Maxpool2dForwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dForwardCFirst<<& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); @@ -209,53 +209,34 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper i template OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( - const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, + const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_height, const int32_t x_width, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, const int32_t dilation_w) { XPU_1D_KERNEL_LOOP(num, elem_num) { - IDX n, c, h, w; - index_helper.OffsetToNdIndex(num, n, c, h, w); + IDX n_c, h, w; + index_helper.OffsetToNdIndex(num, n_c, h, w); int32_t hstart = h * stride_h - padding_h; int32_t wstart = w * stride_w - padding_w; - /* const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height ? (hstart + (kernel_size_h - 1) * dilation_h + 1) : x_height; const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width ? (wstart + (kernel_size_w - 1) * dilation_w + 1) : x_width; - */ - #ifdef WITH_CUDA - const int32_t hend = device_min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); - const int32_t wend = device_min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_width); - // const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height - // ? (hstart + (kernel_size_h - 1) * dilation_h + 1) - // : x_height; - // const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width - // ? (wstart + (kernel_size_w - 1) * dilation_w + 1) - // : x_width; - #else - const int32_t hend = std::min((hstart + (kernel_size_h - 1) * dilation_h + 1), x_height); - const int32_t wend = std::min((wstart + (kernel_size_w - 1) * dilation_w + 1), x_height); - #endif - while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } /* equal to -std::numeric_limits::infinity(); */ - T max_value = -10000; + T max_value = detail::numeric_limits::lower_bound(); /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ int32_t max_index = hstart * x_width + wstart; - const T* btm_data = src + (n * n_channel + c) * x_width * x_height; + const T* btm_data = src + n_c * x_width * x_height; for (int32_t i = hstart; i < hend; i += dilation_h) { for (int32_t j = wstart; j < wend; j += dilation_w) { - // const int32_t window_idx = i * x_width + j; - // const IDX search_idx = start_idx + window_idx; - // T val = src[search_idx]; - // T val = btm_data[window_idx]; - T val = btm_data[i * x_width + j]; + const int32_t window_idx = i * x_width + j; + T val = btm_data[window_idx]; /* NOTE: std::isnan(val) only supports a few data types, see: https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile, @@ -267,18 +248,12 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( but if use gcc/g++ 7.x to compile, everything is ok! the exact reason is still unknown! */ if (val > max_value || detail::numerics::isnan(val)) { - max_index = i * x_width + j; - // max_value = val; - max_value = static_cast(val); - - // max_index = window_idx; - // src_idx = search_idx; + max_index = window_idx; + max_value = val; } } } - // dest[num] = src[src_idx]; - // dest[num] = max_value; - dest[num] = static_cast(max_value); + dest[num] = max_value; indice_ptr[num] = max_index; } } From 5495891220108df1f98f2184c75a120ce3c5d8b3 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 8 Feb 2022 17:45:13 +0800 Subject: [PATCH 12/17] use int32_t indice --- oneflow/user/kernels/pooling_kernel.cpp | 79 +++++++++------------- oneflow/user/kernels/pooling_kernel.cu | 34 +++++----- oneflow/user/kernels/pooling_kernel_util.h | 30 ++++---- oneflow/user/ops/pooling_op.cpp | 2 +- 4 files changed, 66 insertions(+), 79 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index 0eac427f710..65aae1fe506 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -44,7 +44,7 @@ namespace { template void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_height, const int32_t x_width, @@ -99,7 +99,7 @@ template struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], params_3d.num_batch(), @@ -111,7 +111,7 @@ struct PoolingKernelUtil { static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); @@ -120,7 +120,7 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool2dForwardComputeCFirst( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), @@ -133,7 +133,7 @@ struct PoolingKernelUtil { static void Maxpool2dBackwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), @@ -144,7 +144,7 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool2dForwardComputeCLast( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), @@ -157,7 +157,7 @@ struct PoolingKernelUtil { static void Maxpool2dBackwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), @@ -167,7 +167,7 @@ struct PoolingKernelUtil { static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool3dForwardCompute( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], @@ -183,7 +183,7 @@ struct PoolingKernelUtil { static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), @@ -217,7 +217,7 @@ class MaxPool1dKernel final : public user_op::OpKernel { const int64_t elem_num = y->shape().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); - int64_t* indice_ptr = indice->mut_dptr(); + int32_t* indice_ptr = indice->mut_dptr(); DimVector y_vector; y->shape().ToDimVector(&y_vector); @@ -258,7 +258,7 @@ class MaxPool1dGradKernel final : public user_op::OpKernel { const int64_t elem_num = dy->shape().elem_cnt(); const T* src = dy->dptr(); - const int64_t* indice_ptr = indice->dptr(); + const int32_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); DimVector dy_vector; dy->shape().ToDimVector(&dy_vector); @@ -305,49 +305,36 @@ class MaxPool2dKernel final : public user_op::OpKernel { const T* src = x->dptr(); T* dest = y->mut_dptr(); - int64_t* indice_ptr = indice->mut_dptr(); - - + int32_t* indice_ptr = indice->mut_dptr(); const std::string& data_format = ctx->Attr("data_format"); - // if(elem_num < GetMaxVal()){ - // NdIndexOffsetHelper index_helper(y_vector.data()); - // if (data_format == "channels_first") { - // PoolingKernelUtil::Maxpool2dForwardCFirst( - // ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - // } else if (data_format == "channels_last") { - // PoolingKernelUtil::Maxpool2dForwardCLast( - // ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - // } else { - // UNIMPLEMENTED() << "Unsupported data_format"; - // } - // }else{ - // NdIndexOffsetHelper index_helper(y_vector.data()); - // if (data_format == "channels_first") { - // PoolingKernelUtil::Maxpool2dForwardCFirst( - // ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - // } else if (data_format == "channels_last") { - // PoolingKernelUtil::Maxpool2dForwardCLast( - // ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - // } else { - // UNIMPLEMENTED() << "Unsupported data_format"; - // } - // } - if (data_format == "channels_first") { DimVector y_vector(3); y_vector.at(0) = y->shape().At(0) * y->shape().At(1); y_vector.at(1) = y->shape().At(2); y_vector.at(2) = y->shape().At(3); - NdIndexOffsetHelper index_helper(y_vector.data()); - PoolingKernelUtil::Maxpool2dForwardCFirst( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(y_vector.data()); + PoolingKernelUtil::Maxpool2dForwardCFirst( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } + else{ + NdIndexOffsetHelper index_helper(y_vector.data()); + PoolingKernelUtil::Maxpool2dForwardCFirst( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } } else if (data_format == "channels_last") { DimVector y_vector; y->shape().ToDimVector(&y_vector); - NdIndexOffsetHelper index_helper(y_vector.data()); - PoolingKernelUtil::Maxpool2dForwardCLast( + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(y_vector.data()); + PoolingKernelUtil::Maxpool2dForwardCLast( ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + }else{ + NdIndexOffsetHelper index_helper(y_vector.data()); + PoolingKernelUtil::Maxpool2dForwardCLast( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } } else { UNIMPLEMENTED() << "Unsupported data_format"; } @@ -378,7 +365,7 @@ class MaxPool2dGradKernel final : public user_op::OpKernel { const int64_t elem_num = dy->shape().elem_cnt(); const T* src = dy->dptr(); - const int64_t* indice_ptr = indice->dptr(); + const int32_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); DimVector dy_vector; dy->shape().ToDimVector(&dy_vector); @@ -438,7 +425,7 @@ class MaxPool3dKernel final : public user_op::OpKernel { const int64_t elem_num = y->shape().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); - int64_t* indice_ptr = indice->mut_dptr(); + int32_t* indice_ptr = indice->mut_dptr(); DimVector y_vector; y->shape().ToDimVector(&y_vector); @@ -478,7 +465,7 @@ class MaxPool3dGradKernel final : public user_op::OpKernel { const int64_t elem_num = dy->shape().elem_cnt(); const T* src = dy->dptr(); - const int64_t* indice_ptr = indice->dptr(); + const int32_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); DimVector dy_vector; diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu index 8ac9bfaacfa..f8a590d00cc 100644 --- a/oneflow/user/kernels/pooling_kernel.cu +++ b/oneflow/user/kernels/pooling_kernel.cu @@ -35,7 +35,7 @@ int GetNumBlocks(int64_t elem_cnt) { template __device__ __inline__ void Maxpool2dForwardComputeCLast( const NdIndexOffsetHelper& index_helper, IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, + int32_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height, const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, @@ -86,7 +86,7 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast( template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, int32_t padding_l, int64_t n_batch, int64_t n_channel, int64_t x_length, int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) { @@ -97,7 +97,7 @@ __launch_bounds__(kBlockSize) __global__ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, int32_t padding_h, int32_t padding_w, int32_t n_batch, int32_t n_channel, int32_t x_height, int32_t x_width, int32_t kernel_size_h, int32_t kernel_size_w, @@ -112,7 +112,7 @@ __launch_bounds__(kBlockSize) __global__ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, int32_t padding_h, int32_t padding_w, int64_t n_batch, int64_t n_channel, int64_t x_height, int64_t x_width, int64_t y_height, int64_t y_width, int32_t kernel_size_h, @@ -127,7 +127,7 @@ __launch_bounds__(kBlockSize) __global__ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, int32_t padding_t, int32_t padding_h, int32_t padding_w, int64_t n_batch, int64_t n_channel, int64_t x_time, int64_t x_height, int64_t x_width, @@ -145,7 +145,7 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, + const int32_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_length, const int64_t dst_length) { Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, @@ -156,7 +156,7 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, + const int32_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_height, const int64_t src_width, const int64_t dst_height, const int64_t dst_width) { @@ -168,7 +168,7 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, + const int32_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_height, const int64_t src_width, const int64_t dst_height, const int64_t dst_width) { @@ -180,7 +180,7 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, + const int32_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_time, const int64_t src_height, const int64_t src_width, const int64_t dst_time, const int64_t dst_height, @@ -193,7 +193,7 @@ template struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool1dForward<<As()->cuda_stream()>>>( @@ -206,7 +206,7 @@ struct PoolingKernelUtil { static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool1dBackward<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), @@ -216,7 +216,7 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dForwardCFirst<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], @@ -230,7 +230,7 @@ struct PoolingKernelUtil { static void Maxpool2dBackwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dBackwardCFirst<<As()->cuda_stream()>>>( @@ -242,7 +242,7 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dForwardCLast<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], @@ -256,7 +256,7 @@ struct PoolingKernelUtil { static void Maxpool2dBackwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dBackwardCLast<<As()->cuda_stream()>>>( @@ -267,7 +267,7 @@ struct PoolingKernelUtil { static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool3dForward<<As()->cuda_stream()>>>( @@ -285,7 +285,7 @@ struct PoolingKernelUtil { static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool3dBackward<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index 2e43d1ff033..208dda34e45 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -104,51 +104,51 @@ template struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dForwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); + int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dBackwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dForwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); + int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dBackwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); + const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); }; template OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const int32_t padding_l, + int32_t* indice_ptr, const int32_t padding_l, const int32_t n_batch, const int32_t n_channel, const int32_t x_length, const int32_t kernel_size_l, const int32_t stride_l, const int32_t dilation_l) { @@ -189,7 +189,7 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper in template OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int32_t n_batch, + const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_length, const int32_t dst_length) { XPU_1D_KERNEL_LOOP(num, elem_num) { @@ -210,7 +210,7 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper i template OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, + int32_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_height, const int32_t x_width, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, @@ -261,7 +261,7 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, - T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, + T* dest, const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_height, const int32_t src_width, const int32_t dst_height, const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { @@ -283,7 +283,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast( const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, - T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, + T* dest, const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_height, const int32_t src_width, const int32_t dst_height, const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { @@ -303,7 +303,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast( template OF_DEVICE_FUNC void Maxpool3dForwardCompute( const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, + int32_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height, const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w, @@ -358,7 +358,7 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute( template OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int32_t n_batch, + const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_time, const int32_t src_height, const int32_t src_width, const int32_t dst_time, const int32_t dst_height, diff --git a/oneflow/user/ops/pooling_op.cpp b/oneflow/user/ops/pooling_op.cpp index ffb7d85e1ba..31289e5da9f 100644 --- a/oneflow/user/ops/pooling_op.cpp +++ b/oneflow/user/ops/pooling_op.cpp @@ -56,7 +56,7 @@ TensorDescInferFn MaxPoolMakeForwardTensorDescInferFn(const int32_t dim) { *indice_desc = *ctx->OutputTensorDesc("y", 0); *indice_desc->mut_shape() = *y_desc->mut_shape(); DataType* dtype = indice_desc->mut_data_type(); - *dtype = kInt64; + *dtype = kInt32; return Maybe::Ok(); }; } From f62ac8fefb1d7197f6c68ea51b746c9d5e6cbf57 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Tue, 8 Feb 2022 17:58:12 +0800 Subject: [PATCH 13/17] revert back to use int64_t --- oneflow/user/kernels/pooling_kernel.cpp | 30 +++++++++---------- oneflow/user/kernels/pooling_kernel.cu | 34 +++++++++++----------- oneflow/user/kernels/pooling_kernel_util.h | 30 +++++++++---------- oneflow/user/ops/pooling_op.cpp | 2 +- 4 files changed, 48 insertions(+), 48 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index 65aae1fe506..4220bd26fbf 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -44,7 +44,7 @@ namespace { template void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helper, - IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_height, const int32_t x_width, @@ -99,7 +99,7 @@ template struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], params_3d.num_batch(), @@ -111,7 +111,7 @@ struct PoolingKernelUtil { static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); @@ -120,7 +120,7 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool2dForwardComputeCFirst( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), @@ -133,7 +133,7 @@ struct PoolingKernelUtil { static void Maxpool2dBackwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), @@ -144,7 +144,7 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool2dForwardComputeCLast( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), @@ -157,7 +157,7 @@ struct PoolingKernelUtil { static void Maxpool2dBackwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), @@ -167,7 +167,7 @@ struct PoolingKernelUtil { static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool3dForwardCompute( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], @@ -183,7 +183,7 @@ struct PoolingKernelUtil { static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), @@ -217,7 +217,7 @@ class MaxPool1dKernel final : public user_op::OpKernel { const int64_t elem_num = y->shape().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); - int32_t* indice_ptr = indice->mut_dptr(); + int64_t* indice_ptr = indice->mut_dptr(); DimVector y_vector; y->shape().ToDimVector(&y_vector); @@ -258,7 +258,7 @@ class MaxPool1dGradKernel final : public user_op::OpKernel { const int64_t elem_num = dy->shape().elem_cnt(); const T* src = dy->dptr(); - const int32_t* indice_ptr = indice->dptr(); + const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); DimVector dy_vector; dy->shape().ToDimVector(&dy_vector); @@ -305,7 +305,7 @@ class MaxPool2dKernel final : public user_op::OpKernel { const T* src = x->dptr(); T* dest = y->mut_dptr(); - int32_t* indice_ptr = indice->mut_dptr(); + int64_t* indice_ptr = indice->mut_dptr(); const std::string& data_format = ctx->Attr("data_format"); if (data_format == "channels_first") { @@ -365,7 +365,7 @@ class MaxPool2dGradKernel final : public user_op::OpKernel { const int64_t elem_num = dy->shape().elem_cnt(); const T* src = dy->dptr(); - const int32_t* indice_ptr = indice->dptr(); + const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); DimVector dy_vector; dy->shape().ToDimVector(&dy_vector); @@ -425,7 +425,7 @@ class MaxPool3dKernel final : public user_op::OpKernel { const int64_t elem_num = y->shape().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); - int32_t* indice_ptr = indice->mut_dptr(); + int64_t* indice_ptr = indice->mut_dptr(); DimVector y_vector; y->shape().ToDimVector(&y_vector); @@ -465,7 +465,7 @@ class MaxPool3dGradKernel final : public user_op::OpKernel { const int64_t elem_num = dy->shape().elem_cnt(); const T* src = dy->dptr(); - const int32_t* indice_ptr = indice->dptr(); + const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); DimVector dy_vector; diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu index f8a590d00cc..8ac9bfaacfa 100644 --- a/oneflow/user/kernels/pooling_kernel.cu +++ b/oneflow/user/kernels/pooling_kernel.cu @@ -35,7 +35,7 @@ int GetNumBlocks(int64_t elem_cnt) { template __device__ __inline__ void Maxpool2dForwardComputeCLast( const NdIndexOffsetHelper& index_helper, IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, + int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int64_t n_batch, const int64_t n_channel, const int64_t x_height, const int64_t x_width, const int64_t y_height, const int64_t y_width, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, @@ -86,7 +86,7 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast( template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_l, int64_t n_batch, int64_t n_channel, int64_t x_length, int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) { @@ -97,7 +97,7 @@ __launch_bounds__(kBlockSize) __global__ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, int32_t padding_w, int32_t n_batch, int32_t n_channel, int32_t x_height, int32_t x_width, int32_t kernel_size_h, int32_t kernel_size_w, @@ -112,7 +112,7 @@ __launch_bounds__(kBlockSize) __global__ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, int32_t padding_w, int64_t n_batch, int64_t n_channel, int64_t x_height, int64_t x_width, int64_t y_height, int64_t y_width, int32_t kernel_size_h, @@ -127,7 +127,7 @@ __launch_bounds__(kBlockSize) __global__ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t, int32_t padding_h, int32_t padding_w, int64_t n_batch, int64_t n_channel, int64_t x_time, int64_t x_height, int64_t x_width, @@ -145,7 +145,7 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const int64_t n_batch, + const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_length, const int64_t dst_length) { Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, @@ -156,7 +156,7 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const int64_t n_batch, + const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_height, const int64_t src_width, const int64_t dst_height, const int64_t dst_width) { @@ -168,7 +168,7 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const int64_t n_batch, + const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_height, const int64_t src_width, const int64_t dst_height, const int64_t dst_width) { @@ -180,7 +180,7 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const int64_t n_batch, + const int64_t* indice_ptr, const int64_t n_batch, const int64_t n_channel, const int64_t src_time, const int64_t src_height, const int64_t src_width, const int64_t dst_time, const int64_t dst_height, @@ -193,7 +193,7 @@ template struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool1dForward<<As()->cuda_stream()>>>( @@ -206,7 +206,7 @@ struct PoolingKernelUtil { static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool1dBackward<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), @@ -216,7 +216,7 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dForwardCFirst<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], @@ -230,7 +230,7 @@ struct PoolingKernelUtil { static void Maxpool2dBackwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dBackwardCFirst<<As()->cuda_stream()>>>( @@ -242,7 +242,7 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dForwardCLast<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], @@ -256,7 +256,7 @@ struct PoolingKernelUtil { static void Maxpool2dBackwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dBackwardCLast<<As()->cuda_stream()>>>( @@ -267,7 +267,7 @@ struct PoolingKernelUtil { static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool3dForward<<As()->cuda_stream()>>>( @@ -285,7 +285,7 @@ struct PoolingKernelUtil { static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool3dBackward<<As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index 208dda34e45..2e43d1ff033 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -104,51 +104,51 @@ template struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dForwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); + int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dBackwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dForwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); + int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dBackwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, int32_t* indice_ptr, + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const MaxPoolingParams3D& params_3d); + const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); }; template OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const int32_t padding_l, + int64_t* indice_ptr, const int32_t padding_l, const int32_t n_batch, const int32_t n_channel, const int32_t x_length, const int32_t kernel_size_l, const int32_t stride_l, const int32_t dilation_l) { @@ -189,7 +189,7 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper in template OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const int32_t n_batch, + const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_length, const int32_t dst_length) { XPU_1D_KERNEL_LOOP(num, elem_num) { @@ -210,7 +210,7 @@ OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper i template OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, + int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_height, const int32_t x_width, const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, @@ -261,7 +261,7 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, - T* dest, const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, + T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_height, const int32_t src_width, const int32_t dst_height, const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { @@ -283,7 +283,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast( const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, - T* dest, const int32_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, + T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_height, const int32_t src_width, const int32_t dst_height, const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { @@ -303,7 +303,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast( template OF_DEVICE_FUNC void Maxpool3dForwardCompute( const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, - int32_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, + int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height, const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w, @@ -358,7 +358,7 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute( template OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int32_t* indice_ptr, const int32_t n_batch, + const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_time, const int32_t src_height, const int32_t src_width, const int32_t dst_time, const int32_t dst_height, diff --git a/oneflow/user/ops/pooling_op.cpp b/oneflow/user/ops/pooling_op.cpp index 31289e5da9f..ffb7d85e1ba 100644 --- a/oneflow/user/ops/pooling_op.cpp +++ b/oneflow/user/ops/pooling_op.cpp @@ -56,7 +56,7 @@ TensorDescInferFn MaxPoolMakeForwardTensorDescInferFn(const int32_t dim) { *indice_desc = *ctx->OutputTensorDesc("y", 0); *indice_desc->mut_shape() = *y_desc->mut_shape(); DataType* dtype = indice_desc->mut_data_type(); - *dtype = kInt32; + *dtype = kInt64; return Maybe::Ok(); }; } From 1d8843f6da2d1822fabc7f6f63bccb1e785e9c75 Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Wed, 9 Feb 2022 09:08:50 +0800 Subject: [PATCH 14/17] fix maxpool1d 3d --- oneflow/user/kernels/pooling_kernel.cpp | 26 +++++++++++++--------- oneflow/user/kernels/pooling_kernel.cu | 8 +++---- oneflow/user/kernels/pooling_kernel_util.h | 22 +++++++++--------- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index 4220bd26fbf..41236654dcc 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -98,7 +98,7 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helpe template struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, @@ -166,7 +166,7 @@ struct PoolingKernelUtil { } static void Maxpool3dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool3dForwardCompute( @@ -219,15 +219,15 @@ class MaxPool1dKernel final : public user_op::OpKernel { T* dest = y->mut_dptr(); int64_t* indice_ptr = indice->mut_dptr(); - DimVector y_vector; - y->shape().ToDimVector(&y_vector); - + DimVector y_vector(2); + y_vector.at(0) = y->shape().At(0) * y->shape().At(1); + y_vector.at(1) = y->shape().At(2); if(elem_num < GetMaxVal()){ - NdIndexOffsetHelper index_helper(y_vector.data()); + NdIndexOffsetHelper index_helper(y_vector.data()); PoolingKernelUtil::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); }else{ - NdIndexOffsetHelper index_helper(y_vector.data()); + NdIndexOffsetHelper index_helper(y_vector.data()); PoolingKernelUtil::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } @@ -427,14 +427,18 @@ class MaxPool3dKernel final : public user_op::OpKernel { T* dest = y->mut_dptr(); int64_t* indice_ptr = indice->mut_dptr(); - DimVector y_vector; - y->shape().ToDimVector(&y_vector); + DimVector y_vector(4); + y_vector.at(0) = y->shape().At(0) * y->shape().At(1); + y_vector.at(1) = y->shape().At(2); + y_vector.at(2) = y->shape().At(3); + y_vector.at(3) = y->shape().At(4); + if(elem_num < GetMaxVal()){ - NdIndexOffsetHelper index_helper(y_vector.data()); + NdIndexOffsetHelper index_helper(y_vector.data()); PoolingKernelUtil::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); }else{ - NdIndexOffsetHelper index_helper(y_vector.data()); + NdIndexOffsetHelper index_helper(y_vector.data()); PoolingKernelUtil::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu index 8ac9bfaacfa..ef3b8228328 100644 --- a/oneflow/user/kernels/pooling_kernel.cu +++ b/oneflow/user/kernels/pooling_kernel.cu @@ -85,7 +85,7 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast( template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, + void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_l, int64_t n_batch, int64_t n_channel, int64_t x_length, int32_t kernel_size_l, @@ -126,7 +126,7 @@ __launch_bounds__(kBlockSize) __global__ template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, + void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t, int32_t padding_h, int32_t padding_w, int64_t n_batch, int64_t n_channel, int64_t x_time, @@ -192,7 +192,7 @@ __launch_bounds__(kBlockSize) __global__ template struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool1dForward<< { } static void Maxpool3dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool3dForward<< struct PoolingKernelUtil { static void Maxpool1dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); @@ -135,7 +135,7 @@ struct PoolingKernelUtil { const MaxPoolingParams3D& params_3d); static void Maxpool3dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); @@ -146,18 +146,17 @@ struct PoolingKernelUtil { }; template -OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper index_helper, +OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_l, const int32_t n_batch, const int32_t n_channel, const int32_t x_length, const int32_t kernel_size_l, const int32_t stride_l, const int32_t dilation_l) { XPU_1D_KERNEL_LOOP(num, elem_num) { - IDX n, c, l; - index_helper.OffsetToNdIndex(num, n, c, l); + IDX n_c, l; + index_helper.OffsetToNdIndex(num, n_c, l); - // n, c, l->index = n*c*l + c* l - const IDX start_idx = (n * n_channel + c) * x_length; + const IDX start_idx = n_c * x_length; IDX lstart = l * stride_l - padding_l; const IDX lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length ? (lstart + (kernel_size_l - 1) * dilation_l + 1) @@ -302,7 +301,7 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast( template OF_DEVICE_FUNC void Maxpool3dForwardCompute( - const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, + const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height, const int32_t x_width, @@ -310,11 +309,10 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute( const int32_t stride_t, const int32_t stride_h, const int32_t stride_w, const int32_t dilation_t, const int32_t dilation_h, const int32_t dilation_w) { XPU_1D_KERNEL_LOOP(num, elem_num) { - IDX n, c, t, h, w; - index_helper.OffsetToNdIndex(num, n, c, t, h, w); + IDX n_c, t, h, w; + index_helper.OffsetToNdIndex(num, n_c, t, h, w); - IDX xstart = n * n_channel * x_time * x_width * x_height; - IDX start_idx = xstart + c * x_time * x_width * x_height; + IDX start_idx = n_c * x_time * x_width * x_height; IDX tstart = t * stride_t - padding_t; IDX hstart = h * stride_h - padding_h; IDX wstart = w * stride_w - padding_w; From 36a6fc85c1163e1cf2347aab597f44276e78663b Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Wed, 9 Feb 2022 10:37:18 +0800 Subject: [PATCH 15/17] optimize backward --- oneflow/user/kernels/pooling_kernel.cpp | 65 ++++++++++--------- oneflow/user/kernels/pooling_kernel.cu | 58 ++++++++--------- oneflow/user/kernels/pooling_kernel_util.h | 73 ++++++++++------------ 3 files changed, 99 insertions(+), 97 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index 41236654dcc..132ce9f41e4 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -109,7 +109,7 @@ struct PoolingKernelUtil { } static void Maxpool1dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, @@ -131,7 +131,7 @@ struct PoolingKernelUtil { } static void Maxpool2dBackwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { @@ -181,7 +181,7 @@ struct PoolingKernelUtil { } static void Maxpool3dBackward(ep::Stream* stream, - const NdIndexOffsetHelper index_helper, + const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, @@ -260,17 +260,18 @@ class MaxPool1dGradKernel final : public user_op::OpKernel { const T* src = dy->dptr(); const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); - DimVector dy_vector; - dy->shape().ToDimVector(&dy_vector); + DimVector dy_vector(2); + dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); + dy_vector.at(1) = dy->shape().At(2); size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); if(elem_num < GetMaxVal()){ - NdIndexOffsetHelper index_helper(dy_vector.data()); + NdIndexOffsetHelper index_helper(dy_vector.data()); PoolingKernelUtil::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); }else{ - NdIndexOffsetHelper index_helper(dy_vector.data()); + NdIndexOffsetHelper index_helper(dy_vector.data()); PoolingKernelUtil::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } @@ -367,35 +368,40 @@ class MaxPool2dGradKernel final : public user_op::OpKernel { const T* src = dy->dptr(); const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); - DimVector dy_vector; - dy->shape().ToDimVector(&dy_vector); size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); const std::string& data_format = ctx->Attr("data_format"); - if(elem_num < GetMaxVal()){ - NdIndexOffsetHelper index_helper(dy_vector.data()); - if (data_format == "channels_first") { + + if (data_format == "channels_first") { + DimVector dy_vector(3); + dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); + dy_vector.at(1) = dy->shape().At(2); + dy_vector.at(2) = dy->shape().At(3); + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(dy_vector.data()); PoolingKernelUtil::Maxpool2dBackwardCFirst( ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else if (data_format == "channels_last") { - PoolingKernelUtil::Maxpool2dBackwardCLast( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } else { - UNIMPLEMENTED() << "Unsupported data_format"; - } - }else{ - NdIndexOffsetHelper index_helper(dy_vector.data()); - if (data_format == "channels_first") { + NdIndexOffsetHelper index_helper(dy_vector.data()); PoolingKernelUtil::Maxpool2dBackwardCFirst( ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } else if (data_format == "channels_last") { - PoolingKernelUtil::Maxpool2dBackwardCLast( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } + } else if (data_format == "channels_last") { + DimVector dy_vector; + dy->shape().ToDimVector(&dy_vector); + if(elem_num < GetMaxVal()){ + NdIndexOffsetHelper index_helper(dy_vector.data()); + PoolingKernelUtil::Maxpool2dBackwardCLast( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } else { - UNIMPLEMENTED() << "Unsupported data_format"; + NdIndexOffsetHelper index_helper(dy_vector.data()); + PoolingKernelUtil::Maxpool2dBackwardCLast( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } + } else { + UNIMPLEMENTED() << "Unsupported data_format"; } }; }; @@ -472,18 +478,21 @@ class MaxPool3dGradKernel final : public user_op::OpKernel { const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); - DimVector dy_vector; - dy->shape().ToDimVector(&dy_vector); + DimVector dy_vector(4); + dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); + dy_vector.at(1) = dy->shape().At(2); + dy_vector.at(2) = dy->shape().At(3); + dy_vector.at(3) = dy->shape().At(4); size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); if(elem_num < GetMaxVal()){ - NdIndexOffsetHelper index_helper(dy_vector.data()); + NdIndexOffsetHelper index_helper(dy_vector.data()); PoolingKernelUtil::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); }else { - NdIndexOffsetHelper index_helper(dy_vector.data()); + NdIndexOffsetHelper index_helper(dy_vector.data()); PoolingKernelUtil::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu index ef3b8228328..097ff903ec3 100644 --- a/oneflow/user/kernels/pooling_kernel.cu +++ b/oneflow/user/kernels/pooling_kernel.cu @@ -87,8 +87,8 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - int32_t padding_l, int64_t n_batch, int64_t n_channel, - int64_t x_length, int32_t kernel_size_l, + int32_t padding_l, int32_t n_batch, int32_t n_channel, + int32_t x_length, int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) { Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch, n_channel, x_length, kernel_size_l, stride_l, dilation_l); @@ -113,9 +113,9 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - int32_t padding_h, int32_t padding_w, int64_t n_batch, - int64_t n_channel, int64_t x_height, int64_t x_width, - int64_t y_height, int64_t y_width, int32_t kernel_size_h, + int32_t padding_h, int32_t padding_w, int32_t n_batch, + int32_t n_channel, int32_t x_height, int32_t x_width, + int32_t y_height, int32_t y_width, int32_t kernel_size_h, int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w) { Maxpool2dForwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, padding_h, @@ -129,8 +129,8 @@ __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t, int32_t padding_h, int32_t padding_w, - int64_t n_batch, int64_t n_channel, int64_t x_time, - int64_t x_height, int64_t x_width, + int32_t n_batch, int32_t n_channel, int32_t x_time, + int32_t x_height, int32_t x_width, int32_t kernel_size_t, int32_t kernel_size_h, int32_t kernel_size_w, int32_t stride_t, int32_t stride_h, int32_t stride_w, int32_t dilation_t, int32_t dilation_h, @@ -143,23 +143,23 @@ __launch_bounds__(kBlockSize) __global__ template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, + void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, - const int64_t n_channel, const int64_t src_length, - const int64_t dst_length) { + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_length, + const int32_t dst_length) { Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, src_length, dst_length); }; template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, + void DoCUDAMaxPool2dBackwardCFirst(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, - const int64_t n_channel, const int64_t src_height, - const int64_t src_width, const int64_t dst_height, - const int64_t dst_width) { + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_height, + const int32_t src_width, const int32_t dst_height, + const int32_t dst_width) { Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, src_height, src_width, dst_height, dst_width); }; @@ -168,23 +168,23 @@ template __launch_bounds__(kBlockSize) __global__ void DoCUDAMaxPool2dBackwardCLast(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, - const int64_t n_channel, const int64_t src_height, - const int64_t src_width, const int64_t dst_height, - const int64_t dst_width) { + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_height, + const int32_t src_width, const int32_t dst_height, + const int32_t dst_width) { Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, src_height, src_width, dst_height, dst_width); }; template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, + void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int64_t n_batch, - const int64_t n_channel, const int64_t src_time, - const int64_t src_height, const int64_t src_width, - const int64_t dst_time, const int64_t dst_height, - const int64_t dst_width) { + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_time, + const int32_t src_height, const int32_t src_width, + const int32_t dst_time, const int32_t dst_height, + const int32_t dst_width) { Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, src_time, src_height, src_width, dst_time, dst_height, dst_width); }; @@ -204,7 +204,7 @@ struct PoolingKernelUtil { } static void Maxpool1dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool1dBackward<< { } static void Maxpool2dBackwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { @@ -283,7 +283,7 @@ struct PoolingKernelUtil { } static void Maxpool3dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool3dBackward<<& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); @@ -118,7 +118,7 @@ struct PoolingKernelUtil { int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dBackwardCFirst(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); @@ -140,7 +140,7 @@ struct PoolingKernelUtil { const MaxPoolingParams3D& params_3d); static void Maxpool3dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); }; @@ -156,7 +156,6 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper in IDX n_c, l; index_helper.OffsetToNdIndex(num, n_c, l); - const IDX start_idx = n_c * x_length; IDX lstart = l * stride_l - padding_l; const IDX lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length ? (lstart + (kernel_size_l - 1) * dilation_l + 1) @@ -166,37 +165,35 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper in /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ IDX max_index = lstart; - IDX src_idx = 0; /* equal to -std::numeric_limits::infinity(); */ T max_value = detail::numeric_limits::lower_bound(); - + const T* data = src + n_c * x_length; for (IDX idx = lstart; idx < lend; idx += dilation_l) { - const IDX search_idx = start_idx + idx; - T val = src[search_idx]; + const IDX window_idx = idx; + T val = data[window_idx]; if (val > max_value || detail::numerics::isnan(val)) { max_value = val; max_index = idx; - src_idx = search_idx; } } - dest[num] = src[src_idx]; + dest[num] = max_value; indice_ptr[num] = max_index; } } template -OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper index_helper, +OF_DEVICE_FUNC void Maxpool1dBackwardCompute(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_length, const int32_t dst_length) { XPU_1D_KERNEL_LOOP(num, elem_num) { - IDX n, c, l; - index_helper.OffsetToNdIndex(num, n, c, l); + IDX n_c, l; + index_helper.OffsetToNdIndex(num, n_c, l); - const IDX src_start = (n * n_channel + c) * src_length; - const IDX dst_start = (n * n_channel + c) * dst_length; + const IDX src_start = n_c * src_length; + const IDX dst_start = n_c * dst_length; const IDX index = src_start + l; const IDX max_index = dst_start + indice_ptr[index]; if (max_index != -1) { @@ -217,8 +214,8 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n_c, h, w; index_helper.OffsetToNdIndex(num, n_c, h, w); - int32_t hstart = h * stride_h - padding_h; - int32_t wstart = w * stride_w - padding_w; + IDX hstart = h * stride_h - padding_h; + IDX wstart = w * stride_w - padding_w; const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height ? (hstart + (kernel_size_h - 1) * dilation_h + 1) : x_height; @@ -230,12 +227,12 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( /* equal to -std::numeric_limits::infinity(); */ T max_value = detail::numeric_limits::lower_bound(); /* compute max value(src[src_idx]) in kernel box region, and save the value to dest[num] */ - int32_t max_index = hstart * x_width + wstart; - const T* btm_data = src + n_c * x_width * x_height; - for (int32_t i = hstart; i < hend; i += dilation_h) { - for (int32_t j = wstart; j < wend; j += dilation_w) { - const int32_t window_idx = i * x_width + j; - T val = btm_data[window_idx]; + IDX max_index = hstart * x_width + wstart; + const T* data = src + n_c * x_width * x_height; + for (IDX i = hstart; i < hend; i += dilation_h) { + for (IDX j = wstart; j < wend; j += dilation_w) { + const IDX window_idx = i * x_width + j; + T val = data[window_idx]; /* NOTE: std::isnan(val) only supports a few data types, see: https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile, @@ -259,16 +256,16 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( - const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, + const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_height, const int32_t src_width, const int32_t dst_height, const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { - IDX n, c, h, w; - index_helper.OffsetToNdIndex(num, n, c, h, w); + IDX n_c, h, w; + index_helper.OffsetToNdIndex(num, n_c, h, w); - const IDX src_start = (n * n_channel + c) * src_height * src_width; - const IDX dst_start = (n * n_channel + c) * dst_height * dst_width; + const IDX src_start = n_c * src_height * src_width; + const IDX dst_start = n_c * dst_height * dst_width; const IDX index = src_start + h * src_width + w; const IDX max_index = dst_start + indice_ptr[index]; @@ -312,7 +309,6 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute( IDX n_c, t, h, w; index_helper.OffsetToNdIndex(num, n_c, t, h, w); - IDX start_idx = n_c * x_time * x_width * x_height; IDX tstart = t * stride_t - padding_t; IDX hstart = h * stride_h - padding_h; IDX wstart = w * stride_w - padding_w; @@ -329,24 +325,21 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute( while (wstart < 0) { wstart += dilation_w; } IDX max_index = tstart * x_height * x_width + hstart * x_width + wstart; - IDX src_idx = 0; - + const T* data = src + n_c * x_time * x_width * x_height; T max_value = detail::numeric_limits::lower_bound(); for (IDX zi = tstart; zi < tend; zi += dilation_t) { for (IDX i = hstart; i < hend; i += dilation_h) { for (IDX j = wstart; j < wend; j += dilation_w) { const IDX window_idx = zi * x_height * x_width + i * x_width + j; - const IDX search_idx = start_idx + window_idx; - T val = src[search_idx]; + T val = data[window_idx]; if (val > max_value || detail::numerics::isnan(val)) { max_value = val; max_index = window_idx; - src_idx = search_idx; } } } /* set output to local max */ - dest[num] = src[src_idx]; + dest[num] = max_value; /* store location of max */ indice_ptr[num] = max_index; } @@ -354,7 +347,7 @@ OF_DEVICE_FUNC void Maxpool3dForwardCompute( } template -OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper index_helper, +OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_time, @@ -362,11 +355,11 @@ OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper i const int32_t dst_time, const int32_t dst_height, const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { - IDX n, c, t, h, w; - index_helper.OffsetToNdIndex(num, n, c, t, h, w); + IDX n_c, t, h, w; + index_helper.OffsetToNdIndex(num, n_c, t, h, w); - const IDX src_start = (n * n_channel + c) * src_time * src_height * src_width; - const IDX dst_start = (n * n_channel + c) * dst_time * dst_height * dst_width; + const IDX src_start = n_c * src_time * src_height * src_width; + const IDX dst_start = n_c * dst_time * dst_height * dst_width; const IDX index = src_start + t * src_height * src_width + h * src_width + w; const IDX max_index = dst_start + indice_ptr[index]; From 97cbec0c2e45fbdaf2f7b4ba50779c5ca6e02ffc Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Wed, 9 Feb 2022 11:31:55 +0800 Subject: [PATCH 16/17] fix all optimize. TODO: NHWC --- oneflow/user/kernels/pooling_kernel.cpp | 193 ++++++++++----------- oneflow/user/kernels/pooling_kernel.cu | 163 ++++++++--------- oneflow/user/kernels/pooling_kernel_util.h | 79 ++++----- 3 files changed, 204 insertions(+), 231 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index 132ce9f41e4..12d979ec536 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -43,8 +43,8 @@ std::shared_ptr CreateOpKernelCache(user_op::KernelCacheCo namespace { template -void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, +void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_height, const int32_t x_width, @@ -61,11 +61,11 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helpe IDX hstart = h * stride_h - padding_h; IDX wstart = w * stride_w - padding_w; const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height - ? (hstart + (kernel_size_h - 1) * dilation_h + 1) - : x_height; + ? (hstart + (kernel_size_h - 1) * dilation_h + 1) + : x_height; const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width - ? (wstart + (kernel_size_w - 1) * dilation_w + 1) - : x_width; + ? (wstart + (kernel_size_w - 1) * dilation_w + 1) + : x_width; while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } @@ -97,37 +97,33 @@ void Maxpool2dForwardComputeCLast(const NdIndexOffsetHelper& index_helpe template struct PoolingKernelUtil { - static void Maxpool1dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, - params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), params_3d.GetXShape5D().At(4), - params_3d.pooling_size_3d()[2], - params_3d.stride_3d()[2], params_3d.dilation_3d()[2]); + Maxpool1dForwardCompute( + index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], + params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4), + params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]); } - static void Maxpool1dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, - params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); + params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); } static void Maxpool2dForwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolingParams3D& params_3d) { Maxpool2dForwardComputeCFirst( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), - params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], - params_3d.stride_3d()[1], params_3d.stride_3d()[2], - params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), + params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[1], + params_3d.stride_3d()[2], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); } static void Maxpool2dBackwardCFirst(ep::Stream* stream, @@ -135,16 +131,16 @@ struct PoolingKernelUtil { const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, - params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); + Maxpool2dBackwardComputeCFirst( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); } static void Maxpool2dForwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolingParams3D& params_3d) { Maxpool2dForwardComputeCLast( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), @@ -159,36 +155,33 @@ struct PoolingKernelUtil { const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { - Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, - params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); + Maxpool2dBackwardComputeCLast( + index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), + params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); } - static void Maxpool3dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool3dForwardCompute( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), - params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), - params_3d.pooling_size_3d()[0], params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], - params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], - params_3d.dilation_3d()[0], + params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4), params_3d.pooling_size_3d()[0], + params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[0], + params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[0], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); } - static void Maxpool3dBackward(ep::Stream* stream, - const NdIndexOffsetHelper index_helper, + static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, - params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), - params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); + params_3d.num_batch(), params_3d.num_channel(), + params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), + params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2), + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); } }; @@ -220,18 +213,18 @@ class MaxPool1dKernel final : public user_op::OpKernel { int64_t* indice_ptr = indice->mut_dptr(); DimVector y_vector(2); - y_vector.at(0) = y->shape().At(0) * y->shape().At(1); - y_vector.at(1) = y->shape().At(2); - if(elem_num < GetMaxVal()){ + y_vector.at(0) = y->shape().At(0) * y->shape().At(1); + y_vector.at(1) = y->shape().At(2); + if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); - PoolingKernelUtil::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src, - dest, indice_ptr, params_3d); - }else{ + PoolingKernelUtil::Maxpool1dForward( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { NdIndexOffsetHelper index_helper(y_vector.data()); - PoolingKernelUtil::Maxpool1dForward(ctx->stream(), index_helper, elem_num, src, - dest, indice_ptr, params_3d); - } + PoolingKernelUtil::Maxpool1dForward( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } + } }; template @@ -261,21 +254,20 @@ class MaxPool1dGradKernel final : public user_op::OpKernel { const int64_t* indice_ptr = indice->dptr(); T* dest = dx->mut_dptr(); DimVector dy_vector(2); - dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); - dy_vector.at(1) = dy->shape().At(2); + dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); + dy_vector.at(1) = dy->shape().At(2); size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); - if(elem_num < GetMaxVal()){ + if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(dy_vector.data()); - PoolingKernelUtil::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src, - dest, indice_ptr, params_3d); - }else{ + PoolingKernelUtil::Maxpool1dBackward( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { NdIndexOffsetHelper index_helper(dy_vector.data()); - PoolingKernelUtil::Maxpool1dBackward(ctx->stream(), index_helper, elem_num, src, - dest, indice_ptr, params_3d); + PoolingKernelUtil::Maxpool1dBackward( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } - }; }; @@ -311,30 +303,29 @@ class MaxPool2dKernel final : public user_op::OpKernel { const std::string& data_format = ctx->Attr("data_format"); if (data_format == "channels_first") { DimVector y_vector(3); - y_vector.at(0) = y->shape().At(0) * y->shape().At(1); - y_vector.at(1) = y->shape().At(2); - y_vector.at(2) = y->shape().At(3); - if(elem_num < GetMaxVal()){ + y_vector.at(0) = y->shape().At(0) * y->shape().At(1); + y_vector.at(1) = y->shape().At(2); + y_vector.at(2) = y->shape().At(3); + if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); PoolingKernelUtil::Maxpool2dForwardCFirst( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - } - else{ + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { NdIndexOffsetHelper index_helper(y_vector.data()); PoolingKernelUtil::Maxpool2dForwardCFirst( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } } else if (data_format == "channels_last") { DimVector y_vector; y->shape().ToDimVector(&y_vector); - if(elem_num < GetMaxVal()){ + if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); PoolingKernelUtil::Maxpool2dForwardCLast( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); - }else{ + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { NdIndexOffsetHelper index_helper(y_vector.data()); PoolingKernelUtil::Maxpool2dForwardCLast( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } } else { UNIMPLEMENTED() << "Unsupported data_format"; @@ -373,13 +364,13 @@ class MaxPool2dGradKernel final : public user_op::OpKernel { Memset(ctx->stream(), dest, 0, out_bytes_size); const std::string& data_format = ctx->Attr("data_format"); - + if (data_format == "channels_first") { DimVector dy_vector(3); - dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); + dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); dy_vector.at(1) = dy->shape().At(2); dy_vector.at(2) = dy->shape().At(3); - if(elem_num < GetMaxVal()){ + if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(dy_vector.data()); PoolingKernelUtil::Maxpool2dBackwardCFirst( ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); @@ -391,14 +382,14 @@ class MaxPool2dGradKernel final : public user_op::OpKernel { } else if (data_format == "channels_last") { DimVector dy_vector; dy->shape().ToDimVector(&dy_vector); - if(elem_num < GetMaxVal()){ + if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(dy_vector.data()); PoolingKernelUtil::Maxpool2dBackwardCLast( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } else { NdIndexOffsetHelper index_helper(dy_vector.data()); PoolingKernelUtil::Maxpool2dBackwardCLast( - ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } } else { UNIMPLEMENTED() << "Unsupported data_format"; @@ -434,19 +425,19 @@ class MaxPool3dKernel final : public user_op::OpKernel { int64_t* indice_ptr = indice->mut_dptr(); DimVector y_vector(4); - y_vector.at(0) = y->shape().At(0) * y->shape().At(1); - y_vector.at(1) = y->shape().At(2); - y_vector.at(2) = y->shape().At(3); - y_vector.at(3) = y->shape().At(4); + y_vector.at(0) = y->shape().At(0) * y->shape().At(1); + y_vector.at(1) = y->shape().At(2); + y_vector.at(2) = y->shape().At(3); + y_vector.at(3) = y->shape().At(4); - if(elem_num < GetMaxVal()){ + if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(y_vector.data()); - PoolingKernelUtil::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src, - dest, indice_ptr, params_3d); - }else{ + PoolingKernelUtil::Maxpool3dForward( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { NdIndexOffsetHelper index_helper(y_vector.data()); - PoolingKernelUtil::Maxpool3dForward(ctx->stream(), index_helper, elem_num, src, - dest, indice_ptr, params_3d); + PoolingKernelUtil::Maxpool3dForward( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } }; }; @@ -479,22 +470,22 @@ class MaxPool3dGradKernel final : public user_op::OpKernel { T* dest = dx->mut_dptr(); DimVector dy_vector(4); - dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); + dy_vector.at(0) = dy->shape().At(0) * dy->shape().At(1); dy_vector.at(1) = dy->shape().At(2); - dy_vector.at(2) = dy->shape().At(3); - dy_vector.at(3) = dy->shape().At(4); + dy_vector.at(2) = dy->shape().At(3); + dy_vector.at(3) = dy->shape().At(4); size_t out_bytes_size = dx->shape().elem_cnt() * GetSizeOfDataType(dx->data_type()); Memset(ctx->stream(), dest, 0, out_bytes_size); - if(elem_num < GetMaxVal()){ + if (elem_num < GetMaxVal()) { NdIndexOffsetHelper index_helper(dy_vector.data()); - PoolingKernelUtil::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src, - dest, indice_ptr, params_3d); - }else { + PoolingKernelUtil::Maxpool3dBackward( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); + } else { NdIndexOffsetHelper index_helper(dy_vector.data()); - PoolingKernelUtil::Maxpool3dBackward(ctx->stream(), index_helper, elem_num, src, - dest, indice_ptr, params_3d); + PoolingKernelUtil::Maxpool3dBackward( + ctx->stream(), index_helper, elem_num, src, dest, indice_ptr, params_3d); } }; }; diff --git a/oneflow/user/kernels/pooling_kernel.cu b/oneflow/user/kernels/pooling_kernel.cu index 097ff903ec3..dc47abd2744 100644 --- a/oneflow/user/kernels/pooling_kernel.cu +++ b/oneflow/user/kernels/pooling_kernel.cu @@ -49,11 +49,11 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast( IDX hstart = h * stride_h - padding_h; IDX wstart = w * stride_w - padding_w; const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height - ? (hstart + (kernel_size_h - 1) * dilation_h + 1) - : x_height; + ? (hstart + (kernel_size_h - 1) * dilation_h + 1) + : x_height; const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width - ? (wstart + (kernel_size_w - 1) * dilation_w + 1) - : x_width; + ? (wstart + (kernel_size_w - 1) * dilation_w + 1) + : x_width; while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } @@ -85,71 +85,66 @@ __device__ __inline__ void Maxpool2dForwardComputeCLast( template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - int32_t padding_l, int32_t n_batch, int32_t n_channel, - int32_t x_length, int32_t kernel_size_l, - int32_t stride_l, int32_t dilation_l) { + void DoCUDAMaxPool1dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_l, + int32_t n_batch, int32_t n_channel, int32_t x_length, + int32_t kernel_size_l, int32_t stride_l, int32_t dilation_l) { Maxpool1dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_l, n_batch, - n_channel, x_length, kernel_size_l, stride_l, dilation_l); + n_channel, x_length, kernel_size_l, stride_l, dilation_l); }; template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - int32_t padding_h, int32_t padding_w, int32_t n_batch, - int32_t n_channel, int32_t x_height, int32_t x_width, - int32_t kernel_size_h, int32_t kernel_size_w, - int32_t stride_h, int32_t stride_w, + void DoCUDAMaxPool2dForwardCFirst(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, + int32_t padding_w, int32_t n_batch, int32_t n_channel, + int32_t x_height, int32_t x_width, int32_t kernel_size_h, + int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, int32_t dilation_h, int32_t dilation_w) { - Maxpool2dForwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, padding_h, - padding_w, n_batch, n_channel, x_height, x_width, - kernel_size_h, kernel_size_w, stride_h, stride_w, - dilation_h, dilation_w); + Maxpool2dForwardComputeCFirst( + index_helper, elem_num, src, dest, indice_ptr, padding_h, padding_w, n_batch, n_channel, + x_height, x_width, kernel_size_h, kernel_size_w, stride_h, stride_w, dilation_h, dilation_w); }; template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - int32_t padding_h, int32_t padding_w, int32_t n_batch, - int32_t n_channel, int32_t x_height, int32_t x_width, - int32_t y_height, int32_t y_width, int32_t kernel_size_h, - int32_t kernel_size_w, int32_t stride_h, int32_t stride_w, - int32_t dilation_h, int32_t dilation_w) { + void DoCUDAMaxPool2dForwardCLast(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_h, + int32_t padding_w, int32_t n_batch, int32_t n_channel, + int32_t x_height, int32_t x_width, int32_t y_height, + int32_t y_width, int32_t kernel_size_h, int32_t kernel_size_w, + int32_t stride_h, int32_t stride_w, int32_t dilation_h, + int32_t dilation_w) { Maxpool2dForwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, padding_h, - padding_w, n_batch, n_channel, x_height, x_width, y_height, - y_width, kernel_size_h, kernel_size_w, stride_h, stride_w, - dilation_h, dilation_w); + padding_w, n_batch, n_channel, x_height, x_width, y_height, + y_width, kernel_size_h, kernel_size_w, stride_h, stride_w, + dilation_h, dilation_w); }; template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, - IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, - int32_t padding_t, int32_t padding_h, int32_t padding_w, - int32_t n_batch, int32_t n_channel, int32_t x_time, - int32_t x_height, int32_t x_width, - int32_t kernel_size_t, int32_t kernel_size_h, + void DoCUDAMaxPool3dForward(const NdIndexOffsetHelper index_helper, IDX elem_num, + const T* src, T* dest, int64_t* indice_ptr, int32_t padding_t, + int32_t padding_h, int32_t padding_w, int32_t n_batch, + int32_t n_channel, int32_t x_time, int32_t x_height, + int32_t x_width, int32_t kernel_size_t, int32_t kernel_size_h, int32_t kernel_size_w, int32_t stride_t, int32_t stride_h, int32_t stride_w, int32_t dilation_t, int32_t dilation_h, int32_t dilation_w) { - Maxpool3dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_t, padding_h, - padding_w, n_batch, n_channel, x_time, x_height, x_width, - kernel_size_t, kernel_size_h, kernel_size_w, - stride_t, stride_h, stride_w, dilation_t, dilation_h, dilation_w); + Maxpool3dForwardCompute(index_helper, elem_num, src, dest, indice_ptr, padding_t, + padding_h, padding_w, n_batch, n_channel, x_time, x_height, + x_width, kernel_size_t, kernel_size_h, kernel_size_w, stride_t, + stride_h, stride_w, dilation_t, dilation_h, dilation_w); }; template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int32_t n_batch, - const int32_t n_channel, const int32_t src_length, - const int32_t dst_length) { - Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, - src_length, dst_length); + void DoCUDAMaxPool1dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, + const T* src, T* dest, const int64_t* indice_ptr, + const int32_t n_batch, const int32_t n_channel, + const int32_t src_length, const int32_t dst_length) { + Maxpool1dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_length, dst_length); }; template @@ -161,7 +156,7 @@ __launch_bounds__(kBlockSize) __global__ const int32_t src_width, const int32_t dst_height, const int32_t dst_width) { Maxpool2dBackwardComputeCFirst(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_height, src_width, dst_height, dst_width); + n_channel, src_height, src_width, dst_height, dst_width); }; template @@ -173,58 +168,54 @@ __launch_bounds__(kBlockSize) __global__ const int32_t src_width, const int32_t dst_height, const int32_t dst_width) { Maxpool2dBackwardComputeCLast(index_helper, elem_num, src, dest, indice_ptr, n_batch, - n_channel, src_height, src_width, dst_height, dst_width); + n_channel, src_height, src_width, dst_height, dst_width); }; template __launch_bounds__(kBlockSize) __global__ - void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, - const IDX elem_num, const T* src, T* dest, - const int64_t* indice_ptr, const int32_t n_batch, - const int32_t n_channel, const int32_t src_time, - const int32_t src_height, const int32_t src_width, - const int32_t dst_time, const int32_t dst_height, - const int32_t dst_width) { - Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, n_channel, - src_time, src_height, src_width, dst_time, dst_height, dst_width); + void DoCUDAMaxPool3dBackward(const NdIndexOffsetHelper index_helper, const IDX elem_num, + const T* src, T* dest, const int64_t* indice_ptr, + const int32_t n_batch, const int32_t n_channel, + const int32_t src_time, const int32_t src_height, + const int32_t src_width, const int32_t dst_time, + const int32_t dst_height, const int32_t dst_width) { + Maxpool3dBackwardCompute(index_helper, elem_num, src, dest, indice_ptr, n_batch, + n_channel, src_time, src_height, src_width, dst_time, dst_height, + dst_width); }; template struct PoolingKernelUtil { - static void Maxpool1dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool1dForward<<As()->cuda_stream()>>>( + stream->As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(4), - params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2], - params_3d.dilation_3d()[2]); + params_3d.pooling_size_3d()[2], params_3d.stride_3d()[2], params_3d.dilation_3d()[2]); } - static void Maxpool1dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool1dBackward<<As()->cuda_stream()>>>( + stream->As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(4)); } static void Maxpool2dForwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dForwardCFirst<<As()->cuda_stream()>>>( + stream->As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), - params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), - params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], - params_3d.stride_3d()[1], params_3d.stride_3d()[2], - params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); + params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), + params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[1], + params_3d.stride_3d()[2], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); } static void Maxpool2dBackwardCFirst(ep::Stream* stream, @@ -233,7 +224,7 @@ struct PoolingKernelUtil { const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dBackwardCFirst<<As()->cuda_stream()>>>( + stream->As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); @@ -241,10 +232,10 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dForwardCLast<<As()->cuda_stream()>>>( + stream->As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), params_3d.num_channel(), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4), params_3d.GetYShape5D().At(3), @@ -259,35 +250,31 @@ struct PoolingKernelUtil { const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool2dBackwardCLast<<As()->cuda_stream()>>>( + stream->As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(3), params_3d.GetXShape5D().At(4)); } - static void Maxpool3dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool3dForward<<As()->cuda_stream()>>>( + stream->As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.padding()[0], params_3d.padding()[1], params_3d.padding()[2], params_3d.num_batch(), - params_3d.num_channel(), - params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), - params_3d.GetXShape5D().At(4), - params_3d.pooling_size_3d()[0], + params_3d.num_channel(), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), + params_3d.GetXShape5D().At(4), params_3d.pooling_size_3d()[0], params_3d.pooling_size_3d()[1], params_3d.pooling_size_3d()[2], params_3d.stride_3d()[0], params_3d.stride_3d()[1], params_3d.stride_3d()[2], params_3d.dilation_3d()[0], params_3d.dilation_3d()[1], params_3d.dilation_3d()[2]); } - static void Maxpool3dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d) { DoCUDAMaxPool3dBackward<<As()->cuda_stream()>>>( + stream->As()->cuda_stream()>>>( index_helper, elem_num, src, dest, indice_ptr, params_3d.num_batch(), params_3d.num_channel(), params_3d.GetYShape5D().At(2), params_3d.GetYShape5D().At(3), params_3d.GetYShape5D().At(4), params_3d.GetXShape5D().At(2), params_3d.GetXShape5D().At(3), diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index dc99137d439..533641156a3 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -33,7 +33,7 @@ namespace oneflow { OF_PP_MAKE_TUPLE_SEQ(float, DataType::kFloat) \ OF_PP_MAKE_TUPLE_SEQ(double, DataType::kDouble) -#define POOLING_IDX_DATA_TYPE_SEQ \ +#define POOLING_IDX_DATA_TYPE_SEQ \ OF_PP_MAKE_TUPLE_SEQ(int32_t, DataType::kInt32) \ OF_PP_MAKE_TUPLE_SEQ(int64_t, DataType::kInt64) @@ -54,11 +54,9 @@ struct DeviceAdd { }; }; -#ifdef WITH_CUDA - -OF_DEVICE_FUNC int32_t device_min(int32_t a, int32_t b) { - return a <= b ? a : b; -} +#ifdef WITH_CUDA + +OF_DEVICE_FUNC int32_t device_min(int32_t a, int32_t b) { return a <= b ? a : b; } #endif @@ -102,20 +100,18 @@ class MaxPoolingParams3D { template struct PoolingKernelUtil { - static void Maxpool1dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool1dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); - static void Maxpool1dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool1dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); static void Maxpool2dForwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolingParams3D& params_3d); static void Maxpool2dBackwardCFirst(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, @@ -125,8 +121,8 @@ struct PoolingKernelUtil { static void Maxpool2dForwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, - const IDX elem_num, const T* src, T* dest, - int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); + const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, + const MaxPoolingParams3D& params_3d); static void Maxpool2dBackwardCLast(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, @@ -134,13 +130,11 @@ struct PoolingKernelUtil { const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); - static void Maxpool3dForward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool3dForward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); - static void Maxpool3dBackward(ep::Stream* stream, - const NdIndexOffsetHelper& index_helper, + static void Maxpool3dBackward(ep::Stream* stream, const NdIndexOffsetHelper& index_helper, const IDX elem_num, const T* src, T* dest, const int64_t* indice_ptr, const MaxPoolingParams3D& params_3d); }; @@ -150,7 +144,7 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper in IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_l, const int32_t n_batch, const int32_t n_channel, - const int32_t x_length, const int32_t kernel_size_l, + const int32_t x_length, const int32_t kernel_size_l, const int32_t stride_l, const int32_t dilation_l) { XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n_c, l; @@ -158,8 +152,8 @@ OF_DEVICE_FUNC void Maxpool1dForwardCompute(const NdIndexOffsetHelper in IDX lstart = l * stride_l - padding_l; const IDX lend = (lstart + (kernel_size_l - 1) * dilation_l + 1) <= x_length - ? (lstart + (kernel_size_l - 1) * dilation_l + 1) - : x_length; + ? (lstart + (kernel_size_l - 1) * dilation_l + 1) + : x_length; while (lstart < 0) { lstart += dilation_l; } @@ -208,20 +202,19 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_h, const int32_t padding_w, const int32_t n_batch, const int32_t n_channel, const int32_t x_height, const int32_t x_width, - const int32_t kernel_size_h, const int32_t kernel_size_w, - const int32_t stride_h, const int32_t stride_w, const int32_t dilation_h, - const int32_t dilation_w) { + const int32_t kernel_size_h, const int32_t kernel_size_w, const int32_t stride_h, + const int32_t stride_w, const int32_t dilation_h, const int32_t dilation_w) { XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n_c, h, w; index_helper.OffsetToNdIndex(num, n_c, h, w); IDX hstart = h * stride_h - padding_h; IDX wstart = w * stride_w - padding_w; const IDX hend = (hstart + (kernel_size_h - 1) * dilation_h + 1) <= x_height - ? (hstart + (kernel_size_h - 1) * dilation_h + 1) - : x_height; + ? (hstart + (kernel_size_h - 1) * dilation_h + 1) + : x_height; const IDX wend = (wstart + (kernel_size_w - 1) * dilation_w + 1) <= x_width - ? (wstart + (kernel_size_w - 1) * dilation_w + 1) - : x_width; + ? (wstart + (kernel_size_w - 1) * dilation_w + 1) + : x_width; while (hstart < 0) { hstart += dilation_h; } while (wstart < 0) { wstart += dilation_w; } /* equal to -std::numeric_limits::infinity(); */ @@ -232,7 +225,7 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( for (IDX i = hstart; i < hend; i += dilation_h) { for (IDX j = wstart; j < wend; j += dilation_w) { const IDX window_idx = i * x_width + j; - T val = data[window_idx]; + T val = data[window_idx]; /* NOTE: std::isnan(val) only supports a few data types, see: https://en.cppreference.com/w/cpp/numeric/math/isnan and when use gcc/g++ 4.x to compile, @@ -256,8 +249,8 @@ OF_DEVICE_FUNC void Maxpool2dForwardComputeCFirst( template OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( - const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, - T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, + const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, const int32_t src_height, const int32_t src_width, const int32_t dst_height, const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { @@ -277,11 +270,12 @@ OF_DEVICE_FUNC void Maxpool2dBackwardComputeCFirst( } template -OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast( - const NdIndexOffsetHelper index_helper, const IDX elem_num, const T* src, - T* dest, const int64_t* indice_ptr, const int32_t n_batch, const int32_t n_channel, - const int32_t src_height, const int32_t src_width, const int32_t dst_height, - const int32_t dst_width) { +OF_DEVICE_FUNC void Maxpool2dBackwardComputeCLast(const NdIndexOffsetHelper index_helper, + const IDX elem_num, const T* src, T* dest, + const int64_t* indice_ptr, const int32_t n_batch, + const int32_t n_channel, const int32_t src_height, + const int32_t src_width, const int32_t dst_height, + const int32_t dst_width) { XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n, c, h, w; index_helper.OffsetToNdIndex(num, n, c, h, w); @@ -300,11 +294,11 @@ template OF_DEVICE_FUNC void Maxpool3dForwardCompute( const NdIndexOffsetHelper index_helper, IDX elem_num, const T* src, T* dest, int64_t* indice_ptr, const int32_t padding_t, const int32_t padding_h, const int32_t padding_w, - const int32_t n_batch, const int32_t n_channel, - const int32_t x_time, const int32_t x_height, const int32_t x_width, - const int32_t kernel_size_t, const int32_t kernel_size_h, const int32_t kernel_size_w, - const int32_t stride_t, const int32_t stride_h, const int32_t stride_w, - const int32_t dilation_t, const int32_t dilation_h, const int32_t dilation_w) { + const int32_t n_batch, const int32_t n_channel, const int32_t x_time, const int32_t x_height, + const int32_t x_width, const int32_t kernel_size_t, const int32_t kernel_size_h, + const int32_t kernel_size_w, const int32_t stride_t, const int32_t stride_h, + const int32_t stride_w, const int32_t dilation_t, const int32_t dilation_h, + const int32_t dilation_w) { XPU_1D_KERNEL_LOOP(num, elem_num) { IDX n_c, t, h, w; index_helper.OffsetToNdIndex(num, n_c, t, h, w); @@ -368,7 +362,8 @@ OF_DEVICE_FUNC void Maxpool3dBackwardCompute(const NdIndexOffsetHelper i } #define INSTANTIATE_POOLING_KERNEL_UTIL(device_type_v, dtype_pair, index_dtype_pair) \ - template struct PoolingKernelUtil; + template struct PoolingKernelUtil; } // namespace oneflow From cc31a1454a6c806b4e259e8e4dc831097f129e4d Mon Sep 17 00:00:00 2001 From: MARD1NO <359521840@qq.com> Date: Mon, 21 Feb 2022 13:48:13 +0800 Subject: [PATCH 17/17] fix comment --- oneflow/user/kernels/pooling_kernel.cpp | 1 - oneflow/user/kernels/pooling_kernel_util.h | 6 ------ 2 files changed, 7 deletions(-) diff --git a/oneflow/user/kernels/pooling_kernel.cpp b/oneflow/user/kernels/pooling_kernel.cpp index 12d979ec536..7371f805b98 100644 --- a/oneflow/user/kernels/pooling_kernel.cpp +++ b/oneflow/user/kernels/pooling_kernel.cpp @@ -294,7 +294,6 @@ class MaxPool2dKernel final : public user_op::OpKernel { const MaxPoolingParams3D& params_3d = pooling_cache->GetParams3D(); const int64_t elem_num = y->shape().elem_cnt(); - // const int32_t elem_num = y->shape().elem_cnt(); const T* src = x->dptr(); T* dest = y->mut_dptr(); diff --git a/oneflow/user/kernels/pooling_kernel_util.h b/oneflow/user/kernels/pooling_kernel_util.h index 533641156a3..defb5d37013 100644 --- a/oneflow/user/kernels/pooling_kernel_util.h +++ b/oneflow/user/kernels/pooling_kernel_util.h @@ -54,12 +54,6 @@ struct DeviceAdd { }; }; -#ifdef WITH_CUDA - -OF_DEVICE_FUNC int32_t device_min(int32_t a, int32_t b) { return a <= b ? a : b; } - -#endif - class MaxPoolingParams3D { public: MaxPoolingParams3D(const int32_t dim, const ShapeView& x_shape, const std::string& data_format,