From e90a2815d4dfadbb9be67f318e2b60406f6dfbcb Mon Sep 17 00:00:00 2001 From: Ginkgo Date: Sun, 23 Jun 2024 20:07:16 +0800 Subject: [PATCH] fix cuda kernel launch parameter - grid and block are reversed --- modules/cudawarping/perf/perf_warping.cpp | 4 +- modules/cudawarping/src/cuda/resize_onnx.cu | 70 ++++++++++----------- 2 files changed, 35 insertions(+), 39 deletions(-) diff --git a/modules/cudawarping/perf/perf_warping.cpp b/modules/cudawarping/perf/perf_warping.cpp index 8d781f0819..e5f6c8065f 100644 --- a/modules/cudawarping/perf/perf_warping.cpp +++ b/modules/cudawarping/perf/perf_warping.cpp @@ -230,9 +230,9 @@ PERF_TEST_P(Sz_Depth_Cn_Scale, ResizeOnnxLinearAntialias, Combine(CUDA_TYPICAL_MAT_SIZES, Values(CV_8U, CV_16U, CV_32F), CUDA_CHANNELS_1_3_4, - Values(0.2, 0.1, 0.05))) + Values(0.8, 0.5, 0.3))) { - declare.time(1.0); + declare.time(10.0); const cv::Size size = GET_PARAM(0); const int depth = GET_PARAM(1); diff --git a/modules/cudawarping/src/cuda/resize_onnx.cu b/modules/cudawarping/src/cuda/resize_onnx.cu index 0b0baf6a18..86ff17e254 100644 --- a/modules/cudawarping/src/cuda/resize_onnx.cu +++ b/modules/cudawarping/src/cuda/resize_onnx.cu @@ -339,24 +339,22 @@ namespace cv { namespace cuda { namespace device { { int xstart = __float2int_rd(fx) - 1; int ystart = __float2int_rd(fy) - 1; - int xlimit = xstart + 3; - int ylimit = ystart + 3; int xoffset[4]; - float xcoeff[4]; - for (int x = xstart; x <= xlimit; ++x) + W1 xcoeff[4]; + for (int x = 0; x < 4; ++x, ++xstart) { - xoffset[x - xstart] = clamp(x, 0, col1); - xcoeff[x - xstart] = cubic.at(x - fx); + xoffset[x] = clamp(xstart, 0, col1); + xcoeff [x] = cubic.at(xstart - fx); } W sumval = VecTraits::all(0); - for (int y = ystart; y <= ylimit; ++y) + for (int y = 0; y < 4; ++y, ++ystart) { - int yoffest = clamp(y, 0, row1); + int yoffest = clamp(ystart, 0, row1); T const* S = ptr(src, yoffest); W sline = VecTraits::all(0); for (int x = 0; x < 4; ++x) sline += xcoeff[x] * saturate_cast(S[xoffset[x]]); - sumval += sline * cubic.at(y - fy); + sumval += sline * cubic.at(ystart - fy); } at(dst, dy, dx) = saturate_cast(sumval); } @@ -376,19 +374,17 @@ namespace cv { namespace cuda { namespace device { { int xstart = __float2int_rd(fx) - 1; int ystart = __float2int_rd(fy) - 1; - int xlimit = xstart + 3; - int ylimit = ystart + 3; int xoffset[4], yoffset[4]; W xcoeff[4], ycoeff[4]; - for (int x = xstart; x <= xlimit; ++x) + for (int x = 0; x < 4; ++x, ++xstart) { - xoffset[x - xstart] = clamp(x, 0, col1) * cn; - xcoeff[x - xstart] = cubic.at(x - fx); + xoffset[x] = clamp(xstart, 0, col1) * cn; + xcoeff [x] = cubic.at(xstart - fx); } - for (int y = ystart; y <= ylimit; ++y) + for (int y = 0; y < 4; ++y, ++ystart) { - yoffset[y - ystart] = clamp(y, 0, row1); - ycoeff[y - ystart] = cubic.at(y - fy); + yoffset[y] = clamp(ystart, 0, row1); + ycoeff [y] = cubic.at(ystart - fy); } T* D = ptr(dst, dy) + dx * cn; for (int i = 0; i < cn; ++i) @@ -509,15 +505,15 @@ namespace cv { namespace cuda { namespace device { dim3 block(32, 8); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); if (cn == 1) - sampleKernel<<>>(M, LinearVec(src, dst)); + sampleKernel<<>>(M, LinearVec(src, dst)); else if (cn == 2) - sampleKernel<<>>(M, LinearVec(src, dst)); + sampleKernel<<>>(M, LinearVec(src, dst)); else if (cn == 3) - sampleKernel<<>>(M, LinearVec(src, dst)); + sampleKernel<<>>(M, LinearVec(src, dst)); else if (cn == 4) - sampleKernel<<>>(M, LinearVec(src, dst)); + sampleKernel<<>>(M, LinearVec(src, dst)); else - sampleKernel<<>>(M, LinearCn(src, dst, cn)); + sampleKernel<<>>(M, LinearCn(src, dst, cn)); } template @@ -527,15 +523,15 @@ namespace cv { namespace cuda { namespace device { dim3 block(32, 8); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); if (cn == 1) - sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); + sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); else if (cn == 2) - sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); + sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); else if (cn == 3) - sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); + sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); else if (cn == 4) - sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); + sampleKernel<<>>(M, LinearAntiVec(src, dst, scale, 0)); else - sampleKernel<<>>(M, LinearAntiCn(src, dst, scale, 0, cn)); + sampleKernel<<>>(M, LinearAntiCn(src, dst, scale, 0, cn)); } //==================== cubic ====================// @@ -547,15 +543,15 @@ namespace cv { namespace cuda { namespace device { dim3 block(32, 8); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); if (cn == 1) - sampleKernel<<>>(M, CubicVec(src, dst, A)); + sampleKernel<<>>(M, CubicVec(src, dst, A)); else if (cn == 2) - sampleKernel<<>>(M, CubicVec(src, dst, A)); + sampleKernel<<>>(M, CubicVec(src, dst, A)); else if (cn == 3) - sampleKernel<<>>(M, CubicVec(src, dst, A)); + sampleKernel<<>>(M, CubicVec(src, dst, A)); else if (cn == 4) - sampleKernel<<>>(M, CubicVec(src, dst, A)); + sampleKernel<<>>(M, CubicVec(src, dst, A)); else - sampleKernel<<>>(M, CubicCn(src, dst, A, cn)); + sampleKernel<<>>(M, CubicCn(src, dst, A, cn)); } template @@ -565,15 +561,15 @@ namespace cv { namespace cuda { namespace device { dim3 block(32, 8); dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); if (cn == 1) - sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); + sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); else if (cn == 2) - sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); + sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); else if (cn == 3) - sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); + sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); else if (cn == 4) - sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); + sampleKernel<<>>(M, CubicAntiVec(src, dst, scale, A)); else - sampleKernel<<>>(M, CubicAntiCn(src, dst, scale, A, cn)); + sampleKernel<<>>(M, CubicAntiCn(src, dst, scale, A, cn)); } template