diff --git a/modules/cudawarping/include/opencv2/cudawarping.hpp b/modules/cudawarping/include/opencv2/cudawarping.hpp index 45cca1ccf8..a92cb39985 100644 --- a/modules/cudawarping/include/opencv2/cudawarping.hpp +++ b/modules/cudawarping/include/opencv2/cudawarping.hpp @@ -107,6 +107,33 @@ supported for now. */ CV_EXPORTS_W void resize(InputArray src, OutputArray dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null()); +/** @brief onnx resize op +https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize +https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py + +Not support `exclude_outside` and `extrapolation_value` yet. + +To get a similar result to `cv::resize`, give dsize and: + INTER_NEAREST : ASYMMETRIC + NEAREST_FLOOR + INTER_LINEAR : HALF_PIXEL + INTER_CUBIC : HALF_PIXEL + cubicCoeff(-0.75) + +@param src input image. +@param dst output image; it has the size dsize (when it is non-zero) or the size computed from src.size(), scale; the type of dst is the same as of src. +@param dsize output image size; if it equals to zero, it is computed as: + \f[\texttt{dsize = Size(int(scale.x * src.cols), int(scale.y * src.rows))}\f] + Either dsize or scale must be non-zero. +@param scale scale factor; use same definition as ONNX, if scale > 1, it's upsampling. +@param interpolation interpolation / coordiante, see #InterpolationFlags and #ResizeONNXFlags +@param cubicCoeff cubic sampling coeff; range \f[[-1.0, 0)\f] +@param roi crop region; if provided, the rois' coordinates are normalized in the coordinate system of the input image; it only takes effect with INTER_TF_CROP_RESIZE (ONNX tf_crop_and_resize) + +@sa resize, resizeOnnx + */ +CV_EXPORTS_W void resizeOnnx(InputArray src, OutputArray dst, Size dsize, + Point2d scale = Point2d(), int interpolation = INTER_LINEAR | INTER_HALF_PIXEL, + float cubicCoeff = -0.75f, Rect2d const& roi = Rect2d(), Stream& stream = Stream::Null()); + /** @brief Applies an affine transformation to an image. @param src Source image. CV_8U , CV_16U , CV_32S , or CV_32F depth and 1, 3, or 4 channels are diff --git a/modules/cudawarping/src/cuda/resize_onnx.cu b/modules/cudawarping/src/cuda/resize_onnx.cu new file mode 100644 index 0000000000..ca52d90584 --- /dev/null +++ b/modules/cudawarping/src/cuda/resize_onnx.cu @@ -0,0 +1,654 @@ +/*M/////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#if !defined CUDA_DISABLER +#define __CUDACC__ 110700 +#include "opencv2/imgproc.hpp" +#include "opencv2/core/cuda/common.hpp" +#include "opencv2/core/cuda/vec_traits.hpp" +#include "opencv2/core/cuda/vec_math.hpp" +#include "opencv2/core/cuda/saturate_cast.hpp" + +namespace cv { namespace cuda { namespace device { + + __device__ __host__ __forceinline__ int clamp(int x, int lo, int hi) + { + return x < lo ? lo : hi < x ? hi : x; + } + + template + using TypeVecT = typename TypeVec::vec_type; + + struct LinearCoeff + { + enum { ksize = 2 }; + + LinearCoeff(float) {} + + __device__ __forceinline__ float at(float x) const + { + return __saturatef(1.f - ::fabsf(x)); + } + }; + + struct CubicCoeff + { + enum { ksize = 4 }; + + float A, A2, A3; + + CubicCoeff(float a) : A(a), A2(a + 2), A3(a + 3) {} + + __device__ __forceinline__ float at(float x) const + { + x = ::fabsf(x); + if (x <= 1) + x = (A2 * x - A3) * x * x + 1; + else if (x <= 2) + x = A * (((x - 5) * x + 8) * x - 4); + else + x = 0; + return x; + } + }; + + //==================== sampler ====================// + + template + struct SamplerBase + { + PtrStep src; + PtrStepSzb dst; + int row1, col1; + + // discard const on dst + __device__ __forceinline__ T* dst_ptr(int dy) const + { return reinterpret_cast(dst.data + dy * dst.step); } + + __device__ __forceinline__ T& dst_at(int dy, int dx) const + { return dst_ptr(dy)[dx]; } + + SamplerBase(PtrStepSzb const& S, PtrStepSzb const& D) + : src(reinterpret_cast(S.data), S.step) + , dst(D), row1(S.rows - 1), col1(S.cols - 1) + {} + }; + + template + struct AntiBase : public SamplerBase + { + static_assert(Coeff::ksize % 2 == 0, ""); + + float xscale, yscale; + int xstart, xend, ystart, yend; + Coeff coeff; + + AntiBase(PtrStepSzb const& S, PtrStepSzb const& D, + Point2f const& scale, float A) + : SamplerBase(S, D), coeff(A) + { + int const khalf = Coeff::ksize / 2; + xscale = std::min(scale.x, 1.f); + yscale = std::min(scale.y, 1.f); + xstart = cvFloor(-khalf / xscale) + 1; + xend = 2 - xstart; + ystart = cvFloor(-khalf / yscale) + 1; + yend = 2 - ystart; + } + }; + + ////////// nearest neighbor ////////// + + template + struct NearestVec : public SamplerBase + { + using SamplerBase::SamplerBase; + + __device__ void to(int sx, int sy, int dx, int dy) const + { + sx = clamp(sx, 0, col1); + sy = clamp(sy, 0, row1); + dst_at(dy, dx) = src(sy, sx); + } + }; + + struct NearestSize : public SamplerBase + { + size_t esz; + + NearestSize(PtrStepSzb const& S, PtrStepSzb const& D, size_t sz) + : SamplerBase(S, D), esz(sz) + {} + + __device__ void to(int sx, int sy, int dx, int dy) const + { + sx = clamp(sx, 0, col1); + sy = clamp(sy, 0, row1); + uchar const* S = src.ptr(sy) + sx * esz; + uchar * D = dst_ptr(dy) + dx * esz; + for (size_t i = 0; i < esz; ++i) + D[i] = S[i]; + } + }; + + ////////// anti-alias brute force ////////// + + template + struct AntiVec : public AntiBase + { + using AntiBase::AntiBase; + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float rx = fx - ix, ry = fy - iy; + float weight = 0; + W sumval = VecTraits::all(0); + for (int h = ystart; h < yend; ++h) + { + float wline = 0; + W sline = VecTraits::all(0); + int sy = clamp(iy + h, 0, row1); + T const* S = src.ptr(sy); + for (int w = xstart; w < xend; ++w) + { + int sx = clamp(ix + w, 0, col1); + float t = coeff.at((w - rx) * xscale); + wline += t; + sline += t * saturate_cast(S[sx]); + } + float u = coeff.at((h - ry) * yscale); + weight += u * wline; + sumval += u * sline; + } + dst_at(dy, dx) = saturate_cast(sumval / weight); + } + }; + + template + struct AntiCn : public AntiBase + { + int cn; + + AntiCn(PtrStepSzb const& S, PtrStepSzb const& D, + Point2f const& scale, float A, int _cn) + : AntiBase(S, D, scale, A), cn(_cn) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float rx = fx - ix, ry = fy - iy; + float weight = 0; + W sumval = 0; + T* D = dst_ptr(dy) + dx * cn; + for (int h = ystart; h < yend; ++h) + { + float wline = 0; + W sline = 0; + int sy = clamp(iy + h, 0, row1); + T const* S = src.ptr(sy); + for (int w = xstart; w < xend; ++w) + { + int sx = clamp(ix + w, 0, col1) * cn; + W t = coeff.at((w - rx) * xscale); + wline += t; + sline += t * S[sx]; + } + W u = coeff.at((h - ry) * yscale); + weight += u * wline; + sumval += u * sline; + } + D[0] = saturate_cast(sumval / weight); + for (int i = 1; i < cn; ++i) + { + sumval = 0; + for (int h = ystart; h < yend; ++h) + { + W sline = 0; + int sy = clamp(iy + h, 0, row1); + T const* S = src.ptr(sy) + i; + for (int w = xstart; w < xend; ++w) + { + int sx = clamp(ix + w, 0, col1) * cn; + W t = coeff.at((w - rx) * xscale); + sline += t * S[sx]; + } + W u = coeff.at((h - ry) * yscale); + sumval += u * sline; + } + D[i] = saturate_cast(sumval / weight); + } + } + }; + + ////////// bi-linear ////////// + + template + struct LinearVec : public SamplerBase + { + using SamplerBase::SamplerBase; + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float u1 = fx - ix, v1 = fy - iy; + float u0 = 1.f - u1, v0 = 1.f - v1; + int x0 = ::max(ix, 0); + int y0 = ::max(iy, 0); + int x1 = ::min(ix + 1, col1); + int y1 = ::min(iy + 1, row1); + W s0 = saturate_cast(src(y0, x0)), s1 = saturate_cast(src(y0, x1)); + W s2 = saturate_cast(src(y1, x0)), s3 = saturate_cast(src(y1, x1)); + W val = (u0 * v0) * s0 + (u1 * v0) * s1 + (u0 * v1) * s2 + (u1 * v1) * s3; + dst_at(dy, dx) = saturate_cast(val); + } + }; + + template + struct LinearCn : public SamplerBase + { + int cn; + + LinearCn(PtrStepSzb const& S, PtrStepSzb const& D, int _cn) + : SamplerBase(S, D), cn(_cn) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int ix = __float2int_rd(fx), iy = __float2int_rd(fy); + float u1 = fx - ix, v1 = fy - iy; + float u0 = 1.f - u1, v0 = 1.f - v1; + int x0 = ::max(ix, 0); + int y0 = ::max(iy, 0); + int x1 = ::min(ix + 1, col1); + int y1 = ::min(iy + 1, row1); + W coeff[4] = {u0 * v0, u1 * v0, u0 * v1, u1 * v1}; + T const* S0 = src.ptr(y0) + x0 * cn; + T const* S1 = src.ptr(y0) + x1 * cn; + T const* S2 = src.ptr(y1) + x0 * cn; + T const* S3 = src.ptr(y1) + x1 * cn; + T * D = dst_ptr(dy) + dx * cn; + for (int i = 0; i < cn; ++i) + { + D[i] = saturate_cast(coeff[0] * S0[i] + + coeff[1] * S1[i] + coeff[2] * S2[i] + coeff[3] * S3[i]); + } + } + }; + + template + using LinearAntiVec = AntiVec; + + template + using LinearAntiCn = AntiCn; + + ////////// bi-cubic ////////// + + template + struct CubicVec : public SamplerBase + { + CubicCoeff cubic; + + CubicVec(PtrStepSzb const& S, PtrStepSzb const& D, float A) + : SamplerBase(S, D), cubic(A) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int xstart = __float2int_rd(fx) - 1; + int ystart = __float2int_rd(fy) - 1; + int xlimit = xstart + 3; + int ylimit = ystart + 3; + int xoffset[4]; + float xcoeff[4]; + for (int x = xstart; x <= xlimit; ++x) + { + xoffset[x - xstart] = clamp(x, 0, col1); + xcoeff[x - xstart] = cubic.at(x - fx); + } + W sumval = VecTraits::all(0); + for (int y = ystart; y <= ylimit; ++y) + { + int yoffest = clamp(y, 0, row1); + T const* S = src.ptr(yoffest); + W sline = VecTraits::all(0); + for (int x = 0; x < 4; ++x) + sline += xcoeff[x] * saturate_cast(S[xoffset[x]]); + sumval += sline * cubic.at(y - fy); + } + dst_at(dy, dx) = saturate_cast(sumval); + } + }; + + template + struct CubicCn : public SamplerBase + { + CubicCoeff cubic; + int cn; + + CubicCn(PtrStepSzb const& S, PtrStepSzb const& D, float A, int _cn) + : SamplerBase(S, D), cubic(A), cn(_cn) + {} + + __device__ void to(float fx, float fy, int dx, int dy) const + { + int xstart = __float2int_rd(fx) - 1; + int ystart = __float2int_rd(fy) - 1; + int xlimit = xstart + 3; + int ylimit = ystart + 3; + int xoffset[4], yoffset[4]; + W xcoeff[4], ycoeff[4]; + for (int x = xstart; x <= xlimit; ++x) + { + xoffset[x - xstart] = clamp(x, 0, col1) * cn; + xcoeff[x - xstart] = cubic.at(x - fx); + } + for (int y = ystart; y <= ylimit; ++y) + { + yoffset[y - ystart] = clamp(y, 0, row1); + ycoeff[y - ystart] = cubic.at(y - fy); + } + T* D = dst_ptr(dy) + dx * cn; + for (int i = 0; i < cn; ++i) + { + W sumval = 0; + for (int y = 0; y < 4; ++y) + { + T const* S = src.ptr(yoffset[y]) + i; + W sline = 0; + for (int x = 0; x < 4; ++x) + sline += xcoeff[x] * S[xoffset[x]]; + sumval += sline * ycoeff[y]; + } + D[i] = saturate_cast(sumval); + } + } + }; + + template + using CubicAntiVec = AntiVec; + + template + using CubicAntiCn = AntiCn; + + ////////// generic ////////// + + template + __global__ void sampleKernel(Matx22f const M, Sampler const sampler) + { + int dx = blockDim.x * blockIdx.x + threadIdx.x; + int dy = blockDim.y * blockIdx.y + threadIdx.y; + if (dx < sampler.dst.cols && dy < sampler.dst.rows) + { + float fx = ::fmaf(static_cast(dx), M.val[0], M.val[1]); + float fy = ::fmaf(static_cast(dy), M.val[2], M.val[3]); + sampler.to(fx, fy, dx, dy); + } + } + + //==================== nearest neighbor ====================// + + struct RoundUp + { + __device__ __forceinline__ int operator()(float x) const + { return __float2int_ru(x); } + }; + + struct RoundDown + { + __device__ __forceinline__ int operator()(float x) const + { return __float2int_rd(x); } + }; + + template + __global__ void nnBySampler( + RoundOp const R, Sampler const sampler, Matx22f const M, float const offset) + { + int dx = blockDim.x * blockIdx.x + threadIdx.x; + int dy = blockDim.y * blockIdx.y + threadIdx.y; + if (dx < sampler.dst.cols && dy < sampler.dst.rows) + { + int sx = R(::fmaf(static_cast(dx), M.val[0], M.val[1]) + offset); + int sy = R(::fmaf(static_cast(dy), M.val[2], M.val[3]) + offset); + sampler.to(sx, sy, dx, dy); + } + } + + template + void nnByRound(size_t esz, PtrStepSzb const& src, PtrStepSzb dst, + Matx22f const& M, float offset, cudaStream_t stream) + { + RoundOp R; + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (esz == 1) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 2) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 3) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 4) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 6) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 8) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 12) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else if (esz == 16) + nnBySampler<<>>(R, NearestVec(src, dst), M, offset); + else + nnBySampler<<>>(R, NearestSize(src, dst, esz), M, offset); + } + + void resizeOnnxNN(size_t elemSize, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, int mode, cudaStream_t stream) + { + float offset = 0.f; + if (mode == INTER_NEAREST_PREFER_FLOOR) + offset = -0.5f; + if (mode == INTER_NEAREST_PREFER_CEIL) + offset = +0.5f; + + if (mode == INTER_NEAREST_PREFER_FLOOR || + mode == INTER_NEAREST_CEIL) + nnByRound(elemSize, src, dst, M, offset, stream); + else + nnByRound(elemSize, src, dst, M, offset, stream); + if (!stream) + cudaSafeCall(cudaDeviceSynchronize()); + } + + //==================== linear ====================// + + template + void linearDispatch(int cn, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>(M, + LinearVec(src, dst)); + else if (cn == 2) + sampleKernel<<>>(M, + LinearVec, TypeVecT>(src, dst)); + else if (cn == 3) + sampleKernel<<>>(M, + LinearVec, TypeVecT>(src, dst)); + else if (cn == 4) + sampleKernel<<>>(M, + LinearVec, TypeVecT>(src, dst)); + else + sampleKernel<<>>(M, + LinearCn(src, dst, cn)); + } + + template + void linearAntiDispatch(int cn, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, Point2f const& scale, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>( + M, LinearAntiVec(src, dst, scale, 0)); + else if (cn == 2) + sampleKernel<<>>( + M, LinearAntiVec, TypeVecT>(src, dst, scale, 0)); + else if (cn == 3) + sampleKernel<<>>( + M, LinearAntiVec, TypeVecT>(src, dst, scale, 0)); + else if (cn == 4) + sampleKernel<<>>( + M, LinearAntiVec, TypeVecT>(src, dst, scale, 0)); + else + sampleKernel<<>>( + M, LinearAntiCn(src, dst, scale, 0, cn)); + } + + //==================== cubic ====================// + + template + void cubicDispatch(int cn, float A, PtrStepSzb const& src, + PtrStepSzb const& dst, Matx22f const& M, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>( + M, CubicVec(src, dst, A)); + else if (cn == 2) + sampleKernel<<>>( + M, CubicVec, TypeVecT>(src, dst, A)); + else if (cn == 3) + sampleKernel<<>>( + M, CubicVec, TypeVecT>(src, dst, A)); + else if (cn == 4) + sampleKernel<<>>( + M, CubicVec, TypeVecT>(src, dst, A)); + else + sampleKernel<<>>( + M, CubicCn(src, dst, A, cn)); + } + + template + void cubicAntiDispatch(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, Point2f const& scale, cudaStream_t stream) + { + dim3 block(32, 8); + dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); + if (cn == 1) + sampleKernel<<>>( + M, CubicAntiVec(src, dst, scale, A)); + else if (cn == 2) + sampleKernel<<>>( + M, CubicAntiVec, TypeVecT>(src, dst, scale, A)); + else if (cn == 3) + sampleKernel<<>>( + M, CubicAntiVec, TypeVecT>(src, dst, scale, A)); + else if (cn == 4) + sampleKernel<<>>( + M, CubicAntiVec, TypeVecT>(src, dst, scale, A)); + else + sampleKernel<<>>( + M, CubicAntiCn(src, dst, scale, A, cn)); + } + +template +void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream) +{ + int sampler = interpolation & INTER_SAMPLER_MASK; + int antialias = interpolation & INTER_ANTIALIAS_MASK; + if (sampler == INTER_LINEAR && !antialias) + linearDispatch(cn, src, dst, M, stream); + else if (sampler == INTER_LINEAR && antialias) + linearAntiDispatch(cn, src, dst, M, scale, stream); + else if (sampler == INTER_CUBIC && !antialias) + cubicDispatch(cn, A, src, dst, M, stream); + else if (sampler == INTER_CUBIC && antialias) + cubicAntiDispatch(cn, A, src, dst, M, scale, stream); + else + CV_Error(cv::Error::StsBadArg, "unsupported interpolation"); + + if (!stream) + cudaSafeCall(cudaDeviceSynchronize()); +} + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +template void resizeOnnx(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + +/*template void resizeOnnx<__half, float>(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream);*/ +}}} + +#endif /* CUDA_DISABLER */ diff --git a/modules/cudawarping/src/resize.cpp b/modules/cudawarping/src/resize.cpp index 9943a6cdc6..4b9de23cfe 100644 --- a/modules/cudawarping/src/resize.cpp +++ b/modules/cudawarping/src/resize.cpp @@ -46,14 +46,77 @@ void cv::cuda::resize(InputArray, OutputArray, Size, double, double, int, Stream&) { throw_no_cuda(); } +void resizeOnnx(InputArray /*_src*/, OutputArray /*_dst*/, + Size /*dsize*/, Point2d /*scale*/, int /*interpolation*/, + float /*cubicCoeff*/, Rect2d const& /*roi*/, Stream& /*stream*/) + { throw_no_cuda(); } + #else // HAVE_CUDA namespace cv { namespace cuda { namespace device { - template - void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream); +template +void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream); + +template +void resizeOnnx(int cn, float A, PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, Point2f const& scale, int interpolation, cudaStream_t stream); + +void resizeOnnxNN(size_t elemSize,PtrStepSzb const& src, PtrStepSzb const& dst, + Matx22f const& M, int mode, cudaStream_t stream); }}} +namespace cv +{ +static Vec2f interCoordinate(int coordinate, int dst, int src, double scale, double start, double end) +{ + float a, b; + if (coordinate == INTER_HALF_PIXEL + || coordinate == INTER_HALF_PIXEL_SYMMETRIC + || coordinate == INTER_HALF_PIXEL_PYTORCH) + { + a = static_cast(1.0 / scale); + b = static_cast(0.5 / scale - 0.5); + if (coordinate == INTER_HALF_PIXEL_SYMMETRIC) + b += static_cast(0.5 * (src - dst / scale)); + if (coordinate == INTER_HALF_PIXEL_PYTORCH && dst <= 1) + { + a = 0.f; + b = -0.5f; + } + } + else if (coordinate == INTER_ALIGN_CORNERS) + { + a = static_cast((src - 1.0) / (src * scale - 1.0)); + b = 0.f; + } + else if (coordinate == INTER_ASYMMETRIC) + { + a = static_cast(1.0 / scale); + b = 0.f; + } + else if (coordinate == INTER_TF_CROP_RESIZE) + { + CV_CheckGE(start, 0.0, "roi's start is out of image"); + CV_CheckLE(end, 1.0, "roi's end is out of image"); + CV_CheckLT(start, end, "roi's start must be less than its end"); + if (dst <= 1) + { + a = 0.f; + b = static_cast(0.5 * (start + end) * (src - 1.0)); + } + else + { + a = static_cast((end - start) * (src - 1.0) / (src * scale - 1.0)); + b = static_cast(start * (src - 1.0)); + } + } + else + CV_Error(Error::StsBadArg, format("Unknown coordinate transformation mode %d", coordinate)); + return Vec2f(a, b); +} +} + void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, double fy, int interpolation, Stream& stream) { GpuMat src = _src.getGpuMat(); @@ -105,4 +168,102 @@ void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, func(src, wholeSrc, ofs.y, ofs.x, dst, static_cast(1.0 / fy), static_cast(1.0 / fx), interpolation, StreamAccessor::getStream(stream)); } + +void cv::cuda::resizeOnnx(InputArray _src, OutputArray _dst, Size dsize, Point2d scale, + int interpolation, float cubicCoeff, Rect2d const& roi, Stream& stream) +{ + GpuMat src = _src.getGpuMat(); + Size ssize = _src.size(); + CV_CheckEQ(_src.dims(), 2, "only 2 dim image is support now"); + CV_CheckFalse(ssize.empty(), "src size must not be empty"); + if (dsize.empty()) + { + CV_CheckGT(scale.x, 0.0, "scale must > 0 if no dsize given"); + CV_CheckGT(scale.y, 0.0, "scale must > 0 if no dsize given"); + dsize.width = static_cast(scale.x * ssize.width); + dsize.height = static_cast(scale.y * ssize.height); + } + if (scale.x == 0 || scale.y == 0) + { + scale.x = static_cast(dsize.width) / ssize.width; + scale.y = static_cast(dsize.height) / ssize.height; + } + CV_CheckFalse(dsize.empty(), "dst size must not empty"); + CV_CheckGT(scale.x, 0.0, "require computed or given scale > 0"); + CV_CheckGT(scale.y, 0.0, "require computed or given scale > 0"); + + int sampler = interpolation & INTER_SAMPLER_MASK; + int nearest = interpolation & INTER_NEAREST_MODE_MASK; + int coordinate = interpolation & INTER_COORDINATE_MASK; + CV_Assert( + sampler == INTER_NEAREST || + sampler == INTER_LINEAR || + sampler == INTER_CUBIC); + CV_Assert( + nearest == INTER_NEAREST_PREFER_FLOOR || + nearest == INTER_NEAREST_PREFER_CEIL || + nearest == INTER_NEAREST_FLOOR || + nearest == INTER_NEAREST_CEIL); + CV_Assert( + coordinate == INTER_HALF_PIXEL || + coordinate == INTER_HALF_PIXEL_PYTORCH || + coordinate == INTER_HALF_PIXEL_SYMMETRIC || + coordinate == INTER_ALIGN_CORNERS || + coordinate == INTER_ASYMMETRIC || + coordinate == INTER_TF_CROP_RESIZE); + + _dst.create(dsize, _src.type()); + GpuMat dst = _dst.getGpuMat(); + if (dsize == ssize && coordinate != INTER_TF_CROP_RESIZE) + { + src.copyTo(dst, stream); + return; + } + if (scale.x >= 1.0 && scale.y >= 1.0) + interpolation &= ~INTER_ANTIALIAS_MASK; + + Point2f scalef = static_cast(scale); + Matx22f M; + Vec2f xcoef = interCoordinate( + coordinate, dsize.width, ssize.width, scale.x, roi.x, roi.x + roi.width); + Vec2f ycoef = interCoordinate( + coordinate, dsize.height, ssize.height, scale.y, roi.y, roi.y + roi.height); + M(0, 0) = xcoef[0]; + M(0, 1) = xcoef[1]; + M(1, 0) = ycoef[0]; + M(1, 1) = ycoef[1]; + + if (sampler == INTER_NEAREST) + { + device::resizeOnnxNN(src.elemSize(), + src, dst, M, nearest, StreamAccessor::getStream(stream)); + return; + } + + int depth = src.depth(), cn = src.channels(); + CV_CheckDepth(depth, depth <= CV_64F, + "only support float in cuda kernel when not use nearest sampler"); + + using Func = void(*)(int cn, float A, + PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M, + Point2f const& scale, int interpolation, cudaStream_t stream); + static Func const funcs[CV_DEPTH_MAX] = + { + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + device::resizeOnnx, + /*device::resizeOnnx<__half, float>*/ nullptr, + }; + + Func const func = funcs[depth]; + if (!func) + CV_Error(Error::StsUnsupportedFormat, "Unsupported depth"); + func(cn, cubicCoeff, src, dst, M, scalef, interpolation, + StreamAccessor::getStream(stream)); +} + #endif // HAVE_CUDA diff --git a/modules/cudawarping/test/test_resize.cpp b/modules/cudawarping/test/test_resize.cpp index 768ad09f98..98156dbc22 100644 --- a/modules/cudawarping/test/test_resize.cpp +++ b/modules/cudawarping/test/test_resize.cpp @@ -260,6 +260,113 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeTextures, testing::Combine( ALL_DEVICES, testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC)))); +PARAM_TEST_CASE(ResizeOnnx, cv::cuda::DeviceInfo, MatType, double, double, int, UseRoi) +{ + cv::cuda::DeviceInfo devInfo; + int depth, interpolation; + double fx, fy; + bool useRoi; + + Rect src_loc, dst_loc; + Mat src, dst, src_roi, dst_roi; + GpuMat gsrc, gdst, gsrc_roi, gdst_roi; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + depth = GET_PARAM(1); + fx = GET_PARAM(2); + fy = GET_PARAM(3); + interpolation = GET_PARAM(4); + useRoi = GET_PARAM(5); + cv::cuda::setDevice(devInfo.deviceID()); + } + + void random_submat(int type, + Size& size, Rect& roi, Mat& mat, Mat& sub, GpuMat& gmat, GpuMat& gsub) + { + int border = useRoi ? 65 : 0; + roi.x = randomInt(0, border); + roi.y = randomInt(0, border); + roi.width = size.width; + roi.height = size.height; + size.width += roi.x + randomInt(0, border); + size.height += roi.y + randomInt(0, border); + mat = randomMat(size, type, -127, 127); + gmat.upload(mat); + sub = mat(roi); + gsub = gmat(roi); + } + + void random_roi(int type) + { + Size srcSize, dstSize; + int minSize = min(fx, fy) < 1.0 ? 10 : 1; + while (dstSize.empty()) + { + srcSize = randomSize(minSize, 129); + dstSize.width = cvRound(srcSize.width * fx); + dstSize.height = cvRound(srcSize.height * fy); + } + + random_submat(type, srcSize, src_loc, src, src_roi, gsrc, gsrc_roi); + random_submat(type, dstSize, dst_loc, dst, dst_roi, gdst, gdst_roi); + } +}; + +CUDA_TEST_P(ResizeOnnx, Accuracy) +{ + Mat host, host_roi; + double eps = depth <= CV_32S ? 1 : 5e-2; + + for (int cn = 1; cn <= 6; ++cn) + { + int type = CV_MAKETYPE(depth, cn); + random_roi(type); + + cv::resizeOnnx(src_roi, dst_roi, dst_roi.size(), Point2d(fx, fy), interpolation); + cv::cuda::resizeOnnx(gsrc_roi, gdst_roi, dst_roi.size(), Point2d(fx, fy), interpolation); + + gdst.download(host); + host_roi = host(dst_loc); + string info = cv::format( + "fail on type %sC%d src %dx%d dst %dx%d src_roi %dx%d dst_roi %dx%d", + depthToString(depth), cn, src.cols, src.rows, dst.cols, dst.rows, + src_roi.cols, src_roi.rows, dst_roi.cols, dst_roi.rows); + EXPECT_MAT_NEAR(dst_roi, host_roi, eps) << info; + } +} + +INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeOnnx, Combine( + ALL_DEVICES, + Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F), + Values(0.4, 0.27, 1.6), + Values(0.5, 0.71, 2.7), + Values((int)(INTER_LINEAR), (int)(INTER_CUBIC)), + WHOLE_SUBMAT)); + +INSTANTIATE_TEST_CASE_P(CUDA_Warping_Antialias, ResizeOnnx, Combine( + ALL_DEVICES, + Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F), + Values(0.4, 0.27, 1.6), + Values(0.5, 0.71, 2.7), + Values( + (int)(INTER_ANTIALIAS | INTER_LINEAR), + (int)(INTER_ANTIALIAS | INTER_CUBIC)), + WHOLE_SUBMAT)); + +INSTANTIATE_TEST_CASE_P(CUDA_Warping_Nearest, ResizeOnnx, Combine( + ALL_DEVICES, + Values(CV_8S, CV_16S, CV_32F, CV_64F), + Values(0.4, 0.27, 1.6), + Values(0.5, 0.71, 2.7), + Values( + (int)(INTER_NEAREST | INTER_NEAREST_PREFER_FLOOR), + (int)(INTER_NEAREST | INTER_NEAREST_PREFER_CEIL), + (int)(INTER_NEAREST | INTER_NEAREST_CEIL), + (int)(INTER_NEAREST | INTER_NEAREST_FLOOR)), + WHOLE_SUBMAT)); }} // namespace + #endif // HAVE_CUDA