diff --git a/modules/cudawarping/include/opencv2/cudawarping.hpp b/modules/cudawarping/include/opencv2/cudawarping.hpp
index 45cca1ccf8..a92cb39985 100644
--- a/modules/cudawarping/include/opencv2/cudawarping.hpp
+++ b/modules/cudawarping/include/opencv2/cudawarping.hpp
@@ -107,6 +107,33 @@ supported for now.
  */
 CV_EXPORTS_W void resize(InputArray src, OutputArray dst, Size dsize, double fx=0, double fy=0, int interpolation = INTER_LINEAR, Stream& stream = Stream::Null());
 
+/** @brief onnx resize op
+https://github.com/onnx/onnx/blob/main/docs/Operators.md#Resize
+https://github.com/onnx/onnx/blob/main/onnx/reference/ops/op_resize.py
+
+Not support `exclude_outside` and `extrapolation_value` yet.
+
+To get a similar result to `cv::resize`, give dsize and:
+    INTER_NEAREST : ASYMMETRIC + NEAREST_FLOOR
+    INTER_LINEAR  : HALF_PIXEL
+    INTER_CUBIC   : HALF_PIXEL + cubicCoeff(-0.75)
+
+@param src input image.
+@param dst output image; it has the size dsize (when it is non-zero) or the size computed from src.size(), scale; the type of dst is the same as of src.
+@param dsize output image size; if it equals to zero, it is computed as:
+ \f[\texttt{dsize = Size(int(scale.x * src.cols), int(scale.y * src.rows))}\f]
+ Either dsize or scale must be non-zero.
+@param scale scale factor; use same definition as ONNX, if scale > 1, it's upsampling.
+@param interpolation interpolation / coordiante, see #InterpolationFlags and #ResizeONNXFlags
+@param cubicCoeff cubic sampling coeff; range \f[[-1.0, 0)\f]
+@param roi crop region; if provided, the rois' coordinates are normalized in the coordinate system of the input image; it only takes effect with INTER_TF_CROP_RESIZE (ONNX tf_crop_and_resize)
+
+@sa resize, resizeOnnx
+ */
+CV_EXPORTS_W void resizeOnnx(InputArray src, OutputArray dst, Size dsize,
+    Point2d scale = Point2d(), int interpolation = INTER_LINEAR | INTER_HALF_PIXEL,
+    float cubicCoeff = -0.75f, Rect2d const& roi = Rect2d(), Stream& stream = Stream::Null());
+
 /** @brief Applies an affine transformation to an image.
 
 @param src Source image. CV_8U , CV_16U , CV_32S , or CV_32F depth and 1, 3, or 4 channels are
diff --git a/modules/cudawarping/src/cuda/resize_onnx.cu b/modules/cudawarping/src/cuda/resize_onnx.cu
new file mode 100644
index 0000000000..ca52d90584
--- /dev/null
+++ b/modules/cudawarping/src/cuda/resize_onnx.cu
@@ -0,0 +1,654 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                           License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#if !defined CUDA_DISABLER
+#define __CUDACC__ 110700
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/core/cuda/vec_traits.hpp"
+#include "opencv2/core/cuda/vec_math.hpp"
+#include "opencv2/core/cuda/saturate_cast.hpp"
+
+namespace cv { namespace cuda { namespace device {
+
+    __device__ __host__ __forceinline__ int clamp(int x, int lo, int hi)
+    {
+        return x < lo ? lo : hi < x ? hi : x;
+    }
+
+    template <typename T, int cn>
+    using TypeVecT = typename TypeVec<T, cn>::vec_type;
+
+    struct LinearCoeff
+    {
+        enum { ksize = 2 };
+
+        LinearCoeff(float) {}
+
+        __device__ __forceinline__ float at(float x) const
+        {
+            return __saturatef(1.f - ::fabsf(x));
+        }
+    };
+
+    struct CubicCoeff
+    {
+        enum { ksize = 4 };
+
+        float A, A2, A3;
+
+        CubicCoeff(float a) : A(a), A2(a + 2), A3(a + 3) {}
+
+        __device__ __forceinline__ float at(float x) const
+        {
+            x = ::fabsf(x);
+            if (x <= 1)
+                x = (A2 * x - A3) * x * x + 1;
+            else if (x <= 2)
+                x = A * (((x - 5) * x + 8) * x - 4);
+            else
+                x = 0;
+            return x;
+        }
+    };
+
+    //==================== sampler ====================//
+
+    template <typename T>
+    struct SamplerBase
+    {
+        PtrStep<T> src;
+        PtrStepSzb dst;
+        int row1, col1;
+
+        // discard const on dst
+        __device__ __forceinline__  T* dst_ptr(int dy) const
+        { return reinterpret_cast<T*>(dst.data + dy * dst.step); }
+
+        __device__ __forceinline__ T& dst_at(int dy, int dx) const
+        { return dst_ptr(dy)[dx]; }
+
+        SamplerBase(PtrStepSzb const& S, PtrStepSzb const& D)
+            : src(reinterpret_cast<T*>(S.data), S.step)
+            , dst(D), row1(S.rows - 1), col1(S.cols - 1)
+        {}
+    };
+
+    template <typename T, typename Coeff>
+    struct AntiBase : public SamplerBase<T>
+    {
+        static_assert(Coeff::ksize % 2 == 0, "");
+
+        float xscale, yscale;
+        int xstart, xend, ystart, yend;
+        Coeff coeff;
+
+        AntiBase(PtrStepSzb const& S, PtrStepSzb const& D,
+            Point2f const& scale, float A)
+            : SamplerBase<T>(S, D), coeff(A)
+        {
+            int const khalf = Coeff::ksize / 2;
+            xscale = std::min(scale.x, 1.f);
+            yscale = std::min(scale.y, 1.f);
+            xstart = cvFloor(-khalf / xscale) + 1;
+            xend = 2 - xstart;
+            ystart = cvFloor(-khalf / yscale) + 1;
+            yend = 2 - ystart;
+        }
+    };
+
+    ////////// nearest neighbor //////////
+
+    template <typename T>
+    struct NearestVec : public SamplerBase<T>
+    {
+        using SamplerBase<T>::SamplerBase;
+
+        __device__ void to(int sx, int sy, int dx, int dy) const
+        {
+            sx = clamp(sx, 0, col1);
+            sy = clamp(sy, 0, row1);
+            dst_at(dy, dx) = src(sy, sx);
+        }
+    };
+
+    struct NearestSize : public SamplerBase<uchar>
+    {
+        size_t esz;
+
+        NearestSize(PtrStepSzb const& S, PtrStepSzb const& D, size_t sz)
+            : SamplerBase(S, D), esz(sz)
+        {}
+
+        __device__ void to(int sx, int sy, int dx, int dy) const
+        {
+            sx = clamp(sx, 0, col1);
+            sy = clamp(sy, 0, row1);
+            uchar const* S = src.ptr(sy) + sx * esz;
+            uchar      * D = dst_ptr(dy) + dx * esz;
+            for (size_t i = 0; i < esz; ++i)
+                D[i] = S[i];
+        }
+    };
+
+    ////////// anti-alias brute force //////////
+
+    template <typename T, typename W, typename Coeff>
+    struct AntiVec : public AntiBase<T, Coeff>
+    {
+        using AntiBase<T, Coeff>::AntiBase;
+
+        __device__ void to(float fx, float fy, int dx, int dy) const
+        {
+            int ix = __float2int_rd(fx), iy = __float2int_rd(fy);
+            float rx = fx - ix, ry = fy - iy;
+            float weight = 0;
+            W sumval = VecTraits<W>::all(0);
+            for (int h = ystart; h < yend; ++h)
+            {
+                float wline = 0;
+                W sline = VecTraits<W>::all(0);
+                int sy = clamp(iy + h, 0, row1);
+                T const* S = src.ptr(sy);
+                for (int w = xstart; w < xend; ++w)
+                {
+                    int sx = clamp(ix + w, 0, col1);
+                    float t = coeff.at((w - rx) * xscale);
+                    wline += t;
+                    sline += t * saturate_cast<W>(S[sx]);
+                }
+                float u = coeff.at((h - ry) * yscale);
+                weight += u * wline;
+                sumval += u * sline;
+            }
+            dst_at(dy, dx) = saturate_cast<T>(sumval / weight);
+        }
+    };
+
+    template <typename T, typename W, typename Coeff>
+    struct AntiCn : public AntiBase<T, Coeff>
+    {
+        int cn;
+
+        AntiCn(PtrStepSzb const& S, PtrStepSzb const& D,
+            Point2f const& scale, float A, int _cn)
+            : AntiBase<T, Coeff>(S, D, scale, A), cn(_cn)
+        {}
+
+        __device__ void to(float fx, float fy, int dx, int dy) const
+        {
+            int ix = __float2int_rd(fx), iy = __float2int_rd(fy);
+            float rx = fx - ix, ry = fy - iy;
+            float weight = 0;
+            W sumval = 0;
+            T* D = dst_ptr(dy) + dx * cn;
+            for (int h = ystart; h < yend; ++h)
+            {
+                float wline = 0;
+                W sline = 0;
+                int sy = clamp(iy + h, 0, row1);
+                T const* S = src.ptr(sy);
+                for (int w = xstart; w < xend; ++w)
+                {
+                    int sx = clamp(ix + w, 0, col1) * cn;
+                    W t = coeff.at((w - rx) * xscale);
+                    wline += t;
+                    sline += t * S[sx];
+                }
+                W u = coeff.at((h - ry) * yscale);
+                weight += u * wline;
+                sumval += u * sline;
+            }
+            D[0] = saturate_cast<T>(sumval / weight);
+            for (int i = 1; i < cn; ++i)
+            {
+                sumval = 0;
+                for (int h = ystart; h < yend; ++h)
+                {
+                    W sline = 0;
+                    int sy = clamp(iy + h, 0, row1);
+                    T const* S = src.ptr(sy) + i;
+                    for (int w = xstart; w < xend; ++w)
+                    {
+                        int sx = clamp(ix + w, 0, col1) * cn;
+                        W t = coeff.at((w - rx) * xscale);
+                        sline += t * S[sx];
+                    }
+                    W u = coeff.at((h - ry) * yscale);
+                    sumval += u * sline;
+                }
+                D[i] = saturate_cast<T>(sumval / weight);
+            }
+        }
+    };
+
+    ////////// bi-linear //////////
+
+    template <typename T, typename W>
+    struct LinearVec : public SamplerBase<T>
+    {
+        using SamplerBase<T>::SamplerBase;
+
+        __device__ void to(float fx, float fy, int dx, int dy) const
+        {
+            int ix = __float2int_rd(fx), iy = __float2int_rd(fy);
+            float u1 = fx - ix, v1 = fy - iy;
+            float u0 = 1.f - u1, v0 = 1.f - v1;
+            int x0 = ::max(ix, 0);
+            int y0 = ::max(iy, 0);
+            int x1 = ::min(ix + 1, col1);
+            int y1 = ::min(iy + 1, row1);
+            W s0 = saturate_cast<W>(src(y0, x0)), s1 = saturate_cast<W>(src(y0, x1));
+            W s2 = saturate_cast<W>(src(y1, x0)), s3 = saturate_cast<W>(src(y1, x1));
+            W val = (u0 * v0) * s0 + (u1 * v0) * s1 + (u0 * v1) * s2 + (u1 * v1) * s3;
+            dst_at(dy, dx) = saturate_cast<T>(val);
+        }
+    };
+
+    template <typename T, typename W>
+    struct LinearCn : public SamplerBase<T>
+    {
+        int cn;
+
+        LinearCn(PtrStepSzb const& S, PtrStepSzb const& D, int _cn)
+            : SamplerBase<T>(S, D), cn(_cn)
+        {}
+
+        __device__ void to(float fx, float fy, int dx, int dy) const
+        {
+            int ix = __float2int_rd(fx), iy = __float2int_rd(fy);
+            float u1 = fx - ix, v1 = fy - iy;
+            float u0 = 1.f - u1, v0 = 1.f - v1;
+            int x0 = ::max(ix, 0);
+            int y0 = ::max(iy, 0);
+            int x1 = ::min(ix + 1, col1);
+            int y1 = ::min(iy + 1, row1);
+            W coeff[4] = {u0 * v0, u1 * v0, u0 * v1, u1 * v1};
+            T const* S0 = src.ptr(y0) + x0 * cn;
+            T const* S1 = src.ptr(y0) + x1 * cn;
+            T const* S2 = src.ptr(y1) + x0 * cn;
+            T const* S3 = src.ptr(y1) + x1 * cn;
+            T      * D  = dst_ptr(dy) + dx * cn;
+            for (int i = 0; i < cn; ++i)
+            {
+                D[i] = saturate_cast<T>(coeff[0] * S0[i]
+                    + coeff[1] * S1[i] + coeff[2] * S2[i] + coeff[3] * S3[i]);
+            }
+        }
+    };
+
+    template <typename T, typename W>
+    using LinearAntiVec = AntiVec<T, W, LinearCoeff>;
+
+    template <typename T, typename W>
+    using LinearAntiCn = AntiCn<T, W, LinearCoeff>;
+
+    ////////// bi-cubic //////////
+
+    template <typename T, typename W>
+    struct CubicVec : public SamplerBase<T>
+    {
+        CubicCoeff cubic;
+
+        CubicVec(PtrStepSzb const& S, PtrStepSzb const& D, float A)
+            : SamplerBase<T>(S, D), cubic(A)
+        {}
+
+        __device__ void to(float fx, float fy, int dx, int dy) const
+        {
+            int xstart = __float2int_rd(fx) - 1;
+            int ystart = __float2int_rd(fy) - 1;
+            int xlimit = xstart + 3;
+            int ylimit = ystart + 3;
+            int xoffset[4];
+            float xcoeff[4];
+            for (int x = xstart; x <= xlimit; ++x)
+            {
+                xoffset[x - xstart] = clamp(x, 0, col1);
+                xcoeff[x - xstart] = cubic.at(x - fx);
+            }
+            W sumval = VecTraits<W>::all(0);
+            for (int y = ystart; y <= ylimit; ++y)
+            {
+                int yoffest = clamp(y, 0, row1);
+                T const* S = src.ptr(yoffest);
+                W sline = VecTraits<W>::all(0);
+                for (int x = 0; x < 4; ++x)
+                    sline += xcoeff[x] * saturate_cast<W>(S[xoffset[x]]);
+                sumval += sline * cubic.at(y - fy);
+            }
+            dst_at(dy, dx) = saturate_cast<T>(sumval);
+        }
+    };
+
+    template <typename T, typename W>
+    struct CubicCn : public SamplerBase<T>
+    {
+        CubicCoeff cubic;
+        int cn;
+
+        CubicCn(PtrStepSzb const& S, PtrStepSzb const& D, float A, int _cn)
+            : SamplerBase<T>(S, D), cubic(A), cn(_cn)
+        {}
+
+        __device__ void to(float fx, float fy, int dx, int dy) const
+        {
+            int xstart = __float2int_rd(fx) - 1;
+            int ystart = __float2int_rd(fy) - 1;
+            int xlimit = xstart + 3;
+            int ylimit = ystart + 3;
+            int xoffset[4], yoffset[4];
+            W xcoeff[4], ycoeff[4];
+            for (int x = xstart; x <= xlimit; ++x)
+            {
+                xoffset[x - xstart] = clamp(x, 0, col1) * cn;
+                xcoeff[x - xstart] = cubic.at(x - fx);
+            }
+            for (int y = ystart; y <= ylimit; ++y)
+            {
+                yoffset[y - ystart] = clamp(y, 0, row1);
+                ycoeff[y - ystart] = cubic.at(y - fy);
+            }
+            T* D = dst_ptr(dy) + dx * cn;
+            for (int i = 0; i < cn; ++i)
+            {
+                W sumval = 0;
+                for (int y = 0; y < 4; ++y)
+                {
+                    T const* S = src.ptr(yoffset[y]) + i;
+                    W sline = 0;
+                    for (int x = 0; x < 4; ++x)
+                        sline += xcoeff[x] * S[xoffset[x]];
+                    sumval += sline * ycoeff[y];
+                }
+                D[i] = saturate_cast<T>(sumval);
+            }
+        }
+    };
+
+    template <typename T, typename W>
+    using CubicAntiVec = AntiVec<T, W, CubicCoeff>;
+
+    template <typename T, typename W>
+    using CubicAntiCn = AntiCn<T, W, CubicCoeff>;
+
+    ////////// generic //////////
+
+    template <typename Sampler>
+    __global__ void sampleKernel(Matx22f const M, Sampler const sampler)
+    {
+        int dx = blockDim.x * blockIdx.x + threadIdx.x;
+        int dy = blockDim.y * blockIdx.y + threadIdx.y;
+        if (dx < sampler.dst.cols && dy < sampler.dst.rows)
+        {
+            float fx = ::fmaf(static_cast<float>(dx), M.val[0], M.val[1]);
+            float fy = ::fmaf(static_cast<float>(dy), M.val[2], M.val[3]);
+            sampler.to(fx, fy, dx, dy);
+        }
+    }
+
+    //==================== nearest neighbor  ====================//
+
+    struct RoundUp
+    {
+        __device__ __forceinline__ int operator()(float x) const
+        { return __float2int_ru(x); }
+    };
+
+    struct RoundDown
+    {
+        __device__ __forceinline__ int operator()(float x) const
+        { return __float2int_rd(x); }
+    };
+
+    template <typename RoundOp, typename Sampler>
+    __global__ void nnBySampler(
+        RoundOp const R, Sampler const sampler, Matx22f const M, float const offset)
+    {
+        int dx = blockDim.x * blockIdx.x + threadIdx.x;
+        int dy = blockDim.y * blockIdx.y + threadIdx.y;
+        if (dx < sampler.dst.cols && dy < sampler.dst.rows)
+        {
+            int sx = R(::fmaf(static_cast<float>(dx), M.val[0], M.val[1]) + offset);
+            int sy = R(::fmaf(static_cast<float>(dy), M.val[2], M.val[3]) + offset);
+            sampler.to(sx, sy, dx, dy);
+        }
+    }
+
+    template <typename RoundOp>
+    void nnByRound(size_t esz, PtrStepSzb const& src, PtrStepSzb dst,
+        Matx22f const& M, float offset, cudaStream_t stream)
+    {
+        RoundOp R;
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        if (esz == 1)
+            nnBySampler<<<grid, block, 0, stream>>>(R, NearestVec<uchar>(src, dst), M, offset);
+        else if (esz == 2)
+            nnBySampler<<<grid, block, 0, stream>>>(R, NearestVec<ushort>(src, dst), M, offset);
+        else if (esz == 3)
+            nnBySampler<<<grid, block, 0, stream>>>(R, NearestVec<uchar3>(src, dst), M, offset);
+        else if (esz == 4)
+            nnBySampler<<<grid, block, 0, stream>>>(R, NearestVec<uint>(src, dst), M, offset);
+        else if (esz == 6)
+            nnBySampler<<<grid, block, 0, stream>>>(R, NearestVec<ushort3>(src, dst), M, offset);
+        else if (esz == 8)
+            nnBySampler<<<grid, block, 0, stream>>>(R, NearestVec<uint2>(src, dst), M, offset);
+        else if (esz == 12)
+            nnBySampler<<<grid, block, 0, stream>>>(R, NearestVec<uint3>(src, dst), M, offset);
+        else if (esz == 16)
+            nnBySampler<<<grid, block, 0, stream>>>(R, NearestVec<uint4>(src, dst), M, offset);
+        else
+            nnBySampler<<<grid, block, 0, stream>>>(R, NearestSize(src, dst, esz), M, offset);
+    }
+
+    void resizeOnnxNN(size_t elemSize, PtrStepSzb const& src, PtrStepSzb const& dst,
+        Matx22f const& M, int mode, cudaStream_t stream)
+    {
+        float offset = 0.f;
+        if (mode == INTER_NEAREST_PREFER_FLOOR)
+            offset = -0.5f;
+        if (mode == INTER_NEAREST_PREFER_CEIL)
+            offset = +0.5f;
+
+        if (mode == INTER_NEAREST_PREFER_FLOOR ||
+            mode == INTER_NEAREST_CEIL)
+            nnByRound<RoundUp>(elemSize, src, dst, M, offset, stream);
+        else
+            nnByRound<RoundDown>(elemSize, src, dst, M, offset, stream);
+        if (!stream)
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
+
+    //==================== linear ====================//
+
+    template <typename T, typename W>
+    void linearDispatch(int cn, PtrStepSzb const& src, PtrStepSzb const& dst,
+        Matx22f const& M, cudaStream_t stream)
+    {
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        if (cn == 1)
+            sampleKernel<<<block, grid, 0, stream>>>(M,
+                LinearVec<T, W>(src, dst));
+        else if (cn == 2)
+            sampleKernel<<<block, grid, 0, stream>>>(M,
+                LinearVec<TypeVecT<T, 2>, TypeVecT<W, 2>>(src, dst));
+        else if (cn == 3)
+            sampleKernel<<<block, grid, 0, stream>>>(M,
+                LinearVec<TypeVecT<T, 3>, TypeVecT<W, 3>>(src, dst));
+        else if (cn == 4)
+            sampleKernel<<<block, grid, 0, stream>>>(M,
+                LinearVec<TypeVecT<T, 4>, TypeVecT<W, 4>>(src, dst));
+        else
+            sampleKernel<<<block, grid, 0, stream>>>(M,
+                LinearCn<T, W>(src, dst, cn));
+    }
+
+    template <typename T, typename W>
+    void linearAntiDispatch(int cn, PtrStepSzb const& src, PtrStepSzb const& dst,
+        Matx22f const& M, Point2f const& scale, cudaStream_t stream)
+    {
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        if (cn == 1)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, LinearAntiVec<T, W>(src, dst, scale, 0));
+        else if (cn == 2)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, LinearAntiVec<TypeVecT<T, 2>, TypeVecT<W, 2>>(src, dst, scale, 0));
+        else if (cn == 3)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, LinearAntiVec<TypeVecT<T, 3>, TypeVecT<W, 3>>(src, dst, scale, 0));
+        else if (cn == 4)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, LinearAntiVec<TypeVecT<T, 4>, TypeVecT<W, 4>>(src, dst, scale, 0));
+        else
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, LinearAntiCn<T, W>(src, dst, scale, 0, cn));
+    }
+
+    //==================== cubic  ====================//
+
+    template <typename T, typename W>
+    void cubicDispatch(int cn, float A, PtrStepSzb const& src,
+        PtrStepSzb const& dst, Matx22f const& M, cudaStream_t stream)
+    {
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        if (cn == 1)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicVec<T, W>(src, dst, A));
+        else if (cn == 2)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicVec<TypeVecT<T, 2>, TypeVecT<W, 2>>(src, dst, A));
+        else if (cn == 3)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicVec<TypeVecT<T, 3>, TypeVecT<W, 3>>(src, dst, A));
+        else if (cn == 4)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicVec<TypeVecT<T, 4>, TypeVecT<W, 4>>(src, dst, A));
+        else
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicCn<T, W>(src, dst, A, cn));
+    }
+
+    template <typename T, typename W>
+    void cubicAntiDispatch(int cn, float A,
+        PtrStepSzb const& src, PtrStepSzb const& dst,
+        Matx22f const& M, Point2f const& scale, cudaStream_t stream)
+    {
+        dim3 block(32, 8);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
+        if (cn == 1)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicAntiVec<T, W>(src, dst, scale, A));
+        else if (cn == 2)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicAntiVec<TypeVecT<T, 2>, TypeVecT<W, 2>>(src, dst, scale, A));
+        else if (cn == 3)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicAntiVec<TypeVecT<T, 3>, TypeVecT<W, 3>>(src, dst, scale, A));
+        else if (cn == 4)
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicAntiVec<TypeVecT<T, 4>, TypeVecT<W, 4>>(src, dst, scale, A));
+        else
+            sampleKernel<<<block, grid, 0, stream>>>(
+                M, CubicAntiCn<T, W>(src, dst, scale, A, cn));
+    }
+
+template <typename T, typename W>
+void resizeOnnx(int cn, float A,
+    PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+    Point2f const& scale, int interpolation, cudaStream_t stream)
+{
+    int sampler = interpolation & INTER_SAMPLER_MASK;
+    int antialias = interpolation & INTER_ANTIALIAS_MASK;
+    if (sampler == INTER_LINEAR && !antialias)
+        linearDispatch<T, W>(cn, src, dst, M, stream);
+    else if (sampler == INTER_LINEAR && antialias)
+        linearAntiDispatch<T, W>(cn, src, dst, M, scale, stream);
+    else if (sampler == INTER_CUBIC && !antialias)
+        cubicDispatch<T, W>(cn, A, src, dst, M, stream);
+    else if (sampler == INTER_CUBIC && antialias)
+        cubicAntiDispatch<T, W>(cn, A, src, dst, M, scale, stream);
+    else
+        CV_Error(cv::Error::StsBadArg, "unsupported interpolation");
+
+    if (!stream)
+        cudaSafeCall(cudaDeviceSynchronize());
+}
+
+template void resizeOnnx<uchar, float>(int cn, float A,
+    PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+    Point2f const& scale, int interpolation, cudaStream_t stream);
+
+template void resizeOnnx<schar, float>(int cn, float A,
+    PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+    Point2f const& scale, int interpolation, cudaStream_t stream);
+
+template void resizeOnnx<ushort, float>(int cn, float A,
+    PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+    Point2f const& scale, int interpolation, cudaStream_t stream);
+
+template void resizeOnnx<short, float>(int cn, float A,
+    PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+    Point2f const& scale, int interpolation, cudaStream_t stream);
+
+template void resizeOnnx<int, double>(int cn, float A,
+    PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+    Point2f const& scale, int interpolation, cudaStream_t stream);
+
+template void resizeOnnx<float, float>(int cn, float A,
+    PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+    Point2f const& scale, int interpolation, cudaStream_t stream);
+
+template void resizeOnnx<double, double>(int cn, float A,
+    PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+    Point2f const& scale, int interpolation, cudaStream_t stream);
+
+/*template void resizeOnnx<__half, float>(int cn, float A,
+    PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+    Point2f const& scale, int interpolation, cudaStream_t stream);*/
+}}}
+
+#endif /* CUDA_DISABLER */
diff --git a/modules/cudawarping/src/resize.cpp b/modules/cudawarping/src/resize.cpp
index 9943a6cdc6..4b9de23cfe 100644
--- a/modules/cudawarping/src/resize.cpp
+++ b/modules/cudawarping/src/resize.cpp
@@ -46,14 +46,77 @@
 
 void cv::cuda::resize(InputArray, OutputArray, Size, double, double, int, Stream&) { throw_no_cuda(); }
 
+void resizeOnnx(InputArray /*_src*/, OutputArray /*_dst*/,
+    Size /*dsize*/, Point2d /*scale*/, int /*interpolation*/,
+    float /*cubicCoeff*/, Rect2d const& /*roi*/, Stream& /*stream*/)
+    { throw_no_cuda(); }
+
 #else // HAVE_CUDA
 
 namespace cv { namespace cuda { namespace device
 {
-    template <typename T>
-    void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+template <typename T>
+void resize(const PtrStepSzb& src, const PtrStepSzb& srcWhole, int yoff, int xoff, const PtrStepSzb& dst, float fy, float fx, int interpolation, cudaStream_t stream);
+
+template <typename T, typename W>
+void resizeOnnx(int cn, float A, PtrStepSzb const& src, PtrStepSzb const& dst,
+    Matx22f const& M, Point2f const& scale, int interpolation, cudaStream_t stream);
+
+void resizeOnnxNN(size_t elemSize,PtrStepSzb const& src, PtrStepSzb const& dst,
+    Matx22f const& M, int mode, cudaStream_t stream);
 }}}
 
+namespace cv
+{
+static Vec2f interCoordinate(int coordinate, int dst, int src, double scale, double start, double end)
+{
+    float a, b;
+    if (coordinate == INTER_HALF_PIXEL
+        || coordinate == INTER_HALF_PIXEL_SYMMETRIC
+        || coordinate == INTER_HALF_PIXEL_PYTORCH)
+    {
+        a = static_cast<float>(1.0 / scale);
+        b = static_cast<float>(0.5 / scale - 0.5);
+        if (coordinate == INTER_HALF_PIXEL_SYMMETRIC)
+            b += static_cast<float>(0.5 * (src - dst / scale));
+        if (coordinate == INTER_HALF_PIXEL_PYTORCH && dst <= 1)
+        {
+            a = 0.f;
+            b = -0.5f;
+        }
+    }
+    else if (coordinate == INTER_ALIGN_CORNERS)
+    {
+        a = static_cast<float>((src - 1.0) / (src * scale - 1.0));
+        b = 0.f;
+    }
+    else if (coordinate == INTER_ASYMMETRIC)
+    {
+        a = static_cast<float>(1.0 / scale);
+        b = 0.f;
+    }
+    else if (coordinate == INTER_TF_CROP_RESIZE)
+    {
+        CV_CheckGE(start, 0.0, "roi's start is out of image");
+        CV_CheckLE(end, 1.0, "roi's end   is out of image");
+        CV_CheckLT(start, end, "roi's start must be less than its end");
+        if (dst <= 1)
+        {
+            a = 0.f;
+            b = static_cast<float>(0.5 * (start + end) * (src - 1.0));
+        }
+        else
+        {
+            a = static_cast<float>((end - start) * (src - 1.0) / (src * scale - 1.0));
+            b = static_cast<float>(start * (src - 1.0));
+        }
+    }
+    else
+        CV_Error(Error::StsBadArg, format("Unknown coordinate transformation mode %d", coordinate));
+    return Vec2f(a, b);
+}
+}
+
 void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx, double fy, int interpolation, Stream& stream)
 {
     GpuMat src = _src.getGpuMat();
@@ -105,4 +168,102 @@ void cv::cuda::resize(InputArray _src, OutputArray _dst, Size dsize, double fx,
     func(src, wholeSrc, ofs.y, ofs.x, dst, static_cast<float>(1.0 / fy), static_cast<float>(1.0 / fx), interpolation, StreamAccessor::getStream(stream));
 }
 
+
+void cv::cuda::resizeOnnx(InputArray _src, OutputArray _dst, Size dsize, Point2d scale,
+    int interpolation, float cubicCoeff, Rect2d const& roi, Stream& stream)
+{
+    GpuMat src = _src.getGpuMat();
+    Size ssize = _src.size();
+    CV_CheckEQ(_src.dims(), 2, "only 2 dim image is support now");
+    CV_CheckFalse(ssize.empty(), "src size must not be empty");
+    if (dsize.empty())
+    {
+        CV_CheckGT(scale.x, 0.0, "scale must > 0 if no dsize given");
+        CV_CheckGT(scale.y, 0.0, "scale must > 0 if no dsize given");
+        dsize.width = static_cast<int>(scale.x * ssize.width);
+        dsize.height = static_cast<int>(scale.y * ssize.height);
+    }
+    if (scale.x == 0 || scale.y == 0)
+    {
+        scale.x = static_cast<double>(dsize.width) / ssize.width;
+        scale.y = static_cast<double>(dsize.height) / ssize.height;
+    }
+    CV_CheckFalse(dsize.empty(), "dst size must not empty");
+    CV_CheckGT(scale.x, 0.0, "require computed or given scale > 0");
+    CV_CheckGT(scale.y, 0.0, "require computed or given scale > 0");
+
+    int sampler = interpolation & INTER_SAMPLER_MASK;
+    int nearest = interpolation & INTER_NEAREST_MODE_MASK;
+    int coordinate = interpolation & INTER_COORDINATE_MASK;
+    CV_Assert(
+        sampler == INTER_NEAREST ||
+        sampler == INTER_LINEAR ||
+        sampler == INTER_CUBIC);
+    CV_Assert(
+        nearest == INTER_NEAREST_PREFER_FLOOR ||
+        nearest == INTER_NEAREST_PREFER_CEIL ||
+        nearest == INTER_NEAREST_FLOOR ||
+        nearest == INTER_NEAREST_CEIL);
+    CV_Assert(
+        coordinate == INTER_HALF_PIXEL ||
+        coordinate == INTER_HALF_PIXEL_PYTORCH ||
+        coordinate == INTER_HALF_PIXEL_SYMMETRIC ||
+        coordinate == INTER_ALIGN_CORNERS ||
+        coordinate == INTER_ASYMMETRIC ||
+        coordinate == INTER_TF_CROP_RESIZE);
+
+    _dst.create(dsize, _src.type());
+    GpuMat dst = _dst.getGpuMat();
+    if (dsize == ssize && coordinate != INTER_TF_CROP_RESIZE)
+    {
+        src.copyTo(dst, stream);
+        return;
+    }
+    if (scale.x >= 1.0 && scale.y >= 1.0)
+        interpolation &= ~INTER_ANTIALIAS_MASK;
+
+    Point2f scalef = static_cast<Point2f>(scale);
+    Matx22f M;
+    Vec2f xcoef = interCoordinate(
+        coordinate, dsize.width, ssize.width, scale.x, roi.x, roi.x + roi.width);
+    Vec2f ycoef = interCoordinate(
+        coordinate, dsize.height, ssize.height, scale.y, roi.y, roi.y + roi.height);
+    M(0, 0) = xcoef[0];
+    M(0, 1) = xcoef[1];
+    M(1, 0) = ycoef[0];
+    M(1, 1) = ycoef[1];
+
+    if (sampler == INTER_NEAREST)
+    {
+        device::resizeOnnxNN(src.elemSize(),
+            src, dst, M, nearest, StreamAccessor::getStream(stream));
+        return;
+    }
+
+    int depth = src.depth(), cn = src.channels();
+    CV_CheckDepth(depth, depth <= CV_64F,
+        "only support float in cuda kernel when not use nearest sampler");
+
+    using Func = void(*)(int cn, float A,
+        PtrStepSzb const& src, PtrStepSzb const& dst, Matx22f const& M,
+        Point2f const& scale, int interpolation, cudaStream_t stream);
+    static Func const funcs[CV_DEPTH_MAX] =
+    {
+        device::resizeOnnx<uchar, float>,
+        device::resizeOnnx<schar, float>,
+        device::resizeOnnx<ushort, float>,
+        device::resizeOnnx<short, float>,
+        device::resizeOnnx<int, double>,
+        device::resizeOnnx<float, float>,
+        device::resizeOnnx<double, double>,
+        /*device::resizeOnnx<__half, float>*/ nullptr,
+    };
+
+    Func const func = funcs[depth];
+    if (!func)
+        CV_Error(Error::StsUnsupportedFormat, "Unsupported depth");
+    func(cn, cubicCoeff, src, dst, M, scalef, interpolation,
+        StreamAccessor::getStream(stream));
+}
+
 #endif // HAVE_CUDA
diff --git a/modules/cudawarping/test/test_resize.cpp b/modules/cudawarping/test/test_resize.cpp
index 768ad09f98..98156dbc22 100644
--- a/modules/cudawarping/test/test_resize.cpp
+++ b/modules/cudawarping/test/test_resize.cpp
@@ -260,6 +260,113 @@ INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeTextures, testing::Combine(
     ALL_DEVICES,
     testing::Values(Interpolation(cv::INTER_NEAREST), Interpolation(cv::INTER_LINEAR), Interpolation(cv::INTER_CUBIC))));
 
+PARAM_TEST_CASE(ResizeOnnx, cv::cuda::DeviceInfo, MatType, double, double, int, UseRoi)
+{
+    cv::cuda::DeviceInfo devInfo;
+    int depth, interpolation;
+    double fx, fy;
+    bool useRoi;
+
+    Rect src_loc, dst_loc;
+    Mat src, dst, src_roi, dst_roi;
+    GpuMat gsrc, gdst, gsrc_roi, gdst_roi;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        depth = GET_PARAM(1);
+        fx = GET_PARAM(2);
+        fy = GET_PARAM(3);
+        interpolation = GET_PARAM(4);
+        useRoi = GET_PARAM(5);
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+
+    void random_submat(int type,
+        Size& size, Rect& roi, Mat& mat, Mat& sub, GpuMat& gmat, GpuMat& gsub)
+    {
+        int border = useRoi ? 65 : 0;
+        roi.x = randomInt(0, border);
+        roi.y = randomInt(0, border);
+        roi.width = size.width;
+        roi.height = size.height;
+        size.width += roi.x + randomInt(0, border);
+        size.height += roi.y + randomInt(0, border);
+        mat = randomMat(size, type, -127, 127);
+        gmat.upload(mat);
+        sub = mat(roi);
+        gsub = gmat(roi);
+    }
+
+    void random_roi(int type)
+    {
+        Size srcSize, dstSize;
+        int minSize = min(fx, fy) < 1.0 ? 10 : 1;
+        while (dstSize.empty())
+        {
+            srcSize = randomSize(minSize, 129);
+            dstSize.width = cvRound(srcSize.width * fx);
+            dstSize.height = cvRound(srcSize.height * fy);
+        }
+
+        random_submat(type, srcSize, src_loc, src, src_roi, gsrc, gsrc_roi);
+        random_submat(type, dstSize, dst_loc, dst, dst_roi, gdst, gdst_roi);
+    }
+};
+
+CUDA_TEST_P(ResizeOnnx, Accuracy)
+{
+    Mat host, host_roi;
+    double eps = depth <= CV_32S ? 1 : 5e-2;
+
+    for (int cn = 1; cn <= 6; ++cn)
+    {
+        int type = CV_MAKETYPE(depth, cn);
+        random_roi(type);
+
+        cv::resizeOnnx(src_roi, dst_roi, dst_roi.size(), Point2d(fx, fy), interpolation);
+        cv::cuda::resizeOnnx(gsrc_roi, gdst_roi, dst_roi.size(), Point2d(fx, fy), interpolation);
+
+        gdst.download(host);
+        host_roi = host(dst_loc);
+        string info = cv::format(
+            "fail on type %sC%d src %dx%d dst %dx%d src_roi %dx%d dst_roi %dx%d",
+            depthToString(depth), cn, src.cols, src.rows, dst.cols, dst.rows,
+            src_roi.cols, src_roi.rows, dst_roi.cols, dst_roi.rows);
+        EXPECT_MAT_NEAR(dst_roi, host_roi, eps) << info;
+    }
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Warping, ResizeOnnx, Combine(
+    ALL_DEVICES,
+    Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F),
+    Values(0.4, 0.27, 1.6),
+    Values(0.5, 0.71, 2.7),
+    Values((int)(INTER_LINEAR), (int)(INTER_CUBIC)),
+    WHOLE_SUBMAT));
+
+INSTANTIATE_TEST_CASE_P(CUDA_Warping_Antialias, ResizeOnnx, Combine(
+    ALL_DEVICES,
+    Values(CV_8U, CV_8S, CV_16U, CV_16S, CV_32F),
+    Values(0.4, 0.27, 1.6),
+    Values(0.5, 0.71, 2.7),
+    Values(
+        (int)(INTER_ANTIALIAS | INTER_LINEAR),
+        (int)(INTER_ANTIALIAS | INTER_CUBIC)),
+    WHOLE_SUBMAT));
+
+INSTANTIATE_TEST_CASE_P(CUDA_Warping_Nearest, ResizeOnnx, Combine(
+    ALL_DEVICES,
+    Values(CV_8S, CV_16S, CV_32F, CV_64F),
+    Values(0.4, 0.27, 1.6),
+    Values(0.5, 0.71, 2.7),
+    Values(
+        (int)(INTER_NEAREST | INTER_NEAREST_PREFER_FLOOR),
+        (int)(INTER_NEAREST | INTER_NEAREST_PREFER_CEIL),
+        (int)(INTER_NEAREST | INTER_NEAREST_CEIL),
+        (int)(INTER_NEAREST | INTER_NEAREST_FLOOR)),
+    WHOLE_SUBMAT));
 
 }} // namespace
+
 #endif // HAVE_CUDA