From 1eaf0b519839ad14cac1325f64be63e3005b2cce Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 10:23:36 +0800
Subject: [PATCH 01/15] add onnxruntime custom op grid_sample

---
 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp  | 307 ++++++++++++++++++
 .../onnxruntime/cpu/onnxruntime_register.cpp  |   7 +
 mmcv/ops/csrc/onnxruntime/grid_sample.h       |  43 +++
 tests/test_ops/test_onnx.py                   |  38 +++
 4 files changed, 395 insertions(+)
 create mode 100644 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
 create mode 100644 mmcv/ops/csrc/onnxruntime/grid_sample.h
diff --git a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
new file mode 100644
index 00000000000..fe30f2a49a5
--- /dev/null
+++ b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
@@ -0,0 +1,307 @@
+#include "grid_sample.h"
+
+#include "../ort_mmcv_utils.h"
+
+#include <cmath>
+
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#define MAX(a,b) (((a)<(b))?(b):(a))
+#define CLIP_COORDINATES(in, out, clip_limit) out=MIN((clip_limit-1), MAX(in,0))
+
+GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo* info)
+    : api_(api), ort_(api_), info_(info) {
+  align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
+  interpolation_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
+  padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
+
+  allocator_ = Ort::AllocatorWithDefaultOptions();
+}
+
+enum GridSamplerInterpolation {
+    Bilinear = 0,
+    Nearest = 1,
+    Bicubic = 2};
+enum GridSamplerPadding {
+    Zeros = 0,
+    Border = 1,
+    Reflection = 2};
+
+template<typename scalar_t>
+static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, bool align_corners) {
+    if (align_corners) {
+        return ((coord + 1) / 2) * (size - 1);
+    } else {
+        return ((coord + 1) * size - 1) / 2;
+    }
+}
+
+// Clips coordinates to between 0 and clip_limit - 1
+template<typename scalar_t>
+static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
+  return std::min(static_cast<scalar_t>(clip_limit - 1), std::max(in, static_cast<scalar_t>(0)));
+}
+
+// Reflects coordinates until they fall between low and high (inclusive).
+// The bounds are passed as twice their value so that half-integer values
+// can be represented as ints.
+template<typename scalar_t>
+static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
+                                           int64_t twice_high) {
+  if (twice_low == twice_high) {
+    return static_cast<scalar_t>(0);
+  }
+  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
+  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
+  in = std::fabs(in - min);
+  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
+  scalar_t extra = std::fmod(in, span);
+  int flips = static_cast<int>(std::floor(in / span));
+  if (flips % 2 == 0) {
+    return extra + min;
+  } else {
+    return span - extra + min;
+  }
+}
+
+template<typename scalar_t>
+static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
+                                           int64_t padding_mode,
+                                           bool align_corners) {
+    if (padding_mode == GridSamplerPadding::Border) {
+        coord = clip_coordinates(coord, size);
+    } else if (padding_mode == GridSamplerPadding::Reflection) {
+        if (align_corners) {
+            coord = reflect_coordinates(coord, 0, 2*(size - 1));
+        } else {
+            coord = reflect_coordinates(coord, -1, 2 * size - 1);
+        }
+        coord = clip_coordinates(coord, size);
+    }
+    return coord;
+}
+
+// Computes the pixel source index value for a grid coordinate
+template <typename scalar_t>
+static inline scalar_t grid_sampler_compute_source_index(
+    scalar_t coord,
+    int64_t size,
+    int64_t padding_mode,
+    bool align_corners) {
+  coord = grid_sampler_unnormalize(coord, size, align_corners);
+  coord = compute_coordinates(coord, size, padding_mode, align_corners);
+  return coord;
+}
+
+static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) {
+  return h >= 0 && h < H && w >= 0 && w < W;
+}
+
+template<typename scalar_t>
+static inline scalar_t get_value_bounded(
+    const scalar_t* data,
+    scalar_t x,
+    scalar_t y,
+    int64_t W,
+    int64_t H,
+    int64_t sW,
+    int64_t sH,
+    int64_t padding_mode,
+    bool align_corners) {
+
+  x = compute_coordinates(x, W, padding_mode, align_corners);
+  y = compute_coordinates(y, H, padding_mode, align_corners);
+
+  int64_t ix = static_cast<int64_t>(x);
+  int64_t iy = static_cast<int64_t>(y);
+
+  if (within_bounds_2d(iy, ix, H, W)) {
+    return data[iy * sH + ix * sW];
+  }
+  return static_cast<scalar_t>(0);
+}
+
+template <typename scalar_t>
+static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
+  return ((A + 2) * x - (A + 3)) * x * x + 1;
+}
+
+template <typename scalar_t>
+static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
+  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+}
+
+template <typename scalar_t>
+static inline void get_cubic_upsample_coefficients(
+    scalar_t coeffs[4],
+    scalar_t t) {
+  scalar_t A = -0.75;
+
+  scalar_t x1 = t;
+  coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
+  coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
+
+  // opposite coefficients
+  scalar_t x2 = 1.0 - t;
+  coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
+  coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
+}
+
+template <typename scalar_t>
+static inline scalar_t cubic_interp1d(
+    scalar_t x0,
+    scalar_t x1,
+    scalar_t x2,
+    scalar_t x3,
+    scalar_t t) {
+  scalar_t coeffs[4];
+  get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
+
+  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+}
+
+void GridSampleKernel::Compute(OrtKernelContext* context) {
+    const bool align_corners = align_corners_;
+    const int64_t padding_mode = padding_mode_;
+    const int64_t interpolation_mode = interpolation_mode_;
+
+    const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
+    const float* input_data =
+        reinterpret_cast<const float*>(ort_.GetTensorData<float>(input));
+
+    const OrtValue* grid = ort_.KernelContext_GetInput(context, 1);
+    const float* grid_data =
+        reinterpret_cast<const float*>(ort_.GetTensorData<float>(grid));
+
+    OrtTensorDimensions input_dims(ort_, input);
+    OrtTensorDimensions grid_dims(ort_, grid);
+    int64_t N = input_dims[0];
+    int64_t C = input_dims[1];
+    int64_t inp_H = input_dims[2];
+    int64_t inp_W = input_dims[3];
+    int64_t out_H = grid_dims[1];
+    int64_t out_W = grid_dims[2];
+
+    std::vector<int64_t> output_dims = {N, C, out_H, out_W};
+    OrtValue *output = ort_.KernelContext_GetOutput(context, 0,
+                                                    output_dims.data(),
+                                                    output_dims.size());
+    float* out_ptr = ort_.GetTensorMutableData<float>(output);
+
+    int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
+    int64_t inp_sC = input_dims[2] * input_dims[3];
+    int64_t inp_sH = input_dims[3];
+    int64_t inp_sW = 1;
+    int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
+    int64_t grid_sH = grid_dims[2] * grid_dims[3];
+    int64_t grid_sW = grid_dims[3];
+    int64_t grid_sCoor = 1;
+    int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
+    int64_t out_sC = output_dims[2] * output_dims[3];
+    int64_t out_sH = output_dims[3];
+    int64_t out_sW = 1;
+
+    // loop over each output pixel
+    for (int64_t n = 0; n < N; ++n) {
+        const float* grid_ptr_N = grid_data + n * grid_sN;
+        const float* inp_ptr_N = input_data + n * inp_sN;
+        for (int64_t h = 0; h < out_H; ++h) {
+            for (int64_t w = 0; w < out_W; ++w) {
+                const float* grid_ptr_NHW = grid_ptr_N + h * grid_sH  + w * grid_sW;
+                float x = *grid_ptr_NHW;
+                float y = grid_ptr_NHW[grid_sCoor];
+
+                float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners);
+                float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners);
+
+                if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
+                    // get corner pixel values from (x, y)
+                    // for 4d, we use north-east-south-west
+                    int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
+                    int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
+
+                    int64_t ix_ne = ix_nw + 1;
+                    int64_t iy_ne = iy_nw;
+
+                    int64_t ix_sw = ix_nw;
+                    int64_t iy_sw = iy_nw + 1;
+
+                    int64_t ix_se = ix_nw + 1;
+                    int64_t iy_se = iy_nw + 1;
+
+
+                    // get surfaces to each neighbor:
+                    float nw = (ix_se - ix)    * (iy_se - iy);
+                    float ne = (ix    - ix_sw) * (iy_sw - iy);
+                    float sw = (ix_ne - ix)    * (iy    - iy_ne);
+                    float se = (ix    - ix_nw) * (iy    - iy_nw);
+
+                    // calculate bilinear weighted pixel value and set output pixel
+                    const float *inp_ptr_NC = inp_ptr_N;
+                    float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
+                        auto res = static_cast<float>(0);
+                        if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
+                            res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
+                        }
+                        if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
+                            res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
+                        }
+                        if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
+                            res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
+                        }
+                        if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
+                            res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
+                        }
+                        *out_ptr_NCHW = res;
+                    }
+                } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
+                        int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
+                        int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
+
+                        // assign nearest neighbor pixel value to output pixel
+                        float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+                        const float *inp_ptr_NC = inp_ptr_N;
+                        for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
+                            if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
+                                *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
+                            } else {
+                                *out_ptr_NCHW = static_cast<float>(0);
+                            }
+                        }
+                } else if (interpolation_mode == GridSamplerInterpolation::Bicubic) {
+                    // grid_sampler_compute_source_index clips the index according to the padding mode,
+                    // which would make the bicubic calculation wrong, so only unnormalize here;
+                    // for example, -0.1 is clipped to 0 under border padding, but bicubic needs floor(-0.1) = -1.
+                    // Reflection padding is worse, since the -1 and +1 directions are not fixed at the boundary.
+                    ix = grid_sampler_unnormalize(x, inp_W, align_corners);
+                    iy = grid_sampler_unnormalize(y, inp_H, align_corners);
+
+                    float ix_nw = std::floor(ix);
+                    float iy_nw = std::floor(iy);
+
+                    const float tx = ix - ix_nw;
+                    const float ty = iy - iy_nw;
+
+                    const float *inp_ptr_NC = inp_ptr_N;
+                    float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
+                        float coefficients[4];
+
+                        // Interpolate 4 values in the x direction
+                        for (int64_t i = 0; i < 4; ++i) {
+                        coefficients[i] = cubic_interp1d<float>(
+                            get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                            get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                            get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                            get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                            tx);
+                        }
+
+                        // Interpolate in the y direction
+                        *out_ptr_NCHW = cubic_interp1d<float>(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty);
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp b/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp
index 94614c85574..06196b8110f 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp
@@ -4,11 +4,13 @@
 #include "ort_mmcv_utils.h"
 #include "roi_align.h"
 #include "soft_nms.h"
+#include "grid_sample.h"
 
 const char *c_MMCVOpDomain = "mmcv";
 SoftNmsOp c_SoftNmsOp;
 NmsOp c_NmsOp;
 MMCVRoiAlignCustomOp c_MMCVRoiAlignCustomOp;
+GridSampleOp c_GridSampleOp;
 
 OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                           const OrtApiBase *api) {
@@ -32,5 +34,10 @@ OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
     return status;
   }
 
+  if (auto status =
+          ortApi->CustomOpDomain_Add(domain, &c_GridSampleOp)) {
+    return status;
+  }
+
   return ortApi->AddCustomOpDomain(options, domain);
 }
diff --git a/mmcv/ops/csrc/onnxruntime/grid_sample.h b/mmcv/ops/csrc/onnxruntime/grid_sample.h
new file mode 100644
index 00000000000..b10555cf13d
--- /dev/null
+++ b/mmcv/ops/csrc/onnxruntime/grid_sample.h
@@ -0,0 +1,43 @@
+#ifndef ONNXRUNTIME_GRIDSAMPLE_H
+#define ONNXRUNTIME_GRIDSAMPLE_H
+
+#include <onnxruntime_cxx_api.h>
+
+struct GridSampleKernel {
+  GridSampleKernel(OrtApi api, const OrtKernelInfo *info);
+
+  void Compute(OrtKernelContext *context);
+
+ protected:
+  OrtApi api_;
+  Ort::CustomOpApi ort_;
+  const OrtKernelInfo *info_;
+  Ort::AllocatorWithDefaultOptions allocator_;
+
+  int64_t align_corners_;
+  int64_t interpolation_mode_;
+  int64_t padding_mode_;
+};
+
+struct GridSampleOp : Ort::CustomOpBase<GridSampleOp, GridSampleKernel> {
+    void* CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
+        return new GridSampleKernel(api, info);
+    };
+
+    const char* GetName() const { return "grid_sampler"; };
+
+    size_t GetInputTypeCount() const { return 2; };
+    ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
+        return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+    };
+
+    size_t GetOutputTypeCount() const { return 1; };
+    ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
+        return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+    };
+
+    const char* GetExecutionProviderType() const {
+        return "CPUExecutionProvider";
+    };
+};
+#endif
diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index cc1ccb82387..4faf1f95a2d 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -23,6 +23,44 @@ def forward(self, *args, **kwargs):
         return self.wrapped_function(*args, **kwargs)
 
 
+class GridSample(torch.nn.Module):
+
+    def forward(self, x, y):
+        res = torch.nn.functional.grid_sample(x, grid=y, align_corners=False)
+        return res
+
+
+def test_grid_sampler():
+    from mmcv.onnx.symbolic import register_extra_symbolics
+    register_extra_symbolics()
+    input = torch.ones(1, 1, 2, 2)
+    out_h = 4
+    out_w = 4
+    h = torch.linspace(-1, 1, out_h)
+    w = torch.linspace(-1, 1, out_w)
+    grid = torch.zeros(out_h, out_w, 2)
+    grid[:, :, 0] = w.unsqueeze(0).repeat(out_h, 1)
+    grid[:, :, 1] = h.unsqueeze(0).repeat(out_w, 1).transpose(0, 1)
+    grid = grid.unsqueeze(0).repeat(1, 1, 1, 1)
+
+    model = GridSample()
+    torch.onnx.export(model, (input, grid), onnx_file, opset_version=11)
+
+    pytorch_output = model(input, grid)
+
+    from mmcv.ops import get_onnxruntime_op_path
+    ort_custom_op_path = get_onnxruntime_op_path()
+    session_options = rt.SessionOptions()
+    if os.path.exists(ort_custom_op_path):
+        session_options.register_custom_ops_library(ort_custom_op_path)
+    sess = rt.InferenceSession(onnx_file, session_options)
+    input_feature = input.cpu().numpy()
+    grid_feature = grid.cpu().numpy()
+    onnx_output = sess.run(None, {'x': input_feature, 'y': grid_feature})
+    os.remove(onnx_file)
+    assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
+
+
 def test_nms():
     if torch.__version__ == 'parrots':
         pytest.skip('onnx is not supported in parrots directly')

From c32a838efb817d0332c5f3109b267e1c7e4bc197 Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 10:44:19 +0800
Subject: [PATCH 02/15] update code

---
 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp | 305 +++++++++++--------
 1 file changed, 177 insertions(+), 128 deletions(-)

diff --git a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
index fe30f2a49a5..10c9eebb21d 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
@@ -4,75 +4,97 @@
 
 #include <cmath>
 
-#define MIN(a,b) (((a)<(b))?(a):(b))
-#define MAX(a,b) (((a)<(b))?(b):(a))
-#define CLIP_COORDINATES(in, out, clip_limit) out=MIN((clip_limit-1), MAX(in,0))
-
-GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo* info)
-    : api_(api), ort_(api_), info_(info) {
-  align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
-  interpolation_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
-  padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
-
-  allocator_ = Ort::AllocatorWithDefaultOptions();
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MAX(a, b) (((a) < (b)) ? (b) : (a))
+#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit - 1), MAX(in, 0))
+
+GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo *info)
+    : api_(api), ort_(api_), info_(info)
+{
+    align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
+    interpolation_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
+    padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
+
+    allocator_ = Ort::AllocatorWithDefaultOptions();
 }
 
-enum GridSamplerInterpolation {
+enum GridSamplerInterpolation
+{
     Bilinear = 0,
     Nearest = 1,
-    Bicubic = 2};
-enum GridSamplerPadding {
+    Bicubic = 2
+};
+enum GridSamplerPadding
+{
     Zeros = 0,
     Border = 1,
-    Reflection = 2};
+    Reflection = 2
+};
 
-template<typename scalar_t>
-static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, bool align_corners) {
-    if (align_corners) {
+template <typename scalar_t>
+static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, bool align_corners)
+{
+    if (align_corners)
+    {
         return ((coord + 1) / 2) * (size - 1);
-    } else {
+    }
+    else
+    {
         return ((coord + 1) * size - 1) / 2;
     }
 }
 
 // Clips coordinates to between 0 and clip_limit - 1
-template<typename scalar_t>
-static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
-  return std::min(static_cast<scalar_t>(clip_limit - 1), std::max(in, static_cast<scalar_t>(0)));
+template <typename scalar_t>
+static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit)
+{
+    return std::min(static_cast<scalar_t>(clip_limit - 1), std::max(in, static_cast<scalar_t>(0)));
 }
 
 // Reflects coordinates until they fall between low and high (inclusive).
 // The bounds are passed as twice their value so that half-integer values
 // can be represented as ints.
-template<typename scalar_t>
+template <typename scalar_t>
 static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
-                                           int64_t twice_high) {
-  if (twice_low == twice_high) {
-    return static_cast<scalar_t>(0);
-  }
-  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
-  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
-  in = std::fabs(in - min);
-  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
-  scalar_t extra = std::fmod(in, span);
-  int flips = static_cast<int>(std::floor(in / span));
-  if (flips % 2 == 0) {
-    return extra + min;
-  } else {
-    return span - extra + min;
-  }
+                                           int64_t twice_high)
+{
+    if (twice_low == twice_high)
+    {
+        return static_cast<scalar_t>(0);
+    }
+    scalar_t min = static_cast<scalar_t>(twice_low) / 2;
+    scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
+    in = std::fabs(in - min);
+    // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
+    scalar_t extra = std::fmod(in, span);
+    int flips = static_cast<int>(std::floor(in / span));
+    if (flips % 2 == 0)
+    {
+        return extra + min;
+    }
+    else
+    {
+        return span - extra + min;
+    }
 }
 
-template<typename scalar_t>
+template <typename scalar_t>
 static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
                                            int64_t padding_mode,
-                                           bool align_corners) {
-    if (padding_mode == GridSamplerPadding::Border) {
+                                           bool align_corners)
+{
+    if (padding_mode == GridSamplerPadding::Border)
+    {
         coord = clip_coordinates(coord, size);
-    } else if (padding_mode == GridSamplerPadding::Reflection) {
-        if (align_corners) {
-            coord = reflect_coordinates(coord, 0, 2*(size - 1));
-        } else {
+    }
+    else if (padding_mode == GridSamplerPadding::Reflection)
+    {
+        if (align_corners)
+        {
+            coord = reflect_coordinates(coord, 0, 2 * (size - 1));
+        }
+        else
+        {
             coord = reflect_coordinates(coord, -1, 2 * size - 1);
         }
         coord = clip_coordinates(coord, size);
@@ -86,19 +108,21 @@ static inline scalar_t grid_sampler_compute_source_index(
     scalar_t coord,
     int64_t size,
     int64_t padding_mode,
-    bool align_corners) {
-  coord = grid_sampler_unnormalize(coord, size, align_corners);
-  coord = compute_coordinates(coord, size, padding_mode, align_corners);
-  return coord;
+    bool align_corners)
+{
+    coord = grid_sampler_unnormalize(coord, size, align_corners);
+    coord = compute_coordinates(coord, size, padding_mode, align_corners);
+    return coord;
 }
 
-static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) {
-  return h >= 0 && h < H && w >= 0 && w < W;
+static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W)
+{
+    return h >= 0 && h < H && w >= 0 && w < W;
 }
 
-template<typename scalar_t>
+template <typename scalar_t>
 static inline scalar_t get_value_bounded(
-    const scalar_t* data,
+    const scalar_t *data,
     scalar_t x,
     scalar_t y,
     int64_t W,
@@ -106,44 +130,49 @@ static inline scalar_t get_value_bounded(
     int64_t sW,
     int64_t sH,
     int64_t padding_mode,
-    bool align_corners) {
+    bool align_corners)
+{
 
-  x = compute_coordinates(x, W, padding_mode, align_corners);
-  y = compute_coordinates(y, H, padding_mode, align_corners);
+    x = compute_coordinates(x, W, padding_mode, align_corners);
+    y = compute_coordinates(y, H, padding_mode, align_corners);
 
-  int64_t ix = static_cast<int64_t>(x);
-  int64_t iy = static_cast<int64_t>(y);
+    int64_t ix = static_cast<int64_t>(x);
+    int64_t iy = static_cast<int64_t>(y);
 
-  if (within_bounds_2d(iy, ix, H, W)) {
-    return data[iy * sH + ix * sW];
-  }
-  return static_cast<scalar_t>(0);
+    if (within_bounds_2d(iy, ix, H, W))
+    {
+        return data[iy * sH + ix * sW];
+    }
+    return static_cast<scalar_t>(0);
 }
 
 template <typename scalar_t>
-static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
-  return ((A + 2) * x - (A + 3)) * x * x + 1;
+static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A)
+{
+    return ((A + 2) * x - (A + 3)) * x * x + 1;
 }
 
 template <typename scalar_t>
-static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
-  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A)
+{
+    return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
 }
 
 template <typename scalar_t>
 static inline void get_cubic_upsample_coefficients(
     scalar_t coeffs[4],
-    scalar_t t) {
-  scalar_t A = -0.75;
-
-  scalar_t x1 = t;
-  coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
-  coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
-
-  // opposite coefficients
-  scalar_t x2 = 1.0 - t;
-  coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
-  coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
+    scalar_t t)
+{
+    scalar_t A = -0.75;
+
+    scalar_t x1 = t;
+    coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
+    coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
+
+    // opposite coefficients
+    scalar_t x2 = 1.0 - t;
+    coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
+    coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
 }
 
 template <typename scalar_t>
@@ -152,25 +181,27 @@ static inline scalar_t cubic_interp1d(
     scalar_t x1,
     scalar_t x2,
     scalar_t x3,
-    scalar_t t) {
-  scalar_t coeffs[4];
-  get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
+    scalar_t t)
+{
+    scalar_t coeffs[4];
+    get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
 
-  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+    return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
 }
 
-void GridSampleKernel::Compute(OrtKernelContext* context) {
+void GridSampleKernel::Compute(OrtKernelContext *context)
+{
     const bool align_corners = align_corners_;
     const int64_t padding_mode = padding_mode_;
     const int64_t interpolation_mode = interpolation_mode_;
 
-    const OrtValue* input = ort_.KernelContext_GetInput(context, 0);
-    const float* input_data =
-        reinterpret_cast<const float*>(ort_.GetTensorData<float>(input));
+    const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
+    const float *input_data =
+        reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
 
-    const OrtValue* grid = ort_.KernelContext_GetInput(context, 1);
-    const float* grid_data =
-        reinterpret_cast<const float*>(ort_.GetTensorData<float>(grid));
+    const OrtValue *grid = ort_.KernelContext_GetInput(context, 1);
+    const float *grid_data =
+        reinterpret_cast<const float *>(ort_.GetTensorData<float>(grid));
 
     OrtTensorDimensions input_dims(ort_, input);
     OrtTensorDimensions grid_dims(ort_, grid);
@@ -185,7 +216,7 @@ void GridSampleKernel::Compute(OrtKernelContext* context) {
     OrtValue *output = ort_.KernelContext_GetOutput(context, 0,
                                                     output_dims.data(),
                                                     output_dims.size());
-    float* out_ptr = ort_.GetTensorMutableData<float>(output);
+    float *out_ptr = ort_.GetTensorMutableData<float>(output);
 
     int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
     int64_t inp_sC = input_dims[2] * input_dims[3];
@@ -201,19 +232,23 @@ void GridSampleKernel::Compute(OrtKernelContext* context) {
     int64_t out_sW = 1;
 
     // loop over each output pixel
-    for (int64_t n = 0; n < N; ++n) {
-        const float* grid_ptr_N = grid_data + n * grid_sN;
-        const float* inp_ptr_N = input_data + n * inp_sN;
-        for (int64_t h = 0; h < out_H; ++h) {
-            for (int64_t w = 0; w < out_W; ++w) {
-                const float* grid_ptr_NHW = grid_ptr_N + h * grid_sH  + w * grid_sW;
+    for (int64_t n = 0; n < N; ++n)
+    {
+        const float *grid_ptr_N = grid_data + n * grid_sN;
+        const float *inp_ptr_N = input_data + n * inp_sN;
+        for (int64_t h = 0; h < out_H; ++h)
+        {
+            for (int64_t w = 0; w < out_W; ++w)
+            {
+                const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
                 float x = *grid_ptr_NHW;
                 float y = grid_ptr_NHW[grid_sCoor];
 
                 float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners);
                 float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners);
 
-                if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
+                if (interpolation_mode == GridSamplerInterpolation::Bilinear)
+                {
                     // get corner pixel values from (x, y)
                     // for 4d, we use north-east-south-west
                     int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
@@ -228,47 +263,59 @@ void GridSampleKernel::Compute(OrtKernelContext* context) {
                     int64_t ix_se = ix_nw + 1;
                     int64_t iy_se = iy_nw + 1;
 
-
                     // get surfaces to each neighbor:
-                    float nw = (ix_se - ix)    * (iy_se - iy);
-                    float ne = (ix    - ix_sw) * (iy_sw - iy);
-                    float sw = (ix_ne - ix)    * (iy    - iy_ne);
-                    float se = (ix    - ix_nw) * (iy    - iy_nw);
+                    float nw = (ix_se - ix) * (iy_se - iy);
+                    float ne = (ix - ix_sw) * (iy_sw - iy);
+                    float sw = (ix_ne - ix) * (iy - iy_ne);
+                    float se = (ix - ix_nw) * (iy - iy_nw);
 
                     // calculate bilinear weighted pixel value and set output pixel
                     const float *inp_ptr_NC = inp_ptr_N;
                     float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
+                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC)
+                    {
                         auto res = static_cast<float>(0);
-                        if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
+                        if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W))
+                        {
                             res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
                         }
-                        if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
+                        if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W))
+                        {
                             res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
                         }
-                        if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
+                        if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W))
+                        {
                             res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
                         }
-                        if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
+                        if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W))
+                        {
                             res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
                         }
                         *out_ptr_NCHW = res;
                     }
-                } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-                        int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
-                        int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
-
-                        // assign nearest neighor pixel value to output pixel
-                        float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-                        const float *inp_ptr_NC = inp_ptr_N;
-                        for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
-                            if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
-                                *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
-                            } else {
-                                *out_ptr_NCHW = static_cast<float>(0);
-                            }
+                }
+                else if (interpolation_mode == GridSamplerInterpolation::Nearest)
+                {
+                    int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
+                    int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
+
+                    // assign nearest neighor pixel value to output pixel
+                    float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+                    const float *inp_ptr_NC = inp_ptr_N;
+                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC)
+                    {
+                        if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W))
+                        {
+                            *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
                         }
-                } else if (interpolation_mode == GridSamplerInterpolation::Bicubic) {
+                        else
+                        {
+                            *out_ptr_NCHW = static_cast<float>(0);
+                        }
+                    }
+                }
+                else if (interpolation_mode == GridSamplerInterpolation::Bicubic)
+                {
                     // grid_sampler_compute_source_index will "clip the value" of idx depends on the padding,
                     // which would cause calculation to be wrong,
                     // for example x = -0.1 -> ix = 0 for zero padding, but in bicubic ix = floor(x) = -1
@@ -284,17 +331,19 @@ void GridSampleKernel::Compute(OrtKernelContext* context) {
 
                     const float *inp_ptr_NC = inp_ptr_N;
                     float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
+                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC)
+                    {
                         float coefficients[4];
 
                         // Interpolate 4 values in the x directon
-                        for (int64_t i = 0; i < 4; ++i) {
-                        coefficients[i] = cubic_interp1d<float>(
-                            get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
-                            get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
-                            get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
-                            get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
-                            tx);
+                        for (int64_t i = 0; i < 4; ++i)
+                        {
+                            coefficients[i] = cubic_interp1d<float>(
+                                get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                                get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                                get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                                get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
+                                tx);
                         }
 
                         // Interpolate in the y direction

From 2be9478a371ef3ffbf4c5bef78f6816973900061 Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 11:05:53 +0800
Subject: [PATCH 03/15] reformat gridSample.cpp (whitespace and wrapping only)

---
 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp | 548 +++++++++----------
 1 file changed, 251 insertions(+), 297 deletions(-)

diff --git a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
index 10c9eebb21d..458ec16e3e6 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
@@ -6,49 +6,37 @@
 
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
 #define MAX(a, b) (((a) < (b)) ? (b) : (a))
-#define CLIP_COORDINATES(in, out, clip_limit) out = MIN((clip_limit - 1), MAX(in, 0))
+#define CLIP_COORDINATES(in, out, clip_limit)                                  \
+  out = MIN((clip_limit - 1), MAX(in, 0))
 
 GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo *info)
-    : api_(api), ort_(api_), info_(info)
-{
-    align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
-    interpolation_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
-    padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
+    : api_(api), ort_(api_), info_(info) {
+  align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
+  interpolation_mode_ =
+      ort_.KernelInfoGetAttribute<int64_t>(info, "interpolation_mode");
+  padding_mode_ = ort_.KernelInfoGetAttribute<int64_t>(info, "padding_mode");
 
-    allocator_ = Ort::AllocatorWithDefaultOptions();
+  allocator_ = Ort::AllocatorWithDefaultOptions();
 }
 
-enum GridSamplerInterpolation
-{
-    Bilinear = 0,
-    Nearest = 1,
-    Bicubic = 2
-};
-enum GridSamplerPadding
-{
-    Zeros = 0,
-    Border = 1,
-    Reflection = 2
-};
+enum GridSamplerInterpolation { Bilinear = 0, Nearest = 1, Bicubic = 2 };
+enum GridSamplerPadding { Zeros = 0, Border = 1, Reflection = 2 };
 
 template <typename scalar_t>
-static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, bool align_corners)
-{
-    if (align_corners)
-    {
-        return ((coord + 1) / 2) * (size - 1);
-    }
-    else
-    {
-        return ((coord + 1) * size - 1) / 2;
-    }
+static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size,
+                                                bool align_corners) {
+  if (align_corners) {
+    return ((coord + 1) / 2) * (size - 1);
+  } else {
+    return ((coord + 1) * size - 1) / 2;
+  }
 }
 
 // Clips coordinates to between 0 and clip_limit - 1
 template <typename scalar_t>
-static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit)
-{
-    return std::min(static_cast<scalar_t>(clip_limit - 1), std::max(in, static_cast<scalar_t>(0)));
+static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
+  return std::min(static_cast<scalar_t>(clip_limit - 1),
+                  std::max(in, static_cast<scalar_t>(0)));
 }
 
 // Reflects coordinates until they fall between low and high (inclusive).
@@ -56,301 +44,267 @@ static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit)
 // can be represented as ints.
 template <typename scalar_t>
 static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
-                                           int64_t twice_high)
-{
-    if (twice_low == twice_high)
-    {
-        return static_cast<scalar_t>(0);
-    }
-    scalar_t min = static_cast<scalar_t>(twice_low) / 2;
-    scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
-    in = std::fabs(in - min);
-    // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
-    scalar_t extra = std::fmod(in, span);
-    int flips = static_cast<int>(std::floor(in / span));
-    if (flips % 2 == 0)
-    {
-        return extra + min;
-    }
-    else
-    {
-        return span - extra + min;
-    }
+                                           int64_t twice_high) {
+  if (twice_low == twice_high) {
+    return static_cast<scalar_t>(0);
+  }
+  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
+  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
+  in = std::fabs(in - min);
+  // `fmod` returns same sign as `in`, which is positive after the `fabs` above.
+  scalar_t extra = std::fmod(in, span);
+  int flips = static_cast<int>(std::floor(in / span));
+  if (flips % 2 == 0) {
+    return extra + min;
+  } else {
+    return span - extra + min;
+  }
 }
 
 template <typename scalar_t>
 static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
                                            int64_t padding_mode,
-                                           bool align_corners)
-{
-    if (padding_mode == GridSamplerPadding::Border)
-    {
-        coord = clip_coordinates(coord, size);
+                                           bool align_corners) {
+  if (padding_mode == GridSamplerPadding::Border) {
+    coord = clip_coordinates(coord, size);
+  } else if (padding_mode == GridSamplerPadding::Reflection) {
+    if (align_corners) {
+      coord = reflect_coordinates(coord, 0, 2 * (size - 1));
+    } else {
+      coord = reflect_coordinates(coord, -1, 2 * size - 1);
     }
-    else if (padding_mode == GridSamplerPadding::Reflection)
-    {
-        if (align_corners)
-        {
-            coord = reflect_coordinates(coord, 0, 2 * (size - 1));
-        }
-        else
-        {
-            coord = reflect_coordinates(coord, -1, 2 * size - 1);
-        }
-        coord = clip_coordinates(coord, size);
-    }
-    return coord;
+    coord = clip_coordinates(coord, size);
+  }
+  return coord;
 }
 
 // Computes the pixel source index value for a grid coordinate
 template <typename scalar_t>
-static inline scalar_t grid_sampler_compute_source_index(
-    scalar_t coord,
-    int64_t size,
-    int64_t padding_mode,
-    bool align_corners)
-{
-    coord = grid_sampler_unnormalize(coord, size, align_corners);
-    coord = compute_coordinates(coord, size, padding_mode, align_corners);
-    return coord;
+static inline scalar_t
+grid_sampler_compute_source_index(scalar_t coord, int64_t size,
+                                  int64_t padding_mode, bool align_corners) {
+  coord = grid_sampler_unnormalize(coord, size, align_corners);
+  coord = compute_coordinates(coord, size, padding_mode, align_corners);
+  return coord;
 }
 
-static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W)
-{
-    return h >= 0 && h < H && w >= 0 && w < W;
+static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H,
+                                    int64_t W) {
+  return h >= 0 && h < H && w >= 0 && w < W;
 }
 
 template <typename scalar_t>
-static inline scalar_t get_value_bounded(
-    const scalar_t *data,
-    scalar_t x,
-    scalar_t y,
-    int64_t W,
-    int64_t H,
-    int64_t sW,
-    int64_t sH,
-    int64_t padding_mode,
-    bool align_corners)
-{
-
-    x = compute_coordinates(x, W, padding_mode, align_corners);
-    y = compute_coordinates(y, H, padding_mode, align_corners);
-
-    int64_t ix = static_cast<int64_t>(x);
-    int64_t iy = static_cast<int64_t>(y);
-
-    if (within_bounds_2d(iy, ix, H, W))
-    {
-        return data[iy * sH + ix * sW];
-    }
-    return static_cast<scalar_t>(0);
+static inline scalar_t
+get_value_bounded(const scalar_t *data, scalar_t x, scalar_t y, int64_t W,
+                  int64_t H, int64_t sW, int64_t sH, int64_t padding_mode,
+                  bool align_corners) {
+
+  x = compute_coordinates(x, W, padding_mode, align_corners);
+  y = compute_coordinates(y, H, padding_mode, align_corners);
+
+  int64_t ix = static_cast<int64_t>(x);
+  int64_t iy = static_cast<int64_t>(y);
+
+  if (within_bounds_2d(iy, ix, H, W)) {
+    return data[iy * sH + ix * sW];
+  }
+  return static_cast<scalar_t>(0);
 }
 
 template <typename scalar_t>
-static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A)
-{
-    return ((A + 2) * x - (A + 3)) * x * x + 1;
+static inline scalar_t cubic_convolution1(scalar_t x, scalar_t A) {
+  return ((A + 2) * x - (A + 3)) * x * x + 1;
 }
 
 template <typename scalar_t>
-static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A)
-{
-    return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
+static inline scalar_t cubic_convolution2(scalar_t x, scalar_t A) {
+  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
 }
 
 template <typename scalar_t>
-static inline void get_cubic_upsample_coefficients(
-    scalar_t coeffs[4],
-    scalar_t t)
-{
-    scalar_t A = -0.75;
-
-    scalar_t x1 = t;
-    coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
-    coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
-
-    // opposite coefficients
-    scalar_t x2 = 1.0 - t;
-    coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
-    coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
+static inline void get_cubic_upsample_coefficients(scalar_t coeffs[4],
+                                                   scalar_t t) {
+  scalar_t A = -0.75;
+
+  scalar_t x1 = t;
+  coeffs[0] = cubic_convolution2<scalar_t>(x1 + 1.0, A);
+  coeffs[1] = cubic_convolution1<scalar_t>(x1, A);
+
+  // opposite coefficients
+  scalar_t x2 = 1.0 - t;
+  coeffs[2] = cubic_convolution1<scalar_t>(x2, A);
+  coeffs[3] = cubic_convolution2<scalar_t>(x2 + 1.0, A);
 }
 
 template <typename scalar_t>
-static inline scalar_t cubic_interp1d(
-    scalar_t x0,
-    scalar_t x1,
-    scalar_t x2,
-    scalar_t x3,
-    scalar_t t)
-{
-    scalar_t coeffs[4];
-    get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
-
-    return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
+                                      scalar_t x3, scalar_t t) {
+  scalar_t coeffs[4];
+  get_cubic_upsample_coefficients<scalar_t>(coeffs, t);
+
+  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
 }
 
-void GridSampleKernel::Compute(OrtKernelContext *context)
-{
-    const bool align_corners = align_corners_;
-    const int64_t padding_mode = padding_mode_;
-    const int64_t interpolation_mode = interpolation_mode_;
-
-    const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
-    const float *input_data =
-        reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
-
-    const OrtValue *grid = ort_.KernelContext_GetInput(context, 1);
-    const float *grid_data =
-        reinterpret_cast<const float *>(ort_.GetTensorData<float>(grid));
-
-    OrtTensorDimensions input_dims(ort_, input);
-    OrtTensorDimensions grid_dims(ort_, grid);
-    int64_t N = input_dims[0];
-    int64_t C = input_dims[1];
-    int64_t inp_H = input_dims[2];
-    int64_t inp_W = input_dims[3];
-    int64_t out_H = grid_dims[1];
-    int64_t out_W = grid_dims[2];
-
-    std::vector<int64_t> output_dims = {N, C, out_H, out_W};
-    OrtValue *output = ort_.KernelContext_GetOutput(context, 0,
-                                                    output_dims.data(),
-                                                    output_dims.size());
-    float *out_ptr = ort_.GetTensorMutableData<float>(output);
-
-    int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
-    int64_t inp_sC = input_dims[2] * input_dims[3];
-    int64_t inp_sH = input_dims[3];
-    int64_t inp_sW = 1;
-    int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
-    int64_t grid_sH = grid_dims[2] * grid_dims[3];
-    int64_t grid_sW = grid_dims[3];
-    int64_t grid_sCoor = 1;
-    int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
-    int64_t out_sC = output_dims[2] * output_dims[3];
-    int64_t out_sH = output_dims[3];
-    int64_t out_sW = 1;
-
-    // loop over each output pixel
-    for (int64_t n = 0; n < N; ++n)
-    {
-        const float *grid_ptr_N = grid_data + n * grid_sN;
-        const float *inp_ptr_N = input_data + n * inp_sN;
-        for (int64_t h = 0; h < out_H; ++h)
-        {
-            for (int64_t w = 0; w < out_W; ++w)
-            {
-                const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
-                float x = *grid_ptr_NHW;
-                float y = grid_ptr_NHW[grid_sCoor];
-
-                float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode, align_corners);
-                float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode, align_corners);
-
-                if (interpolation_mode == GridSamplerInterpolation::Bilinear)
-                {
-                    // get corner pixel values from (x, y)
-                    // for 4d, we use north-east-south-west
-                    int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
-                    int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
-
-                    int64_t ix_ne = ix_nw + 1;
-                    int64_t iy_ne = iy_nw;
-
-                    int64_t ix_sw = ix_nw;
-                    int64_t iy_sw = iy_nw + 1;
-
-                    int64_t ix_se = ix_nw + 1;
-                    int64_t iy_se = iy_nw + 1;
-
-                    // get surfaces to each neighbor:
-                    float nw = (ix_se - ix) * (iy_se - iy);
-                    float ne = (ix - ix_sw) * (iy_sw - iy);
-                    float sw = (ix_ne - ix) * (iy - iy_ne);
-                    float se = (ix - ix_nw) * (iy - iy_nw);
-
-                    // calculate bilinear weighted pixel value and set output pixel
-                    const float *inp_ptr_NC = inp_ptr_N;
-                    float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC)
-                    {
-                        auto res = static_cast<float>(0);
-                        if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W))
-                        {
-                            res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
-                        }
-                        if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W))
-                        {
-                            res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
-                        }
-                        if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W))
-                        {
-                            res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
-                        }
-                        if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W))
-                        {
-                            res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
-                        }
-                        *out_ptr_NCHW = res;
-                    }
-                }
-                else if (interpolation_mode == GridSamplerInterpolation::Nearest)
-                {
-                    int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
-                    int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
-
-                    // assign nearest neighor pixel value to output pixel
-                    float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-                    const float *inp_ptr_NC = inp_ptr_N;
-                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC)
-                    {
-                        if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W))
-                        {
-                            *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
-                        }
-                        else
-                        {
-                            *out_ptr_NCHW = static_cast<float>(0);
-                        }
-                    }
-                }
-                else if (interpolation_mode == GridSamplerInterpolation::Bicubic)
-                {
-                    // grid_sampler_compute_source_index will "clip the value" of idx depends on the padding,
-                    // which would cause calculation to be wrong,
-                    // for example x = -0.1 -> ix = 0 for zero padding, but in bicubic ix = floor(x) = -1
-                    // There would be more problem in reflection padding, since the -1 and +1 direction is not fixed in boundary condition
-                    ix = grid_sampler_unnormalize(x, inp_W, align_corners);
-                    iy = grid_sampler_unnormalize(y, inp_H, align_corners);
-
-                    float ix_nw = std::floor(ix);
-                    float iy_nw = std::floor(iy);
-
-                    const float tx = ix - ix_nw;
-                    const float ty = iy - iy_nw;
-
-                    const float *inp_ptr_NC = inp_ptr_N;
-                    float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-                    for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC)
-                    {
-                        float coefficients[4];
-
-                        // Interpolate 4 values in the x directon
-                        for (int64_t i = 0; i < 4; ++i)
-                        {
-                            coefficients[i] = cubic_interp1d<float>(
-                                get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
-                                get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
-                                get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
-                                get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners),
-                                tx);
-                        }
-
-                        // Interpolate in the y direction
-                        *out_ptr_NCHW = cubic_interp1d<float>(coefficients[0], coefficients[1], coefficients[2], coefficients[3], ty);
-                    }
-                }
+void GridSampleKernel::Compute(OrtKernelContext *context) {
+  const bool align_corners = align_corners_;
+  const int64_t padding_mode = padding_mode_;
+  const int64_t interpolation_mode = interpolation_mode_;
+
+  const OrtValue *input = ort_.KernelContext_GetInput(context, 0);
+  const float *input_data =
+      reinterpret_cast<const float *>(ort_.GetTensorData<float>(input));
+
+  const OrtValue *grid = ort_.KernelContext_GetInput(context, 1);
+  const float *grid_data =
+      reinterpret_cast<const float *>(ort_.GetTensorData<float>(grid));
+
+  OrtTensorDimensions input_dims(ort_, input);
+  OrtTensorDimensions grid_dims(ort_, grid);
+  int64_t N = input_dims[0];
+  int64_t C = input_dims[1];
+  int64_t inp_H = input_dims[2];
+  int64_t inp_W = input_dims[3];
+  int64_t out_H = grid_dims[1];
+  int64_t out_W = grid_dims[2];
+
+  std::vector<int64_t> output_dims = {N, C, out_H, out_W};
+  OrtValue *output = ort_.KernelContext_GetOutput(
+      context, 0, output_dims.data(), output_dims.size());
+  float *out_ptr = ort_.GetTensorMutableData<float>(output);
+
+  int64_t inp_sN = input_dims[1] * input_dims[2] * input_dims[3];
+  int64_t inp_sC = input_dims[2] * input_dims[3];
+  int64_t inp_sH = input_dims[3];
+  int64_t inp_sW = 1;
+  int64_t grid_sN = grid_dims[1] * grid_dims[2] * grid_dims[3];
+  int64_t grid_sH = grid_dims[2] * grid_dims[3];
+  int64_t grid_sW = grid_dims[3];
+  int64_t grid_sCoor = 1;
+  int64_t out_sN = output_dims[1] * output_dims[2] * output_dims[3];
+  int64_t out_sC = output_dims[2] * output_dims[3];
+  int64_t out_sH = output_dims[3];
+  int64_t out_sW = 1;
+
+  // loop over each output pixel
+  for (int64_t n = 0; n < N; ++n) {
+    const float *grid_ptr_N = grid_data + n * grid_sN;
+    const float *inp_ptr_N = input_data + n * inp_sN;
+    for (int64_t h = 0; h < out_H; ++h) {
+      for (int64_t w = 0; w < out_W; ++w) {
+        const float *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
+        float x = *grid_ptr_NHW;
+        float y = grid_ptr_NHW[grid_sCoor];
+
+        float ix = grid_sampler_compute_source_index(x, inp_W, padding_mode,
+                                                     align_corners);
+        float iy = grid_sampler_compute_source_index(y, inp_H, padding_mode,
+                                                     align_corners);
+
+        if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
+          // get corner pixel values from (x, y)
+          // for 4d, we use north-east-south-west
+          int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
+          int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
+
+          int64_t ix_ne = ix_nw + 1;
+          int64_t iy_ne = iy_nw;
+
+          int64_t ix_sw = ix_nw;
+          int64_t iy_sw = iy_nw + 1;
+
+          int64_t ix_se = ix_nw + 1;
+          int64_t iy_se = iy_nw + 1;
+
+          // get surfaces to each neighbor:
+          float nw = (ix_se - ix) * (iy_se - iy);
+          float ne = (ix - ix_sw) * (iy_sw - iy);
+          float sw = (ix_ne - ix) * (iy - iy_ne);
+          float se = (ix - ix_nw) * (iy - iy_nw);
+
+          // calculate bilinear weighted pixel value and set output pixel
+          const float *inp_ptr_NC = inp_ptr_N;
+          float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+          for (int64_t c = 0; c < C;
+               ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
+            auto res = static_cast<float>(0);
+            if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
+              res += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
+            }
+            if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
+              res += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
             }
+            if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
+              res += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
+            }
+            if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
+              res += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
+            }
+            *out_ptr_NCHW = res;
+          }
+        } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
+          int64_t ix_nearest = static_cast<int64_t>(std::nearbyint(ix));
+          int64_t iy_nearest = static_cast<int64_t>(std::nearbyint(iy));
+
+          // assign nearest neighor pixel value to output pixel
+          float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+          const float *inp_ptr_NC = inp_ptr_N;
+          for (int64_t c = 0; c < C;
+               ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
+            if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
+              *out_ptr_NCHW =
+                  inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
+            } else {
+              *out_ptr_NCHW = static_cast<float>(0);
+            }
+          }
+        } else if (interpolation_mode == GridSamplerInterpolation::Bicubic) {
+          // grid_sampler_compute_source_index will "clip the value" of idx
+          // depends on the padding,
+          // which would cause calculation to be wrong,
+          // for example x = -0.1 -> ix = 0 for zero padding, but in bicubic ix
+          // = floor(x) = -1
+          // There would be more problem in reflection padding, since the -1 and
+          // +1 direction is not fixed in boundary condition
+          ix = grid_sampler_unnormalize(x, inp_W, align_corners);
+          iy = grid_sampler_unnormalize(y, inp_H, align_corners);
+
+          float ix_nw = std::floor(ix);
+          float iy_nw = std::floor(iy);
+
+          const float tx = ix - ix_nw;
+          const float ty = iy - iy_nw;
+
+          const float *inp_ptr_NC = inp_ptr_N;
+          float *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
+          for (int64_t c = 0; c < C;
+               ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
+            float coefficients[4];
+
+            // Interpolate 4 values in the x directon
+            for (int64_t i = 0; i < 4; ++i) {
+              coefficients[i] = cubic_interp1d<float>(
+                  get_value_bounded<float>(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i,
+                                           inp_W, inp_H, inp_sW, inp_sH,
+                                           padding_mode, align_corners),
+                  get_value_bounded<float>(inp_ptr_NC, ix_nw + 0, iy_nw - 1 + i,
+                                           inp_W, inp_H, inp_sW, inp_sH,
+                                           padding_mode, align_corners),
+                  get_value_bounded<float>(inp_ptr_NC, ix_nw + 1, iy_nw - 1 + i,
+                                           inp_W, inp_H, inp_sW, inp_sH,
+                                           padding_mode, align_corners),
+                  get_value_bounded<float>(inp_ptr_NC, ix_nw + 2, iy_nw - 1 + i,
+                                           inp_W, inp_H, inp_sW, inp_sH,
+                                           padding_mode, align_corners),
+                  tx);
+            }
+
+            // Interpolate in the y direction
+            *out_ptr_NCHW =
+                cubic_interp1d<float>(coefficients[0], coefficients[1],
+                                      coefficients[2], coefficients[3], ty);
+          }
         }
+      }
     }
+  }
 }

From d0f92d6d64733f82e0866d2100c614d92edabe7f Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 12:11:41 +0800
Subject: [PATCH 04/15] test_onnx: register grid_sampler symbolic in the test

---
 tests/test_ops/test_onnx.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index 4faf1f95a2d..eb5cd320648 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -9,6 +9,8 @@
 import torch
 import torch.nn as nn
 from packaging import version
+from torch.onnx.symbolic_helper import parse_args
+from torch.onnx.symbolic_registry import register_op
 
 onnx_file = 'tmp.onnx'
 
@@ -30,9 +32,24 @@ def forward(self, x, y):
         return res
 
 
+@parse_args('v', 'v', 'i', 'i', 'i')
+def grid_sampler(g,
+                 input,
+                 grid,
+                 interpolation_mode,
+                 padding_mode,
+                 align_corners=False):
+    return g.op(
+        'mmcv::grid_sampler',
+        input,
+        grid,
+        interpolation_mode_i=interpolation_mode,
+        padding_mode_i=padding_mode,
+        align_corners_i=align_corners)
+
+
 def test_grid_sampler():
-    from mmcv.onnx.symbolic import register_extra_symbolics
-    register_extra_symbolics()
+    register_op('grid_sampler', grid_sampler, '', 11)
     input = torch.ones(1, 1, 2, 2)
     out_h = 4
     out_w = 4

From 285f0487efe0e2c1de9a2399e7cabbec0826b17d Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 16:35:53 +0800
Subject: [PATCH 05/15] update code

---
 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp  | 15 +++++----
 .../onnxruntime/cpu/onnxruntime_register.cpp  |  8 ++---
 mmcv/ops/csrc/onnxruntime/grid_sample.h       | 32 +++++++++----------
 3 files changed, 28 insertions(+), 27 deletions(-)

diff --git a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
index 458ec16e3e6..2b4582facbc 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
@@ -6,7 +6,7 @@
 
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
 #define MAX(a, b) (((a) < (b)) ? (b) : (a))
-#define CLIP_COORDINATES(in, out, clip_limit)                                  \
+#define CLIP_COORDINATES(in, out, clip_limit) \
   out = MIN((clip_limit - 1), MAX(in, 0))
 
 GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo *info)
@@ -81,8 +81,10 @@ static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
 // Computes the pixel source index value for a grid coordinate
 template <typename scalar_t>
 static inline scalar_t
-grid_sampler_compute_source_index(scalar_t coord, int64_t size,
-                                  int64_t padding_mode, bool align_corners) {
+grid_sampler_compute_source_index(scalar_t coord,
+                                  int64_t size,
+                                  int64_t padding_mode,
+                                  bool align_corners) {
   coord = grid_sampler_unnormalize(coord, size, align_corners);
   coord = compute_coordinates(coord, size, padding_mode, align_corners);
   return coord;
@@ -95,10 +97,11 @@ static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H,
 
 template <typename scalar_t>
 static inline scalar_t
-get_value_bounded(const scalar_t *data, scalar_t x, scalar_t y, int64_t W,
-                  int64_t H, int64_t sW, int64_t sH, int64_t padding_mode,
+get_value_bounded(const scalar_t *data, scalar_t x,
+                  scalar_t y, int64_t W, int64_t H,
+                  int64_t sW, int64_t sH,
+                  int64_t padding_mode,
                   bool align_corners) {
-
   x = compute_coordinates(x, W, padding_mode, align_corners);
   y = compute_coordinates(y, H, padding_mode, align_corners);
 
diff --git a/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp b/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp
index 06196b8110f..257dda443d8 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp
@@ -1,10 +1,10 @@
 #include "onnxruntime_register.h"
 
+#include "grid_sample.h"
 #include "nms.h"
 #include "ort_mmcv_utils.h"
 #include "roi_align.h"
 #include "soft_nms.h"
-#include "grid_sample.h"
 
 const char *c_MMCVOpDomain = "mmcv";
 SoftNmsOp c_SoftNmsOp;
@@ -29,13 +29,11 @@ OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
     return status;
   }
 
-  if (auto status =
-          ortApi->CustomOpDomain_Add(domain, &c_MMCVRoiAlignCustomOp)) {
+  if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVRoiAlignCustomOp)) {
     return status;
   }
 
-  if (auto status =
-          ortApi->CustomOpDomain_Add(domain, &c_GridSampleOp)) {
+  if (auto status = ortApi->CustomOpDomain_Add(domain, &c_GridSampleOp)) {
     return status;
   }
 
diff --git a/mmcv/ops/csrc/onnxruntime/grid_sample.h b/mmcv/ops/csrc/onnxruntime/grid_sample.h
index b10555cf13d..09cf0ad8edb 100644
--- a/mmcv/ops/csrc/onnxruntime/grid_sample.h
+++ b/mmcv/ops/csrc/onnxruntime/grid_sample.h
@@ -8,7 +8,7 @@ struct GridSampleKernel {
 
   void Compute(OrtKernelContext *context);
 
- protected:
+protected:
   OrtApi api_;
   Ort::CustomOpApi ort_;
   const OrtKernelInfo *info_;
@@ -20,24 +20,24 @@ struct GridSampleKernel {
 };
 
 struct GridSampleOp : Ort::CustomOpBase<GridSampleOp, GridSampleKernel> {
-    void* CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
-        return new GridSampleKernel(api, info);
-    };
+  void *CreateKernel(OrtApi api, const OrtKernelInfo *info) const {
+    return new GridSampleKernel(api, info);
+  };
 
-    const char* GetName() const { return "grid_sampler"; };
+  const char *GetName() const { return "grid_sampler"; };
 
-    size_t GetInputTypeCount() const { return 2; };
-    ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
-        return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-    };
+  size_t GetInputTypeCount() const { return 2; };
+  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  };
 
-    size_t GetOutputTypeCount() const { return 1; };
-    ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
-        return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
-    };
+  size_t GetOutputTypeCount() const { return 1; };
+  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
+    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
+  };
 
-    const char* GetExecutionProviderType() const {
-        return "CPUExecutionProvider";
-    };
+  const char *GetExecutionProviderType() const {
+    return "CPUExecutionProvider";
+  };
 };
 #endif

From 7874b74ac6d73750be7633266f00245488ca3e56 Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 16:57:39 +0800
Subject: [PATCH 06/15] update code

---
 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp | 24 +++++++++-----------
 mmcv/ops/csrc/onnxruntime/grid_sample.h      |  2 +-
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
index 2b4582facbc..cdefdcc4b3a 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
@@ -1,8 +1,8 @@
-#include "grid_sample.h"
+#include <cmath>
 
 #include "../ort_mmcv_utils.h"
+#include "grid_sample.h"
 
-#include <cmath>
 
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
 #define MAX(a, b) (((a) < (b)) ? (b) : (a))
@@ -80,11 +80,10 @@ static inline scalar_t compute_coordinates(scalar_t coord, int64_t size,
 
 // Computes the pixel source index value for a grid coordinate
 template <typename scalar_t>
-static inline scalar_t
-grid_sampler_compute_source_index(scalar_t coord,
-                                  int64_t size,
-                                  int64_t padding_mode,
-                                  bool align_corners) {
+static inline scalar_t grid_sampler_compute_source_index(scalar_t coord,
+                                                         int64_t size,
+                                                         int64_t padding_mode,
+                                                         bool align_corners) {
   coord = grid_sampler_unnormalize(coord, size, align_corners);
   coord = compute_coordinates(coord, size, padding_mode, align_corners);
   return coord;
@@ -96,12 +95,11 @@ static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H,
 }
 
 template <typename scalar_t>
-static inline scalar_t
-get_value_bounded(const scalar_t *data, scalar_t x,
-                  scalar_t y, int64_t W, int64_t H,
-                  int64_t sW, int64_t sH,
-                  int64_t padding_mode,
-                  bool align_corners) {
+static inline scalar_t get_value_bounded(const scalar_t *data, scalar_t x,
+                                         scalar_t y, int64_t W, int64_t H,
+                                         int64_t sW, int64_t sH,
+                                         int64_t padding_mode,
+                                         bool align_corners) {
   x = compute_coordinates(x, W, padding_mode, align_corners);
   y = compute_coordinates(y, H, padding_mode, align_corners);
 
diff --git a/mmcv/ops/csrc/onnxruntime/grid_sample.h b/mmcv/ops/csrc/onnxruntime/grid_sample.h
index 09cf0ad8edb..923cf7e03ce 100644
--- a/mmcv/ops/csrc/onnxruntime/grid_sample.h
+++ b/mmcv/ops/csrc/onnxruntime/grid_sample.h
@@ -8,7 +8,7 @@ struct GridSampleKernel {
 
   void Compute(OrtKernelContext *context);
 
-protected:
+ protected:
   OrtApi api_;
   Ort::CustomOpApi ort_;
   const OrtKernelInfo *info_;

From 7bf5aa39c45b770e5f5a68ac64e01ee6392d0e2f Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 17:26:11 +0800
Subject: [PATCH 07/15] update code

---
 tests/test_ops/test_onnx.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index eb5cd320648..5c352474b98 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -67,9 +67,11 @@ def test_grid_sampler():
 
     from mmcv.ops import get_onnxruntime_op_path
     ort_custom_op_path = get_onnxruntime_op_path()
+    if not os.path.exists(ort_custom_op_path):
+        pytest.skip('nms for onnxruntime is not compiled.')
+
     session_options = rt.SessionOptions()
-    if os.path.exists(ort_custom_op_path):
-        session_options.register_custom_ops_library(ort_custom_op_path)
+    session_options.register_custom_ops_library(ort_custom_op_path)
     sess = rt.InferenceSession(onnx_file, session_options)
     input_feature = input.cpu().numpy()
     grid_feature = grid.cpu().numpy()

From 80c18ea47d652ff7a2b934e4102bc33b6d92341e Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 19:16:30 +0800
Subject: [PATCH 08/15] update code

---
 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp | 1 -
 tests/test_ops/test_onnx.py                  | 8 +++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
index cdefdcc4b3a..1a5b6715061 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
@@ -3,7 +3,6 @@
 #include "../ort_mmcv_utils.h"
 #include "grid_sample.h"
 
-
 #define MIN(a, b) (((a) < (b)) ? (a) : (b))
 #define MAX(a, b) (((a) < (b)) ? (b) : (a))
 #define CLIP_COORDINATES(in, out, clip_limit) \
diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index 5c352474b98..45c678d8d77 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -61,7 +61,13 @@ def test_grid_sampler():
     grid = grid.unsqueeze(0).repeat(1, 1, 1, 1)
 
     model = GridSample()
-    torch.onnx.export(model, (input, grid), onnx_file, opset_version=11)
+    with torch.no_grad():
+        torch.onnx.export(
+            model, (input, grid),
+            onnx_file,
+            export_params=True,
+            keep_initializers_as_inputs=True,
+            opset_version=11)
 
     pytorch_output = model(input, grid)
 

From 121962544e609985d941c14a822d5d9b9e8c68ab Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 19:27:38 +0800
Subject: [PATCH 09/15] update code

---
 tests/test_ops/test_onnx.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index 45c678d8d77..4f46eaea659 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -60,6 +60,11 @@ def test_grid_sampler():
     grid[:, :, 1] = h.unsqueeze(0).repeat(out_w, 1).transpose(0, 1)
     grid = grid.unsqueeze(0).repeat(1, 1, 1, 1)
 
+    from mmcv.ops import get_onnxruntime_op_path
+    ort_custom_op_path = get_onnxruntime_op_path()
+    if not os.path.exists(ort_custom_op_path):
+        pytest.skip('nms for onnxruntime is not compiled.')
+
     model = GridSample()
     with torch.no_grad():
         torch.onnx.export(
@@ -71,11 +76,6 @@ def test_grid_sampler():
 
     pytorch_output = model(input, grid)
 
-    from mmcv.ops import get_onnxruntime_op_path
-    ort_custom_op_path = get_onnxruntime_op_path()
-    if not os.path.exists(ort_custom_op_path):
-        pytest.skip('nms for onnxruntime is not compiled.')
-
     session_options = rt.SessionOptions()
     session_options.register_custom_ops_library(ort_custom_op_path)
     sess = rt.InferenceSession(onnx_file, session_options)

From 36e98a1e1af9c15855fee2c96dfa8b11872333a0 Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Wed, 31 Mar 2021 19:34:20 +0800
Subject: [PATCH 10/15] update code

---
 mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp b/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp
index 257dda443d8..cd65412a52c 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp
@@ -29,7 +29,8 @@ OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
     return status;
   }
 
-  if (auto status = ortApi->CustomOpDomain_Add(domain, &c_MMCVRoiAlignCustomOp)) {
+  if (auto status =
+          ortApi->CustomOpDomain_Add(domain, &c_MMCVRoiAlignCustomOp)) {
     return status;
   }
 

From 3c9037f422757d9b0a913f3a871b80346fffd167 Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Thu, 1 Apr 2021 16:40:51 +0800
Subject: [PATCH 11/15] update code

---
 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp |  3 +
 tests/test_ops/test_onnx.py                  | 91 +++++++++-----------
 2 files changed, 46 insertions(+), 48 deletions(-)

diff --git a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
index 1a5b6715061..c60c1022211 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
@@ -145,6 +145,9 @@ static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
   return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
 }
 
+// modified from
+// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp
+
 void GridSampleKernel::Compute(OrtKernelContext *context) {
   const bool align_corners = align_corners_;
   const int64_t padding_mode = padding_mode_;
diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index 4f46eaea659..9a04f82e8c5 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -9,8 +9,6 @@
 import torch
 import torch.nn as nn
 from packaging import version
-from torch.onnx.symbolic_helper import parse_args
-from torch.onnx.symbolic_registry import register_op
 
 onnx_file = 'tmp.onnx'
 
@@ -25,31 +23,7 @@ def forward(self, *args, **kwargs):
         return self.wrapped_function(*args, **kwargs)
 
 
-class GridSample(torch.nn.Module):
-
-    def forward(self, x, y):
-        res = torch.nn.functional.grid_sample(x, grid=y, align_corners=False)
-        return res
-
-
-@parse_args('v', 'v', 'i', 'i', 'i')
-def grid_sampler(g,
-                 input,
-                 grid,
-                 interpolation_mode,
-                 padding_mode,
-                 align_corners=False):
-    return g.op(
-        'mmcv::grid_sampler',
-        input,
-        grid,
-        interpolation_mode_i=interpolation_mode,
-        padding_mode_i=padding_mode,
-        align_corners_i=align_corners)
-
-
 def test_grid_sampler():
-    register_op('grid_sampler', grid_sampler, '', 11)
     input = torch.ones(1, 1, 2, 2)
     out_h = 4
     out_w = 4
@@ -59,31 +33,52 @@ def test_grid_sampler():
     grid[:, :, 0] = w.unsqueeze(0).repeat(out_h, 1)
     grid[:, :, 1] = h.unsqueeze(0).repeat(out_w, 1).transpose(0, 1)
     grid = grid.unsqueeze(0).repeat(1, 1, 1, 1)
-
     from mmcv.ops import get_onnxruntime_op_path
     ort_custom_op_path = get_onnxruntime_op_path()
     if not os.path.exists(ort_custom_op_path):
-        pytest.skip('nms for onnxruntime is not compiled.')
-
-    model = GridSample()
-    with torch.no_grad():
-        torch.onnx.export(
-            model, (input, grid),
-            onnx_file,
-            export_params=True,
-            keep_initializers_as_inputs=True,
-            opset_version=11)
-
-    pytorch_output = model(input, grid)
-
-    session_options = rt.SessionOptions()
-    session_options.register_custom_ops_library(ort_custom_op_path)
-    sess = rt.InferenceSession(onnx_file, session_options)
-    input_feature = input.cpu().numpy()
-    grid_feature = grid.cpu().numpy()
-    onnx_output = sess.run(None, {'x': input_feature, 'y': grid_feature})
-    os.remove(onnx_file)
-    assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
+        pytest.skip('grid_sample for onnxruntime is not compiled.')
+
+    inter_modes = ['bilinear', 'nearest']
+    padding_modes = ['zeros', 'border', 'reflection']
+    corners = [False, True]
+    for inter_mode in inter_modes:
+        for padding_mode in padding_modes:
+            for align_corner in corners:
+                pytorch_output = \
+                    torch.nn.functional.grid_sample(input,
+                                                    grid,
+                                                    mode=inter_mode,
+                                                    padding_mode=padding_mode,
+                                                    align_corners=align_corner)
+                wrapped_model = WrapFunction(torch.nn.functional.grid_sample)
+                wrapped_model.cpu().eval()
+
+                from mmcv.onnx.symbolic import register_extra_symbolics
+                opset_version = 11
+                register_extra_symbolics(opset_version)
+                with torch.no_grad():
+                    torch.onnx.export(
+                        wrapped_model, (input, grid),
+                        onnx_file,
+                        export_params=True,
+                        keep_initializers_as_inputs=True,
+                        input_names=['input', 'grid'],
+                        opset_version=11)
+
+                session_options = rt.SessionOptions()
+                session_options.register_custom_ops_library(ort_custom_op_path)
+                sess = rt.InferenceSession(onnx_file, session_options)
+                input_feature = input.cpu().numpy()
+                grid_feature = grid.cpu().numpy()
+                onnx_output = sess.run(None, {
+                    'input': input_feature,
+                    'grid': grid_feature
+                })
+                os.remove(onnx_file)
+                assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
+
+
+test_grid_sampler()
 
 
 def test_nms():

From 4d8d95222221806dddc1b8b78bc714448e6e03c4 Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Thu, 1 Apr 2021 16:59:22 +0800
Subject: [PATCH 12/15] update code

---
 mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp |   6 +-
 tests/test_ops/test_onnx.py                  | 107 ++++++++++---------
 2 files changed, 57 insertions(+), 56 deletions(-)

diff --git a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
index c60c1022211..ec5ad330f9b 100644
--- a/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
+++ b/mmcv/ops/csrc/onnxruntime/cpu/gridSample.cpp
@@ -8,6 +8,9 @@
 #define CLIP_COORDINATES(in, out, clip_limit) \
   out = MIN((clip_limit - 1), MAX(in, 0))
 
+// modified from
+// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp
+
 GridSampleKernel::GridSampleKernel(OrtApi api, const OrtKernelInfo *info)
     : api_(api), ort_(api_), info_(info) {
   align_corners_ = ort_.KernelInfoGetAttribute<int64_t>(info, "align_corners");
@@ -145,9 +148,6 @@ static inline scalar_t cubic_interp1d(scalar_t x0, scalar_t x1, scalar_t x2,
   return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
 }
 
-// modified from
-// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp
-
 void GridSampleKernel::Compute(OrtKernelContext *context) {
   const bool align_corners = align_corners_;
   const int64_t padding_mode = padding_mode_;
diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index 9a04f82e8c5..6a8de8a5213 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -23,62 +23,63 @@ def forward(self, *args, **kwargs):
         return self.wrapped_function(*args, **kwargs)
 
 
-def test_grid_sampler():
-    input = torch.ones(1, 1, 2, 2)
-    out_h = 4
-    out_w = 4
-    h = torch.linspace(-1, 1, out_h)
-    w = torch.linspace(-1, 1, out_w)
-    grid = torch.zeros(out_h, out_w, 2)
-    grid[:, :, 0] = w.unsqueeze(0).repeat(out_h, 1)
-    grid[:, :, 1] = h.unsqueeze(0).repeat(out_w, 1).transpose(0, 1)
-    grid = grid.unsqueeze(0).repeat(1, 1, 1, 1)
+@pytest.mark.parametrize('mode', ['bilinear', 'nearest'])
+@pytest.mark.parametrize('padding_mode', ['zeros', 'border', 'reflection'])
+@pytest.mark.parametrize('align_corners', [True, False])
+def test_grid_sample(mode, padding_mode, align_corners):
+    from mmcv.onnx.symbolic import register_extra_symbolics
+    opset_version = 11
+    register_extra_symbolics(opset_version)
+
+    input = torch.rand(1, 1, 10, 10)
+    grid = torch.Tensor([[[1, 0, 0], [0, 1, 0]]])
+    grid = nn.functional.affine_grid(grid, (1, 1, 15, 15)).type_as(input)
+
+    def func(input, grid):
+        return nn.functional.grid_sample(
+            input,
+            grid,
+            mode=mode,
+            padding_mode=padding_mode,
+            align_corners=align_corners)
+
+    wrapped_model = WrapFunction(func).eval()
+
+    input_names = ['input', 'grid']
+    output_names = ['output']
+
+    with torch.no_grad():
+        torch.onnx.export(
+            wrapped_model, (input.clone(), grid.clone()),
+            onnx_file,
+            export_params=True,
+            keep_initializers_as_inputs=True,
+            input_names=input_names,
+            output_names=output_names,
+            opset_version=11)
+
+    onnx_model = onnx.load(onnx_file)
+
     from mmcv.ops import get_onnxruntime_op_path
     ort_custom_op_path = get_onnxruntime_op_path()
     if not os.path.exists(ort_custom_op_path):
-        pytest.skip('grid_sample for onnxruntime is not compiled.')
-
-    inter_modes = ['bilinear', 'nearest']
-    padding_modes = ['zeros', 'border', 'reflection']
-    corners = [False, True]
-    for inter_mode in inter_modes:
-        for padding_mode in padding_modes:
-            for align_corner in corners:
-                pytorch_output = \
-                    torch.nn.functional.grid_sample(input,
-                                                    grid,
-                                                    mode=inter_mode,
-                                                    padding_mode=padding_mode,
-                                                    align_corners=align_corner)
-                wrapped_model = WrapFunction(torch.nn.functional.grid_sample)
-                wrapped_model.cpu().eval()
-
-                from mmcv.onnx.symbolic import register_extra_symbolics
-                opset_version = 11
-                register_extra_symbolics(opset_version)
-                with torch.no_grad():
-                    torch.onnx.export(
-                        wrapped_model, (input, grid),
-                        onnx_file,
-                        export_params=True,
-                        keep_initializers_as_inputs=True,
-                        input_names=['input', 'grid'],
-                        opset_version=11)
-
-                session_options = rt.SessionOptions()
-                session_options.register_custom_ops_library(ort_custom_op_path)
-                sess = rt.InferenceSession(onnx_file, session_options)
-                input_feature = input.cpu().numpy()
-                grid_feature = grid.cpu().numpy()
-                onnx_output = sess.run(None, {
-                    'input': input_feature,
-                    'grid': grid_feature
-                })
-                os.remove(onnx_file)
-                assert np.allclose(pytorch_output, onnx_output, atol=1e-3)
-
-
-test_grid_sampler()
+        pytest.skip('nms for onnxruntime is not compiled.')
+
+    session_options = rt.SessionOptions()
+    session_options.register_custom_ops_library(ort_custom_op_path)
+
+    # get onnx output
+    input_all = [node.name for node in onnx_model.graph.input]
+    input_initializer = [node.name for node in onnx_model.graph.initializer]
+    net_feed_input = list(set(input_all) - set(input_initializer))
+    assert (len(net_feed_input) == 2)
+    sess = rt.InferenceSession(onnx_file, session_options)
+    ort_result = sess.run(None, {
+        'input': input.detach().numpy(),
+        'grid': grid.detach().numpy()
+    })
+    pytorch_results = wrapped_model(input.clone(), grid.clone())
+    assert np.allclose(pytorch_results, ort_result, atol=1e-3)
 
 
 def test_nms():

From 3dece937a2fda428c231c2a5849498d63ee3ce50 Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Thu, 1 Apr 2021 20:06:39 +0800
Subject: [PATCH 13/15] update code

---
 tests/test_ops/test_onnx.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index 6a8de8a5213..b7a8d559fad 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -31,6 +31,11 @@ def test_grid_sample(mode, padding_mode, align_corners):
     opset_version = 11
     register_extra_symbolics(opset_version)
 
+    from mmcv.ops import get_onnxruntime_op_path
+    ort_custom_op_path = get_onnxruntime_op_path()
+    if not os.path.exists(ort_custom_op_path):
+        pytest.skip('nms for onnxruntime is not compiled.')
+
     input = torch.rand(1, 1, 10, 10)
     grid = torch.Tensor([[[1, 0, 0], [0, 1, 0]]])
     grid = nn.functional.affine_grid(grid, (1, 1, 15, 15)).type_as(input)
@@ -60,11 +65,6 @@ def func(input, grid):
 
     onnx_model = onnx.load(onnx_file)
 
-    from mmcv.ops import get_onnxruntime_op_path
-    ort_custom_op_path = get_onnxruntime_op_path()
-    if not os.path.exists(ort_custom_op_path):
-        pytest.skip('nms for onnxruntime is not compiled.')
-
     session_options = rt.SessionOptions()
     session_options.register_custom_ops_library(ort_custom_op_path)
 

From 16b1c858b33bea29e8fd538feded57718a3fbe24 Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Fri, 2 Apr 2021 13:28:37 +0800
Subject: [PATCH 14/15] update code

---
 tests/test_ops/test_onnx.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index b7a8d559fad..a4939177497 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -79,6 +79,7 @@ def func(input, grid):
         'grid': grid.detach().numpy()
     })
     pytorch_results = wrapped_model(input.clone(), grid.clone())
+    os.remove(onnx_file)
     assert np.allclose(pytorch_results, ort_result, atol=1e-3)
 
 

From 3f64a84b91ff0875e323d45e94e4a035b8151685 Mon Sep 17 00:00:00 2001
From: tangyanfei <tangyanfei@sensetime.com>
Date: Tue, 6 Apr 2021 17:28:39 +0800
Subject: [PATCH 15/15] update code

---
 tests/test_ops/test_onnx.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_ops/test_onnx.py b/tests/test_ops/test_onnx.py
index a4939177497..487edbbf9da 100644
--- a/tests/test_ops/test_onnx.py
+++ b/tests/test_ops/test_onnx.py
@@ -34,7 +34,7 @@ def test_grid_sample(mode, padding_mode, align_corners):
     from mmcv.ops import get_onnxruntime_op_path
     ort_custom_op_path = get_onnxruntime_op_path()
     if not os.path.exists(ort_custom_op_path):
-        pytest.skip('nms for onnxruntime is not compiled.')
+        pytest.skip('custom ops for onnxruntime are not compiled.')
 
     input = torch.rand(1, 1, 10, 10)
     grid = torch.Tensor([[[1, 0, 0], [0, 1, 0]]])
@@ -55,7 +55,7 @@ def func(input, grid):
 
     with torch.no_grad():
         torch.onnx.export(
-            wrapped_model, (input.clone(), grid.clone()),
+            wrapped_model, (input, grid),
             onnx_file,
             export_params=True,
             keep_initializers_as_inputs=True,