
Torchscript support #159

Merged
merged 44 commits on Mar 7, 2022
Changes from 43 commits
Commits
44 commits
2bd3cdb
support torchscript
grimoire Jan 16, 2022
7f06599
add nms
grimoire Jan 17, 2022
87cc4a0
add torchscript configs and update deploy process and dump-info
AllentDan Jan 18, 2022
310f28b
Merge branch 'torchscript-support-my' into torchscript-support
AllentDan Jan 18, 2022
22b46d7
typescript -> torchscript
AllentDan Jan 18, 2022
96909ec
add torchscript custom extension support
Jan 19, 2022
9a26bfc
add ts custom ops again
Jan 26, 2022
2b6f5fa
support mmseg unet
AllentDan Jan 27, 2022
496751c
[WIP] add optimizer for torchscript (#119)
Jan 28, 2022
5ae4609
Torchscript optimizer python api (#121)
Jan 29, 2022
a25f360
Merge Master, update optimizer (#151)
Feb 11, 2022
dc2472d
configs for all tasks
AllentDan Feb 14, 2022
fb5f752
use torchvision roi align
AllentDan Feb 15, 2022
71c4786
resolve conflicts
AllentDan Feb 15, 2022
c69a957
remove unnecessary code
Feb 16, 2022
024a328
fix ut
AllentDan Feb 16, 2022
c4c8a4d
fix ut
AllentDan Feb 16, 2022
f9b914e
export
AllentDan Feb 17, 2022
d6c9b81
det dynamic
AllentDan Feb 17, 2022
afd4918
det dynamic
AllentDan Feb 17, 2022
d3ee3ef
add ut
AllentDan Feb 18, 2022
98f3450
fix ut
AllentDan Feb 18, 2022
a2d22da
add ut and docs
AllentDan Feb 22, 2022
b1b0e8c
fix ut
AllentDan Feb 23, 2022
c742ba0
skip torchscript ut if no ops available
AllentDan Feb 23, 2022
2ffddde
add torchscript option to build.md
AllentDan Feb 23, 2022
ffe0630
update benchmark and resolve comments
AllentDan Feb 24, 2022
04e6dd9
resolve conflicts
AllentDan Feb 25, 2022
5bf0411
resolve conflicts
AllentDan Feb 25, 2022
9189cf9
rename configs
AllentDan Feb 25, 2022
82c5a41
fix mrcnn cuda test
AllentDan Feb 28, 2022
817d4a0
remove useless
AllentDan Feb 28, 2022
6f74cdc
add version requirements to docs and comments to codes
AllentDan Feb 28, 2022
fb2dcce
resolve conflicts
AllentDan Feb 28, 2022
796fb67
enable empty image exporting for torchscript and accelerate ORT infer…
AllentDan Mar 1, 2022
bc428b4
Merge branch 'dev-v0.4.0' into torchscript-support
AllentDan Mar 2, 2022
2a4f23c
rebase
AllentDan Mar 2, 2022
616d86e
update example for torchscript.md
AllentDan Mar 3, 2022
d3dc541
update FAQs for torchscript.md
AllentDan Mar 3, 2022
a456c55
resolve comments
AllentDan Mar 4, 2022
2f79231
only use torchvision roi_align for torchscript
AllentDan Mar 4, 2022
1471241
fix ut
AllentDan Mar 4, 2022
25f9693
use torchvision roi align when pool mode is avg
AllentDan Mar 4, 2022
8ed6b86
resolve comments
AllentDan Mar 7, 2022
1 change: 1 addition & 0 deletions configs/_base_/backends/torchscript.py
@@ -0,0 +1 @@
backend_config = dict(type='torchscript')
6 changes: 6 additions & 0 deletions configs/_base_/torchscript_config.py
@@ -0,0 +1,6 @@
ir_config = dict(
type='torchscript',
save_file='end2end.pt',
input_names=['input'],
output_names=['output'],
input_shape=None)
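
The base IR config above is inherited by every task config that follows. A minimal sketch (not part of this PR, assuming mmcv is installed and the repository root is the working directory) of inspecting its fields:

from mmcv import Config

# Load the base IR config shown above; ir_config selects TorchScript as the
# intermediate representation instead of ONNX.
deploy_cfg = Config.fromfile('configs/_base_/torchscript_config.py')
print(deploy_cfg.ir_config['type'])       # 'torchscript'
print(deploy_cfg.ir_config['save_file'])  # 'end2end.pt', the traced module
# input_shape=None means the trace uses the shape of the sample image passed
# at conversion time rather than a fixed static shape.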
6 changes: 6 additions & 0 deletions configs/mmcls/classification_torchscript.py
@@ -0,0 +1,6 @@
_base_ = [
'../_base_/torchscript_config.py', '../_base_/backends/torchscript.py'
]

ir_config = dict(input_shape=None)
codebase_config = dict(type='mmcls', task='Classification')
4 changes: 4 additions & 0 deletions configs/mmdet/_base_/base_instance-seg_torchscript.py
@@ -0,0 +1,4 @@
_base_ = ['./base_torchscript.py']

ir_config = dict(output_names=['dets', 'labels', 'masks'])
codebase_config = dict(post_processing=dict(export_postprocess_mask=False))
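
With export_postprocess_mask=False, the exported graph stops at fixed-size mask logits per detection and the pasting onto the full image stays outside the graph. An illustrative sketch of that external step (the 28x28 mask size and the helper name are assumptions, not code from this PR):

import torch
import torch.nn.functional as F

def paste_mask(mask_logits, box, img_h, img_w, thr=0.5):
    # mask_logits: (28, 28) probabilities for one detected box (xyxy)
    full = torch.zeros(img_h, img_w)
    x1, y1, x2, y2 = [int(round(float(v))) for v in box]
    x1, y1 = max(x1, 0), max(y1, 0)
    x2, y2 = min(x2, img_w), min(y2, img_h)
    if x2 <= x1 or y2 <= y1:
        return full  # box fully outside the image
    # resize the fixed-size mask to the box and threshold it
    m = F.interpolate(mask_logits[None, None], size=(y2 - y1, x2 - x1),
                      mode='bilinear', align_corners=False)[0, 0]
    full[y1:y2, x1:x2] = (m > thr).float()
    return full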
16 changes: 16 additions & 0 deletions configs/mmdet/_base_/base_torchscript.py
@@ -0,0 +1,16 @@
_base_ = ['../../_base_/torchscript_config.py']

ir_config = dict(output_names=['dets', 'labels'])
codebase_config = dict(
type='mmdet',
task='ObjectDetection',
model_type='end2end',
post_processing=dict(
score_threshold=0.05,
confidence_threshold=0.005, # for YOLOv3
iou_threshold=0.5,
max_output_boxes_per_class=200,
pre_top_k=5000,
keep_top_k=100,
background_label_id=-1,
))
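
The post_processing block configures the score, IoU, and top-k limits baked into the exported detection head. An illustrative sketch (not code from this PR) of how these fields are conventionally applied to raw detections of a single class:

import torch
from torchvision.ops import nms

def filter_dets(boxes, scores, score_threshold=0.05, iou_threshold=0.5,
                pre_top_k=5000, keep_top_k=100):
    # boxes: (N, 4) in xyxy format, scores: (N,) for one class
    keep = scores > score_threshold             # drop low-confidence boxes
    boxes, scores = boxes[keep], scores[keep]
    if 0 < pre_top_k < scores.numel():
        scores, idx = scores.topk(pre_top_k)    # cap candidates before NMS
        boxes = boxes[idx]
    keep = nms(boxes, scores, iou_threshold)    # suppress overlapping boxes
    keep = keep[:keep_top_k]                    # final per-image cap
    return boxes[keep], scores[keep]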
3 changes: 3 additions & 0 deletions configs/mmdet/detection/detection_torchscript.py
@@ -0,0 +1,3 @@
_base_ = [
'../_base_/base_torchscript.py', '../../_base_/backends/torchscript.py'
]
4 changes: 4 additions & 0 deletions configs/mmdet/instance-seg/instance-seg_torchscript.py
@@ -0,0 +1,4 @@
_base_ = [
'../_base_/base_instance-seg_torchscript.py',
'../../_base_/backends/torchscript.py'
]
@@ -0,0 +1,7 @@
_base_ = [
'../../_base_/torchscript_config.py',
'../../_base_/backends/torchscript.py'
]

ir_config = dict(input_shape=None)
codebase_config = dict(type='mmedit', task='SuperResolution')
7 changes: 7 additions & 0 deletions configs/mmocr/text-detection/text-detection_torchscript.py
@@ -0,0 +1,7 @@
_base_ = [
'../../_base_/torchscript_config.py',
'../../_base_/backends/torchscript.py'
]

ir_config = dict(input_shape=None)
codebase_config = dict(type='mmocr', task='TextDetection')
@@ -0,0 +1,7 @@
_base_ = [
'../../_base_/torchscript_config.py',
'../../_base_/backends/torchscript.py'
]

ir_config = dict(input_shape=None)
codebase_config = dict(type='mmocr', task='TextRecognition')
6 changes: 6 additions & 0 deletions configs/mmseg/segmentation_torchscript.py
@@ -0,0 +1,6 @@
_base_ = [
'../_base_/torchscript_config.py', '../_base_/backends/torchscript.py'
]

ir_config = dict(input_shape=None)
codebase_config = dict(type='mmseg', task='Segmentation')
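
Each of the task configs above produces a plain TorchScript module (end2end.pt by default), so it can be loaded without mmdeploy at inference time. A minimal sketch (the 1x3x224x224 input is only an illustrative assumption; the real shape and preprocessing depend on the model and config):

import torch

# Models that rely on the TorchScript custom ops built below additionally need
# the ops library loaded first, e.g. via torch.ops.load_library(<path to .so>).
model = torch.jit.load('end2end.pt').eval()
with torch.no_grad():
    # replace the random tensor with a properly preprocessed image batch
    output = model(torch.rand(1, 3, 224, 224))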
6 changes: 6 additions & 0 deletions csrc/backend_ops/CMakeLists.txt
@@ -30,3 +30,9 @@ if ("ncnn" IN_LIST MMDEPLOY_TARGET_BACKENDS)
message(STATUS "Build NCNN custom ops")
add_subdirectory(ncnn)
endif ()

# build TorchScript ops
if ("torchscript" IN_LIST MMDEPLOY_TARGET_BACKENDS)
message(STATUS "Build torchsciprt custom ops")
add_subdirectory(torchscript)
endif ()
@@ -0,0 +1,94 @@
// Copyright (c) OpenMMLab. All rights reserved.
#ifndef COMMON_CUDA_HELPER
#define COMMON_CUDA_HELPER

#include <cublas_v2.h>
#include <cuda.h>

#include <algorithm>

#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

#define THREADS_PER_BLOCK 512

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
inline int GET_BLOCKS(const int N) {
int optimal_block_num = DIVUP(N, THREADS_PER_BLOCK);
int max_block_num = 4096;
return std::min(optimal_block_num, max_block_num);
}

#define cudaCheckError() \
{ \
cudaError_t e = cudaGetLastError(); \
if (e != cudaSuccess) { \
printf("Cuda failure %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e)); \
exit(0); \
} \
}

/**
* Returns a view of the original tensor with its dimensions permuted.
*
* @param[out] dst pointer to the destination tensor
* @param[in] src pointer to the source tensor
* @param[in] src_size shape of the src tensor
* @param[in] permute The desired ordering of dimensions
* @param[in] src_dim dim of src tensor
* @param[in] stream cuda stream handle
*/
template <class scalar_t>
void memcpyPermute(scalar_t* dst, const scalar_t* src, int* src_size, int* permute, int src_dim,
cudaStream_t stream = 0);

template <typename scalar_t>
cublasStatus_t cublasGemmWrap(cublasHandle_t handle, cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k, const scalar_t* alpha,
const scalar_t* A, int lda, const scalar_t* B, int ldb,
const scalar_t* beta, scalar_t* C, int ldc);

template <typename scalar_t>
__device__ scalar_t bilinear_interpolate(const scalar_t* input, const int height, const int width,
scalar_t y, scalar_t x) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;

if (y <= 0) y = 0;
if (x <= 0) x = 0;

int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;

if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (scalar_t)y_low;
} else {
y_high = y_low + 1;
}

if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (scalar_t)x_low;
} else {
x_high = x_low + 1;
}

scalar_t ly = y - y_low;
scalar_t lx = x - x_low;
scalar_t hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
scalar_t v1 = input[y_low * width + x_low];
scalar_t v2 = input[y_low * width + x_high];
scalar_t v3 = input[y_high * width + x_low];
scalar_t v4 = input[y_high * width + x_high];
scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);

return val;
}

#endif // COMMON_CUDA_HELPER
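
For reference (not part of the diff), the bilinear_interpolate device function above computes the standard bilinear formula. With the clamped neighbor indices y_low, y_high, x_low, x_high as in the code:

\[
\begin{aligned}
l_y &= y - y_{\mathrm{low}}, \quad l_x = x - x_{\mathrm{low}}, \quad h_y = 1 - l_y, \quad h_x = 1 - l_x,\\
v &= h_y h_x\, v_1 + h_y l_x\, v_2 + l_y h_x\, v_3 + l_y l_x\, v_4,
\end{aligned}
\]

where v_1, v_2, v_3, v_4 are the pixels at (y_low, x_low), (y_low, x_high), (y_high, x_low), (y_high, x_high).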
@@ -0,0 +1,82 @@
#include <cmath>
#include <cstdint>

template <typename T>
T bilinear_interpolate_2d(const T *src, const int64_t src_h, const int64_t src_w, const T h,
const T w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}

int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;

T lh = h - h_low;
T lw = w - w_low;
T hh = 1 - lh;
T hw = 1 - lw;

T v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
T v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
T v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
T v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1) v4 = src[h_high * src_w + w_high];

T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}

// output: (channels * kernel_h * kernel_w, dst_h * dst_w)
template <typename T>
void deformable_im2col_2d(const T *input, const T *offset, const T *mask, const int64_t src_h,
const int64_t src_w, const int64_t kernel_h, const int64_t kernel_w,
const int64_t pad_h, const int64_t pad_w, const int64_t stride_h,
const int64_t stride_w, const int64_t dilation_h,
const int64_t dilation_w, const int64_t channels,
const int64_t offset_groups, const int64_t dst_h, const int64_t dst_w,
const bool use_mask, T *columns) {
const int64_t workload = channels * dst_h * dst_w;
for (int64_t index = 0; index != workload; ++index) {
const int64_t ow = index % dst_w;
const int64_t oh = (index / dst_w) % dst_h;
const int64_t ic = index / (dst_w * dst_h);
const int64_t oc = ic * kernel_h * kernel_w;

int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = ic / c_per_offset_grp;

auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow);
auto input_ptr = input + ic * (src_h * src_w);
auto offset_ptr = offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
auto mask_ptr = mask;
if (use_mask) {
mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w;
}

for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int64_t mask_idx = kh * kernel_w + kw;
const int64_t offset_idx = 2 * mask_idx;

T mask_value = 1;
if (use_mask) {
mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow];
}

const T offset_h = offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow];
const T offset_w = offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow];
const T ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h;
const T iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = mask_value * bilinear_interpolate_2d<T>(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}
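
Once deformable_im2col_2d has filled the columns buffer, the convolution for one sample and group reduces to a single matrix product, which is why the buffer is laid out as (channels * kernel_h * kernel_w, dst_h * dst_w). A shape-only sketch (illustration with arbitrary example sizes, not code from this PR):

import numpy as np

channels, kernel_h, kernel_w = 64, 3, 3
dst_h, dst_w, out_channels = 32, 32, 128

columns = np.zeros((channels * kernel_h * kernel_w, dst_h * dst_w), np.float32)
weight = np.zeros((out_channels, channels * kernel_h * kernel_w), np.float32)

# (out_channels, dst_h * dst_w) reshaped to (out_channels, dst_h, dst_w)
output = (weight @ columns).reshape(out_channels, dst_h, dst_w)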
@@ -68,7 +68,7 @@

#include <float.h>

#include "common_cuda_helper.hpp"
#include "common_cuda_helper.cuh"

template <typename T>
__device__ T dmcn_im2col_bilinear(const T *input, const int data_width, const int height,
1 change: 1 addition & 0 deletions csrc/backend_ops/onnxruntime/CMakeLists.txt
@@ -14,6 +14,7 @@ mmdeploy_export(${PROJECT_NAME}_obj)
target_include_directories(${PROJECT_NAME}_obj PUBLIC
$<BUILD_INTERFACE:${ONNXRUNTIME_DIR}/include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/common>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../common>
$<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/csrc>)
target_link_directories(${PROJECT_NAME}_obj PUBLIC
${ONNXRUNTIME_DIR}/lib)
@@ -4,88 +4,11 @@
#include <cmath>
#include <vector>

#include "modulated_deform_conv/modulated_deform_conv_cpu.h"
#include "ort_utils.h"

namespace mmdeploy {

float bilinear_interpolate_2d(const float *src, const int64_t src_h, const int64_t src_w,
const float h, const float w) {
if (h <= -1 || src_h <= h || w <= -1 || src_w <= w) {
return 0;
}

int64_t h_low = floor(h);
int64_t w_low = floor(w);
int64_t h_high = h_low + 1;
int64_t w_high = w_low + 1;

float lh = h - h_low;
float lw = w - w_low;
float hh = 1 - lh;
float hw = 1 - lw;

float v1 = 0;
if (h_low >= 0 && w_low >= 0) v1 = src[h_low * src_w + w_low];
float v2 = 0;
if (h_low >= 0 && w_high <= src_w - 1) v2 = src[h_low * src_w + w_high];
float v3 = 0;
if (h_high <= src_h - 1 && w_low >= 0) v3 = src[h_high * src_w + w_low];
float v4 = 0;
if (h_high <= src_h - 1 && w_high <= src_w - 1) v4 = src[h_high * src_w + w_high];

float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;

float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}

// output: (channels * kernel_h * kernel_w, dst_h * dst_w)
void deformable_im2col_2d(const float *input, const float *offset, const float *mask,
const int64_t src_h, const int64_t src_w, const int64_t kernel_h,
const int64_t kernel_w, const int64_t pad_h, const int64_t pad_w,
const int64_t stride_h, const int64_t stride_w, const int64_t dilation_h,
const int64_t dilation_w, const int64_t channels,
const int64_t offset_groups, const int64_t dst_h, const int64_t dst_w,
const bool use_mask, float *columns) {
const int64_t workload = channels * dst_h * dst_w;
for (int64_t index = 0; index != workload; ++index) {
const int64_t ow = index % dst_w;
const int64_t oh = (index / dst_w) % dst_h;
const int64_t ic = index / (dst_w * dst_h);
const int64_t oc = ic * kernel_h * kernel_w;

int64_t c_per_offset_grp = channels / offset_groups;
const int64_t grp_idx = ic / c_per_offset_grp;

auto columns_ptr = columns + (oc * (dst_h * dst_w) + oh * dst_w + ow);
auto input_ptr = input + ic * (src_h * src_w);
auto offset_ptr = offset + grp_idx * 2 * kernel_h * kernel_w * dst_h * dst_w;
auto mask_ptr = mask;
if (use_mask) {
mask_ptr += grp_idx * kernel_h * kernel_w * dst_h * dst_w;
}

for (int64_t kh = 0; kh < kernel_h; ++kh) {
for (int64_t kw = 0; kw < kernel_w; ++kw) {
const int64_t mask_idx = kh * kernel_w + kw;
const int64_t offset_idx = 2 * mask_idx;

float mask_value = 1;
if (use_mask) {
mask_value = mask_ptr[mask_idx * (dst_h * dst_w) + oh * dst_w + ow];
}

const float offset_h = offset_ptr[offset_idx * (dst_h * dst_w) + oh * dst_w + ow];
const float offset_w = offset_ptr[(offset_idx + 1) * (dst_h * dst_w) + oh * dst_w + ow];
const float ih = (oh * stride_h - pad_h) + kh * dilation_h + offset_h;
const float iw = (ow * stride_w - pad_w) + kw * dilation_w + offset_w;
*columns_ptr = mask_value * bilinear_interpolate_2d(input_ptr, src_h, src_w, ih, iw);
columns_ptr += dst_h * dst_w;
}
}
}
}

void gemm_ref_fp32(const float *A, const float *B, const float *V, const float *H,
const int32_t trans_A, const int32_t trans_B, const int32_t M, const int32_t N,
const int32_t K, const float alpha, const float beta, float *Y) {
@@ -162,12 +85,12 @@ void deformable_conv2d_ref_fp32(const float *src, const float *offset, const flo

for (int64_t b = 0; b < batch; ++b) {
for (int64_t g = 0; g < group; ++g) {
deformable_im2col_2d(src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h,
src_w, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h,
dilation_w, ic_per_gp, offset_group, dst_h, dst_w, mask != nullptr,
columns);
deformable_im2col_2d<float>(
src + b * src_c * src_h * src_w + g * ic_per_gp * src_h * src_w,
offset + b * offset_group * 2 * kernel_h * kernel_w * dst_h * dst_w,
mask + b * offset_group * kernel_h * kernel_w * dst_h * dst_w, src_h, src_w, kernel_h,
kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, ic_per_gp,
offset_group, dst_h, dst_w, mask != nullptr, columns);
float *dst_ptr = dst + b * dst_c * dst_h * dst_w + g * oc_per_gp * dst_h * dst_w;
if (bias != nullptr) {
const float *bias_ptr = bias + g * oc_per_gp;