Draft

qqaatw · qqaatw · commit 60e299065fb2 · 2023-05-31T00:47:14.000+08:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -4,6 +4,7 @@ set(CMAKE_CXX_STANDARD 17)
 file(STRINGS version.txt TORCHVISION_VERSION)
 
 option(WITH_CUDA "Enable CUDA support" OFF)
+option(WITH_MPS "Enable MPS support" OFF)
 option(WITH_PNG "Enable features requiring LibPNG." ON)
 option(WITH_JPEG "Enable features requiring LibJPEG." ON)
 option(USE_PYTHON "Link to Python when building" OFF)
@@ -15,6 +16,11 @@ if(WITH_CUDA)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
 endif()
 
+if(WITH_MPS)  # opt-in MPS (Metal) backend, see option(WITH_MPS ...)
+  enable_language(OBJC OBJCXX)  # the MPS kernels live in Objective-C++ (.mm) files
+  add_definitions(-DWITH_MPS)  # same WITH_MPS macro that setup.py defines
+endif()
+
 find_package(Torch REQUIRED)
 
 if (WITH_PNG)
@@ -79,6 +85,9 @@ list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCP
 if(WITH_CUDA)
     list(APPEND ALLOW_LISTED ${TVCPP}/ops/cuda ${TVCPP}/ops/autocast)
 endif()
+if(WITH_MPS)  # ops/mps is only globbed into ALL_SOURCES when the backend is enabled
+    list(APPEND ALLOW_LISTED ${TVCPP}/ops/mps)
+endif()
 
 FOREACH(DIR ${ALLOW_LISTED})
     file(GLOB ALL_SOURCES ${ALL_SOURCES} ${DIR}/*.*)
diff --git a/setup.py b/setup.py
@@ -137,10 +137,13 @@ def get_extensions():
         + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp"))
         + glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp"))
     )
+    source_mps = glob.glob(os.path.join(extensions_dir, "ops", "mps", "*.mm"))
 
     print("Compiling extensions with following flags:")
     force_cuda = os.getenv("FORCE_CUDA", "0") == "1"
     print(f"  FORCE_CUDA: {force_cuda}")
+    force_mps = os.getenv("FORCE_MPS", "0") == "1"
+    print(f"  FORCE_MPS: {force_mps}")
     debug_mode = os.getenv("DEBUG", "0") == "1"
     print(f"  DEBUG: {debug_mode}")
     use_png = os.getenv("TORCHVISION_USE_PNG", "1") == "1"
@@ -202,6 +205,9 @@ def get_extensions():
             define_macros += [("WITH_HIP", None)]
             nvcc_flags = []
         extra_compile_args["nvcc"] = nvcc_flags
+    elif torch.backends.mps.is_available() or force_mps:
+        sources += source_mps
+        define_macros += [("WITH_MPS", None)]
 
     if sys.platform == "win32":
         define_macros += [("torchvision_EXPORTS", None)]
diff --git a/test/common_utils.py b/test/common_utils.py
@@ -133,6 +133,11 @@ def needs_cuda(test_func):
 
     return pytest.mark.needs_cuda(test_func)
 
+def needs_mps(test_func):
+    import pytest
+
+    return pytest.mark.needs_mps(test_func)
+
 
 def _create_data(height=3, width=3, channels=3, device="cpu"):
     # TODO: When all relevant tests are ported to pytest, turn this into a module-level fixture
diff --git a/torchvision/csrc/ops/mps/nms_kernel.mm b/torchvision/csrc/ops/mps/nms_kernel.mm
@@ -0,0 +1,107 @@
+//#include <ATen/mps/MPSProfiler.h>
+#include <ATen/native/mps/OperationUtils.h>
+#include "vision_kernels.h"
+
+namespace vision {
+namespace ops {
+
+namespace {
+
+// MPS implementation of torchvision::nms (draft): validates the inputs, orders
+// the boxes by descending score and runs the `nms_<dtype>` Metal kernel with
+// one thread per box.
+//
+// dets:          (N, 4) MPS tensor of boxes.
+// scores:        (N) MPS tensor holding one score per box.
+// iou_threshold: overlap threshold, narrowed to float for the kernel.
+//
+// Returns an empty int64 tensor when `dets` has no elements, otherwise the
+// (N) int64 mask filled by the kernel (one word per score-sorted box).
+// NOTE(review): that is the raw kernel output, not a list of kept box
+// indices -- confirm against the contract of the torchvision::nms op.
+at::Tensor nms_kernel(
+    const at::Tensor& dets,
+    const at::Tensor& scores,
+    double iou_threshold) {
+  using namespace at::native::mps;
+  TORCH_CHECK(dets.is_mps(), "dets must be a MPS tensor");
+  TORCH_CHECK(scores.is_mps(), "scores must be a MPS tensor");
+
+  TORCH_CHECK(
+      dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D");
+  TORCH_CHECK(
+      dets.size(1) == 4,
+      "boxes should have 4 elements in dimension 1, got ",
+      dets.size(1));
+  TORCH_CHECK(
+      scores.dim() == 1,
+      "scores should be a 1d tensor, got ",
+      scores.dim(),
+      "D");
+  TORCH_CHECK(
+      dets.size(0) == scores.size(0),
+      "boxes and scores should have same number of elements in ",
+      "dimension 0, got ",
+      dets.size(0),
+      " and ",
+      scores.size(0));
+
+  // Nothing to suppress: return early so no zero-sized grid is dispatched.
+  if (dets.numel() == 0) {
+    return at::empty({0}, dets.options().dtype(at::kLong));
+  }
+
+  // Sort by descending score so thread `tid` handles the tid-th best box.
+  auto order_t = std::get<1>(
+      scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true));
+  auto dets_sorted = dets.index_select(0, order_t).contiguous();
+  // The kernel takes the box count and threshold as 32-bit `int` / `float`.
+  const int dets_num = static_cast<int>(dets.size(0));
+  const float iou_threshold_f = static_cast<float>(iou_threshold);
+
+  // TODO: ceil_div -- size the mask as dets_num * col_blocks once the kernel
+  // works on column blocks; for now it holds a single word per box.
+  at::Tensor mask = at::empty({dets_num}, dets.options().dtype(at::kLong));
+
+  id<MTLBuffer> inputBuffer = getMTLBufferStorage(dets_sorted);
+  id<MTLBuffer> outputBuffer = getMTLBufferStorage(mask);
+  id<MTLDevice> device = MPSDevice::getInstance()->device();
+  MPSStream* mpsStream = getCurrentMPSStream();
+  const uint32_t numThreads = dets_num;
+  dispatch_sync(mpsStream->queue(), ^() {
+    @autoreleasepool {
+      id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
+      MTLSize gridSize = MTLSizeMake(numThreads, 1, 1);
+
+      // Kernel variants are named after the element type, e.g. "nms_float".
+      const std::string kernel = "nms_" + scalarToMetalTypeString(dets_sorted.scalar_type());
+      id<MTLComputePipelineState> binaryPSO = mps::binaryPipelineState(device, kernel);
+
+      [computeEncoder setComputePipelineState:binaryPSO];
+      // Buffer indices must match the [[buffer(n)]] slots of the Metal kernel.
+      [computeEncoder setBuffer:inputBuffer offset:dets_sorted.storage_offset() * dets_sorted.element_size() atIndex:0];
+      [computeEncoder setBuffer:outputBuffer offset:mask.storage_offset() * mask.element_size() atIndex:1];
+      [computeEncoder setBytes:&dets_num length:sizeof(int) atIndex:2];
+      [computeEncoder setBytes:&iou_threshold_f length:sizeof(float) atIndex:3];
+
+      // One thread per box, in threadgroups no larger than the pipeline allows.
+      NSUInteger tgSize = binaryPSO.maxTotalThreadsPerThreadgroup;
+      if (tgSize > numThreads) {
+        tgSize = numThreads;
+      }
+
+      MTLSize threadGroupSize = MTLSizeMake(tgSize, 1, 1);
+      [computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadGroupSize];
+    }
+  });
+  return mask;
+}
+
+} // namespace
+
+TORCH_LIBRARY_IMPL(torchvision, MPS, m) { // registers nms_kernel as the MPS implementation of torchvision::nms
+  m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(nms_kernel));
+}
+
+} // namespace ops
+} // namespace vision
diff --git a/torchvision/csrc/ops/mps/vision_kernels.h b/torchvision/csrc/ops/mps/vision_kernels.h
@@ -0,0 +1,96 @@
+#include <ATen/native/mps/OperationUtils.h>
+
+namespace vision {
+namespace ops {
+
+namespace mps {
+
+static const char* METAL_VISION = R"VISION_METAL(
+
+#include <metal_stdlib>
+using namespace metal;
+
+// Returns true when the intersection-over-union of boxes a and b exceeds
+// threshold. Each box is read through its .x/.y/.z/.w components, used here as
+// (x1, y1, x2, y2).
+template <typename T, typename scalar_t>
+bool IoU(
+  constant T & a,
+  constant T & b,
+  scalar_t threshold) {
+  const scalar_t zero = static_cast<scalar_t>(0);
+  auto overlap_w = max(zero, min(a.z, b.z) - max(a.x, b.x));
+  auto overlap_h = max(zero, min(a.w, b.w) - max(a.y, b.y));
+  auto inter = overlap_w * overlap_h;
+  auto area_first = (a.z - a.x) * (a.w - a.y);
+  auto area_second = (b.z - b.x) * (b.w - b.y);
+  return (inter / (area_first + area_second - inter)) > threshold;
+}
+
+// One thread per score-sorted box: bit i of out[tid] is set when a later box i
+// overlaps box tid beyond iou_threshold. The 64-bit word only covers boxes 0..63.
+template<typename T, typename scalar_t>
+kernel void nms(constant  T       * input         [[buffer(0)]],
+                device    int64_t * out           [[buffer(1)]],
+                constant  int     & dets_num      [[buffer(2)]],
+                constant  float   & iou_threshold [[buffer(3)]],
+                uint      tid  [[thread_position_in_grid]]) {
+  uint64_t t = 0;  // 64-bit accumulator: a 32-bit int overflowed for i >= 31
+  for (int i = tid + 1; i < dets_num && i < 64; i++){
+    if (IoU<T, scalar_t>(input[tid], input[i], iou_threshold)) t |= static_cast<uint64_t>(1) << i;
+  }
+  out[tid] = static_cast<int64_t>(t);
+}
+// Explicitly instantiates nms<DTYPE4, DTYPE> under the host-visible name "nms_<DTYPE>".
+#define REGISTER_NMS_OP(DTYPE)                        \
+template                                               \
+[[host_name("nms_" #DTYPE)]]                          \
+kernel void nms<DTYPE ## 4, DTYPE>(                               \
+  constant DTYPE ## 4   * input         [[buffer(0)]],   \
+  device   int64_t  * out           [[buffer(1)]],   \
+  constant int      & dets_num      [[buffer(2)]],   \
+  constant float    & iou_threshold [[buffer(3)]],   \
+  uint tid [[thread_position_in_grid]]);
+
+REGISTER_NMS_OP(float);
+REGISTER_NMS_OP(half);
+
+)VISION_METAL";
+
+// Compiles METAL_VISION into a Metal library on the first call and caches it in a
+// function-local static; later calls return that library and ignore `device`.
+static id<MTLLibrary> compileBinaryOpsLibrary(id<MTLDevice> device) {
+  static id<MTLLibrary> cachedLibrary = nil;
+  if (!cachedLibrary) {
+    NSError* compileError = nil;
+    MTLCompileOptions* compileOptions = [[MTLCompileOptions new] autorelease];
+    [compileOptions setLanguageVersion:MTLLanguageVersion2_3];
+    NSString* source = [NSString stringWithCString:METAL_VISION encoding:NSASCIIStringEncoding];
+    cachedLibrary = [device newLibraryWithSource:source options:compileOptions error:&compileError];
+    TORCH_CHECK(
+        cachedLibrary, "Failed to create metal binary library, error: ", [[compileError description] UTF8String]);
+  }
+  return cachedLibrary;
+}
+
+// Returns the compute pipeline state for the Metal function named `kernel`,
+// building it on first request and caching it per name for later calls.
+static id<MTLComputePipelineState> binaryPipelineState(id<MTLDevice> device, const std::string& kernel) {
+  static std::unordered_map<std::string, id<MTLComputePipelineState>> psoCache;
+  // find() instead of operator[]: a failed lookup must not insert a nil entry.
+  auto cached = psoCache.find(kernel);
+  if (cached != psoCache.end()) return cached->second;
+  NSError* error = nil;
+  id<MTLLibrary> binaryLib = compileBinaryOpsLibrary(device);
+  id<MTLFunction> binaryFunc = [binaryLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]];
+  TORCH_CHECK(binaryFunc, "Failed to create function state object for: ", kernel);
+  id<MTLComputePipelineState> pso = [device newComputePipelineStateWithFunction:binaryFunc error:&error];
+  [binaryFunc release]; // balances newFunctionWithName: (manual retain/release, cf. the autorelease above)
+  TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]);
+  psoCache[kernel] = pso;
+  return pso;
+}
+
+}
+}
+}  // namespace