|
| 1 | +//#include <ATen/mps/MPSProfiler.h> |
| 2 | +#include <ATen/native/mps/OperationUtils.h> |
| 3 | +#include "vision_kernels.h" |
| 4 | + |
| 5 | +namespace vision { |
| 6 | +namespace ops { |
| 7 | + |
| 8 | +namespace { |
| 9 | + |
| 10 | +at::Tensor nms_kernel( |
| 11 | + const at::Tensor& dets, |
| 12 | + const at::Tensor& scores, |
| 13 | + double iou_threshold) { |
| 14 | + |
| 15 | + using namespace at::native::mps; |
| 16 | + TORCH_CHECK(dets.is_mps(), "dets must be a MPS tensor"); |
| 17 | + TORCH_CHECK(scores.is_mps(), "scores must be a MPS tensor"); |
| 18 | + |
| 19 | + TORCH_CHECK( |
| 20 | + dets.dim() == 2, "boxes should be a 2d tensor, got ", dets.dim(), "D"); |
| 21 | + TORCH_CHECK( |
| 22 | + dets.size(1) == 4, |
| 23 | + "boxes should have 4 elements in dimension 1, got ", |
| 24 | + dets.size(1)); |
| 25 | + TORCH_CHECK( |
| 26 | + scores.dim() == 1, |
| 27 | + "scores should be a 1d tensor, got ", |
| 28 | + scores.dim(), |
| 29 | + "D"); |
| 30 | + TORCH_CHECK( |
| 31 | + dets.size(0) == scores.size(0), |
| 32 | + "boxes and scores should have same number of elements in ", |
| 33 | + "dimension 0, got ", |
| 34 | + dets.size(0), |
| 35 | + " and ", |
| 36 | + scores.size(0)) |
| 37 | + |
| 38 | + //at::Tensor input = at::arange({10}, at::kFloat, c10::nullopt, at::kMPS, c10::nullopt); |
| 39 | + //at::Tensor other = at::arange({10}, at::kFloat, c10::nullopt, at::kMPS, c10::nullopt); |
| 40 | + //at::Tensor out = at::zeros({10}, at::kFloat, c10::nullopt, at::kMPS, c10::nullopt); |
| 41 | + |
| 42 | + if (dets.numel() == 0) { |
| 43 | + return at::empty({0}, dets.options().dtype(at::kLong)); |
| 44 | + } |
| 45 | + |
| 46 | + auto order_t = std::get<1>( |
| 47 | + scores.sort(/*stable=*/true, /*dim=*/0, /* descending=*/true)); |
| 48 | + auto dets_sorted = dets.index_select(0, order_t).contiguous(); |
| 49 | + int dets_num = dets.size(0); |
| 50 | + float iou_threshold_f = static_cast<float>(iou_threshold); |
| 51 | + |
| 52 | + //TODO: ceil_div |
| 53 | + //const int col_blocks = ceil_div(dets_num, threadsPerBlock); |
| 54 | + //at::Tensor mask = |
| 55 | + // at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong)); |
| 56 | + at::Tensor mask = |
| 57 | + at::empty({dets_num}, dets.options().dtype(at::kLong)); |
| 58 | + |
| 59 | + id<MTLBuffer> inputBuffer = getMTLBufferStorage(dets_sorted); |
| 60 | + id<MTLBuffer> outputBuffer = getMTLBufferStorage(mask); |
| 61 | + id<MTLDevice> device = MPSDevice::getInstance()->device(); |
| 62 | + MPSStream* mpsStream = getCurrentMPSStream(); |
| 63 | + //const uint32_t nDim = iter.ndim(); |
| 64 | + //constexpr uint32_t nOffsets = 3; |
| 65 | + const uint32_t numThreads = dets_num; |
| 66 | + dispatch_sync(mpsStream->queue(), ^() { |
| 67 | + @autoreleasepool { |
| 68 | + NSError* error = nil; |
| 69 | + id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder(); |
| 70 | + MTLSize gridSize = MTLSizeMake(numThreads, 1, 1); |
| 71 | + |
| 72 | + |
| 73 | + const std::string kernel = "nms_" + scalarToMetalTypeString(dets_sorted.scalar_type()); |
| 74 | + id<MTLComputePipelineState> binaryPSO = mps::binaryPipelineState(device, kernel); |
| 75 | + |
| 76 | + // this function call is a no-op if MPS Profiler is not enabled |
| 77 | + //getMPSProfiler().beginProfileKernel(binaryPSO, kernel, {input, other}); |
| 78 | + |
| 79 | + [computeEncoder setComputePipelineState:binaryPSO]; |
| 80 | + [computeEncoder setBuffer:inputBuffer offset:dets_sorted.storage_offset() * dets_sorted.element_size() atIndex:0]; |
| 81 | + [computeEncoder setBuffer:outputBuffer offset:mask.storage_offset() * mask.element_size() atIndex:1]; |
| 82 | + [computeEncoder setBytes:&dets_num length:sizeof(int) atIndex:2]; |
| 83 | + [computeEncoder setBytes:&iou_threshold_f length:sizeof(float) atIndex:3]; |
| 84 | + |
| 85 | + NSUInteger tgSize = binaryPSO.maxTotalThreadsPerThreadgroup; |
| 86 | + if (tgSize > numThreads) { |
| 87 | + tgSize = numThreads; |
| 88 | + } |
| 89 | + |
| 90 | + MTLSize threadGroupSize = MTLSizeMake(tgSize, 1, 1); |
| 91 | + [computeEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadGroupSize]; |
| 92 | + |
| 93 | + //getMPSProfiler().endProfileKernel(binaryPSO); |
| 94 | + } |
| 95 | + }); |
| 96 | + return mask; |
| 97 | + |
| 98 | +} |
| 99 | + |
| 100 | +} // namespace |
| 101 | + |
| 102 | +TORCH_LIBRARY_IMPL(torchvision, MPS, m) { |
| 103 | + m.impl(TORCH_SELECTIVE_NAME("torchvision::nms"), TORCH_FN(nms_kernel)); |
| 104 | +} |
| 105 | + |
| 106 | +} // namespace ops |
| 107 | +} // namespace vision |
0 commit comments