Skip to content

Commit e4ffdc3

Browse files
razarmehr authored and kulinseth committed
Refactor blit copies and reuse scalar tensor's MPS storage in BinaryOps (#52)

* Refactor blit copies and use commitAndContinue in commitAndWait()

* Remove commitAndContinue from commitAndWait()

* [MPS] Add commitAndContinue to non blocking blits. (#51)

* Merge commitAndContinue changes from mps_master

* Don't use getMPSGraphTensorFromScalar() if tensor is already on MPS device in Binary Ops.
  This improves performance by preventing copy from GPU to CPU and back to GPU again.

Co-authored-by: Kulin Seth <kulin_seth@apple.com>
1 parent 09a9bff commit e4ffdc3

File tree

4 files changed

+61
-90
lines changed

4 files changed

+61
-90
lines changed

aten/src/ATen/mps/MPSStream.h

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ typedef void* MTLDevice_t;
3434
namespace at {
3535
namespace mps {
3636

37-
#define USE_MPSCOMMANDBUFFER 1
38-
3937
//-----------------------------------------------------------------
4038
// MPSStream
4139
//-----------------------------------------------------------------
@@ -44,6 +42,13 @@ class TORCH_API MPSStream
4442
{
4543
public:
4644
enum Unchecked { UNCHECKED };
45+
46+
enum class SyncType {
47+
NONE, // no commit to command buffer
48+
COMMIT, // commit and flush the command buffer
49+
COMMIT_AND_WAIT, // flush and wait for command buffer execution to finish
50+
COMMIT_AND_CONTINUE,// commit and continue with a new underlying command buffer
51+
};
4752
/// Construct a MPSStream from a Stream. This construction is checked,
4853
/// and will raise an error if the Stream is not, in fact, a MPS stream.
4954
explicit MPSStream(Stream stream);
@@ -57,7 +62,10 @@ class TORCH_API MPSStream
5762
void commitAndWait();
5863
void commitAndContinue();
5964
void synchronize();
60-
65+
void copy(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer,
66+
size_t length, size_t srcOffset, size_t dstOffset, SyncType syncType = SyncType::NONE);
67+
void copy_and_sync(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer,
68+
size_t length, size_t srcOffset, size_t dstOffset, bool non_blocking);
6169
void flush();
6270
void executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results);
6371

@@ -74,7 +82,7 @@ class TORCH_API MPSStream
7482
private:
7583
Stream _stream;
7684
MTLCommandQueue_t _commandQueue = nil;
77-
MTLCommandBuffer_t _commandBuffer = nil;
85+
MPSCommandBuffer* _commandBuffer = nil;
7886
MPSGraphExecutionDescriptor *_executionDescriptor = nil;
7987
void _flush(bool commitAndWait) const;
8088

aten/src/ATen/mps/MPSStream.mm

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
namespace at {
66
namespace mps {
77

8+
#define USE_MPSCOMMANDBUFFER 1
9+
810
//-----------------------------------------------------------------
911
// MPSStream
1012
//-----------------------------------------------------------------
@@ -46,9 +48,13 @@
4648
}
4749

4850
void MPSStream::commit(bool doFlush) {
51+
#if USE_MPSCOMMANDBUFFER
52+
[commandBuffer() commitAndContinue];
53+
#else
4954
if (doFlush) {
5055
flush();
5156
}
57+
#endif
5258
}
5359

5460
void MPSStream::commitAndWait() {
@@ -81,6 +87,41 @@
8187
[_commandBuffer release];
8288
}
8389

90+
void MPSStream::copy(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer,
91+
size_t length, size_t srcOffset, size_t dstOffset, SyncType syncType) {
92+
dispatch_sync(_serialQueue, ^() {
93+
@autoreleasepool {
94+
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];
95+
96+
[blitEncoder copyFromBuffer:srcBuffer
97+
sourceOffset:(NSUInteger)srcOffset
98+
toBuffer:dstBuffer
99+
destinationOffset:(NSUInteger)dstOffset
100+
size:(NSUInteger)length];
101+
[blitEncoder endEncoding];
102+
switch(syncType) {
103+
case SyncType::NONE:
104+
// typically in GPU to GPU copies we won't commit explicitly
105+
break;
106+
case SyncType::COMMIT:
107+
commit(true);
108+
break;
109+
case SyncType::COMMIT_AND_WAIT:
110+
commitAndWait();
111+
break;
112+
case SyncType::COMMIT_AND_CONTINUE:
113+
commitAndContinue();
114+
break;
115+
}
116+
}
117+
});
118+
}
119+
120+
void MPSStream::copy_and_sync(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer, size_t length,
121+
size_t srcOffset, size_t dstOffset, bool non_blocking) {
122+
copy(srcBuffer, dstBuffer, length, srcOffset, dstOffset,
123+
!non_blocking ? SyncType::COMMIT_AND_WAIT : SyncType::COMMIT);
124+
}
84125

85126
void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results) {
86127
dispatch_sync(_serialQueue, ^() {

aten/src/ATen/native/mps/operations/BinaryOps.mm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,13 @@ void binaryOpTensor(const Tensor& self, const Tensor& other, const Scalar& alpha
9494
Placeholder selfPlaceholder;
9595
Placeholder otherPlaceholder;
9696

97-
if (is_self_scalar) {
97+
if (is_self_scalar && !self.is_mps()) {
9898
feeds[cachedGraph->primaryTensor] = getMPSGraphTensorFromScalar(mpsStream, self.item(), getMPSScalarType(self.scalar_type()));
9999
} else {
100100
selfPlaceholder = Placeholder(cachedGraph->primaryTensor, self);
101101
feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData();
102102
}
103-
if (is_other_scalar) {
103+
if (is_other_scalar && !other.is_mps()) {
104104
feeds[cachedGraph->secondaryTensor] = getMPSGraphTensorFromScalar(mpsStream, other.item(), getMPSScalarType(other.scalar_type()));
105105
} else {
106106
otherPlaceholder = Placeholder(cachedGraph->secondaryTensor, other);

aten/src/ATen/native/mps/operations/Copy.mm

Lines changed: 6 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -136,31 +136,8 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
136136
// 4 bytes alignment required on macos for blits.
137137
TORCH_CHECK(destOffset % 4 == 0, "Unaligned blit request");
138138

139-
dispatch_sync(stream->queue(), ^() {
140-
@autoreleasepool {
141-
id<MTLCommandBuffer> commandBuffer = stream->commandBuffer();
142-
id<MTLBlitCommandEncoder> blitEncoder =
143-
[commandBuffer blitCommandEncoder];
144-
145-
[blitEncoder copyFromBuffer:sourceBuffer
146-
sourceOffset:(NSUInteger)storage_byte_offset
147-
toBuffer:destBuffer
148-
destinationOffset:(NSUInteger)destOffset
149-
size:(NSUInteger)src_size];
150-
[blitEncoder endEncoding];
151-
152-
if (non_blocking) {
153-
#if USE_MPSCOMMANDBUFFER
154-
stream->commitAndContinue();
155-
#else
156-
stream->commit(true);
157-
#endif
158-
} else {
159-
stream->commitAndWait();
160-
}
161-
[destBuffer release];
162-
}
163-
});
139+
stream->copy_and_sync(sourceBuffer, destBuffer, src_size, storage_byte_offset, destOffset, non_blocking);
140+
[destBuffer release];
164141
}
165142
if (!dst.is_same(dst_)) {
166143
dst_.copy_(dst, non_blocking);
@@ -206,29 +183,7 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
206183
if (src_.is_view() || !src_.is_contiguous())
207184
sourceOffset += src_.storage_offset() * src_.itemsize();
208185

209-
dispatch_sync(stream->queue(), ^() {
210-
@autoreleasepool {
211-
id<MTLCommandBuffer> commandBuffer = stream->commandBuffer();
212-
id<MTLBlitCommandEncoder> blitEncoder =
213-
[commandBuffer blitCommandEncoder];
214-
215-
[blitEncoder copyFromBuffer:sourceBuffer
216-
sourceOffset:(NSUInteger)sourceOffset
217-
toBuffer:destBuffer
218-
destinationOffset:(NSUInteger)dst_byte_offset
219-
size:(NSUInteger)size];
220-
[blitEncoder endEncoding];
221-
if (non_blocking) {
222-
#if USE_MPSCOMMANDBUFFER
223-
stream->commitAndContinue();
224-
#else
225-
stream->commit(true);
226-
#endif
227-
} else {
228-
stream->commitAndWait();
229-
}
230-
}
231-
});
186+
stream->copy_and_sync(sourceBuffer, destBuffer, size, sourceOffset, dst_byte_offset, non_blocking);
232187
[sourceBuffer release];
233188
}
234189

@@ -237,23 +192,7 @@ void copy_cast_mps(at::Tensor& dst, const at::Tensor& src,
237192

238193
void copy_blit_mps(void* dst, const void* src, size_t size) {
239194
MPSStream* stream = getCurrentMPSStream();
240-
id<MTLBuffer> sourceBuffer = (id<MTLBuffer>)(src);
241-
id<MTLBuffer> destBuffer = (id<MTLBuffer>)(dst);
242-
dispatch_sync(stream->queue(), ^() {
243-
@autoreleasepool {
244-
id<MTLCommandBuffer> commandBuffer = stream->commandBuffer();
245-
id<MTLBlitCommandEncoder> blitEncoder =
246-
[commandBuffer blitCommandEncoder];
247-
248-
[blitEncoder copyFromBuffer:sourceBuffer
249-
sourceOffset:0
250-
toBuffer:destBuffer
251-
destinationOffset:0
252-
size:size];
253-
[blitEncoder endEncoding];
254-
stream->commitAndWait();
255-
}
256-
});
195+
stream->copy_and_sync((id<MTLBuffer>)(src), (id<MTLBuffer>)(dst), size, 0, 0, true);
257196
}
258197

259198
static at::Tensor& copy_kernel_mps(at::Tensor& dst_, const at::Tensor& src_, bool non_blocking)
@@ -294,27 +233,10 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
294233
id<MTLBuffer> destBuffer = getMTLBufferStorage(dst_);
295234
id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
296235
const size_t src_size = src.nbytes();
297-
298236
if (src.dtype() == dst_.dtype()) {
299237
MPSStream* stream = getCurrentMPSStream();
300-
dispatch_sync(stream->queue(), ^() {
301-
@autoreleasepool {
302-
id<MTLCommandBuffer> commandBuffer = stream->commandBuffer();
303-
id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer blitCommandEncoder];
304-
[blitEncoder copyFromBuffer:sourceBuffer
305-
sourceOffset:src_byte_offset
306-
toBuffer:destBuffer
307-
destinationOffset:dst_byte_offset
308-
size:src_size];
309-
[blitEncoder endEncoding];
310-
// GPU to GPU copy needs flushing only, and no synchronization with CPU is necessary
311-
#if USE_MPSCOMMANDBUFFER
312-
stream->commitAndContinue();
313-
#else
314-
stream->commit(true);
315-
#endif
316-
}
317-
});
238+
// for GPU to GPU copies we only encode to stream's command buffer (no flushing)
239+
stream->copy(sourceBuffer, destBuffer, src_size, src_byte_offset, dst_byte_offset);
318240
} else {
319241
copy_cast_mps(dst_, src, destBuffer, sourceBuffer);
320242
}

0 commit comments

Comments (0)