Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix high memory consumption in view ops (#81610) #71

Merged
merged 1 commit into from
Jul 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aten/src/ATen/mps/MPSStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class TORCH_API MPSStream
void copy_and_sync(id<MTLBuffer> srcBuffer, id<MTLBuffer> dstBuffer,
size_t length, size_t srcOffset, size_t dstOffset, bool non_blocking);
void flush();
void executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results);
void executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results, SyncType syncType = SyncType::NONE);

/// Get the MPS device index that this stream is associated with.
c10::DeviceIndex device_index() const { return _stream.device_index(); }
Expand Down
6 changes: 5 additions & 1 deletion aten/src/ATen/mps/MPSStream.mm
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
}

void MPSStream::synchronize(SyncType syncType) {
if (!_commandBuffer)
return;
switch(syncType) {
case SyncType::NONE:
// typically in GPU to GPU copies we won't commit explicitly
Expand Down Expand Up @@ -134,14 +136,16 @@
!non_blocking ? SyncType::COMMIT_AND_WAIT : SyncType::COMMIT);
}

void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results) {
void MPSStream::executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results, SyncType syncType) {
dispatch_sync(_serialQueue, ^() {
#if USE_MPSCOMMANDBUFFER
[mpsGraph encodeToCommandBuffer:commandBuffer()
feeds:feeds
targetOperations:nil
resultsDictionary:results
executionDescriptor:_executionDescriptor];
// mostly the syncType is NONE, but in some cases we may want to sync and wait (e.g., gatherViewTensor)
synchronize(syncType);
#else
commit(true);
[mpsGraph runAsyncWithMTLCommandQueue:_commandQueue
Expand Down
14 changes: 9 additions & 5 deletions aten/src/ATen/native/mps/operations/View.mm
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
}

// initializes the MTLBuffers for tensor data and runs the MPSGraph for the view op
static Tensor& runViewGraph(ViewCachedGraph* cachedGraph, const at::Tensor& src, Tensor& output, bool needsScatter)
static Tensor& runViewGraph(ViewCachedGraph* cachedGraph, const at::Tensor& src, Tensor& output,
bool needsScatter, bool requires_sync = false)
{
const id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
const id<MTLBuffer> outputBuffer = getMTLBufferStorage(output);
Expand Down Expand Up @@ -71,7 +72,8 @@
NSDictionary<MPSGraphTensor*, MPSGraphTensorData*>* results = @{
cachedGraph->outputTensor : outputTensorData
};
runMPSGraph(stream, cachedGraph->graph(), feeds, results);
stream->executeMPSGraph(cachedGraph->graph(), feeds, results,
requires_sync ? SyncType::COMMIT_AND_WAIT : SyncType::NONE);
}
return output;
}
Expand Down Expand Up @@ -225,11 +227,13 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst)
return Tensor();
}

bool requires_sync = false;
Tensor output;
if (!dst.has_storage())
if (!dst.has_storage()) {
output = at::native::empty_mps(src.sizes(), src.scalar_type(), c10::nullopt, kMPS);

return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false);
requires_sync = true;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here the Tensor we allocate (output) would get freed once it gets out of this function scope - does commitAndContinue retain the resource (the MTLBuffer of the output Tensor)?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Returning from runMPSGraph() doesn't mean we have committed anything. The commitAndContinue mode keeps accumulating ops in the commandBuffer (and retains references to their buffers), then commits them based on a heuristic to increase GPU utilization.

}
return runViewGraph(cachedGraph, src, dst.has_storage() ? dst : output, /*needsScatter*/ false, requires_sync);
}

Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output)
Expand Down