Perf. improvement - save the gather result into dst directly if dst is contiguous (copy_kernel_mps) #44

Merged · 2 commits · Jun 28, 2022 · showing changes from 1 commit
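The change targets copies on the MPS backend where the source is a non-contiguous view and the destination is contiguous with no storage offset: instead of gathering the view into a temporary buffer and then blitting it into the destination, the gather result is written into the destination directly. A minimal Python-level sketch of the kind of copy that can take the new path (illustrative only, not part of the PR; assumes a PyTorch build with MPS support on Apple-silicon hardware):

import torch

# Non-contiguous source: a transposed view of an MPS tensor.
src = torch.arange(16, dtype=torch.float32, device="mps").reshape(4, 4).t()

# Contiguous destination with storage_offset() == 0 -- the case the fast path covers.
dst = torch.empty(4, 4, dtype=torch.float32, device="mps")

# copy_kernel_mps() can gather the strided view directly into dst,
# skipping the extra blit from a temporary gather buffer.
dst.copy_(src)

assert torch.equal(dst.cpu(), src.cpu())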
1 change: 1 addition & 0 deletions aten/src/ATen/native/mps/OperationUtils.h
@@ -52,6 +52,7 @@ double getMPSScalarValue(const Tensor& t);
 std::string getArrayRefString(const IntArrayRef s);
 // use has_storage() on the returned tensor to determine if src actually is a view
 Tensor gatherViewTensor(const at::Tensor& src);
+Tensor gatherViewTensorWithOutput(const at::Tensor& src, at::Tensor& dst);
 Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output);
 
 MPSShape* getMPSShape(const Tensor& t);
16 changes: 14 additions & 2 deletions aten/src/ATen/native/mps/operations/Copy.mm
@@ -250,10 +250,23 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
 static at::Tensor& copy_kernel_mps(at::Tensor& dst_, const at::Tensor& src_, bool non_blocking)
 {
   auto src_byte_offset = src_.storage_offset() * src_.itemsize();
+  auto dst_byte_offset = dst_.storage_offset() * dst_.itemsize();
+
+  // If dst is contiguous and there is no byte offset, we can save the result of the
+  // gather directly into dst. This reduces the overhead of doing an additional blit in most cases.
+  bool returnGatherOutput = (dst_.is_contiguous() && !dst_byte_offset);
   Tensor src;
+
   if (!src_.is_contiguous()) {
-    src = gatherViewTensor(src_);
+    src = returnGatherOutput ?
+          gatherViewTensorWithOutput(src_, dst_) :
+          gatherViewTensor(src_);
+
     if (src.has_storage()) {
+
+      if (returnGatherOutput)
+        return dst_;
+
       src_byte_offset = 0;
     } else {
       src = src_.expand_as(dst_).contiguous();
@@ -271,7 +284,6 @@ void copy_blit_mps(void* dst, const void* src, size_t size) {
   src._set_conj(src_.is_conj());
   src._set_neg(src_.is_neg());
 
-  auto dst_byte_offset = dst_.storage_offset() * dst_.itemsize();
   id<MTLBuffer> destBuffer = getMTLBufferStorage(dst_);
   id<MTLBuffer> sourceBuffer = getMTLBufferStorage(src);
   const size_t src_size = src.nbytes();
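The fast path above is gated on dst_.is_contiguous() && !dst_byte_offset. A hedged Python-level illustration of the two cases (which kernel path is actually taken is an internal detail and is not observable from Python; the snippet only shows the destination layouts involved):

import torch

x = torch.arange(16, dtype=torch.float32, device="mps").reshape(4, 4)
src = x.t()                                   # non-contiguous source view

# Case 1: contiguous dst at storage_offset() == 0 -> returnGatherOutput is true,
# so the gather can write its result straight into dst.
dst_fast = torch.empty(4, 4, dtype=torch.float32, device="mps")
dst_fast.copy_(src)

# Case 2: dst is contiguous but starts at a non-zero offset inside a larger buffer
# -> returnGatherOutput is false and the existing gather-then-blit path is used.
base = torch.empty(32, dtype=torch.float32, device="mps")
dst_offset = base[16:].view(4, 4)             # storage_offset() == 16
dst_offset.copy_(src)

assert torch.equal(dst_fast.cpu(), dst_offset.cpu())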
20 changes: 18 additions & 2 deletions aten/src/ATen/native/mps/operations/View.mm
@@ -209,15 +209,21 @@
   }
 }
 
-Tensor gatherViewTensor(const at::Tensor& src)
-{
+static ViewCachedGraph* _getCachedGraph(const at::Tensor& src) {
   ViewCachedGraph* cachedGraph = nullptr;
 
   const IntArrayRef& base_shape = get_buffer_shape(src.storage().data());
   if (base_shape.size() > 0) {
     string key = getStridedKey(src.scalar_type(), base_shape, src.sizes(), /*is_scatter*/ false);
     cachedGraph = static_cast<ViewCachedGraph *>(MPSGraphCache::getInstance()->LookUp(key));
   }
+
+  return cachedGraph;
+}
+
+Tensor gatherViewTensor(const at::Tensor& src)
+{
+  ViewCachedGraph* cachedGraph = _getCachedGraph(src);
   // there are cases where gatherViewTensor() is called without having as_strided() called beforehand.
   // this typically may come from copy_mps variants. In such cases, when the base_shape isn't found the
   // callers would resort to make the tensor contiguous in an alternative code path.
@@ -229,6 +235,16 @@ Tensor gatherViewTensor(const at::Tensor& src)
   return runViewGraph(cachedGraph, src, output, /*needsScatter*/ false);
 }
 
+Tensor gatherViewTensorWithOutput(const at::Tensor& src, at::Tensor& dst)
+{
+  ViewCachedGraph* cachedGraph = _getCachedGraph(src);
+  if (!cachedGraph) {
+    return Tensor();
+  }
+
+  return runViewGraph(cachedGraph, src, dst, /*needsScatter*/ false);
+}
+
 Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output)
 {
   ViewCachedGraph* cachedGraph = createViewGraph(output, output.sizes(), output.strides(),
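gatherViewTensorWithOutput() keeps the existing contract: when no cached view graph is found for the source it returns an empty Tensor (has_storage() is false), and copy_kernel_mps() falls back to materializing the source with src_.expand_as(dst_).contiguous(). A rough Python-level example of a copy that must still work through that fallback (whether the view-graph cache is hit here is an internal detail and may vary between versions):

import torch

# A broadcasted view is non-contiguous; if the gather graph cache has no entry for it,
# the copy goes through the contiguous() fallback inside copy_kernel_mps().
src = torch.randn(1, 4, device="mps").expand(4, 4)
dst = torch.empty(4, 4, device="mps")
dst.copy_(src)

assert torch.equal(dst.cpu(), src.cpu())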
2 changes: 1 addition & 1 deletion test/test_mps.py
@@ -4641,7 +4641,7 @@ def test_slicing_with_step(self):
         x_mps = torch.zeros(10, dtype=torch.float32, device="mps")
         x_mps[::2] = 1.0
 
-        x_cpu = torch.zeros(10, dtype=torch.float32, device="mps")
+        x_cpu = torch.zeros(10, dtype=torch.float32, device="cpu")
         x_cpu[::2] = 1.0
 
         self.assertEqual(x_cpu, x_mps)