kulinseth
diff --git a/‎aten/src/ATen/mps/IndexKernels.h‎
Lines changed: 97 additions & 54 deletions b/‎aten/src/ATen/mps/IndexKernels.h‎
Lines changed: 97 additions & 54 deletions
diff --git a/‎aten/src/ATen/native/mps/OperationUtils.h‎
Lines changed: 2 additions & 1 deletion b/‎aten/src/ATen/native/mps/OperationUtils.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎aten/src/ATen/native/mps/OperationUtils.mm‎
Lines changed: 21 additions & 0 deletions b/‎aten/src/ATen/native/mps/OperationUtils.mm‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎aten/src/ATen/native/mps/operations/BinaryKernel.h‎
Lines changed: 14 additions & 0 deletions b/‎aten/src/ATen/native/mps/operations/BinaryKernel.h‎
Lines changed: 14 additions & 0 deletions
@@ -3,7 +3,24 @@
 namespace at {
 namespace mps {
 
-static const char * indexing_metal_shaders = R"INDEX_METAL(
+// GET_IDX_TEMPLATE expands to adjacent string literals that concatenate into
+// the source text of a shader helper named `get_idx`. The literals contain no
+// newline characters, so the helper occupies a single line of shader source.
+//
+// get_idx walks dimensions 0 .. num_dimensions-1 (dimension 0 varies fastest):
+// for each one it takes `idx % iter_shape[dim]`, divides `idx` by
+// `iter_shape[dim]`, and accumulates `remainder * strides[dim]` into a uint3.
+// The result is the flat thread id turned into three offsets, one per
+// component of `strides[dim]`.
+//
+// NOTE(review): braces are doubled ("{{" / "}}"), presumably so the same text
+// can be passed through a brace-escaping formatter elsewhere -- confirm. When
+// the macro is concatenated without formatting, the doubled braces stay in the
+// text and parse as nested blocks.
+#define GET_IDX_TEMPLATE                                     \
+"static inline uint3 get_idx(                              " \
+"  uint tid,                                               " \
+"  constant uint * iter_shape,                             " \
+"  const uint num_dimensions,                              " \
+"  constant packed_uint3 * strides) {{                     " \
+"  uint3 data_offsets = 0;                                 " \
+"  uint32_t idx = tid;                                     " \
+"  for (uint32_t dim = 0; dim < num_dimensions; dim++) {{  " \
+"      uint32_t remainder = idx % iter_shape[dim];         " \
+"      idx /= iter_shape[dim];                             " \
+"      data_offsets += remainder * strides[dim];           " \
+"  }}                                                      " \
+"  return data_offsets;                                    " \
+"}}"
+
+static const char * indexing_metal_shaders = GET_IDX_TEMPLATE
+R"INDEX_METAL(
 #include <metal_stdlib>
 #include <metal_atomic>
 
@@ -18,7 +35,6 @@ struct IndexAB {
 struct IndexAB {
     constant int64_t* indexArray;
 };
-
 #endif
 
 template<typename T>
@@ -30,11 +46,17 @@ kernel void index_select(
 #endif
     constant void     * indexSizes        [[buffer(1)]],
     constant void     * indexStrides      [[buffer(2)]],
-    constant uint3    * offsets           [[buffer(3)]],
     constant void     * inputData         [[buffer(4)]],
     device   void     * outputData        [[buffer(5)]],
     constant uint32_t & num_indices       [[buffer(6)]],
+    constant uint     * iter_shape        [[buffer(7)]],
+    constant uint     & num_dimensions    [[buffer(8)]],
+    constant packed_uint3 * strides   [[buffer(9)]],
+
     uint thread_index [[thread_position_in_grid]]) {
+
+    uint3 offsets = get_idx(thread_index, iter_shape, num_dimensions, strides);
+
     constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
     constant int64_t * index_strides = (constant int64_t *)indexStrides;
     int64_t offset = 0;
@@ -44,14 +66,14 @@ kernel void index_select(
 #else
         constant int64_t* indexArray = (constant int64_t*)indexAB.indexArray[i];
 #endif
-        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)];
+        int64_t index = indexArray[offsets.z / sizeof(int64_t)];
         if (index < 0) {
             index += index_sizes[i];
         }
         offset += index * index_strides[i];
      }
-    device T * out = (device T*)((device char*)outputData + offsets[thread_index].x);
-    constant T * in  = (constant T*)((constant char*)inputData  + offsets[thread_index].y + offset);
+    device T * out = (device T*)((device char*)outputData + offsets.x);
+    constant T * in  = (constant T*)((constant char*)inputData  + offsets.y + offset);
     *out = *in;
 }
 
@@ -64,12 +86,19 @@ kernel void index_put(
 #endif
     constant void     * indexSizes        [[buffer(1)]],
     constant void     * indexStrides      [[buffer(2)]],
-    constant uint3    * offsets           [[buffer(3)]],
     constant void     * inputData         [[buffer(4)]],
     device   void     * outputData        [[buffer(5)]],
     constant uint32_t & num_indices       [[buffer(6)]],
+
+    constant uint  * iter_shape       [[buffer(7)]],
+    constant uint & num_dimensions    [[buffer(8)]],
+    constant packed_uint3 * strides   [[buffer(9)]],
+
     uint thread_index [[thread_position_in_grid]]) {
 
+    uint3 offsets = get_idx(thread_index, iter_shape, num_dimensions, strides);
+
+
     constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
     constant int64_t * index_strides = (constant int64_t *)indexStrides;
     int64_t offset = 0;
@@ -79,15 +108,15 @@ kernel void index_put(
 #else
         constant int64_t* indexArray = (constant int64_t*)indexAB.indexArray[i];
 #endif
-        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)];
+        int64_t index = indexArray[offsets.z / sizeof(int64_t)];
 
         if (index < 0) {
             index += index_sizes[i];
         }
         offset += index * index_strides[i];
      }
-    device T * out = (device T*)((device char*)outputData + offsets[thread_index].x + offset);
-    constant T * in  = (constant T*)((constant char*)inputData  + offsets[thread_index].y);
+    device T * out = (device T*)((device char*)outputData + offsets.x + offset);
+    constant T * in  = (constant T*)((constant char*)inputData  + offsets.y);
     *out = *in;
 }
 
@@ -96,26 +125,30 @@ kernel void index_put(
 template                                                        \
 [[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE)]]          \
 kernel void index_ ## INDEX_OP_TYPE<DTYPE>(                     \
-    constant IndexAB & indexAB           [[buffer(0)]],         \
-    constant void    * indexSizes        [[buffer(1)]],         \
-    constant void    * indexStrides      [[buffer(2)]],         \
-    constant uint3   * offsets           [[buffer(3)]],         \
-    constant void    * inputData         [[buffer(4)]],         \
-    device   void    * outputData        [[buffer(5)]],         \
-    constant uint32_t & num_indices      [[buffer(6)]],         \
+    constant IndexAB  & indexAB           [[buffer(0)]],        \
+    constant void     * indexSizes        [[buffer(1)]],        \
+    constant void     * indexStrides      [[buffer(2)]],        \
+    constant void     * inputData         [[buffer(4)]],        \
+    device   void     * outputData        [[buffer(5)]],        \
+    constant uint32_t & num_indices       [[buffer(6)]],        \
+    constant uint     * iter_shape        [[buffer(7)]],        \
+    constant uint     & num_dimensions    [[buffer(8)]],        \
+    constant packed_uint3 * strides       [[buffer(9)]],        \
     uint thread_index [[thread_position_in_grid]]);
 #else
 #define REGISTER_INDEX_OP(DTYPE_SIZE, DTYPE, INDEX_OP_TYPE)     \
 template                                                        \
 [[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE)]]          \
 kernel void index_ ## INDEX_OP_TYPE<DTYPE>(                     \
-    constant IndexAB * indexAB           [[buffer(0)]],         \
-    constant void    * indexSizes        [[buffer(1)]],         \
-    constant void    * indexStrides      [[buffer(2)]],         \
-    constant uint3   * offsets           [[buffer(3)]],         \
-    constant void    * inputData         [[buffer(4)]],         \
-    device   void    * outputData        [[buffer(5)]],         \
+    constant IndexAB  * indexAB          [[buffer(0)]],         \
+    constant void     * indexSizes       [[buffer(1)]],         \
+    constant void     * indexStrides     [[buffer(2)]],         \
+    constant void     * inputData        [[buffer(4)]],         \
+    device   void     * outputData       [[buffer(5)]],         \
     constant uint32_t & num_indices      [[buffer(6)]],         \
+    constant uint     * iter_shape       [[buffer(7)]],         \
+    constant uint     & num_dimensions   [[buffer(8)]],         \
+    constant packed_uint3 * strides      [[buffer(9)]],         \
     uint thread_index [[thread_position_in_grid]]);
 #endif
 
@@ -147,17 +180,20 @@ kernel void kernel_index_offsets(constant packed_uint3 * strides         [[buffe
 template<typename T, typename E>
 kernel void index_put_accumulate_native_dtypes(
 #if __METAL_VERSION__ >= 300
-    constant IndexAB  * indexAB     [[buffer(0)]],
+    constant IndexAB  * indexAB        [[buffer(0)]],
 #else
-    constant IndexAB  & indexAB     [[buffer(0)]],
+    constant IndexAB  & indexAB        [[buffer(0)]],
 #endif
-    constant void    * indexSizes   [[buffer(1)]],
-    constant void    * indexStrides [[buffer(2)]],
-    constant uint3   * offsets      [[buffer(3)]],
-    constant void    * inputData    [[buffer(4)]],
-    device void      * outputData   [[buffer(5)]],
-    constant uint32_t& num_indices  [[buffer(6)]],
+    constant void     * indexSizes     [[buffer(1)]],
+    constant void     * indexStrides   [[buffer(2)]],
+    constant void     * inputData      [[buffer(4)]],
+    device void       * outputData     [[buffer(5)]],
+    constant uint32_t & num_indices    [[buffer(6)]],
+    constant uint     * iter_shape     [[buffer(7)]],
+    constant uint     & num_dimensions [[buffer(8)]],
+    constant packed_uint3 * strides    [[buffer(9)]],
     uint thread_index [[thread_position_in_grid]]) {
+    uint3 offsets = get_idx(thread_index, iter_shape, num_dimensions, strides);
     constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
     constant int64_t * index_strides = (constant int64_t *)indexStrides;
     int64_t offset = 0;
@@ -167,14 +203,14 @@ kernel void index_put_accumulate_native_dtypes(
 #else
         constant int64_t* indexArray = (constant int64_t*)indexAB.indexArray[i];
 #endif
-        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)];
+        int64_t index = indexArray[offsets.z / sizeof(int64_t)];
         if (index < 0) {
             index += index_sizes[i];
         }
         offset += index * index_strides[i];
     }
-    device T * out = (device T*)((device char*)outputData + offsets[thread_index].x + offset);
-    constant E * in  = (constant E*)((constant char*)inputData  + offsets[thread_index].y);
+    device T * out = (device T*)((device char*)outputData + offsets.x + offset);
+    constant E * in  = (constant E*)((constant char*)inputData  + offsets.y);
     atomic_fetch_add_explicit(out, *in, memory_order_relaxed);
 }
 
@@ -191,17 +227,20 @@ __attribute__((__always_inline__)) void atomic_fetch_add_relaxed(device void * a
 template<typename T>
 kernel void atomic_index_put_accumulate(
 #if __METAL_VERSION__ >= 300
-    constant IndexAB * indexAB           [[buffer(0)]],
+    constant IndexAB  * indexAB           [[buffer(0)]],
 #else
-    constant IndexAB & indexAB           [[buffer(0)]],
+    constant IndexAB  & indexAB           [[buffer(0)]],
 #endif
-    constant void    * indexSizes        [[buffer(1)]],
-    constant void    * indexStrides      [[buffer(2)]],
-    constant uint3   * offsets           [[buffer(3)]],
-    constant void    * inputData         [[buffer(4)]],
-    device   void    * outputData        [[buffer(5)]],
-    constant uint32_t& num_indices       [[buffer(6)]],
+    constant void     * indexSizes        [[buffer(1)]],
+    constant void     * indexStrides      [[buffer(2)]],
+    constant void     * inputData         [[buffer(4)]],
+    device   void     * outputData        [[buffer(5)]],
+    constant uint32_t & num_indices       [[buffer(6)]],
+    constant uint     * iter_shape        [[buffer(7)]],
+    constant uint     & num_dimensions    [[buffer(8)]],
+    constant packed_uint3 * strides       [[buffer(9)]],
     uint thread_index [[thread_position_in_grid]]) {
+    uint3 offsets = get_idx(thread_index, iter_shape, num_dimensions, strides);
     constant int64_t * index_sizes   = (constant int64_t *)indexSizes;
     constant int64_t * index_strides = (constant int64_t *)indexStrides;
     int64_t offset = 0;
@@ -211,14 +250,14 @@ kernel void atomic_index_put_accumulate(
 #else
         constant int64_t* indexArray = (constant int64_t*)indexAB.indexArray[i];
 #endif
-        int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)];
+        int64_t index = indexArray[offsets.z / sizeof(int64_t)];
         if (index < 0) {
             index += index_sizes[i];
         }
         offset += index * index_strides[i];
     }
-    device void * out = (device void*)((device char*)outputData + offsets[thread_index].x + offset);
-    constant T  * in  = (constant T*)((constant char*)inputData + offsets[thread_index].y);
+    device void * out = (device void*)((device char*)outputData + offsets.x + offset);
+    constant T  * in  = (constant T*)((constant char*)inputData + offsets.y);
     atomic_fetch_add_relaxed<T>(out, *in);
 }
 
@@ -232,26 +271,30 @@ kernel void atomic_index_put_accumulate<float>(
 #endif
     constant void    * indexSizes   [[buffer(1)]],
     constant void    * indexStrides [[buffer(2)]],
-    constant uint3   * offsets      [[buffer(3)]],
     constant void    * inputData    [[buffer(4)]],
     device   void    * outputData   [[buffer(5)]],
     constant uint32_t& num_indices  [[buffer(6)]],
+    constant uint  * iter_shape     [[buffer(7)]],
+    constant uint & num_dimensions  [[buffer(8)]],
+    constant packed_uint3 * strides [[buffer(9)]],
     uint thread_index [[thread_position_in_grid]]);
 
 template
 [[host_name("index_put_accumulate_32bit_int")]]
 kernel void index_put_accumulate_native_dtypes<atomic_int, int>(
 #if __METAL_VERSION__ >= 300
-    constant IndexAB  * indexAB     [[buffer(0)]],
+    constant IndexAB  * indexAB       [[buffer(0)]],
 #else
-    constant IndexAB  & indexAB     [[buffer(0)]],
+    constant IndexAB  & indexAB       [[buffer(0)]],
 #endif
-    constant void    * indexSizes   [[buffer(1)]],
-    constant void    * indexStrides [[buffer(2)]],
-    constant uint3   * offsets      [[buffer(3)]],
-    constant void    * inputData    [[buffer(4)]],
-    device   void    * outputData   [[buffer(5)]],
-    constant uint32_t& num_indices [[buffer(6)]],
+    constant void    * indexSizes     [[buffer(1)]],
+    constant void    * indexStrides   [[buffer(2)]],
+    constant void    * inputData      [[buffer(4)]],
+    device   void    * outputData     [[buffer(5)]],
+    constant uint32_t& num_indices    [[buffer(6)]],
+    constant uint    * iter_shape     [[buffer(7)]],
+    constant uint    & num_dimensions [[buffer(8)]],
+    constant packed_uint3 * strides   [[buffer(9)]],
     uint thread_index [[thread_position_in_grid]]);
 )INDEX_METAL";
 
 
@@ -61,6 +61,8 @@ NSArray<NSNumber*>* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArray
 std::string getMPSShapeString(MPSShape* shape);
 std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false);
 std::string getArrayRefString(const IntArrayRef s);
+// Look up the Metal type-name string for a scalar type (or for a tensor's scalar type).
+const std::string& getMetalScalarType(const Tensor& t);
+const std::string& getMetalScalarType(const c10::ScalarType& scalar_type);
 // use has_storage() on the returned tensor to determine if src actually is a view
 Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst);
 Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output);
@@ -223,7 +225,6 @@ struct MPSGraphCache
   }
 
   MPSCachedGraph* LookUp(const std::string& key) const {
-
     __block MPSCachedGraph* cachedGraph = nullptr;
 
     MPSCacheKey hash = std::hash<std::string>{}(key);
 
@@ -223,6 +223,27 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) {
   return ss.str();
 }
 
+// Returns the Metal type name ("float", "half", ...) registered for
+// `scalar_type`. The returned reference points into a function-local static
+// table, so it remains valid after the call returns. Raises through
+// TORCH_CHECK when the scalar type has no entry in the table.
+const std::string& getMetalScalarType(const c10::ScalarType& scalar_type) {
+  // Built once on first call and never mutated afterwards, hence const.
+  static const std::unordered_map<c10::ScalarType, std::string> scalarToMetalType = {
+    {c10::ScalarType::Float, "float"},
+    {c10::ScalarType::Half,  "half"},
+    {c10::ScalarType::Long,  "long"},
+    {c10::ScalarType::Int,   "int"},
+    {c10::ScalarType::Short, "short"},
+    {c10::ScalarType::Char,  "char"},
+    {c10::ScalarType::Byte,  "uchar"},
+    {c10::ScalarType::Bool,  "bool"},
+  };
+
+  const auto it = scalarToMetalType.find(scalar_type);
+  // The lookup is keyed by scalar type (not by byte size), so report it as such.
+  TORCH_CHECK(it != scalarToMetalType.end(), "Unsupported scalar type: ", scalar_type);
+  return it->second;
+}
+
+// Tensor convenience overload: reads the tensor's scalar type and defers to
+// the ScalarType overload for the actual lookup.
+const std::string& getMetalScalarType(const Tensor& t) {
+  const c10::ScalarType dtype = t.scalar_type();
+  return getMetalScalarType(dtype);
+}
+
 std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) {
     std::string str;
     // The key format per tensor would look like ":Float32[1,1,1,10]:"
 
@@ -0,0 +1,14 @@
+//  Copyright © 2023 Apple Inc.
+#pragma once
+
+#include <string>
+
+// NOTE(review): `Tensor` and `Scalar` are not declared by this header; it
+// relies on the including translation unit to have made them visible first.
+
+namespace at {
+namespace native {
+namespace mps {
+
+// Declaration only; the definition is not part of this header.
+// NOTE(review): parameter roles (two operands, an output tensor, a scalar
+// `alpha`, and an operation name) and the meaning of the bool result are
+// inferred from the names alone -- confirm against the definition.
+bool dispatchNativeBinaryKernel(const Tensor& self,
+                                const Tensor& other,
+                                const Tensor& output,
+                                const Scalar& alpha,
+                                const std::string& op_name);
+
+} // namespace mps
+} // namespace native
+} // namespace at