kulinseth · DenisVieriu97 · Jul 11, 2022 · Jul 8, 2022 · Jul 8, 2022 · Jul 8, 2022
diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp
@@ -122,6 +122,12 @@ void* DispatchStubImpl::get_call_ptr(
  TORCH_INTERNAL_ASSERT(hip_dispatch_ptr, "DispatchStub: missing HIP kernel");
  return hip_dispatch_ptr;
 
+#if defined(USE_MPS)
+ case DeviceType::MPS:
+ TORCH_INTERNAL_ASSERT(mps_dispatch_ptr, "DispatchStub: missing MPS kernel");
+ return mps_dispatch_ptr;
+#endif
+
  default:
  AT_ERROR("DispatchStub: unsupported device type", device_type);
  }

diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h
@@ -118,6 +118,7 @@ struct TORCH_API DispatchStubImpl {
  std::atomic<void*> cpu_dispatch_ptr{nullptr};
  void* cuda_dispatch_ptr = nullptr;
  void* hip_dispatch_ptr = nullptr;
+ void* mps_dispatch_ptr = nullptr;
  #endif
 };
 
@@ -165,6 +166,10 @@ struct DispatchStub<rT (*)(Args...), T> {
  impl.hip_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
  }
 
+ void set_mps_dispatch_ptr(FnPtr fn_ptr) {
+ impl.mps_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
+ }
+
  static TORCH_API FnPtr DEFAULT;
 #ifdef HAVE_AVX512_CPU_DEFINITION
  static TORCH_API FnPtr AVX512;
@@ -190,6 +195,13 @@ struct RegisterCUDADispatch {
  }
 };
 
+template <typename DispatchStub>
+struct RegisterMPSDispatch {
+ RegisterMPSDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
+ stub.set_mps_dispatch_ptr(value);
+ }
+};
+
 template <typename DispatchStub>
 struct RegisterHIPDispatch {
  RegisterHIPDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
@@ -259,6 +271,9 @@ struct RegisterHIPDispatch {
 #define REGISTER_HIP_DISPATCH(name, fn) \
  static RegisterHIPDispatch<struct name> name ## __register(name, fn);
 
+#define REGISTER_MPS_DISPATCH(name, fn) \
+ static RegisterMPSDispatch<struct name> name ## __register(name, fn);
+
 // NB: This macro must be used in an actual 'cu' file; if you try using
 // it from a 'cpp' file it will not work!
 #if defined(__CUDACC__)
@@ -268,6 +283,9 @@ struct RegisterHIPDispatch {
 // is HIP in the PyTorch HIPify build.
 #define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn)
 // #define REGISTER_DISPATCH(name, fn) REGISTER_HIP_DISPATCH(name, fn)
+#elif defined(__OBJC__) && defined(USE_MPS)
+// NB: this macro must be used from a 'mm' file in order to dispatch a MPS kernel
+#define REGISTER_DISPATCH(name, fn) REGISTER_MPS_DISPATCH(name, fn)
 #elif defined(CPU_CAPABILITY)
 #define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
 #define REGISTER_NO_AVX512_DISPATCH(name) \

diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp b/aten/src/ATen/native/TensorAdvancedIndexing.cpp
@@ -521,9 +521,9 @@ AdvancedIndex::AdvancedIndex(const Tensor& src, TensorList indices_list)
  }
  }
 
- // For CUDA tensors, force all index tensors to have the same striding to
- // simplify the CUDA kernel.
- if (indices.size() >= 2 && this->src.device().type() == kCUDA) {
+ // For CUDA/MPS tensors, force all index tensors to have the same striding to
+ // simplify the CUDA/MPS kernel.
+ if (indices.size() >= 2 && (this->src.device().type() == kCUDA || this->src.device().type() == kMPS)) {
  if (!all_strides_match(indices)) {
  for (auto & indice : indices) {
  indice = indice.contiguous();

diff --git a/aten/src/ATen/native/mps/IndexKernels.mm b/aten/src/ATen/native/mps/IndexKernels.mm
@@ -0,0 +1,100 @@
+const char * index_select_kernel_mps =
+"#include <metal_stdlib>\n"
+"using namespace metal;\n"
+"\n"
+"constant int64_t storage_offset [[function_constant(0)]];\n"
+"constant uint32_t num_indices [[function_constant(1)]];\n"
+"\n"
+"struct IndexAB {\n"
+" // Allow up to 30 indices\n"
+" metal::array<device void *, 30> indexArray [[ id(0) ]];\n"
+"};\n"
+"\n"
+"template<typename T>\n"
+"kernel void index_select(device const IndexAB & indexAB [[buffer(0)]],\n"
+" device const void * indexSizes [[buffer(1)]],\n"
+" device const void * indexStrides [[buffer(2)]],\n"
+" device const uint3 * offsets [[buffer(3)]],\n"
+" device const void * inputData [[buffer(4)]],\n"
+" device void * outputData [[buffer(5)]],\n"
+" uint thread_index [[thread_position_in_grid]]) {\n"
+"\n"
+" device const int64_t * index_sizes = (device const int64_t *)indexSizes;\n"
+" device const int64_t * index_strides = (device const int64_t *)indexStrides;\n"
+" int64_t offset = 0;\n"
+" for (uint32_t i = 0; i < num_indices; i++) {\n"
+" int64_t index = ((device const int64_t*)(indexAB.indexArray[i]))[offsets[thread_index].z / sizeof(int64_t)];\n"
+" if (index < 0) {\n"
+" index += index_sizes[i];\n"
+" }\n"
+" offset += index * index_strides[i];\n"
+" }\n"
+" device T * out = (device T*)((device char*)outputData + offsets[thread_index].x);\n"
+" device T * in = (device T*)((device char*)inputData + offsets[thread_index].y + offset + storage_offset * sizeof(T));\n"
+" *out = *in;\n"
+"}\n"
+"\n"
+"template\n"
+"[[host_name(\"index_select_float\")]]\n"
+"kernel void index_select<float>(device const IndexAB & indexAB [[buffer(0)]],\n"
+" device const void * indexSizes [[buffer(1)]],\n"
+" device const void * indexStrides [[buffer(2)]],\n"
+" device const uint3 * offsets [[buffer(3)]],\n"
+" device const void * inputData [[buffer(4)]],\n"
+" device void * outputData [[buffer(5)]],\n"
+" uint thread_index [[thread_position_in_grid]]);\n"
+"template\n"
+"[[host_name(\"index_select_half\")]]\n"
+"kernel void index_select<half>(device const IndexAB & indexAB [[buffer(0)]],\n"
+" device const void * indexSizes [[buffer(1)]],\n"
+" device const void * indexStrides [[buffer(2)]],\n"
+" device const uint3 * offsets [[buffer(3)]],\n"
+" device const void * inputData [[buffer(4)]],\n"
+" device void * outputData [[buffer(5)]],\n"
+" uint thread_index [[thread_position_in_grid]]);\n"
+"template\n"
+"[[host_name(\"index_select_int32\")]]\n"
+"kernel void index_select<int32_t>(device const IndexAB & indexAB [[buffer(0)]],\n"
+" device const void * indexSizes [[buffer(1)]],\n"
+" device const void * indexStrides [[buffer(2)]],\n"
+" device const uint3 * offsets [[buffer(3)]],\n"
+" device const void * inputData [[buffer(4)]],\n"
+" device void * outputData [[buffer(5)]],\n"
+" uint thread_index [[thread_position_in_grid]]);\n"
+"template\n"
+"[[host_name(\"index_select_int64\")]]\n"
+"kernel void index_select<int64_t>(device const IndexAB & indexAB [[buffer(0)]],\n"
+" device const void * indexSizes [[buffer(1)]],\n"
+" device const void * indexStrides [[buffer(2)]],\n"
+" device const uint3 * offsets [[buffer(3)]],\n"
+" device const void * inputData [[buffer(4)]],\n"
+" device void * outputData [[buffer(5)]],\n"
+" uint thread_index [[thread_position_in_grid]]);\n"
+"template\n"
+"[[host_name(\"index_select_int16\")]]\n"
+"kernel void index_select<int16_t>(device const IndexAB & indexAB [[buffer(0)]],\n"
+" device const void * indexSizes [[buffer(1)]],\n"
+" device const void * indexStrides [[buffer(2)]],\n"
+" device const uint3 * offsets [[buffer(3)]],\n"
+" device const void * inputData [[buffer(4)]],\n"
+" device void * outputData [[buffer(5)]],\n"
+" uint thread_index [[thread_position_in_grid]]);\n"
+"template\n"
+"[[host_name(\"index_select_uint8\")]]\n"
+"kernel void index_select<uint8_t>(device const IndexAB & indexAB [[buffer(0)]],\n"
+" device const void * indexSizes [[buffer(1)]],\n"
+" device const void * indexStrides [[buffer(2)]],\n"
+" device const uint3 * offsets [[buffer(3)]],\n"
+" device const void * inputData [[buffer(4)]],\n"
+" device void * outputData [[buffer(5)]],\n"
+" uint thread_index [[thread_position_in_grid]]);\n"
+"template\n"
+"[[host_name(\"index_select_bool\")]]\n"
+"kernel void index_select<bool>(device const IndexAB & indexAB [[buffer(0)]],\n"
+" device const void * indexSizes [[buffer(1)]],\n"
+" device const void * indexStrides [[buffer(2)]],\n"
+" device const uint3 * offsets [[buffer(3)]],\n"
+" device const void * inputData [[buffer(4)]],\n"
+" device void * outputData [[buffer(5)]],\n"
+" uint thread_index [[thread_position_in_grid]]);\n"
+"\n";
@@ -89,6 +89,9 @@ MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, Scalar
 MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const Tensor& tensor);
 MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, const Scalar& scalar, MPSDataType dataType);
 
+// Helper function to choose the right 'index_select' kernel function name
+bool selectIndexFunctionName(ScalarType scalar_type, std::string& indexFunctionName);
+
 MPSGraph* make_mps_graph();
 void printTensorNDArray(const Tensor& t);
 

@@ -130,9 +130,9 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) {
  case ScalarType::Short:
  return "Int16";
  case ScalarType::Char:
- return "UInt8";
- case ScalarType::Byte:
  return "Int8";
+ case ScalarType::Byte:
+ return "UInt8";
  case ScalarType::Bool:
  return "Bool";
  default:
@@ -385,6 +385,30 @@ string get_mem_format_string(c10::MemoryFormat memory_format) {
  return mem_format_key;
 }
 
+bool selectIndexFunctionName(ScalarType scalar_type, std::string& indexFunctionName) {
+ indexFunctionName = "index_select_";
+ switch (scalar_type) {
+ case ScalarType::Float:
+ indexFunctionName += "float"; return true;
+ case ScalarType::Half:
+ indexFunctionName += "half"; return true;
+ case ScalarType::Long:
+ indexFunctionName += "int64"; return true;
+ case ScalarType::Int:
+ indexFunctionName += "int32"; return true;
+ case ScalarType::Short:
+ indexFunctionName += "int16"; return true;
+ case ScalarType::Char:
+ indexFunctionName += "int8"; return true;
+ case ScalarType::Byte:
+ indexFunctionName += "uint8"; return true;
+ case ScalarType::Bool:
+ indexFunctionName += "bool"; return true;
+ default:
+ return false;
+ }
+}
+
 MPSGraphCache* MPSGraphCache::_instance_cache = nullptr;
 
 class MPSGraphCacheCallback : public IMpsAllocatorCallback {