Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions test/cpp/jit/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ if(USE_CUDA)
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp)
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_rng.cu)
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_utils.cpp)
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_indexing_ops.cpp)
endif()

add_executable(test_jit
Expand Down
32 changes: 32 additions & 0 deletions torch/csrc/jit/codegen/cuda/arith.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,38 @@ TensorView* unaryOp(
return unaryOp(type, cast_v1)->as<TensorView>();
}

// Returns a new TensorView equivalent to indexing `tv` at position `index`
// along axis `dim` (python-style: negative `dim` counts from the end).
// The output's root domain is the input's non-reduction (rfactor) root
// domain with the selected axis removed; a SelectOp records the operation.
TensorView* select(TensorView* tv, int dim, Int* index) {
  auto dom = TensorDomain::noReductions(tv->getMaybeRFactorDomain());
  TORCH_CHECK(dom.size() > 0, "select can not be applied to 0d tensor.");

  std::vector<IterDomain*> new_root;
  new_root.reserve(dom.size() - 1);

  // Normalize a negative axis before validating.
  if (dim < 0) {
    dim += static_cast<int>(dom.size());
  }

  // Cast once so comparisons below are signed/signed (avoids -Wsign-compare
  // between `int dim` and `size_t dom.size()`).
  const auto ndims = static_cast<int>(dom.size());
  TORCH_CHECK(
      dim >= 0 && dim < ndims,
      "Select on invalid axis, received: ",
      dim,
      " however tensor view only has ",
      dom.size(),
      " non-reduction dims.");

  // Copy every axis except the selected one into the output root domain.
  for (auto i : c10::irange(dom.size())) {
    if (static_cast<int>(i) != dim) {
      new_root.emplace_back(dom[i]->cloneWithoutRFactor());
    }
  }

  auto td = IrBuilder::create<TensorDomain>(
      new_root, TensorDomain::getContiguousContiguity(new_root));
  auto out = IrBuilder::create<TensorView>(td, *tv->getDataType());
  IrBuilder::create<SelectOp>(out, tv, dom[dim], index);
  return out;
}

// TENSOR FACTORIES
TensorView* rand(const std::vector<Val*>& shape, DataType dtype) {
auto n = shape.size();
Expand Down
8 changes: 6 additions & 2 deletions torch/csrc/jit/codegen/cuda/arith.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ TORCH_CUDA_CU_API WelfordResult WelfordRaw(
// import IrBuilder just for this one interface.
Int* init_N = nullptr);

TORCH_CUDA_CU_API TensorView* select(TensorView* tv, int dim, Int* index);

// RNG OPERATIONS
TORCH_CUDA_CU_API TensorView* rand(
const std::vector<Val*>& shape,
Expand Down Expand Up @@ -375,12 +377,14 @@ TORCH_CUDA_CU_API Val* atan2(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* atan2(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* atan2(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* atan2(TensorView* v1, TensorView* v2);
// div
// div: promote to float for integer division, has the same semantics as the
// python's operator /
TORCH_CUDA_CU_API Val* div(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* div(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, TensorView* v2);
// cpp_div: similar to div, but don't promote to float
// cpp_div: similar to div, but don't promote to float, this has the same
// semantics as the C++'s operator /
TORCH_CUDA_CU_API Val* cpp_div(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* cpp_div(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* cpp_div(Val* v1, TensorView* v2);
Expand Down
15 changes: 15 additions & 0 deletions torch/csrc/jit/codegen/cuda/dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,9 @@ void Expr::dispatch(T handler, Expr* expr) {
case ExprType::TernaryOp:
ptr(handler)->handle(expr->as<TernaryOp>());
return;
case ExprType::SelectOp:
ptr(handler)->handle(expr->as<SelectOp>());
return;
case ExprType::RNGOp:
ptr(handler)->handle(expr->as<RNGOp>());
return;
Expand Down Expand Up @@ -296,6 +299,9 @@ void Expr::constDispatch(T handler, const Expr* expr) {
case ExprType::TernaryOp:
ptr(handler)->handle(expr->as<TernaryOp>());
return;
case ExprType::SelectOp:
ptr(handler)->handle(expr->as<SelectOp>());
return;
case ExprType::RNGOp:
ptr(handler)->handle(expr->as<RNGOp>());
return;
Expand Down Expand Up @@ -490,6 +496,9 @@ void Expr::mutatorDispatch(T mutator, Expr* expr) {
case ExprType::TernaryOp:
ptr(mutator)->mutate(expr->as<TernaryOp>());
return;
case ExprType::SelectOp:
ptr(mutator)->mutate(expr->as<SelectOp>());
return;
case ExprType::RNGOp:
ptr(mutator)->mutate(expr->as<RNGOp>());
return;
Expand Down Expand Up @@ -749,6 +758,9 @@ void OptOutConstDispatch::handle(const BinaryOp* stmt) {
void OptOutConstDispatch::handle(const TernaryOp* stmt) {
unhandled(stmt);
}
// Default const-dispatch for SelectOp: fall through to the generic
// unhandled() hook, matching the other expression handlers in this file.
void OptOutConstDispatch::handle(const SelectOp* stmt) {
  unhandled(stmt);
}
void OptOutConstDispatch::handle(const RNGOp* stmt) {
unhandled(stmt);
}
Expand Down Expand Up @@ -905,6 +917,9 @@ void OptOutDispatch::handle(BinaryOp* stmt) {
void OptOutDispatch::handle(TernaryOp* stmt) {
unhandled(stmt);
}
// Default mutable-dispatch for SelectOp: fall through to the generic
// unhandled() hook, matching the other expression handlers in this file.
void OptOutDispatch::handle(SelectOp* stmt) {
  unhandled(stmt);
}
void OptOutDispatch::handle(RNGOp* stmt) {
unhandled(stmt);
}
Expand Down
4 changes: 4 additions & 0 deletions torch/csrc/jit/codegen/cuda/dispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ class EyeOp;
class UnaryOp;
class BinaryOp;
class TernaryOp;
class SelectOp;
class RNGOp;
class ReductionOp;
class GroupedReductionOp;
Expand Down Expand Up @@ -149,6 +150,7 @@ class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase {
virtual void handle(const UnaryOp* stmt);
virtual void handle(const BinaryOp* stmt);
virtual void handle(const TernaryOp* stmt);
virtual void handle(const SelectOp* stmt);
virtual void handle(const RNGOp* stmt);
virtual void handle(const ReductionOp* stmt);
virtual void handle(const GroupedReductionOp* stmt);
Expand Down Expand Up @@ -216,6 +218,7 @@ class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase {
virtual void handle(UnaryOp* stmt);
virtual void handle(BinaryOp* stmt);
virtual void handle(TernaryOp* stmt);
virtual void handle(SelectOp* stmt);
virtual void handle(RNGOp* stmt);
virtual void handle(ReductionOp* stmt);
virtual void handle(GroupedReductionOp* stmt);
Expand Down Expand Up @@ -324,6 +327,7 @@ class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase {
virtual void mutate(UnaryOp*);
virtual void mutate(BinaryOp*);
virtual void mutate(TernaryOp*);
virtual void mutate(SelectOp*);
virtual void mutate(RNGOp*);
virtual void mutate(ReductionOp*);
virtual void mutate(GroupedReductionOp*);
Expand Down
3 changes: 2 additions & 1 deletion torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2633,7 +2633,8 @@ ScheduleHeuristic SegmentCandidateFinder::deriveHeuristic(
SegmentedGroup* group) {
Fusion* fusion = segmented_fusion_->completeFusion();
auto h = tryMerge(fusion, runtime_info_, group);
TORCH_INTERNAL_ASSERT(h.has_value());
TORCH_INTERNAL_ASSERT(
h.has_value(), "Can not find a scheduler to schedule fusion segment");
return h.value();
}

Expand Down
89 changes: 50 additions & 39 deletions torch/csrc/jit/codegen/cuda/index_compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,11 @@ Val* getProducerIndexWithHalo(
const TensorView* producer_tv,
size_t producer_axis,
Val* producer_index,
const TensorView* consumer_tv) {
const auto offset =
getProducerHaloOffset(producer_tv, producer_axis, consumer_tv);
const TensorView* consumer_tv,
bool is_overriden_index) {
const auto offset = is_overriden_index
? 0
: getProducerHaloOffset(producer_tv, producer_axis, consumer_tv);

if (offset == 0) {
return producer_index;
Expand Down Expand Up @@ -1460,7 +1462,8 @@ Val* hoistProducerIndex(
std::vector<Val*> Index::getGlobalProducerStridedIndices(
TensorView* producer_tv,
const TensorView* consumer_tv,
const std::vector<kir::ForLoop*>& loops) {
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, Val*>& override_index) {
FUSER_PERF_SCOPE("GpuLower::Lower::getGlobalProducerIndex");

// Replay producer to look like consumer so we can index on producer since
Expand Down Expand Up @@ -1545,23 +1548,6 @@ std::vector<Val*> Index::getGlobalProducerStridedIndices(
continue;
}

Val* root_ind = nullptr;
if (producer_indexing.indexMap().find(root_dom[dim]) !=
producer_indexing.indexMap().end()) {
root_ind = producer_indexing.indexMap().at(root_dom[dim]);
} else if (root_dom[dim]->isBroadcast()) {
root_ind = GpuLower::current()->kernel()->zeroVal();
}

TORCH_INTERNAL_ASSERT(
root_ind != nullptr,
"Couldn't find root mapping for ",
producer_tv->toString(),
" dim: ",
dim,
" id: ",
root_dom[dim]->toString());

if (producer_tv->domain()->contiguity()[dim]) {
// If contig, used the stored stride which may be the previous
// dimensions stride * previous dimensions size
Expand Down Expand Up @@ -1591,18 +1577,27 @@ std::vector<Val*> Index::getGlobalProducerStridedIndices(
continue;
}

TORCH_INTERNAL_ASSERT(
Val* root_ind = nullptr;
auto override_it = override_index.find(root_dom[i]);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking about passing the optional map to getTensorIndexFromIdGraph to provide an initial ID-to-index map. That would be more consistent if we would want to allow the same initial map in consumer indexing.

That said, I think this is good enough for now given that the whole indexing code would be redesigned.

Pinging @csarofeen

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed

if (override_it != override_index.end()) {
root_ind = override_it->second;
} else if (
producer_indexing.indexMap().find(root_dom[i]) !=
producer_indexing.indexMap().end(),
"Couldn't find root mapping for TV",
producer_tv->name(),
producer_indexing.indexMap().end()) {
root_ind = producer_indexing.indexMap().at(root_dom[i]);
} else if (root_dom[i]->isBroadcast()) {
root_ind = GpuLower::current()->kernel()->zeroVal();
}

TORCH_INTERNAL_ASSERT(
root_ind != nullptr,
"Couldn't find root mapping for ",
producer_tv->toString(),
" dim: ",
i,
" id: ",
root_dom[i]->toString());

auto root_ind = producer_indexing.indexMap().at(root_dom[i]);

// index hoist must be done before the adjustments for halo
root_ind = hoistProducerIndex(
root_dom[i],
Expand All @@ -1615,7 +1610,12 @@ std::vector<Val*> Index::getGlobalProducerStridedIndices(
loops,
root_ind);

root_ind = getProducerIndexWithHalo(producer_tv, i, root_ind, consumer_tv);
root_ind = getProducerIndexWithHalo(
producer_tv,
i,
root_ind,
consumer_tv,
override_index.count(root_dom[i]));

root_ind = getProducerIndexWithGather(
root_ind,
Expand Down Expand Up @@ -1686,7 +1686,8 @@ std::unordered_map<IterDomain*, IterDomain*> mapAllProducerDomainsToConsumer(
std::vector<Val*> Index::getNonGlobalProducerStridedIndices(
TensorView* producer_tv,
const TensorView* consumer_tv,
const std::vector<kir::ForLoop*>& loops) {
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, Val*>& override_index) {
const auto gpu_lower = GpuLower::current();

// Replay producer to look like consumer so we can index on producer since our
Expand Down Expand Up @@ -1827,7 +1828,10 @@ std::vector<Val*> Index::getNonGlobalProducerStridedIndices(
" id: ",
root_dom[i]->toString());

auto root_ind_i = index_map.at(root_dom[i]);
auto override_it = override_index.find(root_dom[i]);
auto root_ind_i =
(override_it != override_index.end() ? override_it->second
: index_map.at(root_dom[i]));

// index hoist must be done before the adjustments for halo
root_ind_i = hoistProducerIndex(
Expand All @@ -1841,8 +1845,12 @@ std::vector<Val*> Index::getNonGlobalProducerStridedIndices(
loops,
root_ind_i);

root_ind_i =
getProducerIndexWithHalo(producer_tv, i, root_ind_i, consumer_tv);
root_ind_i = getProducerIndexWithHalo(
producer_tv,
i,
root_ind_i,
consumer_tv,
override_index.count(root_dom[i]));

root_ind_i = getProducerIndexWithGather(
root_ind_i,
Expand Down Expand Up @@ -2226,7 +2234,8 @@ std::vector<Val*> Index::getNonGlobalConsumerStridedIndices(
std::vector<Val*> Index::getProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops) {
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, Val*>& override_index) {
FUSER_PERF_SCOPE("GpuLower::Lower::Index::getProducerStridedIndices");
if (producer->domain()->noReductions().size() == 0) {
return std::vector<Val*>(
Expand All @@ -2236,11 +2245,11 @@ std::vector<Val*> Index::getProducerStridedIndices(

std::vector<Val*> strided_indices;
if (producer->getMemoryType() == MemoryType::Global) {
strided_indices =
getGlobalProducerStridedIndices(producer, consumer, loops);
strided_indices = getGlobalProducerStridedIndices(
producer, consumer, loops, override_index);
} else {
strided_indices =
getNonGlobalProducerStridedIndices(producer, consumer, loops);
strided_indices = getNonGlobalProducerStridedIndices(
producer, consumer, loops, override_index);
}

TORCH_INTERNAL_ASSERT(
Expand All @@ -2256,8 +2265,10 @@ std::vector<Val*> Index::getProducerStridedIndices(
kir::TensorIndex* Index::getProducerIndex(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops) {
auto strided_indices = getProducerStridedIndices(producer, consumer, loops);
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, Val*>& override_index) {
auto strided_indices =
getProducerStridedIndices(producer, consumer, loops, override_index);
return SimplifyingIrBuilder::create<kir::TensorIndex>(
producer, strided_indices);
}
Expand Down
12 changes: 8 additions & 4 deletions torch/csrc/jit/codegen/cuda/index_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,8 @@ class Index {
static std::vector<Val*> getNonGlobalProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, Val*>& override_index = {});

// Consumer indexing if it's in shared or local memory
static std::vector<Val*> getNonGlobalConsumerStridedIndices(
Expand All @@ -320,7 +321,8 @@ class Index {
static std::vector<Val*> getGlobalProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, Val*>& override_index = {});

// Consumer indexing if it's in global memory
static std::vector<Val*> getGlobalConsumerStridedIndices(
Expand All @@ -344,7 +346,8 @@ class Index {
static kir::TensorIndex* getProducerIndex(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, Val*>& override_index = {});

// Consumer index dispatch
static kir::TensorIndex* getConsumerIndex(
Expand All @@ -358,7 +361,8 @@ class Index {
static std::vector<Val*> getProducerStridedIndices(
TensorView* producer,
const TensorView* consumer,
const std::vector<kir::ForLoop*>& loops);
const std::vector<kir::ForLoop*>& loops,
const std::unordered_map<IterDomain*, Val*>& override_index = {});

//! Returns a vector of strided indices mapped onto the (rfactor)
//! root domain of a consumer tensor. The size of the returned
Expand Down
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/ir_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ IR_BUILDER_INSTANTIATE(EyeOp)
IR_BUILDER_INSTANTIATE(UnaryOp)
IR_BUILDER_INSTANTIATE(BinaryOp)
IR_BUILDER_INSTANTIATE(TernaryOp)
IR_BUILDER_INSTANTIATE(SelectOp)
IR_BUILDER_INSTANTIATE(RNGOp)
IR_BUILDER_INSTANTIATE(ReductionOp)
IR_BUILDER_INSTANTIATE(GroupedReductionOp)
Expand Down
4 changes: 4 additions & 0 deletions torch/csrc/jit/codegen/cuda/ir_cloner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ void IrCloner::handle(const TernaryOp* op) {
clone_ = IrBuilder::clone(op, this);
}

// Clone a SelectOp node via IrBuilder::clone, storing the result in
// clone_ like every other op handler in this file.
void IrCloner::handle(const SelectOp* op) {
  clone_ = IrBuilder::clone(op, this);
}

void IrCloner::handle(const RNGOp* op) {
clone_ = IrBuilder::clone(op, this);
}
Expand Down
Loading