Segment self mapping fusions #1954

Merged · 8 commits · Sep 14, 2022

73 changes: 42 additions & 31 deletions torch/csrc/jit/codegen/cuda/compute_at_map.cpp
@@ -6,6 +6,8 @@
#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
#include <torch/csrc/jit/codegen/cuda/transform_iter.h>

#include <tuple>

namespace torch {
namespace jit {
namespace fuser {
@@ -29,8 +31,22 @@ bool idIsALeafDomain(IterDomain* id, TensorView* tv) {

} // namespace

IterDomainGraph::IterDomainGraph(Fusion* fusion) {
IterDomainGraph::IterDomainGraph(Fusion* fusion, bool allow_self_mapping) {
build(fusion);

if (!allow_self_mapping) {
TORCH_INTERNAL_ASSERT(
!hasSelfMapping(),
"Unsupported domain mapping detected in ",
std::get<0>(*self_mapping_info_)->toString(),
". ",
std::get<3>(*self_mapping_info_),
" domains, ",
std::get<1>(*self_mapping_info_)->toString(),
" and ",
std::get<2>(*self_mapping_info_)->toString(),
", are mapped with each other.");
}
}

//! Map corresponding inputs and outputs of swizzle op together
@@ -197,7 +213,8 @@ c10::optional<std::pair<IterDomain*, IterDomain*>> detectMappablePair(
// those domains should never be mapped with each other. It may be
// possible to lift this assumption, but it's unclear if it could
// matter in practice.
void failIfSelfMappingExists(Fusion* fusion, const IterDomainGraph& id_graph) {
c10::optional<std::tuple<TensorView*, IterDomain*, IterDomain*, std::string>>
findFirstSelfMapping(Fusion* fusion, const IterDomainGraph& id_graph) {
for (auto tv : ir_utils::allTvs(fusion)) {
// For each tensor, make sure root, rfactor and leaf domains
// should not include domains that are mapped with another domain
@@ -207,44 +224,39 @@ void failIfSelfMappingExists(Fusion* fusion, const IterDomainGraph& id_graph) {
// Root domains
auto self_mappped_root_pair =
detectMappablePair(tv->getRootDomain(), id_graph);
TORCH_INTERNAL_ASSERT(
!self_mappped_root_pair.has_value(),
"Unsupported domain mapping detected in ",
tv->toString(),
". Root domains, ",
self_mappped_root_pair->first->toString(),
" and ",
self_mappped_root_pair->second->toString(),
", are mapped with each other.");
if (self_mappped_root_pair.has_value()) {
return std::make_tuple(
tv,
self_mappped_root_pair->first,
self_mappped_root_pair->second,
"Root");
}

// Rfactor domains
if (tv->hasRFactor()) {
auto self_mappped_rf_pair =
detectMappablePair(tv->getRFactorDomain(), id_graph);
TORCH_INTERNAL_ASSERT(
!self_mappped_rf_pair.has_value(),
"Unsupported domain mapping detected in ",
tv->toString(),
". RFactor domains, ",
self_mappped_rf_pair->first->toString(),
" and ",
self_mappped_rf_pair->second->toString(),
", are mapped with each other.");
if (self_mappped_rf_pair.has_value()) {
return std::make_tuple(
tv,
self_mappped_rf_pair->first,
self_mappped_rf_pair->second,
"RFactor");
}
}

// Leaf domains
auto self_mappped_leaf_pair =
detectMappablePair(tv->domain()->domain(), id_graph);
TORCH_INTERNAL_ASSERT(
!self_mappped_leaf_pair.has_value(),
"Unsupported domain mapping detected in ",
tv->toString(),
". Leaf domains, ",
self_mappped_leaf_pair->first->toString(),
" and ",
self_mappped_leaf_pair->second->toString(),
", are mapped with each other.");
if (self_mappped_leaf_pair.has_value()) {
return std::make_tuple(
tv,
self_mappped_leaf_pair->first,
self_mappped_leaf_pair->second,
"Leaf");
}
}
return c10::nullopt;
}

} // namespace
@@ -591,8 +603,7 @@ void IterDomainGraph::build(Fusion* fusion) {
}
}
}

failIfSelfMappingExists(fusion, *this);
self_mapping_info_ = findFirstSelfMapping(fusion, *this);
}

void IterDomainGraph::initializeId(
9 changes: 8 additions & 1 deletion torch/csrc/jit/codegen/cuda/compute_at_map.h
@@ -54,7 +54,7 @@ namespace cuda {
// Do not forward through any broadcast IDs
class TORCH_CUDA_CU_API IterDomainGraph {
public:
IterDomainGraph(Fusion* fusion);
IterDomainGraph(Fusion* fusion, bool allow_self_mapping = false);

const DisjointSets<IterDomain*>& permissiveNodes() const {
return permissive_nodes_;
@@ -88,6 +88,10 @@ class TORCH_CUDA_CU_API IterDomainGraph {
return view_rfactor_ids_;
}

bool hasSelfMapping() const {
return self_mapping_info_.has_value();
}

private:
void build(Fusion* fusion);

@@ -116,6 +120,9 @@ class TORCH_CUDA_CU_API IterDomainGraph {
VectorOfUniqueEntries<IterDomain*> all_ids_;

std::unordered_set<IterDomain*> view_rfactor_ids_;

c10::optional<std::tuple<TensorView*, IterDomain*, IterDomain*, std::string>>
self_mapping_info_ = c10::nullopt;
};

class TrivialReductionInfo;
3 changes: 3 additions & 0 deletions torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -1501,6 +1501,9 @@ bool checkCanSchedule(
if (!isConnectedFusionGraph(fusion)) {
return false;
}
if (IterDomainGraph(fusion, /*allow_self_mapping=*/true).hasSelfMapping()) {
return false;
}
if (!SchedulerType::canScheduleCompileTime(fusion)) {
return false;
}
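
For reference, a minimal sketch of how the new constructor flag and hasSelfMapping() query compose outside the scheduler, using the same fusion the test below constructs. It assumes the nvFuser headers and test helpers (makeContigTensor, transpose, add) that appear elsewhere in this diff; the wrapper function name is hypothetical and the code is an illustration, not part of this PR.

// Sketch only: probe a fusion for self mapping without tripping the
// constructor assert. Helper name is hypothetical; the calls mirror
// registry.cpp above and the test below.
bool fusionHasSelfMapping() {
  auto fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  auto tv0 = makeContigTensor(2);
  fusion_ptr->addInput(tv0);
  auto tv1 = transpose(tv0, 0, 1);
  auto tv2 = add(tv0, tv1); // tv0 meets its own transpose -> self mapping
  fusion_ptr->addOutput(tv2);

  // allow_self_mapping=true skips the TORCH_INTERNAL_ASSERT in the new
  // constructor, so the graph can be built and then queried.
  IterDomainGraph id_graph(fusion_ptr.get(), /*allow_self_mapping=*/true);
  return id_graph.hasSelfMapping(); // true here; checkCanSchedule() then
                                    // returns false and the fusion is segmented
}
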
56 changes: 56 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
@@ -4,6 +4,7 @@

#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/inline_propagator.h>
#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>
@@ -795,6 +796,61 @@ TEST_F(NVFuserTest, FusionViewNoTranspose_CUDA) {
TORCH_CHECK(!hasAtLeastTwoValidGroups(&fusion));
}

TEST_F(NVFuserTest, FusionTransposeSelfMapping_CUDA) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(2);
fusion.addInput(tv0);
auto tv1 = transpose(tv0, 0, 1);
auto tv2 = add(tv0, tv1);
fusion.addOutput(tv2);

EXPECT_THAT(
[&]() { IterDomainGraph(fusion_ptr.get()); },
testing::ThrowsMessage<c10::Error>(
testing::HasSubstr("Unsupported domain mapping detected")));

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn({5, 5}, options);

FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cg_outputs = executor_cache.runFusionWithInputs({t0});

auto ref = t0.transpose(0, 1) + t0;

testValidate(
executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
}

#if 0
// silent wrong result
TEST_F(NVFuserTest, FusionTransposeViewSelfMapping_CUDA) {
Inline review thread on lines +827 to +829:
Owner:
This doesn't get caught?

Owner:
Random thought: we might want to make sure the disjoint view sets, and something similar for the axes involved in a permutation, match. The issue here seems to be that we're transposing the same partial dimensions. We probably want to make sure that, throughout the fusion, the dimensions active in a transpose are dimensions that are disjoint in the view set.

Owner:
Yeah, thinking through the analysis, I don't know if we'd need to build something like a transpose disjoint map, but I think it might be enough if we check that all the dimensions active in a transpose are disjoint. I'm wondering if we could somehow get in trouble with a series of transposes that are all individually disjoint in the view map, but could still exhibit the behavior above, where we're effectively transposing "partial" dimensions.

Owner:
Any thoughts on this, @zasdfgbnm?

zasdfgbnm (Collaborator, Author) commented on Sep 13, 2022:

> might be enough if we check all the dimensions active in a transpose are disjoint

I feel that this is in the wrong direction. For example, torch.zeros(6).view(2, 3).transpose(0, 1) should be totally fine to schedule, but the transpose dimensions are not disjoint in the view disjoint map.

In my opinion, using solely a disjoint set is not sufficient for modeling this problem. That is, we cannot just come up with a new fancy disjoint set of IterDomains such that a fusion is schedulable iff its disjoint set is really disjoint.

I think we should do the following analysis: For the DAG

       T0[I0, I1]
       /        \
     view   transpose
      |          |
T1[I2, I3]   T2[I4, I5]
       \       /
       T3[I6, I7]

We first start from the view operation. What the view operation tells us is:

  1. On the producer side, {I0, I1} must be ordered as (I0, I1) and I0 is contiguous.
  2. On the consumer side, {I2, I3} must be ordered as (I2, I3) and I2 is contiguous.

If we propagate this order and contiguity information along the DAG, at some point we will see a conflict. For example, if we propagate T0->T2 and T1->T3->T2, then:

  1. From the first path, we will conclude that {I4, I5} must be ordered as (I5, I4) and I5 is contiguous.
  2. From the second path, we will conclude that {I4, I5} must be ordered as (I4, I5) and I4 is contiguous.

The above 1 and 2 are in conflict, so we reject this fusion.
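
To make the proposed check concrete, here is a minimal standalone sketch of detecting such an order/contiguity conflict. It is an editorial illustration, not code from this PR or from nvFuser; OrderConstraint, conflictsWith, and the hard-coded constraints are hypothetical stand-ins for whatever propagation machinery would actually compute them.

// Sketch of the conflict check described above. All names are hypothetical.
#include <iostream>
#include <string>
#include <vector>

// An ordering/contiguity requirement on a set of iteration domains, as a view
// op would impose on its producer or consumer side.
struct OrderConstraint {
  std::vector<std::string> order; // required relative order of the domains
  std::string contiguous;         // domain required to be contiguous
};

// Two constraints over the same domains conflict if they demand a different
// order or a different contiguous domain.
bool conflictsWith(const OrderConstraint& a, const OrderConstraint& b) {
  return a.order != b.order || a.contiguous != b.contiguous;
}

int main() {
  // Constraint on T2's domains propagated along T0 -> T2 (through the
  // transpose): order (I5, I4), with I5 contiguous.
  OrderConstraint via_transpose{{"I5", "I4"}, "I5"};
  // Constraint propagated along T1 -> T3 -> T2 (through the view):
  // order (I4, I5), with I4 contiguous.
  OrderConstraint via_view{{"I4", "I5"}, "I4"};

  if (conflictsWith(via_transpose, via_view)) {
    std::cout << "Conflicting order/contiguity constraints: "
                 "reject (segment) this fusion\n";
  }
  return 0;
}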

zasdfgbnm (Collaborator, Author):
Should we merge this PR, and discuss and fix the skipped test in a follow-up? Transpose is already enabled on TorchScript in TOT devel, and this would prevent the bug from happening on transpose.

Owner:
Yeah, let's merge this, then please move your comment to an issue and we can discuss further there.

std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
Fusion& fusion = *fusion_ptr.get();
FusionGuard fg(&fusion);

auto tv0 = makeContigTensor(2);
fusion.addInput(tv0);
auto tv1 = transpose(tv0, 0, 1);
auto tv2 = view(tv0, {2, 3}, {3, 2});
auto tv3 = add(tv1, tv2);
fusion.addOutput(tv3);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
auto t0 = at::randn({2, 3}, options);

FusionExecutorCache executor_cache(std::move(fusion_ptr));
auto cg_outputs = executor_cache.runFusionWithInputs({t0});

auto ref = t0.transpose(0, 1) + t0.view({3, 2});

testValidate(
executor_cache.fusion(), cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
}
#endif

// t0------------.
// t2->broadcast->sub->mul->relu->t6
// t1------------------'