From e84fff8dacf382351b725d0d1f469168e0db28c3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 24 Aug 2022 10:19:53 -0700 Subject: [PATCH 01/32] Enable transpose scheduler --- torch/csrc/jit/codegen/cuda/scheduler/registry.cpp | 6 ------ torch/csrc/jit/codegen/cuda/utils.cpp | 5 +---- torch/csrc/jit/codegen/cuda/utils.h | 1 - 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index ac7f66836a87c..2498c4b55b1f2 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1256,12 +1256,6 @@ class TransposeScheduler : public SchedulerEntry { } static bool canScheduleCompileTime(Fusion* fusion) { - if (!isOptionEnabled(EnableOption::TransposeScheduler)) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, "not enabled"); - return false; - } - // Temporarily disallow view in transpose scheduler // TODO Add more testing before enabling auto view_tvs = scheduler_utils::getViewTVs(fusion); diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp index 5e82014c0c388..8eec9d7db0c45 100644 --- a/torch/csrc/jit/codegen/cuda/utils.cpp +++ b/torch/csrc/jit/codegen/cuda/utils.cpp @@ -169,8 +169,7 @@ auto parseEnableOptions() { {EnableOption::Complex, false}, {EnableOption::KernelProfile, false}, {EnableOption::LinearDecomposition, false}, - {EnableOption::ConvDecomposition, false}, - {EnableOption::TransposeScheduler, false}}; + {EnableOption::ConvDecomposition, false}}; if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_ENABLE")) { c10::string_view options_view(dump_options); @@ -185,8 +184,6 @@ auto parseEnableOptions() { options_map[EnableOption::LinearDecomposition] = true; } else if (token == "conv_decomposition") { options_map[EnableOption::ConvDecomposition] = true; - } else if (token == "transpose_scheduler") { - options_map[EnableOption::TransposeScheduler] = true; } else { TORCH_CHECK( false, diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h index 679776b383af0..6c29b40550f00 100644 --- a/torch/csrc/jit/codegen/cuda/utils.h +++ b/torch/csrc/jit/codegen/cuda/utils.h @@ -80,7 +80,6 @@ enum class EnableOption { KernelProfile, //! Enable intra-kernel performance profiling LinearDecomposition, //! Enable linear-bias decomposition ConvDecomposition, //! Enable conv-bias decomposition - TransposeScheduler //! 
Enable the experimental transpose scheduler }; TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option); From bb8cef111c4828a9690ba68752dca84acd83fd08 Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 10:52:49 -0700 Subject: [PATCH 02/32] add lower index resolution --- torch/csrc/jit/codegen/cuda/index_compute.cpp | 33 +++++++++++ .../jit/codegen/cuda/lower_index_compute.cpp | 57 +++++++++++++++++++ .../jit/codegen/cuda/lower_index_compute.h | 12 ++++ 3 files changed, 102 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index 895c3e4bd96ef..e7feff88fb10c 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -650,6 +650,8 @@ IndexCompute::IndexCompute( } void IndexCompute::run(const LoopIndexing& loop_indexing) { + TORCH_INTERNAL_ASSERT( + concrete_id_pass_, "concrete pass only for this option"); // Apply loop swizzles if there are any that outputs to // the loop domains. // Currently only support loop swizzles that directly output @@ -669,9 +671,40 @@ void IndexCompute::run(const LoopIndexing& loop_indexing) { } } + // Resolve the out of line expressions first: + std::unordered_map permissive_index_map; + + for (auto expr : loop_indexing.getBackwardOutOfLineExprList()) { + handle(expr); + + // Collect backward results from this expression if they are + // made available in by this expression. + auto id_inputs = ir_utils::filterByType(expr->inputs()); + for (auto id : id_inputs) { + auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id)); + if (idx_it != index_map_.end()) { + permissive_index_map[GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE)] = idx_it->second; + } + } + } + // Run through the loop indexing expressions and generate // the indexing integer math for the concrete ids. for (auto expr : loop_indexing.getBackwardExprList()) { + auto id_outputs = ir_utils::filterByType(expr->outputs()); + + for (auto id : id_outputs) { + auto concrete_id = ir_utils::caMapExactConcreteId(id); + if (!index_map_.count(concrete_id)) { + auto permissive_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE); + auto permissive_it = permissive_index_map.find(permissive_id); + if (permissive_it != permissive_index_map.end()) { + index_map_[concrete_id] = permissive_it->second; + } + } + } handle(expr); } } diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp index 70b019a4cc48c..2d4444d340903 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp @@ -438,6 +438,7 @@ class LoopIndexingAnalysis { indexing.loop_root_ = loop_root_domains_; indexing.loop_domains_ = loop_domains_.vector(); indexing.index_exprs_ = replayed_exprs_; + indexing.out_of_line_exprs_ = out_of_line_exprs_; return indexing; } @@ -481,6 +482,12 @@ class LoopIndexingAnalysis { //! loop_domains_ with all of these iter domains. void constructLoopDomains(); + //! Fills out_of_line_exprs_ by traversing the selected list of + //! expressions in reverse topological order and collect iterdomains + //! on the indexing paths that only involves leaf id's on the right + //! of consumer's ca axis. + void collectOutOfLineExprs(); + private: //! Original loop nest input to derive info from. const std::vector& loops_; @@ -521,6 +528,10 @@ class LoopIndexingAnalysis { //! 
Selected list of exprs that will produce and consume each //! of the exact concrete ids from the loop nest exactly once. std::vector replayed_exprs_; + + //! Set of expressions from the selected list that can be + //! resolved from axes on the right of ca axes. + std::vector out_of_line_exprs_; }; LoopIndexingAnalysis::LoopIndexingAnalysis( @@ -559,6 +570,10 @@ LoopIndexingAnalysis::LoopIndexingAnalysis( // Reconstruct the iterdomain view of the original loopnest after resolving // the exact definition of each index. constructLoopDomains(); + + //! Collect the set of indexing expressions that can be + //! resolved out of line. + collectOutOfLineExprs(); } void LoopIndexingAnalysis::validateLoopStructure( @@ -1088,6 +1103,48 @@ std::vector LoopIndexingTraversal::getExprList() { } // namespace +void LoopIndexingAnalysis::collectOutOfLineExprs() { + // Keep track of all the id's that can be resolved without + // iterdomains on the left of ca axes. + std::unordered_set out_of_line_ids; + + // Start the set with all the leaf ids. + std::transform( + consumer_tv_->domain()->domain().begin() + + consumer_tv_->getComputeAtPosition(), + consumer_tv_->domain()->domain().end(), + std::inserter(out_of_line_ids, out_of_line_ids.end()), + ir_utils::caMapExactConcreteId); + + // Get the original selected list of index expressions + // in reverse topological order. + auto backward_expr_list = + LoopIndexingTraversal::backwardTopologicalOrder(replayed_exprs_); + + for (auto expr : backward_expr_list) { + auto id_outputs = ir_utils::filterByType(expr->outputs()); + if ( + // Check that all of the outputs are out of line + std::all_of( + id_outputs.begin(), + id_outputs.end(), + [&out_of_line_ids](IterDomain* id) { + return out_of_line_ids.count(ir_utils::caMapExactConcreteId(id)); + })) { + // Record out of line expression + out_of_line_exprs_.push_back(expr); + + // Add all of the expression inputs as out of line id's. + auto id_inputs = ir_utils::filterByType(expr->inputs()); + std::transform( + id_inputs.begin(), + id_inputs.end(), + std::inserter(out_of_line_ids, out_of_line_ids.end()), + ir_utils::caMapExactConcreteId); + } + } +} + std::vector LoopIndexing::getForwardExprList() const { return LoopIndexingTraversal::forwardTopologicalOrder(index_exprs_); } diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.h b/torch/csrc/jit/codegen/cuda/lower_index_compute.h index d8d4dd7103b3a..4b81fd0dec0c5 100644 --- a/torch/csrc/jit/codegen/cuda/lower_index_compute.h +++ b/torch/csrc/jit/codegen/cuda/lower_index_compute.h @@ -127,6 +127,12 @@ class LoopIndexing { //! topological order. std::vector getBackwardExprList() const; + //! Returns the set of out of line expressions in + //! reverse topological order. + const std::vector& getBackwardOutOfLineExprList() const { + return out_of_line_exprs_; + } + //! Returns all exact concrete id's that were produced //! or consumed in the selected indexing expressions std::unordered_set getAllExactConcreteIdSet() const; @@ -152,6 +158,12 @@ class LoopIndexing { //! The selected sequence of expressions that should represent //! the correct indexing math from the given loop nest. std::vector index_exprs_; + + //! The subset of sequence of expressions that can be resolved + //! with only the iterdomains on the right of consumer tv's ca + //! axis. + //! Expressions are ordered in reverse topological order. 
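The selection rule behind collectOutOfLineExprs is a single backward sweep: seed a set with the leaf iterdomains to the right of the consumer's compute-at position, accept an expression only when every one of its outputs is already in the set, and then add its inputs so that earlier expressions can be accepted too. A minimal standalone sketch of that rule, with plain ints standing in for exact-mapped concrete IterDomains (toy types for illustration, not part of this patch):

    #include <unordered_set>
    #include <vector>

    struct ToyExpr {
      std::vector<int> inputs;
      std::vector<int> outputs;
    };

    // `backward_order` is assumed to already be in reverse topological order,
    // mirroring LoopIndexingTraversal::backwardTopologicalOrder above.
    std::vector<const ToyExpr*> collectOutOfLine(
        const std::vector<ToyExpr>& backward_order,
        std::unordered_set<int> out_of_line_ids) { // seeded with the not-inlined leaf ids
      std::vector<const ToyExpr*> out_of_line_exprs;
      for (const ToyExpr& e : backward_order) {
        bool all_outputs_resolved = true;
        for (int id : e.outputs) {
          if (!out_of_line_ids.count(id)) {
            all_outputs_resolved = false;
            break;
          }
        }
        if (all_outputs_resolved) {
          // Every output is reachable from not-inlined ids, so the expression
          // is out of line and its inputs become resolvable as well.
          out_of_line_exprs.push_back(&e);
          out_of_line_ids.insert(e.inputs.begin(), e.inputs.end());
        }
      }
      return out_of_line_exprs;
    }

The member declared next holds exactly this ordered subset.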
+ std::vector out_of_line_exprs_; }; // When indexing there are sometimes an option to propagate an index down From 799230907888fa218b300c9156979f38a54d4de1 Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 13:52:24 -0700 Subject: [PATCH 03/32] add repro --- .../codegen/cuda/test/test_gpu_tensorcore.cpp | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp index c00d02c8a40dd..fa89f9e91a79a 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp @@ -2857,6 +2857,67 @@ TEST_F(NVFuserTest, FusionTuringMatmulLargeLoad_CUDA) { } } +TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(4); + auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = add(tv0, tv1); + fusion.addOutput(tv2); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input0 = at::randn({1, 1, 333, 1}, options); + at::Tensor input1 = at::randn({1, 1, 333, 1}, options); + + auto lparams = scheduleTranspose(&fusion, {input0, input1}); + + FusionExecutor fe; + fe.compileFusion(&fusion, {input0, input1}, lparams); + auto outputs = fe.runFusion({input0, input1}, lparams); + + auto tv_ref = input0 + input1; + + testValidate( + &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); +} + +TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + auto tv1 = makeContigTensor(2); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = set(tv0); + auto tv3 = broadcast(tv2, {true, false}); + auto tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + tv4->merge(0); + tv4->split(0, 32); + + tv0->computeAt(tv4, 1); + + tv2->split(-1, 8); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({123}, options); + at::Tensor t1 = at::randn({3, 123}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}); + + auto outputs = fe.runFusion({t0, t1}); + + auto tv_ref = t0 + t1; + + testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); +} + #undef NVFUSER_TEST_CUDA_ARCH_GUARD } // namespace jit From e9d09fe57264443cbe0ca22fd1c0b90d2f50175e Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 15:39:31 -0700 Subject: [PATCH 04/32] clear GPU memory after test --- torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h index 0247c33c8a726..05a4fd600b653 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h @@ -6,6 +6,7 @@ #include #include +#include #include @@ -36,6 +37,10 @@ class NVFuserTest : public ::testing::Test { GTEST_SKIP() << "skipping tests on pre-PASCAL GPUs"; } } + + void TearDown() override{ + c10::cuda::CUDACachingAllocator::emptyCache(); + } }; struct ValidationConstants { From f406e23bb49c755a2d97c04f839d5dc8bbc0384b Mon Sep 17 00:00:00 2001 From: shmsong Date: Wed, 24 Aug 2022 19:33:35 -0700 Subject: [PATCH 05/32] cleanup and comment --- torch/csrc/jit/codegen/cuda/index_compute.cpp | 88 +++++++++++++------ 
torch/csrc/jit/codegen/cuda/index_compute.h | 22 +++++ torch/csrc/jit/codegen/cuda/test/test_gpu.cpp | 63 +++++++++++++ .../codegen/cuda/test/test_gpu_tensorcore.cpp | 61 ------------- .../codegen/cuda/test/test_gpu_validator.h | 4 +- 5 files changed, 149 insertions(+), 89 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/index_compute.cpp b/torch/csrc/jit/codegen/cuda/index_compute.cpp index e7feff88fb10c..edd4fcf2c1acb 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.cpp +++ b/torch/csrc/jit/codegen/cuda/index_compute.cpp @@ -671,41 +671,77 @@ void IndexCompute::run(const LoopIndexing& loop_indexing) { } } - // Resolve the out of line expressions first: - std::unordered_map permissive_index_map; + // Resolve the index vals that could be resolved with only + // the loops that consumer_tv doesn't share with any of its + // consumers, i.e. the not-inlined loops that define consumer_tv + // values. + collectIndexIntoPermissiveMap(loop_indexing); + + // Run through the loop indexing expressions and generate + // the indexing integer math for the concrete ids. + for (auto expr : loop_indexing.getBackwardExprList()) { + // Resolve missing values from permissive map. + updateIndexMapFromPermissiveMap(expr); - for (auto expr : loop_indexing.getBackwardOutOfLineExprList()) { handle(expr); + } +} - // Collect backward results from this expression if they are - // made available in by this expression. - auto id_inputs = ir_utils::filterByType(expr->inputs()); - for (auto id : id_inputs) { - auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id)); - if (idx_it != index_map_.end()) { - permissive_index_map[GpuLower::current()->caMap()->getConcreteMappedID( - id, IdMappingMode::PERMISSIVE)] = idx_it->second; +void IndexCompute::collectIndexIntoPermissiveMap( + const LoopIndexing& loop_indexing) { + // Visit the expressions that only produces un-inlined iterdomains, + // in reverse topological order. + for (auto expr : loop_indexing.getBackwardOutOfLineExprList()) { + // Compute indexing vals for the expression inputs. + // + // This stage should run before any indexing computation so it could be + // made sure that all index values computed at this stage are + // the ones that can be resolved only with the not-inlined + // iterdomains. + // + auto id_outputs = ir_utils::filterByType(expr->outputs()); + if (std::all_of( + id_outputs.begin(), id_outputs.end(), [this](IterDomain* id) { + return index_map_.count(ir_utils::caMapExactConcreteId(id)); + })) { + // Visit this expression: + // LoopIndexingAnalysis::traverseFromDomainVals made sure that each + // concrete index is bound exactly once so computing these expressions + // early should still be consistent. + handle(expr); + + auto id_inputs = ir_utils::filterByType(expr->inputs()); + for (auto id : id_inputs) { + // Collect backward pass results from this expression if they are + // made available in by this expression. + auto idx_it = index_map_.find(ir_utils::caMapExactConcreteId(id)); + + if (idx_it != index_map_.end()) { + permissive_index_map_ + [GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE)] = idx_it->second; + } } } } +} - // Run through the loop indexing expressions and generate - // the indexing integer math for the concrete ids. 
- for (auto expr : loop_indexing.getBackwardExprList()) { - auto id_outputs = ir_utils::filterByType(expr->outputs()); - - for (auto id : id_outputs) { - auto concrete_id = ir_utils::caMapExactConcreteId(id); - if (!index_map_.count(concrete_id)) { - auto permissive_id = GpuLower::current()->caMap()->getConcreteMappedID( - id, IdMappingMode::PERMISSIVE); - auto permissive_it = permissive_index_map.find(permissive_id); - if (permissive_it != permissive_index_map.end()) { - index_map_[concrete_id] = permissive_it->second; - } +void IndexCompute::updateIndexMapFromPermissiveMap(const Expr* id_expr) { + auto id_outputs = ir_utils::filterByType(id_expr->outputs()); + for (auto id : id_outputs) { + auto concrete_id = ir_utils::caMapExactConcreteId(id); + // Only try to copy index val from permissive map when + // the index is missing. + if (!index_map_.count(concrete_id)) { + auto permissive_id = GpuLower::current()->caMap()->getConcreteMappedID( + id, IdMappingMode::PERMISSIVE); + // Write the permissive index val into index_map_ if the + // missing value is found here. + auto permissive_it = permissive_index_map_.find(permissive_id); + if (permissive_it != permissive_index_map_.end()) { + index_map_[concrete_id] = permissive_it->second; } } - handle(expr); } } diff --git a/torch/csrc/jit/codegen/cuda/index_compute.h b/torch/csrc/jit/codegen/cuda/index_compute.h index f064ebba293cb..3d865b4a8ceb0 100644 --- a/torch/csrc/jit/codegen/cuda/index_compute.h +++ b/torch/csrc/jit/codegen/cuda/index_compute.h @@ -86,6 +86,18 @@ class IndexCompute : public BackwardVisitor { //! based traversal. IterDomain* maybeGetExactMapConcreteID(IterDomain* id); + //! (Concrete indexing pass only) + //! Collect permissive index binding from the given expression. + //! See also permissive_map_ and LoopIndexing::getBackwardOutOfLineExprList. + void collectIndexIntoPermissiveMap(const LoopIndexing& loop_indexing); + + //! (Concrete indexing pass only) + //! Iterate through id_expr's input and pull index vals from permissive + //! map, when both of the following are true: + //! 1. the output id is missing in index_map_. + //! 2. the output id is found in permissive map. + void updateIndexMapFromPermissiveMap(const Expr* id_expr); + // Tensor domain we're mapping back to root const TensorDomain* td_; // NOLINT @@ -137,6 +149,16 @@ class IndexCompute : public BackwardVisitor { // pass. See also [Note on swizzle mode] SwizzleMode swizzle_mode_ = SwizzleMode::NoSwizzle; + // (Concrete id pass only) + // Contains the indexing math that could be resolved with only the + // iterdomains on the right of the consumer_tv's ca axis, i.e. the + // ones that corresponding to the loops that consumer_tv would not + // share with any of its consumers. + // These indexing vals should be kept separate from index_map_ and + // should only be used when the indexing traversal follows the + // order defined in LoopIndexingAnalysis::traverseFromDomainVals. 
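The interaction between the two maps is a guarded fallback: each output id is first looked up under its exact concrete id, and a value from the permissive map is copied in only when that exact entry is missing, so a result produced by the regular backward traversal is never overwritten. A minimal sketch of that lookup, with ints standing in for the two concrete-id granularities and strings for the index Vals (illustrative names only, not the nvfuser API):

    #include <string>
    #include <unordered_map>

    // Toy stand-ins: the exact and permissive concrete ids of one IterDomain.
    struct ToyId {
      int exact_concrete;
      int permissive_concrete;
    };

    // Mirrors updateIndexMapFromPermissiveMap: fill only a *missing* exact
    // entry, never overwrite one the backward traversal already produced.
    void fillFromPermissive(
        const ToyId& id,
        std::unordered_map<int, std::string>& index_map,
        const std::unordered_map<int, std::string>& permissive_index_map) {
      if (index_map.count(id.exact_concrete)) {
        return; // the exact result wins; the permissive value is only a fallback
      }
      auto it = permissive_index_map.find(id.permissive_concrete);
      if (it != permissive_index_map.end()) {
        index_map[id.exact_concrete] = it->second;
      }
    }

The map declared below is the storage that this fallback reads from.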
+ std::unordered_map permissive_index_map_; + public: const std::unordered_map& indexMap() const { return index_map_; diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp index 4f72bf93ba36e..8f2d3927eb1c1 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp @@ -25512,6 +25512,69 @@ TEST_F(NVFuserTest, FusionSizeDependentData_CUDA) { executor_cache.fusion(), cg_outputs, {a}, {a + 123}, __LINE__, __FILE__); } +// Repro for issue #1925 +TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeSymbolicTensor(4); + auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = add(tv0, tv1); + fusion.addOutput(tv2); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input0 = at::randn({1, 1, 333, 1}, options); + at::Tensor input1 = at::randn({1, 1, 333, 1}, options); + + auto lparams = scheduleTranspose(&fusion, {input0, input1}); + + FusionExecutor fe; + fe.compileFusion(&fusion, {input0, input1}, lparams); + auto outputs = fe.runFusion({input0, input1}, lparams); + + auto tv_ref = input0 + input1; + + testValidate( + &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); +} + +// Repro for issue #1873 +TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(1); + auto tv1 = makeContigTensor(2); + fusion.addInput(tv0); + fusion.addInput(tv1); + auto tv2 = set(tv0); + auto tv3 = broadcast(tv2, {true, false}); + auto tv4 = add(tv3, tv1); + fusion.addOutput(tv4); + + tv4->merge(0); + tv4->split(0, 32); + + tv0->computeAt(tv4, 1); + + tv2->split(-1, 8); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor t0 = at::randn({123}, options); + at::Tensor t1 = at::randn({3, 123}, options); + + FusionExecutor fe; + fe.compileFusion(&fusion, {t0, t1}); + + auto outputs = fe.runFusion({t0, t1}); + + auto tv_ref = t0 + t1; + + testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); +} + TEST_F(NVFuserTest, FusionPredicateUnshare_CUDA) { // https://github.com/csarofeen/pytorch/issues/1926 std::unique_ptr fusion_ptr = std::make_unique(); diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp index fa89f9e91a79a..c00d02c8a40dd 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp @@ -2857,67 +2857,6 @@ TEST_F(NVFuserTest, FusionTuringMatmulLargeLoad_CUDA) { } } -TEST_F(NVFuserTest, FusionScheduleTransposeRepro1_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeSymbolicTensor(4); - auto tv1 = makeConcreteTensor({-1, -1, -1, 1}); - fusion.addInput(tv0); - fusion.addInput(tv1); - auto tv2 = add(tv0, tv1); - fusion.addOutput(tv2); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor input0 = at::randn({1, 1, 333, 1}, options); - at::Tensor input1 = at::randn({1, 1, 333, 1}, options); - - auto lparams = scheduleTranspose(&fusion, {input0, input1}); - - FusionExecutor fe; - fe.compileFusion(&fusion, {input0, input1}, lparams); - auto outputs = fe.runFusion({input0, input1}, lparams); - - auto tv_ref = input0 + input1; - - testValidate( - &fusion, outputs, {input0, input1}, {tv_ref}, __LINE__, __FILE__); 
-} - -TEST_F(NVFuserTest, FusionInlineBroadcastIndexing0_CUDA) { - Fusion fusion; - FusionGuard fg(&fusion); - - auto tv0 = makeContigTensor(1); - auto tv1 = makeContigTensor(2); - fusion.addInput(tv0); - fusion.addInput(tv1); - auto tv2 = set(tv0); - auto tv3 = broadcast(tv2, {true, false}); - auto tv4 = add(tv3, tv1); - fusion.addOutput(tv4); - - tv4->merge(0); - tv4->split(0, 32); - - tv0->computeAt(tv4, 1); - - tv2->split(-1, 8); - - auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); - at::Tensor t0 = at::randn({123}, options); - at::Tensor t1 = at::randn({3, 123}, options); - - FusionExecutor fe; - fe.compileFusion(&fusion, {t0, t1}); - - auto outputs = fe.runFusion({t0, t1}); - - auto tv_ref = t0 + t1; - - testValidate(&fusion, outputs, {t0, t1}, {tv_ref}, __LINE__, __FILE__); -} - #undef NVFUSER_TEST_CUDA_ARCH_GUARD } // namespace jit diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h index 05a4fd600b653..2d0bada1c0911 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_validator.h @@ -5,8 +5,8 @@ #include #include -#include #include +#include #include @@ -38,7 +38,7 @@ class NVFuserTest : public ::testing::Test { } } - void TearDown() override{ + void TearDown() override { c10::cuda::CUDACachingAllocator::emptyCache(); } }; From abe1f6dc5d11902ca25a27dea91a4e174f72de0d Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 25 Aug 2022 00:20:09 -0700 Subject: [PATCH 06/32] fix --- torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h | 4 ++++ torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h index 7947a27f48360..13d306fa06654 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h @@ -24,6 +24,10 @@ class DomainMap { return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT); } + bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const { + return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE); + } + const ComputeAtMap& getComputeAtMap() const { return ca_map_; } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 5ef502321b773..d9cf61fbcb0a0 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -59,9 +59,10 @@ class DomainMap : public pointwise_utils::DomainMap { } int getPosMappedTo(TensorView* tv, IterDomain* id) const { + std::cout << ca_map_.toString() << std::endl; const auto& dom = tv->domain()->domain(); for (auto i : c10::irange(dom.size())) { - if (areExactMapped(id, tv->axis(i))) { + if (arePermissiveMapped(tv->axis(i), id)) { return i; } } @@ -382,6 +383,9 @@ std::shared_ptr getTransposeHeuristics( auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); auto inner_most_id2 = scheduler_utils::innerMostRootDim(reference2); + fusion->printMath(); + fusion->print(); + auto inner_most_pos1_in_ref1 = domain_map.getPosMappedTo(reference1, inner_most_id1); auto inner_most_pos2_in_ref1 = From bcc6f6cd0fb0945644ddb359f543e63dc21c889f Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 25 Aug 2022 00:25:36 -0700 Subject: [PATCH 07/32] fix --- torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h | 6 
++++++ torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 4 ---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h index 13d306fa06654..8b40a306922ee 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h @@ -21,10 +21,16 @@ class DomainMap { virtual ~DomainMap() = default; bool areExactMapped(IterDomain* id1, IterDomain* id2) const { + if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) { + return false; + } return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT); } bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const { + if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) { + return false; + } return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE); } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index d9cf61fbcb0a0..5ac3f61702a5e 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -59,7 +59,6 @@ class DomainMap : public pointwise_utils::DomainMap { } int getPosMappedTo(TensorView* tv, IterDomain* id) const { - std::cout << ca_map_.toString() << std::endl; const auto& dom = tv->domain()->domain(); for (auto i : c10::irange(dom.size())) { if (arePermissiveMapped(tv->axis(i), id)) { @@ -383,9 +382,6 @@ std::shared_ptr getTransposeHeuristics( auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); auto inner_most_id2 = scheduler_utils::innerMostRootDim(reference2); - fusion->printMath(); - fusion->print(); - auto inner_most_pos1_in_ref1 = domain_map.getPosMappedTo(reference1, inner_most_id1); auto inner_most_pos2_in_ref1 = From 63e3e762905a236c014505abe461211e12f3cfea Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 25 Aug 2022 11:45:19 -0700 Subject: [PATCH 08/32] Allow splitting inner-most ID to create virtual innermost ID --- .../codegen/cuda/scheduler/pointwise_utils.h | 10 +++ .../jit/codegen/cuda/scheduler/transpose.cpp | 72 ++++++++++++++++--- .../codegen/cuda/test/test_gpu_transpose.cpp | 31 ++++++++ 3 files changed, 102 insertions(+), 11 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h index 7947a27f48360..8b40a306922ee 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h @@ -21,9 +21,19 @@ class DomainMap { virtual ~DomainMap() = default; bool areExactMapped(IterDomain* id1, IterDomain* id2) const { + if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) { + return false; + } return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT); } + bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const { + if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) { + return false; + } + return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE); + } + const ComputeAtMap& getComputeAtMap() const { return ca_map_; } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 5ef502321b773..6a0a932481157 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -59,9 +59,42 @@ class DomainMap : public pointwise_utils::DomainMap { } int getPosMappedTo(TensorView* tv, 
IterDomain* id) const { + // Find the root id mapped to `id` + const auto& root_dom = tv->getRootDomain(); + IterDomain* mapped_id = nullptr; + for (auto i : c10::irange(root_dom.size())) { + if (arePermissiveMapped(root_dom[i], id)) { + mapped_id = root_dom[i]; + break; + } + } + TORCH_INTERNAL_ASSERT( + mapped_id != nullptr, + "Can not find ID mapped to ", + id, + " in tensor ", + tv); + // Project the root id to leaf id + while (!mapped_id->uses().empty()) { + TORCH_INTERNAL_ASSERT(mapped_id->uses().size() == 1); + auto expr = mapped_id->uses()[0]; + if (expr->isA()) { + mapped_id = expr->as()->inner(); + } else { + auto merge = expr->as(); + TORCH_INTERNAL_ASSERT( + mapped_id == merge->inner(), + "Can not find ID mapped to ", + id, + " in tensor ", + tv); + mapped_id = merge->out(); + } + } + // Find the position of the leaf id const auto& dom = tv->domain()->domain(); for (auto i : c10::irange(dom.size())) { - if (areExactMapped(id, tv->axis(i))) { + if (dom[i] == mapped_id) { return i; } } @@ -240,22 +273,35 @@ void maybeBuildVirtualInnerDims( // both virtual innermost dim. // 2. The satisfied one did not merge in anything. For example, // T0[I0{1024*1024}, I1{2}] + // If this is the case, this means that we need to split the large + // inner-most dimension to satisfy the small innermost dimension int64_t large_dim; int64_t split_factor; + bool split_inner_most; if (merged_size1 < params.tile_size1) { if (params.dims_merged_with_2.empty()) { // case 2 - return; + split_inner_most = true; + large_dim = inner_most2; + split_factor = params.tile_size2; + } else { + // case 1 + split_inner_most = false; + large_dim = params.dims_merged_with_2.back(); + split_factor = ceilDiv(params.tile_size1, merged_size1); } - large_dim = params.dims_merged_with_2.back(); - split_factor = ceilDiv(params.tile_size1, merged_size1); } else { if (params.dims_merged_with_1.empty()) { // case 2 - return; + split_inner_most = true; + large_dim = inner_most1; + split_factor = params.tile_size1; + } else { + // case 1 + split_inner_most = false; + large_dim = params.dims_merged_with_1.back(); + split_factor = ceilDiv(params.tile_size2, merged_size2); } - large_dim = params.dims_merged_with_1.back(); - split_factor = ceilDiv(params.tile_size2, merged_size2); } params.split_before_tiling.push_back({large_dim, split_factor}); // adjust all dims to after-split @@ -271,12 +317,16 @@ void maybeBuildVirtualInnerDims( } // Give the split-out dim to the unsatisfied one, so that both are satisfied. 
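For the first case above, the split factor is chosen as just enough for the unsatisfied group to reach its tile size once the split-out piece is merged in. A small standalone sketch of that arithmetic, with made-up sizes for illustration (the real code reads them from the reference tensor's shape):

    #include <cassert>
    #include <cstdint>

    int64_t ceilDiv(int64_t a, int64_t b) {
      return (a + b - 1) / b;
    }

    // Case 1 above: the unsatisfied group has only merged_size elements so far,
    // so a dimension borrowed from the other group is split by just enough to
    // close the gap to tile_size.
    int64_t virtualInnerSplitFactor(int64_t merged_size, int64_t tile_size) {
      return ceilDiv(tile_size, merged_size);
    }

    int main() {
      // A group whose innermost dims multiply up to only 4, with a 32-wide tile:
      const int64_t factor =
          virtualInnerSplitFactor(/*merged_size=*/4, /*tile_size=*/32);
      assert(factor == 8);
      // Merging the split-out extent-8 piece with the existing extent-4 piece
      // yields a virtual innermost extent of 32, satisfying the tile.
      assert(factor * 4 == 32);
      return 0;
    }

The branch that follows applies this bookkeeping to whichever of the two groups came up short.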
if (merged_size1 < params.tile_size1) { - params.dims_merged_with_2.pop_back(); - params.dims_merged_with_2.push_back(large_dim + 1); + if (!split_inner_most) { + params.dims_merged_with_2.pop_back(); + params.dims_merged_with_2.push_back(large_dim + 1); + } params.dims_merged_with_1.push_back(large_dim); } else { - params.dims_merged_with_1.pop_back(); - params.dims_merged_with_1.push_back(large_dim + 1); + if (!split_inner_most) { + params.dims_merged_with_1.pop_back(); + params.dims_merged_with_1.push_back(large_dim + 1); + } params.dims_merged_with_2.push_back(large_dim); } } diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp index b9d8e9d294782..d5823c22683c0 100644 --- a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp @@ -932,6 +932,37 @@ TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize3_CUDA) { testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); } +// x->sin->transpose->cos->y +TEST_F(NVFuserTest, FusionScheduleTranspose2DSmallInnerSize_CUDA) { + std::array, 2> shapes{ + std::vector{1024 * 1024 * 128, 2}, + std::vector{2, 1024 * 1024 * 128}}; + for (const auto& shape : shapes) { + Fusion fusion; + FusionGuard fg(&fusion); + + auto tv0 = makeContigTensor(2); + fusion.addInput(tv0); + auto tv1 = sin(tv0); + auto tv2 = transpose(tv1, 0, 1); + auto tv3 = cos(tv2); + fusion.addOutput(tv3); + + auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); + at::Tensor input = at::randn(shape, options); + + auto lparams = scheduleTranspose(&fusion, {input}); + + FusionExecutor fe; + fe.compileFusion(&fusion, {input}, lparams); + auto outputs = fe.runFusion({input}, lparams); + + auto tv_ref = input.sin().transpose(0, 1).cos(); + + testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__); + } +} + } // namespace jit } // namespace torch #endif // #if defined(USE_CUDA) From 5a423a01d7181f883e2c49fb5da6be9674da9f52 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 25 Aug 2022 16:54:03 -0700 Subject: [PATCH 09/32] remove obselete comment --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 6a0a932481157..0db554ebb9849 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -419,12 +419,6 @@ std::shared_ptr getTransposeHeuristics( if (n_elems < device_multiprocessor_count * kMaxTileSize * kMaxTileSize) { params->tile_size1 = 8; params->tile_size2 = 8; - // TODO: I was trying the following but I got silent wrong result - // params->tile_size1 = 8; - // params->tile_size2 = 4; - // This should not happen, because the correctness should be irrevalent to - // schedulers. We don't have to use tile size (8, 4), but we need to fix our - // bug in codegen. 
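The threshold guarding this block, n_elems < device_multiprocessor_count * kMaxTileSize * kMaxTileSize, has the same "one full wave" form that the later patches use to skip virtual inner dims and to reject small problems in canScheduleRunTime. A small worked computation of that threshold; the SM count here is only an example, the scheduler queries the real value from the device properties:

    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t device_multiprocessor_count = 108; // example only
      const int64_t tile_size1 = 32;
      const int64_t tile_size2 = 32;

      // One wave is one 32x32 tile resident per SM.
      const int64_t wave_elements =
          device_multiprocessor_count * tile_size1 * tile_size2;
      std::cout << "one full wave covers " << wave_elements << " elements\n";

      // Problems below this size are the ones treated as "small" above.
      const int64_t n_elems = 64 * 1024;
      std::cout << (n_elems < wave_elements ? "less than one wave\n"
                                            : "at least one wave\n");
      return 0;
    }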
} // Expand inner-most dims to virtual inner-most dims so that the inner-most From 5f266b3ec1e7bdefcede6e7b2597460aaa5fcd76 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 30 Aug 2022 21:54:33 -0700 Subject: [PATCH 10/32] skip innermost split --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index b197924d5d368..9ab09922ac7ec 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -280,10 +280,14 @@ void maybeBuildVirtualInnerDims( bool split_inner_most; if (merged_size1 < params.tile_size1) { if (params.dims_merged_with_2.empty()) { +#if 0 // case 2 split_inner_most = true; large_dim = inner_most2; split_factor = params.tile_size2; +#else + return; +#endif } else { // case 1 split_inner_most = false; @@ -292,10 +296,14 @@ void maybeBuildVirtualInnerDims( } } else { if (params.dims_merged_with_1.empty()) { +#if 0 // case 2 split_inner_most = true; large_dim = inner_most1; split_factor = params.tile_size1; +#else + return; +#endif } else { // case 1 split_inner_most = false; From bd93302ab7bc62adfd40bc4938df7f0dd5520ff0 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 30 Aug 2022 21:55:16 -0700 Subject: [PATCH 11/32] comment --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 9ab09922ac7ec..b43810ec83a73 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -286,6 +286,7 @@ void maybeBuildVirtualInnerDims( large_dim = inner_most2; split_factor = params.tile_size2; #else + // disabled due to indexing error return; #endif } else { @@ -302,6 +303,7 @@ void maybeBuildVirtualInnerDims( large_dim = inner_most1; split_factor = params.tile_size1; #else + // disabled due to indexing error return; #endif } else { From 5933f5381988e84ce0d65131006fe23843c4aa5a Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 30 Aug 2022 22:01:43 -0700 Subject: [PATCH 12/32] cleanup --- torch/csrc/jit/codegen/cuda/utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp index 56e4266a26e97..a79c4d2db83ad 100644 --- a/torch/csrc/jit/codegen/cuda/utils.cpp +++ b/torch/csrc/jit/codegen/cuda/utils.cpp @@ -191,7 +191,7 @@ auto parseEnableOptions() { token, "'\nAvailable options:\n", "\tcomplex, kernel_profile, linear_decomposition,", - "conv_decomposition, transpose_scheduler"); + "conv_decomposition"); } options_view = (end_pos != c10::string_view::npos) ? 
options_view.substr(end_pos + 1) From 7c366b753f2b07e011dfe1a87226b90808d636a4 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:03:39 -0700 Subject: [PATCH 13/32] save --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index b43810ec83a73..abdab77fb2c43 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -536,13 +536,16 @@ std::shared_ptr getTransposeHeuristics( std::cerr << "\n===== Transpose Stats ========\n" << "inputs: " << ir_utils::toString(fusion->inputs()) << "\n" << "outputs: " << ir_utils::toString(fusion->outputs()) << "\n" + << "shape: " << shape_in_ref1 << "\n" << "num_elems: " << n_elems << "\n" << "n_input_tensors: " << n_input_tensors << "\n" << "max_input_dtype_size: " << max_input_dtype_size << "\n" << "group 1: " << ir_utils::toString(grouped_inputs_outputs[0]) << "\n" + << "reference1: " << reference1 << "\n" << "group 2: " << ir_utils::toString(grouped_inputs_outputs[1]) - << std::endl; + << "\n" + << "reference2: " << reference2 << std::endl; if (!params->split_before_tiling.empty() || !params->dims_merged_with_1.empty() || !params->dims_merged_with_2.empty()) { From 2c4f646ad5a97dd6cf65c13282eca48efd08b2c7 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:08:17 -0700 Subject: [PATCH 14/32] save --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index abdab77fb2c43..6ee1546abf27e 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -543,9 +543,11 @@ std::shared_ptr getTransposeHeuristics( << "group 1: " << ir_utils::toString(grouped_inputs_outputs[0]) << "\n" << "reference1: " << reference1 << "\n" + << "inner_most_id1: " << inner_most_id1 << "\n" << "group 2: " << ir_utils::toString(grouped_inputs_outputs[1]) << "\n" - << "reference2: " << reference2 << std::endl; + << "reference2: " << reference2 << "\n" + << "inner_most_id2: " << inner_most_id2 << std::endl; if (!params->split_before_tiling.empty() || !params->dims_merged_with_1.empty() || !params->dims_merged_with_2.empty()) { From 6d5a79fe466b6c69e926c801656e93e8cb394c4b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:43:29 -0700 Subject: [PATCH 15/32] save --- torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index d672b6dc965bd..099f6d6d38c11 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -65,7 +65,7 @@ class TransposeParams : public HeuristicParams { std::stringstream ss; ss << "\n===== Transpose Parameters ========\n" << (tag == "" ? 
"" : "Tag: ") << tag << " Transpose Characteristics:\n" - << " Gridx: " << lparams.gdimx() << " BlckX: " << lparams.bdimx() + << " BlckX: " << lparams.bdimx() << "\n"; ss << " input tile size: " << tile_size1 << "\n"; ss << " output tile size: " << tile_size2 << "\n"; From f9a2d88ba2659a420fa57ccabf113a111aa6f2e9 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:47:12 -0700 Subject: [PATCH 16/32] no small tile --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 6ee1546abf27e..f1e87b2553a20 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -425,12 +425,6 @@ std::shared_ptr getTransposeHeuristics( auto params = std::make_shared("Transpose heuristics"); - // If the problem size is small use small tile sizes. - if (n_elems < device_multiprocessor_count * kMaxTileSize * kMaxTileSize) { - params->tile_size1 = 8; - params->tile_size2 = 8; - } - // Expand inner-most dims to virtual inner-most dims so that the inner-most // dims has at least tile_size elements auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); @@ -957,6 +951,9 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) { InlinePropagator inline_propagator( reference1, -1, ComputeAtMode::MostInlined); entire_dag.traverse(&inline_propagator); + + fusion->printMath(); + fusion->print(); } } // namespace cuda From ade4d229f8134eaae6c18db0b71046ca7ab298da Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Wed, 31 Aug 2022 23:47:54 -0700 Subject: [PATCH 17/32] cleanup --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index f1e87b2553a20..de64b7679100a 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -24,8 +24,6 @@ namespace cuda { namespace { -constexpr int64_t kMaxTileSize = 32; - // DomainMap uses the ComputeAtMap to find a reference TensorView // that maps to all iterDomains in the fusion. 
class DomainMap : public pointwise_utils::DomainMap { From ba9b01bb5132238ef987830a2911eb4f49657b2a Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:03:31 -0700 Subject: [PATCH 18/32] tune tile size --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 3 --- torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index de64b7679100a..d3d787d92ab77 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -949,9 +949,6 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) { InlinePropagator inline_propagator( reference1, -1, ComputeAtMode::MostInlined); entire_dag.traverse(&inline_propagator); - - fusion->printMath(); - fusion->print(); } } // namespace cuda diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index 099f6d6d38c11..aa86a9754b055 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -37,7 +37,7 @@ class TransposeParams : public HeuristicParams { // https://github.com/csarofeen/pytorch/pull/1854#discussion_r928143729 // Tile size for the inner most dim of tensors in the first group - size_t tile_size1 = 32; + size_t tile_size1 = 16; // Tile size for the inner most dim of tensors in the second group size_t tile_size2 = 32; From cddae0bd57c0374b589e495607ae4b8fc6fad395 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:04:44 -0700 Subject: [PATCH 19/32] undo tune --- torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index aa86a9754b055..099f6d6d38c11 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -37,7 +37,7 @@ class TransposeParams : public HeuristicParams { // https://github.com/csarofeen/pytorch/pull/1854#discussion_r928143729 // Tile size for the inner most dim of tensors in the first group - size_t tile_size1 = 16; + size_t tile_size1 = 32; // Tile size for the inner most dim of tensors in the second group size_t tile_size2 = 32; From 3ce9784b433e73e8bfc8878c5c89ab977e1bb569 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:23:13 -0700 Subject: [PATCH 20/32] no virt inner if low occupancy --- .../jit/codegen/cuda/scheduler/transpose.cpp | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index d3d787d92ab77..915d43a92ecfa 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -421,6 +421,11 @@ std::shared_ptr getTransposeHeuristics( shape_in_ref1.push_back(size); } + // maximum unroll factor limited by occupancy + auto max_unroll_factor_occupancy = ceilDiv( + n_elems, + device_multiprocessor_count * params->tile_size1 * params->tile_size2); + auto params = std::make_shared("Transpose heuristics"); // Expand inner-most dims to virtual inner-most dims so that the inner-most @@ -434,8 +439,15 @@ std::shared_ptr 
getTransposeHeuristics( domain_map.getPosMappedTo(reference1, inner_most_id2); // See note [Supporting small transpose dimensions] - maybeBuildVirtualInnerDims( - *params, shape_in_ref1, inner_most_pos1_in_ref1, inner_most_pos2_in_ref1); + if (max_unroll_factor_occupancy > 1) { + // if creating virtual inner dims could not help us get better occupancy, + // then don't do it + maybeBuildVirtualInnerDims( + *params, + shape_in_ref1, + inner_most_pos1_in_ref1, + inner_most_pos2_in_ref1); + } // Note [vectorization and unroll of input and output] // @@ -487,9 +499,6 @@ std::shared_ptr getTransposeHeuristics( (int64_t)1)); // Don't unroll at the cost of getting a full wave on the GPU - auto max_unroll_factor_occupancy = ceilDiv( - n_elems, - device_multiprocessor_count * params->tile_size1 * params->tile_size2); max_unroll_factor = std::min(max_unroll_factor, max_unroll_factor_occupancy); // Don't unroll at the cost of getting a full warp, useful for the case where From 5fb2b58716f5420c602c46a87530dcfd0810a2a6 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:23:34 -0700 Subject: [PATCH 21/32] fix --- torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 915d43a92ecfa..82dd98c970334 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -421,13 +421,13 @@ std::shared_ptr getTransposeHeuristics( shape_in_ref1.push_back(size); } + auto params = std::make_shared("Transpose heuristics"); + // maximum unroll factor limited by occupancy auto max_unroll_factor_occupancy = ceilDiv( n_elems, device_multiprocessor_count * params->tile_size1 * params->tile_size2); - auto params = std::make_shared("Transpose heuristics"); - // Expand inner-most dims to virtual inner-most dims so that the inner-most // dims has at least tile_size elements auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); From 41483a9a868b2f2bedc171254e8a16ca88e5658d Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 1 Sep 2022 00:34:18 -0700 Subject: [PATCH 22/32] if one full wave can handle all elements, don't create virtual inner dims --- .../jit/codegen/cuda/scheduler/transpose.cpp | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 82dd98c970334..a215a39753209 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -209,12 +209,26 @@ class DomainMap : public pointwise_utils::DomainMap { // T0[I0*I1o*I5*I6{1024*1024/4*8}, I1i*I2*I3*I4{32}] void maybeBuildVirtualInnerDims( TransposeParams& params, + int64_t device_multiprocessor_count, + int64_t n_elems, const std::vector& shape_in_ref1, int64_t inner_most1, int64_t inner_most2) { int64_t merged_size1 = shape_in_ref1[inner_most1]; int64_t merged_size2 = shape_in_ref1[inner_most2]; + int64_t actual_tile_size1 = + std::min(merged_size1, params.tile_size1); + int64_t actual_tile_size2 = + std::min(merged_size2, params.tile_size2); + int64_t wave_elements = + device_multiprocessor_count * actual_tile_size1 * actual_tile_size2; + + if (wave_elements >= n_elems) { + // if one full wave can handle all elements, don't create virtual inner dims + return; + } + // merge inner_most1 and inner_most2 left 
until we are done or we can no // longer do so int64_t dim = inner_most1 - 1; @@ -423,11 +437,6 @@ std::shared_ptr getTransposeHeuristics( auto params = std::make_shared("Transpose heuristics"); - // maximum unroll factor limited by occupancy - auto max_unroll_factor_occupancy = ceilDiv( - n_elems, - device_multiprocessor_count * params->tile_size1 * params->tile_size2); - // Expand inner-most dims to virtual inner-most dims so that the inner-most // dims has at least tile_size elements auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); @@ -439,15 +448,13 @@ std::shared_ptr getTransposeHeuristics( domain_map.getPosMappedTo(reference1, inner_most_id2); // See note [Supporting small transpose dimensions] - if (max_unroll_factor_occupancy > 1) { - // if creating virtual inner dims could not help us get better occupancy, - // then don't do it - maybeBuildVirtualInnerDims( - *params, - shape_in_ref1, - inner_most_pos1_in_ref1, - inner_most_pos2_in_ref1); - } + maybeBuildVirtualInnerDims( + *params, + device_multiprocessor_count, + n_elems, + shape_in_ref1, + inner_most_pos1_in_ref1, + inner_most_pos2_in_ref1); // Note [vectorization and unroll of input and output] // @@ -499,6 +506,9 @@ std::shared_ptr getTransposeHeuristics( (int64_t)1)); // Don't unroll at the cost of getting a full wave on the GPU + auto max_unroll_factor_occupancy = ceilDiv( + n_elems, + device_multiprocessor_count * params->tile_size1 * params->tile_size2); max_unroll_factor = std::min(max_unroll_factor, max_unroll_factor_occupancy); // Don't unroll at the cost of getting a full warp, useful for the case where From 21a789625eb8fe7062858592549b948c75998171 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Tue, 6 Sep 2022 19:09:55 -0700 Subject: [PATCH 23/32] reject < 1 wave on runtime --- .../jit/codegen/cuda/scheduler/registry.cpp | 12 +++ .../jit/codegen/cuda/scheduler/transpose.cpp | 94 +++++++++++++++---- .../jit/codegen/cuda/scheduler/transpose.h | 5 + 3 files changed, 91 insertions(+), 20 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 570307d7d49e1..34c6dc9ef87cb 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1267,6 +1267,18 @@ class TransposeScheduler : public SchedulerEntry { Fusion* fusion, SchedulerRuntimeInfo& runtime_info, HeuristicSummary* data_cache = nullptr) { + FUSER_PERF_SCOPE("TransposeScheduler::canScheduleRunTime"); + + const int64_t device_multiprocessor_count = + (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + auto n_elems = getShape(fusion, data_cache, runtime_info).second; + if (device_multiprocessor_count * 32 * 32 > n_elems) { + // don't schedule with transpose scheduler if less than a full wave + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, + "Transpose scheduler does not perform well on small problem sizes."); + return false; + } return true; } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index c05a2d86dabf5..b71da876e7092 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -370,35 +370,38 @@ std::shared_ptr getTransposeHeuristics( return getTransposeHeuristics(fusion, runtime_info, data_cache); } -std::shared_ptr getTransposeHeuristics( - Fusion* fusion, - SchedulerRuntimeInfo& runtime_info, - HeuristicSummary* 
data_cache) { - FUSER_PERF_SCOPE("getTransposeHeuristics"); - - FusionGuard fg(fusion); - - // Incase any buffer is of type DataType::Index - DataType index_type = indexModeToDtype(runtime_info.getIndexMode()); - +HeuristicSummaryEntry getDomainMap( + HeuristicSummary* data_cache, + Fusion* fusion) { auto domain_map_entry = HeuristicSummaryEntry( data_cache, [fusion]() { return std::make_unique(fusion); }); - const auto& domain_map = dynamic_cast(domain_map_entry.get()); + return domain_map_entry; +} +HeuristicSummaryEntry +getInputsOutputsGroups(HeuristicSummary* data_cache, DomainMap& domain_map) { auto grouped_inputs_outputs_entry = HeuristicSummaryEntry( data_cache, [&domain_map]() { return std::make_unique>>( domain_map.groupInputsOutputsByInnerDim()); }); - auto grouped_inputs_outputs = grouped_inputs_outputs_entry.get(); + auto& grouped_inputs_outputs = grouped_inputs_outputs_entry.get(); TORCH_INTERNAL_ASSERT( grouped_inputs_outputs.size() >= 2, "Can not find mismatched inner most dim, should use pointwise scheduler."); + return grouped_inputs_outputs_entry; +} + +HeuristicSummaryEntry +getReferenceTensors( + HeuristicSummary* data_cache, + DomainMap& domain_map, + std::vector>& grouped_inputs_outputs) { auto reference_tensors_entry = HeuristicSummaryEntry( data_cache, [&domain_map, &grouped_inputs_outputs]() { @@ -415,13 +418,17 @@ std::shared_ptr getTransposeHeuristics( reference1 != nullptr, "Unable to find reference tensor for group 1"); TORCH_INTERNAL_ASSERT( reference2 != nullptr, "Unable to find reference tensor for group 2"); + return reference_tensors_entry; +} - const int64_t device_multiprocessor_count = - (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - - auto ref_root = reference1->getMaybeRFactorDomain(); - std::vector shape_in_ref1; - shape_in_ref1.reserve(reference1->nDims()); +std::pair, int64_t> getShapeInReference( + HeuristicSummary* data_cache, + SchedulerRuntimeInfo& runtime_info, + TensorView* reference, + DomainMap& domain_map) { + auto ref_root = reference->getMaybeRFactorDomain(); + std::vector shape_in_ref; + shape_in_ref.reserve(reference->nDims()); int64_t n_elems = 1; for (size_t ref_i = 0; ref_i < ref_root.size(); ref_i++) { auto id = ref_root[ref_i]; @@ -435,8 +442,55 @@ std::shared_ptr getTransposeHeuristics( ref_root[ref_i]->extent()->toInlineString()); int64_t size = inferred_val->as(); n_elems *= size; - shape_in_ref1.push_back(size); + shape_in_ref.push_back(size); } + return {shape_in_ref, n_elems}; +} + +std::pair, int64_t> getShape( + Fusion* fusion, + HeuristicSummary* data_cache, + SchedulerRuntimeInfo& runtime_info) { + auto domain_map_entry = getDomainMap(data_cache, fusion); + auto& domain_map = dynamic_cast(domain_map_entry.get()); + auto grouped_inputs_outputs_entry = + getInputsOutputsGroups(data_cache, domain_map); + auto grouped_inputs_outputs = grouped_inputs_outputs_entry.get(); + auto reference_tensors_entry = + getReferenceTensors(data_cache, domain_map, grouped_inputs_outputs); + auto reference_tensors = reference_tensors_entry.get(); + TensorView* reference1 = reference_tensors[0]; + return getShapeInReference(data_cache, runtime_info, reference1, domain_map); +} + +std::shared_ptr getTransposeHeuristics( + Fusion* fusion, + SchedulerRuntimeInfo& runtime_info, + HeuristicSummary* data_cache) { + FUSER_PERF_SCOPE("getTransposeHeuristics"); + + FusionGuard fg(fusion); + + // Incase any buffer is of type DataType::Index + DataType index_type = indexModeToDtype(runtime_info.getIndexMode()); + + auto 
domain_map_entry = getDomainMap(data_cache, fusion); + auto& domain_map = dynamic_cast(domain_map_entry.get()); + auto grouped_inputs_outputs_entry = + getInputsOutputsGroups(data_cache, domain_map); + auto grouped_inputs_outputs = grouped_inputs_outputs_entry.get(); + auto reference_tensors_entry = + getReferenceTensors(data_cache, domain_map, grouped_inputs_outputs); + auto reference_tensors = reference_tensors_entry.get(); + TensorView* reference1 = reference_tensors[0]; + TensorView* reference2 = reference_tensors[1]; + auto pair = + getShapeInReference(data_cache, runtime_info, reference1, domain_map); + auto& shape_in_ref1 = pair.first; + auto& n_elems = pair.second; + + const int64_t device_multiprocessor_count = + (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; auto params = std::make_shared("Transpose heuristics"); diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h index 0cf6920ea058b..ad104d44e9835 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h @@ -100,6 +100,11 @@ TORCH_CUDA_CU_API LaunchParams scheduleTranspose( //! groups, each with a fully broadcasted reference tensor. TORCH_CUDA_CU_API bool hasAtLeastTwoValidGroups(Fusion* fusion); +TORCH_CUDA_CU_API std::pair, int64_t> getShape( + Fusion* fusion, + HeuristicSummary* data_cache, + SchedulerRuntimeInfo& runtime_info); + } // namespace cuda } // namespace fuser } // namespace jit From f8551d76d0706d095159a0e9d642e3d0651164b9 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 8 Sep 2022 15:19:20 -0700 Subject: [PATCH 24/32] cache inner most positions --- .../cuda/scheduler/compile_time_info.h | 13 +++- .../jit/codegen/cuda/scheduler/registry.cpp | 5 +- .../jit/codegen/cuda/scheduler/transpose.cpp | 68 +++++++++++++------ 3 files changed, 62 insertions(+), 24 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h index c43ef64eac0a3..86d906c4747a7 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h @@ -33,7 +33,8 @@ enum class CompileTimeEntryType { REDUCTION_TVS, PERSISTENT_BUFFER_INFO, SCOPE_PERSISTENT_FACTOR_INFO, - BROADCAST_BYTE_MULTIPLES + BROADCAST_BYTE_MULTIPLES, + INNER_MOST_DIMS_INFO, }; //! Entry type definition class for `DOMAIN_MAP`, @@ -99,6 +100,16 @@ class PersistentBufferInfo { CompileTimeEntryType::PERSISTENT_BUFFER_INFO; }; +//! Entry type definition class for `INNER_MOST_DIMS_INFO`, +//! Used in the transpose scheduler to store inner most IterDomains and their +//! position in reference1 of group 1 and group 2 +class InnerMostDimInfo { + public: + using DataType = std::vector; + static const CompileTimeEntryType EntryType = + CompileTimeEntryType::INNER_MOST_DIMS_INFO; +}; + //! Auxiliary data types for `SCOPE_PERSISTENT_FACTOR_INFO` entry type. 
using ScopedPersistenceBufferMap = std::unordered_map>; diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 34c6dc9ef87cb..90b859f16ae42 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1271,7 +1271,9 @@ class TransposeScheduler : public SchedulerEntry { const int64_t device_multiprocessor_count = (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - auto n_elems = getShape(fusion, data_cache, runtime_info).second; + auto pair = getShape(fusion, data_cache, runtime_info); + auto shape = pair.first; + auto n_elems = pair.second; if (device_multiprocessor_count * 32 * 32 > n_elems) { // don't schedule with transpose scheduler if less than a full wave scheduler_debug_utils::canScheduleRejectReason( @@ -1572,6 +1574,7 @@ template class HeuristicSummaryEntry< template class HeuristicSummaryEntry< HeuristicCompileTime::ScopePersistentFactorInfo>; template class HeuristicSummaryEntry; +template class HeuristicSummaryEntry; } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index b71da876e7092..1d07ea1858197 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -358,18 +358,6 @@ void maybeBuildVirtualInnerDims( } // namespace -bool hasAtLeastTwoValidGroups(Fusion* fusion) { - return DomainMap::hasAtLeastTwoValidGroups(fusion); -} - -std::shared_ptr getTransposeHeuristics( - Fusion* fusion, - const at::ArrayRef& runtime_inputs, - HeuristicSummary* data_cache) { - SchedulerRuntimeInfo runtime_info(fusion, runtime_inputs, true); - return getTransposeHeuristics(fusion, runtime_info, data_cache); -} - HeuristicSummaryEntry getDomainMap( HeuristicSummary* data_cache, Fusion* fusion) { @@ -447,6 +435,29 @@ std::pair, int64_t> getShapeInReference( return {shape_in_ref, n_elems}; } +HeuristicSummaryEntry +getInnerMostDimInfoInReference( + HeuristicSummary* data_cache, + const std::vector& group_references, + TensorView* global_reference, + DomainMap& domain_map) { + auto innermost_info_entry = + HeuristicSummaryEntry( + data_cache, [&]() { + std::vector data; + data.reserve(group_references.size()); + for (auto ref_tv : group_references) { + auto inner_most_id = scheduler_utils::innerMostRootDim(ref_tv); + auto inner_most_pos_in_global_ref = + domain_map.getInnerLeafDim(global_reference, inner_most_id); + data.emplace_back(inner_most_pos_in_global_ref); + } + return std::make_unique>(std::move(data)); + ; + }); + return innermost_info_entry; +} + std::pair, int64_t> getShape( Fusion* fusion, HeuristicSummary* data_cache, @@ -463,6 +474,18 @@ std::pair, int64_t> getShape( return getShapeInReference(data_cache, runtime_info, reference1, domain_map); } +bool hasAtLeastTwoValidGroups(Fusion* fusion) { + return DomainMap::hasAtLeastTwoValidGroups(fusion); +} + +std::shared_ptr getTransposeHeuristics( + Fusion* fusion, + const at::ArrayRef& runtime_inputs, + HeuristicSummary* data_cache) { + SchedulerRuntimeInfo runtime_info(fusion, runtime_inputs, true); + return getTransposeHeuristics(fusion, runtime_info, data_cache); +} + std::shared_ptr getTransposeHeuristics( Fusion* fusion, SchedulerRuntimeInfo& runtime_info, @@ -492,18 +515,17 @@ std::shared_ptr getTransposeHeuristics( const int64_t device_multiprocessor_count = 
(int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); + + auto inner_most_pos1_in_ref1 = innermost_info[0]; + auto inner_most_pos2_in_ref1 = innermost_info[1]; + auto params = std::make_shared("Transpose heuristics"); // Expand inner-most dims to virtual inner-most dims so that the inner-most // dims has at least tile_size elements - auto inner_most_id1 = scheduler_utils::innerMostRootDim(reference1); - auto inner_most_id2 = scheduler_utils::innerMostRootDim(reference2); - - auto inner_most_pos1_in_ref1 = - domain_map.getInnerLeafDim(reference1, inner_most_id1); - auto inner_most_pos2_in_ref1 = - domain_map.getInnerLeafDim(reference1, inner_most_id2); - // See note [Supporting small transpose dimensions] maybeBuildVirtualInnerDims( *params, @@ -611,11 +633,13 @@ std::shared_ptr getTransposeHeuristics( << "group 1: " << ir_utils::toString(grouped_inputs_outputs[0]) << "\n" << "reference1: " << reference1 << "\n" - << "inner_most_id1: " << inner_most_id1 << "\n" + << "inner_most_id1 position: " << inner_most_pos1_in_ref1 + << " (in reference 1)\n" << "group 2: " << ir_utils::toString(grouped_inputs_outputs[1]) << "\n" << "reference2: " << reference2 << "\n" - << "inner_most_id2: " << inner_most_id2 << std::endl; + << "inner_most_id2 position: " << inner_most_pos2_in_ref1 + << " (in reference 1)" << std::endl; if (!params->split_before_tiling.empty() || !params->dims_merged_with_1.empty() || !params->dims_merged_with_2.empty()) { From 53690a0f2ffa7494a15ee37fa1ad42f008d6b5e9 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 8 Sep 2022 16:28:04 -0700 Subject: [PATCH 25/32] issue id --- .../jit/codegen/cuda/scheduler/registry.cpp | 12 +-- .../jit/codegen/cuda/scheduler/transpose.cpp | 78 +++++++++++++++++-- .../jit/codegen/cuda/scheduler/transpose.h | 6 +- .../cuda/scheduler/transpose_heuristic.h | 8 +- 4 files changed, 86 insertions(+), 18 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 90b859f16ae42..a43fed67103e8 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1269,16 +1269,10 @@ class TransposeScheduler : public SchedulerEntry { HeuristicSummary* data_cache = nullptr) { FUSER_PERF_SCOPE("TransposeScheduler::canScheduleRunTime"); - const int64_t device_multiprocessor_count = - (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - auto pair = getShape(fusion, data_cache, runtime_info); - auto shape = pair.first; - auto n_elems = pair.second; - if (device_multiprocessor_count * 32 * 32 > n_elems) { - // don't schedule with transpose scheduler if less than a full wave + auto reason = getRuntimeRejectReason(fusion, data_cache, runtime_info); + if (!reason.empty()) { scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, - "Transpose scheduler does not perform well on small problem sizes."); + ScheduleHeuristic::Transpose, reason); return false; } return true; diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 1d07ea1858197..03c7f55742c13 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -293,7 +293,8 @@ void maybeBuildVirtualInnerDims( bool split_inner_most; 
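  // For intuition (hypothetical shapes, for illustration only; not taken from
  // this patch): with the default 32x32 tile and a reference shape of
  // [1024, 8, 4] whose inner-most dim for one group is the trailing 4,
  // merged_size starts below the tile size, so the neighboring dim of 8 is
  // merged in to form a virtual inner-most dim of 8 * 4 = 32 that fills a
  // full tile. When merging alone cannot reach the tile size, the
  // SUPPORT_SPLITTING_INNERMOST_DIM path below would instead split a large
  // dim to supply the missing elements; that path is compiled out for now
  // because of the indexing issue referenced in the hunk
  // (https://github.com/csarofeen/pytorch/issues/1964).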
if (merged_size1 < params.tile_size1) { if (params.dims_merged_with_2.empty()) { -#if 0 +#if SUPPORT_SPLITTING_INNERMOST_DIM + // https://github.com/csarofeen/pytorch/issues/1964 // case 2 split_inner_most = true; large_dim = inner_most2; @@ -311,7 +312,8 @@ void maybeBuildVirtualInnerDims( } } else { if (params.dims_merged_with_1.empty()) { -#if 0 +#if SUPPORT_SPLITTING_INNERMOST_DIM + // https://github.com/csarofeen/pytorch/issues/1964 // case 2 split_inner_most = true; large_dim = inner_most1; @@ -356,8 +358,6 @@ void maybeBuildVirtualInnerDims( } } -} // namespace - HeuristicSummaryEntry getDomainMap( HeuristicSummary* data_cache, Fusion* fusion) { @@ -458,7 +458,9 @@ getInnerMostDimInfoInReference( return innermost_info_entry; } -std::pair, int64_t> getShape( +} // namespace + +std::string getRuntimeRejectReason( Fusion* fusion, HeuristicSummary* data_cache, SchedulerRuntimeInfo& runtime_info) { @@ -471,7 +473,71 @@ std::pair, int64_t> getShape( getReferenceTensors(data_cache, domain_map, grouped_inputs_outputs); auto reference_tensors = reference_tensors_entry.get(); TensorView* reference1 = reference_tensors[0]; - return getShapeInReference(data_cache, runtime_info, reference1, domain_map); + + auto pair = + getShapeInReference(data_cache, runtime_info, reference1, domain_map); + auto& shape_in_ref1 = pair.first; + auto& n_elems = pair.second; + + constexpr size_t default_tile_elements = + TransposeParams::getDefaultTileSize() * + TransposeParams::getDefaultTileSize(); + + // don't schedule with transpose scheduler if less than a full wave + const int64_t device_multiprocessor_count = + (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + auto elements_per_wave = device_multiprocessor_count * default_tile_elements; + if (elements_per_wave > n_elems) { + return "Transpose scheduler does not perform well on small problem sizes."; + } + + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); + + auto inner_most_pos1_in_ref1 = innermost_info[0]; + auto inner_most_pos2_in_ref1 = innermost_info[1]; + + auto inner_size1 = shape_in_ref1[inner_most_pos1_in_ref1]; + auto inner_size2 = shape_in_ref1[inner_most_pos2_in_ref1]; + + // For cases like + // transpose(T0[1000000000, 2, 2], 1, 2) + // the pointwise scheduler should provide better performance, because it + // provides coalesced memory access + if (inner_size1 * inner_size2 < default_tile_elements) { + auto inner_elements = inner_size1 * inner_size2; + for (int64_t i = inner_most_pos2_in_ref1 + 1; i < inner_most_pos1_in_ref1; + i++) { + inner_elements *= shape_in_ref1[i]; + } + // note that the algorithm here is only an approximation because it only + // checks reference1. In principle, we need to check all inputs and outputs + // to get an accurate result, but that is too much work. I think checking + // only reference 1 is fine for now. 
Below is an example where the + // approximation here will not work: + // T0[10000000, 2, 3] (reference 1) + // T1[2, 10000000, 3] input/output + // T2[2, 10000000, 3] input/output + // T3[2, 10000000, 3] input/output + // T4[3, 10000000, 2] input/output + // T5[3, 10000000, 2] input/output + if (inner_elements < default_tile_elements) { + return "Inner transpose of small dimensions should be scheduled by the " + "pointwise scheduler because it provides better memory coalescing"; + } + } + +#if !SUPPORT_SPLITTING_INNERMOST_DIM + if (n_elems / inner_size1 < TransposeParams::getDefaultTileSize() || + n_elems / inner_size2 < TransposeParams::getDefaultTileSize()) { + return "Splitting of inner most dim for the creation of virtual inner most dim " + "is disabled due to indexing bug, skipping this case at runtime for now" + "See: https://github.com/csarofeen/pytorch/issues/1964"; + } +#endif + + return ""; } bool hasAtLeastTwoValidGroups(Fusion* fusion) { diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h index ad104d44e9835..abf141846555d 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h @@ -5,6 +5,8 @@ #include #include +#define SUPPORT_SPLITTING_INNERMOST_DIM 0 + namespace torch { namespace jit { namespace fuser { @@ -100,7 +102,9 @@ TORCH_CUDA_CU_API LaunchParams scheduleTranspose( //! groups, each with a fully broadcasted reference tensor. TORCH_CUDA_CU_API bool hasAtLeastTwoValidGroups(Fusion* fusion); -TORCH_CUDA_CU_API std::pair, int64_t> getShape( +// If can schedule at runtime, returns empty string, otherwise returns the +// reason why we should not schedule at runtime. +TORCH_CUDA_CU_API std::string getRuntimeRejectReason( Fusion* fusion, HeuristicSummary* data_cache, SchedulerRuntimeInfo& runtime_info); diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index 099f6d6d38c11..07be8dfb03a3d 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -21,6 +21,10 @@ class TransposeParams : public HeuristicParams { return 128; } + static constexpr size_t getDefaultTileSize() { + return 32; + } + // See note [Supporting small transpose dimensions], all dims are positions in // reference1 std::vector> split_before_tiling = {}; @@ -37,10 +41,10 @@ class TransposeParams : public HeuristicParams { // https://github.com/csarofeen/pytorch/pull/1854#discussion_r928143729 // Tile size for the inner most dim of tensors in the first group - size_t tile_size1 = 32; + size_t tile_size1 = getDefaultTileSize(); // Tile size for the inner most dim of tensors in the second group - size_t tile_size2 = 32; + size_t tile_size2 = getDefaultTileSize(); using HeuristicParams::HeuristicParams; From 44f781019130a3cc1bc9503c465bec63400d076f Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 8 Sep 2022 16:52:31 -0700 Subject: [PATCH 26/32] lint --- torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h index 07be8dfb03a3d..5e56278a7f16b 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose_heuristic.h @@ -69,8 +69,7 @@ class TransposeParams : public 
HeuristicParams { std::stringstream ss; ss << "\n===== Transpose Parameters ========\n" << (tag == "" ? "" : "Tag: ") << tag << " Transpose Characteristics:\n" - << " BlckX: " << lparams.bdimx() - << "\n"; + << " BlckX: " << lparams.bdimx() << "\n"; ss << " input tile size: " << tile_size1 << "\n"; ss << " output tile size: " << tile_size2 << "\n"; int elements_per_tile = tile_size1 * tile_size2; From ea0d1fffb099abd4f33c81f0d720e4b63c1316c3 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 14:28:11 -0700 Subject: [PATCH 27/32] maybe_tune --- .../jit/codegen/cuda/scheduler/pointwise.cpp | 20 ++++++++++--------- .../jit/codegen/cuda/scheduler/pointwise.h | 2 ++ .../jit/codegen/cuda/scheduler/transpose.cpp | 20 +++++++++---------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp index d404ab622a5c7..e298fda6893d4 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp @@ -26,7 +26,6 @@ namespace cuda { namespace { // constexpr int64_t x_grid_limit = ((int64_t)1 << (int64_t)31) - (int64_t)1; // Unused at the moment, commenting for clang tidy -constexpr int64_t kThreadX = 128; class DomainMap : public pointwise_utils::DomainMap { public: @@ -174,11 +173,12 @@ std::shared_ptr getPointwiseHeuristics( (int64_t)1)); // Don't unroll at the cost of getting a full wave on the GPU - if (n_elems < device_multiprocessor_count * kThreadX && + if (n_elems < device_multiprocessor_count * kPointwiseSchedulerThreadX && max_unroll_factor > 1) { max_unroll_factor = std::min( max_unroll_factor, - ceilDiv(n_elems, device_multiprocessor_count * kThreadX)); + ceilDiv( + n_elems, device_multiprocessor_count * kPointwiseSchedulerThreadX)); } auto params = std::make_shared("Pointwise heuristics"); @@ -213,7 +213,7 @@ std::shared_ptr getPointwiseHeuristics( // right) int64_t right_elem_count = 0; - int64_t bdimx = kThreadX; + int64_t bdimx = kPointwiseSchedulerThreadX; // bdimy may be used if the right side of the break point is not large and we // need to expand block level parallelism into the left side of the break @@ -262,7 +262,8 @@ std::shared_ptr getPointwiseHeuristics( } // If there isn't very much parallelism available, just use 1D scheduler - if (n_elems * 2 > device_multiprocessor_count * kThreadX) { + if (n_elems * 2 > + device_multiprocessor_count * kPointwiseSchedulerThreadX) { int64_t min_total_transfer = std::numeric_limits::max(); for (const auto break_point_i : c10::irange(ref_root.size())) { @@ -324,13 +325,14 @@ std::shared_ptr getPointwiseHeuristics( } // Min transfer found, start setting values bdimx = std::min( - ceilDiv(cur_right_elem_count, max_unroll_factor), kThreadX); + ceilDiv(cur_right_elem_count, max_unroll_factor), + kPointwiseSchedulerThreadX); bdimy = 1; gdim_right = 1; // Put remainder in bdimy if there's at least a wave of grid level // parallelism. 
if (cur_left_elem_count > device_multiprocessor_count) { - bdimy = kThreadX / bdimx; + bdimy = kPointwiseSchedulerThreadX / bdimx; } auto remainder_left = ceilDiv(cur_left_elem_count, bdimy); auto remainder_right = @@ -644,7 +646,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { // Unswitch reference_tv->split(0, 1); // Threads - reference_tv->split(0, kThreadX); + reference_tv->split(0, kPointwiseSchedulerThreadX); reference_tv->axis(0)->parallelize(ParallelType::BIDx); reference_tv->axis(1)->parallelize(ParallelType::TIDx); @@ -658,7 +660,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { //[BIDx, Unswitch, Vectorization, TIDx] } else { // Threads - reference_tv->split(0, kThreadX); + reference_tv->split(0, kPointwiseSchedulerThreadX); // Unroll reference_tv->split(0, params.unroll_factor); // Unswitch diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h index 6cba29cd6b4b9..aee470c3ba12f 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h @@ -10,6 +10,8 @@ namespace jit { namespace fuser { namespace cuda { +constexpr int64_t kPointwiseSchedulerThreadX = 128; + class SchedulerRuntimeInfo; class HeuristicSummary; diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 03c7f55742c13..963f37465166c 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -479,21 +479,21 @@ std::string getRuntimeRejectReason( auto& shape_in_ref1 = pair.first; auto& n_elems = pair.second; - constexpr size_t default_tile_elements = - TransposeParams::getDefaultTileSize() * - TransposeParams::getDefaultTileSize(); + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); // don't schedule with transpose scheduler if less than a full wave const int64_t device_multiprocessor_count = (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - auto elements_per_wave = device_multiprocessor_count * default_tile_elements; - if (elements_per_wave > n_elems) { + auto pointwise_elements_per_wave = + device_multiprocessor_count * kPointwiseSchedulerThreadX; + if (pointwise_elements_per_wave > n_elems) { return "Transpose scheduler does not perform well on small problem sizes."; } - auto innermost_info_entry = getInnerMostDimInfoInReference( - data_cache, reference_tensors, reference1, domain_map); - auto innermost_info = innermost_info_entry.get(); + auto max_tile_elements = TransposeParams::getDefaultTileSize() * + TransposeParams::getDefaultTileSize(); auto inner_most_pos1_in_ref1 = innermost_info[0]; auto inner_most_pos2_in_ref1 = innermost_info[1]; @@ -505,7 +505,7 @@ std::string getRuntimeRejectReason( // transpose(T0[1000000000, 2, 2], 1, 2) // the pointwise scheduler should provide better performance, because it // provides coalesced memory access - if (inner_size1 * inner_size2 < default_tile_elements) { + if (inner_size1 * inner_size2 < max_tile_elements) { auto inner_elements = inner_size1 * inner_size2; for (int64_t i = inner_most_pos2_in_ref1 + 1; i < inner_most_pos1_in_ref1; i++) { @@ -522,7 +522,7 @@ std::string getRuntimeRejectReason( // T3[2, 10000000, 3] input/output // T4[3, 10000000, 2] input/output // T5[3, 10000000, 2] input/output - if (inner_elements < 
default_tile_elements) { + if (inner_elements < max_tile_elements) { return "Inner transpose of small dimensions should be scheduled by the " "pointwise scheduler because it provides better memory coalescing"; } From 5920c62adaf125edaa8922dab56a3852579dbec7 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 14:28:27 -0700 Subject: [PATCH 28/32] Revert "maybe_tune" This reverts commit ea0d1fffb099abd4f33c81f0d720e4b63c1316c3. --- .../jit/codegen/cuda/scheduler/pointwise.cpp | 20 +++++++++---------- .../jit/codegen/cuda/scheduler/pointwise.h | 2 -- .../jit/codegen/cuda/scheduler/transpose.cpp | 20 +++++++++---------- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp index e298fda6893d4..d404ab622a5c7 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp @@ -26,6 +26,7 @@ namespace cuda { namespace { // constexpr int64_t x_grid_limit = ((int64_t)1 << (int64_t)31) - (int64_t)1; // Unused at the moment, commenting for clang tidy +constexpr int64_t kThreadX = 128; class DomainMap : public pointwise_utils::DomainMap { public: @@ -173,12 +174,11 @@ std::shared_ptr getPointwiseHeuristics( (int64_t)1)); // Don't unroll at the cost of getting a full wave on the GPU - if (n_elems < device_multiprocessor_count * kPointwiseSchedulerThreadX && + if (n_elems < device_multiprocessor_count * kThreadX && max_unroll_factor > 1) { max_unroll_factor = std::min( max_unroll_factor, - ceilDiv( - n_elems, device_multiprocessor_count * kPointwiseSchedulerThreadX)); + ceilDiv(n_elems, device_multiprocessor_count * kThreadX)); } auto params = std::make_shared("Pointwise heuristics"); @@ -213,7 +213,7 @@ std::shared_ptr getPointwiseHeuristics( // right) int64_t right_elem_count = 0; - int64_t bdimx = kPointwiseSchedulerThreadX; + int64_t bdimx = kThreadX; // bdimy may be used if the right side of the break point is not large and we // need to expand block level parallelism into the left side of the break @@ -262,8 +262,7 @@ std::shared_ptr getPointwiseHeuristics( } // If there isn't very much parallelism available, just use 1D scheduler - if (n_elems * 2 > - device_multiprocessor_count * kPointwiseSchedulerThreadX) { + if (n_elems * 2 > device_multiprocessor_count * kThreadX) { int64_t min_total_transfer = std::numeric_limits::max(); for (const auto break_point_i : c10::irange(ref_root.size())) { @@ -325,14 +324,13 @@ std::shared_ptr getPointwiseHeuristics( } // Min transfer found, start setting values bdimx = std::min( - ceilDiv(cur_right_elem_count, max_unroll_factor), - kPointwiseSchedulerThreadX); + ceilDiv(cur_right_elem_count, max_unroll_factor), kThreadX); bdimy = 1; gdim_right = 1; // Put remainder in bdimy if there's at least a wave of grid level // parallelism. 
if (cur_left_elem_count > device_multiprocessor_count) { - bdimy = kPointwiseSchedulerThreadX / bdimx; + bdimy = kThreadX / bdimx; } auto remainder_left = ceilDiv(cur_left_elem_count, bdimy); auto remainder_right = @@ -646,7 +644,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { // Unswitch reference_tv->split(0, 1); // Threads - reference_tv->split(0, kPointwiseSchedulerThreadX); + reference_tv->split(0, kThreadX); reference_tv->axis(0)->parallelize(ParallelType::BIDx); reference_tv->axis(1)->parallelize(ParallelType::TIDx); @@ -660,7 +658,7 @@ void schedulePointwise(Fusion* fusion, const PointwiseParams& params) { //[BIDx, Unswitch, Vectorization, TIDx] } else { // Threads - reference_tv->split(0, kPointwiseSchedulerThreadX); + reference_tv->split(0, kThreadX); // Unroll reference_tv->split(0, params.unroll_factor); // Unswitch diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h index aee470c3ba12f..6cba29cd6b4b9 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise.h @@ -10,8 +10,6 @@ namespace jit { namespace fuser { namespace cuda { -constexpr int64_t kPointwiseSchedulerThreadX = 128; - class SchedulerRuntimeInfo; class HeuristicSummary; diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 963f37465166c..03c7f55742c13 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -479,21 +479,21 @@ std::string getRuntimeRejectReason( auto& shape_in_ref1 = pair.first; auto& n_elems = pair.second; - auto innermost_info_entry = getInnerMostDimInfoInReference( - data_cache, reference_tensors, reference1, domain_map); - auto innermost_info = innermost_info_entry.get(); + constexpr size_t default_tile_elements = + TransposeParams::getDefaultTileSize() * + TransposeParams::getDefaultTileSize(); // don't schedule with transpose scheduler if less than a full wave const int64_t device_multiprocessor_count = (int64_t)at::cuda::getCurrentDeviceProperties()->multiProcessorCount; - auto pointwise_elements_per_wave = - device_multiprocessor_count * kPointwiseSchedulerThreadX; - if (pointwise_elements_per_wave > n_elems) { + auto elements_per_wave = device_multiprocessor_count * default_tile_elements; + if (elements_per_wave > n_elems) { return "Transpose scheduler does not perform well on small problem sizes."; } - auto max_tile_elements = TransposeParams::getDefaultTileSize() * - TransposeParams::getDefaultTileSize(); + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); auto inner_most_pos1_in_ref1 = innermost_info[0]; auto inner_most_pos2_in_ref1 = innermost_info[1]; @@ -505,7 +505,7 @@ std::string getRuntimeRejectReason( // transpose(T0[1000000000, 2, 2], 1, 2) // the pointwise scheduler should provide better performance, because it // provides coalesced memory access - if (inner_size1 * inner_size2 < max_tile_elements) { + if (inner_size1 * inner_size2 < default_tile_elements) { auto inner_elements = inner_size1 * inner_size2; for (int64_t i = inner_most_pos2_in_ref1 + 1; i < inner_most_pos1_in_ref1; i++) { @@ -522,7 +522,7 @@ std::string getRuntimeRejectReason( // T3[2, 10000000, 3] input/output // T4[3, 10000000, 2] input/output // T5[3, 10000000, 2] input/output - if (inner_elements < 
max_tile_elements) { + if (inner_elements < default_tile_elements) { return "Inner transpose of small dimensions should be scheduled by the " "pointwise scheduler because it provides better memory coalescing"; } From 1427438d9b3eb328e55cea81c6ec2117e856b85b Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 15:19:42 -0700 Subject: [PATCH 29/32] reject at pointwise scheduler --- .../cuda/scheduler/compile_time_info.h | 10 ++ .../jit/codegen/cuda/scheduler/registry.cpp | 170 ++++++++++-------- .../jit/codegen/cuda/scheduler/transpose.cpp | 2 +- .../jit/codegen/cuda/scheduler/transpose.h | 2 +- 4 files changed, 105 insertions(+), 79 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h index 86d906c4747a7..2e509cfb8a106 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h @@ -35,6 +35,7 @@ enum class CompileTimeEntryType { SCOPE_PERSISTENT_FACTOR_INFO, BROADCAST_BYTE_MULTIPLES, INNER_MOST_DIMS_INFO, + CAN_SCHEDULE_TRANSPOSE, }; //! Entry type definition class for `DOMAIN_MAP`, @@ -137,6 +138,15 @@ class BroadcastMultiples { CompileTimeEntryType::BROADCAST_BYTE_MULTIPLES; }; +//! Entry type definition class for `CAN_SCHEDULE_TRANSPOSE`, +//! stores if the transpose scheduler can scheduler this fusion +class CanScheduleTranspose { + public: + using DataType = bool; + static const CompileTimeEntryType EntryType = + CompileTimeEntryType::CAN_SCHEDULE_TRANSPOSE; +}; + //! Base abstract class for unified storage in `HeuristicSummary`, //! each entry in `HeuristicSummary` will be a subclass. class CompileTimeInfoBase : public PolymorphicBase { diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index a43fed67103e8..17a1fc540507e 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -937,6 +937,84 @@ class ReductionScheduler : public SchedulerEntry { } }; +class TransposeScheduler : public SchedulerEntry { + public: + explicit TransposeScheduler( + Fusion* fusion, + SchedulerRuntimeInfo& runtime_info, + HeuristicSummary* data_cache = nullptr) + : SchedulerEntry(ScheduleHeuristic::Transpose) { + computeHeuristics(fusion, runtime_info, data_cache); + } + + static bool canScheduleCompileTime(Fusion* fusion) { + // Temporarily disallow view in transpose scheduler + // TODO Add more testing before enabling + auto view_tvs = scheduler_utils::getViewTVs(fusion); + if (view_tvs.size() > 0) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, "No support for view op"); + return false; + } + + if (!hasAtLeastTwoValidGroups(fusion)) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, + "cannot find two mismatching inner most dimensions"); + return false; + } + + // TODO: add support for trivial reduction + auto reduction_ops = + ir_utils::getReductionOps(fusion, false /* ignore_trivial */); + + if (!reduction_ops.empty()) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, "no support for reduction ops"); + return false; + } + + if (hasNonUniqueBcast(fusion)) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, + "Broadcasting dimension might be broadcasting to multiple sizes."); + return false; + } + + return true; + } + + static bool canScheduleRunTime( + Fusion* fusion, 
+ SchedulerRuntimeInfo& runtime_info, + HeuristicSummary* data_cache = nullptr) { + FUSER_PERF_SCOPE("TransposeScheduler::canScheduleRunTime"); + + auto reason = + getTransposeRuntimeRejectReason(fusion, data_cache, runtime_info); + if (!reason.empty()) { + scheduler_debug_utils::canScheduleRejectReason( + ScheduleHeuristic::Transpose, reason); + return false; + } + return true; + } + + void schedule(Fusion* fusion) override { + FUSER_PERF_SCOPE("Schedule Transpose Fusion"); + scheduleTranspose(fusion, transposeParams()); + } + + private: + void computeHeuristics( + Fusion* fusion, + SchedulerRuntimeInfo& runtime_info, + HeuristicSummary* data_cache = nullptr) { + params_ = getTransposeHeuristics(fusion, runtime_info, data_cache); + TORCH_INTERNAL_ASSERT(params_ != nullptr); + } +}; + class PointWiseScheduler : public SchedulerEntry { public: explicit PointWiseScheduler( @@ -980,6 +1058,19 @@ class PointWiseScheduler : public SchedulerEntry { Fusion* fusion, SchedulerRuntimeInfo& runtime_info, HeuristicSummary* data_cache = nullptr) { + auto can_schedule_transpose_entry = + HeuristicSummaryEntry( + data_cache, [fusion]() { + return std::make_unique( + TransposeScheduler::canScheduleCompileTime(fusion)); + }); + if (can_schedule_transpose_entry.get()) { + // TODO: data cache + auto reason = + getTransposeRuntimeRejectReason(fusion, nullptr, runtime_info); + return !reason.empty(); + } + return true; } @@ -1216,83 +1307,6 @@ class PersistentKernelScheduler : public SchedulerEntry { } }; -class TransposeScheduler : public SchedulerEntry { - public: - explicit TransposeScheduler( - Fusion* fusion, - SchedulerRuntimeInfo& runtime_info, - HeuristicSummary* data_cache = nullptr) - : SchedulerEntry(ScheduleHeuristic::Transpose) { - computeHeuristics(fusion, runtime_info, data_cache); - } - - static bool canScheduleCompileTime(Fusion* fusion) { - // Temporarily disallow view in transpose scheduler - // TODO Add more testing before enabling - auto view_tvs = scheduler_utils::getViewTVs(fusion); - if (view_tvs.size() > 0) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, "No support for view op"); - return false; - } - - if (!hasAtLeastTwoValidGroups(fusion)) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, - "cannot find two mismatching inner most dimensions"); - return false; - } - - // TODO: add support for trivial reduction - auto reduction_ops = - ir_utils::getReductionOps(fusion, false /* ignore_trivial */); - - if (!reduction_ops.empty()) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, "no support for reduction ops"); - return false; - } - - if (hasNonUniqueBcast(fusion)) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, - "Broadcasting dimension might be broadcasting to multiple sizes."); - return false; - } - - return true; - } - - static bool canScheduleRunTime( - Fusion* fusion, - SchedulerRuntimeInfo& runtime_info, - HeuristicSummary* data_cache = nullptr) { - FUSER_PERF_SCOPE("TransposeScheduler::canScheduleRunTime"); - - auto reason = getRuntimeRejectReason(fusion, data_cache, runtime_info); - if (!reason.empty()) { - scheduler_debug_utils::canScheduleRejectReason( - ScheduleHeuristic::Transpose, reason); - return false; - } - return true; - } - - void schedule(Fusion* fusion) override { - FUSER_PERF_SCOPE("Schedule Transpose Fusion"); - scheduleTranspose(fusion, transposeParams()); - } - - private: - void computeHeuristics( - Fusion* fusion, - 
SchedulerRuntimeInfo& runtime_info, - HeuristicSummary* data_cache = nullptr) { - params_ = getTransposeHeuristics(fusion, runtime_info, data_cache); - TORCH_INTERNAL_ASSERT(params_ != nullptr); - } -}; - // Schedule Table const std::vector& all_heuristics() { static const std::vector hlist = { @@ -1569,6 +1583,8 @@ template class HeuristicSummaryEntry< HeuristicCompileTime::ScopePersistentFactorInfo>; template class HeuristicSummaryEntry; template class HeuristicSummaryEntry; +template class HeuristicSummaryEntry< + HeuristicCompileTime::CanScheduleTranspose>; } // namespace cuda } // namespace fuser diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 03c7f55742c13..8480b78e792d7 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -460,7 +460,7 @@ getInnerMostDimInfoInReference( } // namespace -std::string getRuntimeRejectReason( +std::string getTransposeRuntimeRejectReason( Fusion* fusion, HeuristicSummary* data_cache, SchedulerRuntimeInfo& runtime_info) { diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h index abf141846555d..c1a4ab6efb6ae 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.h @@ -104,7 +104,7 @@ TORCH_CUDA_CU_API bool hasAtLeastTwoValidGroups(Fusion* fusion); // If can schedule at runtime, returns empty string, otherwise returns the // reason why we should not schedule at runtime. -TORCH_CUDA_CU_API std::string getRuntimeRejectReason( +TORCH_CUDA_CU_API std::string getTransposeRuntimeRejectReason( Fusion* fusion, HeuristicSummary* data_cache, SchedulerRuntimeInfo& runtime_info); From 27a4db83aec1cb9a97631fff3f1f655e7a617841 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 16:03:56 -0700 Subject: [PATCH 30/32] fix --- .../jit/codegen/cuda/scheduler/registry.cpp | 27 ++++++++++++++----- .../jit/codegen/cuda/scheduler/transpose.cpp | 9 +++---- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 17a1fc540507e..40efd8353baf8 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1065,9 +1065,8 @@ class PointWiseScheduler : public SchedulerEntry { TransposeScheduler::canScheduleCompileTime(fusion)); }); if (can_schedule_transpose_entry.get()) { - // TODO: data cache auto reason = - getTransposeRuntimeRejectReason(fusion, nullptr, runtime_info); + getTransposeRuntimeRejectReason(fusion, data_cache, runtime_info); return !reason.empty(); } @@ -1499,6 +1498,25 @@ void HeuristicSummary::validate() const { entry_type_map_.count(EntryType::VECTORIZABLE_INPUTS_AND_OUTPUTS)); TORCH_INTERNAL_ASSERT( entry_type_map_.count(EntryType::BROADCAST_BYTE_MULTIPLES)); + TORCH_INTERNAL_ASSERT( + entry_type_map_.count(EntryType::CAN_SCHEDULE_TRANSPOSE)); + auto can_schedule_transpose = + entry_type_map_.at(EntryType::CAN_SCHEDULE_TRANSPOSE) + ->as< + CompileTimeInfo>() + ->get(); + if (!can_schedule_transpose) { + break; + } + } + case ScheduleHeuristic::Transpose: { + TORCH_INTERNAL_ASSERT(entry_type_map_.count(EntryType::DOMAIN_MAP)); + TORCH_INTERNAL_ASSERT(entry_type_map_.count( + EntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS)); + TORCH_INTERNAL_ASSERT( + entry_type_map_.count(EntryType::REFERENCE_TENSORS)); + 
TORCH_INTERNAL_ASSERT( + entry_type_map_.count(EntryType::INNER_MOST_DIMS_INFO)); break; } case ScheduleHeuristic::Reduction: { @@ -1528,11 +1546,6 @@ void HeuristicSummary::validate() const { entry_type_map_.count(EntryType::SCOPE_PERSISTENT_FACTOR_INFO)); break; } - case ScheduleHeuristic::Transpose: { - TORCH_INTERNAL_ASSERT(entry_type_map_.count( - EntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS)); - break; - } default: TORCH_INTERNAL_ASSERT(false, "unknown heuristic"); } diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index 8480b78e792d7..bc330217b8596 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -453,7 +453,6 @@ getInnerMostDimInfoInReference( data.emplace_back(inner_most_pos_in_global_ref); } return std::make_unique>(std::move(data)); - ; }); return innermost_info_entry; } @@ -479,6 +478,10 @@ std::string getTransposeRuntimeRejectReason( auto& shape_in_ref1 = pair.first; auto& n_elems = pair.second; + auto innermost_info_entry = getInnerMostDimInfoInReference( + data_cache, reference_tensors, reference1, domain_map); + auto innermost_info = innermost_info_entry.get(); + constexpr size_t default_tile_elements = TransposeParams::getDefaultTileSize() * TransposeParams::getDefaultTileSize(); @@ -491,10 +494,6 @@ std::string getTransposeRuntimeRejectReason( return "Transpose scheduler does not perform well on small problem sizes."; } - auto innermost_info_entry = getInnerMostDimInfoInReference( - data_cache, reference_tensors, reference1, domain_map); - auto innermost_info = innermost_info_entry.get(); - auto inner_most_pos1_in_ref1 = innermost_info[0]; auto inner_most_pos2_in_ref1 = innermost_info[1]; From 0c173fa5aeefe90abc519bf7d518727b1c904fee Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sat, 10 Sep 2022 16:36:41 -0700 Subject: [PATCH 31/32] fix --- torch/csrc/jit/codegen/cuda/scheduler/registry.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index 40efd8353baf8..c4a04fd060077 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1505,7 +1505,7 @@ void HeuristicSummary::validate() const { ->as< CompileTimeInfo>() ->get(); - if (!can_schedule_transpose) { + if (!*can_schedule_transpose) { break; } } From 911956eabf28d32e07a2e96af76dfac294146736 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Sun, 11 Sep 2022 02:15:25 -0700 Subject: [PATCH 32/32] fix fusion input reductions --- .../cuda/scheduler/compile_time_info.h | 21 +++++++++++++++++++ .../jit/codegen/cuda/scheduler/registry.cpp | 8 +++++-- .../jit/codegen/cuda/scheduler/transpose.cpp | 14 +++++++++---- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h index 2e509cfb8a106..262f1f84d259a 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h +++ b/torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h @@ -26,7 +26,9 @@ namespace HeuristicCompileTime { //! Enum for all possible types of cached entries of compile-time info. 
enum class CompileTimeEntryType { DOMAIN_MAP, + TRANSPOSE_DOMAIN_MAP, REFERENCE_TENSORS, + REFERENCE_TENSORS_FOR_GROUPS, VECTORIZABLE_INPUTS_AND_OUTPUTS, INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS, UNROLLABLE_INPUTS_AND_OUTPUTS, @@ -47,6 +49,15 @@ class DomainMap { CompileTimeEntryType::DOMAIN_MAP; }; +//! Entry type definition class for `DOMAIN_MAP`, +//! stores the domain map of a fusion, used by transpose scheduler. +class TransposeDomainMap { + public: + using DataType = pointwise_utils::DomainMap; + static const CompileTimeEntryType EntryType = + CompileTimeEntryType::TRANSPOSE_DOMAIN_MAP; +}; + //! Entry type definition class for `REFERENCE_TENSORS`, //! stores the the reference TensorViews used to schedule a fusion. class ReferenceTensors { @@ -56,6 +67,16 @@ class ReferenceTensors { CompileTimeEntryType::REFERENCE_TENSORS; }; +//! Entry type definition class for `REFERENCE_TENSORS`, +//! stores the the reference TensorViews used to schedule a fusion, used by +//! transpose scheduler. +class ReferenceTensorsForGroups { + public: + using DataType = std::vector; + static const CompileTimeEntryType EntryType = + CompileTimeEntryType::REFERENCE_TENSORS_FOR_GROUPS; +}; + //! Entry type definition class for `VECTORIZABLE_INPUTS_AND_OUTPUTS`, //! stores the vectorizable TensorViews on a fusion's inputs and outputs. class VectorizableInputsAndOutputs { diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp index c4a04fd060077..0ba2c1b8afa30 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp @@ -1510,11 +1510,12 @@ void HeuristicSummary::validate() const { } } case ScheduleHeuristic::Transpose: { - TORCH_INTERNAL_ASSERT(entry_type_map_.count(EntryType::DOMAIN_MAP)); + TORCH_INTERNAL_ASSERT( + entry_type_map_.count(EntryType::TRANSPOSE_DOMAIN_MAP)); TORCH_INTERNAL_ASSERT(entry_type_map_.count( EntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS)); TORCH_INTERNAL_ASSERT( - entry_type_map_.count(EntryType::REFERENCE_TENSORS)); + entry_type_map_.count(EntryType::REFERENCE_TENSORS_FOR_GROUPS)); TORCH_INTERNAL_ASSERT( entry_type_map_.count(EntryType::INNER_MOST_DIMS_INFO)); break; @@ -1582,7 +1583,10 @@ HeuristicSummaryEntry::HeuristicSummaryEntry( // Template instantiation for pre-defined cache entries template class HeuristicSummaryEntry; +template class HeuristicSummaryEntry; template class HeuristicSummaryEntry; +template class HeuristicSummaryEntry< + HeuristicCompileTime::ReferenceTensorsForGroups>; template class HeuristicSummaryEntry< HeuristicCompileTime::VectorizableInputsAndOutputs>; template class HeuristicSummaryEntry< diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp index bc330217b8596..5d1c533224dcd 100644 --- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp +++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp @@ -160,6 +160,12 @@ class DomainMap : public pointwise_utils::DomainMap { // Then we still want to T1 and T2 to be grouped together. 
auto group = scheduler_utils::getInputsOutputsWithInnerDim(tv, true, false); + if (group.empty()) { + // In case that the inner most dim of tv is not found (for example, tv + // is a fusion input with only reductions), we just return a null + // result which will tell the scheduler to reject the fusion + return {}; + } for (auto member_tv : group) { if (grouped.count(member_tv) == 0) { grouped.emplace(member_tv); @@ -358,11 +364,11 @@ void maybeBuildVirtualInnerDims( } } -HeuristicSummaryEntry getDomainMap( +HeuristicSummaryEntry getDomainMap( HeuristicSummary* data_cache, Fusion* fusion) { auto domain_map_entry = - HeuristicSummaryEntry( + HeuristicSummaryEntry( data_cache, [fusion]() { return std::make_unique(fusion); }); return domain_map_entry; @@ -385,13 +391,13 @@ getInputsOutputsGroups(HeuristicSummary* data_cache, DomainMap& domain_map) { return grouped_inputs_outputs_entry; } -HeuristicSummaryEntry +HeuristicSummaryEntry getReferenceTensors( HeuristicSummary* data_cache, DomainMap& domain_map, std::vector>& grouped_inputs_outputs) { auto reference_tensors_entry = - HeuristicSummaryEntry( + HeuristicSummaryEntry( data_cache, [&domain_map, &grouped_inputs_outputs]() { std::vector data{ domain_map.findReferenceFor(grouped_inputs_outputs[0]),