From 6f2865e11e85d43a322b1de10a58276ba3e51ca8 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Tue, 19 Jul 2022 15:12:18 -0700
Subject: [PATCH] Add debug dump for InlinePropagator

---
 .../jit/codegen/cuda/inline_propagator.cpp | 42 ++++++++++++++++++++++++
 torch/csrc/jit/codegen/cuda/utils.cpp      |  8 +++-
 torch/csrc/jit/codegen/cuda/utils.h        |  2 +
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/inline_propagator.cpp b/torch/csrc/jit/codegen/cuda/inline_propagator.cpp
index c93cd8354cffa..0dd2f1930f9bf 100644
--- a/torch/csrc/jit/codegen/cuda/inline_propagator.cpp
+++ b/torch/csrc/jit/codegen/cuda/inline_propagator.cpp
@@ -147,9 +147,17 @@ size_t InlinePropagator::getMaxPosAll(TensorView* tv, bool check_siblings) {
 }
 
 void InlinePropagator::setCAPos(TensorView* tv) {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
   size_t pos = mapped_reference_pos_.at(tv);
+  if (debug) {
+    std::cout << "  Setting CA pos of " << tv << ":" << std::endl;
+    std::cout << "    mapped position: " << pos << std::endl;
+  }
   if ((selected_.empty() || selected_.count(tv)) && !tv->isFusionInput()) {
     auto max_pos = getMaxPosAll(tv);
+    if (debug) {
+      std::cout << "    max inlinable position: " << max_pos << std::endl;
+    }
     if (mode_ == ComputeAtMode::Standard) {
       TORCH_INTERNAL_ASSERT(
           pos <= max_pos,
@@ -167,12 +175,22 @@ void InlinePropagator::setCAPos(TensorView* tv) {
       pos--;
     }
     auto current_ca_pos = tv->getComputeAtPosition();
+    if (debug) {
+      std::cout << "    current CA position: " << current_ca_pos << std::endl;
+    }
     if (pos > current_ca_pos) {
+      if (debug) {
+        std::cout << "    new CA position: " << pos << std::endl;
+      }
       tv->setComputeAt(pos);
       for (auto consumer_tv : ir_utils::consumerTvsOf(tv)) {
         needs_update_max_producer_.insert(consumer_tv);
       }
+    } else if (debug) {
+      std::cout << "    CA position not changed" << std::endl;
     }
+  } else if (debug) {
+    std::cout << "    tensor not selected, skip" << std::endl;
   }
 }
 
@@ -201,7 +219,13 @@ InlinePropagator::InlinePropagator(
 }
 
 void InlinePropagator::setUp() {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
   mapped_reference_pos_[reference_] = reference_pos_;
+  if (debug) {
+    std::cout << "InlinePropagator::setUp" << std::endl;
+    std::cout << "  reference: " << reference_ << " @ " << reference_pos_
+              << std::endl;
+  }
   setCAPos(reference_);
 }
 
@@ -273,6 +297,12 @@ void InlinePropagator::tearDown() {
 }
 
 void InlinePropagator::propagateC2P(TensorView* from, TensorView* to) {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
+  if (debug) {
+    std::cout << "InlinePropagator::propagateC2P" << std::endl;
+    std::cout << "  from: " << from << std::endl;
+    std::cout << "  to: " << to << std::endl;
+  }
   // Step 1: find mapped_reference_pos_[to]
   int from_pos;
   if (mode_ != ComputeAtMode::MostInlined) {
@@ -297,6 +327,12 @@
 }
 
 void InlinePropagator::propagateP2C(TensorView* from, TensorView* to) {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
+  if (debug) {
+    std::cout << "InlinePropagator::propagateP2C" << std::endl;
+    std::cout << "  from: " << from << std::endl;
+    std::cout << "  to: " << to << std::endl;
+  }
   // Step 1: find mapped_reference_pos_[to]
   int from_pos;
   if (mode_ != ComputeAtMode::MostInlined) {
@@ -321,6 +357,12 @@
 }
 
 void InlinePropagator::propagateSibling(TensorView* from, TensorView* to) {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
+  if (debug) {
+    std::cout << "InlinePropagator::propagateSibling" << std::endl;
+    std::cout << "  from: " << from << std::endl;
+    std::cout << "  to: " << to << std::endl;
+  }
   // Step 1: find mapped_reference_pos_[to]
   auto from_pos = mapped_reference_pos_.at(from);
   TORCH_CHECK(
diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp
index f03f994dff9ad..c25f69a3aa455 100644
--- a/torch/csrc/jit/codegen/cuda/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/utils.cpp
@@ -36,7 +36,8 @@ auto parseDebugDumpOptions() {
       {DebugDumpOption::ParallelDimensions, false},
       {DebugDumpOption::Halo, false},
       {DebugDumpOption::PerfDebugVerbose, false},
-      {DebugDumpOption::TransformPropagator, false}};
+      {DebugDumpOption::TransformPropagator, false},
+      {DebugDumpOption::InlinePropagator, false}};
 
   if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DUMP")) {
     c10::string_view options_view(dump_options);
@@ -85,6 +86,8 @@
         options_map[DebugDumpOption::PerfDebugVerbose] = true;
       } else if (token == "transform_propagator") {
         options_map[DebugDumpOption::TransformPropagator] = true;
+      } else if (token == "inline_propagator") {
+        options_map[DebugDumpOption::InlinePropagator] = true;
       } else {
         TORCH_CHECK(
             false,
@@ -95,7 +98,8 @@
             "\tcuda_to_file, launch_param, segmented_fusion, fusion_args,\n",
             "\tkernel_args, dump_eff_bandwidth, draw_segmented_fusion,\n",
             "\tscheduler_params, parallel_dimensions, buffer_reuse_verbose,\n",
-            "\tptxas_verbose, halo, segmenter_logging, perf_debug_verbose\n");
+            "\tptxas_verbose, halo, segmenter_logging, perf_debug_verbose\n",
+            "\ttransform_propagator, inline_propagator\n");
     }
     options_view = (end_pos != c10::string_view::npos)
         ? options_view.substr(end_pos + 1)
diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h
index 0fc3fb786e0cc..0a22d657f541f 100644
--- a/torch/csrc/jit/codegen/cuda/utils.h
+++ b/torch/csrc/jit/codegen/cuda/utils.h
@@ -46,6 +46,8 @@ enum class DebugDumpOption {
                     //! associated with what's running
   TransformPropagator, //! When running TransformPropagator, print propagation
                        //! path and replay result
+  InlinePropagator, //! When running InlinePropagator, print propagation
+                    //! path and inlining result
 };
 
 TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
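
Usage note (illustrative; the script name below is a placeholder, not part of
this patch): like the existing dump options, the new flag is read from the
PYTORCH_NVFUSER_DUMP environment variable, and the parser above splits the
variable on commas, so options can be combined:

  PYTORCH_NVFUSER_DUMP=inline_propagator python your_fusion_script.py
  PYTORCH_NVFUSER_DUMP=transform_propagator,inline_propagator python your_fusion_script.py

When enabled, InlinePropagator prints its reference tensor and position in
setUp(), each propagateC2P/propagateP2C/propagateSibling step with its
from/to tensors, and the compute-at position it sets (or skips) for each
TensorView in setCAPos().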