From 6f2865e11e85d43a322b1de10a58276ba3e51ca8 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Tue, 19 Jul 2022 15:12:18 -0700
Subject: [PATCH] Add debug dump for InlinePropagator

---
 .../jit/codegen/cuda/inline_propagator.cpp | 42 ++++++++++++++++++++++++
 torch/csrc/jit/codegen/cuda/utils.cpp      |  8 +++-
 torch/csrc/jit/codegen/cuda/utils.h        |  2 +
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/inline_propagator.cpp b/torch/csrc/jit/codegen/cuda/inline_propagator.cpp
index c93cd8354cffa..0dd2f1930f9bf 100644
--- a/torch/csrc/jit/codegen/cuda/inline_propagator.cpp
+++ b/torch/csrc/jit/codegen/cuda/inline_propagator.cpp
@@ -147,9 +147,17 @@ size_t InlinePropagator::getMaxPosAll(TensorView* tv, bool check_siblings) {
 }
 
 void InlinePropagator::setCAPos(TensorView* tv) {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
   size_t pos = mapped_reference_pos_.at(tv);
+  if (debug) {
+    std::cout << "  Setting CA pos of " << tv << ":" << std::endl;
+    std::cout << "    mapped position: " << pos << std::endl;
+  }
   if ((selected_.empty() || selected_.count(tv)) && !tv->isFusionInput()) {
     auto max_pos = getMaxPosAll(tv);
+    if (debug) {
+      std::cout << "    max inlinable position: " << max_pos << std::endl;
+    }
     if (mode_ == ComputeAtMode::Standard) {
       TORCH_INTERNAL_ASSERT(
           pos <= max_pos,
@@ -167,12 +175,22 @@ void InlinePropagator::setCAPos(TensorView* tv) {
       pos--;
     }
     auto current_ca_pos = tv->getComputeAtPosition();
+    if (debug) {
+      std::cout << "    current CA position: " << current_ca_pos << std::endl;
+    }
     if (pos > current_ca_pos) {
+      if (debug) {
+        std::cout << "    new CA position: " << pos << std::endl;
+      }
       tv->setComputeAt(pos);
       for (auto consumer_tv : ir_utils::consumerTvsOf(tv)) {
         needs_update_max_producer_.insert(consumer_tv);
       }
+    } else if (debug) {
+      std::cout << "    CA position not changed" << std::endl;
     }
+  } else if (debug) {
+    std::cout << "    tensor not selected, skip" << std::endl;
   }
 }
 
@@ -201,7 +219,13 @@ InlinePropagator::InlinePropagator(
 }
 
 void InlinePropagator::setUp() {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
   mapped_reference_pos_[reference_] = reference_pos_;
+  if (debug) {
+    std::cout << "InlinePropagator::setUp" << std::endl;
+    std::cout << "  reference: " << reference_ << " @ " << reference_pos_
+              << std::endl;
+  }
   setCAPos(reference_);
 }
 
@@ -273,6 +297,12 @@ void InlinePropagator::tearDown() {
 }
 
 void InlinePropagator::propagateC2P(TensorView* from, TensorView* to) {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
+  if (debug) {
+    std::cout << "InlinePropagator::propagateC2P" << std::endl;
+    std::cout << "  from: " << from << std::endl;
+    std::cout << "  to: " << to << std::endl;
+  }
   // Step 1: find mapped_reference_pos_[to]
   int from_pos;
   if (mode_ != ComputeAtMode::MostInlined) {
@@ -297,6 +327,12 @@
 }
 
 void InlinePropagator::propagateP2C(TensorView* from, TensorView* to) {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
+  if (debug) {
+    std::cout << "InlinePropagator::propagateP2C" << std::endl;
+    std::cout << "  from: " << from << std::endl;
+    std::cout << "  to: " << to << std::endl;
+  }
   // Step 1: find mapped_reference_pos_[to]
   int from_pos;
   if (mode_ != ComputeAtMode::MostInlined) {
@@ -321,6 +357,12 @@
 }
 
 void InlinePropagator::propagateSibling(TensorView* from, TensorView* to) {
+  bool debug = isDebugDumpEnabled(DebugDumpOption::InlinePropagator);
+  if (debug) {
+    std::cout << "InlinePropagator::propagateSibling" << std::endl;
+    std::cout << "  from: " << from << std::endl;
+    std::cout << "  to: " << to << std::endl;
+  }
   // Step 1: find mapped_reference_pos_[to]
   auto from_pos = mapped_reference_pos_.at(from);
   TORCH_CHECK(
diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp
index f03f994dff9ad..c25f69a3aa455 100644
--- a/torch/csrc/jit/codegen/cuda/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/utils.cpp
@@ -36,7 +36,8 @@ auto parseDebugDumpOptions() {
       {DebugDumpOption::ParallelDimensions, false},
       {DebugDumpOption::Halo, false},
       {DebugDumpOption::PerfDebugVerbose, false},
-      {DebugDumpOption::TransformPropagator, false}};
+      {DebugDumpOption::TransformPropagator, false},
+      {DebugDumpOption::InlinePropagator, false}};
 
   if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_DUMP")) {
     c10::string_view options_view(dump_options);
@@ -85,6 +86,8 @@
         options_map[DebugDumpOption::PerfDebugVerbose] = true;
       } else if (token == "transform_propagator") {
         options_map[DebugDumpOption::TransformPropagator] = true;
+      } else if (token == "inline_propagator") {
+        options_map[DebugDumpOption::InlinePropagator] = true;
       } else {
         TORCH_CHECK(
             false,
@@ -95,7 +98,8 @@
             "\tcuda_to_file, launch_param, segmented_fusion, fusion_args,\n",
             "\tkernel_args, dump_eff_bandwidth, draw_segmented_fusion,\n",
             "\tscheduler_params, parallel_dimensions, buffer_reuse_verbose,\n",
-            "\tptxas_verbose, halo, segmenter_logging, perf_debug_verbose\n");
+            "\tptxas_verbose, halo, segmenter_logging, perf_debug_verbose\n",
+            "\ttransform_propagator, inline_propagator\n");
     }
     options_view = (end_pos != c10::string_view::npos)
         ? options_view.substr(end_pos + 1)
diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h
index 0fc3fb786e0cc..0a22d657f541f 100644
--- a/torch/csrc/jit/codegen/cuda/utils.h
+++ b/torch/csrc/jit/codegen/cuda/utils.h
@@ -46,6 +46,8 @@ enum class DebugDumpOption {
                     //! associated with what's running
   TransformPropagator, //! When running TransformPropagator, print propagation
                        //! path and replay result
+  InlinePropagator, //! When running InlinePropagator, print propagation
+                    //! path and inlining result
 };
 
 TORCH_CUDA_CU_API bool isDebugDumpEnabled(DebugDumpOption option);
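
Usage note (illustrative; the script name below is a placeholder, not part of
this patch): like the existing dump options, the new flag is read from the
PYTORCH_NVFUSER_DUMP environment variable, and the parser above splits the
variable on commas, so options can be combined:

  PYTORCH_NVFUSER_DUMP=inline_propagator python your_fusion_script.py
  PYTORCH_NVFUSER_DUMP=transform_propagator,inline_propagator python your_fusion_script.py

When enabled, InlinePropagator prints its reference tensor and position in
setUp(), each propagateC2P/propagateP2C/propagateSibling step with its
from/to tensors, and the compute-at position it sets (or skips) for each
TensorView in setCAPos().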