diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
index cab5c7bf9ad2a..2c26570acf8f5 100644
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor.cpp
@@ -93,7 +93,9 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) {
     std::cout << "\n======= Codegen output for kernel: " << kernelName()
               << " =======\n\n"
               << code << "\n======================================\n\n";
-  } else if (isDebugDumpEnabled(DebugDumpOption::CudaToFile)) {
+  }
+  if (isDebugDumpEnabled(DebugDumpOption::CudaToFile) ||
+      isDebugDumpEnabled(DebugDumpOption::DebugInfo)) {
     std::stringstream file_name;
     file_name << "__tmp_kernel" << fusion_id_ << ".cu";
     std::cout << "PRINTING: " << file_name.str() << std::endl;
diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
index 3182a9273d8a8..6706fb9544263 100644
--- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -915,9 +915,11 @@ std::pair nvrtcCompile(
   nvrtcProgram program; // NOLINT(cppcoreguidelines-init-variables)

   {
+    std::stringstream ss;
+    ss << "__tmp_kernel" << id << ".cu";
     FUSER_PERF_SCOPE("executor_utils::NvrtcCreateProgram");
     AT_CUDA_NVRTC_CHECK(at::globalContext().getNVRTC().nvrtcCreateProgram(
-        &program, code.c_str(), nullptr, 0, nullptr, nullptr));
+        &program, code.c_str(), ss.str().c_str(), 0, nullptr, nullptr));
   }

   ResourceGuard holdProgram([&] {
@@ -964,11 +966,13 @@ std::pair nvrtcCompile(
     args.push_back("--fmad=true");
   }
 #endif
-
-#ifndef NDEBUG
   // Add line info to generated kernels
-  args.push_back("-lineinfo");
-#else
+  if (isDebugDumpEnabled(DebugDumpOption::DebugInfo)) {
+    args.push_back("-lineinfo");
+    args.push_back("-G");
+    args.push_back("--dopt=on");
+  }
+#ifdef NDEBUG
   // Avoid excessive register usage from assertion
   args.push_back("-DNDEBUG");
 #endif
diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp
index c25f69a3aa455..ad96fcc38f4d9 100644
--- a/torch/csrc/jit/codegen/cuda/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/utils.cpp
@@ -23,6 +23,7 @@ auto parseDebugDumpOptions() {
       {DebugDumpOption::CudaKernel, false},
       {DebugDumpOption::CudaFull, false},
       {DebugDumpOption::CudaToFile, false},
+      {DebugDumpOption::DebugInfo, false},
       {DebugDumpOption::LaunchParam, false},
       {DebugDumpOption::FusionSegments, false},
       {DebugDumpOption::FusionSegmenterLog, false},
@@ -58,6 +59,8 @@ auto parseDebugDumpOptions() {
       options_map[DebugDumpOption::CudaFull] = true;
     } else if (token == "cuda_to_file") {
      options_map[DebugDumpOption::CudaToFile] = true;
+    } else if (token == "debug_info") {
+      options_map[DebugDumpOption::DebugInfo] = true;
     } else if (token == "launch_param") {
       options_map[DebugDumpOption::LaunchParam] = true;
     } else if (token == "segmented_fusion") {
@@ -95,7 +98,7 @@ auto parseDebugDumpOptions() {
           token,
           "'\nAvailable options:\n",
           "\tfusion_ir, fusion_ir_math, kernel_ir, ca_map, cuda_kernel, cuda_full,\n",
-          "\tcuda_to_file, launch_param, segmented_fusion, fusion_args,\n",
+          "\tcuda_to_file, debug_info, launch_param, segmented_fusion, fusion_args,\n",
           "\tkernel_args, dump_eff_bandwidth, draw_segmented_fusion,\n",
           "\tscheduler_params, parallel_dimensions, buffer_reuse_verbose,\n",
           "\tptxas_verbose, halo, segmenter_logging, perf_debug_verbose\n",
diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h
index 0a22d657f541f..6b67d7710bb90 100644
--- a/torch/csrc/jit/codegen/cuda/utils.h
+++ b/torch/csrc/jit/codegen/cuda/utils.h
@@ -29,6 +29,8 @@ enum class DebugDumpOption {
   CudaKernel, //!< Dump the generated CUDA C++ kernel code
   CudaFull, //!< Dump the complete CUDA C++ code
   CudaToFile, //!< Dump CUDA Strings to File
+  DebugInfo, //!< Embed line info and debug info to compiled kernel, and dump
+             //!< the full CUDA C++ code
   LaunchParam, //!< Dump the Launch parameters of kernel
   FusionSegments, //!< Dump Segmented Fusion Graph
   FusionSegmenterLog, //!< Dump Detailed Segmenter Logging
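
Below the patch, a minimal sketch of what the executor_utils.cpp change does with nvrtcCreateProgram. The helper name createNamedProgram is hypothetical and only for illustration; the real code path goes through at::globalContext().getNVRTC() and AT_CUDA_NVRTC_CHECK. The point is NVRTC's third argument: it is the program name, which NVRTC uses as the source file name in diagnostics and in the line/debug info emitted under -lineinfo / -G, so naming the program "__tmp_kernel<id>.cu" lets a debugger or profiler resolve source lines against the .cu file that DebugDumpOption::DebugInfo now dumps to disk.

#include <nvrtc.h>

#include <sstream>
#include <string>

// Hypothetical stand-alone helper, not part of the patch.
nvrtcProgram createNamedProgram(const std::string& code, int id) {
  // Same naming scheme as the dumped file: __tmp_kernel<id>.cu
  std::stringstream ss;
  ss << "__tmp_kernel" << id << ".cu";

  nvrtcProgram program = nullptr;
  nvrtcResult result = nvrtcCreateProgram(
      &program,
      code.c_str(),     // NUL-terminated CUDA C++ source
      ss.str().c_str(), // program name; the patch passes this instead of nullptr
      0,                // no headers
      nullptr,
      nullptr);
  (void)result; // real code checks this, e.g. via AT_CUDA_NVRTC_CHECK
  return program;
}

With the rest of the patch, the new option is expected to be enabled like the other dump tokens, i.e. by adding debug_info to the dump-options environment variable that parseDebugDumpOptions() reads (PYTORCH_NVFUSER_DUMP in this codebase); that both writes __tmp_kernel<fusion_id>.cu to disk and compiles the kernel with -lineinfo, -G, and --dopt=on.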