diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
index cab5c7bf9ad2a..2c26570acf8f5 100644
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor.cpp
@@ -93,7 +93,9 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) {
     std::cout << "\n======= Codegen output for kernel: " << kernelName()
               << " =======\n\n"
               << code << "\n======================================\n\n";
-  } else if (isDebugDumpEnabled(DebugDumpOption::CudaToFile)) {
+  }
+  if (isDebugDumpEnabled(DebugDumpOption::CudaToFile) ||
+      isDebugDumpEnabled(DebugDumpOption::DebugInfo)) {
     std::stringstream file_name;
     file_name << "__tmp_kernel" << fusion_id_ << ".cu";
     std::cout << "PRINTING: " << file_name.str() << std::endl;
diff --git a/torch/csrc/jit/codegen/cuda/executor_utils.cpp b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
index 3182a9273d8a8..6706fb9544263 100644
--- a/torch/csrc/jit/codegen/cuda/executor_utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -915,9 +915,11 @@ std::pair nvrtcCompile(
   nvrtcProgram program; // NOLINT(cppcoreguidelines-init-variables)

   {
+    std::stringstream ss;
+    ss << "__tmp_kernel" << id << ".cu";
     FUSER_PERF_SCOPE("executor_utils::NvrtcCreateProgram");
     AT_CUDA_NVRTC_CHECK(at::globalContext().getNVRTC().nvrtcCreateProgram(
-        &program, code.c_str(), nullptr, 0, nullptr, nullptr));
+        &program, code.c_str(), ss.str().c_str(), 0, nullptr, nullptr));
   }

   ResourceGuard holdProgram([&] {
@@ -964,11 +966,13 @@ std::pair nvrtcCompile(
     args.push_back("--fmad=true");
   }
 #endif
-
-#ifndef NDEBUG
   // Add line info to generated kernels
-  args.push_back("-lineinfo");
-#else
+  if (isDebugDumpEnabled(DebugDumpOption::DebugInfo)) {
+    args.push_back("-lineinfo");
+    args.push_back("-G");
+    args.push_back("--dopt=on");
+  }
+#ifdef NDEBUG
   // Avoid excessive register usage from assertion
   args.push_back("-DNDEBUG");
 #endif
diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp
index c25f69a3aa455..ad96fcc38f4d9 100644
--- a/torch/csrc/jit/codegen/cuda/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/utils.cpp
@@ -23,6 +23,7 @@ auto parseDebugDumpOptions() {
       {DebugDumpOption::CudaKernel, false},
       {DebugDumpOption::CudaFull, false},
       {DebugDumpOption::CudaToFile, false},
+      {DebugDumpOption::DebugInfo, false},
       {DebugDumpOption::LaunchParam, false},
       {DebugDumpOption::FusionSegments, false},
       {DebugDumpOption::FusionSegmenterLog, false},
@@ -58,6 +59,8 @@ auto parseDebugDumpOptions() {
       options_map[DebugDumpOption::CudaFull] = true;
     } else if (token == "cuda_to_file") {
      options_map[DebugDumpOption::CudaToFile] = true;
+    } else if (token == "debug_info") {
+      options_map[DebugDumpOption::DebugInfo] = true;
     } else if (token == "launch_param") {
       options_map[DebugDumpOption::LaunchParam] = true;
     } else if (token == "segmented_fusion") {
@@ -95,7 +98,7 @@ auto parseDebugDumpOptions() {
           token,
           "'\nAvailable options:\n",
           "\tfusion_ir, fusion_ir_math, kernel_ir, ca_map, cuda_kernel, cuda_full,\n",
-          "\tcuda_to_file, launch_param, segmented_fusion, fusion_args,\n",
+          "\tcuda_to_file, debug_info, launch_param, segmented_fusion, fusion_args,\n",
           "\tkernel_args, dump_eff_bandwidth, draw_segmented_fusion,\n",
           "\tscheduler_params, parallel_dimensions, buffer_reuse_verbose,\n",
           "\tptxas_verbose, halo, segmenter_logging, perf_debug_verbose\n",
diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h
index 0a22d657f541f..6b67d7710bb90 100644
--- a/torch/csrc/jit/codegen/cuda/utils.h
+++ b/torch/csrc/jit/codegen/cuda/utils.h
@@ -29,6 +29,8 @@ enum class DebugDumpOption {
   CudaKernel, //!< Dump the generated CUDA C++ kernel code
   CudaFull, //!< Dump the complete CUDA C++ code
   CudaToFile, //!< Dump CUDA Strings to File
+  DebugInfo, //!< Embed line info and debug info to compiled kernel, and dump
+             //!< the full CUDA C++ code
   LaunchParam, //!< Dump the Launch parameters of kernel
   FusionSegments, //!< Dump Segmented Fusion Graph
   FusionSegmenterLog, //!< Dump Detailed Segmenter Logging
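
Below the patch, a minimal sketch of what the executor_utils.cpp change does with nvrtcCreateProgram. The helper name createNamedProgram is hypothetical and only for illustration; the real code path goes through at::globalContext().getNVRTC() and AT_CUDA_NVRTC_CHECK. The point is NVRTC's third argument: it is the program name, which NVRTC uses as the source file name in diagnostics and in the line/debug info emitted under -lineinfo / -G, so naming the program "__tmp_kernel<id>.cu" lets a debugger or profiler resolve source lines against the .cu file that DebugDumpOption::DebugInfo now dumps to disk.

#include <nvrtc.h>

#include <sstream>
#include <string>

// Hypothetical stand-alone helper, not part of the patch.
nvrtcProgram createNamedProgram(const std::string& code, int id) {
  // Same naming scheme as the dumped file: __tmp_kernel<id>.cu
  std::stringstream ss;
  ss << "__tmp_kernel" << id << ".cu";

  nvrtcProgram program = nullptr;
  nvrtcResult result = nvrtcCreateProgram(
      &program,
      code.c_str(),     // NUL-terminated CUDA C++ source
      ss.str().c_str(), // program name; the patch passes this instead of nullptr
      0,                // no headers
      nullptr,
      nullptr);
  (void)result; // real code checks this, e.g. via AT_CUDA_NVRTC_CHECK
  return program;
}

With the rest of the patch, the new option is expected to be enabled like the other dump tokens, i.e. by adding debug_info to the dump-options environment variable that parseDebugDumpOptions() reads (PYTORCH_NVFUSER_DUMP in this codebase); that both writes __tmp_kernel<fusion_id>.cu to disk and compiles the kernel with -lineinfo, -G, and --dopt=on.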