Skip to content

Commit

Permalink
Improve and expose cpp_backtrace to python binding (pytorch#84896)
Browse files Browse the repository at this point in the history
We can now get the C++ stack trace by calling `torch.utils.get_cpp_backtrace()`.

Sample output when calling from a torch_dispatch stack:
```
<omitting python frames>
frame #23: torch::handle_torch_function_no_python_arg_parser(c10::ArrayRef<pybind11::handle>, _object*, _object*, char const*, _object*, char const*, torch::TorchFunctionName) (0x7f69330bab90 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/utils/python_arg_parser.cpp:323)
frame #24: <unknown function> (0x7f6932a09e79 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/python_variable.cpp:2252)
frame #25: <unknown function> (0x7f69261aee33 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/PythonFallbackKernel.cpp:56)
frame #26: <unknown function> (0x7f69261afef9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:19)
frame #27: c10::BoxedKernel::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadced in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:41)
frame #28: <unknown function> (0x7f6926fae9b9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/boxing.h:227)
frame #29: at::Tensor c10::Dispatcher::redispatch<at::Tensor, at::Tensor const&>(c10::TypedOperatorHandle<at::Tensor (at::Tensor const&)> const&, c10::DispatchKeySet, at::Tensor const&) const (0x7f6926e821f5 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:106)
frame #30: at::_ops::alias::redispatch(c10::DispatchKeySet, at::Tensor const&) (0x7f6927142c31 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:438)
frame #31: <unknown function> (0x7f692ae4f8be in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp:1361)
frame #32: <unknown function> (0x7f692ae4f9b1 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp:1362)
frame #33: <unknown function> (0x7f692aef77e9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h:13)
frame #34: <unknown function> (0x7f6926fae7d8 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:50)
frame #35: at::Tensor c10::Dispatcher::redispatch<at::Tensor, at::Tensor const&>(c10::TypedOperatorHandle<at::Tensor (at::Tensor const&)> const&, c10::DispatchKeySet, at::Tensor const&) const (0x7f6926e821c9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:97)
frame #36: at::_ops::alias::redispatch(c10::DispatchKeySet, at::Tensor const&) (0x7f6927142c31 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:438)
frame #37: <unknown function> (0x7f6929ec654a in /fsx/users/bahuang/repos/pytorch_fsx/build/aten/src/ATen/RedispatchFunctions.h:10697)
frame #38: <unknown function> (0x7f6929d9edae in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/VariableType_1.cpp:2837)
frame #39: <unknown function> (0x7f6929d9f043 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/VariableType_1.cpp:2838)
frame #40: <unknown function> (0x7f6929e7d2f9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h:13)
frame #41: <unknown function> (0x7f6929eb1344 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:478)
frame #42: <unknown function> (0x7f6929ea7b99 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:490)
frame #43: <unknown function> (0x7f6929e7d370 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:563)
frame #44: <unknown function> (0x7f6929e7d43a in /fsx/users/bahuang/repos/pytorch_fsx/c10/util/C++17.h:239)
frame #45: <unknown function> (0x7f6929e7d48c in /fsx/users/bahuang/repos/pytorch_fsx/c10/util/C++17.h:364)
frame #46: <unknown function> (0x7f6929e7d50a in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:554)
frame #47: c10::BoxedKernel::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadced in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:41)
frame #48: c10::KernelFunction::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadd26 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:43)
frame #49: c10::Dispatcher::redispatchBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f692603890a in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:652)
frame #50: <unknown function> (0x7f69260387f9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:388)
frame #51: <unknown function> (0x7f69261af0ef in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/PythonFallbackKernel.cpp:96)
frame #52: <unknown function> (0x7f69261aff2b in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:25)
frame #53: c10::BoxedKernel::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadced in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:41)
frame #54: c10::KernelFunction::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadd26 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:43)
frame #55: c10::Dispatcher::callBoxed(c10::OperatorHandle const&, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6925fd6ab2 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:628)
frame #56: <unknown function> (0x7f6925fd6690 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:376)
frame #57: <unknown function> (0x7f692bf5b525 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:380)
frame #58: <unknown function> (0x7f692bf59fac in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/runtime/register_c10_ops.cpp:15)
frame #59: <unknown function> (0x7f692bf5af41 in /usr/include/c++/7/bits/std_function.h:316)
frame #60: std::function<void (std::vector<c10::IValue, std::allocator<c10::IValue> >&)>::operator()(std::vector<c10::IValue, std::allocator<c10::IValue> >&) const (0x7f6932ab9a0f in /usr/include/c++/7/bits/std_function.h:706)
frame #61: <unknown function> (0x7f6932aad541 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/stack.h:41)
frame #62: <unknown function> (0x7f6932ab3102 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/python/pybind_utils.h:1206 (discriminator 1))
frame #63: <unknown function> (0x7f6932ab3943 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/python/pybind_utils.h:1272)
frame #64: <unknown function> (0x7f6932a46120 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/python/init.cpp:1767)
frame #65: <unknown function> (0x7f6932a997be in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/cast.h:1441)
frame #66: <unknown function> (0x7f6932a8a985 in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/cast.h:1410)
frame #67: <unknown function> (0x7f6932a66e1e in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/pybind11.h:249)
frame #68: <unknown function> (0x7f6932a66ec2 in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/pybind11.h:224)
frame #69: <unknown function> (0x7f6932473111 in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/pybind11.h:929)
frame #104: __libc_start_main (0x7f693485dc87 in /build/glibc-uZu3wS/glibc-2.27/csu/../csu/libc-start.c:310)
```

Pull Request resolved: pytorch#84896
Approved by: https://github.com/ezyang
  • Loading branch information
SherlockNoMad authored and pytorchmergebot committed Sep 21, 2022
1 parent 52fd7e4 commit 73fbca1
Show file tree
Hide file tree
Showing 6 changed files with 121 additions and 7 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -924,6 +924,7 @@ add-auto-load-safe-path /path/to/pytorch/.gdbinit
### C++ stacktraces
Set `TORCH_SHOW_CPP_STACKTRACES=1` to get the C++ stacktrace when an error occurs in Python.
Set `TORCH_SHOW_CPP_STACKTRACES_WITH_LINENO=1` to get the C++ stacktrace with source file names and line numbers (resolved via `addr2line`).
## CUDA development tips
Expand Down
100 changes: 93 additions & 7 deletions c10/util/Backtrace.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <c10/util/Backtrace.h>
#include <c10/util/Optional.h>
#include <c10/util/Type.h>
#include <c10/util/env.h>
#include <c10/util/irange.h>

#include <functional>
Expand All @@ -21,7 +22,14 @@
#include <dlfcn.h>
#include <unwind.h>
#else
#include <dlfcn.h>
#include <execinfo.h>

#ifndef __APPLE__
// link.h is not available on iOS and Mac builds
#include <link.h>
#endif

#endif
#endif

Expand Down Expand Up @@ -87,6 +95,46 @@ void dump_stack(
#if SUPPORTS_BACKTRACE
namespace {

#if !defined(C10_ANDROID) && !defined(__APPLE__)

// converts a function's address in memory to its VMA address in the executable
// file. VMA is what addr2line expects
// Converts a function's address in memory to its VMA (virtual memory
// address) in the executable file. VMA is what addr2line expects.
size_t ConvertToVMA(size_t addr) {
  Dl_info info;
  link_map* link_map = nullptr;
  // dladdr1 with RTLD_DL_LINKMAP also yields the link_map entry whose
  // l_addr is the base address the object was loaded at.  It returns 0 on
  // failure and leaves the out-parameters untouched, so the result must be
  // checked before dereferencing link_map.
  if (dladdr1((void*)addr, &info, (void**)&link_map, RTLD_DL_LINKMAP) == 0 ||
      link_map == nullptr) {
    // Lookup failed (address not inside any loaded object); fall back to
    // the raw address so the caller still produces a usable frame entry.
    return addr;
  }
  return addr - link_map->l_addr;
}

// Runs `cmd` through popen() and returns everything it writes to stdout.
// Throws std::runtime_error if the pipe cannot be created.
std::string exec(const char* cmd) {
  std::unique_ptr<FILE, decltype(&pclose)> stream(popen(cmd, "r"), pclose);
  if (stream == nullptr) {
    throw std::runtime_error("popen() failed!");
  }
  std::string output;
  char chunk[128];
  // Accumulate the child's stdout chunk by chunk until EOF.
  while (fgets(chunk, sizeof(chunk), stream.get()) != nullptr) {
    output.append(chunk);
  }
  return output;
}

// Returns a copy of `s` with all trailing whitespace removed; an
// all-whitespace (or empty) input yields the empty string.
std::string rstrip(const std::string& s) {
  const auto last = s.find_last_not_of(" \n\r\t\f\v");
  if (last == std::string::npos) {
    return "";
  }
  return s.substr(0, last + 1);
}

// Whether source file/line resolution via addr2line is enabled.  Driven by
// the TORCH_SHOW_CPP_STACKTRACES_WITH_LINENO environment variable; the
// lookup happens once and the answer is cached for the process lifetime.
bool use_addr2line() {
  static const bool enabled =
      c10::utils::check_env("TORCH_SHOW_CPP_STACKTRACES_WITH_LINENO") == true;
  return enabled;
}

#endif // !defined(C10_ANDROID) && !defined(__APPLE__)

struct FrameInformation {
/// If available, the demangled name of the function at this frame, else
/// whatever (possibly mangled) name we got from `backtrace()`.
Expand All @@ -99,6 +147,10 @@ struct FrameInformation {
/// NOTE: In debugger parlance, the "object file" refers to the ELF file that
/// the symbol originates from, i.e. either an executable or a library.
std::string object_file;
/// Source file name and line number
std::string source_file_lineno;

bool is_python_frame;
};

#ifndef C10_ANDROID
Expand All @@ -108,7 +160,8 @@ bool is_python_frame(const FrameInformation& frame) {
}

c10::optional<FrameInformation> parse_frame_information(
const std::string& frame_string) {
const std::string& frame_string,
void* frame_pointer) {
FrameInformation frame;

// This is the function name in the CXX ABI mangled format, e.g. something
Expand Down Expand Up @@ -141,6 +194,7 @@ c10::optional<FrameInformation> parse_frame_information(
frame.object_file = frame_string.substr(0, function_name_start - 1);
frame.offset_into_function =
frame_string.substr(offset_start, offset_end - offset_start);
frame.is_python_frame = is_python_frame(frame);

// NOTE: We don't need to parse the return address because
// we already have it from the call to `backtrace()`.
Expand Down Expand Up @@ -171,6 +225,30 @@ c10::optional<FrameInformation> parse_frame_information(
}

frame.function_name = demangle(mangled_function_name.c_str());

#if !defined(__APPLE__)

if (use_addr2line() && !frame.is_python_frame) {
Dl_info info;
if (dladdr(frame_pointer, &info)) {
char command[256];
size_t VMA_addr = ConvertToVMA((size_t)frame_pointer);
// Need to decrease the VMA address by 1 to get the correct line number
// https://stackoverflow.com/questions/11579509/wrong-line-numbers-from-addr2line/63841497#63841497
VMA_addr -= 1;
snprintf(
command,
sizeof(command),
"addr2line -e %s -C %zx",
info.dli_fname,
VMA_addr);

frame.source_file_lineno = rstrip(exec(command));
}
}

#endif // !defined(__APPLE__)

return frame;
}
#endif /* !defined(C10_ANDROID) */
Expand Down Expand Up @@ -283,9 +361,10 @@ std::string get_backtrace(
bool has_skipped_python_frames = false;

for (const auto frame_number : c10::irange(callstack.size())) {
const auto frame = parse_frame_information(symbols[frame_number]);
const auto frame =
parse_frame_information(symbols[frame_number], callstack[frame_number]);

if (skip_python_frames && frame && is_python_frame(*frame)) {
if (skip_python_frames && frame && frame->is_python_frame) {
if (!has_skipped_python_frames) {
stream << "<omitting python frames>\n";
has_skipped_python_frames = true;
Expand All @@ -297,10 +376,17 @@ std::string get_backtrace(
stream << "frame #" << frame_number << ": ";

if (frame) {
// <function_name> + <offset> (<return-address> in <object-file>)
stream << frame->function_name << " + " << frame->offset_into_function
<< " (" << callstack[frame_number] << " in " << frame->object_file
<< ")\n";
if (frame->source_file_lineno.empty()) {
// <function_name> + <offset> (<return-address> in <object-file>)
stream << frame->function_name << " + " << frame->offset_into_function
<< " (" << callstack[frame_number] << " in "
<< frame->object_file << ")\n";

} else {
// <function_name> (<return-address> in <filename>:<line-number>)
stream << frame->function_name << " (" << callstack[frame_number]
<< " in " << frame->source_file_lineno << ")\n";
}
} else {
// In the edge-case where we couldn't parse the frame string, we can
// just use it directly (it may have a different format).
Expand Down
1 change: 1 addition & 0 deletions torch/_C/__init__.pyi.in
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,7 @@ def _remove_meta_from_tls_dispatch_include() -> None: ...
# https://code.activestate.com/lists/python-dev/139675/
def _to_dlpack(data: Tensor) -> Any: ... # THPModule_toDLPack
def _from_dlpack(data: Any) -> Tensor: ... # THPModule_fromDLPack
def _get_cpp_backtrace(frames_to_skip: _int, maximum_number_of_frames: _int) -> str: ... # THPModule_getCppBacktrace
def set_flush_denormal(arg: _bool) -> _bool: ... # THPModule_setFlushDenormal
def get_default_dtype() -> _dtype: ... # THPModule_getDefaultDtype
def _get_default_device() -> str: ... # THPModule_getDefaultDevice
Expand Down
14 changes: 14 additions & 0 deletions torch/csrc/Module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,19 @@ PyObject* THPModule_fromDLPack(PyObject* _unused, PyObject* data) {
END_HANDLE_TH_ERRORS
}

// Python binding for torch._C._get_cpp_backtrace(frames_to_skip,
// maximum_number_of_frames) -> str.  Returns the C++ backtrace of the
// calling thread, skipping python frames.
PyObject* THModule_getCppBacktrace(PyObject* _unused, PyObject* args) {
  HANDLE_TH_ERRORS
  // The "L" format unit writes a `long long`; passing size_t* (as the
  // original did) is undefined behavior per the CPython argument-parsing
  // contract, so parse into long long and cast afterwards.
  long long frames_to_skip = 0;
  long long maximum_number_of_frames = 0;
  if (!PyArg_ParseTuple(
          args, "LL", &frames_to_skip, &maximum_number_of_frames)) {
    return nullptr;
  }
  return THPUtils_packString(c10::get_backtrace(
      static_cast<size_t>(frames_to_skip),
      static_cast<size_t>(maximum_number_of_frames),
      /*skip_python_frames=*/true));
  END_HANDLE_TH_ERRORS
}

PyObject* THPModule_setAllowTF32CuDNN(PyObject* _unused, PyObject* arg) {
THPUtils_assert(
PyBool_Check(arg),
Expand Down Expand Up @@ -866,6 +879,7 @@ static PyMethodDef TorchMethods[] = {
nullptr},
{"_to_dlpack", THPModule_toDLPack, METH_O, nullptr},
{"_from_dlpack", THPModule_fromDLPack, METH_O, nullptr},
{"_get_cpp_backtrace", THModule_getCppBacktrace, METH_VARARGS, nullptr},
{"set_flush_denormal", THPModule_setFlushDenormal, METH_O, nullptr},
{"get_default_dtype", THPModule_getDefaultDtype, METH_NOARGS, nullptr},
{"_get_default_device", THPModule_getDefaultDevice, METH_NOARGS, nullptr},
Expand Down
1 change: 1 addition & 0 deletions torch/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from .throughput_benchmark import ThroughputBenchmark
from ._crash_handler import enable_minidumps, disable_minidumps, enable_minidumps_on_exceptions
from .cpp_backtrace import get_cpp_backtrace

# Set the module for a given object for nicer printing
def set_module(obj, mod):
Expand Down
11 changes: 11 additions & 0 deletions torch/utils/cpp_backtrace.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from torch._C import _get_cpp_backtrace


def get_cpp_backtrace(frames_to_skip: int = 0, maximum_number_of_frames: int = 64) -> str:
    r"""Returns a string containing the C++ stack trace of the current thread.

    Args:
        frames_to_skip (int): the number of frames to skip from the top of the stack
        maximum_number_of_frames (int): the maximum number of frames to return

    Returns:
        str: the formatted C++ backtrace, one ``frame #N: ...`` entry per line.
    """
    return _get_cpp_backtrace(frames_to_skip, maximum_number_of_frames)

0 comments on commit 73fbca1

Please sign in to comment.