diff --git a/.github/actions/install-all-dependences/action.yml b/.github/actions/install-all-dependences/action.yml index f507aa4070..8ff502f81a 100644 --- a/.github/actions/install-all-dependences/action.yml +++ b/.github/actions/install-all-dependences/action.yml @@ -2,7 +2,7 @@ runs: using: "composite" steps: - uses: ./.github/actions/install-essential-dependences - - run: sudo apt-get install -y libgoogle-glog-dev automake bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1 libibverbs-dev + - run: sudo apt-get install -y libgoogle-glog-dev automake bison flex libboost-all-dev libevent-dev libtool pkg-config libibverbs1 libibverbs-dev libunwind8-dev shell: bash - run: wget https://archive.apache.org/dist/thrift/0.11.0/thrift-0.11.0.tar.gz && tar -xf thrift-0.11.0.tar.gz shell: bash diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 14961ebb3b..801bc5e377 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -14,172 +14,199 @@ env: proc_num: $(nproc) jobs: - gcc-compile-with-make: - runs-on: ubuntu-20.04 # https://github.com/actions/runner-images - steps: - - uses: actions/checkout@v2 - - uses: ./.github/actions/install-essential-dependences - - uses: ./.github/actions/init-make-config - with: - options: --cc=gcc --cxx=g++ - - name: compile - run: | - make -j ${{env.proc_num}} - - gcc-compile-with-cmake: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: ./.github/actions/install-essential-dependences - - name: cmake - run: | - export CC=gcc && export CXX=g++ - mkdir build - cd build - cmake .. - - name: compile - run: | - cd build - make -j ${{env.proc_num}} - - gcc-compile-with-bazel: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - run: bazel test --verbose_failures -- //... -//example/... - - gcc-compile-with-boringssl: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - run: bazel test --verbose_failures --define with_mesalink=false --define with_glog=true --define with_thrift=true --define BRPC_WITH_BORINGSSL=true -- //... -//example/... - - gcc-compile-with-make-all-options: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: ./.github/actions/install-all-dependences - - uses: ./.github/actions/init-make-config - with: - options: --cc=gcc --cxx=g++ --with-thrift --with-glog --with-rdma --with-debug-bthread-sche-safety --with-debug-lock - - name: compile - run: | - make -j ${{env.proc_num}} - - gcc-compile-with-cmake-all-options: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: ./.github/actions/install-all-dependences - - name: cmake - run: | - export CC=gcc && export CXX=g++ - mkdir build - cd build - cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON .. - - name: compile - run: | - cd build - make -j ${{env.proc_num}} - - gcc-compile-with-bazel-all-options: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - run: bazel test --verbose_failures --define with_mesalink=false --define with_glog=true --define with_thrift=true --define with_debug_bthread_sche_safety=true --define with_debug_lock=true -- //... -//example/... - - clang-compile-with-make: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: ./.github/actions/install-essential-dependences - - uses: ./.github/actions/init-make-config - with: - options: --cc=clang --cxx=clang++ - - name: compile - run: | - make -j ${{env.proc_num}} - - clang-compile-with-cmake: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: ./.github/actions/install-essential-dependences - - name: cmake - run: | - export CC=clang && export CXX=clang++ - mkdir build - cd build - cmake .. - - name: compile - run: | - cd build - make -j ${{env.proc_num}} - - clang-compile-with-bazel: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - run: bazel build --verbose_failures --action_env=CC=clang-12 -- //... -//example/... - - clang-compile-with-boringssl: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - run: bazel build --verbose_failures --action_env=CC=clang-12 --define with_mesalink=false --define with_glog=true --define with_thrift=true --define BRPC_WITH_BORINGSSL=true -- //... -//example/... - - clang-compile-with-make-all-options: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: ./.github/actions/install-all-dependences - - uses: ./.github/actions/init-make-config - with: - options: --cc=clang --cxx=clang++ --with-thrift --with-glog --with-rdma --with-debug-bthread-sche-safety --with-debug-lock - - name: compile - run: | - make -j ${{env.proc_num}} - - clang-compile-with-cmake-all-options: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - uses: ./.github/actions/install-all-dependences - - name: cmake - run: | - export CC=clang && export CXX=clang++ - mkdir build - cd build - cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON .. - - name: compile - run: | - cd build - make -j ${{env.proc_num}} - - clang-compile-with-bazel-all-options: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v2 - - run: bazel build --verbose_failures --action_env=CC=clang-12 --define with_mesalink=false --define with_glog=true --define with_thrift=true --define with_debug_bthread_sche_safety=true --define with_debug_lock=true -- //... -//example/... - clang-unittest: runs-on: ubuntu-20.04 steps: - - uses: actions/checkout@v2 - - uses: ./.github/actions/install-essential-dependences - - name: install gtest - run: | - sudo apt-get update - sudo apt-get install -y cmake libgtest-dev gdb - cd /usr/src/gtest && sudo cmake . && sudo make && sudo mv lib/libgtest* /usr/lib/ - - uses: ./.github/actions/init-make-config - with: - options: --cc=clang --cxx=clang++ - - name: compile tests - run: | - cd test - make -j ${{env.proc_num}} - - name: run tests - run: | - cd test - sh ./run_tests.sh + - uses: actions/checkout@v2 + - uses: ./.github/actions/install-essential-dependences + - name: install gtest + run: | + sudo git clone https://github.com/libunwind/libunwind.git && pwd && cd libunwind && sudo mkdir output && sudo autoreconf -i && sudo CC=clang CXX=clang++ ./configure --prefix=`pwd`/output && sudo make && sudo make install + sudo apt-get update + sudo apt-get install -y cmake libgtest-dev gdb libstdc++6-9-dbg + cd /usr/src/gtest && sudo cmake . && sudo make && sudo mv lib/libgtest* /usr/lib/ + - uses: ./.github/actions/init-make-config + with: + options: --cc=clang --cxx=clang++ --with-bthread-tracer + - name: compile tests + run: | + cd test + make -j ${{env.proc_num}} bthread_unittest + - name: run tests + run: | + cat config.mk + cd test + pwd + echo "libunwind" && ldd `pwd`/../libunwind/output/lib/libunwind.so + echo "brpc" && ldd libbrpc.dbg.so + echo "bthread_unittest" && ldd bthread_unittest + sh ./run_tests.sh +# gcc-compile-with-make: +# runs-on: ubuntu-20.04 # https://github.com/actions/runner-images +# steps: +# - uses: actions/checkout@v2 +# - uses: ./.github/actions/install-essential-dependences +# - uses: ./.github/actions/init-make-config +# with: +# options: --cc=gcc --cxx=g++ +# - name: compile +# run: | +# make -j ${{env.proc_num}} +# +# gcc-compile-with-cmake: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - uses: ./.github/actions/install-essential-dependences +# - name: cmake +# run: | +# export CC=gcc && export CXX=g++ +# mkdir build +# cd build +# cmake .. +# - name: compile +# run: | +# cd build +# make -j ${{env.proc_num}} +# +# gcc-compile-with-bazel: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - run: bazel test --verbose_failures -- //... -//example/... +# +# gcc-compile-with-boringssl: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - run: bazel test --verbose_failures --define with_mesalink=false --define with_glog=true --define with_thrift=true --define BRPC_WITH_BORINGSSL=true -- //... -//example/... +# +# gcc-compile-with-make-all-options: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - uses: ./.github/actions/install-all-dependences +# - uses: ./.github/actions/init-make-config +# with: +# options: --cc=gcc --cxx=g++ --with-thrift --with-glog --with-rdma --with-debug-bthread-sche-safety --with-debug-lock --with-bthread-tracer +# - name: compile +# run: | +# make -j ${{env.proc_num}} +# +# gcc-compile-with-cmake-all-options: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - uses: ./.github/actions/install-all-dependences +# - name: cmake +# run: | +# export CC=gcc && export CXX=g++ +# mkdir build +# cd build +# cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON -WITH_BTHREAD_TRACER=ON .. +# - name: compile +# run: | +# cd build +# make -j ${{env.proc_num}} +# +# gcc-compile-with-bazel-all-options: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - run: bazel test --verbose_failures --define with_mesalink=false --define with_glog=true --define with_thrift=true --define with_debug_bthread_sche_safety=true --define with_debug_lock=true -- //... -//example/... +# +# clang-compile-with-make: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - uses: ./.github/actions/install-essential-dependences +# - uses: ./.github/actions/init-make-config +# with: +# options: --cc=clang --cxx=clang++ +# - name: compile +# run: | +# make -j ${{env.proc_num}} +# +# clang-compile-with-cmake: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - uses: ./.github/actions/install-essential-dependences +# - name: cmake +# run: | +# export CC=clang && export CXX=clang++ +# mkdir build +# cd build +# cmake .. +# - name: compile +# run: | +# cd build +# make -j ${{env.proc_num}} +# +# clang-compile-with-bazel: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - run: bazel build --verbose_failures --action_env=CC=clang-12 -- //... -//example/... +# +# clang-compile-with-boringssl: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - run: bazel build --verbose_failures --action_env=CC=clang-12 --define with_mesalink=false --define with_glog=true --define with_thrift=true --define BRPC_WITH_BORINGSSL=true -- //... -//example/... +# +# clang-compile-with-make-all-options: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - uses: ./.github/actions/install-all-dependences +# - uses: ./.github/actions/init-make-config +# with: +# options: --cc=clang --cxx=clang++ --with-thrift --with-glog --with-rdma --with-debug-bthread-sche-safety --with-debug-lock --with-bthread-tracer +# - name: compile +# run: | +# make -j ${{env.proc_num}} +# +# clang-compile-with-cmake-all-options: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - uses: ./.github/actions/install-all-dependences +# - name: cmake +# run: | +# export CC=clang && export CXX=clang++ +# mkdir build +# cd build +# cmake -DWITH_MESALINK=OFF -DWITH_GLOG=ON -DWITH_THRIFT=ON -DWITH_RDMA=ON -DWITH_DEBUG_BTHREAD_SCHE_SAFETY=ON -DWITH_DEBUG_LOCK=ON -WITH_BTHREAD_TRACER=ON .. +# - name: compile +# run: | +# cd build +# make -j ${{env.proc_num}} +# +# clang-compile-with-bazel-all-options: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - run: bazel build --verbose_failures --action_env=CC=clang-12 --define with_mesalink=false --define with_glog=true --define with_thrift=true --define with_debug_bthread_sche_safety=true --define with_debug_lock=true -- //... -//example/... +# +# clang-unittest: +# runs-on: ubuntu-20.04 +# steps: +# - uses: actions/checkout@v2 +# - uses: ./.github/actions/install-essential-dependences +# - name: install gtest +# run: | +# sudo apt-get update +# sudo apt-get install -y cmake libgtest-dev gdb libstdc++6-9-dbg libunwind8-dev +# cd /usr/src/gtest && sudo cmake . && sudo make && sudo mv lib/libgtest* /usr/lib/ +# - uses: ./.github/actions/init-make-config +# with: +# options: --cc=clang --cxx=clang++ --with-bthread-tracer +# - name: compile tests +# run: | +# cd test +# make -j ${{env.proc_num}} +# - name: run tests +# run: | +# cd test +# sh ./run_tests.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index f3688766bb..a6b7716b29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,7 @@ option(WITH_BORINGSSL "With BoringSSL" OFF) option(DEBUG "Print debug logs" OFF) option(WITH_DEBUG_SYMBOLS "With debug symbols" ON) option(WITH_THRIFT "With thrift framed protocol supported" OFF) +option(WITH_BTHREAD_TRACER "With bthread tracer supported" OFF) option(WITH_SNAPPY "With snappy" OFF) option(WITH_RDMA "With RDMA" OFF) option(WITH_DEBUG_BTHREAD_SCHE_SAFETY "With debugging bthread sche safety" OFF) @@ -81,6 +82,20 @@ if(WITH_THRIFT) endif() endif() +if (WITH_BTHREAD_TRACER) + if (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux") OR NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")) + message(FATAL_ERROR "bthread tracer is only supported on Linux x86_64 platform") + endif() + find_path(LIBUNWIND_INCLUDE_PATH NAMES libunwind.h) + find_library(LIBUNWIND_LIB NAMES unwind) + find_library(LIBUNWIND_X86_64_LIB NAMES unwind-x86_64) + if (NOT LIBUNWIND_INCLUDE_PATH OR NOT LIBUNWIND_LIB) + message(FATAL_ERROR "Fail to find libunwind, which is needed by bthread tracer") + endif() + add_definitions(-DBRPC_BTHREAD_TRACER) + include_directories(${LIBUNWIND_INCLUDE_PATH}) +endif () + set(WITH_RDMA_VAL "0") if(WITH_RDMA) set(WITH_RDMA_VAL "1") @@ -319,6 +334,11 @@ if(WITH_SNAPPY) set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lsnappy") endif() +if (WITH_BTHREAD_TRACER) + set(DYNAMIC_LIB ${DYNAMIC_LIB} ${LIBUNWIND_LIB} ${LIBUNWIND_X86_64_LIB}) + set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lunwind -lunwind-x86_64") +endif() + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") set(DYNAMIC_LIB ${DYNAMIC_LIB} rt) set(BRPC_PRIVATE_LIBS "${BRPC_PRIVATE_LIBS} -lrt") diff --git a/config_brpc.sh b/config_brpc.sh index fa607292de..75916df615 100755 --- a/config_brpc.sh +++ b/config_brpc.sh @@ -38,11 +38,12 @@ else LDD=ldd fi -TEMP=`getopt -o v: --long headers:,libs:,cc:,cxx:,with-glog,with-thrift,with-rdma,with-mesalink,with-debug-bthread-sche-safety,with-debug-lock,nodebugsymbols -n 'config_brpc' -- "$@"` +TEMP=`getopt -o v: --long headers:,libs:,cc:,cxx:,with-glog,with-thrift,with-rdma,with-mesalink,with-bthread-tracer,with-debug-bthread-sche-safety,with-debug-lock,nodebugsymbols -n 'config_brpc' -- "$@"` WITH_GLOG=0 WITH_THRIFT=0 WITH_RDMA=0 WITH_MESALINK=0 +WITH_BTHREAD_TRACER=0 BRPC_DEBUG_BTHREAD_SCHE_SAFETY=0 DEBUGSYMBOLS=-g BRPC_DEBUG_LOCK=0 @@ -69,6 +70,7 @@ while true; do --with-thrift) WITH_THRIFT=1; shift 1 ;; --with-rdma) WITH_RDMA=1; shift 1 ;; --with-mesalink) WITH_MESALINK=1; shift 1 ;; + --with-bthread-tracer) WITH_BTHREAD_TRACER=1; shift 1 ;; --with-debug-bthread-sche-safety ) BRPC_DEBUG_BTHREAD_SCHE_SAFETY=1; shift 1 ;; --with-debug-lock ) BRPC_DEBUG_LOCK=1; shift 1 ;; --nodebugsymbols ) DEBUGSYMBOLS=; shift 1 ;; @@ -352,8 +354,8 @@ fi LEVELDB_HDR=$(find_dir_of_header_or_die leveldb/db.h) -HDRS=$($ECHO "$GFLAGS_HDR\n$PROTOBUF_HDR\n$ABSL_HDR\n$LEVELDB_HDR\n$OPENSSL_HDR" | sort | uniq) -LIBS=$($ECHO "$GFLAGS_LIB\n$PROTOBUF_LIB\n$ABSL_LIB\n$LEVELDB_LIB\n$OPENSSL_LIB\n$SNAPPY_LIB" | sort | uniq) +HDRS=$($ECHO "$(pwd)/libunwind/output/include\n$GFLAGS_HDR\n$PROTOBUF_HDR\n$ABSL_HDR\n$LEVELDB_HDR\n$OPENSSL_HDR" | sort | uniq) +LIBS=$($ECHO "$(pwd)/libunwind/output/lib/\n$GFLAGS_LIB\n$PROTOBUF_LIB\n$ABSL_LIB\n$LEVELDB_LIB\n$OPENSSL_LIB\n$SNAPPY_LIB" | sort | uniq) absent_in_the_list() { TMP=`$ECHO "$1\n$2" | sort | uniq` @@ -465,6 +467,27 @@ if [ $WITH_RDMA != 0 ]; then append_to_output "WITH_RDMA=1" fi +#if [ $WITH_BTHREAD_TRACER != 0 ]; then +# if [ "$SYSTEM" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then +# >&2 $ECHO "bthread tracer is only supported on Linux x86_64 platform" +# exit 1 +# fi +# LIBUNWIND_LIB=$(find_dir_of_lib_or_die unwind) +# LIBUNWIND_HDR=$(find_dir_of_header_or_die libunwind.h) +# append_to_output_libs "$LIBUNWIND_LIB" +# append_to_output_headers "$LIBUNWIND_HDR" +# +# CPPFLAGS="${CPPFLAGS} -DBRPC_BTHREAD_TRACER" +# +# if [ -f "$LIBUNWIND_LIB/libunwind.$SO" ]; then +# append_to_output "DYNAMIC_LINKINGS+=-lunwind -lunwind-x86_64" +# else +# append_to_output "STATIC_LINKINGS+=-lunwind -lunwind-x86_64" +# fi +#fi +CPPFLAGS="${CPPFLAGS} -DBRPC_BTHREAD_TRACER" +append_to_output "DYNAMIC_LINKINGS+=-lunwind -lunwind-x86_64" + if [ $WITH_MESALINK != 0 ]; then CPPFLAGS="${CPPFLAGS} -DUSE_MESALINK" fi diff --git a/docs/cn/getting_started.md b/docs/cn/getting_started.md index 2a14bc2e02..e52cc6f356 100644 --- a/docs/cn/getting_started.md +++ b/docs/cn/getting_started.md @@ -358,7 +358,7 @@ pb 3.x中的Arena至今没被支持。 ## tcmalloc: 1.7-2.5 -brpc默认**不**链接 [tcmalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html)。用户按需要链接tcmalloc。 +bRPC默认**不**链接 [tcmalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html)。用户按需要链接tcmalloc。 和glibc内置的ptmalloc相比,tcmalloc通常能提升性能。然而不同版本的tcmalloc可能表现迥异。例如:tcmalloc 2.1与 tcmalloc 1.7和2.5相比,可能会让brpc的多线程样例性能显著恶化(tcmalloc中的一个自旋锁导致的)。甚至不同的小版本号之间变现也可能不同。当你的程序表现不符合预期的时候,移除tcmalloc然后尝试其他版本。 @@ -386,6 +386,10 @@ brpc会自动检测valgrind(然后注册bthread的栈)。不支持老版本 无已知问题。 +## libunwind: 1.1-1.8.1 + +bRPC默认**不**链接 [libunwind](https://github.com/libunwind/libunwind)。Users need to link libunwind only if they need bthread tracer. + # 实例追踪 我们提供了一个程序去帮助你追踪和监控所有brpc实例。 只需要在某处运行 [trackme_server](https://github.com/apache/brpc/tree/master/tools/trackme_server/) 然后再带着 -trackme_server=SERVER参数启动需要被追踪的实例。trackme_server将从实例周期性地收到ping消息然后打印日志。您可以从日志中聚合实例地址,并调用实例的内置服务以获取更多信息。 diff --git a/docs/en/getting_started.md b/docs/en/getting_started.md index 317a9f7c6a..e877d96481 100644 --- a/docs/en/getting_started.md +++ b/docs/en/getting_started.md @@ -380,6 +380,10 @@ brpc detects valgrind automatically (and registers stacks of bthread). Older val ## thrift: 0.9.3-0.11.0 +## libunwind: 1.1-1.8.1 + +brpc does **not** link [libunwind](https://github.com/libunwind/libunwind) by default. Users link libunwind on-demand. + no known issues. # Track instances diff --git a/src/brpc/builtin/bthreads_service.cpp b/src/brpc/builtin/bthreads_service.cpp index 209f7a9934..f65ff347f0 100644 --- a/src/brpc/builtin/bthreads_service.cpp +++ b/src/brpc/builtin/bthreads_service.cpp @@ -24,6 +24,9 @@ namespace bthread { void print_task(std::ostream& os, bthread_t tid); +#ifdef BRPC_BTHREAD_TRACER +void stack_trace(std::ostream& os, bthread_t tid); +#endif // BRPC_BTHREAD_TRACER } @@ -40,12 +43,25 @@ void BthreadsService::default_method(::google::protobuf::RpcController* cntl_bas const std::string& constraint = cntl->http_request().unresolved_path(); if (constraint.empty()) { +#ifdef BRPC_BTHREAD_TRACER + os << "Use /bthreads/ or /bthreads/?st=1 for stack trace"; +#else os << "Use /bthreads/"; +#endif // BRPC_BTHREAD_TRACER } else { char* endptr = NULL; bthread_t tid = strtoull(constraint.c_str(), &endptr, 10); - if (*endptr == '\0' || *endptr == '/') { + if (*endptr == '\0' || *endptr == '/' || *endptr == '?') { ::bthread::print_task(os, tid); + +#ifdef BRPC_BTHREAD_TRACER + const std::string* st = cntl->http_request().uri().GetQuery("st"); + LOG(INFO) << "st=" << (st ? *st : "NULL"); + if (NULL != st && *st == "1") { + os << "\nbthread call stack:\n"; + ::bthread::stack_trace(os, tid); + } +#endif // BRPC_BTHREAD_TRACER } else { cntl->SetFailed(ENOMETHOD, "path=%s is not a bthread id", constraint.c_str()); diff --git a/src/bthread/bthread.cpp b/src/bthread/bthread.cpp index bf2c24f819..4e136c6447 100644 --- a/src/bthread/bthread.cpp +++ b/src/bthread/bthread.cpp @@ -19,6 +19,7 @@ // Date: Tue Jul 10 17:40:58 CST 2012 +#include #include #include "butil/macros.h" // BAIDU_CASSERT #include "butil/logging.h" @@ -115,6 +116,25 @@ inline TaskControl* get_or_new_task_control() { return c; } +#ifdef BRPC_BTHREAD_TRACER +void stack_trace(std::ostream& os, bthread_t tid) { + TaskControl* c = get_task_control(); + if (NULL == c) { + os << "TaskControl has not been created"; + return; + } + c->stack_trace(os, tid); +} + +std::string stack_trace(bthread_t tid) { + TaskControl* c = get_task_control(); + if (NULL == c) { + return "TaskControl has not been created"; + } + return c->stack_trace(tid); +} +#endif // BRPC_BTHREAD_TRACER + static int add_workers_for_each_tag(int num) { int added = 0; auto c = get_task_control(); diff --git a/src/bthread/butex.cpp b/src/bthread/butex.cpp index 25cb953924..c54198c6cf 100644 --- a/src/bthread/butex.cpp +++ b/src/bthread/butex.cpp @@ -283,11 +283,11 @@ inline TaskGroup* get_task_group(TaskControl* c, bthread_tag_t tag) { return is_same_tag(tag) ? tls_task_group : c->choose_one_group(tag); } -inline void run_in_local_task_group(TaskGroup* g, bthread_t tid, bool nosignal) { +inline void run_in_local_task_group(TaskGroup* g, TaskMeta* next_meta, bool nosignal) { if (!nosignal) { - TaskGroup::exchange(&g, tid); + TaskGroup::exchange(&g, next_meta); } else { - g->ready_to_run(tid, nosignal); + g->ready_to_run(next_meta, nosignal); } } @@ -311,9 +311,9 @@ int butex_wake(void* arg, bool nosignal) { unsleep_if_necessary(bbw, get_global_timer_thread()); TaskGroup* g = get_task_group(bbw->control, bbw->tag); if (g == tls_task_group) { - run_in_local_task_group(g, bbw->tid, nosignal); + run_in_local_task_group(g, bbw->task_meta, nosignal); } else { - g->ready_to_run_remote(bbw->tid, check_nosignal(nosignal, g->tag())); + g->ready_to_run_remote(bbw->task_meta, check_nosignal(nosignal, g->tag())); } return 1; } @@ -363,7 +363,7 @@ int butex_wake_n(void* arg, size_t n, bool nosignal) { w->RemoveFromList(); unsleep_if_necessary(w, get_global_timer_thread()); auto g = get_task_group(w->control, w->tag); - g->ready_to_run_general(w->tid, true); + g->ready_to_run_general(w->task_meta, true); nwakeups[g->tag()] = g; ++nwakeup; } @@ -375,9 +375,9 @@ int butex_wake_n(void* arg, size_t n, bool nosignal) { } auto g = get_task_group(next->control, next->tag); if (g == tls_task_group) { - run_in_local_task_group(g, next->tid, nosignal); + run_in_local_task_group(g, next->task_meta, nosignal); } else { - g->ready_to_run_remote(next->tid, check_nosignal(nosignal, g->tag())); + g->ready_to_run_remote(next->task_meta, check_nosignal(nosignal, g->tag())); } return nwakeup; } @@ -436,7 +436,7 @@ int butex_wake_except(void* arg, bthread_t excluded_bthread) { w->RemoveFromList(); unsleep_if_necessary(w, get_global_timer_thread()); auto g = get_task_group(w->control, w->tag); - g->ready_to_run_general(w->tid, true); + g->ready_to_run_general(w->task_meta, true); nwakeups[g->tag()] = g; ++nwakeup; } while (!bthread_waiters.empty()); @@ -480,9 +480,9 @@ int butex_requeue(void* arg, void* arg2) { unsleep_if_necessary(bbw, get_global_timer_thread()); auto g = is_same_tag(bbw->tag) ? tls_task_group : NULL; if (g) { - TaskGroup::exchange(&g, front->tid); + TaskGroup::exchange(&g, bbw->task_meta); } else { - bbw->control->choose_one_group(bbw->tag)->ready_to_run_remote(front->tid); + bbw->control->choose_one_group(bbw->tag)->ready_to_run_remote(bbw->task_meta); } return 1; } @@ -520,7 +520,7 @@ inline bool erase_from_butex(ButexWaiter* bw, bool wakeup, WaiterState state) { if (erased && wakeup) { if (bw->tid) { ButexBthreadWaiter* bbw = static_cast(bw); - get_task_group(bbw->control, bbw->tag)->ready_to_run_general(bw->tid); + get_task_group(bbw->control, bbw->tag)->ready_to_run_general(bbw->task_meta); } else { ButexPthreadWaiter* pw = static_cast(bw); wakeup_pthread(pw); @@ -535,7 +535,7 @@ struct WaitForButexArgs { bool prepend; }; -static void wait_for_butex(void* arg) { +void wait_for_butex(void* arg) { auto args = static_cast(arg); ButexBthreadWaiter* const bw = args->bw; Butex* const b = bw->initial_butex; @@ -565,6 +565,9 @@ static void wait_for_butex(void* arg) { b->waiters.Append(bw); } bw->container.store(b, butil::memory_order_relaxed); +#ifdef BRPC_BTHREAD_TRACER + bw->control->_task_tracer.set_status(TASK_STATUS_SUSPENDED, bw->task_meta); +#endif // BRPC_BTHREAD_TRACER if (bw->abstime != NULL) { bw->sleep_id = get_global_timer_thread()->schedule( erase_from_butex_and_wakeup, bw, *bw->abstime); @@ -582,7 +585,7 @@ static void wait_for_butex(void* arg) { // the two functions. The on-stack ButexBthreadWaiter is safe to use and // bw->waiter_state will not change again. // unsleep_if_necessary(bw, get_global_timer_thread()); - tls_task_group->ready_to_run(bw->tid); + tls_task_group->ready_to_run(bw->task_meta); // FIXME: jump back to original thread is buggy. // // Value unmatched or waiter is already woken up by TimerThread, jump @@ -697,7 +700,7 @@ int butex_wait(void* arg, int expected_value, const timespec* abstime, bool prep // release fence matches with acquire fence in interrupt_and_consume_waiters // in task_group.cpp to guarantee visibility of `interrupted'. bbw.task_meta->current_waiter.store(&bbw, butil::memory_order_release); - WaitForButexArgs args{ &bbw, prepend}; + WaitForButexArgs args{ &bbw, prepend }; g->set_remained(wait_for_butex, &args); TaskGroup::sched(&g); diff --git a/src/bthread/execution_queue.cpp b/src/bthread/execution_queue.cpp index 557669ee05..bb01882cb2 100644 --- a/src/bthread/execution_queue.cpp +++ b/src/bthread/execution_queue.cpp @@ -68,7 +68,7 @@ inline ExecutionQueueVars* get_execq_vars() { void ExecutionQueueBase::start_execute(TaskNode* node) { node->next = TaskNode::UNCONNECTED; - node->status = UNEXECUTED; + node->status = TaskNode::UNEXECUTED; node->iterated = false; if (node->high_priority) { // Add _high_priority_tasks before pushing this task into queue to @@ -304,7 +304,7 @@ int ExecutionQueueBase::_execute(TaskNode* head, bool high_priority, int* nitera if (head != NULL && head->stop_task) { CHECK(head->next == NULL); head->iterated = true; - head->status = EXECUTED; + head->status = TaskNode::EXECUTED; TaskIteratorBase iter(NULL, this, true, false); _execute_func(_meta, _type_specific_function, iter); if (niterated) { diff --git a/src/bthread/execution_queue_inl.h b/src/bthread/execution_queue_inl.h index 6472505187..f5998a2628 100644 --- a/src/bthread/execution_queue_inl.h +++ b/src/bthread/execution_queue_inl.h @@ -38,17 +38,17 @@ struct ExecutionQueueId { uint64_t value; }; -enum TaskStatus { - UNEXECUTED = 0, - EXECUTING = 1, - EXECUTED = 2 -}; - struct TaskNode; class ExecutionQueueBase; typedef void (*clear_task_mem)(TaskNode*); struct BAIDU_CACHELINE_ALIGNMENT TaskNode { + enum TaskStatus { + UNEXECUTED = 0, + EXECUTING = 1, + EXECUTED = 2 + }; + TaskNode() : version(0) , status(UNEXECUTED) diff --git a/src/bthread/task_control.cpp b/src/bthread/task_control.cpp index d0549ea9e8..a07219880a 100644 --- a/src/bthread/task_control.cpp +++ b/src/bthread/task_control.cpp @@ -19,6 +19,7 @@ // Date: Tue Jul 10 17:40:58 CST 2012 +#include // SYS_gettid #include "butil/scoped_lock.h" // BAIDU_SCOPED_LOCK #include "butil/errno.h" // berror #include "butil/logging.h" @@ -88,14 +89,25 @@ void* TaskControl::worker_thread(void* arg) { LOG(ERROR) << "Fail to create TaskGroup in pthread=" << pthread_self(); return NULL; } + + g->_tid = syscall(SYS_gettid); + std::string worker_thread_name = butil::string_printf( - "brpc_wkr:%d-%d", g->tag(), c->_next_worker_id.fetch_add(1, butil::memory_order_relaxed)); + "brpc_wkr:%d-%d", g->tag(), + c->_next_worker_id.fetch_add(1, butil::memory_order_relaxed)); butil::PlatformThread::SetName(worker_thread_name.c_str()); - BT_VLOG << "Created worker=" << pthread_self() << " bthread=" << g->main_tid() - << " tag=" << g->tag(); + BT_VLOG << "Created worker=" << pthread_self() << " tid=" << g->_tid + << " bthread=" << g->main_tid() << " tag=" << g->tag(); tls_task_group = g; c->_nworkers << 1; c->tag_nworkers(g->tag()) << 1; + +#ifdef BRPC_BTHREAD_TRACER + if (!c->_task_tracer.RegisterSignalHandler(g->_tid)) { + LOG(ERROR) << "Fail to register signal handler for Tracer"; + } +#endif // BRPC_BTHREAD_TRACER + g->run_main_task(); stat = g->main_stat(); @@ -281,6 +293,16 @@ TaskGroup* TaskControl::choose_one_group(bthread_tag_t tag) { return NULL; } +#ifdef BRPC_BTHREAD_TRACER +void TaskControl::stack_trace(std::ostream& os, bthread_t tid) { + _task_tracer.Trace(os, tid); +} + +std::string TaskControl::stack_trace(bthread_t tid) { + return _task_tracer.Trace(tid); +} +#endif // BRPC_BTHREAD_TRACER + extern int stop_and_join_epoll_threads(); void TaskControl::stop_and_join() { @@ -301,10 +323,15 @@ void TaskControl::stop_and_join() { pl.stop(); } } + +#ifdef BRPC_BTHREAD_TRACER + _task_tracer.StopAndInterruptWorker(); +#else // Interrupt blocking operations. for (size_t i = 0; i < _workers.size(); ++i) { interrupt_pthread(_workers[i]); } +#endif // Join workers for (size_t i = 0; i < _workers.size(); ++i) { pthread_join(_workers[i], NULL); diff --git a/src/bthread/task_control.h b/src/bthread/task_control.h index 125980797a..f5be2ab4c5 100644 --- a/src/bthread/task_control.h +++ b/src/bthread/task_control.h @@ -25,14 +25,16 @@ #ifndef NDEBUG #include // std::ostream #endif +#include #include // size_t #include #include #include #include "butil/atomicops.h" // butil::atomic +#include "butil/resource_pool.h" // ResourcePool #include "bvar/bvar.h" // bvar::PassiveStatus +#include "bthread/tracer.h" #include "bthread/task_meta.h" // TaskMeta -#include "butil/resource_pool.h" // ResourcePool #include "bthread/work_stealing_queue.h" // WorkStealingQueue #include "bthread/parking_lot.h" @@ -43,7 +45,9 @@ class TaskGroup; // Control all task groups class TaskControl { - friend class TaskGroup; +friend class TaskGroup; +// todo 搞了friendaccessor? +friend void wait_for_butex(void*); public: TaskControl(); @@ -86,6 +90,12 @@ class TaskControl { // If this method is called after init(), it never returns NULL. TaskGroup* choose_one_group(bthread_tag_t tag); +#ifdef BRPC_BTHREAD_TRACER + // A stacktrace of bthread can be helpful in debugging. + void stack_trace(std::ostream& os, bthread_t tid); + std::string stack_trace(bthread_t tid); +#endif // BRPC_BTHREAD_TRACER + private: typedef std::array TaggedGroups; static const int PARKING_LOT_NUM = 4; @@ -144,6 +154,11 @@ class TaskControl { std::vector*> _tagged_nbthreads; std::vector _pl; + +#ifdef BRPC_BTHREAD_TRACER + TaskTracer _task_tracer; +#endif // BRPC_BTHREAD_TRACER + }; inline bvar::LatencyRecorder& TaskControl::exposed_pending_time() { diff --git a/src/bthread/task_group.cpp b/src/bthread/task_group.cpp index bab6469001..1165cf63bb 100644 --- a/src/bthread/task_group.cpp +++ b/src/bthread/task_group.cpp @@ -262,6 +262,9 @@ void TaskGroup::task_runner(intptr_t skip_remained) { // NOTE: tls_task_group is volatile since tasks are moved around // different groups. TaskGroup* g = tls_task_group; +#ifdef BRPC_BTHREAD_TRACER + TaskTracer::set_running_status(g->tid(), g->_cur_meta); +#endif // BRPC_BTHREAD_TRACER if (!skip_remained) { while (g->_last_context_remained) { @@ -337,14 +340,28 @@ void TaskGroup::task_runner(intptr_t skip_remained) { // is 0, change it to 1 to make bthread_t never be 0. Any access // or join to the bthread after changing version will be rejected. // The spinlock is for visibility of TaskGroup::get_attr. +#ifdef BRPC_BTHREAD_TRACER + bool tracing = false; +#endif // BRPC_BTHREAD_TRACER { BAIDU_SCOPED_LOCK(m->version_lock); if (0 == ++*m->version_butex) { +#ifdef BRPC_BTHREAD_TRACER + tracing = TaskTracer::set_end_status_unsafe(m); +#endif // BRPC_BTHREAD_TRACER ++*m->version_butex; } } butex_wake_except(m->version_butex, 0); +#ifdef BRPC_BTHREAD_TRACER + if (tracing) { + // Wait for tracing completion. + g->_control->_task_tracer.WaitForTracing(m); + } + g->_control->_task_tracer.set_status(TASK_STATUS_UNKNOWN, m); +#endif // BRPC_BTHREAD_TRACER + g->_control->_nbthreads << -1; g->_control->tag_nbthreads(g->tag()) << -1; g->set_remained(TaskGroup::_release_last_context, m); @@ -406,9 +423,12 @@ int TaskGroup::start_foreground(TaskGroup** pg, TaskGroup* g = *pg; g->_control->_nbthreads << 1; g->_control->tag_nbthreads(g->tag()) << 1; +#ifdef BRPC_BTHREAD_TRACER + g->_control->_task_tracer.set_status(TASK_STATUS_CREATED, m); +#endif // BRPC_BTHREAD_TRACER if (g->is_current_pthread_task()) { // never create foreground task in pthread. - g->ready_to_run(m->tid, (using_attr.flags & BTHREAD_NOSIGNAL)); + g->ready_to_run(m, using_attr.flags & BTHREAD_NOSIGNAL); } else { // NOSIGNAL affects current task, not the new task. RemainedFn fn = NULL; @@ -417,10 +437,7 @@ int TaskGroup::start_foreground(TaskGroup** pg, } else { fn = ready_to_run_in_worker; } - ReadyToRunArgs args = { - g->current_tid(), - (bool)(using_attr.flags & BTHREAD_NOSIGNAL) - }; + ReadyToRunArgs args = { g->_cur_meta, (bool)(using_attr.flags & BTHREAD_NOSIGNAL) }; g->set_remained(fn, &args); TaskGroup::sched_to(pg, m->tid); } @@ -464,10 +481,13 @@ int TaskGroup::start_background(bthread_t* __restrict th, } _control->_nbthreads << 1; _control->tag_nbthreads(tag()) << 1; +#ifdef BRPC_BTHREAD_TRACER + _control->_task_tracer.set_status(TASK_STATUS_CREATED, m); +#endif // BRPC_BTHREAD_TRACER if (REMOTE) { - ready_to_run_remote(m->tid, (using_attr.flags & BTHREAD_NOSIGNAL)); + ready_to_run_remote(m, (using_attr.flags & BTHREAD_NOSIGNAL)); } else { - ready_to_run(m->tid, (using_attr.flags & BTHREAD_NOSIGNAL)); + ready_to_run(m, (using_attr.flags & BTHREAD_NOSIGNAL)); } return 0; } @@ -625,9 +645,16 @@ void TaskGroup::sched_to(TaskGroup** pg, TaskMeta* next_meta) { if (cur_meta->stack != NULL) { if (next_meta->stack != cur_meta->stack) { CheckBthreadScheSafety(); +#ifdef BRPC_BTHREAD_TRACER + g->_control->_task_tracer.set_status(TASK_STATUS_JUMPING, cur_meta); + g->_control->_task_tracer.set_status(TASK_STATUS_JUMPING, next_meta); +#endif // BRPC_BTHREAD_TRACER jump_stack(cur_meta->stack, next_meta->stack); // probably went to another group, need to assign g again. g = BAIDU_GET_VOLATILE_THREAD_LOCAL(tls_task_group); +#ifdef BRPC_BTHREAD_TRACER + TaskTracer::set_running_status(g->tid(), g->_cur_meta); +#endif // BRPC_BTHREAD_TRACER } #ifndef NDEBUG else { @@ -669,8 +696,12 @@ void TaskGroup::destroy_self() { } } -void TaskGroup::ready_to_run(bthread_t tid, bool nosignal) { - push_rq(tid); + +void TaskGroup::ready_to_run(TaskMeta* meta, bool nosignal) { +#ifdef BRPC_BTHREAD_TRACER + _control->_task_tracer.set_status(TASK_STATUS_READY, meta); +#endif // BRPC_BTHREAD_TRACER + push_rq(meta->tid); if (nosignal) { ++_num_nosignal; } else { @@ -690,9 +721,12 @@ void TaskGroup::flush_nosignal_tasks() { } } -void TaskGroup::ready_to_run_remote(bthread_t tid, bool nosignal) { +void TaskGroup::ready_to_run_remote(TaskMeta* meta, bool nosignal) { +#ifdef BRPC_BTHREAD_TRACER + _control->_task_tracer.set_status(TASK_STATUS_READY, meta); +#endif // BRPC_BTHREAD_TRACER _remote_rq._mutex.lock(); - while (!_remote_rq.push_locked(tid)) { + while (!_remote_rq.push_locked(meta->tid)) { flush_nosignal_tasks_remote_locked(_remote_rq._mutex); LOG_EVERY_SECOND(ERROR) << "_remote_rq is full, capacity=" << _remote_rq.capacity(); @@ -723,11 +757,11 @@ void TaskGroup::flush_nosignal_tasks_remote_locked(butil::Mutex& locked_mutex) { _control->signal_task(val, _tag); } -void TaskGroup::ready_to_run_general(bthread_t tid, bool nosignal) { +void TaskGroup::ready_to_run_general(TaskMeta* meta, bool nosignal) { if (tls_task_group == this) { - return ready_to_run(tid, nosignal); + return ready_to_run(meta, nosignal); } - return ready_to_run_remote(tid, nosignal); + return ready_to_run_remote(meta, nosignal); } void TaskGroup::flush_nosignal_tasks_general() { @@ -739,12 +773,16 @@ void TaskGroup::flush_nosignal_tasks_general() { void TaskGroup::ready_to_run_in_worker(void* args_in) { ReadyToRunArgs* args = static_cast(args_in); - return tls_task_group->ready_to_run(args->tid, args->nosignal); + return tls_task_group->ready_to_run(args->meta, args->nosignal); } void TaskGroup::ready_to_run_in_worker_ignoresignal(void* args_in) { ReadyToRunArgs* args = static_cast(args_in); - return tls_task_group->push_rq(args->tid); +#ifdef BRPC_BTHREAD_TRACER + tls_task_group->_control->_task_tracer.set_status( + TASK_STATUS_READY, args->meta); +#endif // BRPC_BTHREAD_TRACER + return tls_task_group->push_rq(args->meta->tid); } struct SleepArgs { @@ -759,7 +797,7 @@ static void ready_to_run_from_timer_thread(void* arg) { const SleepArgs* e = static_cast(arg); auto g = e->group; auto tag = g->tag(); - g->control()->choose_one_group(tag)->ready_to_run_remote(e->tid); + g->control()->choose_one_group(tag)->ready_to_run_remote(e->meta); } void TaskGroup::_add_sleep_event(void* void_args) { @@ -768,6 +806,9 @@ void TaskGroup::_add_sleep_event(void* void_args) { // will be gone. SleepArgs e = *static_cast(void_args); TaskGroup* g = e.group; +#ifdef BRPC_BTHREAD_TRACER + g->_control->_task_tracer.set_status(TASK_STATUS_SUSPENDED, e.meta); +#endif // BRPC_BTHREAD_TRACER TimerThread::TaskId sleep_id; sleep_id = get_global_timer_thread()->schedule( @@ -777,7 +818,7 @@ void TaskGroup::_add_sleep_event(void* void_args) { if (!sleep_id) { e.meta->sleep_failed = true; // Fail to schedule timer, go back to previous thread. - g->ready_to_run(e.tid); + g->ready_to_run(e.meta); return; } @@ -801,7 +842,7 @@ void TaskGroup::_add_sleep_event(void* void_args) { // schedule previous thread as well. If sleep_id does not exist, // previous thread is scheduled by timer thread before and we don't // have to do it again. - g->ready_to_run(e.tid); + g->ready_to_run(e.meta); } } @@ -904,12 +945,12 @@ int TaskGroup::interrupt(bthread_t tid, TaskControl* c, bthread_tag_t tag) { if (get_global_timer_thread()->unschedule(sleep_id) == 0) { bthread::TaskGroup* g = bthread::tls_task_group; if (g) { - g->ready_to_run(tid); + g->ready_to_run(TaskGroup::address_meta(tid)); } else { if (!c) { return EINVAL; } - c->choose_one_group(tag)->ready_to_run_remote(tid); + c->choose_one_group(tag)->ready_to_run_remote(TaskGroup::address_meta(tid)); } } } @@ -918,7 +959,7 @@ int TaskGroup::interrupt(bthread_t tid, TaskControl* c, bthread_tag_t tag) { void TaskGroup::yield(TaskGroup** pg) { TaskGroup* g = *pg; - ReadyToRunArgs args = { g->current_tid(), false }; + ReadyToRunArgs args = { g->_cur_meta, false }; g->set_remained(ready_to_run_in_worker, &args); sched(pg); } diff --git a/src/bthread/task_group.h b/src/bthread/task_group.h index a19bd023f7..d68c0dbb67 100644 --- a/src/bthread/task_group.h +++ b/src/bthread/task_group.h @@ -83,7 +83,7 @@ class TaskGroup { // then being popped by sched(pg), which is not necessary. static void sched_to(TaskGroup** pg, TaskMeta* next_meta); static void sched_to(TaskGroup** pg, bthread_t next_tid); - static void exchange(TaskGroup** pg, bthread_t next_tid); + static void exchange(TaskGroup** pg, TaskMeta* next_meta); // The callback will be run in the beginning of next-run bthread. // Can't be called by current bthread directly because it often needs @@ -151,18 +151,18 @@ class TaskGroup { int64_t cumulated_cputime_ns() const { return _cumulated_cputime_ns; } // Push a bthread into the runqueue - void ready_to_run(bthread_t tid, bool nosignal = false); + void ready_to_run(TaskMeta* meta, bool nosignal = false); // Flush tasks pushed to rq but signalled. void flush_nosignal_tasks(); // Push a bthread into the runqueue from another non-worker thread. - void ready_to_run_remote(bthread_t tid, bool nosignal = false); + void ready_to_run_remote(TaskMeta* meta, bool nosignal = false); void flush_nosignal_tasks_remote_locked(butil::Mutex& locked_mutex); void flush_nosignal_tasks_remote(); // Automatically decide the caller is remote or local, and call // the corresponding function. - void ready_to_run_general(bthread_t tid, bool nosignal = false); + void ready_to_run_general(TaskMeta* meta, bool nosignal = false); void flush_nosignal_tasks_general(); // The TaskControl that this TaskGroup belongs to. @@ -189,6 +189,8 @@ class TaskGroup { bthread_tag_t tag() const { return _tag; } + pid_t tid() const { return _tid; } + private: friend class TaskControl; @@ -197,7 +199,7 @@ friend class TaskControl; int init(size_t runqueue_capacity); - // You shall call destroy_self() instead of destructor because deletion + // You shall call destroy_selfm() instead of destructor because deletion // of groups are postponed to avoid race. ~TaskGroup(); @@ -207,7 +209,7 @@ friend class TaskControl; static void _release_last_context(void*); static void _add_sleep_event(void*); struct ReadyToRunArgs { - bthread_t tid; + TaskMeta* meta; bool nosignal; }; static void ready_to_run_in_worker(void*); @@ -262,6 +264,9 @@ friend class TaskControl; int _sched_recursive_guard; // tag of this taskgroup bthread_tag_t _tag; + + // Worker thread id. + pid_t _tid; }; } // namespace bthread diff --git a/src/bthread/task_group_inl.h b/src/bthread/task_group_inl.h index 45626ceb49..75c377e12f 100644 --- a/src/bthread/task_group_inl.h +++ b/src/bthread/task_group_inl.h @@ -46,17 +46,17 @@ inline TaskMeta* TaskGroup::address_meta(bthread_t tid) { return address_resource(get_slot(tid)); } -inline void TaskGroup::exchange(TaskGroup** pg, bthread_t next_tid) { +inline void TaskGroup::exchange(TaskGroup** pg, TaskMeta* next_meta) { TaskGroup* g = *pg; if (g->is_current_pthread_task()) { - return g->ready_to_run(next_tid); + return g->ready_to_run(next_meta); } - ReadyToRunArgs args = { g->current_tid(), false }; + ReadyToRunArgs args = { g->_cur_meta, false }; g->set_remained((g->current_task()->about_to_quit ? ready_to_run_in_worker_ignoresignal : ready_to_run_in_worker), &args); - TaskGroup::sched_to(pg, next_tid); + TaskGroup::sched_to(pg, next_meta); } inline void TaskGroup::sched_to(TaskGroup** pg, bthread_t next_tid) { diff --git a/src/bthread/task_meta.h b/src/bthread/task_meta.h index 5e5c4541d1..f31e64f339 100644 --- a/src/bthread/task_meta.h +++ b/src/bthread/task_meta.h @@ -27,6 +27,7 @@ #include "butil/atomicops.h" // butil::atomic #include "bthread/types.h" // bthread_attr_t #include "bthread/stack.h" // ContextualStack +#include "bthread/timer_thread.h" namespace bthread { @@ -48,59 +49,90 @@ struct LocalStorage { const static LocalStorage LOCAL_STORAGE_INIT = BTHREAD_LOCAL_STORAGE_INITIALIZER; +// https://stackoverflow.com/a/61860187 +// https://stackoverflow.com/a/57308102 +// https://github.com/WAVM/WAVM/issues/80 +// https://www.nongnu.org/libunwind/man/unw_step(3).html#section_4 +// https://github.com/gperftools/gperftools/wiki/gperftools'-stacktrace-capturing-methods-and-their-issues#libunwind +// https://github.com/alk/unwind_safeness_helper/blob/master/unwind_safeness_helper.c +// https://github.com/gperftools/gperftools/issues/775#issuecomment-207836201 +// https://github.com/libunwind/libunwind/pull/490 +// https://github.com/libunwind/libunwind/issues/580 +// https://maskray.me/blog/2022-04-10-unwinding-through-signal-handler +// https://github.com/scanmem/scanmem/issues/131 +// https://news.ycombinator.com/item?id=16753900 +#ifdef BRPC_BTHREAD_TRACER +enum TaskStatus { + TASK_STATUS_UNKNOWN, + TASK_STATUS_CREATED, + TASK_STATUS_FIRST_READY, + TASK_STATUS_READY, + TASK_STATUS_JUMPING, + TASK_STATUS_RUNNING, + TASK_STATUS_SUSPENDED, + TASK_STATUS_END, +}; +#endif // BRPC_BTHREAD_TRACER + struct TaskMeta { // [Not Reset] - butil::atomic current_waiter; - uint64_t current_sleep; + butil::atomic current_waiter{NULL}; + uint64_t current_sleep{TimerThread::INVALID_TASK_ID}; // A flag to mark if the Timer scheduling failed. - bool sleep_failed; + bool sleep_failed{false}; // A builtin flag to mark if the thread is stopping. - bool stop; + bool stop{false}; // The thread is interrupted and should wake up from some blocking ops. - bool interrupted; + bool interrupted{false}; // Scheduling of the thread can be delayed. - bool about_to_quit; + bool about_to_quit{false}; // [Not Reset] guarantee visibility of version_butex. - pthread_spinlock_t version_lock; + pthread_spinlock_t version_lock{}; // [Not Reset] only modified by one bthread at any time, no need to be atomic - uint32_t* version_butex; + uint32_t* version_butex{NULL}; // The identifier. It does not have to be here, however many code is // simplified if they can get tid from TaskMeta. - bthread_t tid; + bthread_t tid{INVALID_BTHREAD}; // User function and argument - void* (*fn)(void*); - void* arg; + void* (*fn)(void*){NULL}; + void* arg{NULL}; // Stack of this task. - ContextualStack* stack; + ContextualStack* stack{NULL}; // Attributes creating this task - bthread_attr_t attr; + bthread_attr_t attr{BTHREAD_ATTR_NORMAL}; // Statistics - int64_t cpuwide_start_ns; - TaskStatistics stat; + int64_t cpuwide_start_ns{0}; + TaskStatistics stat{}; // bthread local storage, sync with tls_bls (defined in task_group.cpp) // when the bthread is created or destroyed. // DO NOT use this field directly, use tls_bls instead. - LocalStorage local_storage; + LocalStorage local_storage{}; + +#ifdef BRPC_BTHREAD_TRACER + // Bthread status. + TaskStatus status{TASK_STATUS_UNKNOWN}; + // Whether bthread is traced? + bool traced{false}; + // Worker thread id. + pid_t worker_tid{-1}; +#endif // BRPC_BTHREAD_TRACER public: // Only initialize [Not Reset] fields, other fields will be reset in // bthread_start* functions - TaskMeta() - : current_waiter(NULL) - , current_sleep(0) - , stack(NULL) { + TaskMeta() { pthread_spin_init(&version_lock, 0); version_butex = butex_create_checked(); *version_butex = 1; diff --git a/src/bthread/tracer.cpp b/src/bthread/tracer.cpp new file mode 100644 index 0000000000..9280b0a13b --- /dev/null +++ b/src/bthread/tracer.cpp @@ -0,0 +1,468 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifdef BRPC_BTHREAD_TRACER + +#include +#include "butil/debug/stack_trace.h" +#include "bthread/tracer.h" +#include "bthread/task_group.h" +#include "bthread/processor.h" + +namespace bthread { + +DEFINE_bool(enable_fast_unwind, true, "whether to enable fast unwind"); +DEFINE_uint32(max_sigqueue_try, 10, "max try times for sigqueue"); +DEFINE_uint32(signal_trace_timeout_ms, 50, "timeout for signal trace in ms"); + +void TaskTracer::set_status(TaskStatus s, TaskMeta* m) { + CHECK_NE(TASK_STATUS_RUNNING, s) << "Use `set_running_status' instead"; + CHECK_NE(TASK_STATUS_END, s) << "Use `set_end_status_unsafe' instead"; + + bool tracing; + { + BAIDU_SCOPED_LOCK(m->version_lock); + if (TASK_STATUS_UNKNOWN == m->status && TASK_STATUS_JUMPING == s) { + // Do not update status for jumping when bthread is ending. + return; + } + + tracing = m->traced; + // bthread is scheduled for the first time. + if (TASK_STATUS_READY == s || NULL == m->stack) { + m->status = TASK_STATUS_FIRST_READY; + } else { + m->status = s; + } + } + + if (tracing && TASK_STATUS_JUMPING == s) { + WaitForTracing(m); + } +} + +void TaskTracer::set_running_status(pid_t worker_tid, TaskMeta* m) { + BAIDU_SCOPED_LOCK(m->version_lock); + m->worker_tid = worker_tid; + m->status = TASK_STATUS_RUNNING; +} + +bool TaskTracer::set_end_status_unsafe(TaskMeta* m) { + m->status = TASK_STATUS_END; + return m->traced; +} + +std::string TaskTracer::Trace(bthread_t tid) { + Result result = TraceImpl(tid); + return result.error ? result.err_msg : ToString(result); +} + +void TaskTracer::Trace(std::ostream& os, bthread_t tid) { + Result result = TraceImpl(tid); + if (result.error) { + os << result.err_msg; + } else { + ToString(result, os); + } +} + +TaskTracer::Result TaskTracer::TraceImpl(bthread_t tid) { + if (tid == bthread_self()) { + return Result::MakeErrorResult("Can not trace self=%d", tid); + } + + // Only one bthread can be traced at a time. + BAIDU_SCOPED_LOCK(_trace_request_mutex); + + TaskMeta* m = TaskGroup::address_meta(tid); + if (NULL == m) { + return Result::MakeErrorResult("bthread=%d never existed", tid); + } + + BAIDU_SCOPED_LOCK(_mutex); + TaskStatus status; + pid_t worker_tid; + const uint32_t given_version = get_version(tid); + { + BAIDU_SCOPED_LOCK(m->version_lock); + if (given_version == *m->version_butex) { + // Start tracing. + m->traced = true; + worker_tid = m->worker_tid; + status = m->status; + } else { + return Result::MakeErrorResult("bthread=%d not exist now", tid); + } + } + + if (TASK_STATUS_UNKNOWN == status) { + return Result::MakeErrorResult("bthread=%d not exist now", tid); + } else if (TASK_STATUS_CREATED == status) { + return Result::MakeErrorResult("bthread=%d has just been created", tid); + } else if (TASK_STATUS_FIRST_READY == status) { + return Result::MakeErrorResult("bthread=%d is scheduled for the first time", tid); + } else if (TASK_STATUS_END == status) { + return Result::MakeErrorResult("bthread=%d has ended", tid); + } else if (TASK_STATUS_JUMPING == status) { + // Wait for jumping completion. + status = WaitForJumping(m); + } + + // After jumping, the status may be RUNNING, SUSPENDED, or READY, which is traceable. + + Result result{}; + if (TASK_STATUS_RUNNING == status) { + result = SignalTrace(worker_tid); + } else if (TASK_STATUS_SUSPENDED == status || TASK_STATUS_READY == status) { + result = ContextTrace(m->stack->context); + } + + { + BAIDU_SCOPED_LOCK(m->version_lock); + // If m->status is BTHREAD_STATUS_END, the bthread also waits for tracing completion, + // so given_version != *m->version_butex is OK. + m->traced = false; + } + // Wake up the waiting worker thread to jump. + _cond.Signal(); + + return result; +} + +void TaskTracer::SignalSafeUsleep(unsigned int microseconds) { + ErrnoGuard guard; + struct timespec sleep_time{}; + sleep_time.tv_sec = microseconds / 1000000; + sleep_time.tv_nsec = (microseconds % 1000000) * 1000; + // On Linux, sleep() is implemented via nanosleep(2). + // sleep() is async-signal-safety, so nanosleep() is considered as async-signal-safety on Linux. + // abseil-cpp and folly also use nanosleep() in signal handler. For details, see: + // 1. abseil-cpp: https://github.com/abseil/abseil-cpp/blob/27a0c7308f04e4560fabe5a7beca837e8f3f2c5b/absl/debugging/failure_signal_handler.cc#L314 + // 2. folly: https://github.com/facebook/folly/blob/479de0144d3acb6aa4b3483affa23cf4f49f07ee/folly/debugging/symbolizer/SignalHandler.cpp#L446 + while (nanosleep(&sleep_time, &sleep_time) == -1 && EINTR == errno); +} + +bool TaskTracer::RegisterSignalHandler(pid_t tid) { + { + BAIDU_SCOPED_LOCK(_worker_mutex); + _worker_tids.push_back(tid); + } + // Set up the signal handler. + struct sigaction sa{}; + std::memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = SignalHandler; + sa.sa_flags = SA_SIGINFO; + sigfillset(&sa.sa_mask); + + if (sigaction(SIGURG, &sa, nullptr) == -1) { + LOG(ERROR) << "Failed to sigaction"; + return false; + } + + return true; +} + +void TaskTracer::WaitForTracing(TaskMeta* m) { + BAIDU_SCOPED_LOCK(_mutex); + while (m->traced) { + _cond.Wait(); + } +} + +void TaskTracer::StopAndInterruptWorker() { + if (_stop.exchange(true, butil::memory_order_relaxed)) { + return; + } + + BAIDU_SCOPED_LOCK(_worker_mutex); + for (auto worker_tid : _worker_tids) { + union sigval value{}; + value.sival_ptr = this; + while (sigqueue(worker_tid, SIGURG, value) != 0) { + PLOG(ERROR) << "Fail to sigqueue"; + } + } + _worker_tids.clear(); +} + +TaskStatus TaskTracer::WaitForJumping(TaskMeta* m) { + // Reasons for not using locks here: + // 1. It is necessary to lock before jump_stack, unlock after jump_stack, + // which involves two different bthread and is prone to errors. + // 2. jump_stack is fast. + int i = 0; + do { + // The bthread is jumping now, spin until it finishes. + if (i++ < 30) { + cpu_relax(); + } else { + sched_yield(); + } + + BAIDU_SCOPED_LOCK(m->version_lock); + if (TASK_STATUS_JUMPING != m->status) { + return m->status; + } + } while (true); +} + +TaskTracer::Result TaskTracer::ContextTrace(bthread_fcontext_t fcontext) { + unw_cursor_t cursor = MakeCursor(fcontext); + return TraceCore(cursor); +} + +void TaskTracer::SignalHandler(int, siginfo_t* info, void* context) { + ErrnoGuard guard; + static_cast(info->si_value.sival_ptr) + ->SignalTraceHandler(static_cast(context)); +} + +// Caution: This function is called in signal handler, so it should be async-signal-safety. +void TaskTracer::SignalTraceHandler(unw_context_t* context) { + if (_stop.load(butil::memory_order_relaxed)) { + // Only wakeup the worker thread. + return; + } + + // The signal is not from BthreadTracer, ignore. + if (SIGNAL_TRACE_STATUS_START != _signal_handler_flag.load(butil::memory_order_acquire)) { + return; + } + + _signal_context = context; + _signal_handler_flag.store(SIGNAL_TRACE_STATUS_TRACING, butil::memory_order_seq_cst); + butil::Timer timer(butil::Timer::STARTED); + while (SIGNAL_TRACE_STATUS_TRACING == + _signal_handler_flag.load(butil::memory_order_seq_cst)) { + SignalSafeUsleep(50); // 50us + // Timeout to avoid deadlock of libunwind. + timer.stop(); + if (timer.m_elapsed() > FLAGS_signal_trace_timeout_ms) { + break; + } + } +} + +TaskTracer::Result TaskTracer::SignalTrace(pid_t tid) { + _signal_context = NULL; + // Use memory_order_seq_cst to ensure the flag is set before sending signal. + _signal_handler_flag.store(SIGNAL_TRACE_STATUS_START, butil::memory_order_seq_cst); + + // CAUTION: + // The signal handler will wait for the backtrace to complete. + // If the worker thread is interrupted when holding a resource(lock, etc), + // and this function waits for the resource during capturing backtraces, + // it may cause a deadlock. + // + // https://github.com/gperftools/gperftools/wiki/gperftools'-stacktrace-capturing-methods-and-their-issues#libunwind + // Generally, libunwind promises async-signal-safety for capturing backtraces. + // But in practice, it is only partially async-signal-safe due to reliance on + // dl_iterate_phdr API, which is used to enumerate all loaded ELF modules + // (.so files and main executable binary). No libc offers dl_iterate_pdhr that + // is async-signal-safe. In practice, the issue may happen if we take tracing + // signal during an existing dl_iterate_phdr call (such as when the program + // throws an exception) or during dlopen/dlclose-ing some .so module. + // Deadlock call stack: + // #0 __lll_lock_wait (futex=futex@entry=0x7f0d3d7f0990 <_rtld_global+2352>, private=0) at lowlevellock.c:52 + // #1 0x00007f0d3a73c131 in __GI___pthread_mutex_lock (mutex=0x7f0d3d7f0990 <_rtld_global+2352>) at ../nptl/pthread_mutex_lock.c:115 + // #2 0x00007f0d38eb0231 in __GI___dl_iterate_phdr (callback=callback@entry=0x7f0d38c456a0 <_ULx86_64_dwarf_callback>, data=data@entry=0x7f0d07defad0) at dl-iteratephdr.c:40 + // #3 0x00007f0d38c45d79 in _ULx86_64_dwarf_find_proc_info (as=0x7f0d38c4f340 , ip=ip@entry=139694791966897, pi=pi@entry=0x7f0d07df0498, need_unwind_info=need_unwind_info@entry=1, arg=0x7f0 + // d07df0340) at dwarf/Gfind_proc_info-lsb.c:759 + // #4 0x00007f0d38c43260 in fetch_proc_info (c=c@entry=0x7f0d07df0340, ip=139694791966897) at dwarf/Gparser.c:461 + // #5 0x00007f0d38c44e46 in find_reg_state (sr=0x7f0d07defd10, c=0x7f0d07df0340) at dwarf/Gparser.c:925 + // #6 _ULx86_64_dwarf_step (c=c@entry=0x7f0d07df0340) at dwarf/Gparser.c:972 + // #7 0x00007f0d38c40c14 in _ULx86_64_step (cursor=cursor@entry=0x7f0d07df0340) at x86_64/Gstep.c:71 + // #8 0x00007f0d399ed8f6 in GetStackTraceWithContext_libunwind (result=, max_depth=63, skip_count=132054887, ucp=) at src/stacktrace_libunwind-inl.h:138 + // #9 0x00007f0d399ee083 in GetStackTraceWithContext (result=0x7f0d07df07b8, max_depth=63, skip_count=3, uc=0x7f0d07df0a40) at src/stacktrace.cc:305 + // #10 0x00007f0d399ea992 in CpuProfiler::prof_handler (signal_ucontext=, cpu_profiler=0x7f0d399f6600, sig=) at src/profiler.cc:359 + // #11 0x00007f0d399eb633 in ProfileHandler::SignalHandler (sig=27, sinfo=0x7f0d07df0b70, ucontext=0x7f0d07df0a40) at src/profile-handler.cc:530 + // #12 + // #13 0x00007f0d3a73c0b1 in __GI___pthread_mutex_lock (mutex=0x7f0d3d7f0990 <_rtld_global+2352>) at ../nptl/pthread_mutex_lock.c:115 + // #14 0x00007f0d38eb0231 in __GI___dl_iterate_phdr (callback=0x7f0d38f525f0, data=0x7f0d07df10c0) at dl-iteratephdr.c:40 + // #15 0x00007f0d38f536c1 in _Unwind_Find_FDE () from /lib/x86_64-linux-gnu/libgcc_s.so.1 + // #16 0x00007f0d38f4f868 in ?? () from /lib/x86_64-linux-gnu/libgcc_s.so.1 + // #17 0x00007f0d38f50a20 in ?? () from /lib/x86_64-linux-gnu/libgcc_s.so.1 + // #18 0x00007f0d38f50f99 in _Unwind_RaiseException () from /lib/x86_64-linux-gnu/libgcc_s.so.1 + // #19 0x00007f0d390088dc in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6 + // #20 0x00007f0d3b5b2245 in __cxxabiv1::__cxa_throw (thrownException=0x7f0d114ea8c0, type=0x7f0d3d6dd830 , destructor=) at /src/folly/folly/experimental/exception_tracer/ExceptionTracerLib.cpp:107 + // + // Therefore, we do not capture backtracks in the signal handler to avoid mutex + // reentry and deadlock. Instead, we capture backtracks in this function and + // ends the signal handler after capturing backtraces is complete. + // Even so, there is still a deadlock problem: + // the worker thread is interrupted when during an existing dl_iterate_phdr call, + // and wait for the capturing backtraces to complete. This function capture + // backtracks with dl_iterate_phdr. We introduce a timeout mechanism in signal + // handler to avoid deadlock. + + union sigval value{}; + value.sival_ptr = this; + size_t sigqueue_try = 0; + while (sigqueue(tid, SIGURG, value) != 0) { + if (errno != EAGAIN || sigqueue_try++ >= FLAGS_max_sigqueue_try) { + return Result::MakeErrorResult("Fail to sigqueue: %s", berror()); + } + } + + butil::Timer timer(butil::Timer::STARTED); + // Use memory_order_seq_cst to ensure the signal is sent and the flag is set before checking. + for (int i = 0; + SIGNAL_TRACE_STATUS_START == _signal_handler_flag.load(butil::memory_order_seq_cst); + ++i) { + if (i < 30) { + sched_yield(); + } else { + SignalSafeUsleep(5); // 5us + } + + // Timeout to avoid dead loop if SIGURG is covered. + timer.stop(); + if (timer.m_elapsed() > FLAGS_signal_trace_timeout_ms) { + return Result::MakeErrorResult( + "Timeout exceed %dms", FLAGS_signal_trace_timeout_ms); + + } + } + + unw_cursor_t cursor; + int rc = unw_init_local(&cursor, _signal_context); + if (0 != rc) { + return Result::MakeErrorResult("Failed to init local, rc=%d", rc); + } + + Result result = TraceCore(cursor); + + // Use memory_order_seq_cst to ensure the flag is set after tracing. + _signal_handler_flag.store(SIGNAL_TRACE_STATUS_UNKNOWN, butil::memory_order_seq_cst); + + return result; +} + +unw_cursor_t TaskTracer::MakeCursor(bthread_fcontext_t fcontext) { + unw_cursor_t cursor; + unw_init_local(&cursor, &_context); + auto regs = reinterpret_cast(fcontext); + + // Only need RBP, RIP, RSP. + // The base pointer (RBP). + if (unw_set_reg(&cursor, UNW_X86_64_RBP, regs[6]) != 0) { + LOG(ERROR) << "Fail to set RBP"; + } + // The instruction pointer (RIP). + if (unw_set_reg(&cursor, UNW_REG_IP, regs[7]) != 0) { + LOG(ERROR) << "Fail to set RIP"; + } +#if UNW_VERSION_MAJOR >= 1 && UNW_VERSION_MINOR >= 7 + // The stack pointer (RSP). + if (unw_set_reg(&cursor, UNW_REG_SP, regs[8]) != 0) { + LOG(ERROR) << "Fail to set RSP"; + } +#endif + + return cursor; +} + +TaskTracer::Result TaskTracer::TraceCore(unw_cursor_t& cursor) { + Result result{}; + result.fast_unwind = FLAGS_enable_fast_unwind; + for (result.frame_count = 0; result.frame_count < arraysize(result.ips); ++result.frame_count) { + int rc = unw_step(&cursor); + if (0 == rc) { + break; + } else if (rc < 0) { + return Result::MakeErrorResult("unw_step rc=%d", rc); + } + + unw_word_t ip = 0; + // Fast unwind do not care about the return value. + rc = unw_get_reg(&cursor, UNW_REG_IP, &ip); + result.ips[result.frame_count] = ip; + + if (result.fast_unwind) { + continue; + } + + if (0 != rc) { + butil::strings::SafeSPrintf(result.mangled[result.frame_count], "\0"); + continue; + } + + rc = unw_get_proc_name( + &cursor, result.mangled[result.frame_count], sizeof(result.mangled[result.frame_count]), NULL); + if (0 != rc && UNW_ENOMEM != rc) { + butil::strings::SafeSPrintf(result.mangled[result.frame_count], "\0"); + } + } + + return result; +} + +std::string TaskTracer::ToString(const Result& result) { + if (result.frame_count == 0) { + return "No frame"; + } + + if (result.fast_unwind) { + butil::debug::StackTrace stack_trace((void**)&result.ips, result.frame_count); + return stack_trace.ToString(); + } + + std::string result_str; + result_str.reserve(2048); + for (size_t i = 0; i < result.frame_count; ++i) { + butil::string_appendf(&result_str, "#%zu 0x%016lx ", i, result.ips[i]); + if (strlen(result.mangled[i]) == 0) { + result_str.append(""); + } else { + result_str.append(butil::demangle(result.mangled[i])); + } + if (i + 1 < result.frame_count) { + result_str.push_back('\n'); + } + } + return result_str; +} + +void TaskTracer::ToString(const Result& result, std::ostream& out_os) { + if (result.frame_count == 0) { + out_os << "No frame"; + return; + } + + if (result.fast_unwind) { + butil::debug::StackTrace stack_trace((void**)&result.ips, result.frame_count); + stack_trace.OutputToStream(&out_os); + return; + } + + for (size_t i = 0; i < result.frame_count; ++i) { + out_os << "# " << i << " 0x" << std::hex << result.ips[i] << std::dec << " "; + if (strlen(result.mangled[i]) == 0) { + out_os << ""; + } else { + out_os << butil::demangle(result.mangled[i]); + } + if (i + 1 < result.frame_count) { + out_os << '\n'; + } + } +} + +} // namespace bthread + +#endif // BRPC_BTHREAD_TRACER + diff --git a/src/bthread/tracer.h b/src/bthread/tracer.h new file mode 100644 index 0000000000..ebcdccec0a --- /dev/null +++ b/src/bthread/tracer.h @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef BRPC_BTHREAD_TRACER_H +#define BRPC_BTHREAD_TRACER_H + +#ifdef BRPC_BTHREAD_TRACER + + +#include +#include +#include "butil/strings/safe_sprintf.h" +#include "butil/synchronization/condition_variable.h" +#include "bthread/task_meta.h" +#include "bthread/mutex.h" + +namespace bthread { + +// Tracer for bthread. +class TaskTracer { +public: + // Set the status to `s'. + void set_status(TaskStatus s, TaskMeta* meta); + static void set_running_status(pid_t worker_tid, TaskMeta* meta); + static bool set_end_status_unsafe(TaskMeta* m); + + // Async signal safe usleep. + static void SignalSafeUsleep(unsigned int microseconds); + // Register signal handler for signal trace. + bool RegisterSignalHandler(pid_t tid); + + // Trace the bthread of `tid'. + std::string Trace(bthread_t tid); + void Trace(std::ostream& os, bthread_t tid); + + // When the worker is jumping stack from a bthread to another, + void WaitForTracing(TaskMeta* m); + + // Stop and interrupt the worker thread. + void StopAndInterruptWorker(); + +private: + // Error number guard used in signal handler. + class ErrnoGuard { + public: + ErrnoGuard() : _errno(errno) {} + ~ErrnoGuard() { errno = _errno; } + private: + int _errno; + }; + + enum SignalTraceStatus { + SIGNAL_TRACE_STATUS_UNKNOWN = 0, + SIGNAL_TRACE_STATUS_START, + SIGNAL_TRACE_STATUS_TRACING, + }; + + struct Result { + template + static Result MakeErrorResult(const char* fmt, Args... args) { + Result result{}; + result.error = true; + butil::strings::SafeSPrintf(result.err_msg, fmt, args...); + return result; + } + + static const size_t MAX_TRACE_NUM = 64; + unw_word_t ips[MAX_TRACE_NUM]; + char mangled[MAX_TRACE_NUM][256]{}; + size_t frame_count{0}; + bool error{false}; + char err_msg[256]{}; + + bool fast_unwind{false}; + }; + + Result TraceImpl(bthread_t tid); + + static TaskStatus WaitForJumping(TaskMeta* m); + + Result ContextTrace(bthread_fcontext_t fcontext); + + static void SignalHandler(int sig, siginfo_t* info, void* context); + void SignalTraceHandler(unw_context_t* context); + Result SignalTrace(pid_t worker_tid); + + unw_cursor_t MakeCursor(bthread_fcontext_t fcontext); + static Result TraceCore(unw_cursor_t& cursor); + + static std::string ToString(const Result& result); + static void ToString(const Result& result, std::ostream& out_os); + + bthread::Mutex _trace_request_mutex; + + butil::Mutex _mutex; + butil::ConditionVariable _cond{&_mutex}; + + // For context trace. + unw_context_t _context{}; + // For signal trace. + unw_context_t* _signal_context{NULL}; + butil::atomic _signal_handler_flag{SIGNAL_TRACE_STATUS_UNKNOWN}; + + butil::atomic _stop{false}; + butil::Mutex _worker_mutex; + std::vector _worker_tids; +}; + +} // namespace bthread + +#endif // BRPC_BTHREAD_TRACER + +#endif // BRPC_BTHREAD_TRACER_H diff --git a/src/butil/debug/stack_trace.cc b/src/butil/debug/stack_trace.cc index 38abede980..97a4cd7683 100644 --- a/src/butil/debug/stack_trace.cc +++ b/src/butil/debug/stack_trace.cc @@ -29,17 +29,18 @@ const void *const *StackTrace::Addresses(size_t* count) const { } size_t StackTrace::CopyAddressTo(void** buffer, size_t max_nframes) const { - size_t nframes = std::min(count_, max_nframes); - memcpy(buffer, trace_, nframes * sizeof(void*)); - return nframes; + size_t nframes = std::min(count_, max_nframes); + memcpy(buffer, trace_, nframes * sizeof(void*)); + return nframes; } std::string StackTrace::ToString() const { - std::stringstream stream; + std::string str; + str.reserve(1024); #if !defined(__UCLIBC__) - OutputToStream(&stream); + OutputToString(str); #endif - return stream.str(); + return str; } } // namespace debug diff --git a/src/butil/debug/stack_trace.h b/src/butil/debug/stack_trace.h index 5c6545add4..e812058321 100644 --- a/src/butil/debug/stack_trace.h +++ b/src/butil/debug/stack_trace.h @@ -79,6 +79,7 @@ class BUTIL_EXPORT StackTrace { #if !defined(__UCLIBC__) // Resolves backtrace to symbols and write to stream. void OutputToStream(std::ostream* os) const; + void OutputToString(std::string& str) const; #endif // Resolves backtrace to symbols and returns as string. diff --git a/src/butil/debug/stack_trace_posix.cc b/src/butil/debug/stack_trace_posix.cc index 878f94a7c8..9ef91c2f0b 100644 --- a/src/butil/debug/stack_trace_posix.cc +++ b/src/butil/debug/stack_trace_posix.cc @@ -429,6 +429,23 @@ class StreamBacktraceOutputHandler : public BacktraceOutputHandler { DISALLOW_COPY_AND_ASSIGN(StreamBacktraceOutputHandler); }; +class StringBacktraceOutputHandler : public BacktraceOutputHandler { +public: + explicit StringBacktraceOutputHandler(std::string& str) : _str(str) {} + + DISALLOW_COPY_AND_ASSIGN(StringBacktraceOutputHandler); + + void HandleOutput(const char* output) OVERRIDE { + if (NULL == output) { + return; + } + _str.append(output); + } + +private: + std::string& _str; +}; + void WarmUpBacktrace() { // Warm up stack trace infrastructure. It turns out that on the first // call glibc initializes some internal data structures using pthread_once, @@ -801,6 +818,11 @@ void StackTrace::OutputToStream(std::ostream* os) const { StreamBacktraceOutputHandler handler(os); ProcessBacktrace(trace_, count_, &handler); } + +void StackTrace::OutputToString(std::string& str) const { + StringBacktraceOutputHandler handler(str); + ProcessBacktrace(trace_, count_, &handler); +} #endif namespace internal { diff --git a/test/brpc_builtin_service_unittest.cpp b/test/brpc_builtin_service_unittest.cpp index 178c3f9e5a..5b21b8d7e4 100644 --- a/test/brpc_builtin_service_unittest.cpp +++ b/test/brpc_builtin_service_unittest.cpp @@ -113,7 +113,7 @@ void MyVLogSite() { void CheckContent(const brpc::Controller& cntl, const char* name) { const std::string& content = cntl.response_attachment().to_string(); std::size_t pos = content.find(name); - ASSERT_TRUE(pos != std::string::npos) << "name=" << name; + ASSERT_TRUE(pos != std::string::npos) << "response_attachment=" << content << "\nname=" << name; } void CheckErrorText(const brpc::Controller& cntl, const char* error) { @@ -837,6 +837,15 @@ void* dummy_bthread(void*) { return NULL; } + +bool g_bthread_trace_stop = false; +void* bthread_trace(void*) { + while (!g_bthread_trace_stop) { + bthread_usleep(1000 * 100); + } + return NULL; +} + TEST_F(BuiltinServiceTest, bthreads) { brpc::BthreadsService service; brpc::BthreadsRequest req; @@ -867,7 +876,23 @@ TEST_F(BuiltinServiceTest, bthreads) { service.default_method(&cntl, &req, &res, &done); EXPECT_FALSE(cntl.Failed()); CheckContent(cntl, "stop=0"); - } + } + + { + bthread_t th; + EXPECT_EQ(0, bthread_start_background(&th, NULL, bthread_trace, NULL)); + ClosureChecker done; + brpc::Controller cntl; + std::string id_string; + butil::string_printf(&id_string, "%llu?st=1", (unsigned long long)th); + cntl.http_request().uri().SetHttpURL("/bthreads/" + id_string); + cntl.http_request()._unresolved_path = id_string; + service.default_method(&cntl, &req, &res, &done); + g_bthread_trace_stop = true; + EXPECT_FALSE(cntl.Failed()); + CheckContent(cntl, "stop=0"); + CheckContent(cntl, "bthread_trace"); + } } TEST_F(BuiltinServiceTest, sockets) { diff --git a/test/brpc_event_dispatcher_unittest.cpp b/test/brpc_event_dispatcher_unittest.cpp index 185e9f2dc8..f3d3a7c98a 100644 --- a/test/brpc_event_dispatcher_unittest.cpp +++ b/test/brpc_event_dispatcher_unittest.cpp @@ -446,8 +446,7 @@ friend class brpc::IOEvent; } void BeforeRecycled() { - brpc::GetGlobalEventDispatcher(_pipe_fds[0], bthread_self_tag()) - .RemoveConsumer(_pipe_fds[0]); + _io_event.RemoveConsumer(_pipe_fds[0]); _io_event.Reset(); if (_pipe_fds[0] >= 0) { close(_pipe_fds[0]); diff --git a/test/bthread_unittest.cpp b/test/bthread_unittest.cpp index 5ed8aba897..ac5076fe90 100644 --- a/test/bthread_unittest.cpp +++ b/test/bthread_unittest.cpp @@ -20,14 +20,21 @@ #include "butil/time.h" #include "butil/macros.h" #include "butil/logging.h" -#include "butil/logging.h" #include "butil/gperftools_profiler.h" #include "bthread/bthread.h" #include "bthread/unstable.h" #include "bthread/task_meta.h" +int main(int argc, char* argv[]) { + testing::InitGoogleTest(&argc, argv); + GFLAGS_NS::ParseCommandLineFlags(&argc, &argv, true); + int rc = RUN_ALL_TESTS(); + return rc; +} + namespace bthread { - extern __thread bthread::LocalStorage tls_bls; +extern __thread bthread::LocalStorage tls_bls; +extern std::string stack_trace(bthread_t tid); } namespace { @@ -117,6 +124,13 @@ void* just_exit(void* arg) { return NULL; } +void* pthread_just_exit(void*) { + LOG(INFO) << "pthread_just_exit"; + pthread_exit(NULL); + EXPECT_TRUE(false) << "pthread_just_exit should never be here"; + return NULL; +} + void* repeated_sleep(void* arg) { for (size_t i = 0; !stop; ++i) { LOG(INFO) << "repeated_sleep(" << arg << ") i=" << i; @@ -437,7 +451,11 @@ TEST_F(BthreadTest, bthread_exit) { ASSERT_EQ(0, bthread_start_urgent(&th1, NULL, just_exit, NULL)); ASSERT_EQ(0, bthread_start_background(&th2, NULL, just_exit, NULL)); +// #ifndef BRPC_BTHREAD_TRACER ASSERT_EQ(0, pthread_create(&th3, NULL, just_exit, NULL)); +// #else +// ASSERT_EQ(0, pthread_create(&th3, NULL, pthread_just_exit, NULL)); +// #endif // BRPC_BTHREAD_TRACER EXPECT_EQ(0, bthread_start_urgent(&th4, &attr, just_exit, NULL)); EXPECT_EQ(0, bthread_start_background(&th5, &attr, just_exit, NULL)); @@ -608,4 +626,24 @@ TEST_F(BthreadTest, yield_single_thread) { ASSERT_EQ(0, bthread_join(tid, NULL)); } +#ifdef BRPC_BTHREAD_TRACER +TEST_F(BthreadTest, trace) { + stop = false; + bthread_t th1; + ASSERT_EQ(0, bthread_start_urgent(&th1, NULL, spin_and_log, (void*)1)); + usleep(100 * 1000); + LOG(INFO) << "spin_and_log stack trace:\n" << bthread::stack_trace(th1); + stop = true; + ASSERT_EQ(0, bthread_join(th1, NULL)); + + stop = false; + ASSERT_EQ(0, bthread_start_urgent(&th1, NULL, repeated_sleep, (void*)1)); + usleep(100 * 1000); + LOG(INFO) << "repeated_sleep stack trace:\n" << bthread::stack_trace(th1); + stop = true; + ASSERT_EQ(0, bthread_join(th1, NULL)); +} +#endif // BRPC_BTHREAD_TRACER + + } // namespace diff --git a/test/run_tests.sh b/test/run_tests.sh index 142977975a..d003bac020 100755 --- a/test/run_tests.sh +++ b/test/run_tests.sh @@ -22,11 +22,11 @@ rm core.* test_num=0 failed_test="" rc=0 -test_bins="test_butil test_bvar bthread*unittest brpc*unittest" +test_bins="bthread_unittest" for test_bin in $test_bins; do test_num=$((test_num + 1)) >&2 echo "[runtest] $test_bin" - ./$test_bin + ./$test_bin --gtest_filter=BthreadTest.bthread_exit rc=$? if [ $rc -ne 0 ]; then failed_test="$test_bin"