From 9facc409e321bf8ebc822593b963f4aa81cc2853 Mon Sep 17 00:00:00 2001 From: Jack Bond-Preston Date: Fri, 10 Oct 2025 12:04:16 +0000 Subject: [PATCH 1/4] Add third-party spdlog (v1.15.3) as a submodule Also modifies Github CI config to ensure the checkout action also checks out the submodule. --- .github/workflows/build-linux.yml | 2 ++ .github/workflows/build-windows.yml | 2 ++ .github/workflows/super-linter.yml | 1 + .gitmodules | 3 +++ README.md | 17 +++++++++++++++++ third_party/spdlog | 1 + 6 files changed, 26 insertions(+) create mode 100644 .gitmodules create mode 160000 third_party/spdlog diff --git a/.github/workflows/build-linux.yml b/.github/workflows/build-linux.yml index e2d575c59..6b0272eb1 100644 --- a/.github/workflows/build-linux.yml +++ b/.github/workflows/build-linux.yml @@ -23,6 +23,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v4 + with: + submodules: true - name: Setup environment run: | sudo apt-get install -y libssl-dev diff --git a/.github/workflows/build-windows.yml b/.github/workflows/build-windows.yml index 310423b82..9839aa772 100644 --- a/.github/workflows/build-windows.yml +++ b/.github/workflows/build-windows.yml @@ -24,6 +24,8 @@ jobs: uses: microsoft/setup-msbuild@v2 - name: Checkout code uses: actions/checkout@v4 + with: + submodules: true - name: Set up Python 3.10 uses: actions/setup-python@v5 with: diff --git a/.github/workflows/super-linter.yml b/.github/workflows/super-linter.yml index 19ef6a3f9..2ce018664 100644 --- a/.github/workflows/super-linter.yml +++ b/.github/workflows/super-linter.yml @@ -24,6 +24,7 @@ jobs: # Full git history is needed to get a proper list of changed files # within `super-linter` fetch-depth: 0 + submodules: true - name: Super-linter uses: super-linter/super-linter@v7.3.0 # x-release-please-version env: diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..eab6041af --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "third_party/spdlog"] + path = third_party/spdlog + url = https://github.com/gabime/spdlog.git diff --git a/README.md b/README.md index d9d29ace7..63b6c5538 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,23 @@ Optional dependencies are: Please refer to [docs/](docs/) for detailed documentation. +## Cloning + +Gloo makes use of [Git submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules) +for third-party build-time dependencies. When cloning, ensure that you +initialize and sync submodules: + +``` shell +git clone --recurse-submodules https://github.com/pytorch/gloo.git # or use SSH remote +``` + +Or, if you already cloned without submodules, run the following in the existing +local repository: + +``` shell +git submodule update --init --recursive +``` + ## Building You can build Gloo using CMake. diff --git a/third_party/spdlog b/third_party/spdlog new file mode 160000 index 000000000..6fa36017c --- /dev/null +++ b/third_party/spdlog @@ -0,0 +1 @@ +Subproject commit 6fa36017cfd5731d617e1a934f0e5ea9c4445b13 From 2122920047dc04bab8c09a70f998303b228b7527 Mon Sep 17 00:00:00 2001 From: Jack Bond-Preston Date: Fri, 10 Oct 2025 22:25:46 +0000 Subject: [PATCH 2/4] build: Add spdlog build dependency spdlog is included and built from source, using the third_party/spdlog submodule. When Gloo is built as a static library, spdlog is used in its header-only library form. When Gloo is built as a shared library, spdlog is built statically and included. A key goal is to make spdlog a dependency internal to Gloo only. Gloo users should not have to install spdlog themselves. This commit does not add any usages of spdlog in Gloo, just the build infrastructure to use it. --- CMakeLists.txt | 3 +++ cmake/Dependencies.cmake | 21 ++++++++++++++++++++- gloo/CMakeLists.txt | 5 ++++- gloo/benchmark/CMakeLists.txt | 2 +- gloo/examples/CMakeLists.txt | 8 ++++---- gloo/test/CMakeLists.txt | 6 +++--- 6 files changed, 35 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index caf3bf388..487e2857f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,6 +98,9 @@ endif() set(GLOO_INSTALL ON CACHE BOOL "") mark_as_advanced(GLOO_INSTALL) +# Compile definitions for Gloo +set(GLOO_COMPILE_DEFS) + # Build shared or static libraries (override from parent project) if(BUILD_SHARED_LIBS) set(GLOO_STATIC_OR_SHARED SHARED CACHE STRING "") diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 63873fc64..7ca837a8d 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -2,6 +2,9 @@ set(gloo_DEPENDENCY_LIBS "") set(gloo_cuda_DEPENDENCY_LIBS "") set(gloo_hip_DEPENDENCY_LIBS "") +# Dependency libraries for all Gloo targets (e.g. tests, benchmarks, etc.) +set(gloo_ALL_TARGETS_DEPENDENCY_LIBS "") + # Configure path to modules (for find_package) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PROJECT_SOURCE_DIR}/cmake/Modules/") @@ -104,7 +107,7 @@ if(USE_MPI) message(STATUS "MPI libraries: " ${MPI_CXX_LIBRARIES}) include_directories(SYSTEM ${MPI_CXX_INCLUDE_PATH}) list(APPEND gloo_DEPENDENCY_LIBS ${MPI_CXX_LIBRARIES}) - add_definitions(-DGLOO_USE_MPI=1) + list(APPEND GLOO_COMPILE_DEFS "GLOO_USE_MPI=1") else() message(WARNING "Not compiling with MPI support. Suppress this warning with -DUSE_MPI=OFF.") set(USE_MPI OFF) @@ -199,3 +202,19 @@ if(BUILD_TEST) target_link_libraries(gtest INTERFACE ${GTEST_LIBRARIES}) endif() endif() + +set(SPDLOG_BUILD_SHARED OFF CACHE BOOL "" FORCE) +set(SPDLOG_BUILD_PIC ON CACHE BOOL "" FORCE) +set(SPDLOG_FMT_EXTERNAL OFF CACHE BOOL "" FORCE) + +set(_save_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) +set(BUILD_SHARED_LIBS OFF) +add_subdirectory(third_party/spdlog EXCLUDE_FROM_ALL) +set(BUILD_SHARED_LIBS ${_save_BUILD_SHARED_LIBS}) + +if(GLOO_STATIC_OR_SHARED STREQUAL "STATIC") + list(APPEND gloo_ALL_TARGETS_DEPENDENCY_LIBS $) +else() + list(APPEND gloo_ALL_TARGETS_DEPENDENCY_LIBS spdlog::spdlog) + list(APPEND GLOO_COMPILE_DEFS SPDLOG_COMPILED_LIB) +endif() diff --git a/gloo/CMakeLists.txt b/gloo/CMakeLists.txt index 186fe1288..11dd34b79 100644 --- a/gloo/CMakeLists.txt +++ b/gloo/CMakeLists.txt @@ -147,10 +147,12 @@ if(USE_CUDA) cuda_add_library(gloo_cuda ${GLOO_CUDA_SRCS} ${GLOO_STATIC_OR_SHARED}) endif() target_link_libraries(gloo_cuda gloo ${gloo_cuda_DEPENDENCY_LIBS}) + set_property(TARGET gloo_cuda APPEND PROPERTY LINK_LIBRARIES ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) endif() if(USE_ROCM) gloo_hip_add_library(gloo_hip ${GLOO_HIP_SRCS}) target_link_libraries(gloo_hip gloo) + set_property(TARGET gloo_hip APPEND PROPERTY LINK_LIBRARIES ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) endif() if(USE_LIBUV) target_link_libraries(gloo PRIVATE uv_a) @@ -165,7 +167,8 @@ elseif(USE_TCP_OPENSSL_LOAD) target_link_libraries(gloo PRIVATE dl) endif() -target_link_libraries(gloo PRIVATE ${gloo_DEPENDENCY_LIBS}) +target_link_libraries(gloo PRIVATE ${gloo_DEPENDENCY_LIBS} ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) +target_compile_definitions(gloo PRIVATE ${GLOO_COMPILE_DEFS}) # Add Interface include directories that are relocatable. target_include_directories(gloo INTERFACE $) diff --git a/gloo/benchmark/CMakeLists.txt b/gloo/benchmark/CMakeLists.txt index 74da2c04b..2c8069131 100644 --- a/gloo/benchmark/CMakeLists.txt +++ b/gloo/benchmark/CMakeLists.txt @@ -5,7 +5,7 @@ set(GLOO_BENCHMARK_SRCS ) add_executable(benchmark ${GLOO_BENCHMARK_SRCS}) -target_link_libraries(benchmark gloo) +target_link_libraries(benchmark gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) if(GLOO_INSTALL) install(TARGETS benchmark DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) diff --git a/gloo/examples/CMakeLists.txt b/gloo/examples/CMakeLists.txt index b5f87e85c..a5fba49cf 100644 --- a/gloo/examples/CMakeLists.txt +++ b/gloo/examples/CMakeLists.txt @@ -1,8 +1,8 @@ if(MSVC) add_executable(example_reduce example_reduce.cc) - target_link_libraries(example_reduce gloo) + target_link_libraries(example_reduce gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) add_executable(example_allreduce example_allreduce.cc) - target_link_libraries(example_allreduce gloo) + target_link_libraries(example_allreduce gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) if(USE_LIBUV) add_custom_command(TARGET example_reduce POST_BUILD @@ -12,7 +12,7 @@ if(MSVC) endif() else() add_executable(example1 example1.cc) - target_link_libraries(example1 gloo) + target_link_libraries(example1 gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) add_executable(looks_like_mpi looks_like_mpi.cc) - target_link_libraries(looks_like_mpi gloo) + target_link_libraries(looks_like_mpi gloo ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) endif() diff --git a/gloo/test/CMakeLists.txt b/gloo/test/CMakeLists.txt index d16af64e9..e97738575 100644 --- a/gloo/test/CMakeLists.txt +++ b/gloo/test/CMakeLists.txt @@ -17,7 +17,7 @@ set(GLOO_TEST_SRCS "${CMAKE_CURRENT_SOURCE_DIR}/remote_key_test.cc" "${CMAKE_CURRENT_SOURCE_DIR}/send_recv_test.cc" ) -set(GLOO_TEST_LIBRARIES) +set(GLOO_TEST_LIBRARIES ${gloo_ALL_TARGETS_DEPENDENCY_LIBS}) if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") list(APPEND GLOO_TEST_SRCS @@ -64,7 +64,7 @@ if(USE_CUDA) else() cuda_add_executable(gloo_test_cuda ${GLOO_TEST_CUDA_SRCS}) endif() - target_link_libraries(gloo_test_cuda gloo_cuda gtest OpenSSL::SSL OpenSSL::Crypto ${GLOO_CUDA_LIBRARIES}) + target_link_libraries(gloo_test_cuda gloo_cuda gtest ${GLOO_TEST_LIBRARIES} ${GLOO_CUDA_LIBRARIES}) endif() endif() @@ -79,5 +79,5 @@ if(USE_ROCM) ) gloo_hip_add_executable(gloo_test_hip ${GLOO_TEST_HIP_SRCS}) - target_link_libraries(gloo_test_hip gloo_hip gtest OpenSSL::SSL OpenSSL::Crypto) + target_link_libraries(gloo_test_hip gloo_hip gtest ${GLOO_TEST_LIBRARIES}) endif() From aeaea26ca72896ac85e04b7678e5835a903f1e1f Mon Sep 17 00:00:00 2001 From: Jack Bond-Preston Date: Fri, 10 Oct 2025 22:29:27 +0000 Subject: [PATCH 3/4] Use spdlog for logging in Gloo Change Gloo logging macros to use SPDLOG_* macros for logging. This change is largely mechanical - changing the structure of the logging calls to use fmtlib::fmt-style formatting. For example: GLOO_DEBUG("thingOne=", thingOne, " thingTwo=", thingTwo); becomes: GLOO_DEBUG("thingOne={} thingTwo={}", thingOne, thingTwo); However, there are also some less mechanical changes: - Since we don't want to expose spdlog as a public dependency of Gloo, the logging infrastructure is moved into a different header (gloo/common/log.h). This header is not included in GLOO_HDRS, so it is not installed. It should only ever be included by .cc files, or non-public headers. - Due to the previous change, the GLOO_ENFORCE_* macros are moved into a separate header (gloo/common/enforce.h). These are used often in public header files, and are not inherently tied to the logging, so this made the most sense. - gloo/common/log.h (effectively renamed from logging.h) no longer needs a .cc file, as it just passes GLOO_* macros through to the SPDLOG_* macros. - gloo has no init function which must be called first, and can be built as a static library (precluding something like __attribute__((constructor))). For this reason, all logger calls are prefixed with a std::run_once which ensures the logger init/config is performed before the first use of the logger. - gloo/transport/tcp/debug_logger.{h,cc} are deleted, and replaced with a custom fmt formatter implementation in gloo/transport/debug_data.h. - Wherever lists of includes were touched as part of this change, they were reformatted/fixed (e.g. X.h -> cX, alphabetical re-ordering, changing gloo includes from <> to ""). --- docs/errors.md | 2 +- gloo/algorithm.cc | 2 +- gloo/allgather.cc | 2 +- gloo/allgatherv.cc | 2 +- gloo/allreduce.cc | 2 +- gloo/alltoall.cc | 2 +- gloo/alltoall.h | 2 +- gloo/alltoallv.cc | 2 +- gloo/alltoallv.h | 2 +- gloo/benchmark/cuda_main.cc | 2 +- gloo/benchmark/main.cc | 2 +- gloo/benchmark/runner.cc | 2 +- gloo/broadcast.cc | 2 +- gloo/broadcast_one_to_all.h | 2 +- gloo/common/CMakeLists.txt | 4 +- gloo/common/{logging.cc => enforce.cc} | 32 +--------- gloo/common/{logging.h => enforce.h} | 46 -------------- gloo/common/linux.cc | 2 +- gloo/common/log.h | 72 ++++++++++++++++++++++ gloo/common/win.cc | 2 +- gloo/context.cc | 2 +- gloo/cuda.h | 2 +- gloo/cuda_allreduce_local.cc | 2 +- gloo/cuda_collectives_device.h | 2 +- gloo/cuda_collectives_host.h | 2 +- gloo/cuda_collectives_native.h | 2 +- gloo/cuda_collectives_nccl.h | 2 +- gloo/cuda_private.h | 2 +- gloo/gather.cc | 2 +- gloo/gatherv.cc | 2 +- gloo/mpi/context.cc | 2 +- gloo/nccl/nccl.h | 2 +- gloo/pairwise_exchange.h | 2 +- gloo/reduce.cc | 2 +- gloo/rendezvous/context.cc | 2 +- gloo/rendezvous/file_store.cc | 2 +- gloo/rendezvous/hash_store.cc | 2 +- gloo/rendezvous/redis_store.cc | 2 +- gloo/rendezvous/store.h | 2 +- gloo/scatter.cc | 2 +- gloo/test/base_test.cc | 3 +- gloo/test/multiproc_test.h | 2 +- gloo/test/openssl_utils.cc | 2 +- gloo/transport/ibverbs/address.cc | 2 +- gloo/transport/ibverbs/buffer.cc | 10 +-- gloo/transport/ibverbs/device.cc | 18 +++--- gloo/transport/ibverbs/memory_region.cc | 2 +- gloo/transport/ibverbs/pair.cc | 77 ++++++++---------------- gloo/transport/ibverbs/unbound_buffer.cc | 9 ++- gloo/transport/pair.h | 2 +- gloo/transport/tcp/CMakeLists.txt | 2 - gloo/transport/tcp/address.cc | 2 +- gloo/transport/tcp/buffer.cc | 2 +- gloo/transport/tcp/context.cc | 14 ++--- gloo/transport/tcp/debug_data.h | 30 ++++++--- gloo/transport/tcp/debug_logger.cc | 30 --------- gloo/transport/tcp/debug_logger.h | 20 ------ gloo/transport/tcp/device.cc | 2 +- gloo/transport/tcp/helpers.cc | 48 ++++++++++++++- gloo/transport/tcp/helpers.h | 60 ++---------------- gloo/transport/tcp/listener.cc | 2 +- gloo/transport/tcp/loop.cc | 2 +- gloo/transport/tcp/pair.cc | 2 +- gloo/transport/tcp/socket.cc | 2 +- gloo/transport/tcp/tls/context.cc | 2 +- gloo/transport/tcp/tls/pair.cc | 2 +- gloo/transport/tcp/unbound_buffer.cc | 2 +- gloo/transport/uv/address.cc | 2 +- gloo/transport/uv/context.cc | 2 +- gloo/transport/uv/device.cc | 2 +- gloo/transport/uv/pair.cc | 2 +- gloo/transport/uv/unbound_buffer.cc | 2 +- 72 files changed, 254 insertions(+), 333 deletions(-) rename gloo/common/{logging.cc => enforce.cc} (50%) rename gloo/common/{logging.h => enforce.h} (81%) create mode 100644 gloo/common/log.h delete mode 100644 gloo/transport/tcp/debug_logger.cc delete mode 100644 gloo/transport/tcp/debug_logger.h diff --git a/docs/errors.md b/docs/errors.md index 93f1279b7..94248d584 100644 --- a/docs/errors.md +++ b/docs/errors.md @@ -18,4 +18,4 @@ extend `::gloo::Exception`. ## Assertions Gloo asserts unexpected errors and logical invariants instead of expecting callers to handle them. `GLOO_ENFORCE` macros are defined in -[`logging.h`](../gloo/common/logging.h) +[`enforce.h`](../gloo/common/enforce.h) diff --git a/gloo/algorithm.cc b/gloo/algorithm.cc index 9488ff59d..bc2d06fe1 100644 --- a/gloo/algorithm.cc +++ b/gloo/algorithm.cc @@ -8,7 +8,7 @@ #include "gloo/algorithm.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { diff --git a/gloo/allgather.cc b/gloo/allgather.cc index 8ab4e6f4f..b45a0215f 100644 --- a/gloo/allgather.cc +++ b/gloo/allgather.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/allgatherv.cc b/gloo/allgatherv.cc index 72fc3988e..50db2a143 100644 --- a/gloo/allgatherv.cc +++ b/gloo/allgatherv.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/allreduce.cc b/gloo/allreduce.cc index 080f7f302..00da4dff7 100644 --- a/gloo/allreduce.cc +++ b/gloo/allreduce.cc @@ -12,7 +12,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/math.h" #include "gloo/types.h" diff --git a/gloo/alltoall.cc b/gloo/alltoall.cc index aa7db9a87..2ce5d5754 100644 --- a/gloo/alltoall.cc +++ b/gloo/alltoall.cc @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/alltoall.h b/gloo/alltoall.h index c588f9b2c..448d2c209 100644 --- a/gloo/alltoall.h +++ b/gloo/alltoall.h @@ -8,7 +8,7 @@ #pragma once -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/context.h" #include "gloo/transport/unbound_buffer.h" diff --git a/gloo/alltoallv.cc b/gloo/alltoallv.cc index 7a51f9527..612ad8a06 100644 --- a/gloo/alltoallv.cc +++ b/gloo/alltoallv.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/alltoallv.h b/gloo/alltoallv.h index b0bb21e10..b7761d89a 100644 --- a/gloo/alltoallv.h +++ b/gloo/alltoallv.h @@ -8,7 +8,7 @@ #pragma once -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/context.h" #include "gloo/transport/unbound_buffer.h" diff --git a/gloo/benchmark/cuda_main.cc b/gloo/benchmark/cuda_main.cc index 387338f6e..7736f2b97 100644 --- a/gloo/benchmark/cuda_main.cc +++ b/gloo/benchmark/cuda_main.cc @@ -11,7 +11,7 @@ #include "gloo/benchmark/benchmark.h" #include "gloo/benchmark/runner.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda_allreduce_bcube.h" #include "gloo/cuda_allreduce_halving_doubling.h" #include "gloo/cuda_allreduce_halving_doubling_pipelined.h" diff --git a/gloo/benchmark/main.cc b/gloo/benchmark/main.cc index 2e4678006..d7e19f060 100644 --- a/gloo/benchmark/main.cc +++ b/gloo/benchmark/main.cc @@ -26,7 +26,7 @@ #include "gloo/broadcast_one_to_all.h" #include "gloo/common/aligned_allocator.h" #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/context.h" #include "gloo/pairwise_exchange.h" #include "gloo/reduce.h" diff --git a/gloo/benchmark/runner.cc b/gloo/benchmark/runner.cc index f73dade2b..ed3cbaab9 100644 --- a/gloo/benchmark/runner.cc +++ b/gloo/benchmark/runner.cc @@ -15,7 +15,7 @@ #include "gloo/barrier_all_to_one.h" #include "gloo/broadcast_one_to_all.h" #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/rendezvous/context.h" #include "gloo/rendezvous/file_store.h" #include "gloo/rendezvous/prefix_store.h" diff --git a/gloo/broadcast.cc b/gloo/broadcast.cc index 14f0fc2b4..311d16a00 100644 --- a/gloo/broadcast.cc +++ b/gloo/broadcast.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/math.h" #include "gloo/types.h" diff --git a/gloo/broadcast_one_to_all.h b/gloo/broadcast_one_to_all.h index e599e102f..e8910bb76 100644 --- a/gloo/broadcast_one_to_all.h +++ b/gloo/broadcast_one_to_all.h @@ -13,7 +13,7 @@ #include "gloo/algorithm.h" #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { diff --git a/gloo/common/CMakeLists.txt b/gloo/common/CMakeLists.txt index a8d474498..ebf0fb23d 100644 --- a/gloo/common/CMakeLists.txt +++ b/gloo/common/CMakeLists.txt @@ -1,13 +1,13 @@ set(GLOO_COMMON_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/logging.cc" + "${CMAKE_CURRENT_SOURCE_DIR}/enforce.cc" "${CMAKE_CURRENT_SOURCE_DIR}/utils.cc" ) set(GLOO_COMMON_HDRS "${CMAKE_CURRENT_SOURCE_DIR}/aligned_allocator.h" "${CMAKE_CURRENT_SOURCE_DIR}/common.h" + "${CMAKE_CURRENT_SOURCE_DIR}/enforce.h" "${CMAKE_CURRENT_SOURCE_DIR}/error.h" - "${CMAKE_CURRENT_SOURCE_DIR}/logging.h" "${CMAKE_CURRENT_SOURCE_DIR}/store.h" "${CMAKE_CURRENT_SOURCE_DIR}/string.h" "${CMAKE_CURRENT_SOURCE_DIR}/utils.h" diff --git a/gloo/common/logging.cc b/gloo/common/enforce.cc similarity index 50% rename from gloo/common/logging.cc rename to gloo/common/enforce.cc index 91b02ea18..879a33c2e 100644 --- a/gloo/common/logging.cc +++ b/gloo/common/enforce.cc @@ -6,42 +6,12 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" -#include -#include #include namespace gloo { -// Initialize log level from environment variable, and return static value at -// each inquiry. -LogLevel logLevel() { - // Global log level. Initialized once. - static LogLevel log_level = LogLevel::UNSET; - if (log_level != LogLevel::UNSET) { - return log_level; - } - - const char* level = getenv("GLOO_LOG_LEVEL"); - // Defaults to WARN. - if (level == nullptr) { - log_level = LogLevel::WARN; - return log_level; - } - - if (std::strcmp(level, "DEBUG") == 0) { - log_level = LogLevel::DEBUG; - } else if (std::strcmp(level, "INFO") == 0) { - log_level = LogLevel::INFO; - } else if (std::strcmp(level, "WARN") == 0) { - log_level = LogLevel::WARN; - } else { - log_level = LogLevel::ERROR; - } - return log_level; -} - EnforceNotMet::EnforceNotMet( const char* file, const int line, diff --git a/gloo/common/logging.h b/gloo/common/enforce.h similarity index 81% rename from gloo/common/logging.h rename to gloo/common/enforce.h index c5e7c1da9..d940a3fc1 100644 --- a/gloo/common/logging.h +++ b/gloo/common/enforce.h @@ -10,58 +10,12 @@ #include #include -#include -#include -#include #include -#include "gloo/common/error.h" #include "gloo/common/string.h" namespace gloo { -enum LogLevel { - ERROR, - WARN, - INFO, - DEBUG, - UNSET, -}; - -LogLevel logLevel(); - -#define GLOO_LOG_MSG(level, ...) \ - std::cerr << ::gloo::MakeString( \ - "[", __FILE__, ":", __LINE__, "] ", level, " ", __VA_ARGS__, "\n") - -#define GLOO_ERROR(...) \ - do { \ - if (logLevel() >= LogLevel::ERROR) { \ - GLOO_LOG_MSG("ERROR", __VA_ARGS__); \ - } \ - } while (0) - -#define GLOO_WARN(...) \ - do { \ - if (logLevel() >= LogLevel::WARN) { \ - GLOO_LOG_MSG("WARN", __VA_ARGS__); \ - } \ - } while (0) - -#define GLOO_INFO(...) \ - do { \ - if (logLevel() >= LogLevel::INFO) { \ - GLOO_LOG_MSG("INFO", __VA_ARGS__); \ - } \ - } while (0) - -#define GLOO_DEBUG(...) \ - do { \ - if (logLevel() >= LogLevel::DEBUG) { \ - GLOO_LOG_MSG("DEBUG", __VA_ARGS__); \ - } \ - } while (0) - class EnforceNotMet : public std::exception { public: EnforceNotMet( diff --git a/gloo/common/linux.cc b/gloo/common/linux.cc index 9b45fc314..784e5e8f9 100644 --- a/gloo/common/linux.cc +++ b/gloo/common/linux.cc @@ -28,7 +28,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #ifndef SPEED_UNKNOWN /* SPEED_UNKOWN is sometimes undefined, c.f. diff --git a/gloo/common/log.h b/gloo/common/log.h new file mode 100644 index 000000000..cde3eacde --- /dev/null +++ b/gloo/common/log.h @@ -0,0 +1,72 @@ +#pragma once + +/** + * @file log.h + * @brief This header is intended to be internal only, so as not to make spdlog + * a dependency of gloo library consumers. For this reason, it should not be + * included by public headers. + */ + +/* The compile-time log level is set to the most sensitive (TRACE), to ensure no + * logging statements are removed at build time. This allows logging to be fully + * controllable at runtime, even to the TRACE level. */ +#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE + +#include +#include + +/* ranges.h provides formatter for containers, e.g. std::vector */ +#include + +namespace gloo::log { + +inline void init() { + /* Default to WARN log level */ + spdlog::set_level(spdlog::level::warn); + /* Override log level if the environment variable is set. */ + spdlog::cfg::load_env_levels("GLOO_LOG_LEVEL"); + + /* Set custom format. This is similiar to PyTorch's format. Equivalent to: + * [{short-log-level}{month-num}{day-of-month} {hour}:{min}:{sec}.{microsec} + * {threadid} {source_file}:{line_no}] {MESSAGE} */ + spdlog::set_pattern("[%L%m%d %H:%M:%S.%f %t %s:%#] %v"); +} + +inline std::once_flag onceFlag; + +inline void ensureInit() { + std::call_once(onceFlag, [] { init(); }); +} + +} // namespace gloo::log + +#define GLOO_TRACE(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_TRACE(__VA_ARGS__); \ + } while (0) +#define GLOO_DEBUG(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_DEBUG(__VA_ARGS__); \ + } while (0) +#define GLOO_INFO(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_INFO(__VA_ARGS__); \ + } while (0) +#define GLOO_WARN(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_WARN(__VA_ARGS__); \ + } while (0) +#define GLOO_ERROR(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_ERROR(__VA_ARGS__); \ + } while (0) +#define GLOO_CRITICAL(...) \ + do { \ + ::gloo::log::ensureInit(); \ + SPDLOG_CRITICAL(__VA_ARGS__); \ + } while (0) diff --git a/gloo/common/win.cc b/gloo/common/win.cc index 5dfb80e28..bbce93c6c 100644 --- a/gloo/common/win.cc +++ b/gloo/common/win.cc @@ -7,7 +7,7 @@ */ #include "gloo/common/win.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include diff --git a/gloo/context.cc b/gloo/context.cc index fd9b83c7b..a919163ac 100644 --- a/gloo/context.cc +++ b/gloo/context.cc @@ -8,8 +8,8 @@ #include "gloo/context.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/device.h" #include "gloo/transport/unbound_buffer.h" diff --git a/gloo/cuda.h b/gloo/cuda.h index ccb196a24..39049ba76 100644 --- a/gloo/cuda.h +++ b/gloo/cuda.h @@ -16,7 +16,7 @@ #include #include "gloo/algorithm.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/config.h" // Check that configuration header was properly generated diff --git a/gloo/cuda_allreduce_local.cc b/gloo/cuda_allreduce_local.cc index 9c31e5e46..52c78f054 100644 --- a/gloo/cuda_allreduce_local.cc +++ b/gloo/cuda_allreduce_local.cc @@ -8,7 +8,7 @@ #include "gloo/cuda_allreduce_local.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda_collectives_device.h" #include "gloo/cuda_private.h" diff --git a/gloo/cuda_collectives_device.h b/gloo/cuda_collectives_device.h index c2c6a81d1..2cace96a8 100644 --- a/gloo/cuda_collectives_device.h +++ b/gloo/cuda_collectives_device.h @@ -12,7 +12,7 @@ #include #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/config.h" #include "gloo/cuda.h" #include "gloo/cuda_private.h" diff --git a/gloo/cuda_collectives_host.h b/gloo/cuda_collectives_host.h index 32eb6d3ef..2f9579162 100644 --- a/gloo/cuda_collectives_host.h +++ b/gloo/cuda_collectives_host.h @@ -9,7 +9,7 @@ #pragma once #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda.h" namespace gloo { diff --git a/gloo/cuda_collectives_native.h b/gloo/cuda_collectives_native.h index e6c45c401..07b21384d 100644 --- a/gloo/cuda_collectives_native.h +++ b/gloo/cuda_collectives_native.h @@ -13,7 +13,7 @@ #include #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda.h" #include "gloo/cuda_private.h" diff --git a/gloo/cuda_collectives_nccl.h b/gloo/cuda_collectives_nccl.h index acc64a6b0..ffec2463c 100644 --- a/gloo/cuda_collectives_nccl.h +++ b/gloo/cuda_collectives_nccl.h @@ -9,7 +9,7 @@ #pragma once #include "gloo/common/common.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/nccl/nccl.h" namespace gloo { diff --git a/gloo/cuda_private.h b/gloo/cuda_private.h index 355fedd06..9fc353980 100644 --- a/gloo/cuda_private.h +++ b/gloo/cuda_private.h @@ -16,7 +16,7 @@ #ifdef __linux__ #include "gloo/common/linux.h" #endif -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda.h" #include "gloo/transport/device.h" diff --git a/gloo/gather.cc b/gloo/gather.cc index cab5ffadc..c86ed29ad 100644 --- a/gloo/gather.cc +++ b/gloo/gather.cc @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/gatherv.cc b/gloo/gatherv.cc index 306f6e45f..c0deb61f3 100644 --- a/gloo/gatherv.cc +++ b/gloo/gatherv.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/mpi/context.cc b/gloo/mpi/context.cc index e8c287960..245a1af56 100644 --- a/gloo/mpi/context.cc +++ b/gloo/mpi/context.cc @@ -12,8 +12,8 @@ #include #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/address.h" namespace gloo { diff --git a/gloo/nccl/nccl.h b/gloo/nccl/nccl.h index 35746e350..0e6d5356f 100644 --- a/gloo/nccl/nccl.h +++ b/gloo/nccl/nccl.h @@ -13,7 +13,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/cuda.h" #include "gloo/types.h" diff --git a/gloo/pairwise_exchange.h b/gloo/pairwise_exchange.h index f0da7552f..03550865f 100644 --- a/gloo/pairwise_exchange.h +++ b/gloo/pairwise_exchange.h @@ -11,7 +11,7 @@ #include #include "gloo/algorithm.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/context.h" namespace gloo { diff --git a/gloo/reduce.cc b/gloo/reduce.cc index 4ace5607d..41d7db00c 100644 --- a/gloo/reduce.cc +++ b/gloo/reduce.cc @@ -12,7 +12,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/math.h" #include "gloo/types.h" diff --git a/gloo/rendezvous/context.cc b/gloo/rendezvous/context.cc index f71739d3a..7db0a17cd 100644 --- a/gloo/rendezvous/context.cc +++ b/gloo/rendezvous/context.cc @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/rendezvous/store.h" #include "gloo/transport/address.h" diff --git a/gloo/rendezvous/file_store.cc b/gloo/rendezvous/file_store.cc index 1202e80f0..12a1b34e9 100644 --- a/gloo/rendezvous/file_store.cc +++ b/gloo/rendezvous/file_store.cc @@ -26,8 +26,8 @@ #include #endif +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" namespace gloo { namespace rendezvous { diff --git a/gloo/rendezvous/hash_store.cc b/gloo/rendezvous/hash_store.cc index 34c0951f7..984c46d0d 100644 --- a/gloo/rendezvous/hash_store.cc +++ b/gloo/rendezvous/hash_store.cc @@ -8,8 +8,8 @@ #include "gloo/rendezvous/hash_store.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" namespace gloo { namespace rendezvous { diff --git a/gloo/rendezvous/redis_store.cc b/gloo/rendezvous/redis_store.cc index f4d91fb2a..503162178 100644 --- a/gloo/rendezvous/redis_store.cc +++ b/gloo/rendezvous/redis_store.cc @@ -10,8 +10,8 @@ #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/common/string.h" namespace gloo { diff --git a/gloo/rendezvous/store.h b/gloo/rendezvous/store.h index e2569d0ef..b32e5f72c 100644 --- a/gloo/rendezvous/store.h +++ b/gloo/rendezvous/store.h @@ -12,8 +12,8 @@ #include #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/common/store.h" // can be used by upstream users to know whether this is available or not. diff --git a/gloo/scatter.cc b/gloo/scatter.cc index c1810ca47..e7317a62e 100644 --- a/gloo/scatter.cc +++ b/gloo/scatter.cc @@ -11,7 +11,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/types.h" namespace gloo { diff --git a/gloo/test/base_test.cc b/gloo/test/base_test.cc index ccccfa3bb..b5cf44bc8 100644 --- a/gloo/test/base_test.cc +++ b/gloo/test/base_test.cc @@ -7,6 +7,7 @@ */ #include "gloo/test/base_test.h" +#include "gloo/common/log.h" #include "gloo/test/openssl_utils.h" namespace gloo { @@ -81,7 +82,7 @@ std::shared_ptr<::gloo::transport::Device> createDevice(Transport transport) { try { return ::gloo::transport::ibverbs::CreateDevice(attr); } catch (const InvalidOperationException& e) { - GLOO_INFO("IBVERBS not available: ", e.what()); + GLOO_INFO("IBVERBS not available: {}", e.what()); } } #endif diff --git a/gloo/test/multiproc_test.h b/gloo/test/multiproc_test.h index e639fc3e7..0e4193052 100644 --- a/gloo/test/multiproc_test.h +++ b/gloo/test/multiproc_test.h @@ -15,7 +15,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/rendezvous/context.h" #include "gloo/rendezvous/file_store.h" #include "gloo/test/base_test.h" diff --git a/gloo/test/openssl_utils.cc b/gloo/test/openssl_utils.cc index 485f78e25..8eb9f0ba9 100644 --- a/gloo/test/openssl_utils.cc +++ b/gloo/test/openssl_utils.cc @@ -5,7 +5,7 @@ #include "gloo/test/openssl_utils.h" #if GLOO_HAVE_TRANSPORT_TCP_TLS -#include +#include #include namespace gloo { diff --git a/gloo/transport/ibverbs/address.cc b/gloo/transport/ibverbs/address.cc index 7f1b9d0c5..050a813e7 100644 --- a/gloo/transport/ibverbs/address.cc +++ b/gloo/transport/ibverbs/address.cc @@ -12,7 +12,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { namespace transport { diff --git a/gloo/transport/ibverbs/buffer.cc b/gloo/transport/ibverbs/buffer.cc index bb39e8b04..965c889c3 100644 --- a/gloo/transport/ibverbs/buffer.cc +++ b/gloo/transport/ibverbs/buffer.cc @@ -6,16 +6,15 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/ibverbs/buffer.h" - #include #include #include - #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" +#include "gloo/common/log.h" +#include "gloo/transport/ibverbs/buffer.h" namespace gloo { namespace transport { @@ -63,7 +62,8 @@ Buffer::~Buffer() { std::lock_guard lock(m_); if (sendPending_ > 0) { GLOO_WARN( - "Destructing buffer with pending sends, sendPending_=", sendPending_); + "Destructing buffer with pending sends, sendPending_={}", + sendPending_.load()); } ibv_dereg_mr(mr_); diff --git a/gloo/transport/ibverbs/device.cc b/gloo/transport/ibverbs/device.cc index ef0d5a123..6146e7f86 100644 --- a/gloo/transport/ibverbs/device.cc +++ b/gloo/transport/ibverbs/device.cc @@ -6,18 +6,17 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/ibverbs/device.h" - #include #include -#include - #include +#include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" #include "gloo/common/linux.h" -#include "gloo/common/logging.h" +#include "gloo/common/log.h" #include "gloo/transport/ibverbs/context.h" +#include "gloo/transport/ibverbs/device.h" #include "gloo/transport/ibverbs/pair.h" namespace gloo { @@ -85,9 +84,8 @@ std::shared_ptr<::gloo::transport::Device> CreateDevice( std::vector names; for (auto i = 0; i < devices.size(); i++) { GLOO_DEBUG( - "found candidate device ", + "found candidate device {} dev={}", devices[i]->name, - " dev=", devices[i]->dev_name); names.push_back(devices[i]->name); } @@ -96,11 +94,9 @@ std::shared_ptr<::gloo::transport::Device> CreateDevice( } GLOO_INFO( - "Using ibverbs device=", + "Using ibverbs device={} port={} index={}", attr.name, - " port=", attr.port, - " index=", attr.index); // Look for specified device name @@ -227,7 +223,7 @@ void Device::loop() { Pair* pair = static_cast(cqContext); pair->handleCompletionEvent(); } catch (const std::exception& ex) { - GLOO_ERROR("Exception while handling completion event: ", ex.what()); + GLOO_ERROR("Exception while handling completion event: {}", ex.what()); throw; } } diff --git a/gloo/transport/ibverbs/memory_region.cc b/gloo/transport/ibverbs/memory_region.cc index 83320dd2d..702bcb9f8 100644 --- a/gloo/transport/ibverbs/memory_region.cc +++ b/gloo/transport/ibverbs/memory_region.cc @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { namespace transport { diff --git a/gloo/transport/ibverbs/pair.cc b/gloo/transport/ibverbs/pair.cc index d3a0410bb..0ede8ae96 100644 --- a/gloo/transport/ibverbs/pair.cc +++ b/gloo/transport/ibverbs/pair.cc @@ -6,16 +6,16 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/ibverbs/pair.h" -#include "gloo/transport/ibverbs/buffer.h" -#include "gloo/transport/ibverbs/unbound_buffer.h" - -#include -#include +#include +#include #include "gloo/common/common.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" +#include "gloo/common/log.h" +#include "gloo/transport/ibverbs/buffer.h" +#include "gloo/transport/ibverbs/pair.h" +#include "gloo/transport/ibverbs/unbound_buffer.h" namespace gloo { namespace transport { @@ -231,7 +231,7 @@ void Pair::sendMemoryRegion(struct ibv_mr* src, int slot) { wr.imm_data = slot; GLOO_DEBUG( - "sendMemoryRegion slot=", slot, " addr=", src->addr, " lkey=", src->lkey); + "sendMemoryRegion slot={} addr={} lkey={}", slot, src->addr, src->lkey); // The work request is serialized and sent to the driver so it // doesn't need to be valid after the ibv_post_send call. @@ -331,7 +331,7 @@ void Pair::send( size_t offset, size_t nbytes) { std::unique_lock lock(m_); - GLOO_DEBUG("send tag=", slot, " offset=", offset, " nbytes=", nbytes); + GLOO_DEBUG("send tag={} offset={} nbytes={}", slot, offset, nbytes); GLOO_ENFORCE(!sync_, "Cannot send in sync mode"); @@ -364,15 +364,11 @@ void Pair::send( wr.wr.rdma.rkey = peer.rkey; GLOO_DEBUG( - "send UnboundBuffer async slot=", + "send UnboundBuffer async slot={} peer_addr={} remote_addr={:#x} " + "rkey={}", wr.wr_id, - " peer_addr=", peer.addr, - " remote_addr=", - std::hex, wr.wr.rdma.remote_addr, - std::dec, - " rkey=", wr.wr.rdma.rkey); struct ibv_send_wr* bad_wr; @@ -391,7 +387,7 @@ void Pair::recv( size_t nbytes) { std::unique_lock lock(m_); - GLOO_DEBUG("recv tag=", tag, " offset=", offset, " nbytes=", nbytes); + GLOO_DEBUG("recv tag={} offset={} nbytes={}", tag, offset, nbytes); GLOO_ENFORCE(!sync_, "Cannot recv in sync mode"); @@ -445,19 +441,13 @@ void Pair::put( wr.wr.rdma.rkey = key->rkey_; GLOO_DEBUG( + "{}->{}: put UnboundBuffer async slot={} peer_addr={} " + "remote_addr={:#x} rkey={}", self_.str(), - "->", peer_.str(), - ": ", - "put UnboundBuffer async slot=", wr.wr_id, - " peer_addr=", key->addr_, - " remote_addr=", - std::hex, wr.wr.rdma.remote_addr, - std::dec, - " rkey=", wr.wr.rdma.rkey); struct ibv_send_wr* bad_wr; @@ -507,19 +497,13 @@ void Pair::get( wr.wr.rdma.rkey = key->rkey_; GLOO_DEBUG( + "{}->{}: get UnboundBuffer async slot={} peer_addr={} " + "remote_addr={:#x} rkey={}", self_.str(), - "->", peer_.str(), - ": ", - "get UnboundBuffer async slot=", wr.wr_id, - " peer_addr=", key->addr_, - " remote_addr=", - std::hex, wr.wr.rdma.remote_addr, - std::dec, - " rkey=", wr.wr.rdma.rkey); struct ibv_send_wr* bad_wr; @@ -585,11 +569,9 @@ void Pair::pollCompletions() { handleCompletion(&wc[i]); } catch (const std::exception& ex) { GLOO_ERROR( + "{}->{}: Exception in handleCompletion: {}", self_.str(), - "->", peer_.str(), - ": ", - "Exception in handleCompletion: ", ex.what()); throw; } @@ -604,13 +586,11 @@ void Pair::pollCompletions() { void Pair::handleCompletion(struct ibv_wc* wc) { GLOO_DEBUG( + "{}->{}: handleCompletion id={} opcode={}", self_.str(), - "->", peer_.str(), - ": handleCompletion id=", wc->wr_id, - " opcode=", - wc->opcode); + fmt::underlying(wc->opcode)); if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { // Incoming RDMA write completed. @@ -657,12 +637,10 @@ void Pair::handleCompletion(struct ibv_wc* wc) { } } else if (wc->opcode == IBV_WC_RECV) { GLOO_DEBUG( + "{}->{}: handleCompletion id={} opcode=IBV_WC_RECV slot={}", self_.str(), - "->", peer_.str(), - ": handleCompletion id=", wc->wr_id, - " opcode=IBV_WC_RECV slot=", wc->imm_data); // Memory region recv completed. // @@ -706,12 +684,10 @@ void Pair::handleCompletion(struct ibv_wc* wc) { // Memory region send completed. GLOO_DEBUG( + "{}->{}: handleCompletion id={} opcode=IBV_WC_SEND", self_.str(), - "->", peer_.str(), - ": handleCompletion id=", - wc->wr_id, - " opcode=IBV_WC_SEND"); + wc->wr_id); auto slot = wc->wr_id; GLOO_ENFORCE_EQ( @@ -750,15 +726,10 @@ void Pair::send(Buffer* buffer, size_t offset, size_t length, size_t roffset) { wr.wr.rdma.rkey = peer.rkey; GLOO_DEBUG( - "send Buffer async slot=", + "send Buffer async slot={} peer_addr={} remote_addr={:#x} rkey={}", wr.wr_id, - " peer_addr=", peer.addr, - " remote_addr=", - std::hex, wr.wr.rdma.remote_addr, - std::dec, - " rkey=", wr.wr.rdma.rkey); struct ibv_send_wr* bad_wr; @@ -781,7 +752,7 @@ void Pair::send(Buffer* buffer, size_t offset, size_t length, size_t roffset) { void Pair::signalIoFailure(const std::string& msg) { std::lock_guard lock(m_); - GLOO_ERROR(msg); + GLOO_ERROR("{}", msg); auto ex = ::gloo::IoException(msg); if (ex_ == nullptr) { // If we haven't seen an error yet, store the exception to throw on future diff --git a/gloo/transport/ibverbs/unbound_buffer.cc b/gloo/transport/ibverbs/unbound_buffer.cc index 971535d7c..e3a4fd022 100644 --- a/gloo/transport/ibverbs/unbound_buffer.cc +++ b/gloo/transport/ibverbs/unbound_buffer.cc @@ -6,13 +6,13 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/ibverbs/unbound_buffer.h" - #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" +#include "gloo/common/log.h" #include "gloo/transport/ibverbs/context.h" +#include "gloo/transport/ibverbs/unbound_buffer.h" namespace gloo { namespace transport { @@ -106,9 +106,8 @@ void UnboundBuffer::handleCompletion(int rank, struct ibv_wc* wc) { sendCv_.notify_one(); GLOO_DEBUG( - "send complete sendPending=", + "send complete sendPending={} sendCompletions={}", sendPending_, - " sendCompletions=", sendCompletions_); } else { GLOO_ENFORCE(false, "Unexpected completion (opcode: ", wc->opcode, ")"); diff --git a/gloo/transport/pair.h b/gloo/transport/pair.h index 42ef7cd69..93eabf783 100644 --- a/gloo/transport/pair.h +++ b/gloo/transport/pair.h @@ -10,7 +10,7 @@ #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/transport/address.h" #include "gloo/transport/buffer.h" #include "gloo/transport/unbound_buffer.h" diff --git a/gloo/transport/tcp/CMakeLists.txt b/gloo/transport/tcp/CMakeLists.txt index acaf906b7..62691dd55 100644 --- a/gloo/transport/tcp/CMakeLists.txt +++ b/gloo/transport/tcp/CMakeLists.txt @@ -5,7 +5,6 @@ else() "${CMAKE_CURRENT_SOURCE_DIR}/address.cc" "${CMAKE_CURRENT_SOURCE_DIR}/buffer.cc" "${CMAKE_CURRENT_SOURCE_DIR}/context.cc" - "${CMAKE_CURRENT_SOURCE_DIR}/debug_logger.cc" "${CMAKE_CURRENT_SOURCE_DIR}/device.cc" "${CMAKE_CURRENT_SOURCE_DIR}/error.cc" "${CMAKE_CURRENT_SOURCE_DIR}/helpers.cc" @@ -21,7 +20,6 @@ else() "${CMAKE_CURRENT_SOURCE_DIR}/buffer.h" "${CMAKE_CURRENT_SOURCE_DIR}/context.h" "${CMAKE_CURRENT_SOURCE_DIR}/debug_data.h" - "${CMAKE_CURRENT_SOURCE_DIR}/debug_logger.h" "${CMAKE_CURRENT_SOURCE_DIR}/device.h" "${CMAKE_CURRENT_SOURCE_DIR}/error.h" "${CMAKE_CURRENT_SOURCE_DIR}/helpers.h" diff --git a/gloo/transport/tcp/address.cc b/gloo/transport/tcp/address.cc index e37f7b493..93c540d11 100644 --- a/gloo/transport/tcp/address.cc +++ b/gloo/transport/tcp/address.cc @@ -13,7 +13,7 @@ #include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { namespace transport { diff --git a/gloo/transport/tcp/buffer.cc b/gloo/transport/tcp/buffer.cc index 54a5e1119..11d9f34bd 100644 --- a/gloo/transport/tcp/buffer.cc +++ b/gloo/transport/tcp/buffer.cc @@ -15,7 +15,7 @@ #include #include "gloo/common/error.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" namespace gloo { namespace transport { diff --git a/gloo/transport/tcp/context.cc b/gloo/transport/tcp/context.cc index 023178741..28e527cd5 100644 --- a/gloo/transport/tcp/context.cc +++ b/gloo/transport/tcp/context.cc @@ -6,16 +6,15 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/tcp/context.h" - #include #include #include -#include #include -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" +#include "gloo/common/log.h" #include "gloo/common/utils.h" +#include "gloo/transport/tcp/context.h" #include "gloo/transport/tcp/device.h" #include "gloo/transport/tcp/pair.h" #include "gloo/transport/tcp/unbound_buffer.h" @@ -241,17 +240,14 @@ std::vector Context::getUnConnectedPeerRanks() const { void Context::printConnectivityInfo() const { int numConnectedPeers = getConnectedPeerRanks().size(); GLOO_INFO( - "Rank ", + "Rank {} is connected to {}. Expected number of connected peers is: {}", rank, - " is connected to ", numConnectedPeers, - ". Expected number of connected peers is: ", size - 1); if (numConnectedPeers != size - 1) { std::vector unConnectedPeers = getUnConnectedPeerRanks(); - auto peerStrCat = ::gloo::MakeString(unConnectedPeers, /*delim=*/", "); - GLOO_INFO("Rank ", rank, " is NOT connected to: [", peerStrCat, "]"); + GLOO_INFO("Rank {} is NOT connected to: {}", rank, unConnectedPeers); } } diff --git a/gloo/transport/tcp/debug_data.h b/gloo/transport/tcp/debug_data.h index 71863067e..699836b98 100644 --- a/gloo/transport/tcp/debug_data.h +++ b/gloo/transport/tcp/debug_data.h @@ -1,11 +1,10 @@ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +#pragma once +#include #include -#pragma once -namespace gloo { -namespace transport { -namespace tcp { +namespace gloo::transport::tcp { struct ConnectDebugData { const int retryCount; @@ -18,6 +17,23 @@ struct ConnectDebugData { const std::string local; }; -} // namespace tcp -} // namespace transport -} // namespace gloo +} // namespace gloo::transport::tcp + +template <> +struct fmt::formatter + : fmt::formatter { + auto format(gloo::transport::tcp::ConnectDebugData& data, format_context& ctx) + const -> decltype(ctx.out()) { + return fmt::format_to( + ctx.out(), + "willRetry={}, retry={}, retryLimit={}, rank={}, size={}, local={}, remote={}, error={}", + data.willRetry, + data.retryCount, + data.retryLimit, + data.glooRank, + data.glooSize, + data.local, + data.remote, + data.error); + } +}; diff --git a/gloo/transport/tcp/debug_logger.cc b/gloo/transport/tcp/debug_logger.cc deleted file mode 100644 index b55209616..000000000 --- a/gloo/transport/tcp/debug_logger.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include -#include - -namespace gloo { -namespace transport { -namespace tcp { - -void DebugLogger::log(const ConnectDebugData& data) { - GLOO_ERROR( - "failed to connect, willRetry=", - data.willRetry, - ", retry=", - data.retryCount, - ", retryLimit=", - data.retryLimit, - ", rank=", - data.glooRank, - ", size=", - data.glooSize, - ", local=", - data.local, - ", remote=", - data.remote, - ", error=", - data.error); -} - -} // namespace tcp -} // namespace transport -} // namespace gloo diff --git a/gloo/transport/tcp/debug_logger.h b/gloo/transport/tcp/debug_logger.h deleted file mode 100644 index e58f9c6a1..000000000 --- a/gloo/transport/tcp/debug_logger.h +++ /dev/null @@ -1,20 +0,0 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. - -#pragma once - -#include - -namespace gloo { -namespace transport { -namespace tcp { - -class DebugLogger { - public: - static void log(const ConnectDebugData& data); - - private: -}; - -} // namespace tcp -} // namespace transport -} // namespace gloo diff --git a/gloo/transport/tcp/device.cc b/gloo/transport/tcp/device.cc index daf0c2fb1..be9b19edf 100644 --- a/gloo/transport/tcp/device.cc +++ b/gloo/transport/tcp/device.cc @@ -15,9 +15,9 @@ #include #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" #include "gloo/common/linux.h" -#include "gloo/common/logging.h" #include "gloo/common/utils.h" #include "gloo/transport/tcp/context.h" #include "gloo/transport/tcp/helpers.h" diff --git a/gloo/transport/tcp/helpers.cc b/gloo/transport/tcp/helpers.cc index c403996b4..e57ccadc0 100644 --- a/gloo/transport/tcp/helpers.cc +++ b/gloo/transport/tcp/helpers.cc @@ -1,4 +1,7 @@ -#include +#include "gloo/transport/tcp/helpers.h" +#include "gloo/common/log.h" +#include "gloo/transport/tcp/debug_data.h" +#include "gloo/transport/tcp/loop.h" namespace gloo { namespace transport { @@ -16,6 +19,49 @@ void connectLoop( x->run(loop); } +void ConnectOperation::handleEvents(Loop& loop, int /*events*/) { + // Hold a reference to this object to keep it alive until the + // callback is called. + auto leak = shared_from_this(); + loop.unregisterDescriptor(socket_->fd(), this); + + int result; + socklen_t result_len = sizeof(result); + if (getsockopt(socket_->fd(), SOL_SOCKET, SO_ERROR, &result, &result_len) < + 0) { + fn_(loop, socket_, SystemError("getsockopt", errno, remote_)); + return; + } + if (result != 0) { + SystemError e("SO_ERROR", result, remote_); + bool willRetry = + std::chrono::steady_clock::now() < deadline_ && retry_++ < maxRetries_; + + auto debugData = ConnectDebugData{ + retry_, + maxRetries_, + willRetry, + rank_, + size_, + e.what(), + remote_.str(), + socket_->sockName().str(), + }; + GLOO_WARN("{}", debugData); + + // check deadline + if (willRetry) { + run(loop); + } else { + fn_(loop, socket_, TimeoutError("timed out connecting: " + e.what())); + } + + return; + } + + fn_(loop, socket_, Error::kSuccess); +} + } // namespace tcp } // namespace transport } // namespace gloo diff --git a/gloo/transport/tcp/helpers.h b/gloo/transport/tcp/helpers.h index c75993590..14329a4e9 100644 --- a/gloo/transport/tcp/helpers.h +++ b/gloo/transport/tcp/helpers.h @@ -11,16 +11,11 @@ #include #include -#include -#include -#include -#include -#include -#include "gloo/transport/tcp/debug_logger.h" // @manual=//gloo:debug_logger +#include "gloo/transport/tcp/error.h" +#include "gloo/transport/tcp/loop.h" +#include "gloo/transport/tcp/socket.h" -namespace gloo { -namespace transport { -namespace tcp { +namespace gloo::transport::tcp { // ReadValueOperation asynchronously reads a value of type T from the // socket specified at construction. Upon completion or error, the @@ -171,48 +166,7 @@ class ConnectOperation final this->shared_from_this()); } - void handleEvents(Loop& loop, int /*events*/) override { - // Hold a reference to this object to keep it alive until the - // callback is called. - auto leak = shared_from_this(); - loop.unregisterDescriptor(socket_->fd(), this); - - int result; - socklen_t result_len = sizeof(result); - if (getsockopt(socket_->fd(), SOL_SOCKET, SO_ERROR, &result, &result_len) < - 0) { - fn_(loop, socket_, SystemError("getsockopt", errno, remote_)); - return; - } - if (result != 0) { - SystemError e("SO_ERROR", result, remote_); - bool willRetry = std::chrono::steady_clock::now() < deadline_ && - retry_++ < maxRetries_; - - auto debugData = ConnectDebugData{ - retry_, - maxRetries_, - willRetry, - rank_, - size_, - e.what(), - remote_.str(), - socket_->sockName().str(), - }; - DebugLogger::log(debugData); - - // check deadline - if (willRetry) { - run(loop); - } else { - fn_(loop, socket_, TimeoutError("timed out connecting: " + e.what())); - } - - return; - } - - fn_(loop, socket_, Error::kSuccess); - } + void handleEvents(Loop& loop, int /*events*/) override; private: const Address remote_; @@ -235,6 +189,4 @@ void connectLoop( std::chrono::milliseconds timeout, typename ConnectOperation::callback_t fn); -} // namespace tcp -} // namespace transport -} // namespace gloo +} // namespace gloo::transport::tcp diff --git a/gloo/transport/tcp/listener.cc b/gloo/transport/tcp/listener.cc index 325fe0ffc..409a9a3f6 100644 --- a/gloo/transport/tcp/listener.cc +++ b/gloo/transport/tcp/listener.cc @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include # diff --git a/gloo/transport/tcp/loop.cc b/gloo/transport/tcp/loop.cc index dca13e33f..613b92a48 100644 --- a/gloo/transport/tcp/loop.cc +++ b/gloo/transport/tcp/loop.cc @@ -14,8 +14,8 @@ #include +#include #include -#include #if defined(__SANITIZE_THREAD__) #define TSAN_ENABLED diff --git a/gloo/transport/tcp/pair.cc b/gloo/transport/tcp/pair.cc index 030778c0b..95705b83f 100644 --- a/gloo/transport/tcp/pair.cc +++ b/gloo/transport/tcp/pair.cc @@ -22,8 +22,8 @@ #include #include +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/tcp/buffer.h" #include "gloo/transport/tcp/context.h" #include "gloo/transport/tcp/unbound_buffer.h" diff --git a/gloo/transport/tcp/socket.cc b/gloo/transport/tcp/socket.cc index 1b1270653..4c304b319 100644 --- a/gloo/transport/tcp/socket.cc +++ b/gloo/transport/tcp/socket.cc @@ -13,7 +13,7 @@ #include #include -#include +#include namespace gloo { namespace transport { diff --git a/gloo/transport/tcp/tls/context.cc b/gloo/transport/tcp/tls/context.cc index 141fdac3b..06a819097 100644 --- a/gloo/transport/tcp/tls/context.cc +++ b/gloo/transport/tcp/tls/context.cc @@ -8,7 +8,7 @@ #include "gloo/transport/tcp/tls/context.h" -#include "gloo/common/logging.h" +#include "gloo/common/enforce.h" #include "gloo/transport/tcp/tls/device.h" #include "gloo/transport/tcp/tls/openssl.h" #include "gloo/transport/tcp/tls/pair.h" diff --git a/gloo/transport/tcp/tls/pair.cc b/gloo/transport/tcp/tls/pair.cc index 3e0fcfde3..7c096ea8b 100644 --- a/gloo/transport/tcp/tls/pair.cc +++ b/gloo/transport/tcp/tls/pair.cc @@ -8,8 +8,8 @@ #include "gloo/transport/tcp/tls/pair.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/tcp/buffer.h" #include "gloo/transport/tcp/tls/context.h" #include "gloo/transport/tcp/tls/device.h" diff --git a/gloo/transport/tcp/unbound_buffer.cc b/gloo/transport/tcp/unbound_buffer.cc index b8cac4679..59b484b30 100644 --- a/gloo/transport/tcp/unbound_buffer.cc +++ b/gloo/transport/tcp/unbound_buffer.cc @@ -8,8 +8,8 @@ #include "gloo/transport/tcp/unbound_buffer.h" +#include "gloo/common/enforce.h" #include "gloo/common/error.h" -#include "gloo/common/logging.h" #include "gloo/transport/tcp/context.h" namespace gloo { diff --git a/gloo/transport/uv/address.cc b/gloo/transport/uv/address.cc index 9b69b9ece..f85d49b46 100644 --- a/gloo/transport/uv/address.cc +++ b/gloo/transport/uv/address.cc @@ -12,7 +12,7 @@ #include -#include +#include namespace gloo { namespace transport { diff --git a/gloo/transport/uv/context.cc b/gloo/transport/uv/context.cc index 606bc95af..1b342854f 100644 --- a/gloo/transport/uv/context.cc +++ b/gloo/transport/uv/context.cc @@ -8,8 +8,8 @@ #include +#include #include -#include #include #include #include diff --git a/gloo/transport/uv/device.cc b/gloo/transport/uv/device.cc index 1dc986433..c6af3fefe 100644 --- a/gloo/transport/uv/device.cc +++ b/gloo/transport/uv/device.cc @@ -22,7 +22,7 @@ #else #include // @manual #endif -#include +#include #include #include #include diff --git a/gloo/transport/uv/pair.cc b/gloo/transport/uv/pair.cc index 4db8bf32c..e1fea6839 100644 --- a/gloo/transport/uv/pair.cc +++ b/gloo/transport/uv/pair.cc @@ -11,8 +11,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/gloo/transport/uv/unbound_buffer.cc b/gloo/transport/uv/unbound_buffer.cc index 593020b3c..f0cebf261 100644 --- a/gloo/transport/uv/unbound_buffer.cc +++ b/gloo/transport/uv/unbound_buffer.cc @@ -8,8 +8,8 @@ #include +#include #include -#include #include namespace gloo { From 3b85da07bf8a86242d2ab2cf44e4a6ab314860ae Mon Sep 17 00:00:00 2001 From: Jack Bond-Preston Date: Fri, 10 Oct 2025 22:46:40 +0000 Subject: [PATCH 4/4] Replace cout usages with logger calls --- gloo/allreduce_bcube.cc | 74 ++++++++++++++++++++++++++++++++ gloo/allreduce_bcube.h | 46 ++------------------ gloo/cuda_allreduce_bcube.cc | 62 ++++++++++++++++---------- gloo/transport/buffer.h | 7 +-- gloo/transport/ibverbs/buffer.cc | 18 ++------ gloo/transport/tcp/buffer.cc | 14 ++---- gloo/types.h | 8 ++++ 7 files changed, 133 insertions(+), 96 deletions(-) create mode 100644 gloo/allreduce_bcube.cc diff --git a/gloo/allreduce_bcube.cc b/gloo/allreduce_bcube.cc new file mode 100644 index 000000000..2aed4606f --- /dev/null +++ b/gloo/allreduce_bcube.cc @@ -0,0 +1,74 @@ +#include "gloo/allreduce_bcube.h" + +#include + +#include "gloo/common/log.h" + +namespace gloo { + +template +void AllreduceBcube::printElems(T* p, int count, int start) { + /* Early return if log level is not high enough, to prevent expensive code + * running. */ + if (!spdlog::should_log(spdlog::level::trace)) + return; + + const std::size_t alignedStart = (start / wordsPerLine) * wordsPerLine; + fmt::memory_buffer line{}; + + /* Logs/flushes the line buffer - starting a new line */ + auto printLine = [&]() { + if (!line.size()) + return; + std::string_view sv{line.data(), line.size()}; + GLOO_TRACE("{}", sv); + line.clear(); + }; + + for (std::size_t x = alignedStart; x < start + count; ++x) { + if (x % wordsPerLine == 0) { + if (x != alignedStart) + printLine(); + fmt::format_to( + std::back_inserter(line), "{} {:05}: ", fmt::ptr(&p[x]), x); + } else if (x % wordsPerSection == 0) { + fmt::format_to(std::back_inserter(line), "- "); + } + + if (x < start) + fmt::format_to(std::back_inserter(line), "..... "); + else + fmt::format_to(std::back_inserter(line), "{:05} ", p[x]); + } + printLine(); +} + +template +void AllreduceBcube::printStageBuffer(const std::string& msg) { + if (printCheck(myRank_)) { + GLOO_TRACE("rank ({}) {}:", myRank_, msg); + printElems(&ptrs_[0][0], totalNumElems_); + } +} + +template +void AllreduceBcube::printStepBuffer( + const std::string& stage, + int step, + int srcRank, + int destRank, + T* p, + int count, + int start) { + if (printCheck(myRank_)) { + GLOO_TRACE( + "{}: step ({}) srcRank ({}) -> destRank ({})", + stage, + step, + srcRank, + destRank); + printElems(p, count, start); + } +} + +} // namespace gloo diff --git a/gloo/allreduce_bcube.h b/gloo/allreduce_bcube.h index c91fa130d..ff094efa8 100644 --- a/gloo/allreduce_bcube.h +++ b/gloo/allreduce_bcube.h @@ -13,15 +13,12 @@ #include #include #include -#include -#include #include #include #include #include #include "gloo/algorithm.h" -#include "gloo/common/error.h" #include "gloo/context.h" /** @@ -537,48 +534,18 @@ class AllreduceBcube : public Algorithm { static bool printCheck(int /*rank*/) { return false; } - /** - * Prints a break given the offset of an element about to be printed - * @param p Pointer to the elements - * @param x The current offset to the pointer to words - */ - static void printBreak(T* p, int x) { - if (0 == x % wordsPerLine) { - std::cout << std::endl - << &p[x] << " " << std::setfill('0') << std::setw(5) << x - << ": "; - } else if (0 == x % wordsPerSection) { - std::cout << "- "; - } - } /** * Pretty prints a list of elements * @param p Pointer to the elements * @param count The number of elements to be printed * @param start The offset from which to print */ - static void printElems(T* p, int count, int start = 0) { - auto alignedStart = (start / wordsPerLine) * wordsPerLine; - for (int x = alignedStart; x < start + count; ++x) { - printBreak(p, x); - if (x < start) { - std::cout << "..... "; - } else { - std::cout << std::setfill('0') << std::setw(5) << p[x] << " "; - } - } - } + static void printElems(T* p, int count, int start); /** * Prints contents in the ptrs array at a particular stage * @param msg Custom message to be printed */ - void printStageBuffer(const std::string& msg) { - if (printCheck(myRank_)) { - std::cout << "rank (" << myRank_ << ") " << msg << ": "; - printElems(&ptrs_[0][0], totalNumElems_); - std::cout << std::endl; - } - } + void printStageBuffer(const std::string& msg); /** * Prints specified buffer during a step @@ -596,14 +563,7 @@ class AllreduceBcube : public Algorithm { int destRank, T* p, int count, - int start = 0) { - if (printCheck(myRank_)) { - std::cout << stage << ": step (" << step << ") " << "srcRank (" << srcRank - << ") -> " << "destRank (" << destRank << "): "; - printElems(p, count, start); - std::cout << std::endl; - } - } + int start = 0); /** * Get all the peers of node with specified rank * @param rank Rank of the node for which peers are needed diff --git a/gloo/cuda_allreduce_bcube.cc b/gloo/cuda_allreduce_bcube.cc index 065862f95..8374c8fb4 100644 --- a/gloo/cuda_allreduce_bcube.cc +++ b/gloo/cuda_allreduce_bcube.cc @@ -8,11 +8,13 @@ #include "gloo/cuda_allreduce_bcube.h" +#include "gloo/common/log.h" #include "gloo/cuda_collectives_device.h" #include "gloo/cuda_collectives_host.h" #include "gloo/cuda_private.h" #include +#include #ifndef _WIN32 #include #endif @@ -226,35 +228,48 @@ bool CudaAllreduceBcube::printCheck(int /* rank */) { return false; } -template -void CudaAllreduceBcube::printBreak(T* p, int x) { - if (0 == x % wordsPerLine) { - std::cout << std::endl - << &p[x] << " " << std::setfill('0') << std::setw(5) << x << ": "; - } else if (0 == x % wordsPerSection) { - std::cout << "- "; - } -} - template void CudaAllreduceBcube::printElems(T* p, int count, int start) { - auto alignedStart = (start / wordsPerLine) * wordsPerLine; - for (int x = alignedStart; x < start + count; ++x) { - printBreak(p, x); - if (x < start) { - std::cout << "..... "; - } else { - std::cout << std::setfill('0') << std::setw(5) << p[x] << " "; + /* Early return if log level is not high enough, to prevent expensive code + * running. */ + if (!spdlog::should_log(spdlog::level::trace)) + return; + + const std::size_t alignedStart = (start / wordsPerLine) * wordsPerLine; + fmt::memory_buffer line{}; + + /* Logs/flushes the line buffer - starting a new line */ + auto printLine = [&]() { + if (!line.size()) + return; + std::string_view sv{line.data(), line.size()}; + GLOO_TRACE("{}", sv); + line.clear(); + }; + + for (std::size_t x = alignedStart; x < start + count; ++x) { + if (x % wordsPerLine == 0) { + if (x != alignedStart) + printLine(); + fmt::format_to( + std::back_inserter(line), "{} {:05}: ", fmt::ptr(&p[x]), x); + } else if (x % wordsPerSection == 0) { + fmt::format_to(std::back_inserter(line), "- "); } + + if (x < start) + fmt::format_to(std::back_inserter(line), "..... "); + else + fmt::format_to(std::back_inserter(line), "{:05} ", p[x]); } + printLine(); } template void CudaAllreduceBcube::printStageBuffer(const std::string& msg) { if (printCheck(myRank_)) { - std::cout << "rank (" << myRank_ << ") " << msg << ": "; + GLOO_TRACE("rank ({}) {}:", myRank_, msg); printElems(&scratch_[0], totalNumElems_); - std::cout << std::endl; } } @@ -268,10 +283,13 @@ void CudaAllreduceBcube::printStepBuffer( int count, int start) { if (printCheck(myRank_)) { - std::cout << stage << ": step (" << step << ") " << "srcRank (" << srcRank - << ") -> " << "destRank (" << destRank << "): "; + GLOO_TRACE( + "{}: step ({}) srcRank ({}) -> destRank ({}):", + stage, + step, + srcRank, + destRank); printElems(p, count, start); - std::cout << std::endl; } } diff --git a/gloo/transport/buffer.h b/gloo/transport/buffer.h index 382061a87..8431d37b6 100644 --- a/gloo/transport/buffer.h +++ b/gloo/transport/buffer.h @@ -16,13 +16,9 @@ namespace transport { class Buffer { public: explicit Buffer(int slot, void* ptr, size_t size) - : slot_(slot), ptr_(ptr), size_(size), debug_(false) {} + : slot_(slot), ptr_(ptr), size_(size) {} virtual ~Buffer() = 0; - virtual void setDebug(bool debug) { - debug_ = debug; - } - virtual void send(size_t offset, size_t length, size_t roffset = 0) = 0; // Send entire buffer by default @@ -37,7 +33,6 @@ class Buffer { int slot_; void* ptr_; size_t size_; - bool debug_; }; } // namespace transport diff --git a/gloo/transport/ibverbs/buffer.cc b/gloo/transport/ibverbs/buffer.cc index 965c889c3..64836641b 100644 --- a/gloo/transport/ibverbs/buffer.cc +++ b/gloo/transport/ibverbs/buffer.cc @@ -189,11 +189,7 @@ void Buffer::send(size_t offset, size_t length, size_t roffset) { checkErrorState(); - if (debug_) { - std::cout << "[" << getpid() << "] "; - std::cout << "send " << length << " bytes"; - std::cout << std::endl; - } + GLOO_TRACE("send {} bytes", length); // Increment number of sends in flight sendPending_++; @@ -208,19 +204,11 @@ void Buffer::handleCompletion(int rank, struct ibv_wc* wc) { std::unique_lock lock(m_); if (wc->opcode & IBV_WC_RECV) { - if (debug_) { - std::cout << "[" << getpid() << "] "; - std::cout << "recv " << wc->byte_len << " bytes"; - std::cout << std::endl; - } + GLOO_TRACE("recv {} bytes", wc->byte_len); recvCompletions_++; recvCv_.notify_one(); } else if (wc->opcode == IBV_WC_RDMA_WRITE) { - if (debug_) { - std::cout << "[" << getpid() << "] "; - std::cout << "send complete"; - std::cout << std::endl; - } + GLOO_TRACE("send complete"); sendCompletions_++; sendPending_--; sendCv_.notify_one(); diff --git a/gloo/transport/tcp/buffer.cc b/gloo/transport/tcp/buffer.cc index 11d9f34bd..1bee06e30 100644 --- a/gloo/transport/tcp/buffer.cc +++ b/gloo/transport/tcp/buffer.cc @@ -6,16 +6,15 @@ * LICENSE file in the root directory of this source tree. */ -#include "gloo/transport/tcp/buffer.h" - #include #include #include - #include -#include "gloo/common/error.h" #include "gloo/common/enforce.h" +#include "gloo/common/error.h" +#include "gloo/common/log.h" +#include "gloo/transport/tcp/buffer.h" namespace gloo { namespace transport { @@ -126,12 +125,7 @@ void Buffer::send(size_t offset, size_t length, size_t roffset) { // to support this. GLOO_ENFORCE_LE(offset + length, size_); - if (debug_) { - std::cout << "[" << getpid() << ": " << syscall(SYS_gettid) << "] "; - std::cout << "send " << length << " bytes"; - std::cout << " to " << pair_->peer().str(); - std::cout << std::endl; - } + GLOO_TRACE("send {} bytes to {}", length, pair_->peer().str()); op.preamble.nbytes = sizeof(op.preamble) + length; op.preamble.opcode = Op::SEND_BUFFER; diff --git a/gloo/types.h b/gloo/types.h index b9cb94680..b88093e80 100644 --- a/gloo/types.h +++ b/gloo/types.h @@ -339,4 +339,12 @@ inline float cpu_half2float(float16 h) { return *(float*)rp; } +/** + * @brief Format helper for spdlog/fmt - convert to a float32 and format as + * usual + */ +inline float format_as(float16 v) { + return cpu_half2float(v); +} + } // namespace gloo