Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Run clang-tidy static analysis on C++/CUDA codebase #186

Merged
merged 18 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# clang-tidy configuration: enable only the 'performance-*' checks.
Checks: 'performance-*'
# 'WarningsAsErrors' takes a glob of check names (same format as 'Checks'),
# not a boolean. The previous value 'true' was parsed as a glob matching no
# checks, so warnings were never actually promoted to errors. '*' promotes
# warnings from every enabled check.
WarningsAsErrors: '*'
# Analyze headers from anywhere in the project, not just the file under check.
HeaderFilterRegex: '.*'
FormatStyle: google
25 changes: 25 additions & 0 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ jobs:
pr-builder:
needs:
- pre-commit
- clang-tidy
- conda-python-build
- conda-python-tests-cpu
- conda-python-tests-gpu
Expand All @@ -35,6 +36,7 @@ jobs:
- uses: actions/checkout@v4
- uses: pre-commit/action@v3.0.1


conda-python-build:
needs:
- pre-commit
Expand Down Expand Up @@ -79,6 +81,29 @@ jobs:
run: |
ci/test_python_cpu.sh

clang-tidy:
  # 'needs:' (no space before the colon) — consistent with the other jobs
  needs:
    - pre-commit
  strategy:
    fail-fast: false
    matrix:
      include:
        - ARCH: "amd64"
          CUDA_VER: "12.5.1"
          PY_VER: "3.11"
  runs-on: linux-${{ matrix.ARCH }}-cpu4
  container:
    image: "rapidsai/ci-conda:cuda${{ matrix.CUDA_VER }}-ubuntu22.04-py${{ matrix.PY_VER }}"
  steps:
    - uses: actions/checkout@v4
      with:
        fetch-depth: 0
        # the notebooks and PNG files stored in git LFS aren't necessary
        lfs: false
    - name: run clang-tidy
      run: |
        ci/run_clang_tidy.sh

conda-python-tests-gpu:
needs:
- pre-commit
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
*.a
build
build_clang_tidy
*.conda
*.data
dist/
Expand Down
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ repos:
# disable whitespace checks as they are already covered by clang-format
# runtime/references is disabled as this is a more controversial check in the google style guide
# include_subdir is disabled as it gives false positives for top level includes
# readability/nolint is disabled as it interferes with clang-tidy
args: [
"--linelength=100",
"--filter=-whitespace,-runtime/references,-build/include_subdir",
"--filter=-whitespace,-runtime/references,-build/include_subdir,-readability/nolint",
]

default_language_version:
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ else()
)
endif()

project(legateboost VERSION "${_legateboost_version}" LANGUAGES C CXX)
project(legateboost VERSION "${_legateboost_version}" LANGUAGES C CXX CUDA)

option(SANITIZE "Build with address sanitizer" OFF)

Expand Down
22 changes: 19 additions & 3 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@ HELP="$0 [<target> ...] [<flag> ...]

liblegateboost - build the liblegateboost.so shared library
legate-boost - build and 'pip install' the legate-boost Python package
clang-tidy - run clang-tidy on the codebase

where <flag> is any of:

--editable - install Python wheel in editable mode
--fix - clang-tidy will attempt to fix issues
-h | --help - print the help text
"

Expand Down Expand Up @@ -44,18 +46,32 @@ if hasArg --editable; then
PIP_INSTALL_ARGS+=("--editable")
fi

legate_root=$(
python -c 'import legate.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'
)
if hasArg liblegateboost || hasArg --editable; then
echo "building liblegateboost..."
legate_root=$(
python -c 'import legate.install_info as i; from pathlib import Path; print(Path(i.libpath).parent.resolve())'
)
echo "Using Legate at '${legate_root}'"

cmake -S . -B build -Dlegate_ROOT="${legate_root}" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHITECTURES}"
cmake --build build -j
echo "done building liblegateboost"
fi

if hasArg clang-tidy; then
    echo "running clang-tidy..."
    # Build the project with clang; clang-tidy needs a clang-generated
    # compile_commands.json, hence the separate build directory.
    CUDA_ROOT="$(dirname "$(dirname "$(which cuda-gdb)")")"
    echo "Using CUDA at '${CUDA_ROOT}'"
    # '-S .' used explicitly, consistent with the liblegateboost cmake invocation above
    cmake -S . -B build_clang_tidy -Dlegate_ROOT="${legate_root}" -DCMAKE_BUILD_TYPE=Release -DCMAKE_CUDA_ARCHITECTURES="${CMAKE_CUDA_ARCHITECTURES}" -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang++ -DCMAKE_CUDA_COMPILER=clang++ -DCUDAToolkit_ROOT="${CUDA_ROOT}" -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
    FIX_ARG=""
    if hasArg --fix; then
        FIX_ARG="-fix"
    fi
    # quote the regex so the shell can never glob-expand '.*' against files
    # in the working directory; ${FIX_ARG} is intentionally unquoted so it
    # expands to zero words when --fix was not requested
    run-clang-tidy -p build_clang_tidy -header-filter='.*' ${FIX_ARG}
    echo "done running clang-tidy"
fi

if hasArg legate-boost; then
echo "building legate-boost Python package..."
CUDAARCHS="${CMAKE_CUDA_ARCHITECTURES}" \
Expand Down
41 changes: 41 additions & 0 deletions ci/run_clang_tidy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash

# [description]
#
# Run 'clang-tidy' static analysis.
#
# NOTE(review): relies on the 'rapids-*' helper commands and a conda install
# at /opt/conda, so this is intended for the rapidsai/ci-conda CI container
# (see the clang-tidy job in .github/workflows/pr.yaml).

# -e: exit on first error; -E: ERR trap inherited by functions/subshells;
# -u: error on unset variables; pipefail: a pipeline fails if any stage fails
set -e -E -u -o pipefail

# Ensure this is running from the root of the repo
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../;

# set up conda environment

# source conda settings so 'conda activate' will work
# shellcheck disable=SC1091
. /opt/conda/etc/profile.d/conda.sh

# write the computed project version to ./VERSION for the build to consume
rapids-generate-version > ./VERSION

rapids-print-env

# generate a conda environment file from the 'run_clang_tidy' file key in
# dependencies.yaml, matched to this job's CUDA / arch / Python versions
rapids-dependency-file-generator \
--output conda \
--file-key run_clang_tidy \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \
| tee /tmp/env.yaml

# create the environment (retries tolerate transient network failures)
rapids-mamba-retry env create \
--yes \
--file /tmp/env.yaml \
--name test-env

# Temporarily allow unbound variables for conda activation.
set +u
conda activate test-env
set -u

rapids-print-env

# delegate to build.sh, which configures with clang and runs run-clang-tidy
CMAKE_GENERATOR=Ninja \
./build.sh clang-tidy
14 changes: 14 additions & 0 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ files:
table: build-system
includes:
- build
- build_tools
run_clang_tidy:
output: none
includes:
- build
- clang_tidy
py_docs:
output: none
includes:
Expand All @@ -45,6 +51,7 @@ files:
includes:
- py_version
- test

channels:
- legate
- legate/label/experimental
Expand All @@ -65,6 +72,13 @@ dependencies:
- ninja>=1.11.1.1
- scikit-build>=0.18.0
- setuptools>=70.0
clang_tidy:
common:
- output_types: [conda]
packages:
- cuda-toolkit
- clang-tools>=19.0
- clangxx>=19.0
build_tools:
common:
- output_types: [conda]
Expand Down
16 changes: 10 additions & 6 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,17 @@ if(Legion_USE_CUDA)
# it built and installed on the machine already
rapids_find_package(NCCL REQUIRED)

list(APPEND LB_CUDA_FLAGS --expt-extended-lambda )
list(APPEND LB_CUDA_FLAGS --expt-relaxed-constexpr )

if(CMAKE_BUILD_TYPE MATCHES Debug)
list(APPEND LB_CUDA_FLAGS -G -Xcompiler=-rdynamic)
if (CMAKE_CUDA_COMPILER_ID STREQUAL "Clang")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # used to run clang-tidy
else()
list(APPEND LB_CUDA_FLAGS -lineinfo )
list(APPEND LB_CUDA_FLAGS --expt-extended-lambda )
list(APPEND LB_CUDA_FLAGS --expt-relaxed-constexpr )

if(CMAKE_BUILD_TYPE MATCHES Debug)
list(APPEND LB_CUDA_FLAGS -G -Xcompiler=-rdynamic)
else()
list(APPEND LB_CUDA_FLAGS -lineinfo )
endif()
endif()
endif()

Expand Down
38 changes: 30 additions & 8 deletions src/cpp_utils/cpp_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,12 @@

#pragma once

#include "cpp_utils.h"
#include <nccl.h>
#include <cstdio>
#include <utility>
#include <string>
#include "legate.h"
#include "cpp_utils.h"
#include "legate/cuda/cuda.h"

namespace legateboost {
Expand All @@ -36,9 +38,28 @@ __host__ inline void check_nccl(ncclResult_t error, const char* file, int line)
}
}

#define CHECK_CUDA(expr) LegateCheckCUDA(expr)
// Throw (rather than abort) when a CUDA API call fails, tagging the
// exception message with the call site as "file(line)".
inline void throw_on_cuda_error(cudaError_t code, const char* file, int line)
{
  if (code != cudaSuccess) {
    // Build "file(line)" directly with std::string. The previous
    // std::stringstream insert-then-'>>'-extract approach stopped at the
    // first whitespace character, silently truncating paths that contain
    // spaces, and depended on <sstream> which this header does not include.
    const std::string file_and_line = std::string(file) + "(" + std::to_string(line) + ")";
    throw thrust::system_error(code, thrust::cuda_category(), file_and_line);
  }
}

#define CHECK_CUDA_STREAM(expr) LegateCheckCUDAStream(expr)
#define CHECK_CUDA(expr) throw_on_cuda_error(expr, __FILE__, __LINE__)

#ifdef DEBUG
#define CHECK_CUDA_STREAM(stream) \
do { \
CHECK_CUDA(cudaStreamSynchronize(stream)); \
CHECK_CUDA(cudaPeekAtLastError()); \
} while (false)
#else
#define CHECK_CUDA_STREAM(stream) CHECK_CUDA(cudaPeekAtLastError())
#endif

#define CHECK_NCCL(expr) \
do { \
Expand All @@ -47,15 +68,16 @@ __host__ inline void check_nccl(ncclResult_t error, const char* file, int line)
} while (false)

template <typename L, int BLOCK_THREADS>
__global__ void __launch_bounds__(BLOCK_THREADS) LaunchNKernel(size_t size, L lambda)
__global__ void __launch_bounds__(BLOCK_THREADS)
LaunchNKernel(size_t size, L lambda) // NOLINT(performance-unnecessary-value-param)
{
for (auto i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) {
lambda(i);
}
}

template <int ITEMS_PER_THREAD = 8, typename L>
inline void LaunchN(size_t n, cudaStream_t stream, L lambda)
inline void LaunchN(size_t n, cudaStream_t stream, const L& lambda)
{
if (n == 0) { return; }
const int kBlockThreads = 256;
Expand All @@ -67,12 +89,12 @@ inline void LaunchN(size_t n, cudaStream_t stream, L lambda)
template <typename T>
void AllReduce(legate::TaskContext context, T* x, int count, ncclRedOp_t op, cudaStream_t stream)
{
auto domain = context.get_launch_domain();
size_t num_ranks = domain.get_volume();
const auto& domain = context.get_launch_domain();
size_t num_ranks = domain.get_volume();
EXPECT(num_ranks == 1 || context.num_communicators() > 0,
"Expected a GPU communicator for multi-rank task.");
if (context.num_communicators() == 0) return;
auto comm = context.communicator(0);
const auto& comm = context.communicator(0);
ncclComm_t* nccl_comm = comm.get<ncclComm_t*>();

if (num_ranks > 1) {
Expand Down
20 changes: 10 additions & 10 deletions src/cpp_utils/cpp_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@ namespace legateboost {

extern Legion::Logger logger;

inline void expect(bool condition, std::string message, std::string file, int line)
inline void expect(bool condition, const std::string& message, const std::string& file, int line)
{
if (!condition) { throw std::runtime_error(file + "(" + std::to_string(line) + "): " + message); }
}
#define EXPECT(condition, message) (expect(condition, message, __FILE__, __LINE__))

template <int AXIS, typename ShapeAT, typename ShapeBT>
void expect_axis_aligned(const ShapeAT& a, const ShapeBT& b, std::string file, int line)
void expect_axis_aligned(const ShapeAT& a, const ShapeBT& b, const std::string& file, int line)
{
expect((a.lo[AXIS] == b.lo[AXIS]) && (a.hi[AXIS] == b.hi[AXIS]),
"Inconsistent axis alignment.",
Expand All @@ -50,7 +50,7 @@ void expect_axis_aligned(const ShapeAT& a, const ShapeBT& b, std::string file, i
(expect_axis_aligned<axis>(shape_a, shape_b, __FILE__, __LINE__))

template <int DIM>
void expect_is_broadcast(const legate::Rect<DIM>& shape, std::string file, int line)
void expect_is_broadcast(const legate::Rect<DIM>& shape, const std::string& file, int line)
{
for (int i = 0; i < DIM; i++) {
std::stringstream ss;
Expand All @@ -62,7 +62,7 @@ void expect_is_broadcast(const legate::Rect<DIM>& shape, std::string file, int l

template <typename T, int NDIM, bool assert_row_major = true>
std::tuple<legate::PhysicalStore, legate::Rect<NDIM>, legate::AccessorRO<T, NDIM>> GetInputStore(
legate::PhysicalStore store)
const legate::PhysicalStore& store)
{
auto shape = store.shape<NDIM>();
auto accessor = store.read_accessor<T, NDIM, true>();
Expand All @@ -78,7 +78,7 @@ constexpr decltype(auto) type_dispatch_impl(legate::Type::Code code, Functor&& f
template <typename AccessorT, int N, typename T>
void expect_dense_row_major(const AccessorT& accessor,
const legate::Rect<N, T>& shape,
std::string file,
const std::string& file,
int line)
{
// workaround to check 'row-major' for more than 2 dimensions, with
Expand All @@ -88,7 +88,7 @@ void expect_dense_row_major(const AccessorT& accessor,

expect(shape_mod.empty() || accessor.is_dense_row_major(shape_mod),
"Expected a dense row major store",
file,
std::move(file),
line);
}
#define EXPECT_DENSE_ROW_MAJOR(accessor, shape) \
Expand Down Expand Up @@ -118,12 +118,12 @@ constexpr decltype(auto) type_dispatch_float(legate::Type::Code code, Functor&&
template <typename T, typename OpT>
void AllReduce(legate::TaskContext context, T* x, int count, OpT op)
{
auto domain = context.get_launch_domain();
size_t num_ranks = domain.get_volume();
const auto& domain = context.get_launch_domain();
size_t num_ranks = domain.get_volume();
EXPECT(num_ranks == 1 || context.num_communicators() > 0,
"Expected a CPU communicator for multi-rank task.");
if (count == 0 || context.num_communicators() == 0) return;
auto comm = context.communicator(0);
const auto& comm = context.communicator(0);
legate::comm::coll::CollDataType type;
if (std::is_same<T, float>::value)
type = legate::comm::coll::CollDataType::CollFloat;
Expand Down Expand Up @@ -282,6 +282,6 @@ class UnaryOpTask : public Task<UnaryOpTask<F, OpCode>, OpCode> {

} // namespace legateboost

#if __CUDACC__
#ifdef __CUDACC__
#include "../cpp_utils/cpp_utils.cuh"
#endif
2 changes: 1 addition & 1 deletion src/legateboost.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#ifndef SRC_LEGATEBOOST_H_
#define SRC_LEGATEBOOST_H_

enum LegateBoostOpCode {
enum LegateBoostOpCode { // NOLINT(performance-enum-size)
_OP_CODE_BASE = 0,
BUILD_TREE = 1,
PREDICT = 2,
Expand Down
Loading