Skip to content

Commit

Permalink
✨ Update examples (#52)
Browse files Browse the repository at this point in the history
Makes clang-tidy compatible with HIP, adds an example for using external
dependencies and fixes a few clang-tidy warnings.
  • Loading branch information
aaronmondal authored Mar 27, 2023
1 parent e19d31f commit 70da306
Show file tree
Hide file tree
Showing 14 changed files with 74 additions and 20 deletions.
1 change: 1 addition & 0 deletions examples/.envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
use flake
6 changes: 3 additions & 3 deletions examples/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ ll_compilation_database(
"//format_example",
"//frontend_action_example",
"//external_dependency_example",
"//hip_example",
"//hip_rdc_example",
"//module_partition_example",
"//hip_example:amdgpu",
"//hip_rdc_example:amdgpu",
# "//module_partition_example", # Bugged. Need to report upstream.
"//modules_example",
# "//modules_draft_example", # Not clang-tidy conform.
"//openmp_example",
Expand Down
1 change: 1 addition & 0 deletions examples/cuda_example/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ ll_binary(
compile_flags = OFFLOAD_ALL_NVPTX + [
"--std=c++20",
],
visibility = ["@//:__pkg__"],
)

ll_test(
Expand Down
22 changes: 14 additions & 8 deletions examples/cuda_example/example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
constexpr float kInputA = 1.0F;
constexpr float kInputB = 2.0F;
constexpr float kExpectedOutput = 3.0F;
int kDimension = 1 << 20;
constexpr int kDimension = 1 << 20;
constexpr auto kThreadsPerBlockX = 128;
constexpr auto kThreadsPerBlockY = 1;
constexpr auto kThreadsPerBlockZ = 1;
Expand All @@ -20,7 +20,7 @@ constexpr void cuda_assert(const T value) {

__global__ void add_vector(float *input_a, const float *input_b,
const int dimension) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < dimension) {
// NOLINTNEXTLINE cppcoreguidelines-pro-bounds-pointer-arithmetic
input_a[index] += input_b[index];
Expand All @@ -29,13 +29,13 @@ __global__ void add_vector(float *input_a, const float *input_b,

void print_device_info() {
int count = 0;
cudaError_t err = cudaGetDeviceCount(&count);
const cudaError_t err = cudaGetDeviceCount(&count);
if (err == cudaErrorInvalidDevice) {
std::cout << "FAIL: invalid device" << std::endl;
}
std::cout << "Number of devices is " << count << std::endl;

cudaDeviceProp device_prop;
cudaDeviceProp device_prop{};
cuda_assert(cudaGetDeviceProperties(&device_prop, 0));
std::cout << "System major: " << device_prop.major << std::endl;
std::cout << "System minor: " << device_prop.minor << std::endl;
Expand Down Expand Up @@ -93,18 +93,24 @@ auto main() -> int {
cuda_assert(cudaMemcpy(device_input_b, host_input_b,
kDimension * sizeof(float), cudaMemcpyHostToDevice));

dim3 grid_dim = dim3(kDimension / kThreadsPerBlockX);
dim3 block_dim = dim3(kThreadsPerBlockX);
const dim3 grid_dim = dim3(kDimension / kThreadsPerBlockX);
const dim3 block_dim = dim3(kThreadsPerBlockX);

// This is not pretty, but it is close to the HIP implementation.
std::array<void *, 3> args = {&device_input_a, &device_input_b, &kDimension};
// NOLINTBEGIN cppcoreguidelines-pro-type-reinterpret-cast
// NOLINTBEGIN cppcoreguidelines-pro-type-const-cast
std::array<void *, 3> args = {
&device_input_a, &device_input_b,
reinterpret_cast<void *>(const_cast<int *>(&kDimension))};
cudaLaunchKernel(reinterpret_cast<void *>(add_vector), grid_dim, block_dim,
args.data(), 0, nullptr);
// NOLINTEND cppcoreguidelines-pro-type-reinterpret-cast
// NOLINTEND cppcoreguidelines-pro-type-const-cast

cuda_assert(cudaMemcpy(host_input_a, device_input_a,
kDimension * sizeof(float), cudaMemcpyDeviceToHost));

int errors = count_errors(host_input_a);
const int errors = count_errors(host_input_a);

cuda_assert(cudaFree(device_input_a));
cuda_assert(cudaFree(device_input_b));
Expand Down
15 changes: 15 additions & 0 deletions examples/external_dependency_example/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
load("@rules_ll//ll:defs.bzl", "ll_binary")

# Example binary demonstrating how to link against an external (non-Bazel)
# dependency. OpenSSL's libcrypto is resolved via the environment rather than
# a Bazel-tracked repository.
ll_binary(
name = "external_dependency_example",
srcs = ["main.cpp"],
compile_flags = ["-std=c++2b"],
# The include path, library search path and rpath are already set in
# flake.nix and added to all targets in the workspace. This way we still
# have a reproducible openssl despite it not being tracked by Bazel.
#
# If possible, try to avoid this pattern and write custom BUILD files for
# external dependencies that are not too complicated to port to Bazel.
link_flags = ["-lcrypto"],
# Visible to the workspace root so the compilation database target can
# aggregate this example.
visibility = ["@//:__pkg__"],
)
26 changes: 26 additions & 0 deletions examples/external_dependency_example/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#include <openssl/sha.h>

#include <array>
#include <cstring>  // std::memcpy — was missing; previously relied on a transitive include.
#include <format>
#include <iostream>

// Hashes the ASCII message "hello" with SHA-256 via OpenSSL and prints the
// digest as lowercase hex next to the known expected value so the output can
// be eyeballed (or grepped) for correctness.
auto main() -> int {
  constexpr auto kMessageSize = 5;
  std::array<unsigned char, kMessageSize> message = {'h', 'e', 'l', 'l', 'o'};

  // SHA256 with a null output pointer returns a pointer to a static internal
  // buffer holding the 32-byte digest.
  auto *hashed_message = SHA256(message.data(), message.size(), nullptr);

  // Copy the digest out of OpenSSL's static buffer into our own storage.
  std::array<std::byte, SHA256_DIGEST_LENGTH> output = {};
  std::memcpy(output.data(), hashed_message, SHA256_DIGEST_LENGTH);

  std::cout << "Calculated:\t";
  for (auto val : output) {
    // {:0>2x}: zero-pad each byte to two hex digits.
    std::cout << std::format("{:0>2x}", static_cast<int>(val));
  }
  std::cout << "\n";

  std::cout
      << "Expected:\t"
      << "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"
      << std::endl;
}
2 changes: 2 additions & 0 deletions examples/hip_example/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ ll_binary(
compile_flags = OFFLOAD_ALL_NVPTX + [
"--std=c++20",
],
visibility = ["@//:__pkg__"],
)

ll_binary(
Expand All @@ -22,6 +23,7 @@ ll_binary(
compile_flags = OFFLOAD_ALL_AMDGPU + [
"--std=c++20",
],
visibility = ["@//:__pkg__"],
)

# We usually prefer native_test over ll_test, but that won't work here as the
Expand Down
8 changes: 4 additions & 4 deletions examples/hip_example/example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ constexpr void hip_assert(const T value) {

__global__ auto add_vector(float *input_a, const float *input_b,
const int dimension) -> void {
int index = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
const uint32_t index = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (index < dimension) {
// NOLINTNEXTLINE cppcoreguidelines-pro-bounds-pointer-arithmetic
input_a[index] += input_b[index];
Expand All @@ -28,7 +28,7 @@ __global__ auto add_vector(float *input_a, const float *input_b,

void print_device_info() {
int count = 0;
hipError_t err = hipGetDeviceCount(&count);
const hipError_t err = hipGetDeviceCount(&count);
if (err == hipErrorInvalidDevice) {
std::cout << "FAIL: invalid device" << std::endl;
}
Expand Down Expand Up @@ -93,8 +93,8 @@ auto main() -> int {
hip_assert(hipMemcpy(device_input_b, host_input_b, kDimension * sizeof(float),
hipMemcpyHostToDevice));

dim3 grid_dim = dim3(kDimension / kThreadsPerBlockX);
dim3 block_dim = dim3(kThreadsPerBlockX);
const dim3 grid_dim = dim3(kDimension / kThreadsPerBlockX);
const dim3 block_dim = dim3(kThreadsPerBlockX);

hipLaunchKernelGGL(add_vector, grid_dim, block_dim, 0, nullptr,
device_input_a, device_input_b, kDimension);
Expand Down
2 changes: 2 additions & 0 deletions examples/hip_rdc_example/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ ll_binary(
],
compilation_mode = "hip_amdgpu",
compile_flags = COMPILE_FLAGS_AMDGPU,
visibility = ["@//:__pkg__"],
deps = [
":add_amdgpu",
":multiply_amdgpu",
Expand Down Expand Up @@ -93,6 +94,7 @@ ll_binary(
],
compilation_mode = "hip_nvptx",
compile_flags = COMPILE_FLAGS_NVPTX,
visibility = ["@//:__pkg__"],
deps = [
":add_nvptx",
":multiply_nvptx",
Expand Down
2 changes: 1 addition & 1 deletion examples/hip_rdc_example/add.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

__device__ auto add_vector(float *input_a, const float *input_b,
const int dimension) -> void {
int index = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t index = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (index < dimension) {
// NOLINTNEXTLINE cppcoreguidelines-pro-bounds-pointer-arithmetic
input_a[index] += input_b[index];
Expand Down
4 changes: 2 additions & 2 deletions examples/hip_rdc_example/main.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ auto main() -> int {
hip_assert(hipMemcpy(device_input_c, host_input_c, kDimension * sizeof(float),
hipMemcpyHostToDevice));

dim3 grid_dim = dim3(kDimension / kThreadsPerBlockX);
dim3 block_dim = dim3(kThreadsPerBlockX);
const dim3 grid_dim = dim3(kDimension / kThreadsPerBlockX);
const dim3 block_dim = dim3(kThreadsPerBlockX);

hipLaunchKernelGGL(multiply_add, grid_dim, block_dim, 0, nullptr,
device_input_a, device_input_b, device_input_c,
Expand Down
2 changes: 1 addition & 1 deletion examples/hip_rdc_example/multiply.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

__device__ auto multiply_vector(float *input_a, const float *input_b,
const int dimension) -> void {
int index = blockIdx.x * blockDim.x + threadIdx.x;
const uint32_t index = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;
if (index < dimension) {
// NOLINTNEXTLINE cppcoreguidelines-pro-bounds-pointer-arithmetic
input_a[index] *= input_b[index];
Expand Down
2 changes: 1 addition & 1 deletion ll/compilation_database.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ with open(sys.argv[1], 'r') as in_file, open(sys.argv[2], 'w') as out_file:
# Workaround for https://github.com/llvm/llvm-project/issues/59291.
for arg in fragment['arguments']:
if arg == '-xcuda':
if arg in ['-xcuda', '-xhip']:
fragment['arguments'] += ['--offload-host-only']
if arg.startswith('--offload-arch'):
fragment['arguments'].remove(arg)
Expand Down
1 change: 1 addition & 0 deletions ll/init.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def _initialize_rules_ll_impl(_):
"@rules_ll//patches:hipamd_inconsistent_overrides.diff",
"@rules_ll//patches:hipamd_fix_extraneous_parentheses.diff",
"@rules_ll//patches:hipamd_default_visibility.diff",
"@rules_ll//patches:hipamd_enforce_semicolon.diff",
],
patch_args = ["-p1"],
)
Expand Down

0 comments on commit 70da306

Please sign in to comment.