From b085f164b9654f814d3ed9710e4c2c8479ea9e09 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 13 Feb 2024 09:10:14 -0600 Subject: [PATCH] [libc] Remove remaining GPU architecture dependent instructions Summary: Recent patches have added solutions to the remaining sources of divergence. This patch simply removes the last occurrences of things like `has_builtin`, `ifdef` or builtins with feature requirements. The one exception here is `nanosleep`, but I made changes in the `__nvvm_reflect` pass to make usage like this actually work at O0. Depends on https://github.com/llvm/llvm-project/pull/81331 --- libc/src/__support/GPU/amdgpu/utils.h | 9 +-------- libc/src/__support/GPU/nvptx/utils.h | 6 ++---- libc/src/__support/RPC/rpc_util.h | 5 +++-- libc/src/time/gpu/nanosleep.cpp | 5 +++-- libc/src/time/gpu/time_utils.h | 21 +++++---------------- 5 files changed, 14 insertions(+), 32 deletions(-) diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h index 9432b7b39f7836..75f0b5744ebd72 100644 --- a/libc/src/__support/GPU/amdgpu/utils.h +++ b/libc/src/__support/GPU/amdgpu/utils.h @@ -152,14 +152,7 @@ LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } /// Returns a fixed-frequency timestamp. The actual frequency is dependent on /// the card and can only be queried via the driver. LIBC_INLINE uint64_t fixed_frequency_clock() { - if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl)) - return __builtin_amdgcn_s_sendmsg_rtnl(0x83); - else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime)) - return __builtin_amdgcn_s_memrealtime(); - else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime)) - return __builtin_amdgcn_s_memtime(); - else - return 0; + return __builtin_readsteadycounter(); } /// Terminates execution of the associated wavefront. 
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h index 6c4bb5a7720a50..22a46e87cfc05d 100644 --- a/libc/src/__support/GPU/nvptx/utils.h +++ b/libc/src/__support/GPU/nvptx/utils.h @@ -135,13 +135,11 @@ LIBC_INLINE uint32_t get_lane_size() { return 32; } } /// Returns the current value of the GPU's processor clock. -LIBC_INLINE uint64_t processor_clock() { - return __nvvm_read_ptx_sreg_clock64(); -} +LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); } /// Returns a global fixed-frequency timer at nanosecond frequency. LIBC_INLINE uint64_t fixed_frequency_clock() { - return __nvvm_read_ptx_sreg_globaltimer(); + return __builtin_readsteadycounter(); } /// Terminates execution of the calling thread. diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h index ff9569298a1ed7..cc2a11a1108e01 100644 --- a/libc/src/__support/RPC/rpc_util.h +++ b/libc/src/__support/RPC/rpc_util.h @@ -21,8 +21,9 @@ namespace rpc { /// Suspend the thread briefly to assist the thread scheduler during busy loops. 
LIBC_INLINE void sleep_briefly() { -#if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700 - __nvvm_nanosleep(64); +#if defined(LIBC_TARGET_ARCH_IS_NVPTX) + if (__nvvm_reflect("__CUDA_ARCH") >= 700) + LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory"); #elif defined(LIBC_TARGET_ARCH_IS_AMDGPU) __builtin_amdgcn_s_sleep(2); #elif defined(LIBC_TARGET_ARCH_IS_X86) diff --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp index 34ff904c49c65b..dd669ff46c75c8 100644 --- a/libc/src/time/gpu/nanosleep.cpp +++ b/libc/src/time/gpu/nanosleep.cpp @@ -23,14 +23,15 @@ LLVM_LIBC_FUNCTION(int, nanosleep, uint64_t tick_rate = TICKS_PER_SEC / GPU_CLOCKS_PER_SEC; uint64_t start = gpu::fixed_frequency_clock(); -#if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700 +#if defined(LIBC_TARGET_ARCH_IS_NVPTX) uint64_t end = start + (nsecs + tick_rate - 1) / tick_rate; uint64_t cur = gpu::fixed_frequency_clock(); // The NVPTX architecture supports sleeping and guaruntees the actual time // slept will be somewhere between zero and twice the requested amount. Here // we will sleep again if we undershot the time. while (cur < end) { - __nvvm_nanosleep(static_cast<uint32_t>(nsecs)); + if (__nvvm_reflect("__CUDA_ARCH") >= 700) + LIBC_INLINE_ASM("nanosleep.u32 %0;" ::"r"(nsecs)); cur = gpu::fixed_frequency_clock(); nsecs -= nsecs > cur - start ? cur - start : 0; } diff --git a/libc/src/time/gpu/time_utils.h b/libc/src/time/gpu/time_utils.h index 531a748665b07b..8a9a5f0f65b89c 100644 --- a/libc/src/time/gpu/time_utils.h +++ b/libc/src/time/gpu/time_utils.h @@ -15,24 +15,13 @@ namespace LIBC_NAMESPACE { #if defined(LIBC_TARGET_ARCH_IS_AMDGPU) // AMDGPU does not have a single set frequency. Different architectures and -// cards can have vary values. Here we default to a few known values, but for -// complete support the frequency needs to be read from the kernel driver. 
-#if defined(__GFX10__) || defined(__GFX11__) || defined(__GFX12__) || \ - defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) -// These architectures use a 100 MHz fixed frequency clock. -constexpr uint64_t clock_freq = 100000000; -#elif defined(__GFX9__) -// These architectures use a 25 MHz fixed frequency clock expect for Vega 10 -// which is actually 27 Mhz. We default to 25 MHz in all cases anyway. -constexpr uint64_t clock_freq = 25000000; -#else -// The frequency for these architecture is unknown. We simply default to zero. -constexpr uint64_t clock_freq = 0; -#endif +// cards can have different values. The actual frequency needs to be read from +// the kernel driver and will be between 25 MHz and 100 MHz on most cards. All +// cards following the GFX9 ISAs use a 100 MHz clock so we will default to that. +constexpr uint64_t clock_freq = 100000000UL; // We provide an externally visible symbol such that the runtime can set -// this to the correct value. If it is not set we try to default to the -// known values. +// this to the correct value. extern "C" [[gnu::visibility("protected")]] uint64_t [[clang::address_space(4)]] __llvm_libc_clock_freq; #define GPU_CLOCKS_PER_SEC static_cast<uint64_t>(__llvm_libc_clock_freq)