-
Notifications
You must be signed in to change notification settings - Fork 12.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc] Remove remaining GPU architecture dependent instructions #81612
Conversation
@llvm/pr-subscribers-libc @llvm/pr-subscribers-backend-amdgpu Author: Joseph Huber (jhuber6) ChangesSummary: Depends on #81331 Full diff: https://github.com/llvm/llvm-project/pull/81612.diff 5 Files Affected:
diff --git a/libc/src/__support/GPU/amdgpu/utils.h b/libc/src/__support/GPU/amdgpu/utils.h
index 58bbe29cb3a7d7..6f704a262fbdf5 100644
--- a/libc/src/__support/GPU/amdgpu/utils.h
+++ b/libc/src/__support/GPU/amdgpu/utils.h
@@ -152,14 +152,7 @@ LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
/// Returns a fixed-frequency timestamp. The actual frequency is dependent on
/// the card and can only be queried via the driver.
LIBC_INLINE uint64_t fixed_frequency_clock() {
- if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_sendmsg_rtnl))
- return __builtin_amdgcn_s_sendmsg_rtnl(0x83);
- else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memrealtime))
- return __builtin_amdgcn_s_memrealtime();
- else if constexpr (LIBC_HAS_BUILTIN(__builtin_amdgcn_s_memtime))
- return __builtin_amdgcn_s_memtime();
- else
- return 0;
+ return __builtin_readsteadycounter();
}
/// Terminates execution of the associated wavefront.
diff --git a/libc/src/__support/GPU/nvptx/utils.h b/libc/src/__support/GPU/nvptx/utils.h
index e7e297adf7ecca..5a31d7c140587f 100644
--- a/libc/src/__support/GPU/nvptx/utils.h
+++ b/libc/src/__support/GPU/nvptx/utils.h
@@ -138,13 +138,11 @@ LIBC_INLINE uint32_t get_lane_size() { return LANE_SIZE; }
}
/// Returns the current value of the GPU's processor clock.
-LIBC_INLINE uint64_t processor_clock() {
- return __nvvm_read_ptx_sreg_clock64();
-}
+LIBC_INLINE uint64_t processor_clock() { return __builtin_readcyclecounter(); }
/// Returns a global fixed-frequency timer at nanosecond frequency.
LIBC_INLINE uint64_t fixed_frequency_clock() {
- return __nvvm_read_ptx_sreg_globaltimer();
+ return __builtin_readsteadycounter();
}
/// Terminates execution of the calling thread.
diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h
index ff9569298a1ed7..cc2a11a1108e01 100644
--- a/libc/src/__support/RPC/rpc_util.h
+++ b/libc/src/__support/RPC/rpc_util.h
@@ -21,8 +21,9 @@ namespace rpc {
/// Suspend the thread briefly to assist the thread scheduler during busy loops.
LIBC_INLINE void sleep_briefly() {
-#if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700
- __nvvm_nanosleep(64);
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
+ if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+ LIBC_INLINE_ASM("nanosleep.u32 64;" ::: "memory");
#elif defined(LIBC_TARGET_ARCH_IS_AMDGPU)
__builtin_amdgcn_s_sleep(2);
#elif defined(LIBC_TARGET_ARCH_IS_X86)
diff --git a/libc/src/time/gpu/nanosleep.cpp b/libc/src/time/gpu/nanosleep.cpp
index e84fe622100e80..5cf9e961e81256 100644
--- a/libc/src/time/gpu/nanosleep.cpp
+++ b/libc/src/time/gpu/nanosleep.cpp
@@ -22,14 +22,15 @@ LLVM_LIBC_FUNCTION(int, nanosleep,
uint64_t nsecs = req->tv_nsec + req->tv_sec * TICKS_PER_NS;
uint64_t start = gpu::fixed_frequency_clock();
-#if defined(LIBC_TARGET_ARCH_IS_NVPTX) && __CUDA_ARCH__ >= 700
+#if defined(LIBC_TARGET_ARCH_IS_NVPTX)
uint64_t end = start + nsecs / (TICKS_PER_NS / GPU_CLOCKS_PER_SEC);
uint64_t cur = gpu::fixed_frequency_clock();
// The NVPTX architecture supports sleeping and guaruntees the actual time
// slept will be somewhere between zero and twice the requested amount. Here
// we will sleep again if we undershot the time.
while (cur < end) {
- __nvvm_nanosleep(static_cast<uint32_t>(nsecs));
+ if (__nvvm_reflect("__CUDA_ARCH") >= 700)
+ LIBC_INLINE_ASM("nanosleep.u32 %0;" ::"r"(nsecs));
cur = gpu::fixed_frequency_clock();
nsecs -= nsecs > cur - start ? cur - start : 0;
}
diff --git a/libc/src/time/gpu/time_utils.h b/libc/src/time/gpu/time_utils.h
index 531a748665b07b..b1edbbb387b37b 100644
--- a/libc/src/time/gpu/time_utils.h
+++ b/libc/src/time/gpu/time_utils.h
@@ -15,24 +15,14 @@ namespace LIBC_NAMESPACE {
#if defined(LIBC_TARGET_ARCH_IS_AMDGPU)
// AMDGPU does not have a single set frequency. Different architectures and
-// cards can have vary values. Here we default to a few known values, but for
-// complete support the frequency needs to be read from the kernel driver.
-#if defined(__GFX10__) || defined(__GFX11__) || defined(__GFX12__) || \
- defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-// These architectures use a 100 MHz fixed frequency clock.
-constexpr uint64_t clock_freq = 100000000;
-#elif defined(__GFX9__)
-// These architectures use a 25 MHz fixed frequency clock expect for Vega 10
-// which is actually 27 Mhz. We default to 25 MHz in all cases anyway.
-constexpr uint64_t clock_freq = 25000000;
-#else
-// The frequency for these architecture is unknown. We simply default to zero.
-constexpr uint64_t clock_freq = 0;
+// cards can have different values. The actualy frequency needs to be read from
+// the kernel driver and will be between 25 MHz and 100 MHz on most cards. All
+// cards following the GFX9 ISAs use a 100 MHz clock so we will default to that.
+constexpr uint64_t clock_freq = 100000000UL;
#endif
// We provide an externally visible symbol such that the runtime can set
-// this to the correct value. If it is not set we try to default to the
-// known values.
+// this to the correct value.
extern "C" [[gnu::visibility("protected")]] uint64_t
[[clang::address_space(4)]] __llvm_libc_clock_freq;
#define GPU_CLOCKS_PER_SEC static_cast<clock_t>(__llvm_libc_clock_freq)
|
654806c
to
d503724
Compare
uint64_t end = start + nsecs / (TICKS_PER_NS / GPU_CLOCKS_PER_SEC); | ||
uint64_t cur = gpu::fixed_frequency_clock(); | ||
// The NVPTX architecture supports sleeping and guaruntees the actual time | ||
// slept will be somewhere between zero and twice the requested amount. Here | ||
// we will sleep again if we undershot the time. | ||
while (cur < end) { | ||
__nvvm_nanosleep(static_cast<uint32_t>(nsecs)); | ||
if (__nvvm_reflect("__CUDA_ARCH") >= 700) | ||
LIBC_INLINE_ASM("nanosleep.u32 %0;" ::"r"(nsecs)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So __nvvm_nanosleep
is not always available but the asm is always supported?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, I made some changes to make this well-formed. The __nvvm_reflect
pass returns the backend's value of the sm
it's compiling with. That used to be an issue if the ASM was invalid and O0
was run, but I made a patch that runs __nvvm_reflect
in the backend to trim these branches even at O0 in #81253. So, basically this will only make it to PTX if the backend is compiled with sm_70 or greater.
Summary: Recent patches have added solutions to the remaining sources of divergence. This patch simply removes the last occures of things like `has_builtin`, `ifdef` or builtins with feature requirements. The one exception here is `nanosleep`, but I made changes in the `__nvvm_reflect` pass to make usage like this actually work at O0. Depends on llvm#81331
d503724
to
b085f16
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Very nice.
Summary:
Recent patches have added solutions to the remaining sources of
divergence. This patch simply removes the last occures of things like
has_builtin
,ifdef
or builtins with feature requirements. The oneexception here is
nanosleep
, but I made changes in the__nvvm_reflect
pass to make usage like this actually work at O0.Depends on #81331