Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Headers][NFC] Steps to allow sharing code between gpu intrin.h headers #131134

Merged
merged 1 commit into from
Mar 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 2 additions & 20 deletions clang/lib/Headers/amdgpuintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,8 @@
#error "This file is intended for AMDGPU targets or offloading to AMDGPU"
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#ifndef __GPUINTRIN_H
#error "Never use <amdgpuintrin.h> directly; include <gpuintrin.h> instead"
#endif

_Pragma("omp begin declare target device_type(nohost)");
Expand Down Expand Up @@ -146,17 +143,6 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
return __builtin_amdgcn_ds_bpermute(__lane << 2, __x);
}

// Shuffles the the lanes inside the wavefront according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
return ((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __hi, __width)
<< 32ull) |
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
Expand Down Expand Up @@ -238,8 +224,4 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
#endif

#endif // __AMDGPUINTRIN_H
26 changes: 19 additions & 7 deletions clang/lib/Headers/gpuintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@
#endif
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif

#if defined(__NVPTX__)
#include <nvptxintrin.h>
#elif defined(__AMDGPU__)
Expand All @@ -33,13 +40,6 @@
#error "This header is only meant to be used on GPU architectures."
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#endif

_Pragma("omp begin declare target device_type(nohost)");
_Pragma("omp begin declare variant match(device = {kind(gpu)})");

Expand Down Expand Up @@ -141,6 +141,18 @@ __gpu_read_first_lane_f64(uint64_t __lane_mask, double __x) {
__builtin_bit_cast(uint64_t, __x)));
}

// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
uint32_t __mask = (uint32_t)__lane_mask;
return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
<< 32ull) |
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}

// Shuffles the the lanes according to the given index.
_DEFAULT_FN_ATTRS static __inline__ float
__gpu_shuffle_idx_f32(uint64_t __lane_mask, uint32_t __idx, float __x,
Expand Down
27 changes: 4 additions & 23 deletions clang/lib/Headers/nvptxintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,12 @@
#error "This file is intended for NVPTX targets or offloading to NVPTX"
#endif

#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 0
#ifndef __GPUINTRIN_H
#error "Never use <nvptxintrin.h> directly; include <gpuintrin.h> instead"
#endif

#include <stdint.h>

#if !defined(__cplusplus)
_Pragma("push_macro(\"bool\")");
#define bool _Bool
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 0
#endif

_Pragma("omp begin declare target device_type(nohost)");
Expand Down Expand Up @@ -153,18 +150,6 @@ __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
((__gpu_num_lanes() - __width) << 8u) | 0x1f);
}

// Shuffles the the lanes inside the warp according to the given index.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
uint32_t __width) {
uint32_t __hi = (uint32_t)(__x >> 32ull);
uint32_t __lo = (uint32_t)(__x & 0xFFFFFFFF);
uint32_t __mask = (uint32_t)__lane_mask;
return ((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __hi, __width)
<< 32ull) |
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
Expand Down Expand Up @@ -263,8 +248,4 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_thread_suspend(void) {
_Pragma("omp end declare variant");
_Pragma("omp end declare target");

#if !defined(__cplusplus)
_Pragma("pop_macro(\"bool\")");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where did this go?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Up into gpuintrin.h

#endif

#endif // __NVPTXINTRIN_H
Loading