Skip to content

Commit a7ba6de

Browse files
authored
[OpenMP] [Xteam] Added envar LIBOMPTARGET_AMDGPU_XTEAM_BLOCKSIZE (#482)
This environment variable overrides the block size used for Xteam reduction kernels. The default is 0, which means no override is applied.
1 parent 5348141 commit a7ba6de

File tree

2 files changed

+18
-0
lines changed

2 files changed

+18
-0
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -946,6 +946,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
946946
if (ThreadLimitClause[0] > 0 && ThreadLimitClause[0] != (uint32_t)-1 &&
947947
ThreadLimitClause[0] <= static_cast<uint32_t>(ConstWGSize))
948948
return llvm::omp::getBlockSizeAsPowerOfTwo(ThreadLimitClause[0]);
949+
uint32_t BlockSizeOverride = GenericDevice.getOMPXXteamBlockSize();
950+
if (BlockSizeOverride > 0 &&
951+
BlockSizeOverride <= static_cast<int32_t>(ConstWGSize))
952+
return llvm::omp::getBlockSizeAsPowerOfTwo(BlockSizeOverride);
949953
assert(((ConstWGSize & (ConstWGSize - 1)) == 0) &&
950954
"XTeam Reduction blocksize must be a power of two");
951955
return ConstWGSize;
@@ -3096,6 +3100,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
30963100
"LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS", 1),
30973101
OMPX_GenericSpmdUseSmallBlockSize(
30983102
"LIBOMPTARGET_AMDGPU_GENERIC_SPMD_USE_SMALL_BLOCKSIZE", 1),
3103+
OMPX_XteamBlockSize("LIBOMPTARGET_AMDGPU_XTEAM_BLOCKSIZE", 0),
30993104
OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
31003105
64 * 1024),
31013106
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
@@ -3235,6 +3240,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
32353240
/// Return the current value of the OMPX_GenericSpmdUseSmallBlockSize envar
/// (LIBOMPTARGET_AMDGPU_GENERIC_SPMD_USE_SMALL_BLOCKSIZE).
virtual bool getOMPXGenericSpmdUseSmallBlockSize() const override {
32363241
return OMPX_GenericSpmdUseSmallBlockSize;
32373242
}
3243+
/// Return the current value of the OMPX_XteamBlockSize envar
/// (LIBOMPTARGET_AMDGPU_XTEAM_BLOCKSIZE). The envar defaults to 0, which
/// indicates that no runtime override was requested.
virtual uint32_t getOMPXXteamBlockSize() const override {
3244+
return OMPX_XteamBlockSize;
3245+
}
32383246

32393247
/// Return the device timestamp; this implementation forwards to
/// getSystemTimestampInNs().
uint64_t getDeviceTimeStamp() override { return getSystemTimestampInNs(); }
32403248

@@ -4872,6 +4880,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
48724880
/// be reduced and the corresponding number of teams adjusted.
48734881
BoolEnvar OMPX_GenericSpmdUseSmallBlockSize;
48744882

4883+
/// Envar indicating the blocksize to be used for Xteam reduction kernels. The
4884+
/// default of 0 indicates that there is no runtime override and the value
4885+
/// indicated by CodeGen will be used. If a non-zero value is specified, the
4886+
/// runtime will attempt to use it as an override if other constraints are
4887+
/// satisfied.
4888+
UInt32Envar OMPX_XteamBlockSize;
4889+
48754890
/// Envar specifying the maximum size in bytes where the memory copies are
48764891
/// asynchronous operations. Up to this transfer size, the memory copies are
48774892
/// asynchronous operations pushed to the corresponding stream. For larger

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,6 +1164,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
11641164
/// Default implementation for devices that do not provide this setting:
/// reaching it is treated as a programming error via llvm_unreachable.
/// Derived devices are expected to override it.
virtual bool getOMPXGenericSpmdUseSmallBlockSize() const {
11651165
llvm_unreachable("Unimplemented");
11661166
}
1167+
/// Query the Xteam reduction block-size override. This default
/// implementation is not meant to be called: it hits llvm_unreachable, so
/// derived devices that support the setting must override it.
virtual uint32_t getOMPXXteamBlockSize() const {
1168+
llvm_unreachable("Unimplemented");
1169+
}
11671170

11681171
/// Get target compute unit kind (e.g., sm_80, or gfx908).
/// This default implementation returns the string "unknown"; the method is
/// virtual so derived devices can supply their own value.
11691172
virtual std::string getComputeUnitKind() const { return "unknown"; }

0 commit comments

Comments
 (0)