@@ -946,6 +946,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
946946 if (ThreadLimitClause[0 ] > 0 && ThreadLimitClause[0 ] != (uint32_t )-1 &&
947947 ThreadLimitClause[0 ] <= static_cast <uint32_t >(ConstWGSize))
948948 return llvm::omp::getBlockSizeAsPowerOfTwo (ThreadLimitClause[0 ]);
949+ uint32_t BlockSizeOverride = GenericDevice.getOMPXXteamBlockSize ();
950+ if (BlockSizeOverride > 0 &&
951+ BlockSizeOverride <= static_cast <int32_t >(ConstWGSize))
952+ return llvm::omp::getBlockSizeAsPowerOfTwo (BlockSizeOverride);
949953 assert (((ConstWGSize & (ConstWGSize - 1 )) == 0 ) &&
950954 " XTeam Reduction blocksize must be a power of two" );
951955 return ConstWGSize;
@@ -3096,6 +3100,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
30963100 " LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS" , 1 ),
30973101 OMPX_GenericSpmdUseSmallBlockSize (
30983102 " LIBOMPTARGET_AMDGPU_GENERIC_SPMD_USE_SMALL_BLOCKSIZE" , 1 ),
3103+ OMPX_XteamBlockSize (" LIBOMPTARGET_AMDGPU_XTEAM_BLOCKSIZE" , 0 ),
30993104 OMPX_MaxAsyncCopyBytes (" LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES" ,
31003105 64 * 1024 ),
31013106 OMPX_InitialNumSignals (" LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
@@ -3235,6 +3240,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Return the current value of the OMPX_GenericSpmdUseSmallBlockSize envar,
/// which is constructed from LIBOMPTARGET_AMDGPU_GENERIC_SPMD_USE_SMALL_BLOCKSIZE
/// with a default of 1.
32353240 virtual bool getOMPXGenericSpmdUseSmallBlockSize () const override {
32363241 return OMPX_GenericSpmdUseSmallBlockSize;
32373242 }
/// Return the current value of the OMPX_XteamBlockSize envar, which is
/// constructed from LIBOMPTARGET_AMDGPU_XTEAM_BLOCKSIZE with a default of 0.
/// A value of 0 means no runtime override of the Xteam reduction blocksize.
/// NOTE(review): the visible consumer compares this uint32_t result against
/// static_cast<int32_t>(ConstWGSize), while the adjacent thread-limit check
/// casts to uint32_t -- confirm the signed cast there is intended.
3243+ virtual uint32_t getOMPXXteamBlockSize () const override {
3244+ return OMPX_XteamBlockSize;
3245+ }
32383246
/// Return the device timestamp. This implementation forwards directly to
/// getSystemTimestampInNs() (presumably nanoseconds, going by the helper's
/// name -- confirm against its definition).
32393247 uint64_t getDeviceTimeStamp () override { return getSystemTimestampInNs (); }
32403248
@@ -4872,6 +4880,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
48724880 // / be reduced and the corresponding number of teams adjusted.
48734881 BoolEnvar OMPX_GenericSpmdUseSmallBlockSize;
48744882
4883+ /// Envar (LIBOMPTARGET_AMDGPU_XTEAM_BLOCKSIZE) giving the blocksize to use for
4884+ /// Xteam reduction kernels. The default of 0 means there is no runtime
4885+ /// override and the value indicated by CodeGen is used. A non-zero value is
4886+ /// applied only when no valid thread-limit clause was already honored and the
4887+ /// value does not exceed the CodeGen blocksize; it then goes through getBlockSizeAsPowerOfTwo.
4888+ UInt32Envar OMPX_XteamBlockSize;
4889+
48754890 // / Envar specifying the maximum size in bytes where the memory copies are
48764891 // / asynchronous operations. Up to this transfer size, the memory copies are
48774892 // / asynchronous operations pushed to the corresponding stream. For larger
0 commit comments