ROCm 5.1.3 updates

ROCm · May 20, 2022 · 9f759e7 · 9f759e7
1 parent 2d66b99
commit 9f759e7
Show file tree

Hide file tree

Showing 4 changed files with 27 additions and 10 deletions.
diff --git a/src/core/inc/scratch_cache.h b/src/core/inc/scratch_cache.h
@@ -94,11 +94,12 @@ class ScratchCache {
     size_t size_per_thread;
     uint32_t lanes_per_wave;
     uint32_t waves_per_group;
+    uint64_t wanted_slots;
+    bool cooperative;
     ptrdiff_t queue_process_offset;
     bool large;
     bool retry;
     hsa_signal_t queue_retry;
-    uint64_t wanted_slots;
     ScratchCache::ref_t scratch_node;
   };
 
@@ -135,7 +136,6 @@ class ScratchCache {
       if (it->second.isFree()) {
         it->second.alloc();
         info.queue_base = it->second.base;
-        info.size = it->first;
         info.scratch_node = it;
         available_bytes -= it->first;
         return true;
@@ -155,7 +155,6 @@ class ScratchCache {
     }
     it->second.free();
     available_bytes += it->first;
-    assert(it->first == info.size && "Scratch cache size mismatch.");
   }
 
   bool trim(bool trim_nodes_in_use) {

diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp
@@ -831,15 +831,21 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
       assert((scratch_request != 0) &&
              "Scratch memory request from packet with no scratch demand.  Possible bad kernel code object.");
 
+      // Get the hw maximum scratch slot count taking into consideration asymmetric harvest.
+      const uint32_t engines = queue->agent_->properties().NumShaderBanks;
+      const uint32_t cu_count = queue->amd_queue_.max_cu_id + 1;
       const uint32_t MaxScratchSlots =
-          (queue->amd_queue_.max_cu_id + 1) * queue->agent_->properties().MaxSlotsScratchCU;
+          AlignUp(cu_count, engines) * queue->agent_->properties().MaxSlotsScratchCU;
 
       scratch.size_per_thread = scratch_request;
       scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
       // Align whole waves to 1KB.
       scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
       scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;
 
+      // Smaller dispatches may not need to reach full device occupancy.
+      // For these we need to ensure that the scratch we give doesn't restrict the dispatch even
+      // though it does not fill the device. Figure the total requested dispatch size.
       uint64_t lanes_per_group =
           (uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
           pkt.dispatch.workgroup_size_z;
@@ -854,14 +860,24 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
           ((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
            pkt.dispatch.workgroup_size_z);
 
-      // Assign an equal number of groups to each engine, clipping to capacity limits
-      const uint32_t engines = queue->agent_->properties().NumShaderBanks;
-      groups = ((groups + engines - 1) / engines) * engines;
+      // Find the maximum number of groups assigned to any engine.
+      const uint32_t symmetric_cus = AlignDown(cu_count, engines);
+      const uint32_t asymmetryPerRound = cu_count - symmetric_cus;
+      const uint64_t rounds = groups / cu_count;
+      const uint64_t asymmetricGroups = rounds * asymmetryPerRound;
+      const uint64_t symmetricGroups = groups - asymmetricGroups;
+      const uint64_t maxGroupsPerEngine =
+          ((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0);
+
+      // Populate all engines at max group occupancy, then clip down to device limits.
+      groups = maxGroupsPerEngine * engines;
       scratch.wanted_slots = groups * waves_per_group;
       scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
       scratch.dispatch_size =
           scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave;
 
+      scratch.cooperative = (queue->amd_queue_.hsa_queue.type == HSA_QUEUE_TYPE_COOPERATIVE);
+
       queue->agent_->AcquireQueueScratch(scratch);
 
       if (scratch.retry) {
@@ -1328,6 +1344,8 @@ void AqlQueue::InitScratchSRD() {
   uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024);
   tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
   amd_queue_.compute_tmpring_size = tmpring_size.u32All;
+  assert((tmpring_size.bits.WAVES % agent_props.NumShaderBanks == 0) &&
+         "Invalid scratch wave count.  Must be divisible by #SEs.");
   return;
 }
 

diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp
@@ -1230,7 +1230,8 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
     }
 
     // Fail scratch allocation if reducing occupancy is disabled.
-    if ((!use_reclaim) || core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter())
+    if (scratch.cooperative || (!use_reclaim) ||
+        core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter())
       return;
 
     // Attempt to trim the maximum number of concurrent waves to allow scratch to fit.

diff --git a/src/core/util/utils.h b/src/core/util/utils.h
@@ -237,8 +237,7 @@ static __forceinline bool IsPowerOfTwo(T val) {
 /// @return: T.
 template <typename T>
 static __forceinline T AlignDown(T value, size_t alignment) {
-  assert(IsPowerOfTwo(alignment));
-  return (T)(value & ~(alignment - 1));
+  return (T)((value / alignment) * alignment);
 }
 
 /// @brief: Same as previous one, but first parameter becomes pointer, for more