Skip to content

Commit

Permalink
ROCm 5.1.3 updates
Browse files Browse the repository at this point in the history
  • Loading branch information
skeelyamd committed May 20, 2022
1 parent 2d66b99 commit 9f759e7
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 10 deletions.
5 changes: 2 additions & 3 deletions src/core/inc/scratch_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,11 +94,12 @@ class ScratchCache {
size_t size_per_thread;
uint32_t lanes_per_wave;
uint32_t waves_per_group;
uint64_t wanted_slots;
bool cooperative;
ptrdiff_t queue_process_offset;
bool large;
bool retry;
hsa_signal_t queue_retry;
uint64_t wanted_slots;
ScratchCache::ref_t scratch_node;
};

Expand Down Expand Up @@ -135,7 +136,6 @@ class ScratchCache {
if (it->second.isFree()) {
it->second.alloc();
info.queue_base = it->second.base;
info.size = it->first;
info.scratch_node = it;
available_bytes -= it->first;
return true;
Expand All @@ -155,7 +155,6 @@ class ScratchCache {
}
it->second.free();
available_bytes += it->first;
assert(it->first == info.size && "Scratch cache size mismatch.");
}

bool trim(bool trim_nodes_in_use) {
Expand Down
26 changes: 22 additions & 4 deletions src/core/runtime/amd_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -831,15 +831,21 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
assert((scratch_request != 0) &&
"Scratch memory request from packet with no scratch demand. Possible bad kernel code object.");

// Get the hw maximum scratch slot count taking into consideration asymmetric harvest.
const uint32_t engines = queue->agent_->properties().NumShaderBanks;
const uint32_t cu_count = queue->amd_queue_.max_cu_id + 1;
const uint32_t MaxScratchSlots =
(queue->amd_queue_.max_cu_id + 1) * queue->agent_->properties().MaxSlotsScratchCU;
AlignUp(cu_count, engines) * queue->agent_->properties().MaxSlotsScratchCU;

scratch.size_per_thread = scratch_request;
scratch.lanes_per_wave = (error_code & 0x400) ? 32 : 64;
// Align whole waves to 1KB.
scratch.size_per_thread = AlignUp(scratch.size_per_thread, 1024 / scratch.lanes_per_wave);
scratch.size = scratch.size_per_thread * MaxScratchSlots * scratch.lanes_per_wave;

// Smaller dispatches may not need to reach full device occupancy.
// For these we need to ensure that the scratch we give doesn't restrict the dispatch even
// though it does not fill the device. Figure the total requested dispatch size.
uint64_t lanes_per_group =
(uint64_t(pkt.dispatch.workgroup_size_x) * pkt.dispatch.workgroup_size_y) *
pkt.dispatch.workgroup_size_z;
Expand All @@ -854,14 +860,24 @@ bool AqlQueue::DynamicScratchHandler(hsa_signal_value_t error_code, void* arg) {
((uint64_t(pkt.dispatch.grid_size_z) + pkt.dispatch.workgroup_size_z - 1) /
pkt.dispatch.workgroup_size_z);

// Assign an equal number of groups to each engine, clipping to capacity limits
const uint32_t engines = queue->agent_->properties().NumShaderBanks;
groups = ((groups + engines - 1) / engines) * engines;
// Find the maximum number of groups assigned to any engine.
const uint32_t symmetric_cus = AlignDown(cu_count, engines);
const uint32_t asymmetryPerRound = cu_count - symmetric_cus;
const uint64_t rounds = groups / cu_count;
const uint64_t asymmetricGroups = rounds * asymmetryPerRound;
const uint64_t symmetricGroups = groups - asymmetricGroups;
const uint64_t maxGroupsPerEngine =
((symmetricGroups + engines - 1) / engines) + (asymmetryPerRound ? rounds : 0);

// Populate all engines at max group occupancy, then clip down to device limits.
groups = maxGroupsPerEngine * engines;
scratch.wanted_slots = groups * waves_per_group;
scratch.wanted_slots = Min(scratch.wanted_slots, uint64_t(MaxScratchSlots));
scratch.dispatch_size =
scratch.size_per_thread * scratch.wanted_slots * scratch.lanes_per_wave;

scratch.cooperative = (queue->amd_queue_.hsa_queue.type == HSA_QUEUE_TYPE_COOPERATIVE);

queue->agent_->AcquireQueueScratch(scratch);

if (scratch.retry) {
Expand Down Expand Up @@ -1328,6 +1344,8 @@ void AqlQueue::InitScratchSRD() {
uint32_t num_waves = queue_scratch_.size / (tmpring_size.bits.WAVESIZE * 1024);
tmpring_size.bits.WAVES = std::min(num_waves, max_scratch_waves);
amd_queue_.compute_tmpring_size = tmpring_size.u32All;
assert((tmpring_size.bits.WAVES % agent_props.NumShaderBanks == 0) &&
"Invalid scratch wave count. Must be divisible by #SEs.");
return;
}

Expand Down
3 changes: 2 additions & 1 deletion src/core/runtime/amd_gpu_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1230,7 +1230,8 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) {
}

// Fail scratch allocation if reducing occupancy is disabled.
if ((!use_reclaim) || core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter())
if (scratch.cooperative || (!use_reclaim) ||
core::Runtime::runtime_singleton_->flag().no_scratch_thread_limiter())
return;

// Attempt to trim the maximum number of concurrent waves to allow scratch to fit.
Expand Down
3 changes: 1 addition & 2 deletions src/core/util/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,7 @@ static __forceinline bool IsPowerOfTwo(T val) {
/// @return: T.
template <typename T>
static __forceinline T AlignDown(T value, size_t alignment) {
assert(IsPowerOfTwo(alignment));
return (T)(value & ~(alignment - 1));
return (T)((value / alignment) * alignment);
}

/// @brief: Same as previous one, but first parameter becomes pointer, for more
Expand Down

0 comments on commit 9f759e7

Please sign in to comment.