Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement accurate max block size in hipFuncGetAttributes() #1676

Merged
merged 9 commits into from
Mar 18, 2020
54 changes: 45 additions & 9 deletions src/hip_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1047,12 +1047,14 @@ hipError_t hipModuleGetFunctionEx(hipFunction_t* hfunc, hipModule_t hmod,
return ihipLogStatus(ihipModuleGetFunction(tls, hfunc, hmod, name, agent));
}

void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS);

namespace {
// Views the kernel header stored in `kd` as a code-object-v3 descriptor.
// This is purely a pointer reinterpretation of kd._header: nothing is copied
// and no check is made here that the symbol really is a v3 code object.
const amd_kernel_code_v3_t* header_v3(const ihipModuleSymbol_t& kd) {
    using v3_header_ptr = const amd_kernel_code_v3_t*;
    return reinterpret_cast<v3_header_ptr>(kd._header);
}

hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_t& kd) {
hipFuncAttributes make_function_attributes(TlsData *tls, ihipModuleSymbol_t& kd) {
hipFuncAttributes r{};

hipDeviceProp_t prop{};
Expand All @@ -1062,23 +1064,57 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_
prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024;

if (kd._is_code_object_v3) {
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
r.binaryVersion = 0; // FIXME: should it be the ISA version or code
// object format version?
r.localSizeBytes = header_v3(kd)->private_segment_fixed_size;
r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size;
} else {
r.localSizeBytes = kd._header->workitem_private_segment_byte_size;
r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size;
r.numRegs = kd._header->workitem_vgpr_count;
r.binaryVersion =
kd._header->amd_machine_version_major * 10 +
kd._header->amd_machine_version_minor;
}
r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes;
r.maxThreadsPerBlock = r.numRegs ?
std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) :
prop.maxThreadsPerBlock;

size_t usedVGPRS = 0;
size_t usedSGPRS = 0;
size_t usedLDS = 0;
getGprsLdsUsage(&kd, &usedVGPRS, &usedSGPRS, &usedLDS);

r.numRegs = usedVGPRS;

size_t wavefrontSize = prop.warpSize;
size_t maxWavefrontsPerBlock = prop.maxThreadsPerBlock / wavefrontSize;
size_t maxWavefrontsPerCU = min(prop.maxThreadsPerMultiProcessor / wavefrontSize, 32);
const size_t numSIMD = 4;
const size_t maxWavesPerSimd = maxWavefrontsPerCU / numSIMD;
size_t maxWaves = 0;
for (int i = 0; i < maxWavefrontsPerBlock; i++) {
size_t wavefronts = i + 1;

if (usedVGPRS > 0) {
size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / numSIMD);
size_t vgprs_alu_occupancy = numSIMD * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS);

// Calculate blocks occupancy per CU based on VGPR usage
if (vgprs_alu_occupancy < wavefronts)
break;
}

if (usedSGPRS > 0) {
const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800;
size_t sgprs_alu_occupancy = numSIMD * ((usedSGPRS == 0) ? maxWavesPerSimd
: std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS));

// Calculate blocks occupancy per CU based on SGPR usage
if (sgprs_alu_occupancy < wavefronts)
break;
}
maxWaves = wavefronts;
}

r.maxThreadsPerBlock = maxWaves * wavefrontSize;
r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0.

return r;
Expand Down Expand Up @@ -1230,7 +1266,7 @@ void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size
{
if (f->_is_code_object_v3) {
const auto header = reinterpret_cast<const amd_kernel_code_v3_t*>(f->_header);
// GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
// GRANULATED_WORKITEM_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1
// the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1)
*usedVGPRS = ((header->compute_pgm_rsrc1 & 0x3F) + 1) << 2;
// GRANULATED_WAVEFRONT_SGPR_COUNT is specified in 6:9 bits of COMPUTE_PGM_RSRC1
Expand Down