From c037ed80e51072da5ac08fbfdf4c433ac0de34b7 Mon Sep 17 00:00:00 2001 From: zhaozhangjian Date: Wed, 8 Apr 2020 15:28:11 +0800 Subject: [PATCH 1/2] In findGlobalPool, return once a fine-grained global pool is found to avoid useless iterations. Also add an ErrorCheck in the constructor of UnpinnedCopyEngine accordingly. --- lib/hsa/unpinned_copy_engine.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/hsa/unpinned_copy_engine.cpp b/lib/hsa/unpinned_copy_engine.cpp index fdcd2168d82..a29b7ee14b7 100644 --- a/lib/hsa/unpinned_copy_engine.cpp +++ b/lib/hsa/unpinned_copy_engine.cpp @@ -52,6 +52,7 @@ static hsa_status_t findGlobalPool(hsa_amd_memory_pool_t pool, void* data) if ((HSA_AMD_SEGMENT_GLOBAL == segment) && (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)) { *((hsa_amd_memory_pool_t*)data) = pool; + return HSA_STATUS_INFO_BREAK; } return HSA_STATUS_SUCCESS; } @@ -96,6 +97,7 @@ UnpinnedCopyEngine::UnpinnedCopyEngine(hsa_agent_t hsaAgent, hsa_agent_t cpuAgen { hsa_amd_memory_pool_t sys_pool; hsa_status_t err = hsa_amd_agent_iterate_memory_pools(_cpuAgent, findGlobalPool, &sys_pool); + ErrorCheck(err); // Generate a packed C-style array of agents, for use below with hsa_amd_agents_allow_access // TODO - should this include the CPU agents as well? 
From 28e2b8f155e73656ae58555319e079e4a8545b88 Mon Sep 17 00:00:00 2001 From: zhaozhangjian Date: Tue, 28 Apr 2020 17:07:33 +0800 Subject: [PATCH 2/2] Avoid unnecessary locking when getting the thread-local queue --- include/kalmar_runtime.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/include/kalmar_runtime.h b/include/kalmar_runtime.h index 46766050e5e..9efb7f75995 100644 --- a/include/kalmar_runtime.h +++ b/include/kalmar_runtime.h @@ -421,13 +421,18 @@ class KalmarDevice }); return def; #else + std::shared_ptr result; std::thread::id tid = std::this_thread::get_id(); - tlsDefaultQueueMap_mutex.lock(); - if (tlsDefaultQueueMap.find(tid) == tlsDefaultQueueMap.end()) { - tlsDefaultQueueMap[tid] = createQueue(); + if (tlsDefaultQueueMap.find(tid) != tlsDefaultQueueMap.end()) { + result = tlsDefaultQueueMap[tid]; + } else { + tlsDefaultQueueMap_mutex.lock(); + if (tlsDefaultQueueMap.find(tid) == tlsDefaultQueueMap.end()) { + tlsDefaultQueueMap[tid] = createQueue(); + } + result = tlsDefaultQueueMap[tid]; + tlsDefaultQueueMap_mutex.unlock(); } - std::shared_ptr result = tlsDefaultQueueMap[tid]; - tlsDefaultQueueMap_mutex.unlock(); return result; #endif }