From 02ca5adedb3d42dfb35c980927ab61bd2c8ff905 Mon Sep 17 00:00:00 2001 From: Drew Lewis Date: Mon, 29 Sep 2025 19:48:28 +0000 Subject: [PATCH 1/4] Add a thread local cache infront of `findTag` to reduce lock contention. `findTag` spends ~90% of its time contending on the lock, this lowers that significantly, taking findTag from 3.9% of walltime to 0.8%. There will be a small amount of Tag* leakage until the threads are destroyed at which point the thread local caches will be automatically cleaned up. Signed-off-by: Drew Lewis --- include/sta/Search.hh | 4 +- search/Search.cc | 133 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 125 insertions(+), 12 deletions(-) diff --git a/include/sta/Search.hh b/include/sta/Search.hh index 934c1006..ee3c1ff9 100644 --- a/include/sta/Search.hh +++ b/include/sta/Search.hh @@ -72,6 +72,8 @@ typedef Vector VertexSlackMapSeq; typedef Vector WorstSlacksSeq; typedef std::vector DelayDblSeq; +class ThreadLocalCacheTagSet; + class Search : public StaState { public: @@ -627,7 +629,7 @@ protected: ClkInfoSet *clk_info_set_; std::mutex clk_info_lock_; // Use pointer to tag set so Tag.hh does not need to be included. - TagSet *tag_set_; + ThreadLocalCacheTagSet *tag_set_; // Entries in tags_ may be missing where previous filter tags were deleted. TagIndex tag_capacity_; std::atomic tags_; diff --git a/search/Search.cc b/search/Search.cc index 21d8e6d7..e6dfac32 100644 --- a/search/Search.cc +++ b/search/Search.cc @@ -25,6 +25,7 @@ #include "Search.hh" #include +#include #include // abs #include "Mutex.hh" @@ -71,6 +72,111 @@ namespace sta { +// A thread-local cache of tags wrapping a tag set. Goal is to make findKey +// lock free when possible. +class ThreadLocalCacheTagSet { + private: + TagSet tag_set_; + std::atomic clear_count_ = 0; + void* search_parent_ptr_ = nullptr; + + // Can be marked const because the object doesn't own the caches + TagSet& GetThreadLocalCache() const { + // Both thread-local variables are unique to each thread and live for the + // lifetime of the thread that created them. Because multiple Search + // objects execute on the same thread, we need to make the cache Search + // object specific for safety. + thread_local std::unordered_map tl_cache_map; + thread_local std::unordered_map tl_clear_count_map; + + auto& cache = tl_cache_map[search_parent_ptr_]; + auto& clear_count = tl_clear_count_map[search_parent_ptr_]; + + // To avoid needing to destroy threads and/or worrying about thread + // lifetimes we use a counter to ensure that any removals from the tag set + // trigger a clearing of the cache. This will prevent any stale data in + // the cache. + if (clear_count != clear_count_.load()) { + clear_count = clear_count_.load(); + cache.clear(); + } + return cache; + } + + void incrClearCount() { clear_count_.fetch_add(1); } + + public: + ThreadLocalCacheTagSet(uint64_t tagset_capacity, + const TagSet::hasher &hash, + const TagSet::key_equal &equal, + void* search_parent_ptr) + : tag_set_(tagset_capacity, hash, equal), + search_parent_ptr_(search_parent_ptr) {} + + // Lock free lookup in the thread-local cache. + Tag* findCachedTag(Tag* tag) const { + TagSet& cache = GetThreadLocalCache(); + auto it = cache.find(tag); + if (it != cache.end()) { + return *it; + } + return nullptr; + } + // + // Requires the Search to hold a lock + Tag* findKey(Tag* tag) const { + auto it = tag_set_.find(tag); + if (it == tag_set_.end()) { + return nullptr; + } + + // Insert into this thread's cache to speed up future lookups. + TagSet& cache = GetThreadLocalCache(); + cache.insert(*it); + return *it; + } + + void insert(Tag* tag) { + // Also insert the tag into the cache for this thread + TagSet& cache = GetThreadLocalCache(); + cache.insert(tag); + tag_set_.insert(tag); + } + + std::size_t size() const { return tag_set_.size(); } + + void deleteContentsClear() { + incrClearCount(); // clear caches if used again + tag_set_.deleteContentsClear(); + } + + template + std::vector eraseIf(Fn&& fn) { + static_assert( + std::is_invocable_r_v, + "eraseIf predicate must be a function taking a Tag* and returning a bool"); + std::vector to_delete; + for (auto it = tag_set_.begin(); it != tag_set_.end();) { + if (fn(*it)) { + to_delete.push_back(*it); + it = tag_set_.erase(it); + } else { + ++it; + } + } + + if (to_delete.size() > 0) { + incrClearCount(); // clear caches if used again + } + return to_delete; + } + + std::size_t bucket_count() const { return tag_set_.bucket_count(); } + std::size_t bucket_size(std::size_t n) const { + return tag_set_.bucket_size(n); + } +}; + using std::min; using std::max; using std::abs; @@ -235,7 +341,7 @@ Search::init(StaState *sta) arrival_iter_ = new BfsFwdIterator(BfsIndex::arrival, nullptr, sta); required_iter_ = new BfsBkwdIterator(BfsIndex::required, search_adj_, sta); tag_capacity_ = 128; - tag_set_ = new TagSet(tag_capacity_, TagHash(sta), TagEqual(sta)); + tag_set_ = new ThreadLocalCacheTagSet(tag_capacity_, TagHash(sta), TagEqual(sta), this); clk_info_set_ = new ClkInfoSet(ClkInfoLess(sta)); tag_next_ = 0; tags_ = new Tag*[tag_capacity_]; @@ -573,16 +679,16 @@ Search::deleteTagGroup(TagGroup *group) void Search::deleteFilterTags() { - for (TagIndex i = 0; i < tag_next_; i++) { - Tag *tag = tags_[i]; - if (tag - && (tag->isFilter() - || tag->clkInfo()->crprPathRefsFilter())) { - tags_[i] = nullptr; - tag_set_->erase(tag); - delete tag; - tag_free_indices_.push_back(i); + std::vector to_delete = + tag_set_->eraseIf([](Tag* tag) { return tag->isFilter(); }); + for (Tag* tag : to_delete) { + if (tag == nullptr) { + continue; } + + tags_[tag->index()] = nullptr; + delete tag; + tag_free_indices_.push_back(tag->index()); } } @@ -2978,8 +3084,13 @@ Search::findTag(const RiseFall *rf, { Tag probe(0, rf->index(), path_ap->index(), clk_info, is_clk, input_delay, is_segment_start, states, false, this); + Tag* tag = tag_set_->findCachedTag(&probe); + if (tag) { + return tag; + } + LockGuard lock(tag_lock_); - Tag *tag = tag_set_->findKey(&probe); + tag = tag_set_->findKey(&probe); if (tag == nullptr) { ExceptionStateSet *new_states = !own_states && states ? new ExceptionStateSet(*states) : states; From 4f6f75af9fa2d8745613c3e6a69457ee4326144f Mon Sep 17 00:00:00 2001 From: Drew Lewis Date: Mon, 29 Sep 2025 20:27:33 +0000 Subject: [PATCH 2/4] Fix a few things Signed-off-by: Drew Lewis --- search/Search.cc | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/search/Search.cc b/search/Search.cc index e6dfac32..e2d3c3f1 100644 --- a/search/Search.cc +++ b/search/Search.cc @@ -27,6 +27,7 @@ #include #include #include // abs +#include #include "Mutex.hh" #include "Report.hh" @@ -79,6 +80,7 @@ class ThreadLocalCacheTagSet { TagSet tag_set_; std::atomic clear_count_ = 0; void* search_parent_ptr_ = nullptr; + const StaState *sta_ = nullptr; // Can be marked const because the object doesn't own the caches TagSet& GetThreadLocalCache() const { @@ -89,7 +91,17 @@ class ThreadLocalCacheTagSet { thread_local std::unordered_map tl_cache_map; thread_local std::unordered_map tl_clear_count_map; - auto& cache = tl_cache_map[search_parent_ptr_]; + // Because TagSet is not default constructable we have to explicitly make + // one if it doesn't already exist + auto it = tl_cache_map.find(search_parent_ptr_); + if (it == tl_cache_map.end()){ + bool emplaced = false; + std::tie(it, emplaced) = tl_cache_map.emplace(search_parent_ptr_, + TagSet(/* bucket_count= */ 0, + TagSet::hasher(sta_), + TagSet::key_equal(sta_))); + } + auto& cache = it->second; auto& clear_count = tl_clear_count_map[search_parent_ptr_]; // To avoid needing to destroy threads and/or worrying about thread @@ -107,11 +119,11 @@ class ThreadLocalCacheTagSet { public: ThreadLocalCacheTagSet(uint64_t tagset_capacity, - const TagSet::hasher &hash, - const TagSet::key_equal &equal, + const StaState* sta, void* search_parent_ptr) - : tag_set_(tagset_capacity, hash, equal), - search_parent_ptr_(search_parent_ptr) {} + : tag_set_(tagset_capacity, TagSet::hasher(sta), TagSet::key_equal(sta)), + search_parent_ptr_(search_parent_ptr), + sta_(sta) {} // Lock free lookup in the thread-local cache. Tag* findCachedTag(Tag* tag) const { @@ -341,7 +353,7 @@ Search::init(StaState *sta) arrival_iter_ = new BfsFwdIterator(BfsIndex::arrival, nullptr, sta); required_iter_ = new BfsBkwdIterator(BfsIndex::required, search_adj_, sta); tag_capacity_ = 128; - tag_set_ = new ThreadLocalCacheTagSet(tag_capacity_, TagHash(sta), TagEqual(sta), this); + tag_set_ = new ThreadLocalCacheTagSet(tag_capacity_, sta, this); clk_info_set_ = new ClkInfoSet(ClkInfoLess(sta)); tag_next_ = 0; tags_ = new Tag*[tag_capacity_]; @@ -686,9 +698,10 @@ Search::deleteFilterTags() continue; } - tags_[tag->index()] = nullptr; + auto index = tag->index(); + tags_[index] = nullptr; delete tag; - tag_free_indices_.push_back(tag->index()); + tag_free_indices_.push_back(index); } } @@ -3119,7 +3132,6 @@ Search::findTag(const RiseFall *rf, tags_prev_.push_back(tags_); tags_ = tags; tag_capacity_ = tag_capacity; - tag_set_->reserve(tag_capacity); } if (tag_next_ == tag_index_max) report_->critical(1511, "max tag index exceeded"); From a1ac83119c54bbda6ffaa2700eb636703addeca6 Mon Sep 17 00:00:00 2001 From: Drew Lewis Date: Tue, 30 Sep 2025 14:45:46 +0000 Subject: [PATCH 3/4] Format file Signed-off-by: Drew Lewis --- search/Search.cc | 105 ++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 42 deletions(-) diff --git a/search/Search.cc b/search/Search.cc index e2d3c3f1..48c4e979 100644 --- a/search/Search.cc +++ b/search/Search.cc @@ -74,35 +74,37 @@ namespace sta { // A thread-local cache of tags wrapping a tag set. Goal is to make findKey -// lock free when possible. -class ThreadLocalCacheTagSet { - private: +// lock free when possible. +class ThreadLocalCacheTagSet +{ +private: TagSet tag_set_; std::atomic clear_count_ = 0; - void* search_parent_ptr_ = nullptr; + void *search_parent_ptr_ = nullptr; const StaState *sta_ = nullptr; // Can be marked const because the object doesn't own the caches - TagSet& GetThreadLocalCache() const { + TagSet &GetThreadLocalCache() const + { // Both thread-local variables are unique to each thread and live for the // lifetime of the thread that created them. Because multiple Search // objects execute on the same thread, we need to make the cache Search // object specific for safety. - thread_local std::unordered_map tl_cache_map; - thread_local std::unordered_map tl_clear_count_map; + thread_local std::unordered_map tl_cache_map; + thread_local std::unordered_map tl_clear_count_map; // Because TagSet is not default constructable we have to explicitly make // one if it doesn't already exist auto it = tl_cache_map.find(search_parent_ptr_); - if (it == tl_cache_map.end()){ + if (it == tl_cache_map.end()) { bool emplaced = false; - std::tie(it, emplaced) = tl_cache_map.emplace(search_parent_ptr_, - TagSet(/* bucket_count= */ 0, - TagSet::hasher(sta_), - TagSet::key_equal(sta_))); + std::tie(it, emplaced) = tl_cache_map.emplace(search_parent_ptr_, + TagSet(/* bucket_count= */ 0, + TagSet::hasher(sta_), + TagSet::key_equal(sta_))); } - auto& cache = it->second; - auto& clear_count = tl_clear_count_map[search_parent_ptr_]; + auto &cache = it->second; + auto &clear_count = tl_clear_count_map[search_parent_ptr_]; // To avoid needing to destroy threads and/or worrying about thread // lifetimes we use a counter to ensure that any removals from the tag set @@ -115,76 +117,95 @@ class ThreadLocalCacheTagSet { return cache; } - void incrClearCount() { clear_count_.fetch_add(1); } + void + incrClearCount() { clear_count_.fetch_add(1); } - public: - ThreadLocalCacheTagSet(uint64_t tagset_capacity, - const StaState* sta, - void* search_parent_ptr) - : tag_set_(tagset_capacity, TagSet::hasher(sta), TagSet::key_equal(sta)), - search_parent_ptr_(search_parent_ptr), - sta_(sta) {} +public: + ThreadLocalCacheTagSet(uint64_t tagset_capacity, + const StaState *sta, + void *search_parent_ptr) : tag_set_(tagset_capacity, + TagSet::hasher(sta), + TagSet::key_equal(sta)), + search_parent_ptr_(search_parent_ptr), + sta_(sta) + { + } // Lock free lookup in the thread-local cache. - Tag* findCachedTag(Tag* tag) const { - TagSet& cache = GetThreadLocalCache(); + Tag * + findCachedTag(Tag *tag) const + { + TagSet &cache = GetThreadLocalCache(); auto it = cache.find(tag); if (it != cache.end()) { return *it; } return nullptr; } - // + // Requires the Search to hold a lock - Tag* findKey(Tag* tag) const { + Tag * + findKey(Tag *tag) const + { auto it = tag_set_.find(tag); if (it == tag_set_.end()) { return nullptr; } // Insert into this thread's cache to speed up future lookups. - TagSet& cache = GetThreadLocalCache(); + TagSet &cache = GetThreadLocalCache(); cache.insert(*it); return *it; } - void insert(Tag* tag) { + void + insert(Tag *tag) + { // Also insert the tag into the cache for this thread - TagSet& cache = GetThreadLocalCache(); + TagSet &cache = GetThreadLocalCache(); cache.insert(tag); tag_set_.insert(tag); } std::size_t size() const { return tag_set_.size(); } - void deleteContentsClear() { - incrClearCount(); // clear caches if used again + void + deleteContentsClear() + { + incrClearCount(); // clear caches if used again tag_set_.deleteContentsClear(); } template - std::vector eraseIf(Fn&& fn) { + std::vector + eraseIf(Fn &&fn) + { static_assert( - std::is_invocable_r_v, - "eraseIf predicate must be a function taking a Tag* and returning a bool"); - std::vector to_delete; + std::is_invocable_r_v, + "eraseIf predicate must be a function taking a Tag* and returning a bool"); + + std::vector to_delete; for (auto it = tag_set_.begin(); it != tag_set_.end();) { if (fn(*it)) { to_delete.push_back(*it); it = tag_set_.erase(it); - } else { + } + else { ++it; } } if (to_delete.size() > 0) { - incrClearCount(); // clear caches if used again + incrClearCount(); // clear caches if used again } return to_delete; } std::size_t bucket_count() const { return tag_set_.bucket_count(); } - std::size_t bucket_size(std::size_t n) const { + + std::size_t + bucket_size(std::size_t n) const + { return tag_set_.bucket_size(n); } }; @@ -691,9 +712,9 @@ Search::deleteTagGroup(TagGroup *group) void Search::deleteFilterTags() { - std::vector to_delete = - tag_set_->eraseIf([](Tag* tag) { return tag->isFilter(); }); - for (Tag* tag : to_delete) { + std::vector to_delete = + tag_set_->eraseIf([](Tag *tag) { return tag->isFilter(); }); + for (Tag *tag : to_delete) { if (tag == nullptr) { continue; } @@ -3097,7 +3118,7 @@ Search::findTag(const RiseFall *rf, { Tag probe(0, rf->index(), path_ap->index(), clk_info, is_clk, input_delay, is_segment_start, states, false, this); - Tag* tag = tag_set_->findCachedTag(&probe); + Tag *tag = tag_set_->findCachedTag(&probe); if (tag) { return tag; } From 835092fd3f67ef8dcbd4e99b39f166f52c153832 Mon Sep 17 00:00:00 2001 From: Drew Lewis Date: Tue, 30 Sep 2025 14:58:49 +0000 Subject: [PATCH 4/4] Update format Signed-off-by: Drew Lewis --- search/Search.cc | 175 ++++++++++++++++++++++++++--------------------- 1 file changed, 97 insertions(+), 78 deletions(-) diff --git a/search/Search.cc b/search/Search.cc index 48c4e979..4415cd7d 100644 --- a/search/Search.cc +++ b/search/Search.cc @@ -73,141 +73,160 @@ namespace sta { -// A thread-local cache of tags wrapping a tag set. Goal is to make findKey -// lock free when possible. +// A thread-local cache of tags wrapping a tag set. +// The goal is to make findKey lock free when possible. class ThreadLocalCacheTagSet { -private: - TagSet tag_set_; - std::atomic clear_count_ = 0; - void *search_parent_ptr_ = nullptr; - const StaState *sta_ = nullptr; - - // Can be marked const because the object doesn't own the caches - TagSet &GetThreadLocalCache() const - { - // Both thread-local variables are unique to each thread and live for the - // lifetime of the thread that created them. Because multiple Search - // objects execute on the same thread, we need to make the cache Search - // object specific for safety. - thread_local std::unordered_map tl_cache_map; - thread_local std::unordered_map tl_clear_count_map; - - // Because TagSet is not default constructable we have to explicitly make - // one if it doesn't already exist - auto it = tl_cache_map.find(search_parent_ptr_); - if (it == tl_cache_map.end()) { - bool emplaced = false; - std::tie(it, emplaced) = tl_cache_map.emplace(search_parent_ptr_, - TagSet(/* bucket_count= */ 0, - TagSet::hasher(sta_), - TagSet::key_equal(sta_))); - } - auto &cache = it->second; - auto &clear_count = tl_clear_count_map[search_parent_ptr_]; - - // To avoid needing to destroy threads and/or worrying about thread - // lifetimes we use a counter to ensure that any removals from the tag set - // trigger a clearing of the cache. This will prevent any stale data in - // the cache. - if (clear_count != clear_count_.load()) { - clear_count = clear_count_.load(); - cache.clear(); - } - return cache; - } - - void - incrClearCount() { clear_count_.fetch_add(1); } - public: ThreadLocalCacheTagSet(uint64_t tagset_capacity, - const StaState *sta, - void *search_parent_ptr) : tag_set_(tagset_capacity, - TagSet::hasher(sta), - TagSet::key_equal(sta)), - search_parent_ptr_(search_parent_ptr), - sta_(sta) + const StaState* sta, + void* search_parent_ptr) : + tag_set_(tagset_capacity, + TagSet::hasher(sta), + TagSet::key_equal(sta)), + search_parent_ptr_(search_parent_ptr), + sta_(sta) { } // Lock free lookup in the thread-local cache. - Tag * - findCachedTag(Tag *tag) const + Tag* + findCachedTag(Tag* tag) const { - TagSet &cache = GetThreadLocalCache(); + TagSet& cache = getThreadLocalCache(); auto it = cache.find(tag); - if (it != cache.end()) { + if (it != cache.end()) return *it; - } return nullptr; } - // Requires the Search to hold a lock - Tag * - findKey(Tag *tag) const + // Requires the Search to hold a lock. + Tag* + findKey(Tag* tag) const { auto it = tag_set_.find(tag); - if (it == tag_set_.end()) { + if (it == tag_set_.end()) return nullptr; - } // Insert into this thread's cache to speed up future lookups. - TagSet &cache = GetThreadLocalCache(); + TagSet& cache = getThreadLocalCache(); cache.insert(*it); return *it; } void - insert(Tag *tag) + insert(Tag* tag) { - // Also insert the tag into the cache for this thread - TagSet &cache = GetThreadLocalCache(); + // Also insert the tag into the cache for this thread. + TagSet& cache = getThreadLocalCache(); cache.insert(tag); tag_set_.insert(tag); } - std::size_t size() const { return tag_set_.size(); } + std::size_t + size() const + { + return tag_set_.size(); + } void deleteContentsClear() { - incrClearCount(); // clear caches if used again + incrClearCount(); // Clear caches if used again. tag_set_.deleteContentsClear(); } template - std::vector - eraseIf(Fn &&fn) + std::vector + eraseIf(Fn&& fn) { static_assert( - std::is_invocable_r_v, - "eraseIf predicate must be a function taking a Tag* and returning a bool"); + std::is_invocable_r_v, + "EraseIf predicate must be a function taking a Tag* and returning a bool"); - std::vector to_delete; + std::vector to_delete; for (auto it = tag_set_.begin(); it != tag_set_.end();) { if (fn(*it)) { to_delete.push_back(*it); it = tag_set_.erase(it); - } - else { + } else { ++it; } } - if (to_delete.size() > 0) { - incrClearCount(); // clear caches if used again - } + if (to_delete.size() > 0) + incrClearCount(); // Clear caches if used again. return to_delete; } - std::size_t bucket_count() const { return tag_set_.bucket_count(); } + std::size_t + bucket_count() const + { + return tag_set_.bucket_count(); + } std::size_t bucket_size(std::size_t n) const { return tag_set_.bucket_size(n); } + +private: + TagSet tag_set_; + std::atomic clear_count_ = 0; + void* search_parent_ptr_ = nullptr; + const StaState* sta_ = nullptr; + + // Can be marked const because the object doesn't own the caches. + TagSet& + getThreadLocalCache() const + { + // Both thread-local variables are unique to each thread and live for the + // lifetime of the thread that created them. Because multiple Search + // objects execute on the same thread, we need to make the cache Search + // object specific for safety. + thread_local std::unordered_map tl_cache_map; + thread_local std::unordered_map tl_clear_count_map; + + // Because TagSet is not default constructable we have to explicitly make + // one if it doesn't already exist. + auto cache_it = tl_cache_map.find(search_parent_ptr_); + if (cache_it == tl_cache_map.end()) { + bool emplaced = false; + std::tie(cache_it, emplaced) + = tl_cache_map.emplace(search_parent_ptr_, + TagSet(/* bucket_count= */ 0, + TagSet::hasher(sta_), + TagSet::key_equal(sta_))); + } + + // Find/emplace the clear count for this parent pointer. + auto clear_count_it = tl_clear_count_map.find(search_parent_ptr_); + if (clear_count_it == tl_clear_count_map.end()) { + bool emplaced = false; + std::tie(clear_count_it, emplaced) + = tl_clear_count_map.emplace(search_parent_ptr_, 0); + } + + auto& cache = cache_it->second; + auto& clear_count = clear_count_it->second; + + // To avoid needing to destroy threads and/or worrying about thread + // lifetimes we use a counter to ensure that any removals from the tag set + // trigger a clearing of the cache. This will prevent any stale data in + // the cache. + if (clear_count != clear_count_.load()) { + clear_count = clear_count_.load(); + cache.clear(); + } + return cache; + } + + void + incrClearCount() + { + clear_count_.fetch_add(1); + } }; using std::min;