Skip to content

Commit

Permalink
stats: fix scope cache consistency issue when scope memory is recycled (
Browse files Browse the repository at this point in the history
…#3253)

The memory allocator can reuse a memory address immediately, leading to
a situation in which TLS references are used for an already destroyed
scope. Instead, an incrementing ID is used as the cache key.

Fixes #3214

Signed-off-by: Matt Klein <mklein@lyft.com>
  • Loading branch information
mattklein123 authored May 1, 2018
1 parent d338e45 commit 02fb980
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 14 deletions.
17 changes: 11 additions & 6 deletions source/common/stats/thread_local_store.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,21 +83,22 @@ void ThreadLocalStoreImpl::releaseScopeCrossThread(ScopeImpl* scope) {
// This can happen from any thread. We post() back to the main thread which will initiate the
// cache flush operation.
if (!shutting_down_ && main_thread_dispatcher_) {
main_thread_dispatcher_->post([this, scope]() -> void { clearScopeFromCaches(scope); });
main_thread_dispatcher_->post(
[ this, scope_id = scope->scope_id_ ]()->void { clearScopeFromCaches(scope_id); });
}
}

// Extracts tags from a stat name by delegating to the configured tag producer.
// NOTE(review): per the TagProducer contract this presumably appends extracted
// tags to `tags` and returns the (possibly tag-stripped) name to use for the
// stat — confirm against the TagProducer interface definition.
std::string ThreadLocalStoreImpl::getTagsForName(const std::string& name, std::vector<Tag>& tags) {
return tag_producer_->produceTags(name, tags);
}

void ThreadLocalStoreImpl::clearScopeFromCaches(ScopeImpl* scope) {
void ThreadLocalStoreImpl::clearScopeFromCaches(uint64_t scope_id) {
// If we are shutting down we no longer perform cache flushes as workers may be shutting down
// at the same time.
if (!shutting_down_) {
// Perform a cache flush on all threads.
tls_->runOnAllThreads(
[this, scope]() -> void { tls_->getTyped<TlsCache>().scope_cache_.erase(scope); });
[this, scope_id]() -> void { tls_->getTyped<TlsCache>().scope_cache_.erase(scope_id); });
}
}

Expand All @@ -114,6 +115,8 @@ ThreadLocalStoreImpl::SafeAllocData ThreadLocalStoreImpl::safeAlloc(const std::s
}
}

std::atomic<uint64_t> ThreadLocalStoreImpl::ScopeImpl::next_scope_id_;

// Scope teardown: notify the parent store so it can schedule a cross-thread
// flush of this scope's cached entries (releaseScopeCrossThread posts the
// flush to the main thread, keyed by scope_id_ rather than this pointer so a
// recycled address cannot alias a new scope's cache).
ThreadLocalStoreImpl::ScopeImpl::~ScopeImpl() { parent_.releaseScopeCrossThread(this); }

Counter& ThreadLocalStoreImpl::ScopeImpl::counter(const std::string& name) {
Expand All @@ -125,7 +128,8 @@ Counter& ThreadLocalStoreImpl::ScopeImpl::counter(const std::string& name) {
// is no cache entry.
CounterSharedPtr* tls_ref = nullptr;
if (!parent_.shutting_down_ && parent_.tls_) {
tls_ref = &parent_.tls_->getTyped<TlsCache>().scope_cache_[this].counters_[final_name];
tls_ref =
&parent_.tls_->getTyped<TlsCache>().scope_cache_[this->scope_id_].counters_[final_name];
}

// If we have a valid cache entry, return it.
Expand Down Expand Up @@ -176,7 +180,7 @@ Gauge& ThreadLocalStoreImpl::ScopeImpl::gauge(const std::string& name) {
std::string final_name = prefix_ + name;
GaugeSharedPtr* tls_ref = nullptr;
if (!parent_.shutting_down_ && parent_.tls_) {
tls_ref = &parent_.tls_->getTyped<TlsCache>().scope_cache_[this].gauges_[final_name];
tls_ref = &parent_.tls_->getTyped<TlsCache>().scope_cache_[this->scope_id_].gauges_[final_name];
}

if (tls_ref && *tls_ref) {
Expand Down Expand Up @@ -206,7 +210,8 @@ Histogram& ThreadLocalStoreImpl::ScopeImpl::histogram(const std::string& name) {
std::string final_name = prefix_ + name;
HistogramSharedPtr* tls_ref = nullptr;
if (!parent_.shutting_down_ && parent_.tls_) {
tls_ref = &parent_.tls_->getTyped<TlsCache>().scope_cache_[this].histograms_[final_name];
tls_ref =
&parent_.tls_->getTyped<TlsCache>().scope_cache_[this->scope_id_].histograms_[final_name];
}

if (tls_ref && *tls_ref) {
Expand Down
27 changes: 19 additions & 8 deletions source/common/stats/thread_local_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ namespace Stats {
* Store implementation with thread local caching. This implementation supports the following
* features:
* - Thread local per scope stat caching.
* - Overallaping scopes with proper reference counting (2 scopes with the same name will point to
* - Overlapping scopes with proper reference counting (2 scopes with the same name will point to
* the same backing stats).
* - Scope deletion.
* - Lockless in the fast path.
*
* This implementation is complicated so here is a rough overview of the threading model.
* - The store can be used before threading is initialized. This is needed during server init.
Expand All @@ -34,10 +35,9 @@ namespace Stats {
* - Scopes are entirely owned by the caller. The store only keeps weak pointers.
* - When a scope is destroyed, a cache flush operation is run on all threads to flush any cached
* data owned by the destroyed scope.
* - NOTE: It is theoretically possible that when a scope is deleted, it could be reallocated
* with the same address, and a cache flush operation could race and delete cache data
* for the new scope. This is extremely unlikely, and if it happens the cache will be
* repopulated on the next access.
* - Scopes use a unique incrementing ID for the cache key. This ensures that if a new scope is
 *   created at the same address as a recently deleted scope, cache references will not accidentally
* reference the old scope which may be about to be cache flushed.
* - Since it's possible to have overlapping scopes, we de-dup stats when counters() or gauges() is
* called since these are very uncommon operations.
* - Though this implementation is designed to work with a fixed shared memory space, it will fall
Expand Down Expand Up @@ -83,7 +83,8 @@ class ThreadLocalStoreImpl : public StoreRoot {

struct ScopeImpl : public Scope {
ScopeImpl(ThreadLocalStoreImpl& parent, const std::string& prefix)
: parent_(parent), prefix_(Utility::sanitizeStatsName(prefix)) {}
: scope_id_(next_scope_id_++), parent_(parent),
prefix_(Utility::sanitizeStatsName(prefix)) {}
~ScopeImpl();

// Stats::Scope
Expand All @@ -95,13 +96,23 @@ class ThreadLocalStoreImpl : public StoreRoot {
Gauge& gauge(const std::string& name) override;
Histogram& histogram(const std::string& name) override;

static std::atomic<uint64_t> next_scope_id_;

const uint64_t scope_id_;
ThreadLocalStoreImpl& parent_;
const std::string prefix_;
TlsCacheEntry central_cache_;
};

struct TlsCache : public ThreadLocal::ThreadLocalObject {
std::unordered_map<ScopeImpl*, TlsCacheEntry> scope_cache_;
// The TLS scope cache is keyed by scope ID. This is used to avoid complex circular references
// during scope destruction. An ID is required vs. using the address of the scope pointer
    // because it's possible that the memory allocator will recycle the scope pointer immediately
// upon destruction, leading to a situation in which a new scope with the same address is used
// to reference the cache, and then subsequently cache flushed, leaving nothing in the central
// store. See the overview for more information. This complexity is required for lockless
// operation in the fast path.
std::unordered_map<uint64_t, TlsCacheEntry> scope_cache_;
};

struct SafeAllocData {
Expand All @@ -110,7 +121,7 @@ class ThreadLocalStoreImpl : public StoreRoot {
};

std::string getTagsForName(const std::string& name, std::vector<Tag>& tags);
void clearScopeFromCaches(ScopeImpl* scope);
void clearScopeFromCaches(uint64_t scope_id);
void releaseScopeCrossThread(ScopeImpl* scope);
SafeAllocData safeAlloc(const std::string& name);

Expand Down

0 comments on commit 02fb980

Please sign in to comment.