diff --git a/MultiTierDataMovement.md b/MultiTierDataMovement.md
new file mode 100644
index 0000000000..888db672f6
--- /dev/null
+++ b/MultiTierDataMovement.md
@@ -0,0 +1,119 @@
+# Background Data Movement
+
+To reduce the number of online evictions and to support asynchronous
+promotion, we have added two periodic workers that handle eviction and
+promotion in the background.
+
+The diagram below shows a simplified view of how the background evictor
+thread (green) is integrated into the CacheLib architecture.
+

+![BackgroundEvictor](cachelib-background-evictor.png)
+
+## Synchronous Eviction and Promotion
+
+- `disableEvictionToMemory`: Disables eviction to memory (the item is always
+evicted to NVMe or removed on eviction).
+
+## Background Evictors
+
+The background evictors scan each class to see if there are objects to move to
+the next (higher) tier using a given strategy. Here we document the parameters
+for the different strategies as well as the general parameters.
+
+- `backgroundEvictorIntervalMilSec`: The interval at which this thread runs.
+By default, the background evictor threads wake up every 10 ms to scan the
+AllocationClasses. In addition, a background evictor thread is woken up every
+time an allocation fails (from a request handling thread) and the current
+percentage of free memory for the AllocationClass is lower than
+`lowEvictionAcWatermark`. This can make the interval parameter less important
+when many allocations are occurring from request handling threads.
+
+- `evictorThreads`: The number of background evictors to run. Each thread is
+assigned a set of AllocationClasses to scan and evict objects from. Currently,
+each thread gets an equal number of classes to scan, but as the object size
+distribution may be unequal, future versions will attempt to balance the
+classes among threads. The range is 1 to the number of AllocationClasses.
+The default is 1.
+
+- `maxEvictionBatch`: The number of objects to remove in a given eviction
+call. The default is 40; the lower bound is 10 and the upper bound is 1000.
+If it is too low, we might not remove objects at a reasonable rate; if it is
+too high, we hold the locks for copying data between tiers for too long.
+
+- `minEvictionBatch`: Minimum number of items to evict at any time (if there
+are any candidates).
+
+- `maxEvictionPromotionHotness`: Maximum number of candidates to consider for
+eviction. This is similar to `maxEvictionBatch`, but it specifies how many
+candidates will be taken into consideration, not the actual number of items
+to evict. This option can be used to bound the duration of the critical
+section on the LRU lock.
+
+### FreeThresholdStrategy (default)
+
+- `lowEvictionAcWatermark`: Triggers the background eviction thread to run
+when the percentage of free memory in the AllocationClass drops below this
+value. The default is `2.0`; to avoid wasting capacity we don't set this
+above `10.0`.
+
+- `highEvictionAcWatermark`: Stops eviction from an AllocationClass once this
+percentage of the AllocationClass is free. The default is `5.0`; to avoid
+wasting capacity we don't set this above `10.0`.
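+
+For example, both workers can be enabled through `CacheAllocatorConfig`. This
+is a minimal sketch; the `FreeThresholdStrategy` constructor arguments shown
+here are assumed to mirror the watermark and batch parameters documented
+above, so check the strategy header for the exact signature:
+
+```cpp
+#include <chrono>
+#include <memory>
+
+#include "cachelib/allocator/CacheAllocator.h"
+#include "cachelib/allocator/FreeThresholdStrategy.h"
+
+using Cache = facebook::cachelib::LruAllocator;
+
+Cache::Config makeConfig() {
+  Cache::Config config;
+  // Assumed argument order: low/high eviction watermarks, then max/min batch.
+  auto strategy = std::make_shared<facebook::cachelib::FreeThresholdStrategy>(
+      2.0 /* lowEvictionAcWatermark */, 5.0 /* highEvictionAcWatermark */,
+      40 /* maxEvictionBatch */, 10 /* minEvictionBatch */);
+  // Wake the background workers every 10 ms, with one thread each.
+  config.enableBackgroundEvictor(strategy, std::chrono::milliseconds{10}, 1);
+  config.enableBackgroundPromoter(strategy, std::chrono::milliseconds{10}, 1);
+  return config;
+}
+```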
+
+## Background Promoters
+
+The background promoters scan each class to see if there are objects to move
+to a lower tier using a given strategy. Here we document the parameters for
+the different strategies as well as the general parameters.
+
+- `backgroundPromoterIntervalMilSec`: The interval at which this thread runs.
+By default, the background promoter threads wake up every 10 ms to scan the
+AllocationClasses for objects to promote.
+
+- `promoterThreads`: The number of background promoters to run. Each thread is
+assigned a set of AllocationClasses to scan and promote objects from.
+Currently, each thread gets an equal number of classes to scan, but as the
+object size distribution may be unequal, future versions will attempt to
+balance the classes among threads. The range is `1` to the number of
+AllocationClasses. The default is `1`.
+
+- `maxPromotionBatch`: The number of objects to promote in a given promotion
+call. The default is 40; the lower bound is 10 and the upper bound is 1000.
+If it is too low, we might not promote objects at a reasonable rate; if it is
+too high, we hold the locks for copying data between tiers for too long.
+
+- `minPromotionBatch`: Minimum number of items to promote at any time (if
+there are any candidates).
+
+- `numDuplicateElements`: This allows us to promote items that have
+outstanding (read-only) handles, since we won't need to modify the data while
+a user still holds it. As a result, the data may reside in both tiers for a
+short time, until the item is evicted from its original tier. The default (0)
+disallows this; setting the value to 100 enables duplicate elements in tiers.
+
+### Background Promotion Strategy (only one currently)
+
+- `promotionAcWatermark`: Promote items when at least this percentage of the
+AllocationClass is free. The promotion thread will then attempt to move
+`maxPromotionBatch` objects to that tier. The objects are chosen from the
+head of the LRU. The default is `4.0`. This value should correlate with
+`lowEvictionAcWatermark`, `highEvictionAcWatermark`,
+`minAcAllocationWatermark`, and `maxAcAllocationWatermark`.
+- `maxPromotionBatch`: The number of objects to promote in a batch during
+background promotion. Analogous to `maxEvictionBatch`. Its value should be
+lower, to decrease contention on hot items.
+
+## Allocation policies
+
+- `maxAcAllocationWatermark`: An item is always allocated in the topmost tier
+if at least this percentage of the AllocationClass is free.
+- `minAcAllocationWatermark`: An item is always allocated in the bottom tier
+if no more than this percentage of the AllocationClass is free. If the
+percentage of free memory is between `minAcAllocationWatermark` and
+`maxAcAllocationWatermark`, extra checks (described below) are performed to
+decide where to put the element. For example, with the illustrative values
+`minAcAllocationWatermark = 2.0` and `maxAcAllocationWatermark = 5.0`, an
+item is allocated in the top tier when at least 5% of its AllocationClass is
+free, in the bottom tier when at most 2% is free, and the extra checks decide
+in between.
+
+By default, allocation will always be performed from the upper tier.
+
+- `acTopTierEvictionWatermark`: If there is less than this percentage of free
+memory in the topmost tier, cachelib will attempt to evict from the top tier.
+This option takes precedence over the allocation watermarks.
+
+### Extra policies
+
+These are used only when the percentage of free memory is between
+`minAcAllocationWatermark` and `maxAcAllocationWatermark`.
+
+- `sizeThresholdPolicy`: If an item is smaller than this value, always
+allocate it in the upper tier.
+- `defaultTierChancePercentage`: Chance (0-100%) of allocating the item in
+the top tier.
+
+## MMContainer options
+
+- `lruInsertionPointSpec`: Can be set per tier when LRU2Q is used. Determines
+where new items are inserted: 0 = insert to hot queue, 1 = insert to warm
+queue, 2 = insert to cold queue.
+- `markUsefulChance`: Per-tier; determines the chance of moving an item to
+the head of the LRU on access.
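+
+To observe the effect of these settings, this patch also exposes aggregate
+counters on the allocator. A minimal sketch, continuing the configuration
+example above (the per-class map key/value types are assumed from the worker
+headers below):
+
+```cpp
+#include <iostream>
+
+// `cache` is a running Cache instance with background workers enabled.
+void printBackgroundMoverStats(const Cache& cache) {
+  auto evStats = cache.getBackgroundEvictorStats();
+  std::cout << "evicted in background: " << evStats.numEvictedItems
+            << ", traversals: " << evStats.runCount << '\n';
+  // Per-class eviction counts, aggregated across all evictor threads.
+  for (const auto& [cid, count] : cache.getBackgroundEvictorClassStats()) {
+    std::cout << "class " << static_cast<int>(cid) << ": " << count
+              << " evictions\n";
+  }
+}
+```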
diff --git a/cachelib-background-evictor.png b/cachelib-background-evictor.png
new file mode 100644
index 0000000000..08869029c2
Binary files /dev/null and b/cachelib-background-evictor.png differ
diff --git a/cachelib/allocator/BackgroundEvictor-inl.h b/cachelib/allocator/BackgroundEvictor-inl.h
new file mode 100644
index 0000000000..3221a45f32
--- /dev/null
+++ b/cachelib/allocator/BackgroundEvictor-inl.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace facebook {
+namespace cachelib {
+
+template <typename CacheT>
+BackgroundEvictor<CacheT>::BackgroundEvictor(
+    Cache& cache, std::shared_ptr<BackgroundEvictorStrategy> strategy)
+    : cache_(cache), strategy_(strategy) {}
+
+template <typename CacheT>
+BackgroundEvictor<CacheT>::~BackgroundEvictor() {
+  stop(std::chrono::seconds(0));
+}
+
+template <typename CacheT>
+void BackgroundEvictor<CacheT>::work() {
+  try {
+    checkAndRun();
+  } catch (const std::exception& ex) {
+    XLOGF(ERR, "BackgroundEvictor interrupted due to exception: {}",
+          ex.what());
+  }
+}
+
+template <typename CacheT>
+void BackgroundEvictor<CacheT>::setAssignedMemory(
+    std::vector<std::tuple<TierId, PoolId, ClassId>>&& assignedMemory) {
+  XLOG(INFO, "Classes assigned to background worker:");
+  for (auto [tid, pid, cid] : assignedMemory) {
+    XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid);
+  }
+
+  mutex.lock_combine([this, &assignedMemory] {
+    this->assignedMemory_ = std::move(assignedMemory);
+  });
+}
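+
+// checkAndRun() is the core loop of the worker: it snapshots the assigned
+// (tier, pool, class) triples under the mutex, asks the strategy for a
+// per-class batch size, and then tries to evict up to `batch` items from
+// each class, accumulating the per-class and global counters exposed via
+// getStats() and getClassStats().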
+template <typename CacheT>
+void BackgroundEvictor<CacheT>::checkAndRun() {
+  auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; });
+
+  unsigned int evictions = 0;
+  std::set<ClassId> classes{};
+  auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory);
+
+  for (size_t i = 0; i < batches.size(); i++) {
+    const auto [tid, pid, cid] = assignedMemory[i];
+    const auto batch = batches[i];
+
+    classes.insert(cid);
+    const auto& mpStats = cache_.getPoolByTid(pid, tid).getStats();
+
+    if (!batch) {
+      continue;
+    }
+
+    stats.evictionSize.add(batch * mpStats.acStats.at(cid).allocSize);
+
+    // try evicting BATCH items from the class in order to reach the free
+    // target
+    auto evicted = BackgroundEvictorAPIWrapper<CacheT>::traverseAndEvictItems(
+        cache_, tid, pid, cid, batch);
+    evictions += evicted;
+
+    auto it = evictions_per_class_.find(cid);
+    if (it != evictions_per_class_.end()) {
+      it->second += evicted;
+    } else if (evicted > 0) {
+      evictions_per_class_[cid] = evicted;
+    }
+  }
+
+  stats.numTraversals.inc();
+  stats.numEvictedItems.add(evictions);
+  stats.totalClasses.add(classes.size());
+}
+
+template <typename CacheT>
+BackgroundEvictionStats BackgroundEvictor<CacheT>::getStats() const noexcept {
+  BackgroundEvictionStats evicStats;
+  evicStats.numEvictedItems = stats.numEvictedItems.get();
+  evicStats.runCount = stats.numTraversals.get();
+  evicStats.evictionSize = stats.evictionSize.get();
+  evicStats.totalClasses = stats.totalClasses.get();
+
+  return evicStats;
+}
+
+template <typename CacheT>
+std::map<ClassId, uint64_t> BackgroundEvictor<CacheT>::getClassStats()
+    const noexcept {
+  return evictions_per_class_;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundEvictor.h b/cachelib/allocator/BackgroundEvictor.h
new file mode 100644
index 0000000000..3b2886a3ae
--- /dev/null
+++ b/cachelib/allocator/BackgroundEvictor.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/logging/xlog.h>
+#include <folly/synchronization/DistributedMutex.h>
+
+#include "cachelib/allocator/BackgroundEvictorStrategy.h"
+#include "cachelib/allocator/CacheStats.h"
+#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/PeriodicWorker.h"
+
+namespace facebook {
+namespace cachelib {
+
+// wrapper that exposes the private APIs of CacheType that are specifically
+// needed for the eviction.
+template <typename C>
+struct BackgroundEvictorAPIWrapper {
+  static size_t traverseAndEvictItems(C& cache,
+                                      unsigned int tid,
+                                      unsigned int pid,
+                                      unsigned int cid,
+                                      size_t batch) {
+    return cache.traverseAndEvictItems(tid, pid, cid, batch);
+  }
+};
+
+struct BackgroundEvictorStats {
+  // items evicted
+  AtomicCounter numEvictedItems{0};
+
+  // traversals
+  AtomicCounter numTraversals{0};
+
+  // total number of classes traversed
+  AtomicCounter totalClasses{0};
+
+  // total bytes of evicted items
+  AtomicCounter evictionSize{0};
+};
+
+// Periodic worker that evicts items from tiers in batches.
+// The primary aim is to reduce insertion times for new items in the cache.
+template <typename CacheT>
+class BackgroundEvictor : public PeriodicWorker {
+ public:
+  using Cache = CacheT;
+  // @param cache    the cache interface
+  // @param strategy the strategy that computes per-class eviction batch sizes
+  BackgroundEvictor(Cache& cache,
+                    std::shared_ptr<BackgroundEvictorStrategy> strategy);
+
+  ~BackgroundEvictor() override;
+
+  BackgroundEvictionStats getStats() const noexcept;
+  std::map<ClassId, uint64_t> getClassStats() const noexcept;
+
+  void setAssignedMemory(
+      std::vector<std::tuple<TierId, PoolId, ClassId>>&& assignedMemory);
+
+ private:
+  std::map<ClassId, uint64_t> evictions_per_class_;
+
+  using Item = typename Cache::Item;
+
+  // cache allocator's interface for evicting
+  Cache& cache_;
+  std::shared_ptr<BackgroundEvictorStrategy> strategy_;
+
+  // implements the actual logic of running the background evictor
+  void work() override final;
+  void checkAndRun();
+
+  BackgroundEvictorStats stats;
+
+  std::vector<std::tuple<TierId, PoolId, ClassId>> assignedMemory_;
+  folly::DistributedMutex mutex;
+};
+} // namespace cachelib
+} // namespace facebook
+
+#include "cachelib/allocator/BackgroundEvictor-inl.h"
diff --git a/cachelib/allocator/BackgroundEvictorStrategy.h b/cachelib/allocator/BackgroundEvictorStrategy.h
new file mode 100644
index 0000000000..1d05a801bb
--- /dev/null
+++ b/cachelib/allocator/BackgroundEvictorStrategy.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/Cache.h"
+
+namespace facebook {
+namespace cachelib {
+
+// Base class for background eviction strategies.
+class BackgroundEvictorStrategy {
+ public:
+  virtual ~BackgroundEvictorStrategy() = default;
+
+  // Returns one eviction batch size per (tier, pool, class) entry of acVec.
+  virtual std::vector<size_t> calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector<std::tuple<TierId, PoolId, ClassId>> acVec) = 0;
+};
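+
+// Illustrative example only, not used elsewhere in this patch: a trivial
+// strategy that requests the same fixed batch size for every assigned
+// (tier, pool, class) triple. Real strategies, such as FreeThresholdStrategy,
+// compute batch sizes from per-class free-memory statistics.
+class FixedBatchStrategy : public BackgroundEvictorStrategy {
+ public:
+  explicit FixedBatchStrategy(size_t batch) : batch_{batch} {}
+
+  std::vector<size_t> calculateBatchSizes(
+      const CacheBase& /* cache */,
+      std::vector<std::tuple<TierId, PoolId, ClassId>> acVec) override {
+    return std::vector<size_t>(acVec.size(), batch_);
+  }
+
+ private:
+  const size_t batch_;
+};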
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundPromoter-inl.h b/cachelib/allocator/BackgroundPromoter-inl.h
new file mode 100644
index 0000000000..9881db2495
--- /dev/null
+++ b/cachelib/allocator/BackgroundPromoter-inl.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace facebook {
+namespace cachelib {
+
+template <typename CacheT>
+BackgroundPromoter<CacheT>::BackgroundPromoter(
+    Cache& cache, std::shared_ptr<BackgroundEvictorStrategy> strategy)
+    : cache_(cache), strategy_(strategy) {}
+
+template <typename CacheT>
+BackgroundPromoter<CacheT>::~BackgroundPromoter() {
+  stop(std::chrono::seconds(0));
+}
+
+template <typename CacheT>
+void BackgroundPromoter<CacheT>::work() {
+  try {
+    checkAndRun();
+  } catch (const std::exception& ex) {
+    XLOGF(ERR, "BackgroundPromoter interrupted due to exception: {}",
+          ex.what());
+  }
+}
+
+template <typename CacheT>
+void BackgroundPromoter<CacheT>::setAssignedMemory(
+    std::vector<std::tuple<TierId, PoolId, ClassId>>&& assignedMemory) {
+  XLOG(INFO, "Classes assigned to background worker:");
+  for (auto [tid, pid, cid] : assignedMemory) {
+    XLOGF(INFO, "Tid: {}, Pid: {}, Cid: {}", tid, pid, cid);
+  }
+
+  mutex.lock_combine([this, &assignedMemory] {
+    this->assignedMemory_ = std::move(assignedMemory);
+  });
+}
+
+// Ask the strategy for per-class promotion batch sizes and try to promote
+// up to that many items from each assigned class.
+template <typename CacheT>
+void BackgroundPromoter<CacheT>::checkAndRun() {
+  auto assignedMemory = mutex.lock_combine([this] { return assignedMemory_; });
+
+  unsigned int promotions = 0;
+  std::set<ClassId> classes{};
+
+  auto batches = strategy_->calculateBatchSizes(cache_, assignedMemory);
+
+  for (size_t i = 0; i < batches.size(); i++) {
+    const auto [tid, pid, cid] = assignedMemory[i];
+    const auto batch = batches[i];
+
+    classes.insert(cid);
+    if (!batch) {
+      continue;
+    }
+
+    // stats.promotionSize.add(batch * mpStats.acStats.at(cid).allocSize);
+
+    // try promoting BATCH items from the class
+    auto promoted =
+        BackgroundPromoterAPIWrapper<CacheT>::traverseAndPromoteItems(
+            cache_, tid, pid, cid, batch);
+    promotions += promoted;
+
+    auto it = promotions_per_class_.find(cid);
+    if (it != promotions_per_class_.end()) {
+      it->second += promoted;
+    } else if (promoted > 0) {
+      promotions_per_class_[cid] = promoted;
+    }
+  }
+
+  stats.numTraversals.inc();
+  stats.numPromotedItems.add(promotions);
+  // stats.totalClasses.add(classes.size());
+}
+
+template <typename CacheT>
+BackgroundPromotionStats BackgroundPromoter<CacheT>::getStats()
+    const noexcept {
+  BackgroundPromotionStats promoStats;
+  promoStats.numPromotedItems = stats.numPromotedItems.get();
+  promoStats.runCount = stats.numTraversals.get();
+
+  return promoStats;
+}
+
+template <typename CacheT>
+std::map<ClassId, uint64_t> BackgroundPromoter<CacheT>::getClassStats()
+    const noexcept {
+  return promotions_per_class_;
+}
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/BackgroundPromoter.h b/cachelib/allocator/BackgroundPromoter.h
new file mode 100644
index 0000000000..93f592586e
--- /dev/null
+++ b/cachelib/allocator/BackgroundPromoter.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) Intel and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <folly/logging/xlog.h>
+#include <folly/synchronization/DistributedMutex.h>
+
+#include "cachelib/allocator/BackgroundEvictorStrategy.h"
+#include "cachelib/allocator/CacheStats.h"
+#include "cachelib/common/AtomicCounter.h"
+#include "cachelib/common/PeriodicWorker.h"
+
+namespace facebook {
+namespace cachelib {
+
+// wrapper that exposes the private APIs of CacheType that are specifically
+// needed for the promotion.
+template <typename C>
+struct BackgroundPromoterAPIWrapper {
+  static size_t traverseAndPromoteItems(C& cache,
+                                        unsigned int tid,
+                                        unsigned int pid,
+                                        unsigned int cid,
+                                        size_t batch) {
+    return cache.traverseAndPromoteItems(tid, pid, cid, batch);
+  }
+};
+
+struct BackgroundPromoterStats {
+  // items promoted
+  AtomicCounter numPromotedItems{0};
+
+  // traversals
+  AtomicCounter numTraversals{0};
+
+  // total number of classes traversed
+  AtomicCounter totalClasses{0};
+
+  // total bytes of promoted items
+  AtomicCounter promotionSize{0};
+};
+
+// Periodic worker that promotes items from lower tiers in batches.
+template <typename CacheT>
+class BackgroundPromoter : public PeriodicWorker {
+ public:
+  using Cache = CacheT;
+  // @param cache    the cache interface
+  // @param strategy the strategy that computes per-class promotion batch
+  //                 sizes
+  BackgroundPromoter(Cache& cache,
+                     std::shared_ptr<BackgroundEvictorStrategy> strategy);
+  // TODO: use separate strategy for eviction and promotion
+
+  ~BackgroundPromoter() override;
+
+  BackgroundPromotionStats getStats() const noexcept;
+  std::map<ClassId, uint64_t> getClassStats() const noexcept;
+
+  void setAssignedMemory(
+      std::vector<std::tuple<TierId, PoolId, ClassId>>&& assignedMemory);
+
+ private:
+  std::map<ClassId, uint64_t> promotions_per_class_;
+
+  using Item = typename Cache::Item;
+
+  // cache allocator's interface for promoting
+  Cache& cache_;
+  std::shared_ptr<BackgroundEvictorStrategy> strategy_;
+
+  // implements the actual logic of running the background promoter
+  void work() override final;
+  void checkAndRun();
+
+  BackgroundPromoterStats stats;
+
+  std::vector<std::tuple<TierId, PoolId, ClassId>> assignedMemory_;
+  folly::DistributedMutex mutex;
+};
+} // namespace cachelib
+} // namespace facebook
+
+#include "cachelib/allocator/BackgroundPromoter-inl.h"
diff --git a/cachelib/allocator/CMakeLists.txt b/cachelib/allocator/CMakeLists.txt
index b00302086b..8dc0166ecf 100644
--- a/cachelib/allocator/CMakeLists.txt
+++ b/cachelib/allocator/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library (cachelib_allocator
     CCacheManager.cpp
     ContainerTypes.cpp
     FreeMemStrategy.cpp
+    FreeThresholdStrategy.cpp
     HitsPerSlabStrategy.cpp
     LruTailAgeStrategy.cpp
     MarginalHitsOptimizeStrategy.cpp
diff --git a/cachelib/allocator/Cache.h b/cachelib/allocator/Cache.h
index ffbff0289e..52ed839c4f 100644
--- a/cachelib/allocator/Cache.h
+++ b/cachelib/allocator/Cache.h
@@ -84,7 +84,7 @@ class CacheBase {
  CacheBase&
operator=(CacheBase&&) = default; // TODO: come up with some reasonable number - static constexpr unsigned kMaxTiers = 8; + static constexpr unsigned kMaxTiers = 2; // Get a string referring to the cache name for this cache virtual const std::string getCacheName() const = 0; @@ -93,6 +93,12 @@ class CacheBase { // // @param poolId The pool id to query virtual const MemoryPool& getPool(PoolId poolId) const = 0; + + // Get the reference to a memory pool using a tier id, for stats purposes + // + // @param poolId The pool id to query + // @param tierId The tier of the pool id + virtual const MemoryPool& getPoolByTid(PoolId poolId, TierId tid) const = 0; // Get Pool specific stats (regular pools). This includes stats from the // Memory Pool and also the cache. diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h index 59f8b1cc43..8bc73604ce 100644 --- a/cachelib/allocator/CacheAllocator-inl.h +++ b/cachelib/allocator/CacheAllocator-inl.h @@ -44,8 +44,6 @@ CacheAllocator::CacheAllocator(Config config) [this](Item* it) -> ItemHandle { return acquire(it); })), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), - movesMap_(kShards), - moveLock_(kShards), cacheCreationTime_{util::getCurrentTimeSec()} { if (numTiers_ > 1 || std::holds_alternative( @@ -132,8 +130,6 @@ CacheAllocator::CacheAllocator(SharedMemNewT, Config config) [this](Item* it) -> ItemHandle { return acquire(it); })), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), - movesMap_(kShards), - moveLock_(kShards), cacheCreationTime_{util::getCurrentTimeSec()} { initCommon(false); shmManager_->removeShm(detail::kShmInfoName, @@ -170,8 +166,6 @@ CacheAllocator::CacheAllocator(SharedMemAttachT, Config config) [this](Item* it) -> ItemHandle { return acquire(it); })), chainedItemLocks_(config_.chainedItemsLockPower, std::make_shared()), - movesMap_(kShards), - moveLock_(kShards), cacheCreationTime_{*metadata_.cacheCreationTime_ref()} { /* TODO - per tier? */ for (auto pid : *metadata_.compactCachePools_ref()) { @@ -272,6 +266,14 @@ void CacheAllocator::initCommon(bool dramCacheAttached) { nvmAdmissionPolicy_->initMinTTL(config_.nvmAdmissionMinTTL); } } + if (numTiers_ > 1 && !config_.moveCb) { + XLOG(WARN, "No moveCb set, using memcpy for moving items between tiers."); + config_.moveCb = [](Item& oldItem, Item& newItem, Item* parentItem){ + if (parentItem != nullptr) + throw std::runtime_error("TODO: chained items not supported"); + std::memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize()); + }; + } initStats(); initNvmCache(dramCacheAttached); initWorkers(); @@ -340,6 +342,18 @@ void CacheAllocator::initWorkers() { config_.poolOptimizeStrategy, config_.ccacheOptimizeStepSizePercent); } + + if (config_.backgroundEvictorEnabled()) { + startNewBackgroundEvictor(config_.backgroundEvictorInterval, + config_.backgroundEvictorStrategy, + config_.backgroundEvictorThreads); + } + + if (config_.backgroundPromoterEnabled()) { + startNewBackgroundPromoter(config_.backgroundPromoterInterval, + config_.backgroundPromoterStrategy, + config_.backgroundPromoterThreads); + } } template @@ -362,7 +376,24 @@ CacheAllocator::allocate(PoolId poolId, creationTime = util::getCurrentTimeSec(); } return allocateInternal(poolId, key, size, creationTime, - ttlSecs == 0 ? 0 : creationTime + ttlSecs); + ttlSecs == 0 ? 
0 : creationTime + ttlSecs, false); +} + +template +bool CacheAllocator::shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid) +{ + // TODO: should we also work on lower tiers? should we have separate set of params? + if (tid == 1) return false; + return getAllocationClassStats(tid, pid, cid).approxFreePercent <= config_.lowEvictionAcWatermark; +} + +template +size_t CacheAllocator::backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers) +{ + XDCHECK(numWorkers); + + // TODO: came up with some better sharding (use some hashing) + return (tid + pid + cid) % numWorkers; } template @@ -372,7 +403,8 @@ CacheAllocator::allocateInternalTier(TierId tid, typename Item::Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime) { + uint32_t expiryTime, + bool fromEvictorThread) { util::LatencyTracker tracker{stats().allocateLatency_}; SCOPE_FAIL { stats_.invalidAllocs.inc(); }; @@ -382,18 +414,36 @@ CacheAllocator::allocateInternalTier(TierId tid, // the allocation class in our memory allocator. const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]}; // TODO: per-tier (*stats_.allocAttempts)[pid][cid].inc(); - void* memory = allocator_[tid]->allocate(pid, requiredSize); + void *memory = nullptr; + + if (tid == 0 && config_.acTopTierEvictionWatermark > 0.0 + && getAllocationClassStats(tid, pid, cid) + .approxFreePercent < config_.acTopTierEvictionWatermark) { + memory = findEviction(tid, pid, cid); + } + + if (memory == nullptr) { + // TODO: should we try allocate item even if this will result in violating + // acTopTierEvictionWatermark? + memory = allocator_[tid]->allocate(pid, requiredSize); + } + + if (backgroundEvictor_.size() && !fromEvictorThread && (memory == nullptr || shouldWakeupBgEvictor(tid, pid, cid))) { + backgroundEvictor_[backgroundWorkerId(tid, pid, cid, backgroundEvictor_.size())]->wakeUp(); + } + // TODO: Today disableEviction means do not evict from memory (DRAM). // Should we support eviction between memory tiers (e.g. from DRAM to PMEM)? if (memory == nullptr && !config_.disableEviction) { memory = findEviction(tid, pid, cid); } - ItemHandle handle; + WriteHandle handle; if (memory != nullptr) { // At this point, we have a valid memory allocation that is ready for use. // Ensure that when we abort from here under any circumstances, we free up @@ -430,18 +480,71 @@ CacheAllocator::allocateInternalTier(TierId tid, } template -typename CacheAllocator::WriteHandle -CacheAllocator::allocateInternal(PoolId pid, +TierId +CacheAllocator::getTargetTierForItem(PoolId pid, typename Item::Key key, uint32_t size, uint32_t creationTime, uint32_t expiryTime) { - auto tid = 0; /* TODO: consult admission policy */ - for(TierId tid = 0; tid < numTiers_; ++tid) { - auto handle = allocateInternalTier(tid, pid, key, size, creationTime, expiryTime); - if (handle) return handle; + if (numTiers_ == 1) + return 0; + + if (config_.forceAllocationTier != UINT64_MAX) { + return config_.forceAllocationTier; } - return {}; + + const TierId defaultTargetTier = 0; + + const auto requiredSize = Item::getRequiredSize(key, size); + const auto cid = allocator_[defaultTargetTier]->getAllocationClassId(pid, requiredSize); + + auto freePercentage = getAllocationClassStats(defaultTargetTier, pid, cid).approxFreePercent; + + // TODO: COULD we implement BG worker which would move slabs around + // so that there is similar amount of free space in each pool/ac. 
+ // Should this be responsibility of BG evictor? + + if (freePercentage >= config_.maxAcAllocationWatermark) + return defaultTargetTier; + + if (freePercentage <= config_.minAcAllocationWatermark) + return defaultTargetTier + 1; + + // TODO: we can even think about creating different allocation classes for PMEM + // and we could look at possible fragmentation when deciding where to put the item + if (config_.sizeThresholdPolicy) + return requiredSize < config_.sizeThresholdPolicy ? defaultTargetTier : defaultTargetTier + 1; + + // TODO: (e.g. always put chained items to PMEM) + // if (chainedItemsPolicy) + // return item.isChainedItem() ? defaultTargetTier + 1 : defaultTargetTier; + + // TODO: + // if (expiryTimePolicy) + // return (expiryTime - creationTime) < expiryTimePolicy ? defaultTargetTier : defaultTargetTier + 1; + + // TODO: + // if (keyPolicy) // this can be based on key length or some other properties + // return getTargetTierForKey(key); + + // TODO: + // if (compressabilityPolicy) // if compresses well store in PMEM? latency will be higher anyway + // return TODO; + + // TODO: only works for 2 tiers + return (folly::Random::rand32() % 100) < config_.defaultTierChancePercentage ? defaultTargetTier : defaultTargetTier + 1; +} + +template +typename CacheAllocator::WriteHandle +CacheAllocator::allocateInternal(PoolId pid, + typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime, + bool fromEvictorThread) { + auto tid = getTargetTierForItem(pid, key, size, creationTime, expiryTime); + return allocateInternalTier(tid, pid, key, size, creationTime, expiryTime, fromEvictorThread); } template @@ -480,6 +583,8 @@ CacheAllocator::allocateChainedItemInternal( const auto pid = allocator_[tid]->getAllocInfo(parent->getMemory()).poolId; const auto cid = allocator_[tid]->getAllocationClassId(pid, requiredSize); + util::RollingLatencyTracker rollTracker{(*stats_.classAllocLatency)[tid][pid][cid]}; + // TODO: per-tier? Right now stats_ are not used in any public periodic // worker (*stats_.allocAttempts)[pid][cid].inc(); @@ -1169,159 +1274,11 @@ CacheAllocator::insertOrReplace(const ItemHandle& handle) { return replaced; } -/* Next two methods are used to asynchronously move Item between memory tiers. - * - * The thread, which moves Item, allocates new Item in the tier we are moving to - * and calls moveRegularItemOnEviction() method. This method does the following: - * 1. Create MoveCtx and put it to the movesMap. - * 2. Update the access container with the new item from the tier we are - * moving to. This Item has kIncomplete flag set. - * 3. Copy data from the old Item to the new one. - * 4. Unset the kIncomplete flag and Notify MoveCtx - * - * Concurrent threads which are getting handle to the same key: - * 1. When a handle is created it checks if the kIncomplete flag is set - * 2. If so, Handle implementation creates waitContext and adds it to the - * MoveCtx by calling addWaitContextForMovingItem() method. - * 3. Wait until the moving thread will complete its job. 
- */ -template -bool CacheAllocator::addWaitContextForMovingItem( - folly::StringPiece key, std::shared_ptr> waiter) { - auto shard = getShardForKey(key); - auto& movesMap = getMoveMapForShard(shard); - auto lock = getMoveLockForShard(shard); - auto it = movesMap.find(key); - if (it == movesMap.end()) { - return false; - } - auto ctx = it->second.get(); - ctx->addWaiter(std::move(waiter)); - return true; -} - -template -typename CacheAllocator::ItemHandle -CacheAllocator::moveRegularItemOnEviction( - Item& oldItem, ItemHandle& newItemHdl) { - XDCHECK(oldItem.isMoving()); - // TODO: should we introduce new latency tracker. E.g. evictRegularLatency_ - // ??? util::LatencyTracker tracker{stats_.evictRegularLatency_}; - - if (!oldItem.isAccessible() || oldItem.isExpired()) { - return {}; - } - - XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); - XDCHECK_NE(getTierId(oldItem), getTierId(*newItemHdl)); - - // take care of the flags before we expose the item to be accessed. this - // will ensure that when another thread removes the item from RAM, we issue - // a delete accordingly. See D7859775 for an example - if (oldItem.isNvmClean()) { - newItemHdl->markNvmClean(); - } - - folly::StringPiece key(oldItem.getKey()); - auto shard = getShardForKey(key); - auto& movesMap = getMoveMapForShard(shard); - MoveCtx* ctx(nullptr); - { - auto lock = getMoveLockForShard(shard); - auto res = movesMap.try_emplace(key, std::make_unique()); - if (!res.second) { - return {}; - } - ctx = res.first->second.get(); - } - - auto resHdl = ItemHandle{}; - auto guard = folly::makeGuard([key, this, ctx, shard, &resHdl]() { - auto& movesMap = getMoveMapForShard(shard); - if (resHdl) - resHdl->unmarkIncomplete(); - auto lock = getMoveLockForShard(shard); - ctx->setItemHandle(std::move(resHdl)); - movesMap.erase(key); - }); - - // TODO: Possibly we can use markMoving() instead. But today - // moveOnSlabRelease logic assume that we mark as moving old Item - // and than do copy and replace old Item with the new one in access - // container. Furthermore, Item can be marked as Moving only - // if it is linked to MM container. In our case we mark the new Item - // and update access container before the new Item is ready (content is - // copied). - newItemHdl->markIncomplete(); - - // Inside the access container's lock, this checks if the old item is - // accessible and its refcount is zero. If the item is not accessible, - // there is no point to replace it since it had already been removed - // or in the process of being removed. If the item is in cache but the - // refcount is non-zero, it means user could be attempting to remove - // this item through an API such as remove(ItemHandle). In this case, - // it is unsafe to replace the old item with a new one, so we should - // also abort. - if (!accessContainer_->replaceIf(oldItem, *newItemHdl, - itemMovingPredicate)) { - return {}; - } - - if (config_.moveCb) { - // Execute the move callback. We cannot make any guarantees about the - // consistency of the old item beyond this point, because the callback can - // do more than a simple memcpy() e.g. update external references. If there - // are any remaining handles to the old item, it is the caller's - // responsibility to invalidate them. The move can only fail after this - // statement if the old item has been removed or replaced, in which case it - // should be fine for it to be left in an inconsistent state. 
- config_.moveCb(oldItem, *newItemHdl, nullptr); - } else { - std::memcpy(newItemHdl->getWritableMemory(), oldItem.getMemory(), - oldItem.getSize()); - } - - // Inside the MM container's lock, this checks if the old item exists to - // make sure that no other thread removed it, and only then replaces it. - if (!replaceInMMContainer(oldItem, *newItemHdl)) { - accessContainer_->remove(*newItemHdl); - return {}; - } - - // Replacing into the MM container was successful, but someone could have - // called insertOrReplace() or remove() before or after the - // replaceInMMContainer() operation, which would invalidate newItemHdl. - if (!newItemHdl->isAccessible()) { - removeFromMMContainer(*newItemHdl); - return {}; - } - - // no one can add or remove chained items at this point - if (oldItem.hasChainedItem()) { - // safe to acquire handle for a moving Item - auto oldHandle = acquire(&oldItem); - XDCHECK_EQ(1u, oldHandle->getRefCount()) << oldHandle->toString(); - XDCHECK(!newItemHdl->hasChainedItem()) << newItemHdl->toString(); - try { - auto l = chainedItemLocks_.lockExclusive(oldItem.getKey()); - transferChainLocked(oldHandle, newItemHdl); - } catch (const std::exception& e) { - // this should never happen because we drained all the handles. - XLOGF(DFATAL, "{}", e.what()); - throw; - } - - XDCHECK(!oldItem.hasChainedItem()); - XDCHECK(newItemHdl->hasChainedItem()); - } - newItemHdl.unmarkNascent(); - resHdl = std::move(newItemHdl); // guard will assign it to ctx under lock - return acquire(&oldItem); -} - template +template bool CacheAllocator::moveRegularItem(Item& oldItem, - ItemHandle& newItemHdl) { + ItemHandle& newItemHdl, + Predicate &&predicate) { XDCHECK(config_.moveCb); util::LatencyTracker tracker{stats_.moveRegularLatency_}; @@ -1330,8 +1287,6 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, } XDCHECK_EQ(newItemHdl->getSize(), oldItem.getSize()); - XDCHECK_EQ(reinterpret_cast(&getMMContainer(oldItem)), - reinterpret_cast(&getMMContainer(*newItemHdl))); // take care of the flags before we expose the item to be accessed. this // will ensure that when another thread removes the item from RAM, we issue @@ -1357,7 +1312,7 @@ bool CacheAllocator::moveRegularItem(Item& oldItem, // this item through an API such as remove(ItemHandle). In this case, // it is unsafe to replace the old item with a new one, so we should // also abort. 
- if (!accessContainer_->replaceIf(oldItem, *newItemHdl, itemMovingPredicate)) { + if (!accessContainer_->replaceIf(oldItem, *newItemHdl, predicate)) { return false; } @@ -1605,38 +1560,93 @@ bool CacheAllocator::shouldWriteToNvmCacheExclusive( return true; } +template +bool CacheAllocator::shouldEvictToNextMemoryTier( + TierId sourceTierId, TierId targetTierId, PoolId pid, Item& item) +{ + if (config_.disableEvictionToMemory) + return false; + + // TODO: implement more advanced admission policies for memory tiers + return true; +} + template typename CacheAllocator::WriteHandle CacheAllocator::tryEvictToNextMemoryTier( - TierId tid, PoolId pid, Item& item) { - if(item.isChainedItem()) return {}; // TODO: We do not support ChainedItem yet + TierId tid, PoolId pid, Item& item, bool fromEvictorThread) { if(item.isExpired()) return acquire(&item); - TierId nextTier = tid; // TODO - calculate this based on some admission policy + TierId nextTier = tid; while (++nextTier < numTiers_) { // try to evict down to the next memory tiers + if (!shouldEvictToNextMemoryTier(tid, nextTier, pid, item)) + continue; + // allocateInternal might trigger another eviction auto newItemHdl = allocateInternalTier(nextTier, pid, item.getKey(), item.getSize(), item.getCreationTime(), - item.getExpiryTime()); + item.getExpiryTime(), + fromEvictorThread); if (newItemHdl) { XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); - - return moveRegularItemOnEviction(item, newItemHdl); + if (tryMovingItem(item, newItemHdl, itemMovingPredicate)) { + return acquire(&item); + } } } return {}; } +template +bool +CacheAllocator::tryPromoteToNextMemoryTier( + TierId tid, PoolId pid, Item& item, bool fromEvictorThread) { + TierId nextTier = tid; + while (nextTier > 0) { // try to evict down to the next memory tiers + auto toPromoteTier = nextTier - 1; + --nextTier; + + // allocateInternal might trigger another eviction + auto newItemHdl = allocateInternalTier(toPromoteTier, pid, + item.getKey(), + item.getSize(), + item.getCreationTime(), + item.getExpiryTime(), + fromEvictorThread); + + if (newItemHdl) { + XDCHECK_EQ(newItemHdl->getSize(), item.getSize()); + auto predicate = [this](const Item& item) { + // if inclusive cache is allowed, replace even if there are active users. 
+ return config_.numDuplicateElements > 0 || item.getRefCount() == 0; + }; + if (tryMovingItem(item, newItemHdl, std::move(predicate))) { + return true; + } + } + } + + return false; +} + template typename CacheAllocator::WriteHandle -CacheAllocator::tryEvictToNextMemoryTier(Item& item) { +CacheAllocator::tryEvictToNextMemoryTier(Item& item, bool fromEvictorThread) { + auto tid = getTierId(item); + auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; + return tryEvictToNextMemoryTier(tid, pid, item, fromEvictorThread); +} + +template +bool +CacheAllocator::tryPromoteToNextMemoryTier(Item& item, bool fromBgThread) { auto tid = getTierId(item); auto pid = allocator_[tid]->getAllocInfo(item.getMemory()).poolId; - return tryEvictToNextMemoryTier(tid, pid, item); + return tryPromoteToNextMemoryTier(tid, pid, item, fromBgThread); } template @@ -2294,6 +2304,16 @@ PoolId CacheAllocator::addPool( setRebalanceStrategy(pid, std::move(rebalanceStrategy)); setResizeStrategy(pid, std::move(resizeStrategy)); + if (backgroundEvictor_.size()) { + for (size_t id = 0; id < backgroundEvictor_.size(); id++) + backgroundEvictor_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundEvictor_.size(), 0)); + } + + if (backgroundPromoter_.size()) { + for (size_t id = 0; id < backgroundPromoter_.size(); id++) + backgroundPromoter_[id]->setAssignedMemory(getAssignedMemoryToBgWorker(id, backgroundPromoter_.size(), 1)); + } + return pid; } @@ -2358,6 +2378,10 @@ void CacheAllocator::createMMContainers(const PoolId pid, .getAllocsPerSlab() : 0); for (TierId tid = 0; tid < numTiers_; tid++) { + if constexpr (std::is_same_v || std::is_same_v) { + config.lruInsertionPointSpec = config_.memoryTierConfigs[tid].lruInsertionPointSpec ; + config.markUsefulChance = config_.memoryTierConfigs[tid].markUsefulChance; + } mmContainers_[tid][pid][cid].reset(new MMContainer(config, compressor_)); } } @@ -2412,7 +2436,7 @@ std::set CacheAllocator::getRegularPoolIds() const { folly::SharedMutex::ReadHolder r(poolsResizeAndRebalanceLock_); // TODO - get rid of the duplication - right now, each tier // holds pool objects with mostly the same info - return filterCompactCachePools(allocator_[0]->getPoolIds()); + return filterCompactCachePools(allocator_[currentTier()]->getPoolIds()); } template @@ -2540,6 +2564,7 @@ AllocationClassBaseStat CacheAllocator::getAllocationClassStats( } else { stats.approxFreePercent = ac.approxFreePercentage(); } + stats.allocLatencyNs = (*stats_.classAllocLatency)[tid][pid][cid]; return stats; } @@ -2715,7 +2740,7 @@ bool CacheAllocator::moveForSlabRelease( // if we have a valid handle, try to move, if not, we retry. if (newItemHdl) { - isMoved = tryMovingForSlabRelease(oldItem, newItemHdl); + isMoved = tryMovingItem(oldItem, newItemHdl, itemMovingPredicate); if (isMoved) { break; } @@ -2824,7 +2849,8 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { oldItem.getKey(), oldItem.getSize(), oldItem.getCreationTime(), - oldItem.getExpiryTime()); + oldItem.getExpiryTime(), + false); if (!newItemHdl) { return {}; } @@ -2837,8 +2863,9 @@ CacheAllocator::allocateNewItemForOldItem(const Item& oldItem) { } template -bool CacheAllocator::tryMovingForSlabRelease( - Item& oldItem, ItemHandle& newItemHdl) { +template +bool CacheAllocator::tryMovingItem( + Item& oldItem, ItemHandle& newItemHdl, Predicate&& predicate) { // By holding onto a user-level synchronization object, we ensure moving // a regular item or chained item is synchronized with any potential // user-side mutation. 
@@ -2870,7 +2897,7 @@ bool CacheAllocator::tryMovingForSlabRelease( return oldItem.isChainedItem() ? moveChainedItem(oldItem.asChainedItem(), newItemHdl) - : moveRegularItem(oldItem, newItemHdl); + : moveRegularItem(oldItem, newItemHdl, std::move(predicate)); } template @@ -2957,14 +2984,14 @@ void CacheAllocator::evictForSlabRelease( template typename CacheAllocator::ItemHandle CacheAllocator::evictNormalItem(Item& item, - bool skipIfTokenInvalid) { + bool skipIfTokenInvalid, bool fromEvictorThread) { XDCHECK(item.isMoving()); if (item.isOnlyMoving()) { return ItemHandle{}; } - auto evictHandle = tryEvictToNextMemoryTier(item); + auto evictHandle = tryEvictToNextMemoryTier(item, fromEvictorThread); if(evictHandle) return evictHandle; auto predicate = [](const Item& it) { return it.getRefCount() == 0; }; @@ -3349,6 +3376,8 @@ bool CacheAllocator::stopWorkers(std::chrono::seconds timeout) { success &= stopPoolResizer(timeout); success &= stopMemMonitor(timeout); success &= stopReaper(timeout); + success &= stopBackgroundEvictor(timeout); + success &= stopBackgroundPromoter(timeout); return success; } @@ -3629,6 +3658,8 @@ GlobalCacheStats CacheAllocator::getGlobalCacheStats() const { ret.nvmCacheEnabled = nvmCache_ ? nvmCache_->isEnabled() : false; ret.nvmUpTime = currTime - getNVMCacheCreationTime(); ret.reaperStats = getReaperStats(); + ret.evictionStats = getBackgroundEvictorStats(); + ret.promotionStats = getBackgroundPromoterStats(); ret.numActiveHandles = getNumActiveHandles(); return ret; @@ -3732,6 +3763,7 @@ bool CacheAllocator::startNewPoolRebalancer( freeAllocThreshold); } + template bool CacheAllocator::startNewPoolResizer( std::chrono::milliseconds interval, @@ -3769,6 +3801,64 @@ bool CacheAllocator::startNewReaper( return startNewWorker("Reaper", reaper_, interval, reaperThrottleConfig); } +template +auto CacheAllocator::getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid) +{ + std::vector> asssignedMemory; + // TODO: for now, only evict from tier 0 + auto pools = filterCompactCachePools(allocator_[tid]->getPoolIds()); + for (const auto pid : pools) { + const auto& mpStats = getPoolByTid(pid,tid).getStats(); + for (const auto cid : mpStats.classIds) { + if (backgroundWorkerId(tid, pid, cid, numWorkers) == evictorId) { + asssignedMemory.emplace_back(tid, pid, cid); + } + } + } + return asssignedMemory; +} + +template +bool CacheAllocator::startNewBackgroundEvictor( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + backgroundEvictor_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], interval, strategy); + result = result && ret; + + if (result) { + backgroundEvictor_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundEvictor_.size(), 0)); + } + } + return result; +} + +template +bool CacheAllocator::startNewBackgroundPromoter( + std::chrono::milliseconds interval, + std::shared_ptr strategy, + size_t threads) { + XDCHECK(threads > 0); + XDCHECK(numTiers_ > 1); + backgroundPromoter_.resize(threads); + bool result = true; + + for (size_t i = 0; i < threads; i++) { + auto ret = startNewWorker("BackgroundPromoter" + std::to_string(i), backgroundPromoter_[i], interval, strategy); + result = result && ret; + + if (result) { + backgroundPromoter_[i]->setAssignedMemory(getAssignedMemoryToBgWorker(i, backgroundPromoter_.size(), 1)); + } + } + return result; +} + 
template bool CacheAllocator::stopPoolRebalancer( std::chrono::seconds timeout) { @@ -3796,6 +3886,28 @@ bool CacheAllocator::stopReaper(std::chrono::seconds timeout) { return stopWorker("Reaper", reaper_, timeout); } +template +bool CacheAllocator::stopBackgroundEvictor( + std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundEvictor_.size(); i++) { + auto ret = stopWorker("BackgroundEvictor" + std::to_string(i), backgroundEvictor_[i], timeout); + result = result && ret; + } + return result; +} + +template +bool CacheAllocator::stopBackgroundPromoter( + std::chrono::seconds timeout) { + bool result = true; + for (size_t i = 0; i < backgroundPromoter_.size(); i++) { + auto ret = stopWorker("BackgroundPromoter" + std::to_string(i), backgroundPromoter_[i], timeout); + result = result && ret; + } + return result; +} + template bool CacheAllocator::cleanupStrayShmSegments( const std::string& cacheDir, bool posix /*TODO(SHM_FILE): const std::vector& config */) { diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h index 81ce90d189..144d68db7c 100644 --- a/cachelib/allocator/CacheAllocator.h +++ b/cachelib/allocator/CacheAllocator.h @@ -36,7 +36,8 @@ #include #include #pragma GCC diagnostic pop - +#include "cachelib/allocator/BackgroundEvictor.h" +#include "cachelib/allocator/BackgroundPromoter.h" #include "cachelib/allocator/CCacheManager.h" #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/CacheAllocatorConfig.h" @@ -695,6 +696,8 @@ class CacheAllocator : public CacheBase { std::shared_ptr resizeStrategy = nullptr, bool ensureProvisionable = false); + auto getAssignedMemoryToBgWorker(size_t evictorId, size_t numWorkers, TierId tid); + // update an existing pool's config // // @param pid pool id for the pool to be updated @@ -945,6 +948,11 @@ class CacheAllocator : public CacheBase { // @param reaperThrottleConfig throttling config bool startNewReaper(std::chrono::milliseconds interval, util::Throttler::Config reaperThrottleConfig); + + bool startNewBackgroundEvictor(std::chrono::milliseconds interval, + std::shared_ptr strategy, size_t threads); + bool startNewBackgroundPromoter(std::chrono::milliseconds interval, + std::shared_ptr strategy, size_t threads); // Stop existing workers with a timeout bool stopPoolRebalancer(std::chrono::seconds timeout = std::chrono::seconds{ @@ -954,6 +962,8 @@ class CacheAllocator : public CacheBase { 0}); bool stopMemMonitor(std::chrono::seconds timeout = std::chrono::seconds{0}); bool stopReaper(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopBackgroundEvictor(std::chrono::seconds timeout = std::chrono::seconds{0}); + bool stopBackgroundPromoter(std::chrono::seconds timeout = std::chrono::seconds{0}); // Set pool optimization to either true or false // @@ -988,6 +998,10 @@ class CacheAllocator : public CacheBase { const MemoryPool& getPool(PoolId pid) const override final { return allocator_[currentTier()]->getPool(pid); } + + const MemoryPool& getPoolByTid(PoolId pid, TierId tid) const override final { + return allocator_[tid]->getPool(pid); + } // calculate the number of slabs to be advised/reclaimed in each pool PoolAdviseReclaimData calcNumSlabsToAdviseReclaim() override final { @@ -1034,6 +1048,59 @@ class CacheAllocator : public CacheBase { auto stats = reaper_ ? 
reaper_->getStats() : ReaperStats{}; return stats; } + + // returns the background evictor + BackgroundEvictionStats getBackgroundEvictorStats() const { + auto stats = BackgroundEvictionStats{}; + for (auto &bg : backgroundEvictor_) + stats += bg->getStats(); + return stats; + } + + BackgroundPromotionStats getBackgroundPromoterStats() const { + auto stats = BackgroundPromotionStats{}; + for (auto &bg : backgroundPromoter_) + stats += bg->getStats(); + return stats; + } + + std::map getBackgroundEvictorClassStats() const { + auto stats = std::map(); + + for (auto &bg : backgroundEvictor_) { + for (const auto entry : bg->getClassStats()) { + auto cid = entry.first; + auto count = entry.second; + auto it = stats.find(cid); + if ( it != stats.end() ) { + it->second += count; + } else { + stats[cid] = count; + } + } + } + + return stats; + } + + std::map getBackgroundPromoterClassStats() const { + auto stats = std::map(); + + for (auto &bg : backgroundPromoter_) { + for (const auto entry : bg->getClassStats()) { + auto cid = entry.first; + auto count = entry.second; + auto it = stats.find(cid); + if ( it != stats.end() ) { + it->second += count; + } else { + stats[cid] = count; + } + } + } + + return stats; + } // return the LruType of an item typename MMType::LruType getItemLruType(const Item& item) const; @@ -1181,6 +1248,9 @@ class CacheAllocator : public CacheBase { // gives a relative offset to a pointer within the cache. uint64_t getItemPtrAsOffset(const void* ptr); + bool shouldWakeupBgEvictor(TierId tid, PoolId pid, ClassId cid); + size_t backgroundWorkerId(TierId tid, PoolId pid, ClassId cid, size_t numWorkers); + // this ensures that we dont introduce any more hidden fields like vtable by // inheriting from the Hooks and their bool interface. static_assert((sizeof(typename MMType::template Hook) + @@ -1222,6 +1292,11 @@ class CacheAllocator : public CacheBase { // allocator and executes the necessary callbacks. no-op if it is nullptr. FOLLY_ALWAYS_INLINE void release(Item* it, bool isNascent); + TierId getTargetTierForItem(PoolId pid, typename Item::Key key, + uint32_t size, + uint32_t creationTime, + uint32_t expiryTime); + // This is the last step in item release. We also use this for the eviction // scenario where we have to do everything, but not release the allocation // to the allocator and instead recycle it for another new allocation. If @@ -1326,7 +1401,8 @@ class CacheAllocator : public CacheBase { Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime); + uint32_t expiryTime, + bool fromEvictorThread); // create a new cache allocation on specific memory tier. // For description see allocateInternal. @@ -1337,7 +1413,8 @@ class CacheAllocator : public CacheBase { Key key, uint32_t size, uint32_t creationTime, - uint32_t expiryTime); + uint32_t expiryTime, + bool fromEvictorThread); // Allocate a chained item // @@ -1403,15 +1480,6 @@ class CacheAllocator : public CacheBase { // not exist. FOLLY_ALWAYS_INLINE ItemHandle findFastImpl(Key key, AccessMode mode); - // Moves a regular item to a different memory tier. - // - // @param oldItem Reference to the item being moved - // @param newItemHdl Reference to the handle of the new item being moved into - // - // @return true If the move was completed, and the containers were updated - // successfully. - ItemHandle moveRegularItemOnEviction(Item& oldItem, ItemHandle& newItemHdl); - // Moves a regular item to a different slab. This should only be used during // slab release after the item's moving bit has been set. 
The user supplied // callback is responsible for copying the contents and fixing the semantics @@ -1422,7 +1490,8 @@ class CacheAllocator : public CacheBase { // // @return true If the move was completed, and the containers were updated // successfully. - bool moveRegularItem(Item& oldItem, ItemHandle& newItemHdl); + template + bool moveRegularItem(Item& oldItem, ItemHandle& newItemHdl, Predicate&& predicate); // template class for viewAsChainedAllocs that takes either ReadHandle or // WriteHandle @@ -1577,7 +1646,8 @@ class CacheAllocator : public CacheBase { // // @return valid handle to the item. This will be the last // handle to the item. On failure an empty handle. - WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item); + WriteHandle tryEvictToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromEvictorThread); + bool tryPromoteToNextMemoryTier(TierId tid, PoolId pid, Item& item, bool fromEvictorThread); // Try to move the item down to the next memory tier // @@ -1585,7 +1655,11 @@ class CacheAllocator : public CacheBase { // // @return valid handle to the item. This will be the last // handle to the item. On failure an empty handle. - WriteHandle tryEvictToNextMemoryTier(Item& item); + WriteHandle tryEvictToNextMemoryTier(Item& item, bool fromEvictorThread); + bool tryPromoteToNextMemoryTier(Item& item, bool fromEvictorThread); + + bool shouldEvictToNextMemoryTier(TierId sourceTierId, + TierId targetTierId, PoolId pid, Item& item); size_t memoryTierSize(TierId tid) const; @@ -1698,7 +1772,8 @@ class CacheAllocator : public CacheBase { // // @return true if the item has been moved // false if we have exhausted moving attempts - bool tryMovingForSlabRelease(Item& item, ItemHandle& newItemHdl); + template + bool tryMovingItem(Item& item, ItemHandle& newItemHdl, Predicate &&predicate); // Evict an item from access and mm containers and // ensure it is safe for freeing. @@ -1714,7 +1789,7 @@ class CacheAllocator : public CacheBase { // // @return last handle for corresponding to item on success. empty handle on // failure. caller can retry if needed. - ItemHandle evictNormalItem(Item& item, bool skipIfTokenInvalid = false); + ItemHandle evictNormalItem(Item& item, bool skipIfTokenInvalid = false, bool fromEvictorThread = false); // Helper function to evict a child item for slab release // As a side effect, the parent item is also evicted @@ -1742,6 +1817,133 @@ class CacheAllocator : public CacheBase { folly::annotate_ignore_thread_sanitizer_guard g(__FILE__, __LINE__); allocator_[currentTier()]->forEachAllocation(std::forward(f)); } + + // exposed for the background evictor to iterate through the memory and evict + // in batch. 
This should improve insertion path for tiered memory config + size_t traverseAndEvictItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { + auto& mmContainer = getMMContainer(tid, pid, cid); + size_t evictions = 0; + size_t evictionCandidates = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + mmContainer.withEvictionIterator([&tries, &candidates, &batch, this](auto &&itr){ + while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + if (candidate->getRefCount() == 0 && candidate->markMoving()) { + candidates.push_back(candidate); + } + + ++itr; + } + }); + + for (Item *candidate : candidates) { + auto toReleaseHandle = + evictNormalItem(*candidate, true /* skipIfTokenInvalid */, true /* from BG thread */); + auto ref = candidate->unmarkMoving(); + + if (toReleaseHandle || ref == 0u) { + if (candidate->hasChainedItem()) { + (*stats_.chainedItemEvictions)[pid][cid].inc(); + } else { + (*stats_.regularItemEvictions)[pid][cid].inc(); + } + + evictions++; + } else { + if (candidate->hasChainedItem()) { + stats_.evictFailParentAC.inc(); + } else { + stats_.evictFailAC.inc(); + } + } + + if (toReleaseHandle) { + XDCHECK(toReleaseHandle.get() == candidate); + XDCHECK_EQ(1u, toReleaseHandle->getRefCount()); + + // We manually release the item here because we don't want to + // invoke the Item Handle's destructor which will be decrementing + // an already zero refcount, which will throw exception + auto& itemToRelease = *toReleaseHandle.release(); + + // Decrementing the refcount because we want to recycle the item + const auto ref = decRef(itemToRelease); + XDCHECK_EQ(0u, ref); + + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } else if (ref == 0u) { + // it's safe to recycle the item here as there are no more + // references and the item could not been marked as moving + // by other thread since it's detached from MMContainer. + auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction, + /* isNascent */ false); + XDCHECK(res == ReleaseRes::kReleased); + } + } + + return evictions; + } + + size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) { + auto& mmContainer = getMMContainer(tid, pid, cid); + size_t promotions = 0; + std::vector candidates; + candidates.reserve(batch); + + size_t tries = 0; + + mmContainer.withPromotionIterator([&tries, &candidates, &batch, this](auto &&itr){ + while (candidates.size() < batch && (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) && itr) { + tries++; + Item* candidate = itr.get(); + XDCHECK(candidate); + + if (candidate->isChainedItem()) { + throw std::runtime_error("Not supported for chained items"); + } + + // if (candidate->getRefCount() == 0 && candidate->markMoving()) { + // candidates.push_back(candidate); + // } + + // TODO: only allow it for read-only items? 
+
+  size_t traverseAndPromoteItems(unsigned int tid, unsigned int pid, unsigned int cid, size_t batch) {
+    auto& mmContainer = getMMContainer(tid, pid, cid);
+    size_t promotions = 0;
+    std::vector<Item*> candidates;
+    candidates.reserve(batch);
+
+    size_t tries = 0;
+
+    mmContainer.withPromotionIterator([&tries, &candidates, &batch, this](auto&& itr) {
+      while (candidates.size() < batch &&
+             (config_.maxEvictionPromotionHotness == 0 || tries < config_.maxEvictionPromotionHotness) &&
+             itr) {
+        tries++;
+        Item* candidate = itr.get();
+        XDCHECK(candidate);
+
+        if (candidate->isChainedItem()) {
+          throw std::runtime_error("Not supported for chained items");
+        }
+
+        // if (candidate->getRefCount() == 0 && candidate->markMoving()) {
+        //   candidates.push_back(candidate);
+        // }
+
+        // TODO: only allow it for read-only items?
+        // or implement mvcc
+        if (!candidate->isExpired() && candidate->markMoving()) {
+          candidates.push_back(candidate);
+        }
+
+        ++itr;
+      }
+    });
+
+    for (Item* candidate : candidates) {
+      auto promoted = tryPromoteToNextMemoryTier(*candidate, true /* fromEvictorThread */);
+      auto ref = candidate->unmarkMoving();
+      if (promoted)
+        promotions++;
+
+      if (ref == 0u) {
+        // stats_.promotionMoveSuccess.inc();
+        auto res = releaseBackToAllocator(*candidate, RemoveContext::kEviction,
+                                          /* isNascent */ false);
+        XDCHECK(res == ReleaseRes::kReleased);
+      }
+    }
+
+    return promotions;
+  }
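+
+  // Driven the same way as traverseAndEvictItems above: the promoter's
+  // strategy supplies per-class batch sizes, and a background thread calls
+  // traverseAndPromoteItems(tid, pid, cid, batch) for each assigned class.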
 
   // returns true if nvmcache is enabled and we should write this item to
   // nvmcache.
@@ -1888,91 +2090,14 @@ class CacheAllocator : public CacheBase {
     return 0;
   }
 
-  bool addWaitContextForMovingItem(
-      folly::StringPiece key, std::shared_ptr<WaitContext<ReadHandle>> waiter);
-
-  class MoveCtx {
-   public:
-    MoveCtx() {}
-
-    ~MoveCtx() {
-      // prevent any further enqueue to waiters
-      // Note: we don't need to hold locks since no one can enqueue
-      // after this point.
-      wakeUpWaiters();
-    }
-
-    // record the item handle. Upon destruction we will wake up the waiters
-    // and pass a clone of the handle to the callBack. By default we pass
-    // a null handle
-    void setItemHandle(ItemHandle _it) { it = std::move(_it); }
-
-    // enqueue a waiter into the waiter list
-    // @param waiter  WaitContext
-    void addWaiter(std::shared_ptr<WaitContext<ReadHandle>> waiter) {
-      XDCHECK(waiter);
-      waiters.push_back(std::move(waiter));
-    }
-
-   private:
-    // notify all pending waiters that are waiting for the fetch.
-    void wakeUpWaiters() {
-      bool refcountOverflowed = false;
-      for (auto& w : waiters) {
-        // If refcount overflowed earlier, then we will return miss to
-        // all subsequent waitors.
-        if (refcountOverflowed) {
-          w->set(ItemHandle{});
-          continue;
-        }
-
-        try {
-          w->set(it.clone());
-        } catch (const exception::RefcountOverflow&) {
-          // We'll return a miss to the user's pending read,
-          // so we should enqueue a delete via NvmCache.
-          // TODO: cache.remove(it);
-          refcountOverflowed = true;
-        }
-      }
-    }
-
-    ItemHandle it; // will be set when Context is being filled
-    std::vector<std::shared_ptr<WaitContext<ReadHandle>>> waiters; // list of
-                                                                   // waiters
-  };
-  using MoveMap =
-      folly::F14ValueMap<folly::StringPiece,
-                         std::unique_ptr<MoveCtx>,
-                         folly::HeterogeneousAccessHash<folly::StringPiece>>;
-
-  static size_t getShardForKey(folly::StringPiece key) {
-    return folly::Hash()(key) % kShards;
-  }
-
-  MoveMap& getMoveMapForShard(size_t shard) {
-    return movesMap_[shard].movesMap_;
-  }
-
-  MoveMap& getMoveMap(folly::StringPiece key) {
-    return getMoveMapForShard(getShardForKey(key));
-  }
-
-  std::unique_lock<std::mutex> getMoveLockForShard(size_t shard) {
-    return std::unique_lock<std::mutex>(moveLock_[shard].moveLock_);
-  }
-
-  std::unique_lock<std::mutex> getMoveLock(folly::StringPiece key) {
-    return getMoveLockForShard(getShardForKey(key));
-  }
-
   // Whether the memory allocator for this cache allocator was created on shared
   // memory. The hash table, chained item hash table etc is also created on
   // shared memory except for temporary shared memory mode when they're created
   // on heap.
   const bool isOnShm_{false};
 
-  const Config config_{};
+  // TODO: make it const again
+  Config config_{};
 
   const typename Config::MemoryTierConfigs memoryTierConfigs;
 
@@ -2050,6 +2175,10 @@ class CacheAllocator : public CacheBase {
 
   // free memory monitor
   std::unique_ptr<MemoryMonitor> memMonitor_;
+
+  // background evictor
+  std::vector<std::unique_ptr<BackgroundEvictor<CacheT>>> backgroundEvictor_;
+  std::vector<std::unique_ptr<BackgroundPromoter<CacheT>>> backgroundPromoter_;
 
   // check whether a pool is a slabs pool
   std::array<std::atomic<bool>, MemoryPoolManager::kMaxPools> isCompactCachePool_{};
@@ -2062,22 +2191,6 @@ class CacheAllocator : public CacheBase {
   // poolResizer_, poolOptimizer_, memMonitor_, reaper_
   mutable std::mutex workersMutex_;
 
-  static constexpr size_t kShards = 8192; // TODO: need to define right value
-
-  struct MovesMapShard {
-    alignas(folly::hardware_destructive_interference_size) MoveMap movesMap_;
-  };
-
-  struct MoveLock {
-    alignas(folly::hardware_destructive_interference_size) std::mutex moveLock_;
-  };
-
-  // a map of all pending moves
-  std::vector<MovesMapShard> movesMap_;
-
-  // a map of move locks for each shard
-  std::vector<MoveLock> moveLock_;
-
   // time when the ram cache was first created
   const time_t cacheCreationTime_{0};
 
@@ -2105,6 +2218,8 @@ class CacheAllocator : public CacheBase {
   // Make this friend to give access to acquire and release
   friend ReadHandle;
   friend ReaperAPIWrapper<CacheT>;
+  friend BackgroundEvictorAPIWrapper<CacheT>;
+  friend BackgroundPromoterAPIWrapper<CacheT>;
 
   friend class CacheAPIWrapperForNvm<CacheT>;
   friend class FbInternalRuntimeUpdateWrapper<CacheT>;
diff --git a/cachelib/allocator/CacheAllocatorConfig.h b/cachelib/allocator/CacheAllocatorConfig.h
index ca51deb94c..aa8ff039ee 100644
--- a/cachelib/allocator/CacheAllocatorConfig.h
+++ b/cachelib/allocator/CacheAllocatorConfig.h
@@ -32,6 +32,7 @@
 #include "cachelib/allocator/NvmAdmissionPolicy.h"
 #include "cachelib/allocator/PoolOptimizeStrategy.h"
 #include "cachelib/allocator/RebalanceStrategy.h"
+#include "cachelib/allocator/BackgroundEvictorStrategy.h"
 #include "cachelib/allocator/Util.h"
 #include "cachelib/common/EventInterface.h"
 #include "cachelib/common/Throttler.h"
@@ -266,6 +267,16 @@ class CacheAllocatorConfig {
       std::chrono::seconds regularInterval,
       std::chrono::seconds ccacheInterval,
       uint32_t ccacheStepSizePercent);
+
+  // Enable the background evictor - scans a tier to look for objects
+  // to evict to the next tier
+  CacheAllocatorConfig& enableBackgroundEvictor(
+      std::shared_ptr<BackgroundEvictorStrategy> backgroundEvictorStrategy,
+      std::chrono::milliseconds regularInterval, size_t threads);
+
+  CacheAllocatorConfig& enableBackgroundPromoter(
+      std::shared_ptr<BackgroundEvictorStrategy> backgroundPromoterStrategy,
+      std::chrono::milliseconds regularInterval, size_t threads);
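+
+  // A minimal usage sketch (hedged: assumes the FreeThresholdStrategy added
+  // elsewhere in this change; the numeric values are illustrative only):
+  //
+  //   auto strategy = std::make_shared<FreeThresholdStrategy>(
+  //       2.0 /* lowEvictionAcWatermark */, 5.0 /* highEvictionAcWatermark */,
+  //       40 /* maxEvictionBatch */, 5 /* minEvictionBatch */);
+  //   config.enableBackgroundEvictor(strategy, std::chrono::milliseconds(10),
+  //                                  1 /* threads */);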
 
   // This enables an optimization for Pool rebalancing and resizing.
   // The rough idea is to ensure only the least useful items are evicted when
@@ -337,6 +348,17 @@ class CacheAllocatorConfig {
            compactCacheOptimizeInterval.count() > 0) &&
           poolOptimizeStrategy != nullptr;
   }
+
+  // @return whether background evictor thread is enabled
+  bool backgroundEvictorEnabled() const noexcept {
+    return backgroundEvictorInterval.count() > 0 &&
+           backgroundEvictorStrategy != nullptr;
+  }
+
+  bool backgroundPromoterEnabled() const noexcept {
+    return backgroundPromoterInterval.count() > 0 &&
+           backgroundPromoterStrategy != nullptr;
+  }
 
   // @return whether memory monitor is enabled
   bool memMonitoringEnabled() const noexcept {
@@ -433,6 +455,13 @@ class CacheAllocatorConfig {
 
   // time interval to sleep between iterators of rebalancing the pools.
   std::chrono::milliseconds poolRebalanceInterval{std::chrono::seconds{1}};
+
+  // time interval to sleep between runs of the background evictor and promoter
+  std::chrono::milliseconds backgroundEvictorInterval{std::chrono::milliseconds{1000}};
+  std::chrono::milliseconds backgroundPromoterInterval{std::chrono::milliseconds{1000}};
+
+  size_t backgroundEvictorThreads{1};
+  size_t backgroundPromoterThreads{1};
 
   // Free slabs pro-actively if the ratio of number of freeallocs to
   // the number of allocs per slab in a slab class is above this
@@ -444,6 +473,10 @@ class CacheAllocatorConfig {
   // rebalance to avoid alloc fialures.
   std::shared_ptr<RebalanceStrategy> defaultPoolRebalanceStrategy{
       new RebalanceStrategy{}};
+
+  // strategies used by the background evictor and promoter
+  std::shared_ptr<BackgroundEvictorStrategy> backgroundEvictorStrategy;
+  std::shared_ptr<BackgroundEvictorStrategy> backgroundPromoterStrategy;
 
   // time interval to sleep between iterations of pool size optimization,
   // for regular pools and compact caches
@@ -585,6 +618,34 @@ class CacheAllocatorConfig {
   // skip promote children items in chained when parent fail to promote
   bool skipPromoteChildrenWhenParentFailed{false};
 
+  bool disableEvictionToMemory{false};
+
+  double promotionAcWatermark{4.0};
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  double minAcAllocationWatermark{0.0};
+  double maxAcAllocationWatermark{0.0};
+  double acTopTierEvictionWatermark{0.0}; // TODO: make it per-tier?
+  uint64_t sizeThresholdPolicy{0};
+  double defaultTierChancePercentage{50.0};
+  // TODO: default could be based on ratio
+
+  double numDuplicateElements{0.0}; // inclusiveness of the cache
+  double syncPromotion{0.0}; // can promotion be done synchronously in the user thread
+
+  uint64_t evictorThreads{1};
+  uint64_t promoterThreads{1};
+
+  uint64_t maxEvictionBatch{40};
+  uint64_t maxPromotionBatch{10};
+
+  uint64_t minEvictionBatch{1};
+  uint64_t minPromotionBatch{1};
+
+  uint64_t maxEvictionPromotionHotness{60};
+
+  uint64_t forceAllocationTier{UINT64_MAX};
+
   friend CacheT;
 
  private:
@@ -933,6 +994,26 @@ CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enablePoolRebalancing(
   return *this;
 }
 
+template <typename T>
+CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enableBackgroundEvictor(
+    std::shared_ptr<BackgroundEvictorStrategy> strategy,
+    std::chrono::milliseconds interval, size_t evictorThreads) {
+  backgroundEvictorStrategy = strategy;
+  backgroundEvictorInterval = interval;
+  backgroundEvictorThreads = evictorThreads;
+  return *this;
+}
+
+template <typename T>
+CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enableBackgroundPromoter(
+    std::shared_ptr<BackgroundEvictorStrategy> strategy,
+    std::chrono::milliseconds interval, size_t promoterThreads) {
+  backgroundPromoterStrategy = strategy;
+  backgroundPromoterInterval = interval;
+  backgroundPromoterThreads = promoterThreads;
+  return *this;
+}
+
 template <typename T>
 CacheAllocatorConfig<T>& CacheAllocatorConfig<T>::enablePoolResizing(
     std::shared_ptr<RebalanceStrategy> resizeStrategy,
diff --git a/cachelib/allocator/CacheStats.cpp b/cachelib/allocator/CacheStats.cpp
index 4f7811e5be..98a02cad75 100644
--- a/cachelib/allocator/CacheStats.cpp
+++ b/cachelib/allocator/CacheStats.cpp
@@ -42,6 +42,8 @@ void Stats::init() {
   initToZero(*fragmentationSize);
   initToZero(*chainedItemEvictions);
   initToZero(*regularItemEvictions);
+
+  classAllocLatency = std::make_unique<PerTierPoolClassRollingStats>();
 }
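+
+// classAllocLatency is a tier x pool x class array of util::RollingStats.
+// A hedged sketch of how one allocation's latency could be recorded
+// (assuming RollingStats exposes a trackValue()-style recorder and a
+// steady-clock timer around the allocation path; both are assumptions,
+// not shown in this diff):
+//
+//   auto begin = std::chrono::steady_clock::now();
+//   // ... perform the allocation ...
+//   auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+//                 std::chrono::steady_clock::now() - begin).count();
+//   (*classAllocLatency)[tid][pid][cid].trackValue(ns);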
"cachelib/common/PercentileStats.h" +#include "cachelib/common/RollingStats.h" #include "cachelib/common/Time.h" namespace facebook { @@ -107,6 +108,9 @@ struct AllocationClassBaseStat { // percent of free memory in this class double approxFreePercent{0.0}; + + // Rolling allocation latency (in ns) + util::RollingStats allocLatencyNs; }; // cache related stats for a given allocation class. @@ -296,6 +300,43 @@ struct ReaperStats { uint64_t avgTraversalTimeMs{0}; }; +// Eviction Stats +struct BackgroundEvictionStats { + // the number of items this worker evicted by looking at pools/classes stats + uint64_t numEvictedItems{0}; + + // number of times we went executed the thread //TODO: is this def correct? + uint64_t runCount{0}; + + // total number of classes + uint64_t totalClasses{0}; + + // eviction size + uint64_t evictionSize{0}; + + BackgroundEvictionStats& operator+=(const BackgroundEvictionStats& rhs) { + numEvictedItems += rhs.numEvictedItems; + runCount += rhs.runCount; + totalClasses += rhs.totalClasses; + evictionSize += rhs.evictionSize; + return *this; + } +}; + +struct BackgroundPromotionStats { + // the number of items this worker evicted by looking at pools/classes stats + uint64_t numPromotedItems{0}; + + // number of times we went executed the thread //TODO: is this def correct? + uint64_t runCount{0}; + + BackgroundPromotionStats& operator+=(const BackgroundPromotionStats& rhs) { + numPromotedItems += rhs.numPromotedItems; + runCount += rhs.runCount; + return *this; + } +}; + // CacheMetadata type to export struct CacheMetadata { // allocator_version @@ -316,6 +357,11 @@ struct Stats; // Stats that apply globally in cache and // the ones that are aggregated over all pools struct GlobalCacheStats { + // background eviction stats + BackgroundEvictionStats evictionStats; + + BackgroundPromotionStats promotionStats; + // number of calls to CacheAllocator::find uint64_t numCacheGets{0}; diff --git a/cachelib/allocator/CacheStatsInternal.h b/cachelib/allocator/CacheStatsInternal.h index 355afb594f..7f1a92af73 100644 --- a/cachelib/allocator/CacheStatsInternal.h +++ b/cachelib/allocator/CacheStatsInternal.h @@ -21,6 +21,7 @@ #include "cachelib/allocator/Cache.h" #include "cachelib/allocator/memory/MemoryAllocator.h" #include "cachelib/common/AtomicCounter.h" +#include "cachelib/common/RollingStats.h" namespace facebook { namespace cachelib { @@ -221,6 +222,15 @@ struct Stats { std::unique_ptr chainedItemEvictions{}; std::unique_ptr regularItemEvictions{}; + using PerTierPoolClassRollingStats = + std::array, + MemoryPoolManager::kMaxPools>, + CacheBase::kMaxTiers>; + + // rolling latency tracking for every alloc class in every pool + std::unique_ptr classAllocLatency{}; + // Eviction failures due to parent cannot be removed from access container AtomicCounter evictFailParentAC{0}; diff --git a/cachelib/allocator/FreeThresholdStrategy.cpp b/cachelib/allocator/FreeThresholdStrategy.cpp new file mode 100644 index 0000000000..5ffc718fa7 --- /dev/null +++ b/cachelib/allocator/FreeThresholdStrategy.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) Intel and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cachelib/allocator/FreeThresholdStrategy.h"
+
+#include <algorithm>
+
+namespace facebook {
+namespace cachelib {
+
+FreeThresholdStrategy::FreeThresholdStrategy(double lowEvictionAcWatermark,
+                                             double highEvictionAcWatermark,
+                                             uint64_t maxEvictionBatch,
+                                             uint64_t minEvictionBatch)
+    : lowEvictionAcWatermark(lowEvictionAcWatermark),
+      highEvictionAcWatermark(highEvictionAcWatermark),
+      maxEvictionBatch(maxEvictionBatch),
+      minEvictionBatch(minEvictionBatch) {}
+
+std::vector<size_t> FreeThresholdStrategy::calculateBatchSizes(
+    const CacheBase& cache,
+    std::vector<std::tuple<TierId, PoolId, ClassId>> acVec) {
+  std::vector<size_t> batches{};
+  for (auto [tid, pid, cid] : acVec) {
+    auto stats = cache.getAllocationClassStats(tid, pid, cid);
+    if (stats.approxFreePercent >= highEvictionAcWatermark) {
+      batches.push_back(0);
+    } else {
+      // approxFreePercent and the watermarks are percentages, so the
+      // shortfall is divided by 100 to get a fraction of the class's memory
+      auto toFreeMemPercent = highEvictionAcWatermark - stats.approxFreePercent;
+      auto toFreeItems = static_cast<size_t>(
+          toFreeMemPercent / 100.0 * stats.memorySize / stats.allocSize);
+      batches.push_back(toFreeItems);
+    }
+  }
+
+  if (batches.size() == 0) {
+    return batches;
+  }
+
+  auto maxBatch = *std::max_element(batches.begin(), batches.end());
+  if (maxBatch == 0)
+    return batches;
+
+  std::transform(batches.begin(), batches.end(), batches.begin(), [&](auto numItems) {
+    if (numItems == 0) {
+      return 0UL;
+    }
+
+    auto cappedBatchSize = maxEvictionBatch * numItems / maxBatch;
+    if (cappedBatchSize < minEvictionBatch)
+      return minEvictionBatch;
+    else
+      return cappedBatchSize;
+  });
+
+  return batches;
+}
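+
+// Worked example (illustrative numbers, not from this change): with
+// highEvictionAcWatermark = 5.0, a class at 3.0% free with memorySize = 1 GB
+// and allocSize = 4 KB has a 2.0% shortfall, i.e. 0.02 * (1 GB / 4 KB)
+// ~= 5243 candidate items; the std::transform pass above then scales each
+// class's count relative to maxEvictionBatch, flooring at minEvictionBatch.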
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/FreeThresholdStrategy.h b/cachelib/allocator/FreeThresholdStrategy.h
new file mode 100644
index 0000000000..6a6b0c8950
--- /dev/null
+++ b/cachelib/allocator/FreeThresholdStrategy.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "cachelib/allocator/Cache.h"
+#include "cachelib/allocator/BackgroundEvictorStrategy.h"
+
+namespace facebook {
+namespace cachelib {
+
+// Background eviction strategy that frees memory in an allocation class
+// until its free-memory percentage reaches highEvictionAcWatermark.
+class FreeThresholdStrategy : public BackgroundEvictorStrategy {
+ public:
+  FreeThresholdStrategy(double lowEvictionAcWatermark,
+                        double highEvictionAcWatermark,
+                        uint64_t maxEvictionBatch,
+                        uint64_t minEvictionBatch);
+  ~FreeThresholdStrategy() {}
+
+  std::vector<size_t> calculateBatchSizes(
+      const CacheBase& cache,
+      std::vector<std::tuple<TierId, PoolId, ClassId>> acVecs);
+
+ private:
+  double lowEvictionAcWatermark{2.0};
+  double highEvictionAcWatermark{5.0};
+  uint64_t maxEvictionBatch{40};
+  uint64_t minEvictionBatch{5};
+};
+
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/allocator/Handle.h b/cachelib/allocator/Handle.h
index 507e2968bc..52ca06b1f3 100644
--- a/cachelib/allocator/Handle.h
+++ b/cachelib/allocator/Handle.h
@@ -480,10 +480,6 @@ struct ReadHandleImpl {
       : alloc_(&alloc), it_(it) {
     if (it_ && it_->isIncomplete()) {
       waitContext_ = std::make_shared<ItemWaitContext>(alloc);
-      if (!alloc_->addWaitContextForMovingItem(it->getKey(), waitContext_)) {
-        waitContext_->discard();
-        waitContext_.reset();
-      }
     }
   }
 
diff --git a/cachelib/allocator/MM2Q-inl.h b/cachelib/allocator/MM2Q-inl.h
index e791d6c6c3..469a3b6a84 100644
--- a/cachelib/allocator/MM2Q-inl.h
+++ b/cachelib/allocator/MM2Q-inl.h
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include <folly/Random.h>
+
 namespace facebook {
 namespace cachelib {
 
@@ -104,6 +106,10 @@ bool MM2Q::Container<T, HookPtr>::recordAccess(T& node,
       return false;
     }
 
+    // TODO: % 100 is not very accurate
+    if (config_.markUsefulChance < 100.0 &&
+        folly::Random::rand32() % 100 >= config_.markUsefulChance)
+      return false;
+
     return lruMutex_->lock_combine(func);
   }
   return false;
@@ -211,15 +217,32 @@ void MM2Q::Container<T, HookPtr>::rebalance() noexcept {
 
 template <typename T, MM2Q::Hook<T> T::*HookPtr>
 bool MM2Q::Container<T, HookPtr>::add(T& node) noexcept {
   const auto currTime = static_cast