diff --git a/.github/workflows/build-cachelib-docker.yml b/.github/workflows/build-cachelib-docker.yml
index f73339e0d9..be28bc233c 100644
--- a/.github/workflows/build-cachelib-docker.yml
+++ b/.github/workflows/build-cachelib-docker.yml
@@ -9,7 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       REPO: cachelib
-      GITHUB_REPO: pmem/CacheLib
+      GITHUB_REPO: intel/CacheLib
       CONTAINER_REG: ghcr.io/pmem/cachelib
       CONTAINER_REG_USER: ${{ secrets.GH_CR_USER }}
       CONTAINER_REG_PASS: ${{ secrets.GH_CR_PAT }}
diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h
index 46b903c22f..5f48c6de58 100644
--- a/cachelib/allocator/CacheAllocator-inl.h
+++ b/cachelib/allocator/CacheAllocator-inl.h
@@ -125,6 +125,7 @@ ShmSegmentOpts CacheAllocator<CacheTrait>::createShmCacheOpts(TierId tid) {
   ShmSegmentOpts opts;
   opts.alignment = sizeof(Slab);
   opts.typeOpts = memoryTierConfigs[tid].getShmTypeOpts();
+  opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind();
   if (auto* v = std::get_if<PosixSysVSegmentOpts>(&opts.typeOpts)) {
     v->usePosix = config_.usePosixShm;
   }
@@ -1307,7 +1308,7 @@ CacheAllocator<CacheTrait>::moveRegularItemWithSync(
   // make sure that no other thread removed it, and only then replaces it.
   if (!replaceInMMContainer(oldItem, *newItemHdl)) {
     accessContainer_->remove(*newItemHdl);
-    return {};
+    return acquire(&oldItem);
   }
 
   // Replacing into the MM container was successful, but someone could have
@@ -1315,7 +1316,7 @@ CacheAllocator<CacheTrait>::moveRegularItemWithSync(
   // replaceInMMContainer() operation, which would invalidate newItemHdl.
   if (!newItemHdl->isAccessible()) {
     removeFromMMContainer(*newItemHdl);
-    return {};
+    return acquire(&oldItem);
   }
 
   // no one can add or remove chained items at this point
@@ -1640,7 +1641,13 @@ typename CacheAllocator<CacheTrait>::WriteHandle
 CacheAllocator<CacheTrait>::tryEvictToNextMemoryTier(
     TierId tid, PoolId pid, Item& item) {
   if (item.isChainedItem()) return {}; // TODO: We do not support ChainedItem yet
-  if (item.isExpired()) return acquire(&item);
+  if (item.isExpired()) {
+    auto handle = removeIf(item, [](const Item& it) {
+      return it.getRefCount() == 0;
+    });
+
+    if (handle) { return handle; }
+  }
 
   TierId nextTier = tid; // TODO - calculate this based on some admission policy
   while (++nextTier < getNumTiers()) { // try to evict down to the next memory tiers
@@ -3066,16 +3073,12 @@ CacheAllocator<CacheTrait>::evictNormalItem(Item& item,
   // We remove the item from both access and mm containers. It doesn't matter
   // if someone else calls remove on the item at this moment, the item cannot
   // be freed as long as we have the moving bit set.
-  auto handle = accessContainer_->removeIf(item, std::move(predicate));
-
+  auto handle = removeIf(item, std::move(predicate));
   if (!handle) {
     return handle;
   }
 
-  XDCHECK_EQ(reinterpret_cast<uintptr_t>(handle.get()),
-             reinterpret_cast<uintptr_t>(&item));
   XDCHECK_EQ(1u, handle->getRefCount());
-  removeFromMMContainer(item);
 
   // now that we are the only handle and we actually removed something from
   // the RAM cache, we enqueue it to nvmcache.
@@ -3187,6 +3190,21 @@ CacheAllocator<CacheTrait>::evictChainedItemForSlabRelease(ChainedItem& child) {
   return parentHandle;
 }
 
+template <typename CacheTrait>
+template <typename Fn>
+typename CacheAllocator<CacheTrait>::WriteHandle
+CacheAllocator<CacheTrait>::removeIf(Item& item, Fn&& predicate) {
+  auto handle = accessContainer_->removeIf(item, std::forward<Fn>(predicate));
+
+  if (handle) {
+    XDCHECK_EQ(reinterpret_cast<uintptr_t>(handle.get()),
+               reinterpret_cast<uintptr_t>(&item));
+    removeFromMMContainer(item);
+  }
+
+  return handle;
+}
+
 template <typename CacheTrait>
 bool CacheAllocator<CacheTrait>::removeIfExpired(const ReadHandle& handle) {
   if (!handle) {
@@ -3195,14 +3213,7 @@ bool CacheAllocator<CacheTrait>::removeIfExpired(const ReadHandle& handle) {
 
   // We remove the item from both access and mm containers.
   // We want to make sure the caller is the only one holding the handle.
-  auto removedHandle =
-      accessContainer_->removeIf(*(handle.getInternal()), itemExpiryPredicate);
-  if (removedHandle) {
-    removeFromMMContainer(*(handle.getInternal()));
-    return true;
-  }
-
-  return false;
+  return (bool)removeIf(*(handle.getInternal()), itemExpiryPredicate);
 }
 
 template <typename CacheTrait>
diff --git a/cachelib/allocator/CacheAllocator.h b/cachelib/allocator/CacheAllocator.h
index 02557dfe24..9cf04cc1a9 100644
--- a/cachelib/allocator/CacheAllocator.h
+++ b/cachelib/allocator/CacheAllocator.h
@@ -1496,8 +1496,9 @@ class CacheAllocator : public CacheBase {
   // @param oldItem     Reference to the item being moved
   // @param newItemHdl  Reference to the handle of the new item being moved into
   //
-  // @return true  If the move was completed, and the containers were updated
-  //               successfully.
+  // @return the handle to the oldItem if the move was completed
+  //         and the oldItem can be recycled.
+  //         Otherwise an empty handle is returned.
   template <typename P>
   WriteHandle moveRegularItemWithSync(Item& oldItem, WriteHandle& newItemHdl,
                                       P&& predicate);
@@ -1806,6 +1807,12 @@ class CacheAllocator : public CacheBase {
   // handle on failure. caller can retry.
   WriteHandle evictChainedItemForSlabRelease(ChainedItem& item);
 
+  // Helper function to remove an item if the predicate returns true.
+  //
+  // @return the last handle to the item on success; an empty handle on failure.
+  template <typename Fn>
+  WriteHandle removeIf(Item& item, Fn&& predicate);
+
   // Helper function to remove an item if it has expired.
   //
   // @return true if the item was expired and removed successfully.
diff --git a/cachelib/allocator/MemoryTierCacheConfig.h b/cachelib/allocator/MemoryTierCacheConfig.h
index ae07a92516..662983ea84 100644
--- a/cachelib/allocator/MemoryTierCacheConfig.h
+++ b/cachelib/allocator/MemoryTierCacheConfig.h
@@ -53,6 +53,16 @@ class MemoryTierCacheConfig {
 
   size_t getRatio() const noexcept { return ratio; }
 
+  // Allocate memory only from the specified NUMA nodes
+  MemoryTierCacheConfig& setMemBind(const std::vector<size_t>& _numaNodes) {
+    numaNodes = _numaNodes;
+    return *this;
+  }
+
+  std::vector<size_t> getMemBind() const { return numaNodes; }
+
   size_t calculateTierSize(size_t totalCacheSize, size_t partitionNum) const {
     // TODO: Call this method when tiers are enabled in allocator
     // to calculate tier sizes in bytes.
@@ -82,6 +92,9 @@ class MemoryTierCacheConfig {
   // Options specific to shm type
   ShmTypeOpts shmOpts;
 
+  // NUMA node(s) to bind the tier to
+  std::vector<size_t> numaNodes;
+
   MemoryTierCacheConfig() = default;
 };
 } // namespace cachelib
 } // namespace facebook
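For reference, a minimal usage sketch of the setMemBind()/getMemBind() API added above (not part of the patch; the LruAllocator alias and the node IDs are illustrative assumptions):

    #include "cachelib/allocator/CacheAllocator.h"
    #include "cachelib/allocator/MemoryTierCacheConfig.h"

    using Cache = facebook::cachelib::LruAllocator;
    using facebook::cachelib::MemoryTierCacheConfig;

    int main() {
      Cache::Config config;
      config.setCacheSize(100 * facebook::cachelib::Slab::kSize);
      config.enableCachePersistence("/tmp");
      // Two equally sized shm tiers, each bound to its own NUMA node.
      config.configureMemoryTiers(
          {MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind({0}),
           MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind({1})});
      Cache cache(Cache::SharedMemNew, config);
    }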
diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp
index 90ef34be41..0484b843f2 100644
--- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp
+++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp
@@ -23,9 +23,13 @@ namespace tests {
 using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest<LruAllocator>;
 
 // TODO(MEMORY_TIER): add more tests with different eviction policies
-TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); }
-TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersFromFileInvalid) { this->testMultiTiersFromFileInvalid(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersFromFileValid) { this->testMultiTiersFromFileValid(); }
 TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersNumaBindingsSysVValid) { this->testMultiTiersNumaBindingsSysVValid(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersNumaBindingsPosixValid) { this->testMultiTiersNumaBindingsPosixValid(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersRemoveDuringEviction) { this->testMultiTiersRemoveDuringEviction(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersReplaceDuringEviction) { this->testMultiTiersReplaceDuringEviction(); }
 
 } // end of namespace tests
 } // end of namespace cachelib
diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h
index dba8cfd2dd..3ff6c6a90a 100644
--- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h
+++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h
@@ -20,14 +20,44 @@
 #include "cachelib/allocator/MemoryTierCacheConfig.h"
 #include "cachelib/allocator/tests/TestBase.h"
 
+#include <folly/synchronization/Latch.h>
+
 namespace facebook {
 namespace cachelib {
 namespace tests {
 
 template <typename AllocatorT>
 class AllocatorMemoryTiersTest : public AllocatorTest<AllocatorT> {
+ private:
+  template <typename MvCallback>
+  void testMultiTiersAsyncOpDuringMove(std::unique_ptr<AllocatorT>& alloc,
+                                       PoolId& pool, bool& quit,
+                                       MvCallback&& moveCb) {
+    typename AllocatorT::Config config;
+    config.setCacheSize(4 * Slab::kSize);
+    config.enableCachePersistence("/tmp");
+    config.configureMemoryTiers({
+        MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind({0}),
+        MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind({0})});
+
+    config.enableMovingOnSlabRelease(moveCb, {} /* ChainedItemsMoveSync */,
+                                     -1 /* movingAttemptsLimit */);
+
+    alloc = std::make_unique<AllocatorT>(AllocatorT::SharedMemNew, config);
+    ASSERT(alloc != nullptr);
+    pool = alloc->addPool("default", alloc->getCacheMemoryStats().cacheSize);
+
+    int i = 0;
+    while (!quit) {
+      auto handle = alloc->allocate(pool, std::to_string(++i),
+                                    std::string("value").size());
+      ASSERT(handle != nullptr);
+      ASSERT_NO_THROW(alloc->insertOrReplace(handle));
+    }
+  }
+
  public:
-  void testMultiTiersInvalid() {
+  void testMultiTiersFromFileInvalid() {
     typename AllocatorT::Config config;
     config.setCacheSize(100 * Slab::kSize);
     config.configureMemoryTiers({
@@ -42,7 +72,7 @@ class AllocatorMemoryTiersTest : public AllocatorTest<AllocatorT> {
         std::invalid_argument);
   }
 
-  void testMultiTiersValid() {
+  void testMultiTiersFromFileValid() {
     typename AllocatorT::Config config;
     config.setCacheSize(100 * Slab::kSize);
     config.enableCachePersistence("/tmp");
@@ -83,6 +113,111 @@ class AllocatorMemoryTiersTest : public AllocatorTest<AllocatorT> {
     ASSERT(handle != nullptr);
     ASSERT_NO_THROW(alloc->insertOrReplace(handle));
   }
+
+  void testMultiTiersNumaBindingsSysVValid() {
+    typename AllocatorT::Config config;
+    config.setCacheSize(100 * Slab::kSize);
+    config.enableCachePersistence("/tmp");
+    config.configureMemoryTiers({
+        MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind({0}),
+        MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind({0})});
+
+    auto alloc = std::make_unique<AllocatorT>(AllocatorT::SharedMemNew, config);
+    ASSERT(alloc != nullptr);
+
+    auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().cacheSize);
+    auto handle = alloc->allocate(pool, "key", std::string("value").size());
+    ASSERT(handle != nullptr);
+    ASSERT_NO_THROW(alloc->insertOrReplace(handle));
+  }
+
+  void testMultiTiersNumaBindingsPosixValid() {
+    typename AllocatorT::Config config;
+    config.setCacheSize(100 * Slab::kSize);
+    config.enableCachePersistence("/tmp");
+    config.usePosixForShm();
+    config.configureMemoryTiers({
+        MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind({0}),
+        MemoryTierCacheConfig::fromShm().setRatio(1).setMemBind({0})});
+
+    auto alloc = std::make_unique<AllocatorT>(AllocatorT::SharedMemNew, config);
+    ASSERT(alloc != nullptr);
+
+    auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().cacheSize);
+    auto handle = alloc->allocate(pool, "key", std::string("value").size());
+    ASSERT(handle != nullptr);
+    ASSERT_NO_THROW(alloc->insertOrReplace(handle));
+  }
+
+  void testMultiTiersRemoveDuringEviction() {
+    std::unique_ptr<AllocatorT> alloc;
+    PoolId pool;
+    std::unique_ptr<std::thread> t;
+    folly::Latch latch(1);
+    bool quit = false;
+
+    auto moveCb = [&](typename AllocatorT::Item& oldItem,
+                      typename AllocatorT::Item& newItem,
+                      typename AllocatorT::Item* /* parentPtr */) {
+      auto key = oldItem.getKey();
+      t = std::make_unique<std::thread>([&]() {
+        // remove() is blocked by the wait context until the item is moved
+        // to the next tier, so we must notify the latch before calling
+        // remove().
+        latch.count_down();
+        alloc->remove(key);
+      });
+      // wait until the async thread is running
+      latch.wait();
+      memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize());
+      quit = true;
+    };
+
+    testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb);
+
+    t->join();
+  }
+
+  void testMultiTiersReplaceDuringEviction() {
+    std::unique_ptr<AllocatorT> alloc;
+    PoolId pool;
+    std::unique_ptr<std::thread> t;
+    folly::Latch latch(1);
+    bool quit = false;
+
+    auto moveCb = [&](typename AllocatorT::Item& oldItem,
+                      typename AllocatorT::Item& newItem,
+                      typename AllocatorT::Item* /* parentPtr */) {
+      auto key = oldItem.getKey();
+      if (!quit) {
+        // we need to replace only once because subsequent allocate calls
+        // will cause evictions recursively
+        quit = true;
+        t = std::make_unique<std::thread>([&]() {
+          auto handle = alloc->allocate(pool, key, std::string("new value").size());
+          // insertOrReplace() is blocked by the wait context until the item
+          // is moved to the next tier, so we must notify the latch before
+          // calling insertOrReplace().
+          latch.count_down();
+          ASSERT_NO_THROW(alloc->insertOrReplace(handle));
+        });
+        // wait until the async thread is running
+        latch.wait();
+      }
+      memcpy(newItem.getMemory(), oldItem.getMemory(), oldItem.getSize());
+    };
+
+    testMultiTiersAsyncOpDuringMove(alloc, pool, quit, moveCb);
+
+    t->join();
+  }
 };
 } // namespace tests
 } // namespace cachelib
diff --git a/cachelib/cachebench/CMakeLists.txt b/cachelib/cachebench/CMakeLists.txt
index 1a1063104c..f935e6e706 100644
--- a/cachelib/cachebench/CMakeLists.txt
+++ b/cachelib/cachebench/CMakeLists.txt
@@ -89,5 +89,6 @@ if (BUILD_TESTS)
   add_test (consistency/tests/ValueHistoryTest.cpp)
   add_test (consistency/tests/ValueTrackerTest.cpp)
   add_test (util/tests/NandWritesTest.cpp)
+  add_test (util/tests/MemoryTierConfigTest.cpp)
   add_test (cache/tests/TimeStampTickerTest.cpp)
 endif()
diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp
index f12992dd9e..29cd9cb6a3 100644
--- a/cachelib/cachebench/util/CacheConfig.cpp
+++ b/cachelib/cachebench/util/CacheConfig.cpp
@@ -137,8 +137,53 @@ std::shared_ptr<RebalanceStrategy> CacheConfig::getRebalanceStrategy() const {
 MemoryTierConfig::MemoryTierConfig(const folly::dynamic& configJson) {
   JSONSetVal(configJson, file);
   JSONSetVal(configJson, ratio);
+  JSONSetVal(configJson, memBindNodes);
 
   checkCorrectSize<MemoryTierConfig>();
 }
 
+std::vector<size_t> MemoryTierConfig::parseNumaNodes() {
+  std::vector<size_t> numaNodes;
+
+  std::vector<folly::StringPiece> tokens;
+  folly::split(",", memBindNodes, tokens, true /* ignore empty */);
+  for (const auto& token : tokens) {
+    if (token.startsWith("!")) {
+      throw std::invalid_argument(folly::sformat(
+          "invalid NUMA nodes binding in memory tier config: {} "
+          "inverse !N or !N-N is not supported. "
+          "Nodes may be specified as N,N,N or N-N or N,N-N or N-N,N-N and so forth.",
+          token));
+    } else if (token.startsWith("+")) {
+      throw std::invalid_argument(folly::sformat(
+          "invalid NUMA nodes binding in memory tier config: {} "
+          "relative nodes are not supported. "
+          "Nodes may be specified as N,N,N or N-N or N,N-N or N-N,N-N and so forth.",
+          token));
+    } else if (token.contains("-")) {
+      size_t begin, end;
+      if (folly::split("-", token, begin, end) && begin < end) {
+        while (begin <= end) {
+          numaNodes.push_back(begin++);
+        }
+      } else {
" + "nodes may be specified as N,N,N or N-N or N,N-N or N-N,N-N and so forth.", + token)); + } + } + else { + numaNodes.push_back(folly::to(token)); + } + } + + return numaNodes; } } // namespace cachebench diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h index b7829e28c7..7a8c9020b0 100644 --- a/cachelib/cachebench/util/CacheConfig.h +++ b/cachelib/cachebench/util/CacheConfig.h @@ -48,11 +48,13 @@ struct MemoryTierConfig : public JSONConfig { MemoryTierCacheConfig getMemoryTierCacheConfig() { MemoryTierCacheConfig config = memoryTierCacheConfigFromSource(); config.setRatio(ratio); + config.setMemBind(parseNumaNodes()); return config; } std::string file{""}; size_t ratio{0}; + std::string memBindNodes{""}; private: MemoryTierCacheConfig memoryTierCacheConfigFromSource() { @@ -62,6 +64,8 @@ struct MemoryTierConfig : public JSONConfig { return MemoryTierCacheConfig::fromFile(file); } } + + std::vector parseNumaNodes(); }; struct CacheConfig : public JSONConfig { diff --git a/cachelib/cachebench/util/tests/MemoryTierConfigTest.cpp b/cachelib/cachebench/util/tests/MemoryTierConfigTest.cpp new file mode 100644 index 0000000000..afd2bf80ad --- /dev/null +++ b/cachelib/cachebench/util/tests/MemoryTierConfigTest.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Copyright 2022-present Facebook. All Rights Reserved. 
diff --git a/cachelib/cachebench/util/tests/MemoryTierConfigTest.cpp b/cachelib/cachebench/util/tests/MemoryTierConfigTest.cpp
new file mode 100644
index 0000000000..afd2bf80ad
--- /dev/null
+++ b/cachelib/cachebench/util/tests/MemoryTierConfigTest.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright 2022-present Facebook. All Rights Reserved.
+
+#include <gtest/gtest.h>
+
+#include <folly/dynamic.h>
+#include <folly/json.h>
+
+#include "cachelib/cachebench/util/CacheConfig.h"
+
+namespace facebook {
+namespace cachelib {
+namespace cachebench {
+
+TEST(MemoryTierConfigTest, MemBind_SingleNumaNode) {
+  const std::string configString =
+      "{"
+      "  \"ratio\": 1,"
+      "  \"memBindNodes\": 1"
+      "}";
+
+  const std::vector<size_t> expectedNumaNodes = {1};
+
+  auto configJson = folly::parseJson(folly::json::stripComments(configString));
+
+  MemoryTierConfig memoryTierConfig(configJson);
+  MemoryTierCacheConfig tierCacheConfig =
+      memoryTierConfig.getMemoryTierCacheConfig();
+
+  auto parsedNumaNodes = tierCacheConfig.getMemBind();
+  ASSERT_TRUE(std::equal(expectedNumaNodes.begin(), expectedNumaNodes.end(),
+                         parsedNumaNodes.begin()));
+}
+
+TEST(MemoryTierConfigTest, MemBind_RangeNumaNodes) {
+  const std::string configString =
+      "{"
+      "  \"ratio\": 1,"
+      "  \"memBindNodes\": \"0-2\""
+      "}";
+
+  const std::vector<size_t> expectedNumaNodes = {0, 1, 2};
+
+  auto configJson = folly::parseJson(folly::json::stripComments(configString));
+
+  MemoryTierConfig memoryTierConfig(configJson);
+  MemoryTierCacheConfig tierCacheConfig =
+      memoryTierConfig.getMemoryTierCacheConfig();
+
+  auto parsedNumaNodes = tierCacheConfig.getMemBind();
+  ASSERT_TRUE(std::equal(expectedNumaNodes.begin(), expectedNumaNodes.end(),
+                         parsedNumaNodes.begin()));
+}
+
+TEST(MemoryTierConfigTest, MemBind_SingleAndRangeNumaNodes) {
+  const std::string configString =
+      "{"
+      "  \"ratio\": 1,"
+      "  \"memBindNodes\": \"0,2-5\""
+      "}";
+
+  const std::vector<size_t> expectedNumaNodes = {0, 2, 3, 4, 5};
+
+  auto configJson = folly::parseJson(folly::json::stripComments(configString));
+
+  MemoryTierConfig memoryTierConfig(configJson);
+  MemoryTierCacheConfig tierCacheConfig =
+      memoryTierConfig.getMemoryTierCacheConfig();
+
+  auto parsedNumaNodes = tierCacheConfig.getMemBind();
+  ASSERT_TRUE(std::equal(expectedNumaNodes.begin(), expectedNumaNodes.end(),
+                         parsedNumaNodes.begin()));
+}
+
+} // namespace cachebench
+} // namespace cachelib
+} // namespace facebook
\ No newline at end of file
diff --git a/cachelib/shm/CMakeLists.txt b/cachelib/shm/CMakeLists.txt
index 4f97c0e763..83a798949c 100644
--- a/cachelib/shm/CMakeLists.txt
+++ b/cachelib/shm/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library (cachelib_shm
 add_dependencies(cachelib_shm thrift_generated_files)
 target_link_libraries(cachelib_shm PUBLIC
   cachelib_common
+  numa
   )
 
 install(TARGETS cachelib_shm
diff --git a/cachelib/shm/PosixShmSegment.cpp b/cachelib/shm/PosixShmSegment.cpp
index 027fee8bb8..1bdeec253d 100644
--- a/cachelib/shm/PosixShmSegment.cpp
+++ b/cachelib/shm/PosixShmSegment.cpp
@@ -21,6 +21,8 @@
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <numa.h>
+#include <numaif.h>
 
 #include "cachelib/common/Utils.h"
 
@@ -176,6 +178,7 @@ void* PosixShmSegment::mapAddress(void* addr) const {
     util::throwSystemError(EINVAL, "Address already mapped");
   }
   XDCHECK(retAddr == addr || addr == nullptr);
+  memBind(addr);
   return retAddr;
 }
 
@@ -183,6 +186,44 @@ void PosixShmSegment::unMap(void* addr) const {
   detail::munmapImpl(addr, getSize());
 }
 
+static void forcePageAllocation(void* addr, size_t size, size_t pageSize) {
+  for (volatile char* curAddr = (char*)addr; curAddr < (char*)addr + size;
+       curAddr += pageSize) {
+    *curAddr = *curAddr;
+  }
+}
+
+void PosixShmSegment::memBind(void* addr) const {
+  if (opts_.memBindNumaNodes.empty()) {
+    return;
+  }
+
+  struct bitmask* oldNodeMask = numa_allocate_nodemask();
+  int oldMode = 0;
+  struct bitmask* nodesMask = numa_allocate_nodemask();
+  auto guard = folly::makeGuard([&] {
+    numa_bitmask_free(nodesMask);
+    numa_bitmask_free(oldNodeMask);
+  });
+
+  for (auto node : opts_.memBindNumaNodes) {
+    numa_bitmask_setbit(nodesMask, node);
+  }
+
+  // mbind() cannot be used because mmap was called with the MAP_SHARED flag,
+  // but we can set the memory policy for the current thread and force page
+  // allocation. The following logic is used:
+  // 1. Remember the current memory policy of the current thread
+  // 2. Set the new memory policy as specified by the config
+  // 3. Force page allocation by touching every page in the segment
+  // 4. Restore the memory policy
+
+  // Remember the current memory policy
+  get_mempolicy(&oldMode, oldNodeMask->maskp, oldNodeMask->size, nullptr, 0);
+
+  // Set memory bindings
+  set_mempolicy(MPOL_BIND, nodesMask->maskp, nodesMask->size);
+
+  forcePageAllocation(addr, getSize(), detail::getPageSize(opts_.pageSize));
+
+  // Restore the memory policy for the thread
+  set_mempolicy(oldMode, oldNodeMask->maskp, oldNodeMask->size);
+}
+
 std::string PosixShmSegment::createKeyForName(
     const std::string& name) noexcept {
   // ensure that the slash is always there in the head. repetitive
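Because the POSIX path above binds pages by temporarily switching the calling thread's memory policy and touching each page, placement can be verified after the fact. An illustrative helper (not part of the patch) using get_mempolicy(2) with MPOL_F_NODE | MPOL_F_ADDR:

    #include <numaif.h>

    // Returns the NUMA node backing the (already faulted-in) page at addr,
    // or -1 if the lookup fails.
    int nodeOfPage(void* addr) {
      int node = -1;
      if (get_mempolicy(&node, nullptr, 0, addr,
                        MPOL_F_NODE | MPOL_F_ADDR) != 0) {
        return -1;
      }
      return node;
    }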
diff --git a/cachelib/shm/PosixShmSegment.h b/cachelib/shm/PosixShmSegment.h
index 6aaeb004e7..bf43b2ca55 100644
--- a/cachelib/shm/PosixShmSegment.h
+++ b/cachelib/shm/PosixShmSegment.h
@@ -108,6 +108,8 @@ class PosixShmSegment : public ShmBase {
   void createReferenceMapping();
   void deleteReferenceMapping() const;
 
+  void memBind(void* addr) const;
+
   // file descriptor associated with the shm. This has FD_CLOEXEC set
   // and once opened, we close this only on destruction of this object
   int fd_{kInvalidFD};
diff --git a/cachelib/shm/ShmCommon.h b/cachelib/shm/ShmCommon.h
index 0998f2f951..8ed5202b62 100644
--- a/cachelib/shm/ShmCommon.h
+++ b/cachelib/shm/ShmCommon.h
@@ -93,6 +93,7 @@ struct ShmSegmentOpts {
   PageSizeT pageSize{PageSizeT::NORMAL};
   bool readOnly{false};
   size_t alignment{1}; // alignment for mapping.
+  std::vector<size_t> memBindNumaNodes;
 
   // opts specific to segment type
   ShmTypeOpts typeOpts{PosixSysVSegmentOpts(false)};
diff --git a/cachelib/shm/SysVShmSegment.cpp b/cachelib/shm/SysVShmSegment.cpp
index e13d605aa5..8b13246ded 100644
--- a/cachelib/shm/SysVShmSegment.cpp
+++ b/cachelib/shm/SysVShmSegment.cpp
@@ -18,8 +18,11 @@
 #include <fcntl.h>
 #include <folly/logging/xlog.h>
+#include <folly/ScopeGuard.h>
 #include <sys/ipc.h>
 #include <sys/shm.h>
+#include <numa.h>
+#include <numaif.h>
 
 #include "cachelib/common/Utils.h"
 
@@ -184,6 +187,50 @@ void shmCtlImpl(int shmid, int cmd, shmid_ds* buf) {
   }
 }
 
+void mbindImpl(void* addr,
+               unsigned long len,
+               int mode,
+               const std::vector<size_t>& memBindNumaNodes,
+               unsigned int flags) {
+  struct bitmask* nodesMask = numa_allocate_nodemask();
+  auto guard = folly::makeGuard([&] { numa_bitmask_free(nodesMask); });
+
+  for (auto node : memBindNumaNodes) {
+    numa_bitmask_setbit(nodesMask, node);
+  }
+
+  long ret = mbind(addr, len, mode, nodesMask->maskp, nodesMask->size, flags);
+  if (ret == 0) {
+    return;
+  }
+
+  switch (errno) {
+  case EFAULT:
+    util::throwSystemError(errno);
+    break;
+  case EINVAL:
+    util::throwSystemError(
+        errno, "Invalid parameters when binding segment to NUMA node(s)");
+    break;
+  case EIO:
+    if (flags & MPOL_MF_STRICT) {
+      util::throwSystemError(errno,
+                             "Segment already allocated on another NUMA node "
+                             "that does not follow the policy.");
+    }
+    if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+      util::throwSystemError(errno,
+                             "Segment already allocated but kernel was unable "
+                             "to move it to the specified NUMA node(s).");
+    }
+    util::throwSystemError(errno, "Invalid errno");
+    break;
+  case ENOMEM:
+    util::throwSystemError(errno,
+                           "Could not bind memory. Insufficient kernel memory "
+                           "was available.");
+    break;
+  case EPERM:
+    if (flags & MPOL_MF_MOVE_ALL) {
+      util::throwSystemError(errno,
+                             "Process does not have the CAP_SYS_NICE privilege "
+                             "to bind segment with the MPOL_MF_MOVE_ALL flag");
+    }
+    util::throwSystemError(errno, "Invalid errno");
+    break;
+  default:
+    XDCHECK(false);
+    util::throwSystemError(errno, "Invalid errno");
+  }
+}
+
 } // namespace detail
 
 void ensureSizeforHugePage(size_t size) {
@@ -270,11 +317,17 @@ void* SysVShmSegment::mapAddress(void* addr) const {
   void* retAddr = detail::shmAttachImpl(shmid_, addr, shmFlags);
   XDCHECK(retAddr == addr || addr == nullptr);
+  memBind(retAddr);
   return retAddr;
 }
 
 void SysVShmSegment::unMap(void* addr) const { detail::shmDtImpl(addr); }
 
+void SysVShmSegment::memBind(void* addr) const {
+  if (opts_.memBindNumaNodes.empty()) {
+    return;
+  }
+  detail::mbindImpl(addr, getSize(), MPOL_BIND, opts_.memBindNumaNodes, 0);
+}
+
 void SysVShmSegment::markForRemoval() {
   if (isMarkedForRemoval()) {
     return;
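For contrast with the POSIX path, the SysV path above applies the policy directly to the attached mapping via mbind(2). A minimal standalone sketch of that call (node ID illustrative, error handling reduced to perror):

    #include <cstddef>
    #include <cstdio>
    #include <numa.h>
    #include <numaif.h>

    // Bind an existing mapping [addr, addr + len) to NUMA node 0. With no
    // MPOL_MF_* flags, only pages faulted in after this call are affected.
    void bindToNode0(void* addr, size_t len) {
      struct bitmask* mask = numa_allocate_nodemask();
      numa_bitmask_setbit(mask, 0);
      if (mbind(addr, len, MPOL_BIND, mask->maskp, mask->size, 0) != 0) {
        perror("mbind");
      }
      numa_bitmask_free(mask);
    }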
diff --git a/cachelib/shm/SysVShmSegment.h b/cachelib/shm/SysVShmSegment.h
index fcebe03eb1..5a57215508 100644
--- a/cachelib/shm/SysVShmSegment.h
+++ b/cachelib/shm/SysVShmSegment.h
@@ -100,6 +100,7 @@ class SysVShmSegment : public ShmBase {
   void lockPagesInMemory() const;
   void createReferenceMapping();
   void deleteReferenceMapping() const;
+  void memBind(void* addr) const;
 
   // the key identifier for the shared memory
   KeyType key_{kInvalidKey};
diff --git a/contrib/prerequisites-centos8.sh b/contrib/prerequisites-centos8.sh
index 7e6cfad1d8..26be9201b3 100755
--- a/contrib/prerequisites-centos8.sh
+++ b/contrib/prerequisites-centos8.sh
@@ -57,7 +57,8 @@ sudo dnf --enablerepo="$POWERTOOLS_REPO" install -y \
   libsodium-static \
   libdwarf-static \
   boost-static \
-  double-conversion-static
+  double-conversion-static \
+  numactl-devel
 
 #Do not install these from OS packages - they are typically outdated.
 #gflags-devel \
diff --git a/docker/images/install-cachelib-deps.sh b/docker/images/install-cachelib-deps.sh
index dd920d9064..6d8fbdef7b 100755
--- a/docker/images/install-cachelib-deps.sh
+++ b/docker/images/install-cachelib-deps.sh
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright 2022, Intel Corporation
 
-git clone https://github.com/pmem/CacheLib CacheLib
+git clone -b develop https://github.com/intel/CacheLib CacheLib
 
 ./CacheLib/contrib/prerequisites-centos8.sh
diff --git a/examples/multitier_cache/main.cpp b/examples/multitier_cache/main.cpp
index 28990c341f..800c0c7cfa 100644
--- a/examples/multitier_cache/main.cpp
+++ b/examples/multitier_cache/main.cpp
@@ -57,7 +57,7 @@ bool put(CacheKey key, const std::string& value) {
   if (!handle) {
     return false; // cache may fail to evict due to too many pending writes
   }
-  std::memcpy(handle->getWritableMemory(), value.data(), value.size());
+  std::memcpy(handle->getMemory(), value.data(), value.size());
   gCache_->insertOrReplace(handle);
   return true;
 }
diff --git a/run_tests.sh b/run_tests.sh
index 97fc7cda72..f7814f5edc 100755
--- a/run_tests.sh
+++ b/run_tests.sh
@@ -1,12 +1,7 @@
 #!/bin/bash
 
 # Newline separated list of tests to ignore
-BLACKLIST="allocator-test-AllocationClassTest
-allocator-test-AllocatorTypeTest
-allocator-test-NvmCacheTests
-allocator-test-NavySetupTest
-common-test-TimeTests
-common-test-UtilTests
+BLACKLIST="allocator-test-NavySetupTest
 shm-test-test_page_size"
 
 if [ "$1" == "long" ]; then