From 1e679820669d5552971dacf7346981bb2e2d3d42 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 17 Sep 2023 23:41:10 -0400 Subject: [PATCH 01/68] async flash start --- src/IStorage.h | 8 ++++++++ src/storage/rocksdb.cpp | 19 +++++++++++++++++++ src/storage/rocksdb.h | 4 ++++ 3 files changed, 31 insertions(+) diff --git a/src/IStorage.h b/src/IStorage.h index dc608d490..e23a061c9 100644 --- a/src/IStorage.h +++ b/src/IStorage.h @@ -1,9 +1,14 @@ #pragma once #include #include "sds.h" +#include "ae.h" #define METADATA_DB_IDENTIFIER "c299fde0-6d42-4ec4-b939-34f680ffe39f" +struct StorageToken { + virtual ~StorageToken() {} +}; + class IStorageFactory { public: @@ -35,6 +40,9 @@ class IStorage virtual bool enumerate_hashslot(callback fn, unsigned int hashslot) const = 0; virtual size_t count() const = 0; + virtual StorageToken *begin_retrieve(struct aeEventLoop *, aePostFunctionTokenProc, const char * /*key*/, size_t /*cchKey*/) {return nullptr;}; + virtual void complete_retrieve(StorageToken * /*tok*/, callbackSingle /*fn*/) {}; + virtual void bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem) { beginWriteBatch(); for (size_t ielem = 0; ielem < celem; ++ielem) { diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 76eaa133a..95ed94e58 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -276,4 +276,23 @@ bool RocksDBStorageProvider::FKeyExists(std::string& key) const if (m_spbatch) return m_spbatch->GetFromBatchAndDB(m_spdb.get(), ReadOptions(), m_spcolfamily.get(), rocksdb::Slice(key), &slice).ok(); return m_spdb->Get(ReadOptions(), m_spcolfamily.get(), rocksdb::Slice(key), &slice).ok(); +} + +struct RetrievalStorageToken : public StorageToken { + std::string key; +}; + +StorageToken *RocksDBStorageProvider::begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc callback, const char *key, size_t cchKey) { + RetrievalStorageToken *tok = new RetrievalStorageToken(); + tok->key = std::string(key, cchKey); + aePostFunction(el, callback, tok); + return tok; +} + +void RocksDBStorageProvider::complete_retrieve(StorageToken *tok, callbackSingle fn) { + rocksdb::PinnableSlice slice; + RetrievalStorageToken *rtok = static_cast(tok); + auto status = m_spdb->Get(ReadOptions(), m_spcolfamily.get(), rocksdb::Slice(rtok->key), &slice); + if (status.ok()) + fn(rtok->key.data(), rtok->key.size(), slice.data(), slice.size()); } \ No newline at end of file diff --git a/src/storage/rocksdb.h b/src/storage/rocksdb.h index b78788eb2..c8557a577 100644 --- a/src/storage/rocksdb.h +++ b/src/storage/rocksdb.h @@ -30,6 +30,10 @@ class RocksDBStorageProvider : public IStorage virtual void insert(const char *key, size_t cchKey, void *data, size_t cb, bool fOverwrite) override; virtual bool erase(const char *key, size_t cchKey) override; virtual void retrieve(const char *key, size_t cchKey, callbackSingle fn) const override; + + virtual StorageToken *begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc callback, const char *key, size_t cchKey); + virtual void complete_retrieve(StorageToken *tok, callbackSingle fn); + virtual size_t clear() override; virtual bool enumerate(callback fn) const override; virtual bool enumerate_hashslot(callback fn, unsigned int hashslot) const override; From e95c8b4ebc279bf4e6c75b4fb965828fffad7832 Mon Sep 17 00:00:00 2001 From: John Sully Date: Sun, 24 Sep 2023 21:10:48 -0400 Subject: [PATCH 02/68] Initial async impl --- src/IStorage.h | 2 + src/StorageCache.cpp | 18 ++++++ src/StorageCache.h | 2 + 
src/blocked.cpp | 2 + src/config.cpp | 2 +- src/db.cpp | 94 +++++++++++----------------- src/networking.cpp | 2 +- src/server.h | 5 +- src/storage/rocksdb.cpp | 22 +++++-- src/storage/rocksdbfactor_internal.h | 5 +- src/storage/rocksdbfactory.cpp | 8 +-- src/storage/rocksdbfactory.h | 2 +- 12 files changed, 93 insertions(+), 71 deletions(-) diff --git a/src/IStorage.h b/src/IStorage.h index e23a061c9..5301884ee 100644 --- a/src/IStorage.h +++ b/src/IStorage.h @@ -6,6 +6,8 @@ #define METADATA_DB_IDENTIFIER "c299fde0-6d42-4ec4-b939-34f680ffe39f" struct StorageToken { + struct client *c; + struct redisDbPersistentData *db; virtual ~StorageToken() {} }; diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index 91d4b3657..2445e7259 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -188,6 +188,24 @@ void StorageCache::retrieve(sds key, IStorage::callbackSingle fn) const m_spstorage->retrieve(key, sdslen(key), fn); } +StorageToken *StorageCache::begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc proc, sds key) { + std::unique_lock ul(m_lock); + if (m_pdict != nullptr) + { + uint64_t hash = dictSdsHash(key); + dictEntry *de = dictFind(m_pdict, reinterpret_cast(hash)); + + if (de == nullptr) + return nullptr; // Not found + } + ul.unlock(); + return m_spstorage->begin_retrieve(el, proc, key, sdslen(key)); +} + +void StorageCache::complete_retrieve(StorageToken *tok, IStorage::callbackSingle fn) { + m_spstorage->complete_retrieve(tok, fn); +} + size_t StorageCache::count() const { std::unique_lock ul(m_lock, std::defer_lock); diff --git a/src/StorageCache.h b/src/StorageCache.h index 4f3c1a374..b9996acda 100644 --- a/src/StorageCache.h +++ b/src/StorageCache.h @@ -43,6 +43,8 @@ class StorageCache void insert(sds key, const void *data, size_t cbdata, bool fOverwrite); void bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem); void retrieve(sds key, IStorage::callbackSingle fn) const; + StorageToken *begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc proc, sds key); + void complete_retrieve(StorageToken *tok, IStorage::callbackSingle fn); bool erase(sds key); void emergencyFreeCache(); bool keycacheIsEnabled() const { return m_pdict != nullptr; } diff --git a/src/blocked.cpp b/src/blocked.cpp index 56df022ea..5f098706f 100644 --- a/src/blocked.cpp +++ b/src/blocked.cpp @@ -207,6 +207,8 @@ void unblockClient(client *c) { } else if (c->btype == BLOCKED_PAUSE) { listDelNode(g_pserver->paused_clients,c->paused_list_node); c->paused_list_node = NULL; + } else if (c->btype == BLOCKED_STORAGE) { + serverTL->vecclientsProcess.push_back(c); } else { serverPanic("Unknown btype in unblockClient()."); } diff --git a/src/config.cpp b/src/config.cpp index 574cf89cb..a8217e7bc 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -361,7 +361,7 @@ bool initializeStorageProvider(const char **err) // Create The Storage Factory (if necessary) serverLog(LL_NOTICE, "Initializing FLASH storage provider (this may take a long time)"); adjustOpenFilesLimit(); - g_pserver->m_pstorageFactory = CreateRocksDBStorageFactory(g_sdsArgs, cserver.dbnum, cserver.storage_conf, cserver.storage_conf ? strlen(cserver.storage_conf) : 0); + g_pserver->m_pstorageFactory = CreateRocksDBStorageFactory(g_sdsArgs, cserver.dbnum, cserver.storage_conf, cserver.storage_conf ? 
strlen(cserver.storage_conf) : 0, &g_pserver->asyncworkqueue); #else serverLog(LL_WARNING, "To use the flash storage provider please compile KeyDB with ENABLE_FLASH=yes"); serverLog(LL_WARNING, "Exiting due to the use of an unsupported storage provider"); diff --git a/src/db.cpp b/src/db.cpp index 297910b9e..7185e72cb 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3383,76 +3383,58 @@ void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command if (this->find_cached_threadsafe(szFromObj(objKey)) == nullptr) veckeys.push_back(objKey); } - lock.disarm(); getKeysFreeResult(&result); std::vector>> vecInserts; for (robj *objKey : veckeys) { - sds sharedKey = sdsdupshared((sds)szFromObj(objKey)); std::unique_ptr spexpire; - robj *o = nullptr; - m_spstorage->retrieve((sds)szFromObj(objKey), [&](const char *, size_t, const void *data, size_t cb){ - size_t offset = 0; - spexpire = deserializeExpire(sharedKey, (const char*)data, cb, &offset); - o = deserializeStoredObject(this, sharedKey, reinterpret_cast(data) + offset, cb - offset); - serverAssert(o != nullptr); - }); - - if (o != nullptr) { - vecInserts.emplace_back(sharedKey, o, std::move(spexpire)); - } else if (sharedKey != nullptr) { - sdsfree(sharedKey); + auto *tok = m_spstorage->begin_retrieve(serverTL->el, storageLoadCallback, (sds)szFromObj(objKey)); + if (tok != nullptr) { + tok->c = c; + tok->db = this; + blockClient(c, BLOCKED_STORAGE); } } - if (!vecInserts.empty()) { - lock.arm(c); - for (auto &tuple : vecInserts) - { - sds sharedKey = std::get<0>(tuple); - robj *o = std::get<1>(tuple); - std::unique_ptr spexpire = std::move(std::get<2>(tuple)); + return; +} - if (o != nullptr) - { - if (this->find_cached_threadsafe(sharedKey) != nullptr) - { - // While unlocked this was already ensured - decrRefCount(o); - sdsfree(sharedKey); - } - else - { - if (spexpire != nullptr) { - if (spexpire->when() < mstime()) { - break; - } - } - dictAdd(m_pdict, sharedKey, o); - o->SetFExpires(spexpire != nullptr); +/*static*/ void redisDbPersistentData::storageLoadCallback(aeEventLoop *el, StorageToken *tok) { + tok->db->m_spstorage->complete_retrieve(tok, [&](const char *szKey, size_t cbKey, const void *data, size_t cb) { + auto *db = tok->db; + size_t offset = 0; + sds key = sdsnewlen(szKey, -((ssize_t)cbKey)); + auto spexpire = deserializeExpire(key, (const char*)data, cb, &offset); + robj *o = deserializeStoredObject(db, key, reinterpret_cast(data) + offset, cb - offset); + serverAssert(o != nullptr); - std::unique_lock ul(g_expireLock); - if (spexpire != nullptr) - { - auto itr = m_setexpire->find(sharedKey); - if (itr != m_setexpire->end()) - m_setexpire->erase(itr); - m_setexpire->insert(std::move(*spexpire)); - serverAssert(m_setexpire->find(sharedKey) != m_setexpire->end()); - } - serverAssert(o->FExpires() == (m_setexpire->find(sharedKey) != m_setexpire->end())); + if (db->find_cached_threadsafe(key) != nullptr) { + LUnneeded: + // While unlocked this was already ensured + decrRefCount(o); + sdsfree(key); + } else { + if (spexpire != nullptr) { + if (spexpire->when() < mstime()) { + goto LUnneeded; } } - else - { - if (sharedKey != nullptr) - sdsfree(sharedKey); // BUG but don't bother crashing + dictAdd(db->m_pdict, key, o); + o->SetFExpires(spexpire != nullptr); + + std::unique_lock ul(g_expireLock); + if (spexpire != nullptr) { + auto itr = db->m_setexpire->find(key); + if (itr != db->m_setexpire->end()) + db->m_setexpire->erase(itr); + db->m_setexpire->insert(std::move(*spexpire)); + 
serverAssert(db->m_setexpire->find(key) != db->m_setexpire->end()); } + serverAssert(o->FExpires() == (db->m_setexpire->find(key) != db->m_setexpire->end())); } - lock.disarm(); - } - - return; + }); + std::unique_lock ul(tok->c->lock); + unblockClient(tok->c); } diff --git a/src/networking.cpp b/src/networking.cpp index 1728b5f29..76d70684a 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2774,7 +2774,7 @@ void readQueryFromClient(connection *conn) { processInputBuffer(c, false, CMD_CALL_SLOWLOG | CMD_CALL_STATS | CMD_CALL_ASYNC); } } - if (!c->vecqueuedcmd.empty()) + if (!c->vecqueuedcmd.empty() && !(c->flags & CLIENT_BLOCKED)) serverTL->vecclientsProcess.push_back(c); } else { // If we're single threaded its actually better to just process the command here while the query is hot in the cache diff --git a/src/server.h b/src/server.h index 9ad1aab8d..39d1cfeb8 100644 --- a/src/server.h +++ b/src/server.h @@ -557,7 +557,8 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; #define BLOCKED_ZSET 5 /* BZPOP et al. */ #define BLOCKED_PAUSE 6 /* Blocked by CLIENT PAUSE */ #define BLOCKED_ASYNC 7 -#define BLOCKED_NUM 8 /* Number of blocked states. */ +#define BLOCKED_STORAGE 8 +#define BLOCKED_NUM 9 /* Number of blocked states. */ /* Client request types */ #define PROTO_REQ_INLINE 1 @@ -1218,6 +1219,8 @@ class redisDbPersistentData dict_iter find_cached_threadsafe(const char *key) const; + static void storageLoadCallback(struct aeEventLoop *el, struct StorageToken *token); + protected: uint64_t m_mvccCheckpoint = 0; diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 95ed94e58..1456834ef 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -280,19 +280,29 @@ bool RocksDBStorageProvider::FKeyExists(std::string& key) const struct RetrievalStorageToken : public StorageToken { std::string key; + std::vector data; + bool fFound = false; }; StorageToken *RocksDBStorageProvider::begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc callback, const char *key, size_t cchKey) { RetrievalStorageToken *tok = new RetrievalStorageToken(); tok->key = std::string(key, cchKey); - aePostFunction(el, callback, tok); + (*m_pfactory->m_wqueue)->AddWorkFunction([this, el, callback, tok]{ + rocksdb::PinnableSlice slice; + auto status = m_spdb->Get(ReadOptions(), m_spcolfamily.get(), rocksdb::Slice(prefixKey(tok->key.data(), tok->key.size())), &slice); + if (status.ok()) { + tok->data.resize(slice.size()); + memcpy(tok->data.data(), slice.data(), slice.size()); + tok->fFound = true; + } + aePostFunction(el, callback, tok); + }); return tok; } void RocksDBStorageProvider::complete_retrieve(StorageToken *tok, callbackSingle fn) { - rocksdb::PinnableSlice slice; - RetrievalStorageToken *rtok = static_cast(tok); - auto status = m_spdb->Get(ReadOptions(), m_spcolfamily.get(), rocksdb::Slice(rtok->key), &slice); - if (status.ok()) - fn(rtok->key.data(), rtok->key.size(), slice.data(), slice.size()); + RetrievalStorageToken *rtok = reinterpret_cast(tok); + if (rtok->fFound) + fn(rtok->key.data(), rtok->key.size(), rtok->data.data(), rtok->data.size()); + delete rtok; } \ No newline at end of file diff --git a/src/storage/rocksdbfactor_internal.h b/src/storage/rocksdbfactor_internal.h index dc27f6987..addff77ce 100644 --- a/src/storage/rocksdbfactor_internal.h +++ b/src/storage/rocksdbfactor_internal.h @@ -1,5 +1,6 @@ #pragma once #include "rocksdb.h" +#include "../AsyncWorkQueue.h" class RocksDBStorageFactory : public IStorageFactory { @@ -10,7 +11,9 @@ 
class RocksDBStorageFactory : public IStorageFactory bool m_fCreatedTempFolder = false; public: - RocksDBStorageFactory(const char *dbfile, int dbnum, const char *rgchConfig, size_t cchConfig); + AsyncWorkQueue **m_wqueue; + + RocksDBStorageFactory(const char *dbfile, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **wqueue); ~RocksDBStorageFactory(); virtual IStorage *create(int db, key_load_iterator iter, void *privdata) override; diff --git a/src/storage/rocksdbfactory.cpp b/src/storage/rocksdbfactory.cpp index 5c3beeb4b..0ebfb93e1 100644 --- a/src/storage/rocksdbfactory.cpp +++ b/src/storage/rocksdbfactory.cpp @@ -35,9 +35,9 @@ rocksdb::Options DefaultRocksDBOptions() { return options; } -IStorageFactory *CreateRocksDBStorageFactory(const char *path, int dbnum, const char *rgchConfig, size_t cchConfig) +IStorageFactory *CreateRocksDBStorageFactory(const char *path, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **wqueue) { - return new RocksDBStorageFactory(path, dbnum, rgchConfig, cchConfig); + return new RocksDBStorageFactory(path, dbnum, rgchConfig, cchConfig, wqueue); } rocksdb::Options RocksDBStorageFactory::RocksDbOptions() @@ -52,8 +52,8 @@ rocksdb::Options RocksDBStorageFactory::RocksDbOptions() return options; } -RocksDBStorageFactory::RocksDBStorageFactory(const char *dbfile, int dbnum, const char *rgchConfig, size_t cchConfig) - : m_path(dbfile) +RocksDBStorageFactory::RocksDBStorageFactory(const char *dbfile, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **wqueue) + : m_path(dbfile), m_wqueue(wqueue) { dbnum++; // create an extra db for metadata // Get the count of column families in the actual database diff --git a/src/storage/rocksdbfactory.h b/src/storage/rocksdbfactory.h index d60f9fcf6..c137a79e4 100644 --- a/src/storage/rocksdbfactory.h +++ b/src/storage/rocksdbfactory.h @@ -1,3 +1,3 @@ #pragma once -class IStorageFactory *CreateRocksDBStorageFactory(const char *path, int dbnum, const char *rgchConfig, size_t cchConfig); \ No newline at end of file +class IStorageFactory *CreateRocksDBStorageFactory(const char *path, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **wqueue); \ No newline at end of file From f0e8c6f14acae5a360b94067c81afcccbb323efe Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 25 Sep 2023 01:07:20 -0400 Subject: [PATCH 03/68] Use MultiGet --- src/AsyncWorkQueue.cpp | 2 +- src/IStorage.h | 5 ++- src/StorageCache.cpp | 6 ++- src/StorageCache.h | 2 +- src/ae.cpp | 2 +- src/blocked.cpp | 2 +- src/db.cpp | 88 +++++++++++++++++++++++++---------------- src/networking.cpp | 18 ++++++--- src/server.cpp | 16 +++++++- src/server.h | 8 +++- src/storage/rocksdb.cpp | 49 +++++++++++++++++------ src/storage/rocksdb.h | 2 +- 12 files changed, 137 insertions(+), 63 deletions(-) diff --git a/src/AsyncWorkQueue.cpp b/src/AsyncWorkQueue.cpp index 8d04b742a..77f5e9ea4 100644 --- a/src/AsyncWorkQueue.cpp +++ b/src/AsyncWorkQueue.cpp @@ -56,7 +56,7 @@ void AsyncWorkQueue::WorkerThreadMain() listRelease(vars.clients_pending_asyncwrite); std::unique_lock lockf(serverTL->lockPendingWrite); - serverTL->vecclientsProcess.clear(); + serverTL->setclientsProcess.clear(); serverTL->clients_pending_write.clear(); std::atomic_thread_fence(std::memory_order_seq_cst); } diff --git a/src/IStorage.h b/src/IStorage.h index 5301884ee..79726daf5 100644 --- a/src/IStorage.h +++ b/src/IStorage.h @@ -1,12 +1,13 @@ #pragma once #include +#include #include "sds.h" #include "ae.h" #define 
METADATA_DB_IDENTIFIER "c299fde0-6d42-4ec4-b939-34f680ffe39f" struct StorageToken { - struct client *c; + std::unordered_set setc; struct redisDbPersistentData *db; virtual ~StorageToken() {} }; @@ -42,7 +43,7 @@ class IStorage virtual bool enumerate_hashslot(callback fn, unsigned int hashslot) const = 0; virtual size_t count() const = 0; - virtual StorageToken *begin_retrieve(struct aeEventLoop *, aePostFunctionTokenProc, const char * /*key*/, size_t /*cchKey*/) {return nullptr;}; + virtual StorageToken *begin_retrieve(struct aeEventLoop *, aePostFunctionTokenProc, sds *, size_t) {return nullptr;}; virtual void complete_retrieve(StorageToken * /*tok*/, callbackSingle /*fn*/) {}; virtual void bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem) { diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index 2445e7259..c0ff305b1 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -188,7 +188,8 @@ void StorageCache::retrieve(sds key, IStorage::callbackSingle fn) const m_spstorage->retrieve(key, sdslen(key), fn); } -StorageToken *StorageCache::begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc proc, sds key) { +StorageToken *StorageCache::begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc proc, sds *rgkey, size_t ckey) { +#if 0 std::unique_lock ul(m_lock); if (m_pdict != nullptr) { @@ -199,7 +200,8 @@ StorageToken *StorageCache::begin_retrieve(struct aeEventLoop *el, aePostFunctio return nullptr; // Not found } ul.unlock(); - return m_spstorage->begin_retrieve(el, proc, key, sdslen(key)); +#endif + return m_spstorage->begin_retrieve(el, proc, rgkey, ckey); } void StorageCache::complete_retrieve(StorageToken *tok, IStorage::callbackSingle fn) { diff --git a/src/StorageCache.h b/src/StorageCache.h index b9996acda..614f8c27b 100644 --- a/src/StorageCache.h +++ b/src/StorageCache.h @@ -43,7 +43,7 @@ class StorageCache void insert(sds key, const void *data, size_t cbdata, bool fOverwrite); void bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem); void retrieve(sds key, IStorage::callbackSingle fn) const; - StorageToken *begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc proc, sds key); + StorageToken *begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc proc, sds *rgkey, size_t ckey); void complete_retrieve(StorageToken *tok, IStorage::callbackSingle fn); bool erase(sds key); void emergencyFreeCache(); diff --git a/src/ae.cpp b/src/ae.cpp index a43969fe0..2b3b854cd 100644 --- a/src/ae.cpp +++ b/src/ae.cpp @@ -278,7 +278,7 @@ int aePostFunction(aeEventLoop *eventLoop, aePostFunctionTokenProc *proc, Storag cmd.op = AE_ASYNC_OP::PostAsynDBFunction; cmd.tproc = proc; cmd.clientData = (void*)token; - cmd.fLock = true; + cmd.fLock = false; auto size = write(eventLoop->fdCmdWrite, &cmd, sizeof(cmd)); if (size != sizeof(cmd)) return AE_ERR; diff --git a/src/blocked.cpp b/src/blocked.cpp index 5f098706f..8062b8d54 100644 --- a/src/blocked.cpp +++ b/src/blocked.cpp @@ -208,7 +208,7 @@ void unblockClient(client *c) { listDelNode(g_pserver->paused_clients,c->paused_list_node); c->paused_list_node = NULL; } else if (c->btype == BLOCKED_STORAGE) { - serverTL->vecclientsProcess.push_back(c); + serverTL->setclientsProcess.insert(c); } else { serverPanic("Unknown btype in unblockClient()."); } diff --git a/src/db.cpp b/src/db.cpp index 7185e72cb..6e0f127e8 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3363,45 +3363,60 @@ void redisDbPersistentData::prefetchKeysAsync(client *c, 
parsed_command &command #endif return; } +} - AeLocker lock; +void redisDbPersistentData::prefetchKeysFlash(std::unordered_set &setc) +{ + serverAssert(GlobalLocksAcquired()); + std::vector veckeys; + std::unordered_set setcBlocked; + + for (client *c : setc) { + for (auto &command : c->vecqueuedcmd) { + getKeysResult result = GETKEYS_RESULT_INIT; + auto cmd = lookupCommand(szFromObj(command.argv[0])); + if (cmd == nullptr) + return; // Bad command? It's not for us to judge, just bail + + if (command.argc < std::abs(cmd->arity)) + return; // Invalid number of args + + int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result); + bool fQueued = false; + for (int ikey = 0; ikey < numkeys; ++ikey) + { + robj *objKey = command.argv[result.keys[ikey]]; + if (this->find_cached_threadsafe(szFromObj(objKey)) == nullptr) { + veckeys.push_back(szFromObj(objKey)); + fQueued = true; + } + } - std::vector veckeys; - lock.arm(c); - getKeysResult result = GETKEYS_RESULT_INIT; - auto cmd = lookupCommand(szFromObj(command.argv[0])); - if (cmd == nullptr) - return; // Bad command? It's not for us to judge, just bail - - if (command.argc < std::abs(cmd->arity)) - return; // Invalid number of args - - int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result); - for (int ikey = 0; ikey < numkeys; ++ikey) - { - robj *objKey = command.argv[result.keys[ikey]]; - if (this->find_cached_threadsafe(szFromObj(objKey)) == nullptr) - veckeys.push_back(objKey); + if (fQueued) { + setcBlocked.insert(c); + } + getKeysFreeResult(&result); + } } - getKeysFreeResult(&result); - - std::vector>> vecInserts; - for (robj *objKey : veckeys) - { - std::unique_ptr spexpire; - auto *tok = m_spstorage->begin_retrieve(serverTL->el, storageLoadCallback, (sds)szFromObj(objKey)); - if (tok != nullptr) { - tok->c = c; - tok->db = this; - blockClient(c, BLOCKED_STORAGE); + auto *tok = m_spstorage->begin_retrieve(serverTL->el, storageLoadCallback, veckeys.data(), veckeys.size()); + if (tok != nullptr) { + for (client *c : setcBlocked) { + if (!(c->flags & CLIENT_BLOCKED)) + blockClient(c, BLOCKED_STORAGE); } + tok->setc = std::move(setcBlocked); + tok->db = this; } - return; } -/*static*/ void redisDbPersistentData::storageLoadCallback(aeEventLoop *el, StorageToken *tok) { +/*static*/ void redisDbPersistentData::storageLoadCallback(aeEventLoop *, StorageToken *tok) { + serverTL->setStorageTokensProcess.insert(tok); +} + +void redisDbPersistentData::processStorageToken(StorageToken *tok) { + auto setc = std::move(tok->setc); tok->db->m_spstorage->complete_retrieve(tok, [&](const char *szKey, size_t cbKey, const void *data, size_t cb) { auto *db = tok->db; size_t offset = 0; @@ -3435,6 +3450,13 @@ void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command serverAssert(o->FExpires() == (db->m_setexpire->find(key) != db->m_setexpire->end())); } }); - std::unique_lock ul(tok->c->lock); - unblockClient(tok->c); -} + tok = nullptr; // Invalid past this point + + for (client *c : setc) { + std::unique_lock ul(c->lock); + if (c->flags & CLIENT_BLOCKED) + unblockClient(c); + else + serverTL->setclientsProcess.insert(c); + } +} \ No newline at end of file diff --git a/src/networking.cpp b/src/networking.cpp index 76d70684a..2c047ae7c 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1593,7 +1593,8 @@ void unlinkClient(client *c) { c->fPendingAsyncWrite = FALSE; } - serverTL->vecclientsProcess.erase(std::remove(serverTL->vecclientsProcess.begin(), serverTL->vecclientsProcess.end(), c), 
serverTL->vecclientsProcess.end()); + serverTL->setclientsProcess.erase(c); + serverTL->setclientsPrefetch.erase(c); /* Clear the tracking status. */ if (c->flags & CLIENT_TRACKING) disableTracking(c); @@ -2774,8 +2775,13 @@ void readQueryFromClient(connection *conn) { processInputBuffer(c, false, CMD_CALL_SLOWLOG | CMD_CALL_STATS | CMD_CALL_ASYNC); } } - if (!c->vecqueuedcmd.empty() && !(c->flags & CLIENT_BLOCKED)) - serverTL->vecclientsProcess.push_back(c); + if (!c->vecqueuedcmd.empty()) { + if (g_pserver->m_pstorageFactory != nullptr && g_pserver->prefetch_enabled) { + serverTL->setclientsPrefetch.insert(c); + } else { + serverTL->setclientsProcess.insert(c); + } + } } else { // If we're single threaded its actually better to just process the command here while the query is hot in the cache // multithreaded lock contention dominates and batching is better @@ -2790,9 +2796,9 @@ void processClients() serverAssert(GlobalLocksAcquired()); // Note that this function is reentrant and vecclients may be modified by code called from processInputBuffer - while (!serverTL->vecclientsProcess.empty()) { - client *c = serverTL->vecclientsProcess.front(); - serverTL->vecclientsProcess.erase(serverTL->vecclientsProcess.begin()); + while (!serverTL->setclientsProcess.empty()) { + client *c = *serverTL->setclientsProcess.begin(); + serverTL->setclientsProcess.erase(serverTL->setclientsProcess.begin()); /* There is more data in the client input buffer, continue parsing it * in case to check if there is a full command to execute. */ diff --git a/src/server.cpp b/src/server.cpp index b8dbbf5a8..0bbf2dec4 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2840,6 +2840,20 @@ void beforeSleep(struct aeEventLoop *eventLoop) { locker.arm(); + for (auto *tok : serverTL->setStorageTokensProcess) { + tok->db->processStorageToken(tok); + } + serverTL->setStorageTokensProcess.clear(); + + if (g_pserver->m_pstorageFactory != nullptr && !serverTL->setclientsPrefetch.empty()) { + g_pserver->db[0]->prefetchKeysFlash(serverTL->setclientsPrefetch); + for (client *c : serverTL->setclientsPrefetch) { + if (!(c->flags & CLIENT_BLOCKED)) + serverTL->setclientsProcess.insert(c); + } + serverTL->setclientsPrefetch.clear(); + } + /* end any snapshots created by fast async commands */ for (int idb = 0; idb < cserver.dbnum; ++idb) { if (serverTL->rgdbSnapshot[idb] != nullptr && serverTL->rgdbSnapshot[idb]->FStale()) { @@ -4146,7 +4160,7 @@ void InitServerLast() { set_jemalloc_bg_thread(cserver.jemalloc_bg_thread); g_pserver->initial_memory_usage = zmalloc_used_memory(); - g_pserver->asyncworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(cserver.cthreads); + g_pserver->asyncworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(cserver.cthreads*10); // Allocate the repl backlog diff --git a/src/server.h b/src/server.h index 39d1cfeb8..ef21fd930 100644 --- a/src/server.h +++ b/src/server.h @@ -1210,6 +1210,8 @@ class redisDbPersistentData bool keycacheIsEnabled(); void prefetchKeysAsync(client *c, struct parsed_command &command); + void prefetchKeysFlash(std::unordered_set &setc); + void processStorageToken(StorageToken *tok); bool FSnapshot() const { return m_spdbSnapshotHOLDER != nullptr; } @@ -1374,6 +1376,8 @@ struct redisDb : public redisDbPersistentDataSnapshot using redisDbPersistentData::dictUnsafeKeyOnly; using redisDbPersistentData::resortExpire; using redisDbPersistentData::prefetchKeysAsync; + using redisDbPersistentData::prefetchKeysFlash; + using redisDbPersistentData::processStorageToken; using 
redisDbPersistentData::prepOverwriteForSnapshot; using redisDbPersistentData::FRehashing; using redisDbPersistentData::FTrackingChanges; @@ -2212,7 +2216,9 @@ struct redisServerThreadVars { int propagate_in_transaction = 0; /* Make sure we don't propagate nested MULTI/EXEC */ int client_pause_in_transaction = 0; /* Was a client pause executed during this Exec? */ - std::vector vecclientsProcess; + std::unordered_set setclientsProcess; + std::unordered_set setclientsPrefetch; + std::unordered_set setStorageTokensProcess; dictAsyncRehashCtl *rehashCtl = nullptr; int getRdbKeySaveDelay(); diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 1456834ef..ef2f14abb 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -279,21 +279,41 @@ bool RocksDBStorageProvider::FKeyExists(std::string& key) const } struct RetrievalStorageToken : public StorageToken { - std::string key; - std::vector data; - bool fFound = false; + std::unordered_map> mapkeydata; }; -StorageToken *RocksDBStorageProvider::begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc callback, const char *key, size_t cchKey) { +StorageToken *RocksDBStorageProvider::begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc callback, sds *rgkey, size_t ckey) { RetrievalStorageToken *tok = new RetrievalStorageToken(); - tok->key = std::string(key, cchKey); + + for (size_t ikey = 0; ikey < ckey; ++ikey) { + tok->mapkeydata.insert(std::make_pair(std::string(rgkey[ikey], sdslen(rgkey[ikey])), nullptr)); + } + (*m_pfactory->m_wqueue)->AddWorkFunction([this, el, callback, tok]{ - rocksdb::PinnableSlice slice; - auto status = m_spdb->Get(ReadOptions(), m_spcolfamily.get(), rocksdb::Slice(prefixKey(tok->key.data(), tok->key.size())), &slice); - if (status.ok()) { - tok->data.resize(slice.size()); - memcpy(tok->data.data(), slice.data(), slice.size()); - tok->fFound = true; + std::vector veckeysStr; + std::vector veckeys; + std::vector vecvals; + std::vector vecstatus; + veckeys.reserve(tok->mapkeydata.size()); + veckeysStr.reserve(tok->mapkeydata.size()); + vecvals.resize(tok->mapkeydata.size()); + vecstatus.resize(tok->mapkeydata.size()); + for (auto &pair : tok->mapkeydata) { + veckeysStr.emplace_back(prefixKey(pair.first.data(), pair.first.size())); + veckeys.emplace_back(veckeysStr.back().data(), veckeysStr.back().size()); + } + + m_spdb->MultiGet(ReadOptions(), + m_spcolfamily.get(), + veckeys.size(), const_cast(veckeys.data()), + vecvals.data(), vecstatus.data()); + + auto itrDst = tok->mapkeydata.begin(); + for (size_t ires = 0; ires < veckeys.size(); ++ires) { + if (vecstatus[ires].ok()) { + itrDst->second = std::make_unique(std::move(vecvals[ires])); + } + ++itrDst; } aePostFunction(el, callback, tok); }); @@ -302,7 +322,10 @@ StorageToken *RocksDBStorageProvider::begin_retrieve(struct aeEventLoop *el, aeP void RocksDBStorageProvider::complete_retrieve(StorageToken *tok, callbackSingle fn) { RetrievalStorageToken *rtok = reinterpret_cast(tok); - if (rtok->fFound) - fn(rtok->key.data(), rtok->key.size(), rtok->data.data(), rtok->data.size()); + for (auto &pair : rtok->mapkeydata) { + if (pair.second != nullptr) { + fn(pair.first.data(), pair.first.size(), pair.second->data(), pair.second->size()); + } + } delete rtok; } \ No newline at end of file diff --git a/src/storage/rocksdb.h b/src/storage/rocksdb.h index c8557a577..01edb4975 100644 --- a/src/storage/rocksdb.h +++ b/src/storage/rocksdb.h @@ -31,7 +31,7 @@ class RocksDBStorageProvider : public IStorage virtual bool erase(const char *key, 
size_t cchKey) override; virtual void retrieve(const char *key, size_t cchKey, callbackSingle fn) const override; - virtual StorageToken *begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc callback, const char *key, size_t cchKey); + virtual StorageToken *begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc callback, sds *rgkey, size_t ckey); virtual void complete_retrieve(StorageToken *tok, callbackSingle fn); virtual size_t clear() override; From cdc5e8fa91c5e863115844f93b7f8f9751f29eb7 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 25 Sep 2023 06:03:47 +0000 Subject: [PATCH 04/68] Remove a global lock --- src/db.cpp | 2 +- src/server.cpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 6e0f127e8..c4bcb2870 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3181,7 +3181,7 @@ void redisDbPersistentData::trackChanges(bool fBulk, size_t sizeHint) if (fBulk) m_fAllChanged.fetch_add(1, std::memory_order_acq_rel); - if (sizeHint > 0) + if (sizeHint > 0 && aeThreadOwnsLock()) dictExpand(m_dictChanged, sizeHint, false); } diff --git a/src/server.cpp b/src/server.cpp index 0bbf2dec4..4792d0fd9 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -3068,10 +3068,8 @@ void afterSleep(struct aeEventLoop *eventLoop) { serverAssert(serverTL->gcEpoch.isReset()); serverTL->gcEpoch = g_pserver->garbageCollector.startEpoch(); - aeAcquireLock(); for (int idb = 0; idb < cserver.dbnum; ++idb) g_pserver->db[idb]->trackChanges(false); - aeReleaseLock(); serverTL->disable_async_commands = false; } From 71b92e77d56333d89464f1a601b5e76ff465ba35 Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 25 Sep 2023 06:47:27 +0000 Subject: [PATCH 05/68] Avoid 0 size batches --- src/db.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/db.cpp b/src/db.cpp index c4bcb2870..09c7b665c 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3399,6 +3399,9 @@ void redisDbPersistentData::prefetchKeysFlash(std::unordered_set &setc) } } + if (veckeys.empty()) + return; + auto *tok = m_spstorage->begin_retrieve(serverTL->el, storageLoadCallback, veckeys.data(), veckeys.size()); if (tok != nullptr) { for (client *c : setcBlocked) { From b0935de746014490b0cc654e6c89528a5a66a49e Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 26 Sep 2023 03:51:49 +0000 Subject: [PATCH 06/68] More fixes --- src/db.cpp | 4 ++-- src/storage/rocksdb.cpp | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 09c7b665c..013473163 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3376,10 +3376,10 @@ void redisDbPersistentData::prefetchKeysFlash(std::unordered_set &setc) getKeysResult result = GETKEYS_RESULT_INIT; auto cmd = lookupCommand(szFromObj(command.argv[0])); if (cmd == nullptr) - return; // Bad command? It's not for us to judge, just bail + break; // Bad command? 
It's not for us to judge, just bail if (command.argc < std::abs(cmd->arity)) - return; // Invalid number of args + break; // Invalid number of args int numkeys = getKeysFromCommand(cmd, command.argv, command.argc, &result); bool fQueued = false; diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index ef2f14abb..16aed43b7 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -288,8 +288,10 @@ StorageToken *RocksDBStorageProvider::begin_retrieve(struct aeEventLoop *el, aeP for (size_t ikey = 0; ikey < ckey; ++ikey) { tok->mapkeydata.insert(std::make_pair(std::string(rgkey[ikey], sdslen(rgkey[ikey])), nullptr)); } - - (*m_pfactory->m_wqueue)->AddWorkFunction([this, el, callback, tok]{ + + auto opts = ReadOptions(); + opts.async_io = true; + (*m_pfactory->m_wqueue)->AddWorkFunction([this, el, callback, tok, opts]{ std::vector veckeysStr; std::vector veckeys; std::vector vecvals; From 7e745ef9c5b79c3ec9be814c5bba2679b6f8d0fd Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 7 Nov 2023 17:38:13 +0000 Subject: [PATCH 07/68] Inform the kernel when we won't need the disk reserve --- src/replication.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index 8a032182d..960cd1929 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -308,10 +308,14 @@ void resizeReplicationBacklog(long long newsize) { auto cbActiveBacklog = cbPhase1 + g_pserver->repl_backlog_idx; serverAssert(g_pserver->repl_backlog_histlen == cbActiveBacklog); } - if (g_pserver->repl_backlog != g_pserver->repl_backlog_disk) + if (g_pserver->repl_backlog != g_pserver->repl_backlog_disk) { zfree(g_pserver->repl_backlog); - else + } else { serverLog(LL_NOTICE, "Returning to memory backed replication backlog"); + // The kernel doesn't make promises with how it will manage the memory but users really want to + // see the RSS go down. So lets encourage that to happen. + madvise(g_pserver->repl_backlog_disk, cserver.repl_backlog_disk_size, MADV_DONTNEED); + } g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; if (g_pserver->repl_batch_idxStart >= 0) { From a4ca5271f6b067720f13ef77744271cc27f02f4f Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 7 Nov 2023 19:53:39 +0000 Subject: [PATCH 08/68] Free up the disk repl buffer when its not being used --- src/replication.cpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index 960cd1929..03d63bf85 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -283,7 +283,7 @@ void resizeReplicationBacklog(long long newsize) { if (cserver.repl_backlog_disk_size != 0) { if (newsize > g_pserver->repl_backlog_config_size || cserver.force_backlog_disk) { - if (g_pserver->repl_backlog == g_pserver->repl_backlog_disk) + if (g_pserver->repl_backlog_disk == nullptr || g_pserver->repl_backlog == g_pserver->repl_backlog_disk) return; // Can't do anything more serverLog(LL_NOTICE, "Switching to disk backed replication backlog due to exceeding memory limits"); backlog = g_pserver->repl_backlog_disk; @@ -312,9 +312,20 @@ void resizeReplicationBacklog(long long newsize) { zfree(g_pserver->repl_backlog); } else { serverLog(LL_NOTICE, "Returning to memory backed replication backlog"); - // The kernel doesn't make promises with how it will manage the memory but users really want to - // see the RSS go down. So lets encourage that to happen. 
- madvise(g_pserver->repl_backlog_disk, cserver.repl_backlog_disk_size, MADV_DONTNEED); + auto repl_backlog = g_pserver->repl_backlog_disk; + g_pserver->repl_backlog_disk = nullptr; + g_pserver->asyncworkqueue->AddWorkFunction([repl_backlog, size = cserver.repl_backlog_disk_size]{ + // The kernel doesn't make promises with how it will manage the memory but users really want to + // see the RSS go down. So lets encourage that to happen. + madvise(repl_backlog, size, MADV_DONTNEED); // NOTE: This will block until all pages are released + aeAcquireLock(); + if (g_pserver->repl_backlog_disk == nullptr && cserver.repl_backlog_disk_size == size) { + g_pserver->repl_backlog_disk = repl_backlog; + } else { + munmap(g_pserver->repl_backlog_disk, size); + } + aeReleaseLock(); + }, true /*fHiPri*/); } g_pserver->repl_backlog = backlog; g_pserver->repl_backlog_idx = g_pserver->repl_backlog_histlen; From 0b5b0c065e6e5f97e23b02e891616b3f3ced930c Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Wed, 15 Nov 2023 18:09:18 -0500 Subject: [PATCH 09/68] want to correctly cache the system total memory always, as force-eviction-percent is a mutable config (#236) --- src/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index 48c78a838..50226a8b5 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -4039,7 +4039,7 @@ void initServer(void) { g_pserver->cron_malloc_stats.allocator_active = 0; g_pserver->cron_malloc_stats.allocator_resident = 0; g_pserver->cron_malloc_stats.sys_available = 0; - g_pserver->cron_malloc_stats.sys_total = g_pserver->force_eviction_percent ? getMemTotal() : 0; + g_pserver->cron_malloc_stats.sys_total = getMemTotal(); g_pserver->lastbgsave_status = C_OK; g_pserver->aof_last_write_status = C_OK; g_pserver->aof_last_write_errno = 0; From f53dbd32564c1bc4b735361d8ca3472161e6ef74 Mon Sep 17 00:00:00 2001 From: a00817524 Date: Tue, 24 Oct 2023 18:47:15 +0000 Subject: [PATCH 10/68] commitChanges async api --- src/IStorage.h | 10 ++++++++++ src/StorageCache.cpp | 3 ++- src/StorageCache.h | 2 ++ src/db.cpp | 18 +++++++++++++++++- src/storage/rocksdb.cpp | 15 +++++++++++++++ src/storage/rocksdb.h | 2 ++ 6 files changed, 48 insertions(+), 2 deletions(-) diff --git a/src/IStorage.h b/src/IStorage.h index 79726daf5..82f8087a0 100644 --- a/src/IStorage.h +++ b/src/IStorage.h @@ -7,6 +7,13 @@ #define METADATA_DB_IDENTIFIER "c299fde0-6d42-4ec4-b939-34f680ffe39f" struct StorageToken { + enum class TokenType { + SingleRead, + SingleWrite, + Delete, + BatchWrite, + }; + TokenType type; std::unordered_set setc; struct redisDbPersistentData *db; virtual ~StorageToken() {} @@ -46,6 +53,9 @@ class IStorage virtual StorageToken *begin_retrieve(struct aeEventLoop *, aePostFunctionTokenProc, sds *, size_t) {return nullptr;}; virtual void complete_retrieve(StorageToken * /*tok*/, callbackSingle /*fn*/) {}; + virtual StorageToken* begin_endWriteBatch(struct aeEventLoop *, aePostFunctionTokenProc*) {} // NOP + virtual void complete_endWriteBatch(StorageToken * /*tok*/) {}; + virtual void bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem) { beginWriteBatch(); for (size_t ielem = 0; ielem < celem; ++ielem) { diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index c0ff305b1..51d2e8c1f 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -233,4 +233,5 @@ void StorageCache::emergencyFreeCache() { dictRelease(d); }); } -} \ No newline at end of file +} + diff --git a/src/StorageCache.h 
b/src/StorageCache.h index 614f8c27b..828c657df 100644 --- a/src/StorageCache.h +++ b/src/StorageCache.h @@ -45,6 +45,8 @@ class StorageCache void retrieve(sds key, IStorage::callbackSingle fn) const; StorageToken *begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc proc, sds *rgkey, size_t ckey); void complete_retrieve(StorageToken *tok, IStorage::callbackSingle fn); + StorageToken* begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* proc) {m_spstorage->begin_endWriteBatch(el,proc);} // NOP + void complete_endWriteBatch(StorageToken *tok) {m_spstorage->complete_endWriteBatch(tok);}; bool erase(sds key); void emergencyFreeCache(); bool keycacheIsEnabled() const { return m_pdict != nullptr; } diff --git a/src/db.cpp b/src/db.cpp index 013473163..21961a6af 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3072,6 +3072,8 @@ void redisDbPersistentData::bulkDirectStorageInsert(char **rgKeys, size_t *rgcbK void redisDbPersistentData::commitChanges(const redisDbPersistentDataSnapshot **psnapshotFree) { + + std::unordered_set setcBlocked; if (m_pdbSnapshotStorageFlush) { dictIterator *di = dictGetIterator(m_dictChangedStorageFlush); @@ -3086,8 +3088,22 @@ void redisDbPersistentData::commitChanges(const redisDbPersistentDataSnapshot ** *psnapshotFree = m_pdbSnapshotStorageFlush; m_pdbSnapshotStorageFlush = nullptr; } + if (m_spstorage != nullptr) - m_spstorage->endWriteBatch(); + { + auto tok = m_spstorage->begin_endWriteBatch(serverTL->el, storageLoadCallback); + if (tok != nullptr) + { + for (client *c : setcBlocked) //need to check how to push client to blocked list + { + if (!(c->flags & CLIENT_BLOCKED)) + blockClient(c, BLOCKED_STORAGE); + } + // tok->setc = std::move(setcBlocked); + tok->db = this; + tok->type = StorageToken::TokenType::BatchWrite; + } + } } redisDbPersistentData::~redisDbPersistentData() diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 16aed43b7..1bae69d8c 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -255,6 +255,21 @@ void RocksDBStorageProvider::endWriteBatch() m_lock.unlock(); } +StorageToken* RocksDBStorageProvider::begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* callback) +{ + StorageToken *tok = new StorageToken(); + auto pbatch = m_spbatch.get(); + (*m_pfactory->m_wqueue)->AddWorkFunction([this, el,callback,tok,&pbatch]{ + m_spdb->Write(WriteOptions(),pbatch); + aePostFunction(el,callback,tok); + }); +} + +void RocksDBStorageProvider::complete_endWriteBatch(StorageToken* tok){ + // m_spbatch = nullptr; + m_lock.unlock(); +} + void RocksDBStorageProvider::batch_lock() { m_lock.lock(); diff --git a/src/storage/rocksdb.h b/src/storage/rocksdb.h index 01edb4975..d5587c9e9 100644 --- a/src/storage/rocksdb.h +++ b/src/storage/rocksdb.h @@ -42,6 +42,8 @@ class RocksDBStorageProvider : public IStorage virtual void beginWriteBatch() override; virtual void endWriteBatch() override; + virtual StorageToken* begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* proc); + virtual void complete_endWriteBatch(StorageToken *tok); virtual void bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem) override; From eea48c584be32fa7736642f4520fbc363623fd53 Mon Sep 17 00:00:00 2001 From: a00817524 Date: Tue, 24 Oct 2023 18:54:18 +0000 Subject: [PATCH 11/68] missed changes --- src/db.cpp | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 21961a6af..8d23af067 100644 --- 
a/src/db.cpp +++ b/src/db.cpp @@ -3436,7 +3436,13 @@ void redisDbPersistentData::prefetchKeysFlash(std::unordered_set &setc) void redisDbPersistentData::processStorageToken(StorageToken *tok) { auto setc = std::move(tok->setc); - tok->db->m_spstorage->complete_retrieve(tok, [&](const char *szKey, size_t cbKey, const void *data, size_t cb) { + switch (tok->type) + { + + case StorageToken::TokenType::SingleRead: + { + tok->db->m_spstorage->complete_retrieve(tok, [&](const char *szKey, size_t cbKey, const void *data, size_t cb) + { auto *db = tok->db; size_t offset = 0; sds key = sdsnewlen(szKey, -((ssize_t)cbKey)); @@ -3467,11 +3473,27 @@ void redisDbPersistentData::processStorageToken(StorageToken *tok) { serverAssert(db->m_setexpire->find(key) != db->m_setexpire->end()); } serverAssert(o->FExpires() == (db->m_setexpire->find(key) != db->m_setexpire->end())); - } - }); - tok = nullptr; // Invalid past this point + } }); + break; + } + case StorageToken::TokenType::BatchWrite: + { + tok->db->m_spstorage->complete_endWriteBatch(tok); + break; + } + case StorageToken::TokenType::SingleWrite: + { + tok->db->m_spstorage->complete_insert(tok); + break; + } + default: + break; + } //switch end - for (client *c : setc) { + tok = nullptr; // Invalid past this point + + for (client *c : setc) + { std::unique_lock ul(c->lock); if (c->flags & CLIENT_BLOCKED) unblockClient(c); From e8747522cad472edba748a35064d4d58656e102f Mon Sep 17 00:00:00 2001 From: a00817524 Date: Wed, 25 Oct 2023 02:02:44 +0000 Subject: [PATCH 12/68] some fixes --- src/StorageCache.h | 2 +- src/db.cpp | 2 +- src/storage/rocksdb.cpp | 32 +++++++++++++++++++++++++------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/StorageCache.h b/src/StorageCache.h index 828c657df..18fd7f7e4 100644 --- a/src/StorageCache.h +++ b/src/StorageCache.h @@ -45,7 +45,7 @@ class StorageCache void retrieve(sds key, IStorage::callbackSingle fn) const; StorageToken *begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc proc, sds *rgkey, size_t ckey); void complete_retrieve(StorageToken *tok, IStorage::callbackSingle fn); - StorageToken* begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* proc) {m_spstorage->begin_endWriteBatch(el,proc);} // NOP + StorageToken* begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* proc) {return m_spstorage->begin_endWriteBatch(el,proc);} // NOP void complete_endWriteBatch(StorageToken *tok) {m_spstorage->complete_endWriteBatch(tok);}; bool erase(sds key); void emergencyFreeCache(); diff --git a/src/db.cpp b/src/db.cpp index 8d23af067..387fbdcdb 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3091,7 +3091,7 @@ void redisDbPersistentData::commitChanges(const redisDbPersistentDataSnapshot ** if (m_spstorage != nullptr) { - auto tok = m_spstorage->begin_endWriteBatch(serverTL->el, storageLoadCallback); + auto *tok = m_spstorage->begin_endWriteBatch(serverTL->el, storageLoadCallback); if (tok != nullptr) { for (client *c : setcBlocked) //need to check how to push client to blocked list diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 1bae69d8c..6c84ccad8 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -255,21 +255,39 @@ void RocksDBStorageProvider::endWriteBatch() m_lock.unlock(); } -StorageToken* RocksDBStorageProvider::begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* callback) +struct BatchStorageToken : public StorageToken { + std::shared_ptr tspdb; // Note: This must be first so it is deleted last + 
rocksdb::WriteBatch* tspbatch; + ~BatchStorageToken(){ + tspdb.reset(); + tspdb = nullptr; + tspbatch = nullptr; + } +}; + +StorageToken* RocksEncoderStorageProvider::begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* callback) { - StorageToken *tok = new StorageToken(); - auto pbatch = m_spbatch.get(); - (*m_pfactory->m_wqueue)->AddWorkFunction([this, el,callback,tok,&pbatch]{ - m_spdb->Write(WriteOptions(),pbatch); + // serverLog(LL_WARNING, "RocksEncoderStorageProvider::begin_endWriteBatch"); + BatchStorageToken *tok = new BatchStorageToken(); + tok->tspbatch = m_spbatch.get(); + tok->tspdb = m_spdb; + (*m_pfactory->m_wqueue)->AddWorkFunction([this, el,callback,tok]{ + // serverAssert(db); + serverAssert(tok->tspdb); + tok->tspdb->Write(WriteOptions(),tok->tspbatch); aePostFunction(el,callback,tok); }); + return tok; } -void RocksDBStorageProvider::complete_endWriteBatch(StorageToken* tok){ - // m_spbatch = nullptr; +void RocksEncoderStorageProvider::complete_endWriteBatch(StorageToken* tok){ + // serverLog(LL_WARNING, "RocksEncoderStorageProvider::complete_endWriteBatch"); m_lock.unlock(); + delete tok; + tok = nullptr; } + void RocksDBStorageProvider::batch_lock() { m_lock.lock(); From 4c014ee53db81a38a4b40d5d791f182060363b1d Mon Sep 17 00:00:00 2001 From: a00817524 Date: Wed, 25 Oct 2023 02:13:10 +0000 Subject: [PATCH 13/68] build fixes --- src/db.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/db.cpp b/src/db.cpp index 387fbdcdb..a20ea0a3f 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3483,7 +3483,6 @@ void redisDbPersistentData::processStorageToken(StorageToken *tok) { } case StorageToken::TokenType::SingleWrite: { - tok->db->m_spstorage->complete_insert(tok); break; } default: From 1129f67ba347ad18fdfa85ee3bd21a33223cdd72 Mon Sep 17 00:00:00 2001 From: a00817524 Date: Mon, 30 Oct 2023 13:49:51 +0000 Subject: [PATCH 14/68] correcting classname --- src/storage/rocksdb.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 6c84ccad8..3ed94be60 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -257,7 +257,7 @@ void RocksDBStorageProvider::endWriteBatch() struct BatchStorageToken : public StorageToken { std::shared_ptr tspdb; // Note: This must be first so it is deleted last - rocksdb::WriteBatch* tspbatch; + std::unique_ptr tspbatch; ~BatchStorageToken(){ tspdb.reset(); tspdb = nullptr; @@ -265,23 +265,23 @@ struct BatchStorageToken : public StorageToken { } }; -StorageToken* RocksEncoderStorageProvider::begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* callback) +StorageToken* RocksDBStorageProvider::begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* callback) { - // serverLog(LL_WARNING, "RocksEncoderStorageProvider::begin_endWriteBatch"); + serverLog(LL_WARNING, "RocksDBStorageProvider::begin_endWriteBatch"); BatchStorageToken *tok = new BatchStorageToken(); - tok->tspbatch = m_spbatch.get(); + tok->tspbatch = std::move(m_spbatch); tok->tspdb = m_spdb; (*m_pfactory->m_wqueue)->AddWorkFunction([this, el,callback,tok]{ - // serverAssert(db); - serverAssert(tok->tspdb); - tok->tspdb->Write(WriteOptions(),tok->tspbatch); + tok->tspdb->Write(WriteOptions(),tok->tspbatch.get()); aePostFunction(el,callback,tok); }); + m_spbatch = nullptr; + m_lock.unlock(); return tok; } -void RocksEncoderStorageProvider::complete_endWriteBatch(StorageToken* tok){ - // serverLog(LL_WARNING, 
"RocksEncoderStorageProvider::complete_endWriteBatch"); +void RocksDBStorageProvider::complete_endWriteBatch(StorageToken* tok){ + // serverLog(LL_WARNING, "RocksDBStorageProvider::complete_endWriteBatch"); m_lock.unlock(); delete tok; tok = nullptr; From de73b2ccb6aa4b24c538680a2648b0998c5e3d78 Mon Sep 17 00:00:00 2001 From: a00817524 Date: Tue, 31 Oct 2023 03:41:38 +0000 Subject: [PATCH 15/68] missed changes --- src/storage/rocksdb.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 3ed94be60..3b7b9ec18 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -265,24 +265,23 @@ struct BatchStorageToken : public StorageToken { } }; -StorageToken* RocksDBStorageProvider::begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* callback) -{ +StorageToken* RocksDBStorageProvider::begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* callback){ serverLog(LL_WARNING, "RocksDBStorageProvider::begin_endWriteBatch"); BatchStorageToken *tok = new BatchStorageToken(); tok->tspbatch = std::move(m_spbatch); tok->tspdb = m_spdb; + m_spbatch = nullptr; + m_lock.unlock(); (*m_pfactory->m_wqueue)->AddWorkFunction([this, el,callback,tok]{ tok->tspdb->Write(WriteOptions(),tok->tspbatch.get()); aePostFunction(el,callback,tok); }); - m_spbatch = nullptr; - m_lock.unlock(); + return tok; } void RocksDBStorageProvider::complete_endWriteBatch(StorageToken* tok){ - // serverLog(LL_WARNING, "RocksDBStorageProvider::complete_endWriteBatch"); - m_lock.unlock(); + serverLog(LL_WARNING, "RocksDBStorageProvider::complete_endWriteBatch"); delete tok; tok = nullptr; } From 4818ddf4f6e2bf278af4a1c0dbc593eba95ce782 Mon Sep 17 00:00:00 2001 From: a00817524 Date: Fri, 3 Nov 2023 20:35:27 +0000 Subject: [PATCH 16/68] fixed some errors --- src/storage/rocksdb.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 3b7b9ec18..e4c1d45ca 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -257,7 +257,7 @@ void RocksDBStorageProvider::endWriteBatch() struct BatchStorageToken : public StorageToken { std::shared_ptr tspdb; // Note: This must be first so it is deleted last - std::unique_ptr tspbatch; + std::unique_ptr tspbatch; ~BatchStorageToken(){ tspdb.reset(); tspdb = nullptr; @@ -273,7 +273,7 @@ StorageToken* RocksDBStorageProvider::begin_endWriteBatch(struct aeEventLoop *el m_spbatch = nullptr; m_lock.unlock(); (*m_pfactory->m_wqueue)->AddWorkFunction([this, el,callback,tok]{ - tok->tspdb->Write(WriteOptions(),tok->tspbatch.get()); + tok->tspdb->Write(WriteOptions(),tok->tspbatch.get()->GetWriteBatch()); aePostFunction(el,callback,tok); }); @@ -322,7 +322,7 @@ StorageToken *RocksDBStorageProvider::begin_retrieve(struct aeEventLoop *el, aeP } auto opts = ReadOptions(); - opts.async_io = true; + //opts.async_io = true; (*m_pfactory->m_wqueue)->AddWorkFunction([this, el, callback, tok, opts]{ std::vector veckeysStr; std::vector veckeys; @@ -362,4 +362,4 @@ void RocksDBStorageProvider::complete_retrieve(StorageToken *tok, callbackSingle } } delete rtok; -} \ No newline at end of file +} From 0cf263c5322a39256e2482da7f72b8ee559b6369 Mon Sep 17 00:00:00 2001 From: a00817524 Date: Mon, 6 Nov 2023 18:23:45 +0000 Subject: [PATCH 17/68] reverting changes of async io rocksdb option as this is version dependent --- src/storage/rocksdb.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index e4c1d45ca..8c4429f3b 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -322,7 +322,7 @@ StorageToken *RocksDBStorageProvider::begin_retrieve(struct aeEventLoop *el, aeP } auto opts = ReadOptions(); - //opts.async_io = true; + opts.async_io = true; (*m_pfactory->m_wqueue)->AddWorkFunction([this, el, callback, tok, opts]{ std::vector veckeysStr; std::vector veckeys; From 567ef8d0ce45c5054c2cbdd86ae465802fc6a9af Mon Sep 17 00:00:00 2001 From: a00817524 Date: Wed, 8 Nov 2023 20:15:56 +0000 Subject: [PATCH 18/68] removing unwanted loc --- src/db.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index a20ea0a3f..383637454 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3072,8 +3072,6 @@ void redisDbPersistentData::bulkDirectStorageInsert(char **rgKeys, size_t *rgcbK void redisDbPersistentData::commitChanges(const redisDbPersistentDataSnapshot **psnapshotFree) { - - std::unordered_set setcBlocked; if (m_pdbSnapshotStorageFlush) { dictIterator *di = dictGetIterator(m_dictChangedStorageFlush); @@ -3094,12 +3092,6 @@ void redisDbPersistentData::commitChanges(const redisDbPersistentDataSnapshot ** auto *tok = m_spstorage->begin_endWriteBatch(serverTL->el, storageLoadCallback); if (tok != nullptr) { - for (client *c : setcBlocked) //need to check how to push client to blocked list - { - if (!(c->flags & CLIENT_BLOCKED)) - blockClient(c, BLOCKED_STORAGE); - } - // tok->setc = std::move(setcBlocked); tok->db = this; tok->type = StorageToken::TokenType::BatchWrite; } From b6b7b1b7302196eac13e8c50d931d9185e506254 Mon Sep 17 00:00:00 2001 From: a00817524 Date: Wed, 8 Nov 2023 20:17:12 +0000 Subject: [PATCH 19/68] whitespace removed --- src/db.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/db.cpp b/src/db.cpp index 383637454..a04b5700e 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3086,7 +3086,6 @@ void redisDbPersistentData::commitChanges(const redisDbPersistentDataSnapshot ** *psnapshotFree = m_pdbSnapshotStorageFlush; m_pdbSnapshotStorageFlush = nullptr; } - if (m_spstorage != nullptr) { auto *tok = m_spstorage->begin_endWriteBatch(serverTL->el, storageLoadCallback); From 1d59efbfbd1f0d8b3a613084139f89ce5ac5c8b2 Mon Sep 17 00:00:00 2001 From: a00817524 Date: Wed, 8 Nov 2023 20:52:23 +0000 Subject: [PATCH 20/68] review rework, added serverassert default switch case,removed debug prints, removed unwanted reset of pointers --- src/db.cpp | 5 +---- src/storage/rocksdb.cpp | 7 ------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index a04b5700e..9a0f35684 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3472,11 +3472,8 @@ void redisDbPersistentData::processStorageToken(StorageToken *tok) { tok->db->m_spstorage->complete_endWriteBatch(tok); break; } - case StorageToken::TokenType::SingleWrite: - { - break; - } default: + serverAssert((tok->type == StorageToken::TokenType::SingleRead) || (tok->type == StorageToken::TokenType::BatchWrite)); break; } //switch end diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 8c4429f3b..56f00821f 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -258,15 +258,9 @@ void RocksDBStorageProvider::endWriteBatch() struct BatchStorageToken : public StorageToken { std::shared_ptr tspdb; // Note: This must be first so it is deleted last std::unique_ptr tspbatch; - ~BatchStorageToken(){ - tspdb.reset(); - tspdb = nullptr; - tspbatch = nullptr; - } }; StorageToken* 
RocksDBStorageProvider::begin_endWriteBatch(struct aeEventLoop *el, aePostFunctionTokenProc* callback){ - serverLog(LL_WARNING, "RocksDBStorageProvider::begin_endWriteBatch"); BatchStorageToken *tok = new BatchStorageToken(); tok->tspbatch = std::move(m_spbatch); tok->tspdb = m_spdb; @@ -281,7 +275,6 @@ StorageToken* RocksDBStorageProvider::begin_endWriteBatch(struct aeEventLoop *el } void RocksDBStorageProvider::complete_endWriteBatch(StorageToken* tok){ - serverLog(LL_WARNING, "RocksDBStorageProvider::complete_endWriteBatch"); delete tok; tok = nullptr; } From c8895059fa5f091020a2f7281e77f2acd23afad6 Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Fri, 17 Nov 2023 15:00:13 -0500 Subject: [PATCH 21/68] add support for overload-ignorelist (#237) --- src/config.cpp | 69 ++++++++++++++++++++++++++++++++++-- src/networking.cpp | 68 ++++++++++++++++++----------------- src/server.cpp | 2 +- src/server.h | 2 ++ tests/unit/introspection.tcl | 10 +++--- 5 files changed, 112 insertions(+), 39 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 3af2ef6ad..3d815c62a 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -779,6 +779,15 @@ void loadServerConfigFromString(char *config) { } for (int i = 1; i < argc; i++) g_pserver->tls_auditlog_blocklist.emplace(argv[i], strlen(argv[i])); + } else if (!strcasecmp(argv[0], "overload-ignorelist")) { + if (argc < 2) { + err = "must supply at least one element in the ignore list"; goto loaderr; + } + if (!g_pserver->overload_ignorelist.empty()) { + err = "overload-ignorelist may only be set once"; goto loaderr; + } + for (int i = 1; i < argc; i++) + g_pserver->overload_ignorelist.emplace(argv[i], strlen(argv[i])); } else if (!strcasecmp(argv[0], "version-override") && argc == 2) { KEYDB_SET_VERSION = zstrdup(argv[1]); serverLog(LL_WARNING, "Warning version is overriden to: %s\n", KEYDB_SET_VERSION); @@ -908,8 +917,8 @@ void configSetCommand(client *c) { if (c->argc < 4 || c->argc > 4) { o = nullptr; - // Variadic set is only supported for tls-allowlist - if (strcasecmp(szFromObj(c->argv[2]), "tls-allowlist")) { + // Variadic set is only supported for tls-allowlist, tls-auditlog-blocklist and overload-ignorelist + if (strcasecmp(szFromObj(c->argv[2]), "tls-allowlist") && strcasecmp(szFromObj(c->argv[2]), "tls-auditlog-blocklist") && strcasecmp(szFromObj(c->argv[2]), "overload-ignorelist")) { addReplySubcommandSyntaxError(c); return; } @@ -1084,6 +1093,18 @@ void configSetCommand(client *c) { robj *val = c->argv[i]; g_pserver->tls_allowlist.emplace(szFromObj(val), sdslen(szFromObj(val))); } + } config_set_special_field("tls-auditlog-blocklist") { + g_pserver->tls_auditlog_blocklist.clear(); + for (int i = 3; i < c->argc; ++i) { + robj *val = c->argv[i]; + g_pserver->tls_auditlog_blocklist.emplace(szFromObj(val), sdslen(szFromObj(val))); + } + } config_set_special_field("overload-ignorelist") { + g_pserver->overload_ignorelist.clear(); + for (int i = 3; i < c->argc; ++i) { + robj *val = c->argv[i]; + g_pserver->overload_ignorelist.emplace(szFromObj(val), sdslen(szFromObj(val))); + } /* Everything else is an error... 
*/ } config_set_else { addReplyErrorFormat(c,"Unsupported CONFIG parameter: %s", @@ -1295,6 +1316,22 @@ void configGetCommand(client *c) { } matches++; } + if (stringmatch(pattern, "tls-auditlog-blocklist", 1)) { + addReplyBulkCString(c,"tls-auditlog-blocklist"); + addReplyArrayLen(c, (long)g_pserver->tls_auditlog_blocklist.size()); + for (auto &elem : g_pserver->tls_auditlog_blocklist) { + addReplyBulkCBuffer(c, elem.get(), elem.size()); // addReplyBulkSds will free which we absolutely don't want + } + matches++; + } + if (stringmatch(pattern, "overload-ignorelist", 1)) { + addReplyBulkCString(c,"overload-ignorelist"); + addReplyArrayLen(c, (long)g_pserver->overload_ignorelist.size()); + for (auto &elem : g_pserver->overload_ignorelist) { + addReplyBulkCBuffer(c, elem.get(), elem.size()); // addReplyBulkSds will free which we absolutely don't want + } + matches++; + } setDeferredMapLen(c,replylen,matches); } @@ -1978,6 +2015,34 @@ int rewriteConfig(char *path, int force_all) { rewriteConfigMarkAsProcessed(state, "tls-allowlist"); // ensure the line is removed if it existed } + if (!g_pserver->tls_auditlog_blocklist.empty()) { + sds conf = sdsnew("tls-auditlog-blocklist "); + for (auto &elem : g_pserver->tls_auditlog_blocklist) { + conf = sdscatsds(conf, (sds)elem.get()); + conf = sdscat(conf, " "); + } + // trim the trailing space + sdsrange(conf, 0, -1); + rewriteConfigRewriteLine(state,"tls-auditlog-blocklist",conf,1 /*force*/); + // note: conf is owned by rewriteConfigRewriteLine - no need to free + } else { + rewriteConfigMarkAsProcessed(state, "tls-auditlog-blocklist"); // ensure the line is removed if it existed + } + + if (!g_pserver->overload_ignorelist.empty()) { + sds conf = sdsnew("overload-ignorelist "); + for (auto &elem : g_pserver->overload_ignorelist) { + conf = sdscatsds(conf, (sds)elem.get()); + conf = sdscat(conf, " "); + } + // trim the trailing space + sdsrange(conf, 0, -1); + rewriteConfigRewriteLine(state,"overload-ignorelist",conf,1 /*force*/); + // note: conf is owned by rewriteConfigRewriteLine - no need to free + } else { + rewriteConfigMarkAsProcessed(state, "overload-ignorelist"); // ensure the line is removed if it existed + } + /* Rewrite Sentinel config if in Sentinel mode. */ if (g_pserver->sentinel_mode) rewriteConfigSentinelOption(state); diff --git a/src/networking.cpp b/src/networking.cpp index 7aac41ca5..422437687 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1193,6 +1193,9 @@ void clientAcceptHandler(connection *conn) { if (cserver.fThreadAffinity) connSetThreadAffinity(conn, c->iel); + char cip[NET_IP_STR_LEN+1] = { 0 }; + connPeerToString(conn, cip, sizeof(cip)-1, NULL); + /* If the server is running in protected mode (the default) and there * is no password set, nor a specific interface is bound, we don't accept * requests from non loopback interfaces. Instead we try to explain the @@ -1200,40 +1203,41 @@ void clientAcceptHandler(connection *conn) { if (g_pserver->protected_mode && g_pserver->bindaddr_count == 0 && DefaultUser->flags & USER_FLAG_NOPASS && - !(c->flags & CLIENT_UNIX_SOCKET)) + !(c->flags & CLIENT_UNIX_SOCKET) && + strcmp(cip,"127.0.0.1") && strcmp(cip,"::1")) { - char cip[NET_IP_STR_LEN+1] = { 0 }; - connPeerToString(conn, cip, sizeof(cip)-1, NULL); - - if (strcmp(cip,"127.0.0.1") && strcmp(cip,"::1")) { - const char *err = - "-DENIED KeyDB is running in protected mode because protected " - "mode is enabled, no bind address was specified, no " - "authentication password is requested to clients. 
In this mode " - "connections are only accepted from the loopback interface. " - "If you want to connect from external computers to KeyDB you " - "may adopt one of the following solutions: " - "1) Just disable protected mode sending the command " - "'CONFIG SET protected-mode no' from the loopback interface " - "by connecting to KeyDB from the same host the server is " - "running, however MAKE SURE KeyDB is not publicly accessible " - "from internet if you do so. Use CONFIG REWRITE to make this " - "change permanent. " - "2) Alternatively you can just disable the protected mode by " - "editing the KeyDB configuration file, and setting the protected " - "mode option to 'no', and then restarting the server " - "3) If you started the server manually just for testing, restart " - "it with the '--protected-mode no' option. " - "4) Setup a bind address or an authentication password. " - "NOTE: You only need to do one of the above things in order for " - "the server to start accepting connections from the outside.\r\n"; - if (connWrite(c->conn,err,strlen(err)) == -1) { - /* Nothing to do, Just to avoid the warning... */ - } - g_pserver->stat_rejected_conn++; - freeClientAsync(c); - return; + const char *err = + "-DENIED KeyDB is running in protected mode because protected " + "mode is enabled, no bind address was specified, no " + "authentication password is requested to clients. In this mode " + "connections are only accepted from the loopback interface. " + "If you want to connect from external computers to KeyDB you " + "may adopt one of the following solutions: " + "1) Just disable protected mode sending the command " + "'CONFIG SET protected-mode no' from the loopback interface " + "by connecting to KeyDB from the same host the server is " + "running, however MAKE SURE KeyDB is not publicly accessible " + "from internet if you do so. Use CONFIG REWRITE to make this " + "change permanent. " + "2) Alternatively you can just disable the protected mode by " + "editing the KeyDB configuration file, and setting the protected " + "mode option to 'no', and then restarting the server " + "3) If you started the server manually just for testing, restart " + "it with the '--protected-mode no' option. " + "4) Setup a bind address or an authentication password. " + "NOTE: You only need to do one of the above things in order for " + "the server to start accepting connections from the outside.\r\n"; + if (connWrite(c->conn,err,strlen(err)) == -1) { + /* Nothing to do, Just to avoid the warning... 
*/ } + g_pserver->stat_rejected_conn++; + freeClientAsync(c); + return; + } + + if (g_pserver->overload_ignorelist.find(sdsstring(sdsnew(cip))) != g_pserver->overload_ignorelist.end()) + { + c->flags |= CLIENT_IGNORE_OVERLOAD; } g_pserver->stat_numconnections++; diff --git a/src/server.cpp b/src/server.cpp index 50226a8b5..7a716389b 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1965,7 +1965,7 @@ int closeClientOnOverload(client *c) { if (g_pserver->overload_closed_clients > MAX_CLIENTS_SHED_PER_PERIOD) return false; if (!g_pserver->is_overloaded) return false; // Don't close masters, replicas, or pub/sub clients - if (c->flags & (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_PENDING_WRITE | CLIENT_PUBSUB | CLIENT_BLOCKED)) return false; + if (c->flags & (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_PENDING_WRITE | CLIENT_PUBSUB | CLIENT_BLOCKED | CLIENT_IGNORE_OVERLOAD)) return false; freeClient(c); ++g_pserver->overload_closed_clients; return true; diff --git a/src/server.h b/src/server.h index 77f35b8e6..c06ff74ee 100644 --- a/src/server.h +++ b/src/server.h @@ -546,6 +546,7 @@ extern int configOOMScoreAdjValuesDefaults[CONFIG_OOM_COUNT]; RDB without replication buffer. */ #define CLIENT_FORCE_REPLY (1ULL<<44) /* Should addReply be forced to write the text? */ #define CLIENT_AUDIT_LOGGING (1ULL<<45) /* Client commands required audit logging */ +#define CLIENT_IGNORE_OVERLOAD (1ULL<<46) /* Client that should not be disconnected by overload protection */ /* Client block type (btype field in client structure) * if CLIENT_BLOCKED flag is set. */ @@ -2714,6 +2715,7 @@ struct redisServer { std::set tls_auditlog_blocklist; /* Certificates that can be excluded from audit logging */ std::set tls_allowlist; + std::set overload_ignorelist; redisTLSContextConfig tls_ctx_config; /* cpu affinity */ diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 7678a1dd5..c89265cd8 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -162,15 +162,17 @@ start_server {tags {"introspection"}} { aof_rewrite_cpulist time-thread-priority bgsave_cpulist - storage-cache-mode - storage-provider-options - use-fork + storage-cache-mode + storage-provider-options + use-fork multi-master active-replica bind set-proc-title repl-backlog-disk-reserve - tls-allowlist + tls-allowlist + tls-auditlog-blocklist + overload-ignorelist db-s3-object } From 20ae39d643c6fed65284ef8b3002e99d573da624 Mon Sep 17 00:00:00 2001 From: zliang Date: Mon, 27 Nov 2023 13:38:43 -0700 Subject: [PATCH 22/68] update insufficient replicas metrics to separate severity --- src/modules/keydb_modstatsd/modmain.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/modules/keydb_modstatsd/modmain.cpp b/src/modules/keydb_modstatsd/modmain.cpp index 5feafc284..d8de39d92 100644 --- a/src/modules/keydb_modstatsd/modmain.cpp +++ b/src/modules/keydb_modstatsd/modmain.cpp @@ -67,8 +67,6 @@ class StatsdClientWrapper /* constants */ static time_t c_infoUpdateSeconds = 10; -// the current Redis Cluster setup we configure replication factor as 2, each non-empty master node should have 2 replicas, given that there are 3 zones in each regions -static const int EXPECTED_NUMBER_OF_REPLICAS = 2; StatsdClientWrapper *g_stats = nullptr; std::string m_strPrefix { "keydb" }; @@ -567,8 +565,12 @@ void emit_metrics_for_insufficient_replicas(struct RedisModuleCtx *ctx, long lon if (strncmp(role, "master", len) == 0) { RedisModuleCallReply *replicasReply = RedisModule_CallReplyArrayElement(reply, 2); // 
check if there are less than 2 connected replicas - if (RedisModule_CallReplyLength(replicasReply) < EXPECTED_NUMBER_OF_REPLICAS) { - g_stats->increment("lessThanExpectedReplicas_error", 1); + size_t numberOfReplicas = RedisModule_CallReplyLength(replicasReply); + if (numberOfReplicas == 0) { + g_stats->increment("lessThanExpectedReplicas_noReplicas_criticalError", 1); + } + else if (numberOfReplicas == 1) { + g_stats->increment("lessThanExpectedReplicas_1ReplicaLeft_error", 1); } } RedisModule_FreeCallReply(reply); From 8a5857ddb7af39afcfd7dc5bdc9232c31564d9f6 Mon Sep 17 00:00:00 2001 From: Jacob Bohac Date: Mon, 27 Nov 2023 15:31:29 -0700 Subject: [PATCH 23/68] Overload CPU reading metric (#240) * Overload CPU reading metric * fix * fix --- src/modules/keydb_modstatsd/modmain.cpp | 1 + src/server.cpp | 7 +++++-- src/server.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/modules/keydb_modstatsd/modmain.cpp b/src/modules/keydb_modstatsd/modmain.cpp index 5feafc284..4330d7e92 100644 --- a/src/modules/keydb_modstatsd/modmain.cpp +++ b/src/modules/keydb_modstatsd/modmain.cpp @@ -253,6 +253,7 @@ std::unordered_map g_mapInfoFields = { { "cluster_size", { StatsD_Type::STATSD_GAUGE_LONGLONG }}, { "storage_flash_available_bytes", { StatsD_Type::STATSD_GAUGE_BYTES }}, { "storage_flash_total_bytes", { StatsD_Type::STATSD_GAUGE_BYTES }}, + { "last_overload_cpu_reading", { StatsD_Type::STATSD_GAUGE_FLOAT }}, }; /* Globals */ diff --git a/src/server.cpp b/src/server.cpp index 7a716389b..062c5bd2c 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2610,6 +2610,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { perc /= cserver.cthreads; perc *= 100.0; serverLog(LL_WARNING, "CPU Used: %.2f", perc); + g_pserver->last_overload_cpu_reading = static_cast(perc); if (perc > g_pserver->overload_protect_threshold) { serverLog(LL_WARNING, "\tWARNING: CPU overload detected."); g_pserver->is_overloaded = true; @@ -6271,13 +6272,15 @@ sds genRedisInfoString(const char *section) { "used_cpu_sys_children:%ld.%06ld\r\n" "used_cpu_user_children:%ld.%06ld\r\n" "server_threads:%d\r\n" - "long_lock_waits:%" PRIu64 "\r\n", + "long_lock_waits:%" PRIu64 "\r\n" + "last_overload_cpu_reading:%.2f\r\n", (long)self_ru.ru_stime.tv_sec, (long)self_ru.ru_stime.tv_usec, (long)self_ru.ru_utime.tv_sec, (long)self_ru.ru_utime.tv_usec, (long)c_ru.ru_stime.tv_sec, (long)c_ru.ru_stime.tv_usec, (long)c_ru.ru_utime.tv_sec, (long)c_ru.ru_utime.tv_usec, cserver.cthreads, - fastlock_getlongwaitcount()); + fastlock_getlongwaitcount(), + g_pserver->last_overload_cpu_reading); #ifdef RUSAGE_THREAD struct rusage m_ru; getrusage(RUSAGE_THREAD, &m_ru); diff --git a/src/server.h b/src/server.h index c06ff74ee..3f53de6e6 100644 --- a/src/server.h +++ b/src/server.h @@ -2758,6 +2758,7 @@ struct redisServer { sds sdsAvailabilityZone; int overload_protect_threshold = 0; + float last_overload_cpu_reading = 0.0f; int is_overloaded = 0; int overload_closed_clients = 0; From 8fec9ba466a7692c1d11afac579625a2c739301e Mon Sep 17 00:00:00 2001 From: zliang Date: Mon, 27 Nov 2023 17:01:04 -0700 Subject: [PATCH 24/68] emit number of active replicas metrics instead --- src/modules/keydb_modstatsd/modmain.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/modules/keydb_modstatsd/modmain.cpp b/src/modules/keydb_modstatsd/modmain.cpp index d8de39d92..8d30808f1 100644 --- a/src/modules/keydb_modstatsd/modmain.cpp +++ b/src/modules/keydb_modstatsd/modmain.cpp @@ -564,14 +564,8 
@@ void emit_metrics_for_insufficient_replicas(struct RedisModuleCtx *ctx, long lon // check if the current node is a primary if (strncmp(role, "master", len) == 0) { RedisModuleCallReply *replicasReply = RedisModule_CallReplyArrayElement(reply, 2); - // check if there are less than 2 connected replicas - size_t numberOfReplicas = RedisModule_CallReplyLength(replicasReply); - if (numberOfReplicas == 0) { - g_stats->increment("lessThanExpectedReplicas_noReplicas_criticalError", 1); - } - else if (numberOfReplicas == 1) { - g_stats->increment("lessThanExpectedReplicas_1ReplicaLeft_error", 1); - } + size_t numberOfActiveReplicas = RedisModule_CallReplyLength(replicasReply); + g_stats->gauge("numberOfActiveReplicas", numberOfActiveReplicas); } RedisModule_FreeCallReply(reply); } From 443e1afe89aa415ff3e3c48496f20b17a86e7c45 Mon Sep 17 00:00:00 2001 From: zliang Date: Mon, 27 Nov 2023 17:50:06 -0700 Subject: [PATCH 25/68] add cool --- ci.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ci.yaml b/ci.yaml index 595d23239..7b885cacd 100644 --- a/ci.yaml +++ b/ci.yaml @@ -26,6 +26,16 @@ on: # references a build defined in build.yaml build_name: keydb-docker-build arch_types: ["amd64", "arm64"] + # Doc: go/cool-guide + cool: + workflows: + - workflow_type: backend_workflow + build_name: keydb-build + arch_types: ["amd64", "arm64"] + - workflow_type: backend_workflow + # references a build defined in build.yaml + build_name: keydb-docker-build + arch_types: ["amd64", "arm64"] # below defines which branch is release branch / release tag machamp: From 49afd8ff2f619f60e90066a4ea3512f25a412458 Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Mon, 4 Dec 2023 16:20:40 -0500 Subject: [PATCH 26/68] update overload-ignore-list to accept subnets as well (#239) * update overload-ignore-list to be use subnets as well * get ip directly from conn * add getIPV4 impl * ntohl * don't shift at the end of the loop * update for more efficient impl and add ipv6 support * array of arrays * one combined array * have to reassign output of sdscat * include mask in operator< * replace raw numbers with constants * add log when we ignore client for loadshedding --- src/config.cpp | 122 ++++++++++++++++++++++++++++++++++++++++++--- src/networking.cpp | 91 +++++++++++++++++++++------------ src/server.h | 105 +++++++++++++++++++++++++++++++++++++- src/tls.cpp | 13 ----- 4 files changed, 277 insertions(+), 54 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 3d815c62a..bcc4379a4 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -474,6 +474,90 @@ void initConfigValues() { } } +struct in_addr maskFromBitsIPV4(int bits) { + struct in_addr mask; + memset(&mask, 0, sizeof(mask)); + mask.s_addr = htonl(ULONG_MAX << (IPV4_BITS - bits)); + return mask; +} + +bool validateAndAddIPV4(sds string) { + struct in_addr ip, mask; + int subnetParts; + sds* subnetSplit = sdssplitlen(string, sdslen(string), "/", 1, &subnetParts); + TCleanup splitCleanup([&](){ + sdsfreesplitres(subnetSplit, subnetParts); + }); + if (subnetParts > 2) { + return false; + } else if (subnetParts == 2) { + char* endptr; + long bits = strtol(subnetSplit[1], &endptr, 10); + if (*endptr != '\0' || bits < 0 || bits > IPV4_BITS) { + return false; + } + mask = maskFromBitsIPV4(bits); + } else { + mask = maskFromBitsIPV4(IPV4_BITS); + } + if (subnetParts > 0) { + if (inet_pton(AF_INET, subnetSplit[0], &ip) == 0) { + return false; + } + g_pserver->overload_ignorelist.emplace(ip, mask); + return true; + } else { + return false; + } +} + 
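validateAndAddIPV4 above accepts either a bare address or a CIDR entry such as 10.12.0.0/16, builds a network-order mask from the prefix length, and stores the address/mask pair; a connecting peer is later exempted from load shedding when (peer & mask) equals the stored network. The IPv6 helpers that follow apply the same idea byte by byte across the 128-bit address. A small self-contained illustration of the IPv4 arithmetic, using made-up addresses and prefix:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <cstdint>
#include <cstdio>

// Build a network-order IPv4 mask from a prefix length (0..32).
static uint32_t maskFromBits(int bits) {
    uint32_t host = (bits <= 0) ? 0u : 0xFFFFFFFFu << (32 - bits);
    return htonl(host);
}

// True when 'client' falls inside 'cidrAddr/cidrBits'. The patch's IPV4 class
// stores the (ip, mask) pair and compares against the stored ip; masking both
// sides here keeps the standalone example robust to unaligned inputs.
static bool matches(const char* client, const char* cidrAddr, int cidrBits) {
    in_addr ip{}, net{};
    if (inet_pton(AF_INET, client, &ip) != 1) return false;
    if (inet_pton(AF_INET, cidrAddr, &net) != 1) return false;
    uint32_t mask = maskFromBits(cidrBits);
    return (ip.s_addr & mask) == (net.s_addr & mask);
}

int main() {
    // Hypothetical ignorelist entry "10.12.0.0/16" checked against two peers.
    std::printf("%d\n", matches("10.12.34.56", "10.12.0.0", 16)); // 1: exempt from shedding
    std::printf("%d\n", matches("10.13.0.1",   "10.12.0.0", 16)); // 0: still sheddable
}

The patch keeps the entries in a std::set and walks them per accepted connection in checkOverloadIgnorelist, which stays cheap as long as the ignore list is short.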
+struct in6_addr maskFromBitsIPV6(int bits) { + struct in6_addr mask; + memset(&mask, 0, sizeof(mask)); + for (unsigned int i = 0; i < sizeof(struct in6_addr); i++) { + if (bits >= CHAR_BIT) { + mask.s6_addr[i] = UCHAR_MAX; + bits -= CHAR_BIT; + } else if (bits > 0) { + mask.s6_addr[i] = UCHAR_MAX << (CHAR_BIT - bits); + bits = 0; + } else { + break; + } + } + return mask; +} + +bool validateAndAddIPV6(sds string) { + struct in6_addr ip, mask; + int subnetParts; + sds* subnetSplit = sdssplitlen(string, sdslen(string), "/", 1, &subnetParts); + TCleanup splitCleanup([&](){ + sdsfreesplitres(subnetSplit, subnetParts); + }); + if (subnetParts > 2) { + return false; + } else if (subnetParts == 2) { + char* endptr; + long bits = strtol(subnetSplit[1], &endptr, 10); + if (*endptr != '\0' || bits < 0 || bits > IPV6_BITS) { + return false; + } + mask = maskFromBitsIPV6(bits); + } else { + mask = maskFromBitsIPV6(IPV6_BITS); + } + if (subnetParts > 0) { + if (inet_pton(AF_INET6, subnetSplit[0], &ip) == 0) { + return false; + } + g_pserver->overload_ignorelist_ipv6.emplace(ip, mask); + return true; + } else { + return false; + } +} + void loadServerConfigFromString(char *config) { const char *err = NULL; int linenum = 0, totlines, i; @@ -783,11 +867,14 @@ void loadServerConfigFromString(char *config) { if (argc < 2) { err = "must supply at least one element in the ignore list"; goto loaderr; } - if (!g_pserver->overload_ignorelist.empty()) { + if (!g_pserver->overload_ignorelist.empty() || !g_pserver->overload_ignorelist_ipv6.empty()) { err = "overload-ignorelist may only be set once"; goto loaderr; } - for (int i = 1; i < argc; i++) - g_pserver->overload_ignorelist.emplace(argv[i], strlen(argv[i])); + for (int i = 1; i < argc; i++) { + if (!validateAndAddIPV4(argv[i]) && !validateAndAddIPV6(argv[i])) { + err = "overload-ignorelist must be a list of valid IP addresses or subnets"; goto loaderr; + } + } } else if (!strcasecmp(argv[0], "version-override") && argc == 2) { KEYDB_SET_VERSION = zstrdup(argv[1]); serverLog(LL_WARNING, "Warning version is overriden to: %s\n", KEYDB_SET_VERSION); @@ -1101,9 +1188,13 @@ void configSetCommand(client *c) { } } config_set_special_field("overload-ignorelist") { g_pserver->overload_ignorelist.clear(); + g_pserver->overload_ignorelist_ipv6.clear(); for (int i = 3; i < c->argc; ++i) { robj *val = c->argv[i]; - g_pserver->overload_ignorelist.emplace(szFromObj(val), sdslen(szFromObj(val))); + if (!validateAndAddIPV4(szFromObj(val)) && !validateAndAddIPV6(szFromObj(val))) { + addReplyError(c, "overload-ignorelist must be a list of valid IP addresses or subnets"); + return; + } } /* Everything else is an error... 
*/ } config_set_else { @@ -1326,9 +1417,16 @@ void configGetCommand(client *c) { } if (stringmatch(pattern, "overload-ignorelist", 1)) { addReplyBulkCString(c,"overload-ignorelist"); - addReplyArrayLen(c, (long)g_pserver->overload_ignorelist.size()); + addReplyArrayLen(c, (long)g_pserver->overload_ignorelist.size() + (long)g_pserver->overload_ignorelist_ipv6.size()); for (auto &elem : g_pserver->overload_ignorelist) { - addReplyBulkCBuffer(c, elem.get(), elem.size()); // addReplyBulkSds will free which we absolutely don't want + sds elem_sds = elem.getString(); + addReplyBulkCBuffer(c, elem_sds, sdslen(elem_sds)); + sdsfree(elem_sds); + } + for (auto &elem : g_pserver->overload_ignorelist_ipv6) { + sds elem_sds = elem.getString(); + addReplyBulkCBuffer(c, elem_sds, sdslen(elem_sds)); + sdsfree(elem_sds); } matches++; } @@ -2029,10 +2127,18 @@ int rewriteConfig(char *path, int force_all) { rewriteConfigMarkAsProcessed(state, "tls-auditlog-blocklist"); // ensure the line is removed if it existed } - if (!g_pserver->overload_ignorelist.empty()) { + if (!g_pserver->overload_ignorelist.empty() || !g_pserver->overload_ignorelist_ipv6.empty()) { sds conf = sdsnew("overload-ignorelist "); for (auto &elem : g_pserver->overload_ignorelist) { - conf = sdscatsds(conf, (sds)elem.get()); + sds elem_sds = elem.getString(); + conf = sdscatsds(conf, elem_sds); + sdsfree(elem_sds); + conf = sdscat(conf, " "); + } + for (auto &elem : g_pserver->overload_ignorelist_ipv6) { + sds elem_sds = elem.getString(); + conf = sdscatsds(conf, elem_sds); + sdsfree(elem_sds); conf = sdscat(conf, " "); } // trim the trailing space diff --git a/src/networking.cpp b/src/networking.cpp index 422437687..d4ee80d32 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -38,6 +38,7 @@ #include #include #include "aelocker.h" +#include static void setProtocolError(const char *errstr, client *c); __thread int ProcessingEventsWhileBlocked = 0; /* See processEventsWhileBlocked(). */ @@ -1173,6 +1174,29 @@ int chooseBestThreadForAccept() return ielMinLoad; } +bool checkOverloadIgnorelist(connection *conn) { + struct sockaddr_storage sa; + socklen_t salen = sizeof(sa); + if (getpeername(conn->fd, (struct sockaddr *)&sa, &salen) == -1) { + return false; + } + if (sa.ss_family == AF_INET) { + struct sockaddr_in *s = (struct sockaddr_in *)&sa; + for (auto ignore : g_pserver->overload_ignorelist) { + if (ignore.match(s->sin_addr)) + return true; + } + } + if (sa.ss_family == AF_INET6) { + struct sockaddr_in6 *s = (struct sockaddr_in6 *)&sa; + for (auto ignore : g_pserver->overload_ignorelist_ipv6) { + if (ignore.match(s->sin6_addr)) + return true; + } + } + return false; +} + void clientAcceptHandler(connection *conn) { client *c = (client*)connGetPrivateData(conn); @@ -1193,9 +1217,6 @@ void clientAcceptHandler(connection *conn) { if (cserver.fThreadAffinity) connSetThreadAffinity(conn, c->iel); - char cip[NET_IP_STR_LEN+1] = { 0 }; - connPeerToString(conn, cip, sizeof(cip)-1, NULL); - /* If the server is running in protected mode (the default) and there * is no password set, nor a specific interface is bound, we don't accept * requests from non loopback interfaces. 
Instead we try to explain the @@ -1203,41 +1224,47 @@ void clientAcceptHandler(connection *conn) { if (g_pserver->protected_mode && g_pserver->bindaddr_count == 0 && DefaultUser->flags & USER_FLAG_NOPASS && - !(c->flags & CLIENT_UNIX_SOCKET) && - strcmp(cip,"127.0.0.1") && strcmp(cip,"::1")) + !(c->flags & CLIENT_UNIX_SOCKET)) { - const char *err = - "-DENIED KeyDB is running in protected mode because protected " - "mode is enabled, no bind address was specified, no " - "authentication password is requested to clients. In this mode " - "connections are only accepted from the loopback interface. " - "If you want to connect from external computers to KeyDB you " - "may adopt one of the following solutions: " - "1) Just disable protected mode sending the command " - "'CONFIG SET protected-mode no' from the loopback interface " - "by connecting to KeyDB from the same host the server is " - "running, however MAKE SURE KeyDB is not publicly accessible " - "from internet if you do so. Use CONFIG REWRITE to make this " - "change permanent. " - "2) Alternatively you can just disable the protected mode by " - "editing the KeyDB configuration file, and setting the protected " - "mode option to 'no', and then restarting the server " - "3) If you started the server manually just for testing, restart " - "it with the '--protected-mode no' option. " - "4) Setup a bind address or an authentication password. " - "NOTE: You only need to do one of the above things in order for " - "the server to start accepting connections from the outside.\r\n"; - if (connWrite(c->conn,err,strlen(err)) == -1) { - /* Nothing to do, Just to avoid the warning... */ + char cip[NET_IP_STR_LEN+1] = { 0 }; + connPeerToString(conn, cip, sizeof(cip)-1, NULL); + if (strcmp(cip,"127.0.0.1") && strcmp(cip,"::1")) { + const char *err = + "-DENIED KeyDB is running in protected mode because protected " + "mode is enabled, no bind address was specified, no " + "authentication password is requested to clients. In this mode " + "connections are only accepted from the loopback interface. " + "If you want to connect from external computers to KeyDB you " + "may adopt one of the following solutions: " + "1) Just disable protected mode sending the command " + "'CONFIG SET protected-mode no' from the loopback interface " + "by connecting to KeyDB from the same host the server is " + "running, however MAKE SURE KeyDB is not publicly accessible " + "from internet if you do so. Use CONFIG REWRITE to make this " + "change permanent. " + "2) Alternatively you can just disable the protected mode by " + "editing the KeyDB configuration file, and setting the protected " + "mode option to 'no', and then restarting the server " + "3) If you started the server manually just for testing, restart " + "it with the '--protected-mode no' option. " + "4) Setup a bind address or an authentication password. " + "NOTE: You only need to do one of the above things in order for " + "the server to start accepting connections from the outside.\r\n"; + if (connWrite(c->conn,err,strlen(err)) == -1) { + /* Nothing to do, Just to avoid the warning... 
*/ + } + g_pserver->stat_rejected_conn++; + freeClientAsync(c); + return; } - g_pserver->stat_rejected_conn++; - freeClientAsync(c); - return; } - if (g_pserver->overload_ignorelist.find(sdsstring(sdsnew(cip))) != g_pserver->overload_ignorelist.end()) + if (checkOverloadIgnorelist(conn)) { c->flags |= CLIENT_IGNORE_OVERLOAD; + char cip[NET_IP_STR_LEN+1] = { 0 }; + connPeerToString(conn, cip, sizeof(cip)-1, NULL); + serverLog(LL_NOTICE, "Ignoring client from %s for loadshedding", cip); } g_pserver->stat_numconnections++; diff --git a/src/server.h b/src/server.h index 3f53de6e6..74e6d93bc 100644 --- a/src/server.h +++ b/src/server.h @@ -64,6 +64,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { #include @@ -125,9 +126,25 @@ typedef long long ustime_t; /* microsecond time type. */ #define OVERLOAD_PROTECT_PERIOD_MS 10'000 // 10 seconds #define MAX_CLIENTS_SHED_PER_PERIOD (OVERLOAD_PROTECT_PERIOD_MS / 10) // Restrict to one client per 10ms +#define IPV4_BITS 32 +#define IPV6_BITS 128 + extern int g_fTestMode; extern struct redisServer *g_pserver; +class TCleanup { + std::function fn; + +public: + TCleanup(std::function fn) + : fn(fn) + {} + + ~TCleanup() { + fn(); + } +}; + struct redisObject; class robj_roptr { @@ -2715,7 +2732,93 @@ struct redisServer { std::set tls_auditlog_blocklist; /* Certificates that can be excluded from audit logging */ std::set tls_allowlist; - std::set overload_ignorelist; + class IPV4 { + struct in_addr m_ip; + struct in_addr m_mask; + + int bitsFromMask() const { + uint32_t mask = ntohl(m_mask.s_addr); + int bits = 0; + while (mask > 0) { + bits += mask & 1; + mask >>= 1; + } + return bits; + } + + public: + IPV4(struct in_addr ip, struct in_addr mask) : m_ip(ip), m_mask(mask) {}; + bool match(struct in_addr ip) const + { + return (ip.s_addr & m_mask.s_addr) == m_ip.s_addr; + } + + sds getString() const + { + sds result = sdsempty(); + char buf[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, &m_ip, buf, sizeof(buf)); + result = sdscat(result, buf); + int bits = bitsFromMask(); + if (bits != IPV4_BITS) { + result = sdscat(result, "/"); + result = sdscat(result, std::to_string(bits).c_str()); + } + return result; + } + + bool operator<(const IPV4& rhs) const + { + return memcmp(&m_ip, &rhs.m_ip, sizeof(m_ip)) < 0 || (memcmp(&m_ip, &rhs.m_ip, sizeof(m_ip)) == 0 && memcmp(&m_mask, &rhs.m_mask, sizeof(m_mask)) < 0); + } + }; + class IPV6 { + struct in6_addr m_ip; + struct in6_addr m_mask; + + int bitsFromMask() const { + int bits = 0; + for (unsigned int i = 0; i < sizeof(struct in6_addr); i++) { + uint8_t mask = m_mask.s6_addr[i]; + while (mask > 0) { + bits += mask & 1; + mask >>= 1; + } + } + return bits; + } + public: + IPV6(struct in6_addr ip, struct in6_addr mask) : m_ip(ip), m_mask(mask) {}; + bool match(struct in6_addr ip) const + { + for (unsigned int i = 0; i < sizeof(struct in6_addr); i++) { + if ((ip.s6_addr[i] & m_mask.s6_addr[i]) != m_ip.s6_addr[i]) + return false; + } + return true; + } + + sds getString() const + { + sds result = sdsempty(); + char buf[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, &m_ip, buf, sizeof(buf)); + result = sdscat(result, buf); + int bits = bitsFromMask(); + if (bits != IPV6_BITS) { + result = sdscat(result, "/"); + result = sdscat(result, std::to_string(bits).c_str()); + } + return result; + } + + bool operator<(const IPV6& rhs) const + { + return memcmp(&m_ip, &rhs.m_ip, sizeof(m_ip)) < 0 || (memcmp(&m_ip, &rhs.m_ip, sizeof(m_ip)) == 0 && memcmp(&m_mask, &rhs.m_mask, sizeof(m_mask)) < 0); + } + }; + std::set 
overload_ignorelist; + std::set overload_ignorelist_ipv6; redisTLSContextConfig tls_ctx_config; /* cpu affinity */ diff --git a/src/tls.cpp b/src/tls.cpp index 8a8a97a95..9260cc0a2 100644 --- a/src/tls.cpp +++ b/src/tls.cpp @@ -555,19 +555,6 @@ void tlsSetCertificateFingerprint(tls_connection* conn, X509 * cert) { #define ASN1_STRING_get0_data ASN1_STRING_data #endif -class TCleanup { - std::function fn; - -public: - TCleanup(std::function fn) - : fn(fn) - {} - - ~TCleanup() { - fn(); - } -}; - bool tlsCheckCertificateAgainstAllowlist(tls_connection* conn, std::set allowlist, const char** commonName){ if (allowlist.empty()){ // An empty list implies acceptance of all From bb115ca99aef2839c4d27c189cd00896f76674a1 Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Fri, 8 Dec 2023 15:02:44 -0500 Subject: [PATCH 27/68] add overload-protect-tenacity to configure how many clients to evict (#243) * add overload-protect-tenacity to configure how many clients to evict * need to divide tenacity config by 100 to get percentage --- src/config.cpp | 1 + src/server.cpp | 6 +++--- src/server.h | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index bcc4379a4..098560133 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -3152,6 +3152,7 @@ standardConfig configs[] = { createSizeTConfig("semi-ordered-set-bucket-size", NULL, MODIFIABLE_CONFIG, 0, 1024, g_semiOrderedSetTargetBucketSize, 0, INTEGER_CONFIG, NULL, NULL), createSDSConfig("availability-zone", NULL, MODIFIABLE_CONFIG, 0, g_pserver->sdsAvailabilityZone, "", NULL, NULL), createIntConfig("overload-protect-percent", NULL, MODIFIABLE_CONFIG, 0, 200, g_pserver->overload_protect_threshold, 0, INTEGER_CONFIG, NULL, NULL), + createIntConfig("overload-protect-tenacity", NULL, MODIFIABLE_CONFIG, 0, 100, g_pserver->overload_protect_tenacity, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("force-eviction-percent", NULL, MODIFIABLE_CONFIG, 0, 100, g_pserver->force_eviction_percent, 0, INTEGER_CONFIG, NULL, NULL), createBoolConfig("enable-async-rehash", NULL, MODIFIABLE_CONFIG, g_pserver->enable_async_rehash, 1, NULL, NULL), createBoolConfig("enable-keydb-fastsync", NULL, MODIFIABLE_CONFIG, g_pserver->fEnableFastSync, 0, NULL, NULL), diff --git a/src/server.cpp b/src/server.cpp index 062c5bd2c..f00ce9f5f 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1962,12 +1962,12 @@ void getExpansiveClientsInfo(size_t *in_usage, size_t *out_usage) { } int closeClientOnOverload(client *c) { - if (g_pserver->overload_closed_clients > MAX_CLIENTS_SHED_PER_PERIOD) return false; + if (g_pserver->overload_closeable_clients <= 0) return false; if (!g_pserver->is_overloaded) return false; // Don't close masters, replicas, or pub/sub clients if (c->flags & (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_PENDING_WRITE | CLIENT_PUBSUB | CLIENT_BLOCKED | CLIENT_IGNORE_OVERLOAD)) return false; freeClient(c); - ++g_pserver->overload_closed_clients; + --g_pserver->overload_closeable_clients; return true; } @@ -2602,7 +2602,7 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { /* Check for CPU Overload */ run_with_period(10'000) { g_pserver->is_overloaded = false; - g_pserver->overload_closed_clients = 0; + g_pserver->overload_closeable_clients = (listLength(g_pserver->clients)-listLength(g_pserver->slaves)) * (g_pserver->overload_protect_tenacity/100); static clock_t last = 0; if (g_pserver->overload_protect_threshold > 0) { clock_t cur = clock(); diff --git a/src/server.h b/src/server.h index 
74e6d93bc..bca1da7f7 100644 --- a/src/server.h +++ b/src/server.h @@ -2861,9 +2861,10 @@ struct redisServer { sds sdsAvailabilityZone; int overload_protect_threshold = 0; + int overload_protect_tenacity = 0; float last_overload_cpu_reading = 0.0f; int is_overloaded = 0; - int overload_closed_clients = 0; + int overload_closeable_clients = 0; int module_blocked_pipe[2]; /* Pipe used to awake the event loop if a client blocked on a module command needs From bb7c5a4c77f43ec4c8d8ae39f7f5623330eaa314 Mon Sep 17 00:00:00 2001 From: Jacob Bohac Date: Fri, 8 Dec 2023 14:15:50 -0700 Subject: [PATCH 28/68] TLS cname loadshedding ignore list (#242) * TLS cname loadshedding ignore list --- src/config.cpp | 41 ++++++++++++++++++++++++++++++++++-- src/connection.h | 1 + src/networking.cpp | 3 +++ src/server.h | 1 + src/tls.cpp | 15 +++++++++++++ tests/unit/introspection.tcl | 1 + 6 files changed, 60 insertions(+), 2 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 098560133..29a750835 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -863,6 +863,15 @@ void loadServerConfigFromString(char *config) { } for (int i = 1; i < argc; i++) g_pserver->tls_auditlog_blocklist.emplace(argv[i], strlen(argv[i])); + } else if (!strcasecmp(argv[0], "tls-overload-ignorelist")) { + if (argc < 2) { + err = "must supply at least one element in the ignore list"; goto loaderr; + } + if (!g_pserver->tls_overload_ignorelist.empty()) { + err = "tls-overload-ignorelist may only be set once"; goto loaderr; + } + for (int i = 1; i < argc; i++) + g_pserver->tls_overload_ignorelist.emplace(argv[i], strlen(argv[i])); } else if (!strcasecmp(argv[0], "overload-ignorelist")) { if (argc < 2) { err = "must supply at least one element in the ignore list"; goto loaderr; @@ -1004,8 +1013,8 @@ void configSetCommand(client *c) { if (c->argc < 4 || c->argc > 4) { o = nullptr; - // Variadic set is only supported for tls-allowlist, tls-auditlog-blocklist and overload-ignorelist - if (strcasecmp(szFromObj(c->argv[2]), "tls-allowlist") && strcasecmp(szFromObj(c->argv[2]), "tls-auditlog-blocklist") && strcasecmp(szFromObj(c->argv[2]), "overload-ignorelist")) { + // Variadic set is only supported for tls-allowlist, tls-auditlog-blocklist, overload-ignorelist and tls-overload-ignorelist + if (strcasecmp(szFromObj(c->argv[2]), "tls-allowlist") && strcasecmp(szFromObj(c->argv[2]), "tls-auditlog-blocklist") && strcasecmp(szFromObj(c->argv[2]), "overload-ignorelist") && strcasecmp(szFromObj(c->argv[2]), "tls-overload-ignorelist") ) { addReplySubcommandSyntaxError(c); return; } @@ -1186,6 +1195,12 @@ void configSetCommand(client *c) { robj *val = c->argv[i]; g_pserver->tls_auditlog_blocklist.emplace(szFromObj(val), sdslen(szFromObj(val))); } + } config_set_special_field("tls-overload-ignorelist") { + g_pserver->tls_overload_ignorelist.clear(); + for (int i = 3; i < c->argc; ++i) { + robj *val = c->argv[i]; + g_pserver->tls_overload_ignorelist.emplace(szFromObj(val), sdslen(szFromObj(val))); + } } config_set_special_field("overload-ignorelist") { g_pserver->overload_ignorelist.clear(); g_pserver->overload_ignorelist_ipv6.clear(); @@ -1415,6 +1430,14 @@ void configGetCommand(client *c) { } matches++; } + if (stringmatch(pattern, "tls-overload-ignorelist", 1)) { + addReplyBulkCString(c,"tls-overload-ignorelist"); + addReplyArrayLen(c, (long)g_pserver->tls_overload_ignorelist.size()); + for (auto &elem : g_pserver->tls_overload_ignorelist) { + addReplyBulkCBuffer(c, elem.get(), elem.size()); // addReplyBulkSds will free which we absolutely 
don't want + } + matches++; + } if (stringmatch(pattern, "overload-ignorelist", 1)) { addReplyBulkCString(c,"overload-ignorelist"); addReplyArrayLen(c, (long)g_pserver->overload_ignorelist.size() + (long)g_pserver->overload_ignorelist_ipv6.size()); @@ -2113,6 +2136,20 @@ int rewriteConfig(char *path, int force_all) { rewriteConfigMarkAsProcessed(state, "tls-allowlist"); // ensure the line is removed if it existed } + if (!g_pserver->tls_overload_ignorelist.empty()) { + sds conf = sdsnew("tls-overload-ignorelist "); + for (auto &elem : g_pserver->tls_overload_ignorelist) { + conf = sdscatsds(conf, (sds)elem.get()); + conf = sdscat(conf, " "); + } + // trim the trailing space + sdsrange(conf, 0, -1); + rewriteConfigRewriteLine(state,"tls-overload-ignorelist",conf,1 /*force*/); + // note: conf is owned by rewriteConfigRewriteLine - no need to free + } else { + rewriteConfigMarkAsProcessed(state, "tls-overload-ignorelist"); // ensure the line is removed if it existed + } + if (!g_pserver->tls_auditlog_blocklist.empty()) { sds conf = sdsnew("tls-auditlog-blocklist "); for (auto &elem : g_pserver->tls_auditlog_blocklist) { diff --git a/src/connection.h b/src/connection.h index 0b0b6603a..1e6fb23d7 100644 --- a/src/connection.h +++ b/src/connection.h @@ -52,6 +52,7 @@ typedef enum { #define CONN_FLAG_READ_THREADSAFE (1<<2) #define CONN_FLAG_WRITE_THREADSAFE (1<<3) #define CONN_FLAG_AUDIT_LOGGING_REQUIRED (1<<4) +#define CONN_FLAG_IGNORE_OVERLOAD (1<<5) #define CONN_TYPE_SOCKET 1 #define CONN_TYPE_TLS 2 diff --git a/src/networking.cpp b/src/networking.cpp index d4ee80d32..0ceabd64d 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1175,6 +1175,9 @@ int chooseBestThreadForAccept() } bool checkOverloadIgnorelist(connection *conn) { + if (conn->flags & CONN_FLAG_IGNORE_OVERLOAD) { + return true; + } struct sockaddr_storage sa; socklen_t salen = sizeof(sa); if (getpeername(conn->fd, (struct sockaddr *)&sa, &salen) == -1) { diff --git a/src/server.h b/src/server.h index bca1da7f7..e50ba6c3a 100644 --- a/src/server.h +++ b/src/server.h @@ -2731,6 +2731,7 @@ struct redisServer { int tls_rotation; std::set tls_auditlog_blocklist; /* Certificates that can be excluded from audit logging */ + std::set tls_overload_ignorelist; /* Certificates that are be excluded load shedding */ std::set tls_allowlist; class IPV4 { struct in_addr m_ip; diff --git a/src/tls.cpp b/src/tls.cpp index 9260cc0a2..f1d6baa9f 100644 --- a/src/tls.cpp +++ b/src/tls.cpp @@ -627,6 +627,18 @@ bool tlsCheckCertificateAgainstAllowlist(tls_connection* conn, std::settls_overload_ignorelist, &cn)) { + // Certificate is in exclusion list, no need to audit log + serverLog(LL_NOTICE, "Loadshedding: disabled for %s", conn->c.fprint); + return true; + } else { + serverLog(LL_NOTICE, "Loadshedding: enabled for %s", conn->c.fprint); + return false; + } +} + bool tlsCertificateRequiresAuditLogging(tls_connection* conn){ const char* cn = ""; if (tlsCheckCertificateAgainstAllowlist(conn, g_pserver->tls_auditlog_blocklist, &cn)) { @@ -879,6 +891,9 @@ void tlsHandleEvent(tls_connection *conn, int mask) { if (tlsCertificateRequiresAuditLogging(conn)){ conn->c.flags |= CONN_FLAG_AUDIT_LOGGING_REQUIRED; } + if (tlsCertificateIgnoreLoadShedding(conn)){ + conn->c.flags |= CONN_FLAG_IGNORE_OVERLOAD; + } } } diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index c89265cd8..5555f3b44 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -172,6 +172,7 @@ start_server {tags {"introspection"}} { 
repl-backlog-disk-reserve tls-allowlist tls-auditlog-blocklist + tls-overload-ignorelist overload-ignorelist db-s3-object } From e1935a7cfd15e01861202cf2ae888f639f047e28 Mon Sep 17 00:00:00 2001 From: "Yu Zhao (Yuri)" Date: Wed, 22 Nov 2023 00:50:58 -0500 Subject: [PATCH 29/68] refactor asyncwrite and asyncread work queue --- src/config.cpp | 2 +- src/db.cpp | 1 + src/server.cpp | 7 ++++++- src/server.h | 2 ++ src/storage/rocksdb.cpp | 4 ++-- src/storage/rocksdbfactor_internal.h | 5 +++-- src/storage/rocksdbfactory.cpp | 8 ++++---- src/storage/rocksdbfactory.h | 2 +- 8 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index a8217e7bc..fbe97dd0f 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -361,7 +361,7 @@ bool initializeStorageProvider(const char **err) // Create The Storage Factory (if necessary) serverLog(LL_NOTICE, "Initializing FLASH storage provider (this may take a long time)"); adjustOpenFilesLimit(); - g_pserver->m_pstorageFactory = CreateRocksDBStorageFactory(g_sdsArgs, cserver.dbnum, cserver.storage_conf, cserver.storage_conf ? strlen(cserver.storage_conf) : 0, &g_pserver->asyncworkqueue); + g_pserver->m_pstorageFactory = CreateRocksDBStorageFactory(g_sdsArgs, cserver.dbnum, cserver.storage_conf, cserver.storage_conf ? strlen(cserver.storage_conf) : 0, &g_pserver->asyncreadworkqueue, &g_pserver->asyncwriteworkqueue); #else serverLog(LL_WARNING, "To use the flash storage provider please compile KeyDB with ENABLE_FLASH=yes"); serverLog(LL_WARNING, "Exiting due to the use of an unsupported storage provider"); diff --git a/src/db.cpp b/src/db.cpp index 9a0f35684..a861244b9 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3416,6 +3416,7 @@ void redisDbPersistentData::prefetchKeysFlash(std::unordered_set &setc) blockClient(c, BLOCKED_STORAGE); } tok->setc = std::move(setcBlocked); + tok->type = StorageToken::TokenType::SingleRead; tok->db = this; } return; diff --git a/src/server.cpp b/src/server.cpp index 4792d0fd9..abdf73dfe 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -4158,7 +4158,12 @@ void InitServerLast() { set_jemalloc_bg_thread(cserver.jemalloc_bg_thread); g_pserver->initial_memory_usage = zmalloc_used_memory(); - g_pserver->asyncworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(cserver.cthreads*10); + g_pserver->asyncworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(cserver.cthreads); + + g_pserver->asyncreadworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(cserver.cthreads*10); + + //Process one write/commit at a time to ensure consistency + g_pserver->asyncwriteworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(1); // Allocate the repl backlog diff --git a/src/server.h b/src/server.h index ef21fd930..88f3031ce 100644 --- a/src/server.h +++ b/src/server.h @@ -2705,6 +2705,8 @@ struct redisServer { uint64_t mvcc_tstamp; AsyncWorkQueue *asyncworkqueue; + AsyncWorkQueue *asyncreadworkqueue; + AsyncWorkQueue *asyncwriteworkqueue; /* System hardware info */ size_t system_memory_size; /* Total memory in system as reported by OS */ diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index 56f00821f..73e709a8b 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -266,7 +266,7 @@ StorageToken* RocksDBStorageProvider::begin_endWriteBatch(struct aeEventLoop *el tok->tspdb = m_spdb; m_spbatch = nullptr; m_lock.unlock(); - (*m_pfactory->m_wqueue)->AddWorkFunction([this, el,callback,tok]{ + (*m_pfactory->m_wwqueue)->AddWorkFunction([this, el,callback,tok]{ 
tok->tspdb->Write(WriteOptions(),tok->tspbatch.get()->GetWriteBatch()); aePostFunction(el,callback,tok); }); @@ -316,7 +316,7 @@ StorageToken *RocksDBStorageProvider::begin_retrieve(struct aeEventLoop *el, aeP auto opts = ReadOptions(); opts.async_io = true; - (*m_pfactory->m_wqueue)->AddWorkFunction([this, el, callback, tok, opts]{ + (*m_pfactory->m_rwqueue)->AddWorkFunction([this, el, callback, tok, opts]{ std::vector veckeysStr; std::vector veckeys; std::vector vecvals; diff --git a/src/storage/rocksdbfactor_internal.h b/src/storage/rocksdbfactor_internal.h index addff77ce..0fa34eec5 100644 --- a/src/storage/rocksdbfactor_internal.h +++ b/src/storage/rocksdbfactor_internal.h @@ -11,9 +11,10 @@ class RocksDBStorageFactory : public IStorageFactory bool m_fCreatedTempFolder = false; public: - AsyncWorkQueue **m_wqueue; + AsyncWorkQueue **m_rwqueue; + AsyncWorkQueue **m_wwqueue; - RocksDBStorageFactory(const char *dbfile, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **wqueue); + RocksDBStorageFactory(const char *dbfile, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **rwqueue, AsyncWorkQueue **wwqueue); ~RocksDBStorageFactory(); virtual IStorage *create(int db, key_load_iterator iter, void *privdata) override; diff --git a/src/storage/rocksdbfactory.cpp b/src/storage/rocksdbfactory.cpp index 0ebfb93e1..2e02010be 100644 --- a/src/storage/rocksdbfactory.cpp +++ b/src/storage/rocksdbfactory.cpp @@ -35,9 +35,9 @@ rocksdb::Options DefaultRocksDBOptions() { return options; } -IStorageFactory *CreateRocksDBStorageFactory(const char *path, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **wqueue) +IStorageFactory *CreateRocksDBStorageFactory(const char *path, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **rwqueue, AsyncWorkQueue **wwqueue) { - return new RocksDBStorageFactory(path, dbnum, rgchConfig, cchConfig, wqueue); + return new RocksDBStorageFactory(path, dbnum, rgchConfig, cchConfig, rwqueue, wwqueue); } rocksdb::Options RocksDBStorageFactory::RocksDbOptions() @@ -52,8 +52,8 @@ rocksdb::Options RocksDBStorageFactory::RocksDbOptions() return options; } -RocksDBStorageFactory::RocksDBStorageFactory(const char *dbfile, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **wqueue) - : m_path(dbfile), m_wqueue(wqueue) +RocksDBStorageFactory::RocksDBStorageFactory(const char *dbfile, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **rwqueue, AsyncWorkQueue **wwqueue) + : m_path(dbfile), m_rwqueue(rwqueue), m_wwqueue(wwqueue) { dbnum++; // create an extra db for metadata // Get the count of column families in the actual database diff --git a/src/storage/rocksdbfactory.h b/src/storage/rocksdbfactory.h index c137a79e4..e12881475 100644 --- a/src/storage/rocksdbfactory.h +++ b/src/storage/rocksdbfactory.h @@ -1,3 +1,3 @@ #pragma once -class IStorageFactory *CreateRocksDBStorageFactory(const char *path, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **wqueue); \ No newline at end of file +class IStorageFactory *CreateRocksDBStorageFactory(const char *path, int dbnum, const char *rgchConfig, size_t cchConfig, AsyncWorkQueue **rwqueue, AsyncWorkQueue **wwqueue); \ No newline at end of file From 3d4c5a0af0b9d74b1744ef5c350903f6c01d72dc Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 11 Jan 2024 19:16:39 +0000 Subject: [PATCH 30/68] Fix compile warning --- src/IStorage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IStorage.h 
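Patch 29 above splits the FLASH work into two pools: asyncreadworkqueue gets cthreads*10 workers for parallel retrievals, while asyncwriteworkqueue is created with a single thread, with the comment that writes and commits must be processed one at a time for consistency. A single consumer is the simplest way to guarantee that queued write batches execute in submission order; below is a minimal sketch of such a queue, a generic stand-in rather than KeyDB's AsyncWorkQueue implementation, with AddWorkFunction mirroring the call sites in rocksdb.cpp.

#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

class SerialWorkQueue {                    // one worker thread => FIFO execution order
    std::queue<std::function<void()>> q;
    std::mutex m;
    std::condition_variable cv;
    bool stopping = false;
    std::thread worker;
public:
    SerialWorkQueue() : worker([this] {
        for (;;) {
            std::function<void()> job;
            {
                std::unique_lock<std::mutex> lk(m);
                cv.wait(lk, [this] { return stopping || !q.empty(); });
                if (q.empty()) return;     // stop only after draining queued work
                job = std::move(q.front());
                q.pop();
            }
            job();                         // run outside the lock
        }
    }) {}
    void AddWorkFunction(std::function<void()> fn) {
        { std::lock_guard<std::mutex> lk(m); q.push(std::move(fn)); }
        cv.notify_one();
    }
    ~SerialWorkQueue() {
        { std::lock_guard<std::mutex> lk(m); stopping = true; }
        cv.notify_one();
        worker.join();
    }
};

int main() {
    SerialWorkQueue writes;
    for (int i = 0; i < 3; ++i)
        writes.AddWorkFunction([i] { std::cout << "commit " << i << "\n"; }); // always 0, 1, 2
}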
b/src/IStorage.h index 82f8087a0..98c00b49b 100644 --- a/src/IStorage.h +++ b/src/IStorage.h @@ -53,7 +53,7 @@ class IStorage virtual StorageToken *begin_retrieve(struct aeEventLoop *, aePostFunctionTokenProc, sds *, size_t) {return nullptr;}; virtual void complete_retrieve(StorageToken * /*tok*/, callbackSingle /*fn*/) {}; - virtual StorageToken* begin_endWriteBatch(struct aeEventLoop *, aePostFunctionTokenProc*) {} // NOP + virtual StorageToken* begin_endWriteBatch(struct aeEventLoop *, aePostFunctionTokenProc*) { return nullptr; } // NOP virtual void complete_endWriteBatch(StorageToken * /*tok*/) {}; virtual void bulkInsert(char **rgkeys, size_t *rgcbkeys, char **rgvals, size_t *rgcbvals, size_t celem) { From b74652a18048625b60aa067f951af06721edcc5d Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Tue, 23 Jan 2024 14:29:43 -0500 Subject: [PATCH 31/68] change overload protection to sending -OVERLOAD errors instead of disconnecting clients (#245) --- src/networking.cpp | 16 ++++++++++++---- src/server.cpp | 20 ++++++++------------ src/server.h | 10 +++++----- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/networking.cpp b/src/networking.cpp index 0ceabd64d..be0a5efc2 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2793,9 +2793,9 @@ void readQueryFromClient(connection *conn) { return; } - if (cserver.cthreads > 1 || g_pserver->m_pstorageFactory) { + if (cserver.cthreads > 1 || g_pserver->m_pstorageFactory || g_pserver->is_overloaded) { parseClientCommandBuffer(c); - if (g_pserver->enable_async_commands && !serverTL->disable_async_commands && listLength(g_pserver->monitors) == 0 && (aeLockContention() || serverTL->rgdbSnapshot[c->db->id] || g_fTestMode) && !serverTL->in_eval && !serverTL->in_exec) { + if (g_pserver->enable_async_commands && !serverTL->disable_async_commands && listLength(g_pserver->monitors) == 0 && (aeLockContention() || serverTL->rgdbSnapshot[c->db->id] || g_fTestMode) && !serverTL->in_eval && !serverTL->in_exec && !g_pserver->is_overloaded) { // Frequent writers aren't good candidates for this optimization, they cause us to renew the snapshot too often // so we exclude them unless the snapshot we need already exists. 
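Patch 31, whose diff continues just below, changes overload protection from freeing clients to replying with the new shared -OVERLOAD error: while the server is flagged overloaded, a non-exempt client's queued commands are each answered with that error and then discarded with probability overload_protect_strength percent, and serverCron doubles the strength for every further ten-second period spent overloaded, starting from overload-protect-tenacity and resetting to it once the overload clears. A small standalone model of that policy, with made-up numbers:

#include <cstdlib>
#include <iostream>

// Toy model of the shedding policy: strength ramps while the overload persists.
struct OverloadPolicy {
    int tenacity = 10;          // overload-protect-tenacity, in percent
    int strength = 10;          // current shedding probability, in percent
    bool overloaded = false;

    // Invoked once per protection period (every 10 seconds in serverCron).
    void onPeriod(bool cpuOverThreshold) {
        if (overloaded) { if (strength < 100) strength *= 2; }  // keep ramping up
        else            { strength = tenacity; }                // back to baseline
        overloaded = cpuOverThreshold;
    }

    // Decide whether a client's queued commands get -OVERLOAD replies instead.
    // Strength may pass 100, which simply means every queued command is shed.
    bool shouldShed(bool exemptClient) const {
        if (!overloaded || exemptClient) return false;
        return (std::rand() % 100) < strength;
    }
};

int main() {
    OverloadPolicy p;
    for (int period = 0; period < 5; ++period) {
        p.onPeriod(true);       // pretend the CPU stays above the protect threshold
        std::cout << "period " << period << ": shed ~" << p.strength << "% of clients\n";
    }
}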
// Note: In test mode we want to create snapshots as often as possibl to excercise them - we don't care about perf @@ -2807,8 +2807,16 @@ void readQueryFromClient(connection *conn) { processInputBuffer(c, false, CMD_CALL_SLOWLOG | CMD_CALL_STATS | CMD_CALL_ASYNC); } } - if (!c->vecqueuedcmd.empty()) - serverTL->vecclientsProcess.push_back(c); + if (!c->vecqueuedcmd.empty()) { + if (g_pserver->is_overloaded && !(c->flags & (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_PENDING_WRITE | CLIENT_PUBSUB | CLIENT_BLOCKED | CLIENT_IGNORE_OVERLOAD)) && ((random() % 100) < g_pserver->overload_protect_strength)) { + for (unsigned i = 0; i < c->vecqueuedcmd.size(); i++) { + addReply(c, shared.overloaderr); + } + c->vecqueuedcmd.clear(); + } else { + serverTL->vecclientsProcess.push_back(c); + } + } } else { // If we're single threaded its actually better to just process the command here while the query is hot in the cache // multithreaded lock contention dominates and batching is better diff --git a/src/server.cpp b/src/server.cpp index f00ce9f5f..3a2a5dda1 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -1961,15 +1961,6 @@ void getExpansiveClientsInfo(size_t *in_usage, size_t *out_usage) { *out_usage = o; } -int closeClientOnOverload(client *c) { - if (g_pserver->overload_closeable_clients <= 0) return false; - if (!g_pserver->is_overloaded) return false; - // Don't close masters, replicas, or pub/sub clients - if (c->flags & (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_PENDING_WRITE | CLIENT_PUBSUB | CLIENT_BLOCKED | CLIENT_IGNORE_OVERLOAD)) return false; - freeClient(c); - --g_pserver->overload_closeable_clients; - return true; -} /* This function is called by serverCron() and is used in order to perform * operations on clients that are important to perform constantly. For instance @@ -2041,7 +2032,6 @@ void clientsCron(int iel) { if (clientsCronTrackExpansiveClients(c, curr_peak_mem_usage_slot)) goto LContinue; if (clientsCronTrackClientsMemUsage(c)) goto LContinue; if (closeClientOnOutputBufferLimitReached(c, 0)) continue; // Client also free'd - if (closeClientOnOverload(c)) continue; LContinue: fastlock_unlock(&c->lock); } @@ -2601,8 +2591,13 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { /* Check for CPU Overload */ run_with_period(10'000) { + if (g_pserver->is_overloaded) { + if (g_pserver->overload_protect_strength < 100) + g_pserver->overload_protect_strength *= 2; + } else { + g_pserver->overload_protect_strength = g_pserver->overload_protect_tenacity; + } g_pserver->is_overloaded = false; - g_pserver->overload_closeable_clients = (listLength(g_pserver->clients)-listLength(g_pserver->slaves)) * (g_pserver->overload_protect_tenacity/100); static clock_t last = 0; if (g_pserver->overload_protect_threshold > 0) { clock_t cur = clock(); @@ -3120,7 +3115,8 @@ void createSharedObjects(void) { "-NOREPLICAS Not enough good replicas to write.\r\n"))); shared.busykeyerr = makeObjectShared(createObject(OBJ_STRING,sdsnew( "-BUSYKEY Target key name already exists.\r\n"))); - + shared.overloaderr = makeObjectShared(createObject(OBJ_STRING,sdsnew( + "-OVERLOAD KeyDB is overloaded.\r\n"))); /* The shared NULL depends on the protocol version. 
*/ shared.null[0] = NULL; diff --git a/src/server.h b/src/server.h index e50ba6c3a..e4954a135 100644 --- a/src/server.h +++ b/src/server.h @@ -1762,7 +1762,7 @@ struct sharedObjectsStruct { *emptyarray, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr, *outofrangeerr, *noscripterr, *loadingerr, *slowscripterr, *bgsaveerr, *masterdownerr, *roslaveerr, *execaborterr, *noautherr, *noreplicaserr, - *busykeyerr, *oomerr, *plus, *messagebulk, *pmessagebulk, *subscribebulk, + *busykeyerr, *oomerr, *overloaderr, *plus, *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *psubscribebulk, *punsubscribebulk, *del, *unlink, *rpop, *lpop, *lpush, *rpoplpush, *lmove, *blmove, *zpopmin, *zpopmax, *emptyscan, *multi, *exec, *left, *right, *hset, *srem, *xgroup, *xclaim, @@ -2863,13 +2863,13 @@ struct redisServer { sds sdsAvailabilityZone; int overload_protect_threshold = 0; int overload_protect_tenacity = 0; + int overload_protect_strength = 0; float last_overload_cpu_reading = 0.0f; int is_overloaded = 0; - int overload_closeable_clients = 0; - int module_blocked_pipe[2]; /* Pipe used to awake the event loop if a - client blocked on a module command needs - to be processed. */ + int module_blocked_pipe[2]; /* Pipe used to awake the event loop if a + client blocked on a module command needs + to be processed. */ bool FRdbSaveInProgress() const { return g_pserver->rdbThreadVars.fRdbThreadActive; } }; From 2b01c6c71027383a5111999773802767082fa057 Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Thu, 25 Jan 2024 15:54:00 -0500 Subject: [PATCH 32/68] handle nodes in handshake state in statsd module cluster nodes metrics (#246) * handle nodes in handshake state in statsd module cluster nodes metrics * handle all possible outputs of cluster nodes * Update src/modules/keydb_modstatsd/modmain.cpp Co-authored-by: Sergey Kolosov * make error plural Co-authored-by: Sergey Kolosov --- src/modules/keydb_modstatsd/modmain.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/modules/keydb_modstatsd/modmain.cpp b/src/modules/keydb_modstatsd/modmain.cpp index e26583ccf..27c7df65c 100644 --- a/src/modules/keydb_modstatsd/modmain.cpp +++ b/src/modules/keydb_modstatsd/modmain.cpp @@ -437,6 +437,8 @@ void handle_cluster_nodes_response(struct RedisModuleCtx *ctx, const char *szRep const char *pchLineStart = szReply; long long primaries = 0; long long replicas = 0; + long long handshakes = 0; + long long errors = 0; while (SAFETY_CHECK_POINTER(pchLineStart) && *pchLineStart != '\0') { // Loop Each Line const char *pchLineEnd = pchLineStart; @@ -449,6 +451,10 @@ void handle_cluster_nodes_response(struct RedisModuleCtx *ctx, const char *szRep ++primaries; } else if (std::string::npos != line.find("slave")) { ++replicas; + } else if (std::string::npos != line.find("handshake")) { + ++handshakes; + } else if (std::string::npos != line.find("fail?") || std::string::npos != line.find("fail") || std::string::npos != line.find("noaddr") || std::string::npos != line.find("nofailover") || std::string::npos != line.find("noflags") ) { + ++errors; } else { RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_WARNING, "Unexpected NODE format returned by \"CLUSTER NODES\" command: \"%s\"", line.c_str()); } @@ -480,6 +486,10 @@ void handle_cluster_nodes_response(struct RedisModuleCtx *ctx, const char *szRep RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_DEBUG, "Emitting metric \"primaries\": %llu", primaries); g_stats->gauge("replicas", replicas); RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_DEBUG, "Emitting metric 
\"replicas\": %llu", replicas); + g_stats->gauge("handshakes", handshakes); + RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_DEBUG, "Emitting metric \"handshakes\": %llu", handshakes); + g_stats->gauge("cluster_nodes_error_nodes", errors); + RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_DEBUG, "Emitting metric \"cluster_nodes_error_nodes\": %llu", errors); #undef SAFETY_CHECK_POINTER } From 7a32cf9955bfa0c2c0856b502c0f1e8302a3775a Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Thu, 25 Jan 2024 19:03:38 -0500 Subject: [PATCH 33/68] reset replica offsets after resetting replication buffer (#247) --- src/replication.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/replication.cpp b/src/replication.cpp index 03d63bf85..82f9d01e6 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -346,6 +346,17 @@ void resizeReplicationBacklog(long long newsize) { /* Next byte we have is... the next since the buffer is empty. */ g_pserver->repl_backlog_off = g_pserver->master_repl_offset+1; g_pserver->repl_backlog_start = g_pserver->master_repl_offset; + listIter li; + listNode *ln; + listRewind(g_pserver->slaves, &li); + while ((ln = listNext(&li))) { + client *replica = (client*)listNodeValue(ln); + + std::unique_lock ul(replica->lock); + + replica->repl_curr_off = g_pserver->master_repl_offset; + replica->repl_end_off = g_pserver->master_repl_offset; + } } } g_pserver->repl_backlog_size = newsize; From d1484aa780d23a0656706d783186864e1655b267 Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Tue, 30 Jan 2024 13:54:18 -0500 Subject: [PATCH 34/68] Add redis bloom module (#252) * build redisbloom in dockerfile * add separate step for redisbloom * end RUN properly * can't verify with github? * just needed ca-cert --- machamp_scripts/Dockerfile | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/machamp_scripts/Dockerfile b/machamp_scripts/Dockerfile index b7f3bf6e4..d15a62df9 100644 --- a/machamp_scripts/Dockerfile +++ b/machamp_scripts/Dockerfile @@ -73,7 +73,25 @@ RUN set -eux; \ | xargs -r apt-mark manual \ ; \ apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false; \ - rm -rf /var/lib/apt/lists/*; \ + rm -rf /var/lib/apt/lists/* +# build RedisBloom +RUN set -eux; \ + savedAptMark="$(apt-mark showmanual)"; \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get -o Dpkg::Options::="--force-confnew" install -qqy --no-install-recommends \ + ca-certificates \ + wget \ + make \ + pkg-config \ + build-essential \ + git; \ + git clone --recursive https://github.com/RedisBloom/RedisBloom.git; \ + cd RedisBloom; \ + ./sbin/setup; \ + bash -l; \ + make; \ + redisBloomPath=`make run -n | awk '{print $NF}'`; \ + cp $redisBloomPath /usr/local/lib/ # create working directories and organize files RUN \ mkdir /data && chown keydb:keydb /data; \ From 5681d067a69fbc4513696d5ab05686bfa68e731f Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Tue, 30 Jan 2024 14:22:51 -0500 Subject: [PATCH 35/68] Fix replication rdb load timeout (#251) * don't timeout replication when loading rdb * fix bracket placement --- src/replication.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/replication.cpp b/src/replication.cpp index 82f9d01e6..5f53d393e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -4823,7 +4823,8 @@ void replicationCron(void) { /* Bulk transfer I/O timeout? 
*/ if (mi->masterhost && mi->repl_state == REPL_STATE_TRANSFER && - (time(NULL)-mi->repl_transfer_lastio) > g_pserver->repl_timeout) + (time(NULL)-mi->repl_transfer_lastio) > g_pserver->repl_timeout && + !(g_pserver->loading & LOADING_REPLICATION)) { serverLog(LL_WARNING,"Timeout receiving bulk data from MASTER... If the problem persists try to set the 'repl-timeout' parameter in keydb.conf to a larger value."); cancelReplicationHandshake(mi,true); From 77eaaa224759d2845df558e7ecdac89eff9323fa Mon Sep 17 00:00:00 2001 From: Malavan Sotheeswaran Date: Tue, 30 Jan 2024 17:12:00 -0500 Subject: [PATCH 36/68] skip memefficiency tests in CI as they are inconsistent (#250) --- build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.yaml b/build.yaml index 8b63cc7ff..28b578956 100644 --- a/build.yaml +++ b/build.yaml @@ -21,7 +21,7 @@ machamp: parent: make-build # https://github.sc-corp.net/Snapchat/img/tree/master/keydb/ubuntu-20-04 builder_image: us.gcr.io/snapchat-build-artifacts/prod/snapchat/img/keydb/keydb-ubuntu-20-04@sha256:cf869a3f5d1de1e1d976bb906689c37b7031938eb68661b844a38c532f27248c - command: ./runtest --clients 4 --verbose --tls + command: ./runtest --clients 4 --verbose --tls --skipunit unit/memefficiency cluster-test: type: cmd parent: make-build From 65b91650318fada9945f6f212f955bcbf1cc7297 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 31 Jan 2024 22:02:02 +0000 Subject: [PATCH 37/68] Fix crash in replica replaying storage op --- src/blocked.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/blocked.cpp b/src/blocked.cpp index 8062b8d54..366e284dd 100644 --- a/src/blocked.cpp +++ b/src/blocked.cpp @@ -92,7 +92,8 @@ void blockClient(client *c, int btype) { /* Master client should never be blocked unless pause or module */ serverAssert(!(c->flags & CLIENT_MASTER && btype != BLOCKED_MODULE && - btype != BLOCKED_PAUSE)); + btype != BLOCKED_PAUSE && + btype != BLOCKED_STORAGE)); c->flags |= CLIENT_BLOCKED; c->btype = btype; From 989421ea550a8b066f0553c90c69aeb924e0b140 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 31 Jan 2024 22:02:26 +0000 Subject: [PATCH 38/68] Prevent crash on client disconnecting while I/O in flight --- src/networking.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/networking.cpp b/src/networking.cpp index 2c047ae7c..fd0b0d1a3 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1608,7 +1608,7 @@ bool freeClient(client *c) { /* If a client is protected, yet we need to free it right now, make sure * to at least use asynchronous freeing. */ - if (c->flags & CLIENT_PROTECTED || c->casyncOpsPending || c->replstate == SLAVE_STATE_FASTSYNC_TX) { + if (c->flags & CLIENT_PROTECTED || c->casyncOpsPending || c->replstate == SLAVE_STATE_FASTSYNC_TX || c->btype == BLOCKED_STORAGE) { freeClientAsync(c); return false; } From 195828ed4c14f665d7f73b8daff0266432879abf Mon Sep 17 00:00:00 2001 From: Alex Cope Date: Thu, 15 Feb 2024 12:58:20 -0800 Subject: [PATCH 39/68] Log hitrate metric in modstatsd (#255) https://jira.sc-corp.net/browse/CACHE-1543 Log hit rate metric separately because M3DB caps values at 360 million so dividing monotonically increasing `keyspace_hit/miss` values is meaningless with any significant amount of traffic. Multiplying by one million since M3DB only uses integer values. 
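As a rough illustration of the intended math only (a standalone sketch; `hitRatePerMillion` and the sample counts are made-up names and values, not the module's API):

```cpp
#include <cstdio>

// Sketch: scale a cache hit ratio to parts-per-million so an integer-only
// metrics backend can carry it without losing precision.
static long long hitRatePerMillion(long long hits, long long misses) {
    long long total = hits + misses;
    if (total <= 0) return 0;              // no traffic in this window
    return (hits * 1000000LL) / total;     // 990000 == 99% hit rate
}

int main() {
    // e.g. 9,900 hits and 100 misses in one sampling window -> 990000
    printf("keyspace_hit_rate_per_million=%lld\n", hitRatePerMillion(9900, 100));
    return 0;
}
```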
--- src/modules/keydb_modstatsd/modmain.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/modules/keydb_modstatsd/modmain.cpp b/src/modules/keydb_modstatsd/modmain.cpp index 27c7df65c..4e015aefc 100644 --- a/src/modules/keydb_modstatsd/modmain.cpp +++ b/src/modules/keydb_modstatsd/modmain.cpp @@ -298,6 +298,7 @@ void handleStatItem(struct RedisModuleCtx *ctx, std::string name, StatsRecord &r long long val = strtoll(pchValue, nullptr, 10); g_stats->gauge(name, val, record.prefixOnly); RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_DEBUG, "Emitting metric \"%s\": %lld", name.c_str(), val); + record.prevVal = val; break; } @@ -430,6 +431,20 @@ void handle_info_response(struct RedisModuleCtx *ctx, const char *szReply, size_ } #undef SAFETY_CHECK_POINTER + + // Emit keyspace hit rate metric which is computed from info values + auto hit = g_mapInfoFields.find("keyspace_hits"); + auto miss = g_mapInfoFields.find("keyspace_misses"); + if (hit != g_mapInfoFields.end() && miss != g_mapInfoFields.end()) { + long long hitVal = hit->second.prevVal; + long long missVal = miss->second.prevVal; + long long total = hitVal + missVal; + if (total > 0) { + double hitRate = (double)hitVal / total * 1000000; + g_stats->gauge("keyspace_hit_rate_per_million", hitRate); + RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_DEBUG, "Emitting metric \"keyspace_hit_rate_per_million\": %f", hitRate); + } + } } void handle_cluster_nodes_response(struct RedisModuleCtx *ctx, const char *szReply, size_t len) { From cba39986c1702e4f5d9eadd7d52953b24a067b8c Mon Sep 17 00:00:00 2001 From: Alex Cope Date: Tue, 20 Feb 2024 18:09:40 -0800 Subject: [PATCH 40/68] Limit repl buffer writes per event loop (#256) We discovered that full sync loops were happening because after the replica loads the RDB file, the repl backlog which the primary sends over can sometimes exceed int32 max, causing an SSL error due to integer overflow: ``` Error writing to client: error:140D010F:SSL routines:SSL_write:bad length ``` After fixing that error by capping the write length and testing in staging, we still observed errors because the primary attempts to send a massive (multi-GB) payload in one event loop, which on the staging instances takes over one minute causing the primary to fail health checks and restart. To address this we cap the writes sent per event loop to a configurable amount, similar to `NET_MAX_WRITES_PER_EVENT` in the normal case.
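The general shape of that cap, as a minimal standalone sketch (`writeCapped`, `writeFn` and `maxWritesPerEvent` are illustrative names, not the KeyDB implementation):

```cpp
#include <algorithm>
#include <cstddef>
#include <sys/types.h>

// Sketch: drain at most maxWritesPerEvent bytes of a pending replication
// buffer in one event-loop pass; the remainder waits for the next pass so a
// multi-GB backlog cannot monopolize the loop. Keeping the cap well below
// 2 GiB also keeps each individual write under the 32-bit length limit that
// tripped up SSL_write above.
template <typename WriteFn>
size_t writeCapped(WriteFn writeFn, const char *buf, size_t pending,
                   size_t maxWritesPerEvent) {
    size_t written = 0;
    while (written < pending && written < maxWritesPerEvent) {
        size_t chunk = std::min(pending - written, maxWritesPerEvent - written);
        ssize_t n = writeFn(buf + written, chunk);   // e.g. a socket/TLS write
        if (n <= 0) break;                           // would-block or error
        written += static_cast<size_t>(n);
    }
    return written;   // caller resumes from this offset on the next event
}
```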
--- src/config.cpp | 1 + src/networking.cpp | 16 ++++++++++------ src/server.h | 1 + src/tls.cpp | 6 ++++++ 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 29a750835..8a6a8e3ad 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -3158,6 +3158,7 @@ standardConfig configs[] = { createLongLongConfig("repl-backlog-disk-reserve", NULL, IMMUTABLE_CONFIG, 0, LLONG_MAX, cserver.repl_backlog_disk_size, 0, MEMORY_CONFIG, NULL, NULL), createLongLongConfig("max-snapshot-slip", NULL, MODIFIABLE_CONFIG, 0, 5000, g_pserver->snapshot_slip, 400, 0, NULL, NULL), createLongLongConfig("max-rand-count", NULL, MODIFIABLE_CONFIG, 0, LONG_MAX/2, g_pserver->rand_total_threshold, LONG_MAX/2, 0, NULL, NULL), + createLongLongConfig("repl-backlog-max-writes-per-event", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->repl_backlog_max_writes_per_event, 8ll*1024*1024, MEMORY_CONFIG, NULL, NULL), /* Unsigned Long Long configs */ createULongLongConfig("maxmemory", NULL, MODIFIABLE_CONFIG, 0, LLONG_MAX, g_pserver->maxmemory, 0, MEMORY_CONFIG, NULL, updateMaxmemory), diff --git a/src/networking.cpp b/src/networking.cpp index be0a5efc2..89d0fca81 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1871,21 +1871,23 @@ int writeToClient(client *c, int handler_installed) { while (clientHasPendingReplies(c)) { long long repl_end_idx = getReplIndexFromOffset(c->repl_end_off); + long long repl_max_writes = g_pserver->repl_backlog_max_writes_per_event > 0 ? g_pserver->repl_backlog_max_writes_per_event : LLONG_MAX; + serverAssert(c->repl_curr_off != -1); - if (c->repl_curr_off != c->repl_end_off){ + if (c->repl_curr_off != c->repl_end_off) { long long repl_curr_idx = getReplIndexFromOffset(c->repl_curr_off); long long nwritten2ndStage = 0; /* How much was written from the start of the replication backlog * in the event of a wrap around write */ /* normal case with no wrap around */ - if (repl_end_idx >= repl_curr_idx){ - nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, repl_end_idx - repl_curr_idx); + if (repl_end_idx >= repl_curr_idx) { + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, std::min(repl_max_writes, repl_end_idx - repl_curr_idx)); /* wrap around case */ } else { - nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, g_pserver->repl_backlog_size - repl_curr_idx); + nwritten = connWrite(c->conn, g_pserver->repl_backlog + repl_curr_idx, std::min(repl_max_writes, g_pserver->repl_backlog_size - repl_curr_idx)); /* only attempt wrapping if we write the correct number of bytes */ if (nwritten == g_pserver->repl_backlog_size - repl_curr_idx){ - nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, repl_end_idx); + nwritten2ndStage = connWrite(c->conn, g_pserver->repl_backlog, std::min(repl_max_writes - nwritten, repl_end_idx)); if (nwritten2ndStage != -1) nwritten += nwritten2ndStage; } @@ -1902,6 +1904,8 @@ int writeToClient(client *c, int handler_installed) { if (nwritten2ndStage == -1) nwritten = -1; if (nwritten == -1) break; + + if (totwritten > g_pserver->repl_backlog_max_writes_per_event) break; } else { break; } @@ -1972,7 +1976,7 @@ int writeToClient(client *c, int handler_installed) { if (nwritten == -1) { if (connGetState(c->conn) != CONN_STATE_CONNECTED) { serverLog(LL_VERBOSE, - "Error writing to client: %s", connGetLastError(c->conn)); + "Error writing to client: %s (totwritten: %zd)", connGetLastError(c->conn), totwritten); freeClientAsync(c); return C_ERR; diff --git 
a/src/server.h b/src/server.h index e4954a135..ce2ec3738 100644 --- a/src/server.h +++ b/src/server.h @@ -2564,6 +2564,7 @@ struct redisServer { int repl_diskless_sync_delay; /* Delay to start a diskless repl BGSAVE. */ std::atomic repl_lowest_off; /* The lowest offset amongst all replicas -1 if there are no replicas */ + long long repl_backlog_max_writes_per_event; /* Caps the output buffer sent to the replica at once */ /* Replication (replica) */ list *masters; int enable_multimaster; diff --git a/src/tls.cpp b/src/tls.cpp index f1d6baa9f..fb2ffa82d 100644 --- a/src/tls.cpp +++ b/src/tls.cpp @@ -1071,6 +1071,12 @@ static int connTLSWrite(connection *conn_, const void *data, size_t data_len) { if (conn->c.state != CONN_STATE_CONNECTED) return -1; ERR_clear_error(); + + if (data_len > std::numeric_limits::max()) { + // OpenSSL expects length to be 32-bit int + data_len = std::numeric_limits::max(); + } + ret = SSL_write(conn->ssl, data, data_len); if (ret <= 0) { From 752f2044549e6e5d06810a31411a7f840304354a Mon Sep 17 00:00:00 2001 From: Alex Cope Date: Thu, 22 Feb 2024 11:55:58 -0800 Subject: [PATCH 41/68] Fix for hit rate metric (#257) Tweak hit rate metric to get delta instead of aggregate --- src/modules/keydb_modstatsd/modmain.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/modules/keydb_modstatsd/modmain.cpp b/src/modules/keydb_modstatsd/modmain.cpp index 4e015aefc..203dcdcdf 100644 --- a/src/modules/keydb_modstatsd/modmain.cpp +++ b/src/modules/keydb_modstatsd/modmain.cpp @@ -92,6 +92,7 @@ struct StatsRecord { /* Dynamic Values */ long long prevVal = 0; + long long currVal = 0; }; std::unordered_map g_mapInfoFields = { @@ -296,9 +297,10 @@ void handleStatItem(struct RedisModuleCtx *ctx, std::string name, StatsRecord &r switch (record.type) { case StatsD_Type::STATSD_GAUGE_LONGLONG: { long long val = strtoll(pchValue, nullptr, 10); + record.prevVal = record.currVal; + record.currVal = val; g_stats->gauge(name, val, record.prefixOnly); RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_DEBUG, "Emitting metric \"%s\": %lld", name.c_str(), val); - record.prevVal = val; break; } @@ -318,9 +320,10 @@ void handleStatItem(struct RedisModuleCtx *ctx, std::string name, StatsRecord &r case StatsD_Type::STATSD_DELTA: { long long val = strtoll(pchValue, nullptr, 10); + record.prevVal = record.currVal; + record.currVal = val; g_stats->count(name, val - record.prevVal, record.prefixOnly); RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_DEBUG, "Emitting metric count for \"%s\": %lld", name.c_str() , val - record.prevVal); - record.prevVal = val; break; } @@ -436,12 +439,12 @@ void handle_info_response(struct RedisModuleCtx *ctx, const char *szReply, size_ auto hit = g_mapInfoFields.find("keyspace_hits"); auto miss = g_mapInfoFields.find("keyspace_misses"); if (hit != g_mapInfoFields.end() && miss != g_mapInfoFields.end()) { - long long hitVal = hit->second.prevVal; - long long missVal = miss->second.prevVal; - long long total = hitVal + missVal; + long long hitValDelta = hit->second.currVal - hit->second.prevVal; + long long missValDelta = miss->second.currVal - miss->second.prevVal; + long long total = hitValDelta + missValDelta; if (total > 0) { - double hitRate = (double)hitVal / total * 1000000; - g_stats->gauge("keyspace_hit_rate_per_million", hitRate); + double hitRate = (double)hitValDelta / total * 1000000; + g_stats->gauge("keyspace_hit_rate_per_million", hitRate, false /* prefixOnly */); RedisModule_Log(ctx, REDISMODULE_LOGLEVEL_DEBUG, "Emitting metric 
\"keyspace_hit_rate_per_million\": %f", hitRate); } } From c08683e920f9ec35bd4e8d707bb58b917f633890 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 1 Mar 2024 22:18:56 +0000 Subject: [PATCH 42/68] Return set to vector for clients to be processed ensuring we run them in order --- src/AsyncWorkQueue.cpp | 2 +- src/blocked.cpp | 2 +- src/db.cpp | 10 +++++++--- src/networking.cpp | 12 ++++++------ src/server.cpp | 2 +- src/server.h | 2 +- 6 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/AsyncWorkQueue.cpp b/src/AsyncWorkQueue.cpp index 77f5e9ea4..8d04b742a 100644 --- a/src/AsyncWorkQueue.cpp +++ b/src/AsyncWorkQueue.cpp @@ -56,7 +56,7 @@ void AsyncWorkQueue::WorkerThreadMain() listRelease(vars.clients_pending_asyncwrite); std::unique_lock lockf(serverTL->lockPendingWrite); - serverTL->setclientsProcess.clear(); + serverTL->vecclientsProcess.clear(); serverTL->clients_pending_write.clear(); std::atomic_thread_fence(std::memory_order_seq_cst); } diff --git a/src/blocked.cpp b/src/blocked.cpp index 366e284dd..89f219bf0 100644 --- a/src/blocked.cpp +++ b/src/blocked.cpp @@ -209,7 +209,7 @@ void unblockClient(client *c) { listDelNode(g_pserver->paused_clients,c->paused_list_node); c->paused_list_node = NULL; } else if (c->btype == BLOCKED_STORAGE) { - serverTL->setclientsProcess.insert(c); + serverTL->vecclientsProcess.push_back(c); } else { serverPanic("Unknown btype in unblockClient()."); } diff --git a/src/db.cpp b/src/db.cpp index 6e4a1f36e..8a8a465a4 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3142,8 +3142,11 @@ void redisDbPersistentData::trackChanges(bool fBulk, size_t sizeHint) if (fBulk) m_fAllChanged.fetch_add(1, std::memory_order_acq_rel); - if (sizeHint > 0 && aeThreadOwnsLock()) + if (sizeHint > 0) { + aeAcquireLock(); dictExpand(m_dictChanged, sizeHint, false); + aeReleaseLock(); + } } void redisDbPersistentData::removeAllCachedValues() @@ -3362,8 +3365,9 @@ void redisDbPersistentData::prefetchKeysFlash(std::unordered_set &setc) auto *tok = m_spstorage->begin_retrieve(serverTL->el, storageLoadCallback, veckeys.data(), veckeys.size()); if (tok != nullptr) { for (client *c : setcBlocked) { - if (!(c->flags & CLIENT_BLOCKED)) + if (!(c->flags & CLIENT_BLOCKED)) { blockClient(c, BLOCKED_STORAGE); + } } tok->setc = std::move(setcBlocked); tok->type = StorageToken::TokenType::SingleRead; @@ -3429,6 +3433,6 @@ void redisDbPersistentData::processStorageToken(StorageToken *tok) { if (c->flags & CLIENT_BLOCKED) unblockClient(c); else - serverTL->setclientsProcess.insert(c); + serverTL->vecclientsProcess.push_back(c); } } \ No newline at end of file diff --git a/src/networking.cpp b/src/networking.cpp index be3901644..8ce1437a2 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -1626,7 +1626,7 @@ void unlinkClient(client *c) { c->fPendingAsyncWrite = FALSE; } - serverTL->setclientsProcess.erase(c); + serverTL->vecclientsProcess.erase(std::remove(serverTL->vecclientsProcess.begin(), serverTL->vecclientsProcess.end(), c), serverTL->vecclientsProcess.end()); serverTL->setclientsPrefetch.erase(c); /* Clear the tracking status. 
*/ @@ -2822,7 +2822,7 @@ void readQueryFromClient(connection *conn) { } c->vecqueuedcmd.clear(); } else { - serverTL->setclientsProcess.insert(c); + serverTL->vecclientsProcess.push_back(c); } } } @@ -2839,10 +2839,10 @@ void processClients() { serverAssert(GlobalLocksAcquired()); - // Note that this function is reentrant and vecclients may be modified by code called from processInputBuffer - while (!serverTL->setclientsProcess.empty()) { - client *c = *serverTL->setclientsProcess.begin(); - serverTL->setclientsProcess.erase(serverTL->setclientsProcess.begin()); + // Note that this function is reentrant and vecclientsProcess may be modified by code called from processInputBuffer + while (!serverTL->vecclientsProcess.empty()) { + client *c = *serverTL->vecclientsProcess.begin(); + serverTL->vecclientsProcess.erase(serverTL->vecclientsProcess.begin()); /* There is more data in the client input buffer, continue parsing it * in case to check if there is a full command to execute. */ diff --git a/src/server.cpp b/src/server.cpp index bfe3bf7ea..4a42df649 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -2847,7 +2847,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { g_pserver->db[0]->prefetchKeysFlash(serverTL->setclientsPrefetch); for (client *c : serverTL->setclientsPrefetch) { if (!(c->flags & CLIENT_BLOCKED)) - serverTL->setclientsProcess.insert(c); + serverTL->vecclientsProcess.push_back(c); } serverTL->setclientsPrefetch.clear(); } diff --git a/src/server.h b/src/server.h index 42e321659..00ed185de 100644 --- a/src/server.h +++ b/src/server.h @@ -2230,7 +2230,7 @@ struct redisServerThreadVars { int propagate_in_transaction = 0; /* Make sure we don't propagate nested MULTI/EXEC */ int client_pause_in_transaction = 0; /* Was a client pause executed during this Exec? 
*/ - std::unordered_set setclientsProcess; + std::vector vecclientsProcess; std::unordered_set setclientsPrefetch; std::unordered_set setStorageTokensProcess; dictAsyncRehashCtl *rehashCtl = nullptr; From 260f47bbb5b30beebba2f9dc1fff520cabd94e6f Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 1 Mar 2024 22:19:25 +0000 Subject: [PATCH 43/68] Do not create threadpools if we won't need them --- src/server.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/server.cpp b/src/server.cpp index 4a42df649..a14e4a9ce 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -4159,10 +4159,15 @@ void InitServerLast() { g_pserver->asyncworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(cserver.cthreads); - g_pserver->asyncreadworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(cserver.cthreads*10); + if (g_pserver->m_pstorageFactory != nullptr) { + g_pserver->asyncreadworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(cserver.cthreads); - //Process one write/commit at a time to ensure consistency - g_pserver->asyncwriteworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(1); + //Process one write/commit at a time to ensure consistency + g_pserver->asyncwriteworkqueue = new (MALLOC_LOCAL) AsyncWorkQueue(1); + } else { + g_pserver->asyncreadworkqueue = nullptr; + g_pserver->asyncwriteworkqueue = nullptr; + } // Allocate the repl backlog From e5f8e77a52a15f766491607039c49fc3fbe8dfca Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 1 Mar 2024 23:40:43 +0000 Subject: [PATCH 44/68] Fix test failure due to slowness with many DBs --- tests/unit/moduleapi/load.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/moduleapi/load.tcl b/tests/unit/moduleapi/load.tcl index 12ca22402..0c9508b3e 100644 --- a/tests/unit/moduleapi/load.tcl +++ b/tests/unit/moduleapi/load.tcl @@ -1,7 +1,7 @@ set testmodule [file normalize tests/modules/load.so] if {$::flash_enabled} { - start_server [list tags [list "modules"] overrides [list storage-provider {flash ./rocks.db.master.load.test} databases 256 loadmodule $testmodule]] { + start_server [list tags [list "modules"] overrides [list storage-provider {flash ./rocks.db.master.load.test} loadmodule $testmodule]] { test "Module is notified of keys loaded from flash" { r flushall r set foo bar From e7ab2752769ca135ca7b40a38ac2f557f7da44be Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 4 Mar 2024 20:50:22 +0000 Subject: [PATCH 45/68] Fix linker issues with newer linkers --- src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile b/src/Makefile index a3cd741f4..cfe19a493 100644 --- a/src/Makefile +++ b/src/Makefile @@ -73,10 +73,10 @@ ifneq ($(strip $(SANITIZE)),) endif ifeq ($(ENABLE_FLASH),yes) + FINAL_LIBS+= ../deps/rocksdb/librocksdb.a FINAL_LIBS+= -lz -lcrypto -lbz2 -lzstd -llz4 -lsnappy CXXFLAGS+= -I../deps/rocksdb/include/ -DENABLE_ROCKSDB STORAGE_OBJ+= storage/rocksdb.o storage/rocksdbfactory.o - FINAL_LIBS+= ../deps/rocksdb/librocksdb.a DEPENDENCY_TARGETS+= rocksdb endif From 8f2dbc3b5391c53c5b996494b489a0cf07a06549 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 5 Mar 2024 02:49:05 +0000 Subject: [PATCH 46/68] Fix crash --- src/db.cpp | 4 +++- src/server.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 8a8a465a4..eefa5462b 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3325,7 +3325,7 @@ void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command } } -void redisDbPersistentData::prefetchKeysFlash(std::unordered_set 
&setc) +void redisDbPersistentData::prefetchKeysFlash(const std::unordered_set &setc) { serverAssert(GlobalLocksAcquired()); std::vector veckeys; @@ -3334,6 +3334,8 @@ void redisDbPersistentData::prefetchKeysFlash(std::unordered_set &setc) for (client *c : setc) { for (auto &command : c->vecqueuedcmd) { getKeysResult result = GETKEYS_RESULT_INIT; + if (command.argc == 0) // parse can do this it will be handled by processClient + break; auto cmd = lookupCommand(szFromObj(command.argv[0])); if (cmd == nullptr) break; // Bad command? It's not for us to judge, just bail diff --git a/src/server.h b/src/server.h index 00ed185de..3826385a6 100644 --- a/src/server.h +++ b/src/server.h @@ -1230,7 +1230,7 @@ class redisDbPersistentData bool keycacheIsEnabled(); void prefetchKeysAsync(client *c, struct parsed_command &command); - void prefetchKeysFlash(std::unordered_set &setc); + void prefetchKeysFlash(const std::unordered_set &setc); void processStorageToken(StorageToken *tok); bool FSnapshot() const { return m_spdbSnapshotHOLDER != nullptr; } From c5c4779ca4e6bdfc9f7419192eb43aa71be69984 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 5 Mar 2024 02:54:15 +0000 Subject: [PATCH 47/68] Fix hung clients --- src/db.cpp | 6 +++--- src/networking.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index eefa5462b..edf9a0746 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3364,7 +3364,7 @@ void redisDbPersistentData::prefetchKeysFlash(const std::unordered_set if (veckeys.empty()) return; - auto *tok = m_spstorage->begin_retrieve(serverTL->el, storageLoadCallback, veckeys.data(), veckeys.size()); + StorageToken *tok = m_spstorage->begin_retrieve(serverTL->el, storageLoadCallback, veckeys.data(), veckeys.size()); if (tok != nullptr) { for (client *c : setcBlocked) { if (!(c->flags & CLIENT_BLOCKED)) { @@ -3434,7 +3434,7 @@ void redisDbPersistentData::processStorageToken(StorageToken *tok) { std::unique_lock ul(c->lock); if (c->flags & CLIENT_BLOCKED) unblockClient(c); - else - serverTL->vecclientsProcess.push_back(c); + + serverTL->vecclientsProcess.push_back(c); } } \ No newline at end of file diff --git a/src/networking.cpp b/src/networking.cpp index 8ce1437a2..34f13909c 100644 --- a/src/networking.cpp +++ b/src/networking.cpp @@ -2813,7 +2813,7 @@ void readQueryFromClient(connection *conn) { } } if (!c->vecqueuedcmd.empty()) { - if (g_pserver->m_pstorageFactory != nullptr && g_pserver->prefetch_enabled) { + if (g_pserver->m_pstorageFactory != nullptr && g_pserver->prefetch_enabled && !(c->flags & CLIENT_BLOCKED)) { serverTL->setclientsPrefetch.insert(c); } else { if (g_pserver->is_overloaded && !(c->flags & (CLIENT_MASTER | CLIENT_SLAVE | CLIENT_PENDING_WRITE | CLIENT_PUBSUB | CLIENT_BLOCKED | CLIENT_IGNORE_OVERLOAD)) && ((random() % 100) < g_pserver->overload_protect_strength)) { From 8c4fa5d9911e0ded19ab111655a8d7ad0c8355b1 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 5 Mar 2024 03:03:08 +0000 Subject: [PATCH 48/68] Fix typo in test path --- tests/integration/replication-psync-flash.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/replication-psync-flash.tcl b/tests/integration/replication-psync-flash.tcl index 84e877c6b..e57ccea0e 100644 --- a/tests/integration/replication-psync-flash.tcl +++ b/tests/integration/replication-psync-flash.tcl @@ -10,7 +10,7 @@ # checked for consistency. 
if {$::flash_enabled} { proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reconnect} { - start_server [list tags {"repl"} overrides [list storage-provider {flash .rocks.db.m} repl-backlog-size 64m]] { + start_server [list tags {"repl"} overrides [list storage-provider {flash ./rocks.db.m} repl-backlog-size 64m]] { start_server [list tags {flash} overrides [list storage-provider {flash ./rocks.db} delete-on-evict no storage-flush-period 10]] { set master [srv -1 client] From 7a488a83e9fdcd22eb50ed3b2898fa66432ed2a8 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 5 Mar 2024 03:04:10 +0000 Subject: [PATCH 49/68] Serialize writes due to races causing test failures --- src/db.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/db.cpp b/src/db.cpp index edf9a0746..f49482374 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3044,12 +3044,15 @@ void redisDbPersistentData::commitChanges(const redisDbPersistentDataSnapshot ** } if (m_spstorage != nullptr) { + m_spstorage->endWriteBatch(); +#if 0 auto *tok = m_spstorage->begin_endWriteBatch(serverTL->el, storageLoadCallback); if (tok != nullptr) { tok->db = this; tok->type = StorageToken::TokenType::BatchWrite; } +#endif } } From 8a2016aaea0f579cc5dcea9c909bc8ca1ec16926 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 5 Mar 2024 03:22:24 +0000 Subject: [PATCH 50/68] Enable key cache for async reads --- src/StorageCache.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/StorageCache.cpp b/src/StorageCache.cpp index 0feca8e9e..ba307fc68 100644 --- a/src/StorageCache.cpp +++ b/src/StorageCache.cpp @@ -209,18 +209,22 @@ void StorageCache::retrieve(sds key, IStorage::callbackSingle fn) const } StorageToken *StorageCache::begin_retrieve(struct aeEventLoop *el, aePostFunctionTokenProc proc, sds *rgkey, size_t ckey) { -#if 0 std::unique_lock ul(m_lock); if (m_pdict != nullptr) { - uint64_t hash = dictSdsHash(key); - dictEntry *de = dictFind(m_pdict, reinterpret_cast(hash)); - - if (de == nullptr) - return nullptr; // Not found + bool fAnyKey = false; + for (size_t ik = 0; ik < ckey; ++ik) { + uint64_t hash = dictSdsHash(rgkey[ik]); + dictEntry *de = dictFind(m_pdict, reinterpret_cast(hash)); + + if (de != nullptr) + fAnyKey = true; + } + if (!fAnyKey) + return nullptr; // All keys are missing - skip the io } ul.unlock(); -#endif + return m_spstorage->begin_retrieve(el, proc, rgkey, ckey); } From bff8b73d0da3a996b2611bd8a1049d99cee3cbe6 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 5 Mar 2024 03:42:34 +0000 Subject: [PATCH 51/68] Fix ARM CI --- src/db.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/db.cpp b/src/db.cpp index f49482374..5a87a4230 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3323,6 +3323,9 @@ void redisDbPersistentData::prefetchKeysAsync(client *c, parsed_command &command } } } +#else + UNUSED(c); + UNUSED(command); #endif return; } From 6c15c438c62420a17df5bd9b012123d4b3b3328c Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 5 Mar 2024 19:19:53 +0000 Subject: [PATCH 52/68] Build with flash --- build.yaml | 2 +- machamp_scripts/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build.yaml b/build.yaml index 28b578956..16120e014 100644 --- a/build.yaml +++ b/build.yaml @@ -59,7 +59,7 @@ machamp: # to ensure a clearer docker build env code-checkout: type: cmd - command: echo checkout + command: echo checkout && git submodule init && git submodule update # default machamp builder image does not work for multi arch 
builder_image: us.gcr.io/snapchat-build-artifacts/prod/snapchat/img/ubuntu/ubuntu-23-04@sha256:bd43177a80e6ce1c3583e8ea959b88a9081c0f56b765ec9c5a157c27a637c23b docker: diff --git a/machamp_scripts/Dockerfile b/machamp_scripts/Dockerfile index d15a62df9..66627ab34 100644 --- a/machamp_scripts/Dockerfile +++ b/machamp_scripts/Dockerfile @@ -24,7 +24,7 @@ RUN set -eux; \ gosu nobody true # build KeyDB ARG MAKE_JOBS="" -ARG ENABLE_FLASH="" +ARG ENABLE_FLASH="yes" COPY . /tmp/keydb-internal RUN set -eux; \ cd /tmp/keydb-internal; \ From fe5503a64019f6b39acbcc607bd9dec5c8edf41a Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 7 Mar 2024 20:27:29 +0000 Subject: [PATCH 53/68] Emit maxstorage metric --- src/modules/keydb_modstatsd/modmain.cpp | 1 + src/server.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/modules/keydb_modstatsd/modmain.cpp b/src/modules/keydb_modstatsd/modmain.cpp index 203dcdcdf..f6a5c0ee2 100644 --- a/src/modules/keydb_modstatsd/modmain.cpp +++ b/src/modules/keydb_modstatsd/modmain.cpp @@ -100,6 +100,7 @@ std::unordered_map g_mapInfoFields = { { "used_memory", { StatsD_Type::STATSD_GAUGE_BYTES, false /* prefixOnly */}}, { "used_memory_rss", { StatsD_Type::STATSD_GAUGE_BYTES }}, { "maxmemory", { StatsD_Type::STATSD_GAUGE_BYTES, false /* prefixOnly */}}, + { "maxstorage", { StatsD_Type::STATSD_GAUGE_BYTES, false /* prefixOnly */}}, { "used_memory_dataset_perc", { StatsD_Type::STATSD_GAUGE_FLOAT }}, { "avg_lock_contention", { StatsD_Type::STATSD_GAUGE_LONGLONG }}, { "repl_backlog_size", { StatsD_Type::STATSD_GAUGE_BYTES }}, diff --git a/src/server.cpp b/src/server.cpp index a14e4a9ce..e7ef53d23 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -5826,7 +5826,8 @@ sds genRedisInfoString(const char *section) { "lazyfree_pending_objects:%zu\r\n" "lazyfreed_objects:%zu\r\n" "storage_provider:%s\r\n" - "available_system_memory:%s\r\n", + "available_system_memory:%s\r\n" + "maxstorage:%llu\r\n", zmalloc_used, hmem, g_pserver->cron_malloc_stats.process_rss, @@ -5872,7 +5873,8 @@ sds genRedisInfoString(const char *section) { lazyfreeGetPendingObjectsCount(), lazyfreeGetFreedObjectsCount(), g_pserver->m_pstorageFactory ? 
g_pserver->m_pstorageFactory->name() : "none", - available_system_mem + available_system_mem, + g_pserver->maxstorage ); freeMemoryOverheadData(mh); } From c83c55f1814cd3714d952d981329bfa805c02fa1 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 8 Mar 2024 08:12:22 +0000 Subject: [PATCH 54/68] Fix expire tracking assert failure --- src/db.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/db.cpp b/src/db.cpp index 5a87a4230..bf5b40d68 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2884,6 +2884,7 @@ void redisDbPersistentData::ensure(const char *sdsKey, dictEntry **pde) o->SetFExpires(spexpire != nullptr); if (spexpire != nullptr) { o->expire = std::move(*spexpire); + ++m_numexpires; } g_pserver->stat_storage_provider_read_hits++; } else { @@ -3124,8 +3125,15 @@ bool redisDbPersistentData::removeCachedValue(const char *key, dictEntry **ppde) // since we write ASAP the database already has a valid copy so safe to delete if (ppde != nullptr) { + robj *o = (robj*)dictGetVal(*ppde); + if (o->FExpires()) + --m_numexpires; *ppde = dictUnlink(m_pdict, key); } else { + dictEntry *deT = dictFind(m_pdict, key); + robj *o = (robj*)dictGetVal(deT); + if (o->FExpires()) + --m_numexpires; dictDelete(m_pdict, key); } @@ -3172,6 +3180,7 @@ void redisDbPersistentData::removeAllCachedValues() } else { dictEmpty(m_pdict, nullptr); } + m_numexpires = 0; } void redisDbPersistentData::disableKeyCache() From ad2a2824b53c61989bbd60bb4ad73cd5a1167c66 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 8 Mar 2024 08:43:09 +0000 Subject: [PATCH 55/68] Disable swapdb with flash since it doesn't work --- src/server.cpp | 5 +++++ tests/cluster/run.tcl | 10 ++++++++++ tests/cluster/tests/17-diskless-load-swapdb.tcl | 2 ++ 3 files changed, 17 insertions(+) diff --git a/src/server.cpp b/src/server.cpp index e7ef53d23..10cb25f2f 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -7436,6 +7436,11 @@ static void validateConfiguration() exit(EXIT_FAILURE); } + if (g_pserver->repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB && g_pserver->m_pstorageFactory) { + serverLog(LL_WARNING, "SWAPDB is not implemented when using a storage provider."); + exit(EXIT_FAILURE); + } + g_pserver->repl_backlog_size = g_pserver->repl_backlog_config_size; // this is normally set in the update logic, but not on initial config } diff --git a/tests/cluster/run.tcl b/tests/cluster/run.tcl index 7e1e91081..883313a0b 100644 --- a/tests/cluster/run.tcl +++ b/tests/cluster/run.tcl @@ -10,6 +10,16 @@ source ../../support/cluster.tcl ; # Redis Cluster client. set ::instances_count 20 ; # How many instances we use at max. set ::tlsdir "../../tls" +# Check if we compiled with flash +set status [catch {exec ../../../src/keydb-server --is-flash-enabled}] +if {$status == 0} { + puts "KeyDB was built with FLASH, including FLASH tests" + set ::flash_enabled 1 +} else { + puts "KeyDB was not built with FLASH. 
Excluding FLASH tests" + set ::flash_enabled 0 +} + proc main {} { parse_options spawn_instance redis $::redis_base_port $::instances_count { diff --git a/tests/cluster/tests/17-diskless-load-swapdb.tcl b/tests/cluster/tests/17-diskless-load-swapdb.tcl index 1b19a222b..8cce011d8 100644 --- a/tests/cluster/tests/17-diskless-load-swapdb.tcl +++ b/tests/cluster/tests/17-diskless-load-swapdb.tcl @@ -14,6 +14,7 @@ test "Cluster is writable" { cluster_write_test 0 } +if {!$::flash_enabled} { test "Right to restore backups when fail to diskless load " { set master [Rn 0] set replica [Rn 1] @@ -84,3 +85,4 @@ test "Right to restore backups when fail to diskless load " { assert_equal {1} [$replica get $slot0_key] assert_equal $slot0_key [$replica CLUSTER GETKEYSINSLOT 0 1] "POST RUN" } +} \ No newline at end of file From 088a1dbf825e3bced4500ce7f8af218b3ed4063b Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 8 Mar 2024 17:56:57 +0000 Subject: [PATCH 56/68] Ensure cluster tests run in flash mode in CI --- build.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/build.yaml b/build.yaml index 16120e014..8d60fbd21 100644 --- a/build.yaml +++ b/build.yaml @@ -28,6 +28,12 @@ machamp: # https://github.sc-corp.net/Snapchat/img/tree/master/keydb/ubuntu-20-04 builder_image: us.gcr.io/snapchat-build-artifacts/prod/snapchat/img/keydb/keydb-ubuntu-20-04@sha256:cf869a3f5d1de1e1d976bb906689c37b7031938eb68661b844a38c532f27248c command: ./runtest-cluster --tls + cluster-test-flash: + type: cmd + parent: make-build + # https://github.sc-corp.net/Snapchat/img/tree/master/keydb/ubuntu-20-04 + builder_image: us.gcr.io/snapchat-build-artifacts/prod/snapchat/img/keydb/keydb-ubuntu-20-04@sha256:cf869a3f5d1de1e1d976bb906689c37b7031938eb68661b844a38c532f27248c + command: ./runtest-cluster --tls --flash sentinel-test: type: cmd parent: make-build From d84fa525b59606af7217cd3675ac2af7d486cae4 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 8 Mar 2024 20:21:30 +0000 Subject: [PATCH 57/68] Fix crash in FLASH eviction --- src/db.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/db.cpp b/src/db.cpp index bf5b40d68..f32e8084c 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3125,10 +3125,10 @@ bool redisDbPersistentData::removeCachedValue(const char *key, dictEntry **ppde) // since we write ASAP the database already has a valid copy so safe to delete if (ppde != nullptr) { + *ppde = dictUnlink(m_pdict, key); robj *o = (robj*)dictGetVal(*ppde); if (o->FExpires()) --m_numexpires; - *ppde = dictUnlink(m_pdict, key); } else { dictEntry *deT = dictFind(m_pdict, key); robj *o = (robj*)dictGetVal(deT); From 559588b995e70ca0c0ec125fce2f87af5c0f1c5a Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 8 Mar 2024 22:27:55 +0000 Subject: [PATCH 58/68] Disable swapdb tests with flash --- src/config.cpp | 10 +++++++++- tests/integration/replication-psync-flash.tcl | 4 ++++ tests/integration/replication-psync-multimaster.tcl | 4 ++++ tests/integration/replication-psync.tcl | 4 ++++ tests/integration/replication.tcl | 9 ++++++++- 5 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/config.cpp b/src/config.cpp index 50c08d9ed..749fb5746 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -2724,6 +2724,14 @@ static int isValidProcTitleTemplate(char *val, const char **err) { return 1; } +static int isValidDisklessEnum(int e, const char **err) { + if (e == REPL_DISKLESS_LOAD_SWAPDB && g_pserver->m_pstorageFactory != nullptr) { + *err = "cannot use swapdb with a storage provider"; + return 0; + } + 
return 1; +} + static int updateProcTitleTemplate(char *val, char *prev, const char **err) { UNUSED(val); UNUSED(prev); @@ -3091,7 +3099,7 @@ standardConfig configs[] = { /* Enum Configs */ createEnumConfig("supervised", NULL, IMMUTABLE_CONFIG, supervised_mode_enum, cserver.supervised_mode, SUPERVISED_NONE, NULL, NULL), createEnumConfig("syslog-facility", NULL, IMMUTABLE_CONFIG, syslog_facility_enum, g_pserver->syslog_facility, LOG_LOCAL0, NULL, NULL), - createEnumConfig("repl-diskless-load", NULL, MODIFIABLE_CONFIG, repl_diskless_load_enum, g_pserver->repl_diskless_load, REPL_DISKLESS_LOAD_DISABLED, NULL, NULL), + createEnumConfig("repl-diskless-load", NULL, MODIFIABLE_CONFIG, repl_diskless_load_enum, g_pserver->repl_diskless_load, REPL_DISKLESS_LOAD_DISABLED, isValidDisklessEnum, NULL), createEnumConfig("loglevel", NULL, MODIFIABLE_CONFIG, loglevel_enum, cserver.verbosity, LL_NOTICE, NULL, NULL), createEnumConfig("maxmemory-policy", NULL, MODIFIABLE_CONFIG, maxmemory_policy_enum, g_pserver->maxmemory_policy, MAXMEMORY_NO_EVICTION, NULL, NULL), createEnumConfig("appendfsync", NULL, MODIFIABLE_CONFIG, aof_fsync_enum, g_pserver->aof_fsync, AOF_FSYNC_EVERYSEC, NULL, NULL), diff --git a/tests/integration/replication-psync-flash.tcl b/tests/integration/replication-psync-flash.tcl index e57ccea0e..353c02614 100644 --- a/tests/integration/replication-psync-flash.tcl +++ b/tests/integration/replication-psync-flash.tcl @@ -114,6 +114,10 @@ if {$::flash_enabled} { foreach mdl {no yes} { foreach sdl {disabled swapdb} { + if {$::flash_enabled && $sdl == "swapdb"} { + continue + } + test_psync {no reconnection, just sync} 6 1000000 3600 0 { } $mdl $sdl 0 diff --git a/tests/integration/replication-psync-multimaster.tcl b/tests/integration/replication-psync-multimaster.tcl index ada3f8cec..a905dfae4 100644 --- a/tests/integration/replication-psync-multimaster.tcl +++ b/tests/integration/replication-psync-multimaster.tcl @@ -128,6 +128,10 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reco foreach mdl {no yes} { foreach sdl {disabled swapdb} { + if {$::flash_enabled && $sdl == "swapdb"} { + continue + } + test_psync {no reconnection, just sync} 6 1000000 3600 0 { } $mdl $sdl 0 diff --git a/tests/integration/replication-psync.tcl b/tests/integration/replication-psync.tcl index 08e21d310..4d108aa30 100644 --- a/tests/integration/replication-psync.tcl +++ b/tests/integration/replication-psync.tcl @@ -119,6 +119,10 @@ proc test_psync {descr duration backlog_size backlog_ttl delay cond mdl sdl reco foreach mdl {no yes} { foreach sdl {disabled swapdb} { + if {$::flash_enabled && $sdl == "swapdb"} { + continue + } + test_psync {no reconnection, just sync} 6 1000000 3600 0 { } $mdl $sdl 0 diff --git a/tests/integration/replication.tcl b/tests/integration/replication.tcl index fa84ac9d5..d80c25884 100644 --- a/tests/integration/replication.tcl +++ b/tests/integration/replication.tcl @@ -252,6 +252,10 @@ start_server {tags {"repl"}} { foreach mdl {no yes} { foreach sdl {disabled swapdb} { + if {$::flash_enabled && $sdl == "swapdb"} { + # swapdb not compatible with flash + continue + } start_server {tags {"repl"}} { set master [srv 0 client] $master config set repl-diskless-sync $mdl @@ -383,6 +387,7 @@ start_server {tags {"repl"}} { } } +if {!$::flash_enabled} { test {slave fails full sync and diskless load swapdb recovers it} { start_server {tags {"repl"}} { set slave [srv 0 client] @@ -547,6 +552,7 @@ test {diskless loading short read} { } } } +} # get current stime and utime 
metrics for a thread (since it's creation) proc get_cpu_metrics { statfile } { @@ -578,7 +584,7 @@ proc compute_cpu_usage {start end} { return [ list $pucpu $pscpu ] } - +if {!$::flash_enabled} { # test diskless rdb pipe with multiple replicas, which may drop half way start_server {tags {"repl"}} { set master [srv 0 client] @@ -814,6 +820,7 @@ test "diskless replication read pipe cleanup" { } } } +} test {replicaof right after disconnection} { # this is a rare race condition that was reproduced sporadically by the psync2 unit. From 5fbe9915c308c4a5f48a6548c363d537800c677c Mon Sep 17 00:00:00 2001 From: John Sully Date: Mon, 18 Mar 2024 21:50:51 +0000 Subject: [PATCH 59/68] Fixup ifdef for easier switching --- src/db.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/db.cpp b/src/db.cpp index f32e8084c..69d5a6405 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -3045,8 +3045,9 @@ void redisDbPersistentData::commitChanges(const redisDbPersistentDataSnapshot ** } if (m_spstorage != nullptr) { +#if 1 m_spstorage->endWriteBatch(); -#if 0 +#else auto *tok = m_spstorage->begin_endWriteBatch(serverTL->el, storageLoadCallback); if (tok != nullptr) { From a430473a39e2f5f109fee389f34aa1ae6b092004 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 19 Mar 2024 03:28:48 +0000 Subject: [PATCH 60/68] Fix asserts after RDB load with FLASH --- src/db.cpp | 1 + src/rdb.cpp | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/src/db.cpp b/src/db.cpp index 69d5a6405..3accdd361 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2989,6 +2989,7 @@ void redisDbPersistentData::processChangesAsync(std::atomic &pendingJobs) dict *dictNew = dictCreate(&dbDictType, nullptr); std::swap(dictNew, m_pdict); m_cnewKeysPending = 0; + m_numexpires = 0; g_pserver->asyncworkqueue->AddWorkFunction([dictNew, this, &pendingJobs]{ dictIterator *di = dictGetIterator(dictNew); dictEntry *de; diff --git a/src/rdb.cpp b/src/rdb.cpp index 997ef67f3..15946f01c 100644 --- a/src/rdb.cpp +++ b/src/rdb.cpp @@ -2865,6 +2865,12 @@ class rdbAsyncWorkThread size_t endWork() { if (!fLaunched) { + for (int idb = 0; idb < cserver.dbnum; ++idb) { + if (g_pserver->m_pstorageFactory) { + g_pserver->db[idb]->processChangesAsync(cstorageWritesInFlight); + } + } + while (cstorageWritesInFlight > 0); return ckeysLoaded; } if (!vecbatch.empty()) { From 0267bc5ef75a89371966425f69d70499b3cbe880 Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 19 Mar 2024 03:32:44 +0000 Subject: [PATCH 61/68] Add FLASH tests to CI --- build.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/build.yaml b/build.yaml index 8d60fbd21..ac4448554 100644 --- a/build.yaml +++ b/build.yaml @@ -22,6 +22,12 @@ machamp: # https://github.sc-corp.net/Snapchat/img/tree/master/keydb/ubuntu-20-04 builder_image: us.gcr.io/snapchat-build-artifacts/prod/snapchat/img/keydb/keydb-ubuntu-20-04@sha256:cf869a3f5d1de1e1d976bb906689c37b7031938eb68661b844a38c532f27248c command: ./runtest --clients 4 --verbose --tls --skipunit unit/memefficiency + tests-with-flash: + type: cmd + parent: make-build + # https://github.sc-corp.net/Snapchat/img/tree/master/keydb/ubuntu-20-04 + builder_image: us.gcr.io/snapchat-build-artifacts/prod/snapchat/img/keydb/keydb-ubuntu-20-04@sha256:cf869a3f5d1de1e1d976bb906689c37b7031938eb68661b844a38c532f27248c + command: ./runtest --verbose --skipunit unit/memefficiency --flash cluster-test: type: cmd parent: make-build From f5e367d8898c4ae26975ab04b754abb2f06e15bf Mon Sep 17 00:00:00 2001 From: John Sully Date: Tue, 19 Mar 2024 23:10:03 +0000 
Subject: [PATCH 62/68] Fix hash count mismatch asserts for getKeysInSlot --- src/db.cpp | 12 ++++++++++++ src/storage/rocksdb.cpp | 19 +++++++++++-------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/db.cpp b/src/db.cpp index 3accdd361..a7d228434 100644 --- a/src/db.cpp +++ b/src/db.cpp @@ -2528,6 +2528,12 @@ void slotToKeyFlush(int async) { * decrement the reference count to release the keys names. */ unsigned int getKeysInSlot(unsigned int hashslot, robj **keys, unsigned int count) { if (g_pserver->m_pstorageFactory != nullptr) { + // We must commit so the storage engine agrees on the number of items in the hash slot + if (g_pserver->db[0]->FTrackingChanges()) { + if (g_pserver->db[0]->processChanges(false)) + g_pserver->db[0]->commitChanges(); + g_pserver->db[0]->trackChanges(false); + } int j = 0; g_pserver->db[0]->getStorageCache()->enumerate_hashslot([&](const char *key, size_t cchKey, const void *, size_t )->bool { keys[j++] = createStringObject(key, cchKey); @@ -2557,6 +2563,12 @@ unsigned int getKeysInSlot(unsigned int hashslot, robj **keys, unsigned int coun unsigned int delKeysInSlot(unsigned int hashslot) { serverAssert(GlobalLocksAcquired()); if (g_pserver->m_pstorageFactory != nullptr) { + // We must commit so the storage engine agrees on the number of items in the hash slot + if (g_pserver->db[0]->FTrackingChanges()) { + if (g_pserver->db[0]->processChanges(false)) + g_pserver->db[0]->commitChanges(); + g_pserver->db[0]->trackChanges(false); + } int j = 0; g_pserver->db[0]->getStorageCache()->enumerate_hashslot([&](const char *key, size_t cchKey, const void *, size_t )->bool { robj *keyobj = createStringObject(key, cchKey); diff --git a/src/storage/rocksdb.cpp b/src/storage/rocksdb.cpp index c8e364117..ace598d5b 100644 --- a/src/storage/rocksdb.cpp +++ b/src/storage/rocksdb.cpp @@ -25,7 +25,7 @@ bool FInternalKey(const char *key, size_t cch) std::string getPrefix(unsigned int hashslot) { char *hash_char = (char *)&hashslot; - return std::string(hash_char + (sizeof(unsigned int) - 2), 2); + return std::string(hash_char, 2); } std::string prefixKey(const char *key, size_t cchKey) @@ -202,23 +202,26 @@ bool RocksDBStorageProvider::enumerate_hashslot(callback fn, unsigned int hashsl std::string prefix = getPrefix(hashslot); std::unique_ptr it = std::unique_ptr(m_spdb->NewIterator(ReadOptions(), m_spcolfamily.get())); size_t count = 0; - for (it->Seek(prefix.c_str()); it->Valid(); it->Next()) { + serverAssert(prefix.size() >= 2); + bool full_iter = true; + for (it->Seek(rocksdb::Slice(prefix.data(), prefix.size())); it->Valid(); it->Next()) { if (FInternalKey(it->key().data(), it->key().size())) continue; - if (strncmp(it->key().data(),prefix.c_str(),2) != 0) + if (it->key().size() < 2 || memcmp(it->key().data(),prefix.data(),2) != 0) break; ++count; bool fContinue = fn(it->key().data()+2, it->key().size()-2, it->value().data(), it->value().size()); - if (!fContinue) + if (!fContinue) { + full_iter = false; break; + } } - bool full_iter = !it->Valid() || (strncmp(it->key().data(),prefix.c_str(),2) != 0); if (full_iter && count != g_pserver->cluster->slots_keys_count[hashslot]) { - printf("WARNING: rocksdb hashslot count mismatch"); + serverLog(LL_WARNING, "WARNING: rocksdb hashslot %d count mismatch %zu vs expected %lu", hashslot, count, g_pserver->cluster->slots_keys_count[hashslot]); } - assert(!full_iter || count == g_pserver->cluster->slots_keys_count[hashslot]); - assert(it->status().ok()); // Check for any errors found during the scan + 
serverAssert(!full_iter || count == g_pserver->cluster->slots_keys_count[hashslot]); + serverAssert(it->status().ok()); // Check for any errors found during the scan return full_iter; } From 41733d733f7d228e7b8954e2534124a347b9e520 Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 20 Mar 2024 19:21:13 +0000 Subject: [PATCH 63/68] Add data on actual disk used that is compared to maxstorage --- src/modules/keydb_modstatsd/modmain.cpp | 1 + src/server.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/modules/keydb_modstatsd/modmain.cpp b/src/modules/keydb_modstatsd/modmain.cpp index f6a5c0ee2..970082fa5 100644 --- a/src/modules/keydb_modstatsd/modmain.cpp +++ b/src/modules/keydb_modstatsd/modmain.cpp @@ -101,6 +101,7 @@ std::unordered_map g_mapInfoFields = { { "used_memory_rss", { StatsD_Type::STATSD_GAUGE_BYTES }}, { "maxmemory", { StatsD_Type::STATSD_GAUGE_BYTES, false /* prefixOnly */}}, { "maxstorage", { StatsD_Type::STATSD_GAUGE_BYTES, false /* prefixOnly */}}, + { "storage_used", { StatsD_Type::STATSD_GAUGE_BYTES, false /* prefixOnly */}}, { "used_memory_dataset_perc", { StatsD_Type::STATSD_GAUGE_FLOAT }}, { "avg_lock_contention", { StatsD_Type::STATSD_GAUGE_LONGLONG }}, { "repl_backlog_size", { StatsD_Type::STATSD_GAUGE_BYTES }}, diff --git a/src/server.cpp b/src/server.cpp index 10cb25f2f..ece3bce63 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -5827,7 +5827,8 @@ sds genRedisInfoString(const char *section) { "lazyfreed_objects:%zu\r\n" "storage_provider:%s\r\n" "available_system_memory:%s\r\n" - "maxstorage:%llu\r\n", + "maxstorage:%llu\r\n" + "storage_used:%llu\r\n", zmalloc_used, hmem, g_pserver->cron_malloc_stats.process_rss, @@ -5874,7 +5875,8 @@ sds genRedisInfoString(const char *section) { lazyfreeGetFreedObjectsCount(), g_pserver->m_pstorageFactory ? g_pserver->m_pstorageFactory->name() : "none", available_system_mem, - g_pserver->maxstorage + g_pserver->maxstorage, + g_pserver->m_pstorageFactory ? 
g_pserver->m_pstorageFactory->totalDiskspaceUsed() : 0 ); freeMemoryOverheadData(mh); } From bb1780a3f33b73e8351a7c807cf2f55e60c520ae Mon Sep 17 00:00:00 2001 From: John Sully Date: Wed, 20 Mar 2024 19:24:36 +0000 Subject: [PATCH 64/68] Fix warning --- src/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server.cpp b/src/server.cpp index ece3bce63..c9ac73e9f 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -5828,7 +5828,7 @@ sds genRedisInfoString(const char *section) { "storage_provider:%s\r\n" "available_system_memory:%s\r\n" "maxstorage:%llu\r\n" - "storage_used:%llu\r\n", + "storage_used:%lu\r\n", zmalloc_used, hmem, g_pserver->cron_malloc_stats.process_rss, From e580a8ebbee556c35e7ae2d88f29a43e643f04a2 Mon Sep 17 00:00:00 2001 From: John Sully Date: Thu, 4 Apr 2024 21:20:47 +0000 Subject: [PATCH 65/68] Handle pending IO fastsync --- src/SnapshotPayloadParseState.cpp | 5 ----- src/SnapshotPayloadParseState.h | 1 + src/config.cpp | 2 +- src/replication.cpp | 21 ++++++++++++++++++--- src/server.h | 1 + 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/SnapshotPayloadParseState.cpp b/src/SnapshotPayloadParseState.cpp index 83bea31eb..123bd4e35 100644 --- a/src/SnapshotPayloadParseState.cpp +++ b/src/SnapshotPayloadParseState.cpp @@ -233,11 +233,6 @@ void SnapshotPayloadParseState::trimState() { if (stackParse.empty()) { flushQueuedKeys(); - while (*insertsInFlight > 0) { - // TODO: ProcessEventsWhileBlocked - aeReleaseLock(); - aeAcquireLock(); - } } } diff --git a/src/SnapshotPayloadParseState.h b/src/SnapshotPayloadParseState.h index 29ac28fb6..91e9bc956 100644 --- a/src/SnapshotPayloadParseState.h +++ b/src/SnapshotPayloadParseState.h @@ -63,4 +63,5 @@ class SnapshotPayloadParseState { void pushValue(const char *rgch, long long cch); void pushValue(long long value); bool shouldThrottle() const { return *insertsInFlight > (cserver.cthreads*4); } + bool hasIOInFlight() const { return *insertsInFlight > 0; } }; \ No newline at end of file diff --git a/src/config.cpp b/src/config.cpp index 749fb5746..0ac230a9f 100644 --- a/src/config.cpp +++ b/src/config.cpp @@ -3201,7 +3201,7 @@ standardConfig configs[] = { createIntConfig("overload-protect-tenacity", NULL, MODIFIABLE_CONFIG, 0, 100, g_pserver->overload_protect_tenacity, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("force-eviction-percent", NULL, MODIFIABLE_CONFIG, 0, 100, g_pserver->force_eviction_percent, 0, INTEGER_CONFIG, NULL, NULL), createBoolConfig("enable-async-rehash", NULL, MODIFIABLE_CONFIG, g_pserver->enable_async_rehash, 1, NULL, NULL), - createBoolConfig("enable-keydb-fastsync", NULL, MODIFIABLE_CONFIG, g_pserver->fEnableFastSync, 0, NULL, NULL), + createBoolConfig("enable-keydb-fastsync", NULL, MODIFIABLE_CONFIG, g_pserver->fEnableFastSync, 1, NULL, NULL), #ifdef USE_OPENSSL createIntConfig("tls-port", NULL, MODIFIABLE_CONFIG, 0, 65535, g_pserver->tls_port, 0, INTEGER_CONFIG, NULL, updateTLSPort), /* TCP port. 
*/ diff --git a/src/replication.cpp b/src/replication.cpp index 91b995991..d0c3d1b91 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -1059,7 +1059,7 @@ class replicationBuffer { while (checkClientOutputBufferLimits(replica) && (replica->flags.load(std::memory_order_relaxed) & CLIENT_CLOSE_ASAP) == 0) { ul.unlock(); - usleep(0); + usleep(1000); // give 1ms for the I/O before we poll again ul.lock(); } } @@ -2521,7 +2521,7 @@ size_t parseCount(const char *rgch, size_t cch, long long *pvalue) { return cchNumeral + 3; } -bool readSnapshotBulkPayload(connection *conn, redisMaster *mi, rdbSaveInfo &rsi) { +bool readFastSyncBulkPayload(connection *conn, redisMaster *mi, rdbSaveInfo &rsi) { int fUpdate = g_pserver->fActiveReplica || g_pserver->enable_multimaster; serverAssert(GlobalLocksAcquired()); serverAssert(mi->master == nullptr); @@ -2546,6 +2546,10 @@ bool readSnapshotBulkPayload(connection *conn, redisMaster *mi, rdbSaveInfo &rsi } } + if (mi->repl_state == REPL_STATE_WAIT_STORAGE_IO) { + goto LWaitIO; + } + serverAssert(mi->parseState != nullptr); for (int iter = 0; iter < 10; ++iter) { if (mi->parseState->shouldThrottle()) @@ -2663,7 +2667,14 @@ bool readSnapshotBulkPayload(connection *conn, redisMaster *mi, rdbSaveInfo &rsi if (!fFinished) return false; +LWaitIO: + if (mi->parseState->hasIOInFlight()) { + mi->repl_state = REPL_STATE_WAIT_STORAGE_IO; + return false; + } + serverLog(LL_NOTICE, "Fast sync complete"); + serverAssert(!mi->parseState->hasIOInFlight()); delete mi->parseState; mi->parseState = nullptr; return true; @@ -3040,7 +3051,7 @@ void readSyncBulkPayload(connection *conn) { } if (mi->isKeydbFastsync) { - if (!readSnapshotBulkPayload(conn, mi, rsi)) + if (!readFastSyncBulkPayload(conn, mi, rsi)) return; } else { if (!readSyncBulkPayloadRdb(conn, mi, rsi, usemark)) @@ -4807,6 +4818,10 @@ void replicationCron(void) { { redisMaster *mi = (redisMaster*)listNodeValue(lnMaster); + if (mi->repl_state == REPL_STATE_WAIT_STORAGE_IO && !mi->parseState->hasIOInFlight()) { + readSyncBulkPayload(mi->repl_transfer_s); + } + std::unique_lockmaster->lock)> ulock; if (mi->master != nullptr) ulock = decltype(ulock)(mi->master->lock); diff --git a/src/server.h b/src/server.h index 3826385a6..69966f9f3 100644 --- a/src/server.h +++ b/src/server.h @@ -612,6 +612,7 @@ typedef enum { REPL_STATE_RECEIVE_PSYNC_REPLY, /* Wait for PSYNC reply */ /* --- End of handshake states --- */ REPL_STATE_TRANSFER, /* Receiving .rdb from master */ + REPL_STATE_WAIT_STORAGE_IO, REPL_STATE_CONNECTED, /* Connected to master */ } repl_state; From c50e2d49d6ccbdabe37b3a64dd93ed64c2a03e75 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 5 Apr 2024 19:12:19 +0000 Subject: [PATCH 66/68] Fix fastsync test failures caused by closing due to repl buffer size too soon --- src/replication.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index d0c3d1b91..afdb62256 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -1158,10 +1158,23 @@ class replicationBuffer { } void putSlavesOnline() { + for (auto replica : replicas) { + std::unique_lock ul(replica->lock); + // If we put the replica online before the output is drained then it will get immediately closed + while (checkClientOutputBufferLimits(replica) + && (replica->flags.load(std::memory_order_relaxed) & CLIENT_CLOSE_ASAP) == 0) { + ul.unlock(); + usleep(1000); // give 1ms for the I/O before we poll again + ul.lock(); + } + } + + aeAcquireLock(); for (auto replica : 
replicas) { replica->replstate = SLAVE_STATE_FASTSYNC_DONE; replica->repl_put_online_on_ack = 1; } + aeReleaseLock(); } void abort() { @@ -1282,9 +1295,7 @@ int rdbSaveSnapshotForReplication(rdbSaveInfo *rsi) { auto usec = ustime() - timeStart; serverLog(LL_NOTICE, "Transferred %zuMB total (%zuMB data) in %.2f seconds. (%.2fGbit/s)", spreplBuf->cbWritten()/1024/1024, cbData/1024/1024, usec/1000000.0, (spreplBuf->cbWritten()*8.0)/(usec/1000000.0)/1000000000.0); if (retval == C_OK) { - aeAcquireLock(); replBuf.putSlavesOnline(); - aeReleaseLock(); } }); From 66be482ef52bc95ef3acc67571fb345c5032cc96 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 5 Apr 2024 19:15:24 +0000 Subject: [PATCH 67/68] Naming fix --- src/replication.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/replication.cpp b/src/replication.cpp index afdb62256..22e5bc28e 100644 --- a/src/replication.cpp +++ b/src/replication.cpp @@ -1157,7 +1157,7 @@ class replicationBuffer { } } - void putSlavesOnline() { + void putReplicasOnline() { for (auto replica : replicas) { std::unique_lock ul(replica->lock); // If we put the replica online before the output is drained then it will get immediately closed @@ -1295,7 +1295,7 @@ int rdbSaveSnapshotForReplication(rdbSaveInfo *rsi) { auto usec = ustime() - timeStart; serverLog(LL_NOTICE, "Transferred %zuMB total (%zuMB data) in %.2f seconds. (%.2fGbit/s)", spreplBuf->cbWritten()/1024/1024, cbData/1024/1024, usec/1000000.0, (spreplBuf->cbWritten()*8.0)/(usec/1000000.0)/1000000000.0); if (retval == C_OK) { - replBuf.putSlavesOnline(); + replBuf.putReplicasOnline(); } }); From f886fce2d138084dfeeabc0d77338e844ea9e1d5 Mon Sep 17 00:00:00 2001 From: John Sully Date: Fri, 5 Apr 2024 21:48:04 +0000 Subject: [PATCH 68/68] Prevent erroneous socket closure --- src/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/server.cpp b/src/server.cpp index c9ac73e9f..c21d332fe 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -3262,6 +3262,7 @@ void initMasterInfo(redisMaster *master) master->repl_state = REPL_STATE_NONE; master->repl_down_since = 0; /* Never connected, repl is down since EVER. */ master->mvccLastSync = 0; + master->repl_transfer_fd = -1; } void initServerConfig(void) {