Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(keys, scan): Support arbitrary glob patterns #2608

Merged
merged 18 commits into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/config/typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ extend-exclude = [
".git/",
"src/vendor/",
"tests/gocase/util/slot.go",

# Uses short strings for testing glob matching
"tests/cppunit/glob_util_test.cc",
"tests/gocase/unit/keyspace/keyspace_test.go",
"tests/gocase/unit/keyspace/scan_test.go",
]
ignore-hidden = false

Expand Down
14 changes: 6 additions & 8 deletions src/commands/cmd_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
#include "command_parser.h"
#include "commander.h"
#include "commands/scan_base.h"
#include "common/glob.h"
#include "common/io_util.h"
#include "common/rdb_stream.h"
#include "common/time_util.h"
#include "config/config.h"
#include "error_constants.h"
#include "server/redis_connection.h"
Expand All @@ -31,7 +33,6 @@
#include "stats/disk_stats.h"
#include "storage/rdb/rdb.h"
#include "string_util.h"
#include "time_util.h"

namespace redis {

Expand Down Expand Up @@ -114,15 +115,12 @@ class CommandNamespace : public Commander {
class CommandKeys : public Commander {
public:
Status Execute(engine::Context &ctx, Server *srv, Connection *conn, std::string *output) override {
const std::string &prefix = args_[1];
const std::string &glob_pattern = args_[1];
std::vector<std::string> keys;
redis::Database redis(srv->storage, conn->GetNamespace());

if (prefix.empty() || prefix.find('*') != prefix.size() - 1) {
return {Status::RedisExecErr, "only keys prefix match was supported"};
}

const rocksdb::Status s = redis.Keys(ctx, prefix.substr(0, prefix.size() - 1), &keys);
const auto [prefix, suffix_glob] = util::SplitGlob(glob_pattern);
const rocksdb::Status s = redis.Keys(ctx, prefix, suffix_glob, &keys);
if (!s.ok()) {
return {Status::RedisExecErr, s.ToString()};
}
Expand Down Expand Up @@ -848,7 +846,7 @@ class CommandScan : public CommandScanBase {
std::vector<std::string> keys;
std::string end_key;

auto s = redis_db.Scan(ctx, key_name, limit_, prefix_, &keys, &end_key, type_);
const auto s = redis_db.Scan(ctx, key_name, limit_, prefix_, suffix_glob_, &keys, &end_key, type_);
if (!s.ok()) {
return {Status::RedisExecErr, s.ToString()};
}
Expand Down
12 changes: 4 additions & 8 deletions src/commands/scan_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "commander.h"
#include "commands/command_parser.h"
#include "error_constants.h"
#include "glob.h"
#include "parse_util.h"
#include "server/server.h"

Expand All @@ -44,14 +45,8 @@ class CommandScanBase : public Commander {
Status ParseAdditionalFlags(Parser &parser) {
while (parser.Good()) {
if (parser.EatEqICase("match")) {
prefix_ = GET_OR_RET(parser.TakeStr());
// The match pattern should contain exactly one '*' at the end; remove the * to
// get the prefix to match.
if (!prefix_.empty() && prefix_.find('*') == prefix_.size() - 1) {
prefix_.pop_back();
} else {
return {Status::RedisParseErr, "currently only key prefix matching is supported"};
}
const std::string glob_pattern = GET_OR_RET(parser.TakeStr());
std::tie(prefix_, suffix_glob_) = util::SplitGlob(glob_pattern);
} else if (parser.EatEqICase("count")) {
limit_ = GET_OR_RET(parser.TakeInt());
if (limit_ <= 0) {
Expand Down Expand Up @@ -100,6 +95,7 @@ class CommandScanBase : public Commander {
protected:
std::string cursor_;
std::string prefix_;
std::string suffix_glob_;
int limit_ = 20;
RedisType type_ = kRedisNone;
};
Expand Down
158 changes: 158 additions & 0 deletions src/common/glob.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/

#pragma once

#include <algorithm>
#include <cctype>
#include <string>
#include <string_view>

namespace util {

constexpr inline bool GlobMatchImpl(std::string_view pattern, std::string_view string, bool ignore_case,
bool *skip_longer_matches, size_t recursion_depth = 0) {
// If we want to ignore case, this is equivalent to converting both the pattern and the string to lowercase
const auto canonicalize = [ignore_case](unsigned char c) -> unsigned char {
return ignore_case ? static_cast<unsigned char>(std::tolower(c)) : c;
};

if (recursion_depth > 1000) return false;

while (!pattern.empty() && !string.empty()) {
switch (pattern[0]) {
case '*':
// Optimization: collapse multiple * into one
while (pattern.size() >= 2 && pattern[1] == '*') {
pattern.remove_prefix(1);
}
// Optimization: If the '*' is the last character in the pattern, it can match anything
if (pattern.length() == 1) return true;
while (!string.empty()) {
if (GlobMatchImpl(pattern.substr(1), string, ignore_case, skip_longer_matches, recursion_depth + 1))
return true;
if (*skip_longer_matches) return false;
string.remove_prefix(1);
}
// There was no match for the rest of the pattern starting
// from anywhere in the rest of the string. If there were
// any '*' earlier in the pattern, we can terminate the
// search early without trying to match them to longer
// substrings. This is because a longer match for the
// earlier part of the pattern would require the rest of the
// pattern to match starting later in the string, and we
// have just determined that there is no match for the rest
// of the pattern starting from anywhere in the current
// string.
*skip_longer_matches = true;
return false;
case '?':
if (string.empty()) return false;
string.remove_prefix(1);
break;
case '[': {
pattern.remove_prefix(1);
const bool invert = pattern[0] == '^';
if (invert) pattern.remove_prefix(1);

bool match = false;
while (true) {
if (pattern.empty()) {
// unterminated [ group: reject invalid pattern
return false;
} else if (pattern[0] == ']') {
break;
} else if (pattern.length() >= 2 && pattern[0] == '\\') {
pattern.remove_prefix(1);
if (pattern[0] == string[0]) match = true;
} else if (pattern.length() >= 3 && pattern[1] == '-') {
unsigned char start = canonicalize(pattern[0]);
unsigned char end = canonicalize(pattern[2]);
if (start > end) std::swap(start, end);
const int c = canonicalize(string[0]);
pattern.remove_prefix(2);

if (c >= start && c <= end) match = true;
} else if (canonicalize(pattern[0]) == canonicalize(string[0])) {
match = true;
}
pattern.remove_prefix(1);
}
if (invert) match = !match;
if (!match) return false;
string.remove_prefix(1);
break;
}
case '\\':
if (pattern.length() >= 2) {
pattern.remove_prefix(1);
}
[[fallthrough]];
default:
// Just a normal character
if (!ignore_case) {
if (pattern[0] != string[0]) return false;
} else {
if (std::tolower((int)pattern[0]) != std::tolower((int)string[0])) return false;
}
string.remove_prefix(1);
break;
}
pattern.remove_prefix(1);
}

// Now that either the pattern is empty or the string is empty, this is a match iff
// the pattern consists only of '*', and the string is empty.
return string.empty() && std::all_of(pattern.begin(), pattern.end(), [](char c) { return c == '*'; });
}

// Given a glob [pattern] and a string [string], return true iff the string matches the glob.
// If [ignore_case] is true, the match is case-insensitive.
constexpr inline bool GlobMatches(std::string_view glob, std::string_view str, bool ignore_case = false) {
bool skip_longer_matches = false;
return GlobMatchImpl(glob, str, ignore_case, &skip_longer_matches);
}
nathanlo-hrt marked this conversation as resolved.
Show resolved Hide resolved

// Split a glob pattern into a literal prefix and a suffix containing wildcards.
// For example, if the user calls [KEYS bla*bla], this function will return {"bla", "*bla"}.
// This allows the caller of this function to optimize this call by performing a
// prefix-scan on "bla" and then filtering the results using the GlobMatches function.
inline std::pair<std::string, std::string> SplitGlob(std::string_view glob) {
// Stores the prefix of the glob pattern, with backslashes removed
std::string prefix;
// Find the first un-escaped '*', '?' or '[' character in [glob]
for (size_t idx = 0; idx < glob.size(); ++idx) {
if (glob[idx] == '*' || glob[idx] == '?' || glob[idx] == '[') {
// Return a pair of views: the part of the glob before the wildcard, and the part after
return {prefix, std::string(glob.substr(idx))};
} else if (glob[idx] == '\\') {
// Skip checking whether the next character is a special character
++idx;
// Append the escaped special character to the prefix
prefix.push_back(glob[idx]);
} else {
prefix.push_back(glob[idx]);
}
}
// No wildcard found, return the entire string (without the backslashes) as the prefix
return {prefix, ""};
}

} // namespace util
28 changes: 17 additions & 11 deletions src/storage/redis_db.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,15 @@
#include "redis_db.h"

#include <ctime>
#include <map>
#include <utility>

#include "cluster/redis_slot.h"
#include "common/scope_exit.h"
#include "db_util.h"
#include "glob.h"
#include "parse_util.h"
#include "rocksdb/iterator.h"
#include "rocksdb/status.h"
#include "server/server.h"
#include "storage/iterator.h"
#include "storage/redis_metadata.h"
#include "storage/storage.h"
Expand Down Expand Up @@ -249,11 +248,11 @@ rocksdb::Status Database::GetExpireTime(engine::Context &ctx, const Slice &user_
}

rocksdb::Status Database::GetKeyNumStats(engine::Context &ctx, const std::string &prefix, KeyNumStats *stats) {
return Keys(ctx, prefix, nullptr, stats);
return Keys(ctx, prefix, "*", nullptr, stats);
}

rocksdb::Status Database::Keys(engine::Context &ctx, const std::string &prefix, std::vector<std::string> *keys,
KeyNumStats *stats) {
rocksdb::Status Database::Keys(engine::Context &ctx, const std::string &prefix, const std::string &suffix_glob,
std::vector<std::string> *keys, KeyNumStats *stats) {
uint16_t slot_id = 0;
std::string ns_prefix;
if (namespace_ != kDefaultNamespace || keys != nullptr) {
Expand All @@ -277,6 +276,10 @@ rocksdb::Status Database::Keys(engine::Context &ctx, const std::string &prefix,
if (!ns_prefix.empty() && !iter->key().starts_with(ns_prefix)) {
break;
}
auto [_, user_key] = ExtractNamespaceKey(iter->key(), storage_->IsSlotIdEncoded());
if (!util::GlobMatches(suffix_glob, user_key.ToString().substr(prefix.size()))) {
continue;
}
Metadata metadata(kRedisNone, false);
auto s = metadata.Decode(iter->value());
if (!s.ok()) continue;
Expand All @@ -293,7 +296,6 @@ rocksdb::Status Database::Keys(engine::Context &ctx, const std::string &prefix,
}
}
if (keys) {
auto [_, user_key] = ExtractNamespaceKey(iter->key(), storage_->IsSlotIdEncoded());
keys->emplace_back(user_key.ToString());
}
}
Expand All @@ -319,8 +321,8 @@ rocksdb::Status Database::Keys(engine::Context &ctx, const std::string &prefix,
}

rocksdb::Status Database::Scan(engine::Context &ctx, const std::string &cursor, uint64_t limit,
const std::string &prefix, std::vector<std::string> *keys, std::string *end_cursor,
RedisType type) {
const std::string &prefix, const std::string &suffix_glob,
std::vector<std::string> *keys, std::string *end_cursor, RedisType type) {
end_cursor->clear();
uint64_t cnt = 0;
uint16_t slot_start = 0;
Expand Down Expand Up @@ -366,6 +368,10 @@ rocksdb::Status Database::Scan(engine::Context &ctx, const std::string &cursor,

if (metadata.Expired()) continue;
std::tie(std::ignore, user_key) = ExtractNamespaceKey<std::string>(iter->key(), storage_->IsSlotIdEncoded());

if (!util::GlobMatches(suffix_glob, user_key.substr(prefix.size()))) {
continue;
}
keys->emplace_back(user_key);
cnt++;
Copy link
Member

@PragmaTwice PragmaTwice Oct 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering that, if nothing got matched in this limit, e.g. the limit is 10, and in this 10 keys no key is matched, but the 12th key is matched, will a valid cursor be returned so that users can use it to continue the scan?

Also could we add a test case to ensure that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it seems like cnt is only incremented when we actually add a key to the result, so the loop will continue until no more keys match the prefix, or limit keys are inserted into the result vector

Copy link
Member

@PragmaTwice PragmaTwice Oct 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm I think we'd better refactor the logic, so that the SCAN can be quick, instead of a long-time blocking scan. E.g. we can scan a fixed maximum numbers of keys even if the limit/cnt is not reach. It's fine to return even zero matched key to users.
Or maybe we can confirm the logic in Redis?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if we can just merge this first and then plan to refactor to a "quick" scan.

WDYT? cc @mapleFU @git-hulk

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for missing this comment. This sounds good to me after taking a rough review.

Hmmm I think we'd better refactor the logic, so that the SCAN can be quick, instead of a long-time blocking scan. E.g. we can scan a fixed maximum numbers of keys even if the limit/cnt is not reach. It's fine to return even zero matched key to users.
Or maybe we can confirm the logic in Redis?

Yes, Redis will set a max iteration number(10*count) to avoid blocking too long.

Refer: https://github.com/redis/redis/blob/611c950293ae34dcef148ec62c9dd9626d7dc9e3/src/db.c#L1212

}
Expand Down Expand Up @@ -395,7 +401,7 @@ rocksdb::Status Database::Scan(engine::Context &ctx, const std::string &cursor,
if (iter->Valid()) {
std::tie(std::ignore, user_key) = ExtractNamespaceKey<std::string>(iter->key(), storage_->IsSlotIdEncoded());
auto res = std::mismatch(prefix.begin(), prefix.end(), user_key.begin());
if (res.first == prefix.end()) {
if (res.first == prefix.end() && util::GlobMatches(suffix_glob, user_key.substr(prefix.size()))) {
keys->emplace_back(user_key);
}

Expand All @@ -420,13 +426,13 @@ rocksdb::Status Database::RandomKey(engine::Context &ctx, const std::string &cur

std::string end_cursor;
std::vector<std::string> keys;
auto s = Scan(ctx, cursor, RANDOM_KEY_SCAN_LIMIT, "", &keys, &end_cursor);
auto s = Scan(ctx, cursor, RANDOM_KEY_SCAN_LIMIT, "", "*", &keys, &end_cursor);
if (!s.ok()) {
return s;
}
if (keys.empty() && !cursor.empty()) {
// if reach the end, restart from beginning
s = Scan(ctx, "", RANDOM_KEY_SCAN_LIMIT, "", &keys, &end_cursor);
s = Scan(ctx, "", RANDOM_KEY_SCAN_LIMIT, "", "*", &keys, &end_cursor);
if (!s.ok()) {
return s;
}
Expand Down
9 changes: 4 additions & 5 deletions src/storage/redis_db.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

#pragma once

#include <map>
#include <optional>
#include <string>
#include <utility>
Expand All @@ -29,7 +28,6 @@

#include "cluster/cluster_defs.h"
#include "redis_metadata.h"
#include "server/redis_reply.h"
#include "storage.h"

namespace redis {
Expand Down Expand Up @@ -119,11 +117,12 @@ class Database {
[[nodiscard]] rocksdb::Status FlushDB(engine::Context &ctx);
[[nodiscard]] rocksdb::Status FlushAll(engine::Context &ctx);
[[nodiscard]] rocksdb::Status GetKeyNumStats(engine::Context &ctx, const std::string &prefix, KeyNumStats *stats);
[[nodiscard]] rocksdb::Status Keys(engine::Context &ctx, const std::string &prefix,
[[nodiscard]] rocksdb::Status Keys(engine::Context &ctx, const std::string &prefix, const std::string &suffix_glob,
std::vector<std::string> *keys = nullptr, KeyNumStats *stats = nullptr);
[[nodiscard]] rocksdb::Status Scan(engine::Context &ctx, const std::string &cursor, uint64_t limit,
const std::string &prefix, std::vector<std::string> *keys,
std::string *end_cursor = nullptr, RedisType type = kRedisNone);
const std::string &prefix, const std::string &suffix_glob,
std::vector<std::string> *keys, std::string *end_cursor = nullptr,
RedisType type = kRedisNone);
[[nodiscard]] rocksdb::Status RandomKey(engine::Context &ctx, const std::string &cursor, std::string *key);
std::string AppendNamespacePrefix(const Slice &user_key);
[[nodiscard]] rocksdb::Status ClearKeysOfSlotRange(engine::Context &ctx, const rocksdb::Slice &ns,
Expand Down
Loading
Loading