Skip to content

Commit

Permalink
stats: convert tag extractor regexs to Re2 (#14519)
Browse files Browse the repository at this point in the history
Risk Level: high, the regexes are updated to match more specific patterns.
Testing: unit tests

Fixes #14439

Signed-off-by: Dmitry Rozhkov <dmitry.rozhkov@intel.com>
  • Loading branch information
rojkov authored Jan 14, 2021
1 parent 4cadf72 commit de02955
Show file tree
Hide file tree
Showing 6 changed files with 249 additions and 60 deletions.
129 changes: 73 additions & 56 deletions source/common/config/well_known_names.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,31 @@
#include "common/config/well_known_names.h"

#include "absl/strings/str_replace.h"

namespace Envoy {
namespace Config {

namespace {

// To allow for more readable regular expressions to be declared below, and to
// reduce duplication, define a few common pattern substitutions for regex
// segments.
std::string expandRegex(const std::string& regex) {
  // An IPv4 address, or a bracketed IPv6 address, followed by an underscore
  // and a port number.
  constexpr char address_pattern[] =
      R"((?:(?:\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|\[[a-fA-F_\d]+\])_\d+))";
  // Cipher names: word characters and dashes.
  constexpr char cipher_pattern[] = R"([\w-]+)";
  // A generic name: one or more characters, none of which is a dot.
  constexpr char name_pattern[] = R"([^\.]+)";
  // Route config names: word characters, dashes and dots.
  constexpr char route_config_name_pattern[] = R"([\w-\.]+)";

  // Substitute every placeholder occurrence with its regex fragment.
  return absl::StrReplaceAll(regex, {{"<ADDRESS>", address_pattern},
                                     {"<CIPHER>", cipher_pattern},
                                     {"<NAME>", name_pattern},
                                     {"<ROUTE_CONFIG_NAME>", route_config_name_pattern}});
}

} // namespace

TagNameValues::TagNameValues() {
// Note: the default regexes are defined below in the order that they will typically be matched
// (see the TagExtractor class definition for an explanation of the iterative matching process).
Expand All @@ -24,107 +47,101 @@ TagNameValues::TagNameValues() {
// - Typical * notation will be used to denote an arbitrary set of characters.

// *_rq(_<response_code>)
addRegex(RESPONSE_CODE, "_rq(_(\\d{3}))$", "_rq_");
addRe2(RESPONSE_CODE, R"(_rq(_(\d{3}))$)", "_rq_");

// *_rq_(<response_code_class>)xx
addRegex(RESPONSE_CODE_CLASS, "_rq_(\\d)xx$", "_rq_");
addRe2(RESPONSE_CODE_CLASS, R"(_rq_((\d))xx$)", "_rq_");

// http.[<stat_prefix>.]dynamodb.table.[<table_name>.]capacity.[<operation_name>.](__partition_id=<last_seven_characters_from_partition_id>)
addRegex(DYNAMO_PARTITION_ID,
"^http(?=\\.).*?\\.dynamodb\\.table(?=\\.).*?\\."
"capacity(?=\\.).*?(\\.__partition_id=(\\w{7}))$",
".dynamodb.table.");
addRe2(DYNAMO_PARTITION_ID,
R"(^http\.<NAME>\.dynamodb\.table\.<NAME>\.capacity\.<NAME>(\.__partition_id=(\w{7}))$)",
".dynamodb.table.");

// http.[<stat_prefix>.]dynamodb.operation.(<operation_name>.)<base_stat> or
// http.[<stat_prefix>.]dynamodb.operation.(<operation_name>.)* or
// http.[<stat_prefix>.]dynamodb.table.[<table_name>.]capacity.(<operation_name>.)[<partition_id>]
addRegex(DYNAMO_OPERATION,
"^http(?=\\.).*?\\.dynamodb.(?:operation|table(?="
"\\.).*?\\.capacity)(\\.(.*?))(?:\\.|$)",
".dynamodb.");
// The separator after "dynamodb" is escaped so that it matches only a literal
// '.', not an arbitrary character.
addRe2(DYNAMO_OPERATION,
       R"(^http\.<NAME>\.dynamodb\.(?:operation|table\.<NAME>\.capacity)(\.(<NAME>))(?:\.|$))",
       ".dynamodb.");

// mongo.[<stat_prefix>.]collection.[<collection>.]callsite.(<callsite>.)query.<base_stat>
addRegex(MONGO_CALLSITE,
R"(^mongo(?=\.).*?\.collection(?=\.).*?\.callsite\.((.*?)\.).*?query.\w+?$)",
".collection.");
// mongo.[<stat_prefix>.]collection.[<collection>.]callsite.(<callsite>.)query.*
addRe2(MONGO_CALLSITE, R"(^mongo\.<NAME>\.collection\.<NAME>\.callsite\.((<NAME>)\.)query\.)",
".collection.");

// http.[<stat_prefix>.]dynamodb.table.(<table_name>.) or
// http.[<stat_prefix>.]dynamodb.table.(<table_name>.)* or
// http.[<stat_prefix>.]dynamodb.error.(<table_name>.)*
addRegex(DYNAMO_TABLE, R"(^http(?=\.).*?\.dynamodb.(?:table|error)\.((.*?)\.))", ".dynamodb.");
addRe2(DYNAMO_TABLE, R"(^http\.<NAME>\.dynamodb.(?:table|error)\.((<NAME>)\.))", ".dynamodb.");

// mongo.[<stat_prefix>.]collection.(<collection>.)query.<base_stat>
addRegex(MONGO_COLLECTION, R"(^mongo(?=\.).*?\.collection\.((.*?)\.).*?query.\w+?$)",
".collection.");
// mongo.[<stat_prefix>.]collection.(<collection>.)query.*
addRe2(MONGO_COLLECTION, R"(^mongo\.<NAME>\.collection\.((<NAME>)\.).*?query\.)", ".collection.");

// mongo.[<stat_prefix>.]cmd.(<cmd>.)<base_stat>
addRegex(MONGO_CMD, R"(^mongo(?=\.).*?\.cmd\.((.*?)\.)\w+?$)", ".cmd.");
// mongo.[<stat_prefix>.]cmd.(<cmd>.)*
addRe2(MONGO_CMD, R"(^mongo\.<NAME>\.cmd\.((<NAME>)\.))", ".cmd.");

// cluster.[<route_target_cluster>.]grpc.[<grpc_service>.](<grpc_method>.)<base_stat>
addRegex(GRPC_BRIDGE_METHOD, R"(^cluster(?=\.).*?\.grpc(?=\.).*\.((.*?)\.)\w+?$)", ".grpc.");
// cluster.[<route_target_cluster>.]grpc.[<grpc_service>.](<grpc_method>.)*
addRe2(GRPC_BRIDGE_METHOD, R"(^cluster\.<NAME>\.grpc\.<NAME>\.((<NAME>)\.))", ".grpc.");

// http.[<stat_prefix>.]user_agent.(<user_agent>.)<base_stat>
addRegex(HTTP_USER_AGENT, R"(^http(?=\.).*?\.user_agent\.((.*?)\.)\w+?$)", ".user_agent.");
// http.[<stat_prefix>.]user_agent.(<user_agent>.)*
addRe2(HTTP_USER_AGENT, R"(^http\.<NAME>\.user_agent\.((<NAME>)\.))", ".user_agent.");

// vhost.[<virtual host name>.]vcluster.(<virtual_cluster_name>.)<base_stat>
addRegex(VIRTUAL_CLUSTER, R"(^vhost(?=\.).*?\.vcluster\.((.*?)\.)\w+?$)", ".vcluster.");
// vhost.[<virtual host name>.]vcluster.(<virtual_cluster_name>.)*
addRe2(VIRTUAL_CLUSTER, R"(^vhost\.<NAME>\.vcluster\.((<NAME>)\.))", ".vcluster.");

// http.[<stat_prefix>.]fault.(<downstream_cluster>.)<base_stat>
addRegex(FAULT_DOWNSTREAM_CLUSTER, R"(^http(?=\.).*?\.fault\.((.*?)\.)\w+?$)", ".fault.");
// http.[<stat_prefix>.]fault.(<downstream_cluster>.)*
addRe2(FAULT_DOWNSTREAM_CLUSTER, R"(^http\.<NAME>\.fault\.((<NAME>)\.))", ".fault.");

// listener.[<address>.]ssl.cipher.(<cipher>)
addRegex(SSL_CIPHER, R"(^listener(?=\.).*?\.ssl\.cipher(\.(.*?))$)");
addRe2(SSL_CIPHER, R"(^listener\..*?\.ssl\.cipher(\.(<CIPHER>))$)");

// cluster.[<cluster_name>.]ssl.ciphers.(<cipher>)
addRegex(SSL_CIPHER_SUITE, R"(^cluster(?=\.).*?\.ssl\.ciphers(\.(.*?))$)", ".ssl.ciphers.");
addRe2(SSL_CIPHER_SUITE, R"(^cluster\.<NAME>\.ssl\.ciphers(\.(<CIPHER>))$)", ".ssl.ciphers.");

// cluster.[<route_target_cluster>.]grpc.(<grpc_service>.)*
addRegex(GRPC_BRIDGE_SERVICE, R"(^cluster(?=\.).*?\.grpc\.((.*?)\.))", ".grpc.");
addRe2(GRPC_BRIDGE_SERVICE, R"(^cluster\.<NAME>\.grpc\.((<NAME>)\.))", ".grpc.");

// tcp.(<stat_prefix>.)<base_stat>
addRegex(TCP_PREFIX, R"(^tcp\.((.*?)\.)\w+?$)");
// tcp.(<stat_prefix>.)*
addRe2(TCP_PREFIX, R"(^tcp\.((<NAME>)\.))");

// udp.(<stat_prefix>.)<base_stat>
addRegex(UDP_PREFIX, R"(^udp\.((.*?)\.)\w+?$)");
// udp.(<stat_prefix>.)*
addRe2(UDP_PREFIX, R"(^udp\.((<NAME>)\.))");

// auth.clientssl.(<stat_prefix>.)<base_stat>
addRegex(CLIENTSSL_PREFIX, R"(^auth\.clientssl\.((.*?)\.)\w+?$)");
// auth.clientssl.(<stat_prefix>.)*
addRe2(CLIENTSSL_PREFIX, R"(^auth\.clientssl\.((<NAME>)\.))");

// ratelimit.(<stat_prefix>.)<base_stat>
addRegex(RATELIMIT_PREFIX, R"(^ratelimit\.((.*?)\.)\w+?$)");
// ratelimit.(<stat_prefix>.)*
addRe2(RATELIMIT_PREFIX, R"(^ratelimit\.((<NAME>)\.))");

// cluster.(<cluster_name>.)*
addRe2(CLUSTER_NAME, "^cluster\\.(([^\\.]+)\\.).*");
addRe2(CLUSTER_NAME, R"(^cluster\.((<NAME>)\.))");

// listener.[<address>.]http.(<stat_prefix>.)*
addRegex(HTTP_CONN_MANAGER_PREFIX, R"(^listener(?=\.).*?\.http\.((.*?)\.))", ".http.");
// The <address> part is deliberately matched loosely here (.*?): this keeps the
// regex's internal state simpler, which performs better.
addRe2(HTTP_CONN_MANAGER_PREFIX, R"(^listener\..*?\.http\.((<NAME>)\.))", ".http.");

// http.(<stat_prefix>.)*
addRegex(HTTP_CONN_MANAGER_PREFIX, "^http\\.((.*?)\\.)");
addRe2(HTTP_CONN_MANAGER_PREFIX, R"(^http\.((<NAME>)\.))");

// listener.(<address>.)*
addRegex(LISTENER_ADDRESS,
R"(^listener\.(((?:[_.[:digit:]]*|[_\[\]aAbBcCdDeEfF[:digit:]]*))\.))");
addRe2(LISTENER_ADDRESS, R"(^listener\.((<ADDRESS>)\.))");

// vhost.(<virtual host name>.)*
addRegex(VIRTUAL_HOST, "^vhost\\.((.*?)\\.)");
addRe2(VIRTUAL_HOST, R"(^vhost\.((<NAME>)\.))");

// mongo.(<stat_prefix>.)*
addRegex(MONGO_PREFIX, "^mongo\\.((.*?)\\.)");
addRe2(MONGO_PREFIX, R"(^mongo\.((<NAME>)\.))");

// http.[<stat_prefix>.]rds.(<route_config_name>.)<base_stat>
addRegex(RDS_ROUTE_CONFIG, R"(^http(?=\.).*?\.rds\.((.*?)\.)\w+?$)", ".rds.");
// Note: <route_config_name> can contain dots, so this pattern has to keep
// matching the full stat name (it stays anchored with `\w+?$`).
addRe2(RDS_ROUTE_CONFIG, R"(^http\.<NAME>\.rds\.((<ROUTE_CONFIG_NAME>)\.)\w+?$)", ".rds.");

// listener_manager.(worker_<id>.)*
addRegex(WORKER_ID, R"(^listener_manager\.((worker_\d+)\.))", "listener_manager.worker_");
}

void TagNameValues::addRegex(const std::string& name, const std::string& regex,
const std::string& substr) {
descriptor_vec_.emplace_back(Descriptor{name, regex, substr, Regex::Type::StdRegex});
addRe2(WORKER_ID, R"(^listener_manager\.((worker_\d+)\.))", "listener_manager.worker_");
}

void TagNameValues::addRe2(const std::string& name, const std::string& regex,
const std::string& substr) {
descriptor_vec_.emplace_back(Descriptor{name, regex, substr, Regex::Type::Re2});
descriptor_vec_.emplace_back(Descriptor{name, expandRegex(regex), substr, Regex::Type::Re2});
}

} // namespace Config
Expand Down
1 change: 0 additions & 1 deletion source/common/config/well_known_names.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,6 @@ class TagNameValues {
const std::vector<Descriptor>& descriptorVec() const { return descriptor_vec_; }

private:
void addRegex(const std::string& name, const std::string& regex, const std::string& substr = "");
void addRe2(const std::string& name, const std::string& regex, const std::string& substr = "");

// Collection of tag descriptors.
Expand Down
14 changes: 11 additions & 3 deletions source/common/stats/tag_extractor_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ bool regexStartsWithDot(absl::string_view regex) {

TagExtractorImplBase::TagExtractorImplBase(absl::string_view name, absl::string_view regex,
absl::string_view substr)
: name_(name), prefix_(std::string(extractRegexPrefix(regex))), substr_(substr) {}
: name_(name), prefix_(std::string(extractRegexPrefix(regex))), substr_(substr) {
PERF_TAG_INIT;
}

std::string TagExtractorImplBase::extractRegexPrefix(absl::string_view regex) {
std::string prefix;
Expand Down Expand Up @@ -90,6 +92,7 @@ bool TagExtractorStdRegexImpl::extractTag(absl::string_view stat_name, std::vect

if (substrMismatch(stat_name)) {
PERF_RECORD(perf, "re-skip", name_);
PERF_TAG_INC(skipped_);
return false;
}

Expand All @@ -113,9 +116,11 @@ bool TagExtractorStdRegexImpl::extractTag(absl::string_view stat_name, std::vect
std::string::size_type end = remove_subexpr.second - stat_name.begin();
remove_characters.insert(start, end);
PERF_RECORD(perf, "re-match", name_);
PERF_TAG_INC(matched_);
return true;
}
PERF_RECORD(perf, "re-miss", name_);
PERF_TAG_INC(missed_);
return false;
}

Expand All @@ -129,15 +134,16 @@ bool TagExtractorRe2Impl::extractTag(absl::string_view stat_name, std::vector<Ta

if (substrMismatch(stat_name)) {
PERF_RECORD(perf, "re2-skip", name_);
PERF_TAG_INC(skipped_);
return false;
}

// remove_subexpr is the first submatch. It represents the portion of the string to be removed.
re2::StringPiece remove_subexpr, value_subexpr;

// The regex must match and contain one or more subexpressions (all after the first are ignored).
if (re2::RE2::FullMatch(re2::StringPiece(stat_name.data(), stat_name.size()), regex_,
&remove_subexpr, &value_subexpr) &&
if (re2::RE2::PartialMatch(re2::StringPiece(stat_name.data(), stat_name.size()), regex_,
&remove_subexpr, &value_subexpr) &&
!remove_subexpr.empty()) {

// value_subexpr is the optional second submatch. It is usually inside the first submatch
Expand All @@ -155,9 +161,11 @@ bool TagExtractorRe2Impl::extractTag(absl::string_view stat_name, std::vector<Ta
remove_characters.insert(start, end);

PERF_RECORD(perf, "re2-match", name_);
PERF_TAG_INC(matched_);
return true;
}
PERF_RECORD(perf, "re2-miss", name_);
PERF_TAG_INC(missed_);
return false;
}

Expand Down
36 changes: 36 additions & 0 deletions source/common/stats/tag_extractor_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
#include <regex>
#include <string>

#ifdef ENVOY_PERF_ANNOTATION
#include <fmt/core.h>
#endif

#include "envoy/stats/tag_extractor.h"

#include "common/common/regex.h"
Expand All @@ -14,6 +18,29 @@
namespace Envoy {
namespace Stats {

// To check if a tag extractor is actually used you can run
// bazel test //test/... --test_output=streamed --define=perf_annotation=enabled
#ifdef ENVOY_PERF_ANNOTATION

// Tallies of extractTag() outcomes for one tag extractor: calls skipped by the
// substring pre-check, calls where the regex matched, and calls where it did not.
struct Counters {
  uint32_t skipped_ = 0;
  uint32_t matched_ = 0;
  uint32_t missed_ = 0;
};

#define PERF_TAG_COUNTERS std::unique_ptr<Counters> counters_

#define PERF_TAG_INIT counters_ = std::make_unique<Counters>()
#define PERF_TAG_INC(member) ++(counters_->member)

#else

#define PERF_TAG_COUNTERS
#define PERF_TAG_INIT
#define PERF_TAG_INC(member)

#endif

class TagExtractorImplBase : public TagExtractor {
public:
/**
Expand All @@ -32,6 +59,13 @@ class TagExtractorImplBase : public TagExtractor {

TagExtractorImplBase(absl::string_view name, absl::string_view regex,
absl::string_view substr = "");
#ifdef ENVOY_PERF_ANNOTATION
// Prints this extractor's skipped/matched/missed tallies to stdout on destruction.
~TagExtractorImplBase() override {
  const Counters& tally = *counters_;
  std::cout << fmt::format("TagStats for {} tag extractor: skipped {}, matched {}, missing {}",
                           name_, tally.skipped_, tally.matched_, tally.missed_)
            << std::endl;
}
#endif
std::string name() const override { return name_; }
absl::string_view prefixToken() const override { return prefix_; }

Expand Down Expand Up @@ -62,6 +96,8 @@ class TagExtractorImplBase : public TagExtractor {
const std::string name_;
const std::string prefix_;
const std::string substr_;

PERF_TAG_COUNTERS;
};

class TagExtractorStdRegexImpl : public TagExtractorImplBase {
Expand Down
19 changes: 19 additions & 0 deletions test/common/stats/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,25 @@ envoy_cc_test(
],
)

# Benchmark binary for the tag extractor implementation, built from
# tag_extractor_impl_speed_test.cc against the "benchmark" external dependency
# and the tag producer library.
envoy_cc_benchmark_binary(
name = "tag_extractor_impl_benchmark",
srcs = [
"tag_extractor_impl_speed_test.cc",
],
external_deps = [
"benchmark",
],
deps = [
"//source/common/stats:tag_producer_lib",
"@envoy_api//envoy/config/metrics/v3:pkg_cc_proto",
],
)

# Test target wrapping the tag_extractor_impl_benchmark binary; presumably runs
# the benchmark as part of the test suite (confirm against the
# envoy_benchmark_test macro).
envoy_benchmark_test(
name = "tag_extractor_impl_benchmark_test",
benchmark_binary = "tag_extractor_impl_benchmark",
)

envoy_cc_test(
name = "thread_local_store_test",
srcs = ["thread_local_store_test.cc"],
Expand Down
Loading

0 comments on commit de02955

Please sign in to comment.