Skip to content

Commit

Permalink
opt6: reorder_subqueries (#225)
Browse files Browse the repository at this point in the history
  • Loading branch information
msm-cert authored Oct 1, 2024
1 parent 4b4474e commit d64bc15
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 2 deletions.
57 changes: 56 additions & 1 deletion libursa/QueryOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,61 @@ Query propagate_degenerate_queries(Query &&q, bool *changed) {
return std::move(q);
}

// Estimates how expensive/unpromising a subquery is; lower values are meant
// to be evaluated first.
//
// Ideally this heuristic would measure "what is the chance that this query
// returns zero results", or "how many files we expect to get". Fewer files
// and a bigger chance of an empty result are better. It should also be
// weighted by the query cost (100 queries for a 10% chance of an empty result
// is worse than 2 queries for a 15% chance of an empty result).
//
// The current implementation is a very naive heuristic that only looks at
// the query type (and, for primitives, the index type and ngram value) and
// orders based on that.
//
// Returns: a rank in the bits from 24 upwards, plus the ngram value for
// primitive queries.
// Throws: std::runtime_error when the query type or the index type of a
// primitive query is not one of the handled values.
uint32_t query_heuristic_cost(const Query &q) {
    // The rank is stored above this bit so that it dominates the ngram value.
    // NOTE(review): assumes ngram values fit in the low 24 bits - confirm.
    constexpr uint32_t kRankShift = 24;

    // From empirical test, order of query types doesn't seem to matter much.
    switch (q.get_type()) {
        case QueryType::PRIMITIVE: {
            // Sort by ngram type, then by ngram value, alphabetically first.
            // This is (un)surprisingly important for two reasons:
            // 1. we read sequentially as many ngrams as possible.
            // 2. consecutive ngrams are independent: (abc, bcd) vs (abc, def).
            // Use smaller indexes first, because they're faster to read.
            const auto &ngram = q.as_ngram();
            switch (ngram.itype) {
                case IndexType::WIDE8:
                    return (uint32_t{0} << kRankShift) + ngram.trigram;
                case IndexType::TEXT4:
                    return (uint32_t{1} << kRankShift) + ngram.trigram;
                case IndexType::HASH4:
                    return (uint32_t{2} << kRankShift) + ngram.trigram;
                case IndexType::GRAM3:
                    return (uint32_t{3} << kRankShift) + ngram.trigram;
            }
            // Unhandled index type: leave the outer switch so the error below
            // is raised, instead of falling through into the AND case and
            // silently reporting the cost of an AND query.
            break;
        }
        case QueryType::AND:
            return uint32_t{4} << kRankShift;
        case QueryType::MIN_OF:
            return uint32_t{5} << kRankShift;
        case QueryType::OR:
            // OR is the worst operation, since it always needs to scan
            // all of its arguments (no chance of early exit).
            return uint32_t{6} << kRankShift;
    }
    throw std::runtime_error("Unexpected query/index type.");
}

// "Less than" predicate for sorting queries: `left` goes before `right` when
// its heuristic cost is strictly smaller.
bool query_heuristic_comparer(const Query &left, const Query &right) {
    const uint32_t left_cost = query_heuristic_cost(left);
    const uint32_t right_cost = query_heuristic_cost(right);
    return left_cost < right_cost;
}

// Sorts the direct subqueries of an AND query by ascending heuristic cost,
// to maximize the chance of an early exit. Any other query type is returned
// untouched (currently only AND operators are supported).
// This is done after all other optimizations, and there's no point in
// running it in a loop.
Query reorder_subqueries(Query &&q) {
    if (q.get_type() != QueryType::AND) {
        return std::move(q);
    }
    // Stable sort: subqueries with equal cost keep their relative order.
    auto &&subqueries = q.as_queries();
    std::stable_sort(subqueries.begin(), subqueries.end(),
                     query_heuristic_comparer);
    return std::move(q);
}

Query q_optimize(Query &&q) {
if (q.get_type() == QueryType::PRIMITIVE) {
// Nothing to improve here.
Expand All @@ -160,6 +215,6 @@ Query q_optimize(Query &&q) {
q = simplify_minof(std::move(q), &changed);
q = propagate_degenerate_queries(std::move(q), &changed);
}

q = reorder_subqueries(std::move(q));
return std::move(q);
}
2 changes: 1 addition & 1 deletion libursa/Version.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ constexpr std::string_view ursadb_format_version = "1.5.0";
// Project version.
// Consider updating the version tag when doing PRs.
// clang-format off
constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt5";
constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt6";
// clang-format on

0 comments on commit d64bc15

Please sign in to comment.