opt8: keep runs in compressed form (#227)

CERT-Polska · Oct 6, 2024 · cdbf0c1 · cdbf0c1
1 parent 93b169a
commit cdbf0c1
Show file tree

Hide file tree

Showing 12 changed files with 203 additions and 76 deletions.
diff --git a/libursa/OnDiskDataset.cpp b/libursa/OnDiskDataset.cpp
@@ -108,7 +108,7 @@ void OnDiskDataset::execute(const Query &query, ResultWriter *out,
  files_index->for_each_filename(
  [&out](const std::string &fname) { out->push_back(fname); });
  } else {
- for (const auto &fid : result.vector()) {
+ for (const auto &fid : result.vector().decompressed()) {
  out->push_back(get_file_name(fid));
  }
  }

diff --git a/libursa/OnDiskIndex.cpp b/libursa/OnDiskIndex.cpp
@@ -74,8 +74,7 @@ std::pair<uint64_t, uint64_t> OnDiskIndex::get_run_offsets(
  return std::make_pair(ptrs[0], ptrs[1]);
 }
 
-std::vector<FileId> OnDiskIndex::get_run(uint64_t ptr,
- uint64_t next_ptr) const {
+SortedRun OnDiskIndex::get_run(uint64_t ptr, uint64_t next_ptr) const {
  uint64_t run_length = next_ptr - ptr;
 
  if (ptr > next_ptr || next_ptr > index_size) {
@@ -86,12 +85,11 @@ std::vector<FileId> OnDiskIndex::get_run(uint64_t ptr,
 
  std::vector<uint8_t> run_bytes(run_length);
  ndxfile.pread(run_bytes.data(), run_length, ptr);
- return read_compressed_run(run_bytes.data(),
- run_bytes.data() + run_bytes.size());
+ return SortedRun(std::move(run_bytes));
 }
 
-std::vector<FileId> OnDiskIndex::query_primitive(TriGram trigram,
-  QueryCounter *counter) const {
+SortedRun OnDiskIndex::query_primitive(TriGram trigram,
+ QueryCounter *counter) const {
  auto op = QueryOperation(counter);
  std::pair<uint64_t, uint64_t> offsets = get_run_offsets(trigram);
  return get_run(offsets.first, offsets.second);

diff --git a/libursa/OnDiskIndex.h b/libursa/OnDiskIndex.h
@@ -21,9 +21,8 @@ class OnDiskIndex {
  IndexType ntype;
 
  static constexpr uint32_t VERSION = 6;
- std::vector<FileId> get_run(uint64_t ptr, uint64_t next_ptr) const;
- std::vector<FileId> query_primitive(TriGram trigram,
- QueryCounter *counter) const;
+ SortedRun get_run(uint64_t ptr, uint64_t next_ptr) const;
+ SortedRun query_primitive(TriGram trigram, QueryCounter *counter) const;
  std::pair<uint64_t, uint64_t> get_run_offsets(TriGram trigram) const;
 
  static void on_disk_merge_core(const std::vector<IndexMergeHelper> &indexes,

diff --git a/libursa/Query.cpp b/libursa/Query.cpp
@@ -226,7 +226,6 @@ void Query::prefetch(int from_index, int howmany, bool only_last,
  if (only_last && (i + 1 != howmany)) {
  continue;
  }
- spdlog::debug("prefetching {}", ndx);
  prefetcher(queries[ndx].ngram);
  }
  }
@@ -260,7 +259,7 @@ QueryResult Query::run(const QueryPrimitive &primitive,
  // Case: or. Short circuits when result is already everything.
  if (type == QueryType::OR) {
  auto result = QueryResult::empty();
- for (const auto &query : queries) {
+ for (auto &query : queries) {
  result.do_or(query.run(primitive, prefetcher, counters),
  &counters->ors());
  if (result.is_everything()) {
@@ -276,7 +275,7 @@ QueryResult Query::run(const QueryPrimitive &primitive,
  // There is some logic duplication here and in QueryResult::do_min_of_real.
  if (type == QueryType::MIN_OF) {
  std::vector<QueryResult> results;
- std::vector<const QueryResult *> results_ptrs;
+ std::vector<QueryResult *> results_ptrs;
  results.reserve(queries.size());
  results_ptrs.reserve(queries.size());
  int cutoff = count;

diff --git a/libursa/QueryOptimizer.cpp b/libursa/QueryOptimizer.cpp
@@ -32,7 +32,7 @@ Query simplify_subqueries(Query &&q) {
  return std::move(Query(q.get_type(), std::move(newqueries)));
 }
 
-// This optimization simplifies trivial (one operant) operations:
+// This optimization simplifies trivial (one operand) operations:
 // AND(x) --> x
 // OR(x) --> x
 Query flatten_trivial_operations(Query &&q, bool *changed) {

diff --git a/libursa/QueryResult.cpp b/libursa/QueryResult.cpp
@@ -2,31 +2,31 @@
 
 #include <algorithm>
 
-void QueryResult::do_or(const QueryResult &other, QueryCounter *counter) {
+void QueryResult::do_or(QueryResult &&other, QueryCounter *counter) {
  auto op = QueryOperation(counter);
  if (this->is_everything() || other.is_everything()) {
  has_everything = true;
- results = SortedRun();
+ results = std::move(SortedRun());
  } else {
  results.do_or(other.results);
  }
 }
 
-void QueryResult::do_and(const QueryResult &other, QueryCounter *counter) {
+void QueryResult::do_and(QueryResult &&other, QueryCounter *counter) {
  auto op = QueryOperation(counter);
  if (other.is_everything()) {
  } else if (this->is_everything()) {
- results = other.results;
+ results = std::move(other.results);
  has_everything = other.has_everything;
  } else {
  results.do_and(other.results);
  }
 }
 
-QueryResult QueryResult::do_min_of_real(
- int cutoff, const std::vector<const QueryResult *> &sources) {
- std::vector<const SortedRun *> nontrivial_sources;
- for (const auto *source : sources) {
+QueryResult QueryResult::do_min_of_real(int cutoff,
+  std::vector<QueryResult *> &sources) {
+ std::vector<SortedRun *> nontrivial_sources;
+ for (QueryResult *source : sources) {
  if (source->is_everything()) {
  cutoff -= 1;
  } else if (!source->is_empty()) {
@@ -66,9 +66,9 @@ QueryResult QueryResult::do_min_of_real(
  return QueryResult(SortedRun::pick_common(cutoff, nontrivial_sources));
 }
 
-QueryResult QueryResult::do_min_of(
- int cutoff, const std::vector<const QueryResult *> &sources,
- QueryCounter *counter) {
+QueryResult QueryResult::do_min_of(int cutoff,
+  std::vector<QueryResult *> &sources,
+  QueryCounter *counter) {
  // TODO: sources can be mutable here, to save us some copies later.
  QueryOperation op(counter);
  QueryResult out{do_min_of_real(cutoff, sources)};

diff --git a/libursa/QueryResult.h b/libursa/QueryResult.h
@@ -15,8 +15,8 @@ class QueryResult {
 
  QueryResult() : results{}, has_everything{true} {}
 
- static QueryResult do_min_of_real(
- int cutoff, const std::vector<const QueryResult *> &sources);
+ static QueryResult do_min_of_real(int cutoff,
+  std::vector<QueryResult *> &sources);
 
  public:
  QueryResult(QueryResult &&other) = default;
@@ -28,12 +28,12 @@ class QueryResult {
 
  static QueryResult everything() { return QueryResult(); }
 
- void do_or(const QueryResult &other, QueryCounter *counter);
- void do_and(const QueryResult &other, QueryCounter *counter);
+ void do_or(QueryResult &&other, QueryCounter *counter);
+ void do_and(QueryResult &&other, QueryCounter *counter);
 
- static QueryResult do_min_of(
- int cutoff, const std::vector<const QueryResult *> &sources,
- QueryCounter *counter);
+ static QueryResult do_min_of(int cutoff,
+  std::vector<QueryResult *> &sources,
+  QueryCounter *counter);
 
  // If true, means that QueryResults represents special "uninitialized"
  // value, "set of all FileIds in DataSet".
@@ -44,4 +44,5 @@ class QueryResult {
  bool is_empty() const { return !has_everything && results.empty(); }
 
  const SortedRun &vector() const { return results; }
+ SortedRun &vector() { return results; }
 };
diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp
@@ -1,23 +1,100 @@
 #include "SortedRun.h"
 
 #include <algorithm>
+#include <stdexcept>
 
-void SortedRun::do_or(const SortedRun &other) {
+#include "Utils.h"
+
+uint32_t RunIterator::current() const {
+ uint64_t acc = 0;
+ uint32_t shift = 0;
+ for (uint8_t *it = pos_;; it++) {
+ uint32_t next = *it;
+ acc += (next & 0x7FU) << shift;
+ shift += 7U;
+ if ((next & 0x80U) == 0) {
+ return prev_ + acc + 1;
+ }
+ }
+}
+
+uint8_t *RunIterator::nextpos() {
+ for (uint8_t *it = pos_;; it++) {
+ if ((*it & 0x80) == 0) {
+ return it + 1;
+ }
+ }
+}
+
+void SortedRun::validate_compression(bool expected) {
+ if (!empty() && is_compressed() != expected) {
+ throw std::runtime_error("Run was in invalid compression state");
+ }
+}
+
+std::vector<uint32_t>::iterator SortedRun::begin() {
+ validate_compression(false);
+ return sequence_.begin();
+}
+
+std::vector<uint32_t>::iterator SortedRun::end() {
+ validate_compression(false);
+ return sequence_.end();
+}
+
+RunIterator SortedRun::comp_begin() {
+ validate_compression(true);
+ return RunIterator(run_.data());
+}
+
+RunIterator SortedRun::comp_end() {
+ validate_compression(true);
+ return RunIterator(run_.data() + run_.size());
+}
+
+void SortedRun::do_or(SortedRun &other) {
+ // In almost every case this is already decompressed.
+ decompress();
  std::vector<FileId> new_results;
- std::set_union(other.begin(), other.end(), sequence_.begin(),
- sequence_.end(), std::back_inserter(new_results));
+ if (other.is_compressed()) {
+ // Unlikely case, in most cases both runs are already decompressed.
+ std::set_union(other.comp_begin(), other.comp_end(), begin(), end(),
+ std::back_inserter(new_results));
+ } else {
+ std::set_union(other.begin(), other.end(), begin(), end(),
+ std::back_inserter(new_results));
+ }
  std::swap(new_results, sequence_);
 }
 
-void SortedRun::do_and(const SortedRun &other) {
- auto new_end =
- std::set_intersection(other.begin(), other.end(), sequence_.begin(),
- sequence_.end(), sequence_.begin());
+void SortedRun::do_and(SortedRun &other) {
+ // Benchmarking shows that handling a situation where this->is_compressed()
+ // makes the code *slower*. I assume that's because of memory efficiency.
+ decompress();
+ std::vector<uint32_t>::iterator new_end;
+ if (other.is_compressed()) {
+ new_end = std::set_intersection(other.comp_begin(), other.comp_end(),
+ begin(), end(), begin());
+ } else {
+ new_end = std::set_intersection(other.begin(), other.end(), begin(),
+ end(), begin());
+ }
  sequence_.erase(new_end, sequence_.end());
 }
 
-SortedRun SortedRun::pick_common(
- int cutoff, const std::vector<const SortedRun *> &sources) {
+void SortedRun::decompress() {
+ if (run_.empty()) {
+ // Already decompressed
+ return;
+ }
+
+ sequence_ = read_compressed_run(run_.data(), run_.data() + run_.size());
+ std::vector<uint8_t> empty;
+ run_.swap(empty);
+}
+
+SortedRun SortedRun::pick_common(int cutoff,
+ std::vector<SortedRun *> &sources) {
  // returns all FileIds which appear at least `cutoff` times among provided
  // `sources`
  using FileIdRange = std::pair<std::vector<FileId>::const_iterator,
@@ -27,9 +104,9 @@ SortedRun SortedRun::pick_common(
  heads.reserve(sources.size());
 
  for (auto source : sources) {
+ source->decompress();
  if (!source->empty()) {
- heads.emplace_back(
- std::make_pair(source->cbegin(), source->cend()));
+ heads.emplace_back(std::make_pair(source->begin(), source->end()));
  }
  }
 
@@ -70,3 +147,8 @@ SortedRun SortedRun::pick_common(
 
  return SortedRun(std::move(result));
 }
+
+const std::vector<uint32_t> &SortedRun::decompressed() {
+ decompress();
+ return sequence_;
+}