Skip to content

Commit

Permalink
opt8: keep runs in compressed form (#227)
Browse files Browse the repository at this point in the history
  • Loading branch information
msm-cert authored Oct 6, 2024
1 parent 93b169a commit cdbf0c1
Show file tree
Hide file tree
Showing 12 changed files with 203 additions and 76 deletions.
2 changes: 1 addition & 1 deletion libursa/OnDiskDataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ void OnDiskDataset::execute(const Query &query, ResultWriter *out,
files_index->for_each_filename(
[&out](const std::string &fname) { out->push_back(fname); });
} else {
for (const auto &fid : result.vector()) {
for (const auto &fid : result.vector().decompressed()) {
out->push_back(get_file_name(fid));
}
}
Expand Down
10 changes: 4 additions & 6 deletions libursa/OnDiskIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ std::pair<uint64_t, uint64_t> OnDiskIndex::get_run_offsets(
return std::make_pair(ptrs[0], ptrs[1]);
}

std::vector<FileId> OnDiskIndex::get_run(uint64_t ptr,
uint64_t next_ptr) const {
SortedRun OnDiskIndex::get_run(uint64_t ptr, uint64_t next_ptr) const {
uint64_t run_length = next_ptr - ptr;

if (ptr > next_ptr || next_ptr > index_size) {
Expand All @@ -86,12 +85,11 @@ std::vector<FileId> OnDiskIndex::get_run(uint64_t ptr,

std::vector<uint8_t> run_bytes(run_length);
ndxfile.pread(run_bytes.data(), run_length, ptr);
return read_compressed_run(run_bytes.data(),
run_bytes.data() + run_bytes.size());
return SortedRun(std::move(run_bytes));
}

std::vector<FileId> OnDiskIndex::query_primitive(TriGram trigram,
QueryCounter *counter) const {
SortedRun OnDiskIndex::query_primitive(TriGram trigram,
QueryCounter *counter) const {
auto op = QueryOperation(counter);
std::pair<uint64_t, uint64_t> offsets = get_run_offsets(trigram);
return get_run(offsets.first, offsets.second);
Expand Down
5 changes: 2 additions & 3 deletions libursa/OnDiskIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,8 @@ class OnDiskIndex {
IndexType ntype;

static constexpr uint32_t VERSION = 6;
std::vector<FileId> get_run(uint64_t ptr, uint64_t next_ptr) const;
std::vector<FileId> query_primitive(TriGram trigram,
QueryCounter *counter) const;
SortedRun get_run(uint64_t ptr, uint64_t next_ptr) const;
SortedRun query_primitive(TriGram trigram, QueryCounter *counter) const;
std::pair<uint64_t, uint64_t> get_run_offsets(TriGram trigram) const;

static void on_disk_merge_core(const std::vector<IndexMergeHelper> &indexes,
Expand Down
5 changes: 2 additions & 3 deletions libursa/Query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ void Query::prefetch(int from_index, int howmany, bool only_last,
if (only_last && (i + 1 != howmany)) {
continue;
}
spdlog::debug("prefetching {}", ndx);
prefetcher(queries[ndx].ngram);
}
}
Expand Down Expand Up @@ -260,7 +259,7 @@ QueryResult Query::run(const QueryPrimitive &primitive,
// Case: or. Short circuits when result is already everything.
if (type == QueryType::OR) {
auto result = QueryResult::empty();
for (const auto &query : queries) {
for (auto &query : queries) {
result.do_or(query.run(primitive, prefetcher, counters),
&counters->ors());
if (result.is_everything()) {
Expand All @@ -276,7 +275,7 @@ QueryResult Query::run(const QueryPrimitive &primitive,
// There is some logic duplication here and in QueryResult::do_min_of_real.
if (type == QueryType::MIN_OF) {
std::vector<QueryResult> results;
std::vector<const QueryResult *> results_ptrs;
std::vector<QueryResult *> results_ptrs;
results.reserve(queries.size());
results_ptrs.reserve(queries.size());
int cutoff = count;
Expand Down
2 changes: 1 addition & 1 deletion libursa/QueryOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Query simplify_subqueries(Query &&q) {
return std::move(Query(q.get_type(), std::move(newqueries)));
}

// This optimization simplifies trivial (one operant) operations:
// This optimization simplifies trivial (one operand) operations:
// AND(x) --> x
// OR(x) --> x
Query flatten_trivial_operations(Query &&q, bool *changed) {
Expand Down
22 changes: 11 additions & 11 deletions libursa/QueryResult.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,31 @@

#include <algorithm>

void QueryResult::do_or(const QueryResult &other, QueryCounter *counter) {
void QueryResult::do_or(QueryResult &&other, QueryCounter *counter) {
auto op = QueryOperation(counter);
if (this->is_everything() || other.is_everything()) {
has_everything = true;
results = SortedRun();
results = std::move(SortedRun());
} else {
results.do_or(other.results);
}
}

void QueryResult::do_and(const QueryResult &other, QueryCounter *counter) {
void QueryResult::do_and(QueryResult &&other, QueryCounter *counter) {
auto op = QueryOperation(counter);
if (other.is_everything()) {
} else if (this->is_everything()) {
results = other.results;
results = std::move(other.results);
has_everything = other.has_everything;
} else {
results.do_and(other.results);
}
}

QueryResult QueryResult::do_min_of_real(
int cutoff, const std::vector<const QueryResult *> &sources) {
std::vector<const SortedRun *> nontrivial_sources;
for (const auto *source : sources) {
QueryResult QueryResult::do_min_of_real(int cutoff,
std::vector<QueryResult *> &sources) {
std::vector<SortedRun *> nontrivial_sources;
for (QueryResult *source : sources) {
if (source->is_everything()) {
cutoff -= 1;
} else if (!source->is_empty()) {
Expand Down Expand Up @@ -66,9 +66,9 @@ QueryResult QueryResult::do_min_of_real(
return QueryResult(SortedRun::pick_common(cutoff, nontrivial_sources));
}

QueryResult QueryResult::do_min_of(
int cutoff, const std::vector<const QueryResult *> &sources,
QueryCounter *counter) {
QueryResult QueryResult::do_min_of(int cutoff,
std::vector<QueryResult *> &sources,
QueryCounter *counter) {
// TODO: sources can be mutable here, to save us some copies later.
QueryOperation op(counter);
QueryResult out{do_min_of_real(cutoff, sources)};
Expand Down
15 changes: 8 additions & 7 deletions libursa/QueryResult.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ class QueryResult {

QueryResult() : results{}, has_everything{true} {}

static QueryResult do_min_of_real(
int cutoff, const std::vector<const QueryResult *> &sources);
static QueryResult do_min_of_real(int cutoff,
std::vector<QueryResult *> &sources);

public:
QueryResult(QueryResult &&other) = default;
Expand All @@ -28,12 +28,12 @@ class QueryResult {

static QueryResult everything() { return QueryResult(); }

void do_or(const QueryResult &other, QueryCounter *counter);
void do_and(const QueryResult &other, QueryCounter *counter);
void do_or(QueryResult &&other, QueryCounter *counter);
void do_and(QueryResult &&other, QueryCounter *counter);

static QueryResult do_min_of(
int cutoff, const std::vector<const QueryResult *> &sources,
QueryCounter *counter);
static QueryResult do_min_of(int cutoff,
std::vector<QueryResult *> &sources,
QueryCounter *counter);

// If true, means that QueryResults represents special "uninitialized"
// value, "set of all FileIds in DataSet".
Expand All @@ -44,4 +44,5 @@ class QueryResult {
bool is_empty() const { return !has_everything && results.empty(); }

const SortedRun &vector() const { return results; }
SortedRun &vector() { return results; }
};
104 changes: 93 additions & 11 deletions libursa/SortedRun.cpp
Original file line number Diff line number Diff line change
@@ -1,23 +1,100 @@
#include "SortedRun.h"

#include <algorithm>
#include <stdexcept>

void SortedRun::do_or(const SortedRun &other) {
#include "Utils.h"

// Decodes the value stored at pos_ without advancing the iterator.
//
// The encoding read here is a little-endian base-128 varint: every byte
// contributes its low 7 bits (least-significant group first) and a byte with
// the high bit (0x80) clear terminates the value. The decoded number is a gap
// relative to the previously produced element, so the result is
// prev_ + gap + 1, truncated to uint32_t by the return type.
//
// NOTE: there is no end-of-buffer check; the loop relies on a terminating
// byte being present at or after pos_.
uint32_t RunIterator::current() const {
    uint64_t acc = 0;
    uint32_t shift = 0;
    for (const uint8_t *it = pos_;; ++it) {
        const uint32_t next = *it;
        // Only bits below position 32 can influence the uint32_t result.
        // Groups past that point are skipped, which keeps the shift count in
        // range (shifting by >= the operand width is undefined behaviour) for
        // over-long encodings. The operand is widened first so the shift is
        // performed in the accumulator's 64-bit type.
        if (shift < 32U) {
            acc += static_cast<uint64_t>(next & 0x7FU) << shift;
            shift += 7U;
        }
        if ((next & 0x80U) == 0) {
            return prev_ + acc + 1;
        }
    }
}

// Returns a pointer to the byte just past the encoded value that starts at
// pos_. Bytes with the high bit (0x80) set are continuation bytes; the first
// byte with that bit clear is the last byte of the value.
uint8_t *RunIterator::nextpos() {
    uint8_t *cursor = pos_;
    // Skip every continuation byte of the current value.
    while ((*cursor & 0x80) != 0) {
        ++cursor;
    }
    // cursor now rests on the terminating byte; step over it.
    return cursor + 1;
}

// Throws std::runtime_error when a non-empty run's compression state differs
// from `expected` (true = compressed, false = decompressed). Empty runs are
// accepted regardless of `expected`.
void SortedRun::validate_compression(bool expected) {
    if (empty()) {
        return;
    }
    if (is_compressed() == expected) {
        return;
    }
    throw std::runtime_error("Run was in invalid compression state");
}

// Iterator to the first element of the decompressed sequence.
// validate_compression(false) throws if the run is non-empty and still
// compressed.
std::vector<uint32_t>::iterator SortedRun::begin() {
    validate_compression(false);
    std::vector<uint32_t>::iterator first = sequence_.begin();
    return first;
}

// Past-the-end iterator of the decompressed sequence.
// validate_compression(false) throws if the run is non-empty and still
// compressed.
std::vector<uint32_t>::iterator SortedRun::end() {
    validate_compression(false);
    std::vector<uint32_t>::iterator last = sequence_.end();
    return last;
}

// RunIterator positioned at the first byte of the compressed buffer.
// validate_compression(true) throws if the run is non-empty and already
// decompressed.
RunIterator SortedRun::comp_begin() {
    validate_compression(true);
    uint8_t *first_byte = run_.data();
    return RunIterator(first_byte);
}

// RunIterator positioned one past the last byte of the compressed buffer.
// validate_compression(true) throws if the run is non-empty and already
// decompressed.
RunIterator SortedRun::comp_end() {
    validate_compression(true);
    uint8_t *past_last_byte = run_.data() + run_.size();
    return RunIterator(past_last_byte);
}

void SortedRun::do_or(SortedRun &other) {
// In almost every case this is already decompressed.
decompress();
std::vector<FileId> new_results;
std::set_union(other.begin(), other.end(), sequence_.begin(),
sequence_.end(), std::back_inserter(new_results));
if (other.is_compressed()) {
// Unlikely case, in most cases both runs are already decompressed.
std::set_union(other.comp_begin(), other.comp_end(), begin(), end(),
std::back_inserter(new_results));
} else {
std::set_union(other.begin(), other.end(), begin(), end(),
std::back_inserter(new_results));
}
std::swap(new_results, sequence_);
}

void SortedRun::do_and(const SortedRun &other) {
auto new_end =
std::set_intersection(other.begin(), other.end(), sequence_.begin(),
sequence_.end(), sequence_.begin());
void SortedRun::do_and(SortedRun &other) {
// Benchmarking shows that handling a situation where this->is_compressed()
// makes the code *slower*. I assume that's because of memory efficiency.
decompress();
std::vector<uint32_t>::iterator new_end;
if (other.is_compressed()) {
new_end = std::set_intersection(other.comp_begin(), other.comp_end(),
begin(), end(), begin());
} else {
new_end = std::set_intersection(other.begin(), other.end(), begin(),
end(), begin());
}
sequence_.erase(new_end, sequence_.end());
}

SortedRun SortedRun::pick_common(
int cutoff, const std::vector<const SortedRun *> &sources) {
// Converts the run from its compressed byte form (run_) into the decoded
// sequence (sequence_). Does nothing when run_ is empty, which this code
// treats as "already decompressed".
void SortedRun::decompress() {
    if (!run_.empty()) {
        uint8_t *const first = run_.data();
        uint8_t *const last = first + run_.size();
        sequence_ = read_compressed_run(first, last);

        // Swap with an empty temporary instead of calling clear(), so the
        // compressed buffer's storage is handed to the temporary and released
        // when it is destroyed.
        std::vector<uint8_t>().swap(run_);
    }
}

SortedRun SortedRun::pick_common(int cutoff,
std::vector<SortedRun *> &sources) {
// returns all FileIds which appear at least `cutoff` times among provided
// `sources`
using FileIdRange = std::pair<std::vector<FileId>::const_iterator,
Expand All @@ -27,9 +104,9 @@ SortedRun SortedRun::pick_common(
heads.reserve(sources.size());

for (auto source : sources) {
source->decompress();
if (!source->empty()) {
heads.emplace_back(
std::make_pair(source->cbegin(), source->cend()));
heads.emplace_back(std::make_pair(source->begin(), source->end()));
}
}

Expand Down Expand Up @@ -70,3 +147,8 @@ SortedRun SortedRun::pick_common(

return SortedRun(std::move(result));
}

const std::vector<uint32_t> &SortedRun::decompressed() {
decompress();
return sequence_;
}
Loading

0 comments on commit cdbf0c1

Please sign in to comment.