Skip to content

Commit

Permalink
Merge pull request #32409 from vespa-engine/toregge/prepare-for-prese…
Browse files Browse the repository at this point in the history
…rving-weights-in-range-and-prefix-searches

Prepare for preserving weights in range and prefix searches.
  • Loading branch information
geirst committed Sep 17, 2024
2 parents 0a37742 + e44df6d commit a9c942f
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 1 deletion.
76 changes: 76 additions & 0 deletions searchlib/src/tests/attribute/searchcontext/searchcontext_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <vespa/vespalib/util/stringfmt.h>
#include <filesystem>
#include <initializer_list>
#include <memory>
#include <set>

#include <vespa/log/log.h>
Expand Down Expand Up @@ -65,6 +66,7 @@ using attribute::BasicType;
using attribute::CollectionType;
using attribute::Config;
using attribute::HitEstimate;
using attribute::PostingListSearchContext;
using attribute::SearchContextParams;
using attribute::test::AttributeBuilder;
using fef::MatchData;
Expand Down Expand Up @@ -161,6 +163,7 @@ class SearchContextTest : public ::testing::Test
ConfigMap _floatCfg;
ConfigMap _stringCfg;
static std::string _test_dir;
static bool _default_preserve_weight;

static AttributePtr create_as(const AttributeVector& attr, const std::string& name_suffix);

Expand Down Expand Up @@ -271,6 +274,9 @@ class SearchContextTest : public ::testing::Test
// test prefix search
void testPrefixSearch(const std::string& name, const Config& cfg);

// Test prefix search with weight information
void test_weighted_prefix_search(const std::string& name, const Config& cfg);

// test fuzzy search
void testFuzzySearch(const std::string& name, const Config& cfg);

Expand Down Expand Up @@ -306,9 +312,11 @@ class SearchContextTest : public ::testing::Test
~SearchContextTest() override;
static void SetUpTestSuite();
static void TearDownTestSuite();
void SetUp() override;
};

// Scratch directory that SetUpTestSuite() wipes and recreates and that
// TearDownTestSuite() removes again.
std::string SearchContextTest::_test_dir = "test_data";
// Value of PostingListSearchContext's preserve-weight flag as sampled in
// SetUpTestSuite(); SetUp() writes it back before each test.
bool SearchContextTest::_default_preserve_weight = false;

SearchContextTest::SearchContextTest() :
_integerCfg(),
Expand All @@ -327,6 +335,7 @@ SearchContextTest::SetUpTestSuite()
{
std::filesystem::remove_all(_test_dir);
std::filesystem::create_directory(_test_dir);
_default_preserve_weight = PostingListSearchContext::get_preserve_weight();
}

void
Expand All @@ -335,6 +344,12 @@ SearchContextTest::TearDownTestSuite()
std::filesystem::remove_all(_test_dir);
}

// Per-test fixture setup: writes the preserve-weight flag value that
// SetUpTestSuite() sampled back into PostingListSearchContext, so a test that
// toggles the flag does not leak that setting into the tests that run after it.
void
SearchContextTest::SetUp()
{
    const bool suite_default = _default_preserve_weight;
    PostingListSearchContext::set_preserve_weight(suite_default);
}

void
SearchContextTest::addReservedDoc(AttributeVector &ptr)
{
Expand Down Expand Up @@ -1547,6 +1562,67 @@ TEST_F(SearchContextTest, test_prefix_search)
}
}

/*
 * Checks the weight unpacked for docid 1 by a prefix search, with the
 * PostingListSearchContext preserve-weight flag both off and on.
 *
 * Data set (docids 1-9 are populated, 800 docs are added in total):
 *  - multi-value attributes: every docid gets "a" (3), "A" (2) and "aa" (10);
 *    docid 1 additionally gets "aaa" (300), "AAA" (200) and "aaaa" (1000).
 *  - single-value attributes: every docid gets "aaa".
 *
 * Two prefixes are searched: "a" (the "common word" case) and "aaa".
 *
 * name  Attribute name, also used as trace label.
 * cfg   Attribute config; the attribute built from it must be a StringAttribute.
 */
void
SearchContextTest::test_weighted_prefix_search(const std::string& name, const Config& cfg)
{
    SCOPED_TRACE(name);
    auto attr = AttributeBuilder(name, cfg).get();
    auto string_attr = std::dynamic_pointer_cast<StringAttribute>(attr);
    ASSERT_TRUE(string_attr);
    attr->addDocs(800);

    std::string val_a("a");
    std::string val_A("A");
    std::string val_aa("aa");
    std::string val_aaa("aaa");
    std::string val_AAA("AAA");
    std::string val_aaaa("aaaa");

    // Populate docids 1..9; only docid 1 receives the three longest values.
    for (uint32_t docid = 1; docid < 10; ++docid) {
        if (!attr->hasMultiValue()) {
            string_attr->update(docid, val_aaa);
            continue;
        }
        string_attr->append(docid, val_a, 3);
        string_attr->append(docid, val_A, 2);
        string_attr->append(docid, val_aa, 10);
        if (docid == 1) {
            string_attr->append(docid, val_aaa, 300);
            string_attr->append(docid, val_AAA, 200);
            string_attr->append(docid, val_aaaa, 1000);
        }
    }
    attr->commit();

    for (const bool preserve_weight : { false, true }) {
        SCOPED_TRACE(std::string("preserve_weight=") + (preserve_weight ? "true" : "false"));
        PostingListSearchContext::set_preserve_weight(preserve_weight);
        for (const bool common_word : { false, true }) {
            SCOPED_TRACE(std::string("common_word=") + (common_word ? "true" : "false"));
            // md is declared before the search context and iterator so that it outlives both.
            TermFieldMatchData md;
            const std::string& prefix = common_word ? val_a : val_aaa;
            auto sc = getSearch(*attr, prefix, TermType::PREFIXTERM);
            sc->fetchPostings(queryeval::ExecuteInfo::FULL, true);
            auto itr = sc->createIterator(&md, true);
            itr->initRange(1, attr->getCommittedDocIdLimit());
            EXPECT_TRUE(itr->seek(1));
            itr->unpack(1);
            EXPECT_EQ(1, md.getDocId());

            // Weight 1 is expected only for the combination: flag off, common
            // word prefix, fast-search attribute. In every other combination
            // the expected weight is the sum over all values in docid 1 that
            // match the prefix (each array element counting as 1).
            int32_t expected_weight = 1;
            if (preserve_weight || !common_word || !cfg.fastSearch()) {
                if (attr->hasWeightedSetType()) {
                    expected_weight = common_word ? (1000 + 300 + 200 + 10 + 3 + 2)
                                                  : (1000 + 300 + 200);
                } else if (attr->hasMultiValue()) {
                    expected_weight = common_word ? (1 + 1 + 1 + 1 + 1 + 1)
                                                  : (1 + 1 + 1);
                }
            }
            EXPECT_EQ(expected_weight, md.getWeight());
        }
    }
}

// Runs the weighted prefix search check once per entry in _stringCfg,
// passing the entry's key as attribute name and its value as config.
TEST_F(SearchContextTest, test_weighted_prefix_search)
{
    for (const auto& [attr_name, attr_cfg] : _stringCfg) {
        test_weighted_prefix_search(attr_name, attr_cfg);
    }
}

//-----------------------------------------------------------------------------
// Test fuzzy search
//-----------------------------------------------------------------------------
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ namespace search::attribute {

using vespalib::btree::BTreeNode;

// Class-wide flag, off by default. It is changed through set_preserve_weight(),
// which the class declaration marks as being used by unit tests.
bool PostingListSearchContext::_preserve_weight = false;

PostingListSearchContext::
PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_dictionary, uint32_t docIdLimit,
uint64_t numValues, bool useBitVector, const ISearchContext &baseSearchCtx)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ class PostingListSearchContext : public IPostingListSearchContext
EntryRef _frozenRoot; // Posting list in tree form
bool _useBitVector;
mutable std::optional<size_t> _estimated_hits; // Snapshot of size of posting lists in range
static bool _preserve_weight; // Use temporary posting list with weight information

PostingListSearchContext(const IEnumStoreDictionary& dictionary, bool has_btree_dictionary, uint32_t docIdLimit,
uint64_t numValues, bool useBitVector, const ISearchContext &baseSearchCtx);
Expand Down Expand Up @@ -78,6 +79,11 @@ class PostingListSearchContext : public IPostingListSearchContext
* by looking at the posting lists in the range [lower, upper>.
*/
virtual size_t calc_estimated_hits_in_range() const = 0;

public:
// Used by unit tests.
// Accessors for the static _preserve_weight flag. The flag is a plain static
// bool, i.e. a single value shared by all PostingListSearchContext instances.
// Returns the current value of the flag.
static bool get_preserve_weight() noexcept { return _preserve_weight; }
// Overwrites the flag with the given value.
static void set_preserve_weight(bool value) noexcept { _preserve_weight = value; }
};


Expand All @@ -100,6 +106,8 @@ class PostingListSearchContextT : public PostingListSearchContext
*/
PostingListMerger<DataT> _merger;

// Compile-time flag: true unless DataT is vespalib::btree::BTreeNoLeafData.
// NOTE(review): presumably BTreeNoLeafData postings carry no per-document
// weight, which is what the name refers to - confirm.
static constexpr bool merged_array_has_weight = !std::is_same_v<DataT, vespalib::btree::BTreeNoLeafData>;

PostingListSearchContextT(const IEnumStoreDictionary& dictionary, uint32_t docIdLimit, uint64_t numValues,
const PostingStore& posting_store,
bool useBitVector, const ISearchContext &baseSearchCtx);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,9 @@ PostingListSearchContextT<DataT>::fetchPostings(const ExecuteInfo & exec_info, b
if (!_merger.merge_done() && _uniqueValues >= 2u && this->_dictionary.get_has_btree_dictionary()) {
if (strict || use_posting_lists_when_non_strict(exec_info)) {
size_t sum = estimated_hits_in_range();
bool force_array = merged_array_has_weight && _preserve_weight && !_useBitVector;
//TODO Honour soft_doom and forward it to merge code
if (sum < (_docIdLimit * threshold_for_using_array)) {
if (sum < (_docIdLimit * threshold_for_using_array) || force_array) {
_merger.reserveArray(_uniqueValues, sum);
fillArray();
} else {
Expand Down

0 comments on commit a9c942f

Please sign in to comment.