-
Notifications
You must be signed in to change notification settings - Fork 20
Matched documents handler example
Mark Papadakis edited this page Aug 31, 2017
·
3 revisions
// This MatchedIndexDocumentsFilter subclass demonstrates how you
// consider the token instances in the query to score based on proximity
// In a future update, this implementation will be updated to demonstrate how you can take advantage of rep(etitions) and
// how you could go about scoring based on payload.
struct MatchesHandler final
: public MatchedIndexDocumentsFilter
{
// A unique term's hits
struct query_term_hits
{
const term_hit *it;
const term_hit *end;
bool considered; // XXX: maybe there is a way to avoid the extra ivar
};
// distinct (query index, toNextSpan) among all matched terms
struct tracked
{
query_term_hits *th;
uint16_t index;
uint8_t toNextSpan;
exec_term_id_t termID;
};
// A sequence of span > 1 that we either accept or reject
struct tracked_span
{
uint16_t queryTokenIndex;
exec_term_id_t termID;
tokenpos_t pos;
uint16_t span;
};
query_term_hits qthStorage[Trinity::Limits::MaxQueryTokens];
tracked allTracked[Trinity::Limits::MaxQueryTokens];
query_term_hits *allQTH[Trinity::Limits::MaxQueryTokens];
std::vector<tracked_span> trackedSpans;
void consider_sequence(Trinity::DocWordsSpace *const dws, const tokenpos_t pos, const uint16_t queryIndex, uint16_t *__restrict__ const span)
{
static constexpr bool trace{false};
if (auto adjacent = queryIndicesTerms[queryIndex])
{
const auto nextPos = pos + 1;
const auto cnt = adjacent->cnt;
for (uint32_t i{0}; i != cnt; ++i)
{
const auto &pair = adjacent->uniques[i];
const auto termID = pair.first;
if (dws->test(termID, pos))
{
(*span)++;
consider_sequence(nextPos, queryIndex + pair.second, span);
}
}
}
}
void consider(const matched_document &match) override final
{
auto *const dws{match.dws};
const auto totalMatchedTerms{match.matchedTermsCnt};
uint16_t rem{0}, remQTH{0};
uint32_t score{0};
// We need to consider all possible combinations of (starting query index, toNextSpan)
// Consider the query:
// [world of warcraft game is warcraft of blizzard]
// We have two ocurrences of [of] in the query, and we need to consider
// [of warcraft...] first, and then [warcraft of..] and then [of blizzard..]
//
// If we were to process of occurrences first, then we 'd process
// [of warcraft ..] and then [of blizzard..] but because we dws->unset() accepted sequences, if we were
// to process [of blizzard..] before [warcraft is blizzard] we 'd miss that 3 token sequence.
//
// This is also about query tokens where the adjacent token is not found at query token index + 1. e.g
// [world of (warcraft OR (war craft)) mists warcraft is a blizzard game]
// for [warcraft] at index 1, its next logical token is [mists] and its not found at [warcraft] index + 1, whereas
// for the next [warcraft] token in the query, its next logical token is right next to it ([is]).
//
// Trinity::phrase decl.comments. assign_query_indices() is responsible for assinging indices and toNextSpan
for (uint32_t i{0}; i != totalMatchedTerms; ++i)
{
const auto mt = match.matchedTerms + i;
const auto qi = mt->queryCtx;
auto p = qthStorage + i;
const auto instancesCnt = qi->instancesCnt;
p->it = mt->hits->all;
p->end = p->it + mt->hits->freq;
p->considered = false;
allQTH[remQTH++] = p;
for (uint32_t i{0}; i != instancesCnt; ++i)
allTracked[rem++] = {p, qi->instances[i].index, qi->instances[i].toNextSpan, qi->term.id};
}
std::sort(allTracked, allTracked + rem, [](const auto &a, const auto &b) {
return a.index < b.index;
});
for (;;)
{
trackedSpans.clear();
// It is important that we consider all query tokens
// and then advance hits iterator for all matched terms afterwards,
// because e.g for query [world OF warcraft mists OF pandaria]
// [of] is found twice in the query and we want to check at position X
// for both [OF pandaria] and [OF warcraft mists..], but we can't do that one term index at a time because
// of reasons described earlier.
for (uint32_t i{0}; i < rem;)
{
const auto it = allTracked + i;
auto th = it->th;
if (unlikely(th->it == th->end))
{
if (--rem == 0)
goto l10;
else
memmove(it, it + 1, (rem - i) * sizeof(allTracked[0]));
}
else
{
if (const auto pos = th->it->pos)
{
uint16_t span{1};
consider_sequence(dws, pos + 1, it->index + it->toNextSpan, &span);
if (span != 1)
trackedSpans.push_back({it->index, it->termID, pos, span});
else
{
// XXX:
// for e.g query [barrack obama]
// check if obama is found _two_ positions ahead of barrack
// so that we can e.g boost barrack HUSSEIN obama matches
}
}
if (!th->considered)
{
// first termID to consider for this hit (if we have the same term in multiple query indices
// we need to guard against that)
th->considered = true;
// check payload etc here
++score;
}
++i;
}
}
std::sort(trackedSpans.begin(), trackedSpans.end(), [](const auto &a, const auto &b) {
return a.pos < b.pos || (a.pos == b.pos && b.span < a.span);
});
for (const auto *p = trackedSpans.data(), *const e = p + trackedSpans.size(); p != e;)
{
const auto pos = p->pos;
const auto span = p->span;
const auto l = pos + span;
if (span > 4)
score += 25;
else
{
static constexpr uint8_t v[] = {0, 0, 5, 8, 14, 20};
score += v[span];
}
// this is important
for (uint32_t i = pos; i != l; ++i)
dws->unset(i);
// Ignore overlaps
for (++p; p != e && p->pos < l; ++p)
continue;
}
for (uint32_t i{0}; i < remQTH;)
{
if (++allQTH[i]->it == allQTH[i]->end)
{
if (--remQTH == 0)
goto l10;
else
allQTH[i] = allQTH[remQTH];
}
else
{
allQTH[i]->considered = false;
++i;
}
}
}
l10:;
// do whatever with (socre, match.id)
}
};