Skip to content

Matched documents handler example

Mark Papadakis edited this page Aug 31, 2017 · 3 revisions
// This MatchedIndexDocumentsFilter subclass demonstrates how you 
// consider the token instances in the query to score based on proximity
// In a future update, this implementation will be updated to demonstrate how you can take advantage of rep(etitions) and
// how you could go about scoring based on payload.

struct MatchesHandler final
    : public MatchedIndexDocumentsFilter
{

        // A unique term's hits
        struct query_term_hits
        {
                const term_hit *it;
                const term_hit *end;
                bool considered; // XXX: maybe there is a way to avoid the extra ivar
        };

        // distinct (query index, toNextSpan) among all matched terms
        struct tracked
        {
                query_term_hits *th;
                uint16_t index;
                uint8_t toNextSpan;
                exec_term_id_t termID;
        };

        // A sequence of span  > 1 that we either accept or reject
        struct tracked_span
        {
                uint16_t queryTokenIndex;
                exec_term_id_t termID;
                tokenpos_t pos;
                uint16_t span;
        };

        query_term_hits qthStorage[Trinity::Limits::MaxQueryTokens];
        tracked allTracked[Trinity::Limits::MaxQueryTokens];
        query_term_hits *allQTH[Trinity::Limits::MaxQueryTokens];
        std::vector<tracked_span> trackedSpans;
        

        void consider_sequence(Trinity::DocWordsSpace *const dws, const tokenpos_t pos, const uint16_t queryIndex, uint16_t *__restrict__ const span)
        {
                static constexpr bool trace{false};

                if (auto adjacent = queryIndicesTerms[queryIndex])
                {
                        const auto nextPos = pos + 1;
                        const auto cnt = adjacent->cnt;

                        for (uint32_t i{0}; i != cnt; ++i)
                        {
                                const auto &pair = adjacent->uniques[i];
                                const auto termID = pair.first;

                                if (dws->test(termID, pos))
                                {
                                        (*span)++;
                                        consider_sequence(nextPos, queryIndex + pair.second, span);
                                }
                        }
                }
        }

        void consider(const matched_document &match) override final
        {
                auto *const dws{match.dws};
                const auto totalMatchedTerms{match.matchedTermsCnt};
                uint16_t rem{0}, remQTH{0};
                uint32_t score{0};

                // We need to consider all possible combinations of (starting query index, toNextSpan)
                // Consider the query:
                //               [world of warcraft game is warcraft of blizzard]
                // We have two ocurrences of [of] in the query, and we need to consider
                // [of warcraft...] first, and then [warcraft of..] and then [of blizzard..]
                //
                // If we were to process of occurrences first, then we 'd process
                // [of warcraft ..] and then [of blizzard..] but because we dws->unset() accepted sequences, if we were
                // to process [of blizzard..] before [warcraft is blizzard] we 'd miss that 3 token sequence.
                //
                // This is also about query tokens where the adjacent token is not found at query token index + 1. e.g
                //              [world of (warcraft OR (war craft)) mists  warcraft is a blizzard game]
                // for [warcraft] at index 1, its next logical token is [mists] and its not found at [warcraft] index + 1, whereas
                // for the next [warcraft] token in the query, its next logical token is right next to it ([is]).
                //
                // Trinity::phrase decl.comments. assign_query_indices() is responsible for assinging indices and toNextSpan
                for (uint32_t i{0}; i != totalMatchedTerms; ++i)
                {
                        const auto mt = match.matchedTerms + i;
                        const auto qi = mt->queryCtx;
                        auto p = qthStorage + i;
                        const auto instancesCnt = qi->instancesCnt;

                        p->it = mt->hits->all;
                        p->end = p->it + mt->hits->freq;
                        p->considered = false;

                        allQTH[remQTH++] = p;
                        for (uint32_t i{0}; i != instancesCnt; ++i)
                                allTracked[rem++] = {p, qi->instances[i].index, qi->instances[i].toNextSpan, qi->term.id};
                }

                std::sort(allTracked, allTracked + rem, [](const auto &a, const auto &b) {
                        return a.index < b.index;
                });

                for (;;)
                {
                        trackedSpans.clear();

                        // It is important that we consider all query tokens
                        // and then advance hits iterator for all matched terms afterwards,
                        // because e.g for query [world OF warcraft mists OF pandaria]
                        // [of] is found twice in the query and we want to check at position X
                        // for both [OF pandaria] and [OF warcraft mists..], but we can't do that one term index at a time because
                        // of reasons described earlier.
                        for (uint32_t i{0}; i < rem;)
                        {
                                const auto it = allTracked + i;
                                auto th = it->th;

                                if (unlikely(th->it == th->end))
                                {
                                        if (--rem == 0)
                                                goto l10;
                                        else
                                                memmove(it, it + 1, (rem - i) * sizeof(allTracked[0]));
                                }
                                else
                                {
                                        if (const auto pos = th->it->pos)
                                        {
                                                uint16_t span{1};

                                                consider_sequence(dws, pos + 1, it->index + it->toNextSpan, &span);
                                                if (span != 1)
                                                        trackedSpans.push_back({it->index, it->termID, pos, span});
                                                else
                                                {
                                                        // XXX:
                                                        // for e.g query [barrack obama]
                                                        // check if obama is found _two_ positions ahead of barrack
                                                        // so that we can e.g boost barrack HUSSEIN obama matches
                                                }
                                        }

                                        if (!th->considered)
                                        {
                                                // first termID to consider for this hit (if we have the same term in multiple query indices
                                                // we need to guard against that)
                                                th->considered = true;
                                                //  check payload etc here

                                                ++score;
                                        }

                                        ++i;
                                }
                        }

                        std::sort(trackedSpans.begin(), trackedSpans.end(), [](const auto &a, const auto &b) {
                                return a.pos < b.pos || (a.pos == b.pos && b.span < a.span);
                        });

                        for (const auto *p = trackedSpans.data(), *const e = p + trackedSpans.size(); p != e;)
                        {
                                const auto pos = p->pos;
                                const auto span = p->span;
                                const auto l = pos + span;

                                if (span > 4)
                                        score += 25;
                                else
                                {
                                        static constexpr uint8_t v[] = {0, 0, 5, 8, 14, 20};

                                        score += v[span];
                                }

                                // this is important
                                for (uint32_t i = pos; i != l; ++i)
                                        dws->unset(i);

                                // Ignore overlaps
                                for (++p; p != e && p->pos < l; ++p)
                                        continue;
                        }

                        for (uint32_t i{0}; i < remQTH;)
                        {
                                if (++allQTH[i]->it == allQTH[i]->end)
                                {
                                        if (--remQTH == 0)
                                                goto l10;
                                        else
                                                allQTH[i] = allQTH[remQTH];
                                }
                                else
                                {
                                        allQTH[i]->considered = false;
                                        ++i;
                                }
                        }
                }

        l10:;
                // do whatever with (socre, match.id)

       
        }
};
Clone this wiki locally