#pragma once
#include "common.h"
#include <switch_mallocators.h>
#include <vector>
namespace Trinity {
static constexpr uint8_t UnaryOperatorPrio{100};
static constexpr uint8_t DefaultToNextSpan{0};
enum class Operator : uint8_t {
NONE,
AND,
NOT,
OR,
STRICT_AND, // mostly equivalent to "token"
};
struct phrase;
// A query is an ASTree
//
// Choice of AST vs Lucene's Query Interface:
// Lucene's use of the Query interface, which is to say, building the actual queries by subclassing the Query abstract class, provides some benefits, but gives up
// some useful properties in the process.
// Specifically, it doesn't capture sequence and context. For example, the use of BooleanQuery as a container of BooleanClauses can't be used
// to determine that term A comes before term B, because clause order is not necessarily taken into account
// (you pack clauses of type Occur.MUST, Occur.FILTER, Occur.SHOULD in a list), and furthermore BooleanQuery::rewrite() alters that list
// and doesn't preserve any logical order. It shouldn't be too hard for someone to update the respective Lucene classes to account for that and properly
// track state and context, but in general Lucene does not care about the query structure.
// Trinity's default query execution mode relies on capturing that state and passing information about matched terms and their context in the original query
// to the callback.
//
// On the other hand, it's simpler to just create a BooleanQuery and push BooleanClauses into it. It's also easier to subclass Query and implement
// the various abstract methods in your Lucene application, whereas in Trinity you'd need to define a new ast_node::Type in the Trinity codebase and
// implement whatever's required.
struct ast_node final {
union {
struct
{
ast_node *lhs, *rhs;
Operator op;
constexpr auto normalized_operator() const noexcept {
return op == Operator::STRICT_AND ? Operator::AND : op;
}
} binop;
struct
{
uint16_t size; // number of nodes in `nodes`
uint16_t min; // at least this many must match
ast_node **nodes;
} match_some;
struct
{
ast_node *expr;
Operator op;
} unaryop;
// for both type::Token and type::Phrase
phrase * p;
ast_node *expr;
};
enum class Type : uint8_t {
BinOp = 0,
// We need different types for phrases and tokens, because
// we may have a phrase with a single token, but we need to know that it is indeed a phrase
// so that we won't try to expand it, e.g "ipad" should not be expanded to synonyms etc
Token,
Phrase,
UnaryOp,
Dummy, // also semantically equivalent to both a 'true' node and a useless node. normalize_root() will deal with such nodes (GC)
ConstFalse, // normalize_root() will GC it
// Suppose you want to match special tokens FOO|BAR and FANCY|TERM if they are found in the index, i.e treat them as optional.
// That's easy. You can take the original query, e.g [apple iphone], and transform it like so: [(FOO|BAR OR FANCY|TERM) OR (apple iphone)], and that'd work great.
//
// However, imagine you want to match FOO|BAR and FANCY|TERM but ONLY IF the original query also matches. You can't practically construct that query like the one above.
// This is where you'd use this node. When evaluated, it will always return true after evaluating its expression.
// You will transform the query to [(FOO|BAR OR FANCY|TERM) AND (apple iphone)]
// If [apple iphone] matches, THEN it will attempt to match [FOO|BAR OR FANCY|TERM], but even if that sub-expression fails, it won't matter.
// This allows for this kind of expression and other such use cases.
//
// The only challenge is properly handling this node (and the related exec_node) when capturing leading nodes. See
// the respective methods' implementations for details.
//
// You can also think of this as an 'optional match' node.
ConstTrueExpr,
// This is a special-purpose type; it can be used to specify 2+ nodes and a threshold: at least that many
// of those nodes should be matched during execution.
// Suppose you are building a visual search engine, and you extract all image features (say, about 1000 or so per image). You can then index them, and then for each input image extract
// its own features, use a MatchSome ast_node with all the features extracted from the input image and a threshold set to, say, 50% of the total features in that input image, and then execute
// the query; it will match all images that share at least 50% of their features with the input image. This is going to be very fast and very handy.
MatchSome,
} type;
// this is handy if you want to delete a node
// normalize_root() will GC those nodes
void set_dummy() noexcept {
type = Type::Dummy;
}
constexpr bool is_binop() const noexcept {
return type == Type::BinOp;
}
constexpr bool is_unary() const noexcept {
return type == Type::Phrase || type == Type::Token;
}
constexpr bool is_dummy() const noexcept {
return type == Type::Dummy;
}
constexpr void set_const_false() noexcept {
type = Type::ConstFalse;
}
constexpr bool is_const_false() const noexcept {
return type == Type::ConstFalse;
}
constexpr bool is_token() const noexcept {
return type == Type::Token;
}
constexpr bool is_phrase() const noexcept {
return type == Type::Phrase;
}
static ast_node *make(simple_allocator &a, const Type t) {
auto r = a.Alloc<ast_node>();
r->type = t;
return r;
}
static ast_node *make_match_some(simple_allocator &a, ast_node **nodes, const uint16_t cnt, const uint16_t min) {
auto res = make(a, Type::MatchSome);
EXPECT(cnt);
EXPECT(min <= cnt);
res->match_some.size = cnt;
res->match_some.min = min;
if (const auto required = sizeof(ast_node *) * cnt; a.can_allocate(required))
res->match_some.nodes = a.CopyOf(nodes, cnt);
else {
throw Switch::data_error("Can't fit");
}
return res;
}
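// A minimal MatchSome usage sketch (illustrative; `a` is a simple_allocator and
// t0..t2 are previously built Token nodes -- hypothetical names, not part of the API):
//
//   ast_node *features[] = {t0, t1, t2};
//   auto ms = ast_node::make_match_some(a, features, 3, 2); // match at least 2 of the 3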
static ast_node *make_binop(simple_allocator &a) {
auto res = make(a, Type::BinOp);
// so that we'll catch this if necessary
res->binop.lhs = res->binop.rhs = nullptr;
return res;
}
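// A hand-assembled sketch of the 'optional match' pattern described at Type::ConstTrueExpr
// (illustrative; `a`, `optional_expr` and `original` are hypothetical names):
//
//   auto opt = ast_node::make(a, ast_node::Type::ConstTrueExpr);
//   opt->expr = optional_expr; // e.g the parsed [FOO|BAR OR FANCY|TERM]
//   auto root = ast_node::make_binop(a);
//   root->binop.op = Operator::AND;
//   root->binop.lhs = opt;
//   root->binop.rhs = original; // e.g the parsed [apple iphone]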
ast_node *copy(simple_allocator *const a, std::vector<void *> *);
// same as copy(), except we are not copying tokens
ast_node *shallow_copy(simple_allocator *const a, std::vector<void *> *);
// see query::leader_nodes() comments
bool any_leader_tokens() const;
// Recursively sets p->flags to `flags` to all nodes of this branch
//
// This is helpful when you for example rewrite an original query to incorporate synonyms etc
// and you want to know in your MatchedIndexDocumentsFilter::consume() impl. if the term hit is for a token from
// any of the tokens added to the query after the rewrite.
void set_alltokens_flags(const uint16_t flags);
void set_app_phrase_id(const uint16_t id);
void set_rewrite_range(const range_base<uint16_t, uint8_t>);
void set_rewrite_translation_coeff(const uint16_t span);
size_t nodes_count() const noexcept {
switch (type) {
case Type::BinOp:
return binop.lhs->nodes_count() + binop.rhs->nodes_count() + 1;
case Type::UnaryOp:
return 1 + unaryop.expr->nodes_count();
case Type::ConstTrueExpr:
return 1 + expr->nodes_count();
default:
return 1;
}
}
};
static constexpr auto unary_same_type(const ast_node *a, const ast_node *b) noexcept {
return a->type == b->type && (a->type == ast_node::Type::Phrase || a->type == ast_node::Type::Token);
}
struct term final {
// for now, just the token, but maybe in the future we'll want to extend it to support flags etc
str8_t token;
// We now have this union here, which may be useful for some applications, but is otherwise not used by Trinity
union {
uint32_t u32;
};
};
// This is an AST parser
//
// Encapsulates the input query (text) to be parsed,
// a reference to the allocator to use, and the function to use for parsing tokens from the content
//
// This is mainly used by `Trinity::query` to parse its root node, but you can use it for parsing expressions in order to replace runs with them.
// See app.cpp for examples.
//
// A query only holds its root node and its own allocator, and uses an ast_parser to parse the query input. See query::parse()
//
// Keep in mind that you can always copy nodes/asts using ast_node::copy()
struct ast_parser final {
enum class Flags : uint32_t {
// Treat OR as a regular token
// Amazon.com treats (AND, NOT, OR) as regular tokens, but it does support the '|', '-' and '+' operators
ORAsToken = 1u,
// Treat NOT as a regular token
NOTAsToken = 1u << 1,
// Treat AND as a regular token
ANDAsToken = 1u << 2,
ParseConstTrueExpr = 1u << 3,
ParseMatchSomeExpr = 1u << 4,
};
str32_t content;
const char_t * contentBase;
simple_allocator &allocator;
term terms[Trinity::Limits::MaxPhraseSize];
// It is important that your query token parser's semantics are also implemented in your document content parser
std::pair<uint32_t, uint8_t> (*token_parser)(const str32_t, char_t *, const bool);
const uint32_t parserFlags;
std::vector<str8_t> distinctTokens; // for interning
// facilitates parsing
std::vector<const char *> groupTerm;
char_t lastParsedToken[255];
auto *alloc_node(const ast_node::Type t) {
return ast_node::make(allocator, t);
}
ast_parser(const str32_t input,
simple_allocator &a,
std::pair<uint32_t, uint8_t> (*p)(const str32_t, char_t *, const bool) = default_token_parser_impl,
const uint32_t parserFlags_ = 0)
: content{input}, contentBase{content.data()}, allocator{a}, token_parser{p}, parserFlags{parserFlags_} {
}
// Useful for initializing and later using parse(const str32_t) to parse content
ast_parser(simple_allocator &a,
std::pair<uint32_t, uint8_t> (*p)(const str32_t, char_t *, const bool) = default_token_parser_impl,
const uint32_t parserFlags_ = 0)
: contentBase{nullptr}, allocator{a}, token_parser{p}, parserFlags{parserFlags_} {
}
// You may NOT invoke parse() more than once
// because content, allocator, distinctTokens will already be initialized and updated
//
// Make sure that the allocator you provide is not deleted or reused before you are done accessing
// any query tokens
//
// UPDATE: you can now use reset_and_parse()
// which will reset content and state. This is useful if you really don't want to allocate/init a new ast_parser
// in, say, a loop, and want to reuse it.
// UPDATE: you can use parse(const str32_t) to parse another AST; this is akin to using reset_and_parse(), except that it doesn't reset
// the allocator and distinctTokens.
ast_node *parse();
auto parse(const str32_t c) {
content = c;
contentBase = content.data();
groupTerm.clear();
return parse();
}
auto reset_and_parse(const str32_t c) {
this->allocator.reuse();
distinctTokens.clear();
return parse(c);
}
inline void skip_ws() {
content.trim_leading_whitespace();
}
auto parse_failnode() {
// We no longer return nullptr
// because we really can't fail (not strict)
return alloc_node(ast_node::Type::Dummy);
}
void track_term(term &t);
};
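// A minimal ast_parser usage sketch (illustrative):
//
//   simple_allocator a;
//   ast_parser parser(a, default_token_parser_impl,
//       uint32_t(ast_parser::Flags::ORAsToken)); // treat OR as a regular token
//   auto root = parser.parse("world of warcraft"_s32);
//   // reuse it in a loop without allocating a new parser:
//   root = parser.reset_and_parse("apple OR imac"_s32);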
struct phrase final {
// total terms (1 for a single token)
uint8_t size;
// repetition; e.g for [APPLE APPLE APPLE APPLE] the repetition is 4
// This is important; we need to capture e.g [tom tom] (the manufacturer) and [boing boing],
// but we don't want to capture the same phrase/word twice when it is repeated in order.
// This is useful in MatchedIndexDocumentsFilter::consider() implementations for scoring documents based
// on the query structure and the matched terms.
// See Trinity::rewrite_query(), where we consider this when rewriting sequences
uint8_t rep;
// index in the query
uint16_t index;
// See assign_query_indices() for how this is assigned
//
// SUB-EXPRESSION: A query is broken down into 0+ sub-expressions. When you use the OR operator, it creates an overlapping
// sub-query; the left-hand side overlaps the right-hand side, and the next sub-expression for that overlapped union begins
// past the right-hand side sub-query.
// Examples:
// - [lord of the rings]: this is a 4 sub-expressions query (one for each token)
// - [google OR amazon] : this is 1 sub-expression, not 2 (because google overlaps amazon)
// - [google OR amazon jobs]: 2 sub-expressions, one for the overlapping [google OR amazon] and another for [jobs]
// toNextSpan is the offset from a token (or phrase) to the next sub-expression,
// or 0 if there is no next sub-expression.
//
//
// This is how many terms/tokens to advance from this index (i.e query token) to get to the next term in the query, skipping
// all other tokens in the same OR group as this token. That is, this is the offset from index to the next sub-expression
//
// This is useful in MatchedIndexDocumentsFilter::consider()
//
// For example:
// [world of warcraft OR (war craft) mists of pandaria]
// ---------------------------------
// INDEX | TOKEN    | TONEXTSPAN
// ---------------------------------
// 0     | WORLD    | 1
// 1     | OF       | 1
// 2     | WARCRAFT | 2
// 2     | WAR      | 1
// 3     | CRAFT    | 1
// 4     | MISTS    | 1
// 5     | OF       | 1
// 6     | PANDARIA | 0
// ---------------------------------
// If you are going to try to match a sequence for the matched term 'WARCRAFT',
// the only query instance of it is at index 2, and the next term in the query past
// any other 'equivalent' OR groups (in this case there is only one other OR expression matching warcraft, [war craft]) is 2 positions ahead, at index 4 (i.e at MISTS)
//
// The semantics of (index, toNextSpan) are somewhat complicated, but only because they are required to accurately and relatively effortlessly
// capture sequences. A sequence is 2+ consecutive tokens in a query.
//
// IMPORTANT: It can be 0 if there is no adjacent term. (i.e last term in a sequence or in the query). You can now use query::final_index()
// if you want to compute a range for when toNextSpan = 0, e.g range32_t(index, toNextSpan ?: final_index() - index)
//
// Effectively, the offset of the next sub-expression from index, iff there is a next sub-expression (otherwise it's 0)
uint8_t toNextSpan;
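// A sketch of computing the query range for a matched term in a
// MatchedIndexDocumentsFilter::consider() impl. (illustrative; `p` is a const phrase *
// and `q` the parsed Trinity::query -- hypothetical names):
//
//   const uint16_t span = p->toNextSpan ? p->toNextSpan : uint16_t(q.final_index() - p->index);
//   const range_base<uint16_t, uint16_t> r(p->index, span);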
// flags. Usually 0, but e.g if you are rewriting a [wow] to [wow OR "world of warcraft"] you
// may want the "world of warcraft" flags to be 1 (i.e derived). This can be very useful for scoring matches.
// So you can rely on flags for query expansion/rewrite tagging/tracking, and on matched query terms hits to compute a 'relevance' score
// and maybe another 'context' score(based on personalization, popularity, recency, etc) and then fuse them together to come up with the final score.
//
// You can use it for encoding flags, and state or anything else
// This is not in rewrite_ctx because it's not specifically here for rewrites only.
//
// You can also rely on app_phrase_id for encoding flags. app_phrase_id is a far more versatile choice for
// associating context/weight with query tokens runs.
//
// See ast_node::set_alltokens_flags()
query_term_flags_t flags;
// By default, this is set to 0
// If for example you wanted to use 2 sub-queries, and you wanted the document score to be based on whether the first or the second
// sub-query was matched, you could assign app_phrase_id 1 to the first sub-query and app_phrase_id 2 to the second, and check accordingly.
uint16_t app_phrase_id;
// This is the range in the input query string
// This is handy for e.g spell checking runs where you want to highlight corrected tokens/phrases in the input query
// and you'd rather not have to go through hoops to accomplish it
range_base<uint16_t, uint16_t> inputRange;
struct
{
// A range, which represents the logical span in the input query terms list that was expanded/rewritten/captured.
// For example, rewrite_query() for query [pc games] where "pc games" is expanded to cid:806: [ (pc games) OR cid:806 ]
// [cid:806] range would be [0, 2), [pc] would be [0, 1) and [games] would be [1, 2). This would make it easier to determine that
// e.g cid:806 was matched, which overlaps 'pc' and 'games'
range_base<uint16_t, uint8_t> range;
// We encode expansions and contractions here
// if you rewrite(expand) [cod] to [call of duty], i.e from 1 token to 3, then
// this should be equal to 1/3
// Conversely, if we rewrite(contract) [lord of the rings] to [lotr], i.e from 4 tokens to 1, then
// this should be equal to 1/4 -- i.e std::min(low, high)/std::max(low, high)
float translationCoefficient;
// This is 0, except when we expand or contract a sequence and its output is a single token
// e.g [mac book] => [macbook]. A sequence of two tokens into 1
// however, for e.g
// [mac book] => [apple cool laptops], a sequence of two expanded to a sequence of 3, we can't currently treat
// the final sequence as a single entity; however, we rarely if ever need to care about that kind of expansion, and we will
// come up with something by then.
//
// You should probably just use translationCoefficient.
uint8_t srcSeqSize;
} rewrite_ctx;
// Difference between two ranges:
// query range: [index, index + toNextSpan ?: 1)
// rewrite range: rewrite_ctx.range
//
// For e.g query [wow OR (world of warcraft) game]
// where we used Trinity::rewrite_query() but didn't rewrite the query, so that rewrite_ctx is initialized
// we expect to get:
// [wow] index:0 toNextSpan:3 rr:[1, 2)
// [world] index:0 toNextSpan:1 rr:[2, 3)
// [of] index:1 toNextSpan:1 rr:[3, 4)
// [warcraft] index:2 toNextSpan:1 rr:[4, 5)
// [game] index:3 toNextSpan:0 rr:[0, 1)
//
// for the query [world of warcraft game], where we used Trinity::rewrite_query(), and we contracted 'world of warcraft' to 'wow', we expect to get:
// [world] index:0 toNextSpan:1 rr:[0, 1)
// [of] index:1 toNextSpan:1 rr:[1, 2)
// [warcraft] index:2 toNextSpan:1 rr:[2, 3)
// [wow] index:0 toNextSpan:3 rr:[0, 3)
// [game] index:3 toNextSpan:0 rr:[3, 4)
//
// for the query [wow game], where we used Trinity::rewrite_query() to expand 'wow' to 'world of warcraft', we expect to get:
// [wow] index:0 toNextSpan:3 rr:[0, 1)
// [world] index:0 toNextSpan:1 rr:[0, 1)
// [of] index:1 toNextSpan:1 rr:[0, 1)
// [warcraft] index:2 toNextSpan:2 rr:[0, 1)
// [game] index:3 toNextSpan:0 rr:[1, 2)
//
// As you can see:
// - rewrite range is very useful for identifying phrases and tokens that resulted from rewrites
// - rewrite range is always set, even for the last logical term in a sub-expression (for which we set toNextSpan to 0)
// - we can't identify logically equal sub-expressions by relying on rewrite_ctx.range. For the last example,
// where [wow] and [world of warcraft] are logically equal, we could check for equality by checking
// [index, toNextSpan). index == 0 for both sub-expressions, and we can easily calculate the span of
// the second logical sub-expression and thus figure out that they are identical here.
// - if [game] wasn't part of the query, then toNextSpan would have been 0 for the last token of every sub-expression
// in those examples, so we clearly need a way to deal with those situations. You can now use
// query::final_index() to properly and correctly compute a range when toNextSpan == 0
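// A sketch of telling rewrite-derived terms apart in consider() (illustrative; it
// assumes the rewrite pass tagged injected terms via ast_node::set_alltokens_flags(1),
// and `p` is a hypothetical const phrase *):
//
//   const bool derived = (p->flags & 1) || p->rewrite_ctx.translationCoefficient != 1.0f;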
term terms[0];
bool operator==(const phrase &o) const noexcept {
//WAS: if (size == o.size)
// XXX: should we also check if (this->app_phrase_id == o.app_phrase_id) ?
if (size == o.size && flags == o.flags) {
size_t i;
for (i = 0; i != size && terms[i].token == o.terms[i].token; ++i)
continue;
return i == size;
} else
return false;
}
// handy utility method
static auto make(const str8_t *tokens, const size_t n, simple_allocator *const a) {
auto p = (phrase *)a->Alloc(sizeof(phrase) + sizeof(term) * n);
p->flags = 0;
p->rep = 1;
p->inputRange.reset();
p->app_phrase_id = 0;
p->toNextSpan = DefaultToNextSpan;
p->rewrite_ctx.range.reset();
p->rewrite_ctx.srcSeqSize = 1;
p->rewrite_ctx.translationCoefficient = 1.0;
p->size = n;
for (size_t i{0}; i != n; ++i) {
p->terms[i].token.Set(a->CopyOf(tokens[i].data(), tokens[i].size()), tokens[i].size());
}
return p;
}
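// e.g (illustrative; assumes an _s8 literal for str8_t, analogous to the _s32 literal used elsewhere):
//
//   const str8_t tokens[] = {"world"_s8, "of"_s8, "warcraft"_s8};
//   auto p = phrase::make(tokens, 3, &allocator); // `allocator` is a hypothetical simple_allocator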
};
// see query::normalize()
ast_node *normalize_ast(ast_node *);
// This is really just a container for an AST root node and the allocator
// the AST was allocated from.
// It also provides a few useful methods that operate on the root and may make use of the allocator
struct query final {
ast_node * root;
uint16_t final_index_;
simple_allocator allocator{512 * 3};
// parse() will set tokensParser; this may come in handy elsewhere, e.g see rewrite_query() impl.
std::pair<uint32_t, uint8_t> (*tokensParser)(const str32_t, char_t *, const bool);
// if we can't satisfy the allocation, we use malloc() and keep track of that here
std::vector<void *> large_allocs;
// Normalize a query.
// This is invoked when you initially parse the query, but if you
// rewrite it (i.e its AST) by e.g replacing runs or otherwise modifying or deleting ast nodes
// you must normalize it to fix any issues etc
//
// You can also use Trinity::normalize_ast() if you want to do this yourself
// When you Trinity::exec_query(), it will be normalized again just in case
bool normalize();
/*
A query can have an AND token so that we can consume documents from its docslist and check the query against each of them,
but it can also be a simple OR query (e.g [apple OR imac OR macbook]), in which case we don't have a single token
to consume from. Instead, we need to identify the "lead tokens" (in the case of an AND, that's a single token, but in
the case of an OR sequence, that's all the distinct tokens in that OR sequence), then use a merge-scan to get
the lowest next ID from those, execute the query based on that ID, and then advance all such lowest tokens.
It is a bit more complicated than that, but capture_leader() will properly identify the lead nodes
e.g for
- [ipad pro] : the lead token is ipad. So we just scan the ipad documents and advance those
- [apple OR macbook OR iphone NOT cable]: the lead tokens are (apple, macbook, iphone)
- [apple OR (macbook pro)] : lead tokens are [apple, macbook]
* the execution engine uses a specialised such implementation for exec_nodes,
* for simplicity and performance.
*
* See also ast_node::Type::ConstTrueExpr comments
*/
void leader_nodes(std::vector<ast_node *> *const out);
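// e.g (illustrative; `q` is a hypothetical parsed and normalized query):
//
//   std::vector<ast_node *> leaders;
//   q.leader_nodes(&leaders);
//   for (const auto n : leaders) {
//       // each n is a Token/Phrase node whose docslist will be merge-scanned
//   }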
bool parse(const str32_t in,
std::pair<uint32_t, uint8_t> (*tp)(const str32_t, char_t *, const bool) = default_token_parser_impl,
uint32_t parserFlags = 0);
// This is handy. When we copy a query to another query, we want to make sure
// that tokens point into the destination query's allocator, not the source's, because it is possible for
// the source query to go away
static void bind_tokens_to_allocator(ast_node *, simple_allocator *);
query()
: root{nullptr}, final_index_{0}, tokensParser{nullptr} {
}
~query() {
free_large_allocs();
}
void free_large_allocs() {
for (auto p : large_allocs) {
std::free(p);
}
large_allocs.clear();
}
inline operator bool() const noexcept {
return root;
}
// This would be the index of the first new sub-expression of the query, if
// the query had more sub-expressions.
// It is computed by assign_query_indices(), and it's very handy because toNextSpan is set to 0
// for the last tokens of the query's final sub-expressions, and you may need to
// deal with those kinds of ranges in your applications.
auto final_index() const noexcept {
return final_index_;
}
query(const str32_t in,
std::pair<uint32_t, uint8_t> (*tp)(const str32_t, char_t *, const bool) = default_token_parser_impl,
const uint32_t parserFlags = 0)
: tokensParser{tp} {
if (!parse(in, tp, parserFlags)) {
throw Switch::data_error("Failed to parse query");
}
}
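// A minimal end-to-end sketch (illustrative):
//
//   try {
//       Trinity::query q("apple OR imac OR macbook"_s32);
//       // parse() normalizes; call normalize() again only after editing the AST
//       // ... rewrite q here, then hand it over to exec_query()
//   } catch (const Switch::data_error &) {
//       // parse failure
//   }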
query(ast_node *r)
: root{r}, final_index_{0} {
}
explicit query(const query &o, const bool shallow = false) {
tokensParser = o.tokensParser;
root = o.root
? (shallow ? o.root->shallow_copy(&allocator, &large_allocs)
: o.root->copy(&allocator, &large_allocs))
: nullptr;
final_index_ = o.final_index_;
if (root && false == shallow) {
bind_tokens_to_allocator(this->root, &this->allocator);
}
}
query(query &&o) {
root = std::exchange(o.root, nullptr);
final_index_ = o.final_index_;
tokensParser = o.tokensParser;
allocator = std::move(o.allocator);
// take ownership of the out-of-band allocations, so that o's destructor
// won't free memory that root may still reference
large_allocs = std::move(o.large_allocs);
}
query &operator=(const query &o) {
this->allocator.reuse();
free_large_allocs(); // free, not just clear(), to avoid leaking prior out-of-band allocations
root = o.root
? o.root->copy(&this->allocator, &large_allocs)
: nullptr;
final_index_ = o.final_index_;
tokensParser = o.tokensParser;
if (root) {
bind_tokens_to_allocator(this->root, &this->allocator);
}
return *this;
}
// utility method; returns all nodes
static std::vector<ast_node *> &nodes(ast_node *root, std::vector<ast_node *> *const res);
static auto nodes(ast_node *root) {
std::vector<ast_node *> out;
return nodes(root, &out);
}
auto nodes() const {
return nodes(root);
}
// Computes the number of sub-expressions, where sub-expressions are collated using AND operators.
// OR expressions are properly counted as one, e.g
// (usa OR (united states of (america OR americas))) is a single expression
//
// XXX: This is not a particularly optimal implementation
// make sure you have normalize()d the query before invoking this method
// This is equivalent to subexpressions_offsets().size()
size_t subexpressions_count() const noexcept;
// This returns the phrase::index for all query subexpressions
// which can be very beneficial in your MatchedIndexDocumentsFilter::consider() impl.
std::vector<uint16_t> subexpressions_offsets() const noexcept;
// If all you want to do is remove a node, set it to a dummy.
// If all you want to do is replace a single node (i.e not a run), just replace
// that node's value directly.
// This is a utility method for replacing a run (see the process_runs() method) with a new
// expression (node)
//
// This is great for augmenting a query with synonyms e.g
// for query [sofa] you could replace sofa with
// [sofa OR couch OR sofas OR couches OR lounges OR lounge]
// and also great for spell checking queries. You can process_runs() and then spell check every run
// and if you have an alternative spelling for the run, then replace the run with a new ast that contains
// the original and the suggested
// e.g for [world of worrcraft video game]
// [ ((world of worrcraft video game) OR (world of warcraft video game)) ]
//
// XXX: you need to be careful if you are replacing a node's value (e.g *n = *anotherNode)
// when that anotherNode is e.g a binop and either of its (lhs, rhs) references itself.
// In that case, just create another node (e.g use the copy() method) and use that
static void replace_run(ast_node **run, const size_t cnt, ast_node *newExprNode) {
// Just set all nodes _except_ the first to dummy
// and replace value of the first with the newExprNode
for (size_t i{1}; i < cnt; ++i)
run[i]->set_dummy();
*run[0] = *newExprNode;
// That's all there is to it
// Now just make sure you normalize_root()
}
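// A sketch of the synonyms use case described above (illustrative; `q` is a hypothetical
// parsed query, and the replacement expression is whatever your synonyms source produces):
//
//   q.process_runs(true, true, false, [&](auto &run) {
//       ast_parser parser(q.allocator, q.tokensParser);
//       if (auto expr = parser.parse("sofa OR couch OR lounge"_s32))
//           query::replace_run(run.data(), run.size(), expr);
//   });
//   q.normalize(); // required after replacing runs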
// Returns true if can intersect this query
// See intersections.{h,cpp} for comments
bool can_intersect() const;
// Make sure you check the repetition count
// See also ast_node::set_alltokens_flags()
//
// processStrictAND: if true, will also consider runs of STRICT_AND nodes
template <typename L>
void process_runs(const bool includePhrases, const bool processStrictAND, const bool processNOT, L &&cb) {
static thread_local std::vector<std::pair<uint32_t, ast_node *>> unaryNodes, stack;
static thread_local std::vector<ast_node *> run;
uint32_t segments{0};
stack.clear();
unaryNodes.clear();
if (root) {
stack.push_back({0, root});
}
while (stack.size()) {
const auto pair = stack.back();
auto n = pair.second;
const auto seg = pair.first;
stack.pop_back();
switch (n->type) {
case ast_node::Type::Token:
unaryNodes.push_back({seg, n});
break;
case ast_node::Type::Phrase:
if (includePhrases) {
unaryNodes.push_back({seg, n});
}
break;
case ast_node::Type::MatchSome:
for (size_t i{0}; i != n->match_some.size; ++i) {
// we shouldn't be treating whatever's here as a run
// we should treat each as a new segment
stack.push_back({++segments, n->match_some.nodes[i]});
}
break;
case ast_node::Type::BinOp:
if (n->binop.op == Operator::AND) {
stack.push_back({seg, n->binop.lhs});
stack.push_back({seg, n->binop.rhs});
} else if (n->binop.op == Operator::NOT) {
stack.push_back({seg, n->binop.lhs});
if (processNOT) {
++segments;
stack.push_back({segments, n->binop.rhs});
}
} else if (n->binop.op == Operator::OR) {
++segments;
stack.push_back({segments, n->binop.lhs});
++segments;
stack.push_back({segments, n->binop.rhs});
} else if (processStrictAND && n->binop.op == Operator::STRICT_AND) {
stack.push_back({seg, n->binop.lhs});
stack.push_back({seg, n->binop.rhs});
}
break;
case ast_node::Type::UnaryOp:
if (n->unaryop.op != Operator::STRICT_AND || processStrictAND) {
stack.push_back({seg, n->unaryop.expr});
}
break;
case ast_node::Type::Dummy:
case ast_node::Type::ConstFalse:
case ast_node::Type::ConstTrueExpr:
break;
}
}
std::sort(unaryNodes.begin(), unaryNodes.end(), [](const auto &a, const auto &b) noexcept {
return a.first < b.first || (a.first == b.first && a.second->p->index < b.second->p->index);
});
for (const auto *p = unaryNodes.data(), *const e = p + unaryNodes.size(); p != e;) {
const auto segment = p->first;
run.clear();
do {
run.push_back(p->second);
} while (++p != e && p->first == segment);
cb(run);
}
}
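// e.g collecting the distinct token runs of a query (illustrative; `q` is a hypothetical parsed query):
//
//   std::vector<str8_t> tokens;
//   q.process_runs(false, true, false, [&](auto &run) {
//       // includePhrases == false, so every node in `run` is a Token
//       for (const auto n : run)
//           tokens.push_back(n->p->terms[0].token);
//   });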
// If more than maxQueryTokens distinct tokens are in this query, this will
// remove however many are required so that the total number of tokens is <= maxQueryTokens,
// and will return the first node removed (ast_node::Type::Token or ast_node::Type::Phrase); otherwise it will return nullptr
// Try a google search for over 32 tokens
ast_node *trim(const std::size_t maxQueryTokens);
void reset() {
root = nullptr;
final_index_ = 0;
allocator.reuse();
}
};
} // namespace Trinity
void PrintImpl(Buffer &b, const Trinity::ast_node &n);
void PrintImpl(Buffer &b, const Trinity::phrase &);
inline void PrintImpl(Buffer &b, const Trinity::query &q) {
if (q.root)
b.append(*q.root);
else
b.append("<not initialized>"_s32);
}