#pragma once
#include "common.h"
#include <switch_mallocators.h>
#include <vector>
namespace Trinity {
static constexpr uint8_t UnaryOperatorPrio{100};
static constexpr uint8_t DefaultToNextSpan{0};
enum class Operator : uint8_t {
NONE,
AND,
NOT,
OR,
STRICT_AND, // mostly equivalent to "token"
};
struct phrase;
// A query is an ASTree
//
// Choice of AST vs Lucene's Query Interface:
// Lucene's use of the Query interface, which is to say, building the actual queries by subclassing the Query abstract class, provides some benefits, but gives up
// some useful properties in the process.
// Specifically, it doesn't capture sequence and context. For example, the use of BooleanQuery as a container of BooleanClauses can't be used
// to determine that term A comes before term B, because clause order is not necessarily taken into account
// (you pack clauses of type Occur.MUST, Occur.FILTER, Occur.SHOULD in a list), and furthermore BooleanQuery::rewrite() alters that list
// and doesn't preserve any logical order. It shouldn't be too hard for someone to update the respective Lucene classes to account for that and properly
// track state and context, but in general Lucene does not care about the query structure.
// Trinity's default query execution mode relies on capturing that state and passing information about matched terms and their context in the original query
// to the callback.
//
// On the other hand, it's simpler to just create a BooleanQuery and push BooleanClauses into it. It's also easier to subclass Query and implement
// the various abstract methods in your Lucene application, whereas in Trinity you'd need to define a new ast_node::Type in the Trinity codebase and
// implement whatever's required.
struct ast_node final {
union {
struct
{
ast_node *lhs, *rhs;
Operator op;
constexpr auto normalized_operator() const noexcept {
return op == Operator::STRICT_AND ? Operator::AND : op;
}
} binop;
struct
{
uint16_t size; // number of nodes in `nodes`
uint16_t min; // at least this many must match
ast_node **nodes;
} match_some;
struct
{
ast_node *expr;
Operator op;
} unaryop;
// for both type::Token and type::Phrase
phrase * p;
ast_node *expr;
};
enum class Type : uint8_t {
BinOp = 0,
// We need different types for phrases and tokens, because
// we may have a phrase with a single token, but we need to know that it is indeed a phrase
// so that we won't try to expand it, e.g "ipad" should not be expanded to synonyms etc
Token,
Phrase,
UnaryOp,
Dummy, // also semantically equivalent to both a 'true' node and a useless node. normalize_root() will deal with such nodes (GC)
ConstFalse, // normalize_root() will GC it
// Suppose you want to match special tokens FOO|BAR and FANCY|TERM if they are found in the index, i.e treat them as optional.
// That's easy. You can take the original query, e.g [apple iphone], and transform it like so: [(FOO|BAR OR FANCY|TERM) OR (apple iphone)], and that'd work great.
//
// However, imagine you want to match FOO|BAR and FANCY|TERM but ONLY IF the original query also matches. You can't practically construct that query like the one above.
// This is where you'd use this node. When evaluated, it will always return true after evaluating its expression.
// You will transform the query to [(FOO|BAR OR FANCY|TERM) AND (apple iphone)]
// If [apple iphone] matches, THEN it will attempt to match [FOO|BAR OR FANCY|TERM], but even if that sub-expression fails, it won't matter.
// This allows for this kind of expression and other such use cases.
//
// The only challenge is properly handling this node (and the related exec_node) when capturing leading nodes. See
// the respective methods' implementations for details.
//
// You can also think of this as an 'optional match' node.
ConstTrueExpr,
// This is a special-purpose type; it can be used to specify 2+ nodes and a threshold: at least that many
// of those nodes should be matched during execution.
// Suppose you are building a visual search engine, and you extract all image features (say, about 1000 or so per image). You can then index them, and then for each input image extract
// its own features, use a MatchSome ast_node with all the features extracted from the input image and a threshold set to, say, 50% of the total features in that input image, and then execute
// the query; it will match all images that share at least 50% of their features with the input image. This is going to be very fast and very handy.
MatchSome,
} type;
// this is handy if you want to delete a node
// normalize_root() will GC those nodes
void set_dummy() noexcept {
type = Type::Dummy;
}
constexpr bool is_binop() const noexcept {
return type == Type::BinOp;
}
constexpr bool is_unary() const noexcept {
return type == Type::Phrase || type == Type::Token;
}
constexpr bool is_dummy() const noexcept {
return type == Type::Dummy;
}
constexpr void set_const_false() noexcept {
type = Type::ConstFalse;
}
constexpr bool is_const_false() const noexcept {
return type == Type::ConstFalse;
}
constexpr bool is_token() const noexcept {
return type == Type::Token;
}
constexpr bool is_phrase() const noexcept {
return type == Type::Phrase;
}
static ast_node *make(simple_allocator &a, const Type t) {
auto r = a.Alloc<ast_node>();
r->type = t;
return r;
}
static ast_node *make_match_some(simple_allocator &a, ast_node **nodes, const uint16_t cnt, const uint16_t min) {
auto res = make(a, Type::MatchSome);
EXPECT(cnt);
EXPECT(min <= cnt);
res->match_some.size = cnt;
res->match_some.min = min;
if (const auto required = sizeof(ast_node *) * cnt; a.can_allocate(required))
res->match_some.nodes = a.CopyOf(nodes, cnt);
else {
throw Switch::data_error("Can't fit");
}
return res;
}
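// A minimal MatchSome usage sketch (illustrative; `a` is a simple_allocator and
// t0..t2 are previously built Token nodes -- hypothetical names, not part of the API):
//
//   ast_node *features[] = {t0, t1, t2};
//   auto ms = ast_node::make_match_some(a, features, 3, 2); // match at least 2 of the 3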
static ast_node *make_binop(simple_allocator &a) {
auto res = make(a, Type::BinOp);
// so that we'll catch this if necessary
res->binop.lhs = res->binop.rhs = nullptr;
return res;
}
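// A hand-assembled sketch of the 'optional match' pattern described at Type::ConstTrueExpr
// (illustrative; `a`, `optional_expr` and `original` are hypothetical names):
//
//   auto opt = ast_node::make(a, ast_node::Type::ConstTrueExpr);
//   opt->expr = optional_expr; // e.g the parsed [FOO|BAR OR FANCY|TERM]
//   auto root = ast_node::make_binop(a);
//   root->binop.op = Operator::AND;
//   root->binop.lhs = opt;
//   root->binop.rhs = original; // e.g the parsed [apple iphone]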
ast_node *copy(simple_allocator *const a, std::vector<void *> *);
// same as copy(), except we are not copying tokens
ast_node *shallow_copy(simple_allocator *const a, std::vector<void *> *);
// see query::leader_nodes() comments
bool any_leader_tokens() const;
// Recursively sets p->flags to `flags` to all nodes of this branch
//
// This is helpful when you for example rewrite an original query to incorporate synonyms etc
// and you want to know in your MatchedIndexDocumentsFilter::consume() impl. if the term hit is for a token from
// any of the tokens added to the query after the rewrite.
void set_alltokens_flags(const uint16_t flags);
void set_app_phrase_id(const uint16_t id);
void set_rewrite_range(const range_base<uint16_t, uint8_t>);
void set_rewrite_translation_coeff(const uint16_t span);
size_t nodes_count() const noexcept {
switch (type) {
case Type::BinOp:
return binop.lhs->nodes_count() + binop.rhs->nodes_count() + 1;
case Type::UnaryOp:
return 1 + unaryop.expr->nodes_count();
case Type::ConstTrueExpr:
return 1 + expr->nodes_count();
default:
return 1;
}
}
};
static constexpr auto unary_same_type(const ast_node *a, const ast_node *b) noexcept {
return a->type == b->type && (a->type == ast_node::Type::Phrase || a->type == ast_node::Type::Token);
}
struct term final {
// for now, just the token, but maybe in the future we'll want to extend it to support flags etc
str8_t token;
// We now have this union here, which may be useful for some applications, but is otherwise not used by Trinity
union {
uint32_t u32;
};
};
// This is an AST parser
//
// Encapsulates the input query (text) to be parsed,
// a reference to the allocator to use, and the function to use for parsing tokens from the content
//
// This is mainly used by `Trinity::query` to parse its root node, but you can use it for parsing expressions in order to replace runs with them.
// See app.cpp for examples.
//
// A query only holds its root node and its own allocator, and uses an ast_parser to parse the query input. See query::parse()
//
// Keep in mind that you can always copy nodes/asts using ast_node::copy()
struct ast_parser final {
enum class Flags : uint32_t {
// Treat OR as a regular token
// Amazon.com treats (AND, NOT, OR) as regular tokens, but it does support the '|', '-' and '+' operators
ORAsToken = 1u,
// Treat NOT as a regular token
NOTAsToken = 1u << 1,
// Treat AND as a regular token
ANDAsToken = 1u << 2,
ParseConstTrueExpr = 1u << 3,
ParseMatchSomeExpr = 1u << 4,
};
str32_t content;
const char_t * contentBase;
simple_allocator &allocator;
term terms[Trinity::Limits::MaxPhraseSize];
// It is important that your query token parser's semantics are also implemented in your document content parser
std::pair<uint32_t, uint8_t> (*token_parser)(const str32_t, char_t *, const bool);
const uint32_t parserFlags;
std::vector<str8_t> distinctTokens; // for interning
// facilitates parsing
std::vector<const char *> groupTerm;
char_t lastParsedToken[255];
auto *alloc_node(const ast_node::Type t) {
return ast_node::make(allocator, t);
}
ast_parser(const str32_t input,
simple_allocator &a,
std::pair<uint32_t, uint8_t> (*p)(const str32_t, char_t *, const bool) = default_token_parser_impl,
const uint32_t parserFlags_ = 0)
: content{input}, contentBase{content.data()}, allocator{a}, token_parser{p}, parserFlags{parserFlags_} {
}
// Useful for initializing and later using parse(const str32_t) to parse content
ast_parser(simple_allocator &a,
std::pair<uint32_t, uint8_t> (*p)(const str32_t, char_t *, const bool) = default_token_parser_impl,
const uint32_t parserFlags_ = 0)
: contentBase{nullptr}, allocator{a}, token_parser{p}, parserFlags{parserFlags_} {
}
// You may NOT invoke parse() more than once
// because content, allocator, distinctTokens will already be initialized and updated
//
// Make sure that the allocator you provide is not deleted or reused before you are done accessing
// any query tokens
//
// UPDATE: you can now use reset_and_parse()
// which will reset content and state. This is useful if you really don't want to allocate/init a new ast_parser
// in, say, a loop, and want to reuse it.
// UPDATE: you can use parse(const str32_t) to parse another AST; this is akin to using reset_and_parse(), except that it doesn't reset
// the allocator and distinctTokens.
ast_node *parse();
auto parse(const str32_t c) {
content = c;
contentBase = content.data();
groupTerm.clear();
return parse();
}
auto reset_and_parse(const str32_t c) {
this->allocator.reuse();
distinctTokens.clear();
return parse(c);
}
inline void skip_ws() {
content.trim_leading_whitespace();
}
auto parse_failnode() {
// We no longer return nullptr
// because we really can't fail (not strict)
return alloc_node(ast_node::Type::Dummy);
}
void track_term(term &t);
};
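// A minimal ast_parser usage sketch (illustrative):
//
//   simple_allocator a;
//   ast_parser parser(a, default_token_parser_impl,
//       uint32_t(ast_parser::Flags::ORAsToken)); // treat OR as a regular token
//   auto root = parser.parse("world of warcraft"_s32);
//   // reuse it in a loop without allocating a new parser:
//   root = parser.reset_and_parse("apple OR imac"_s32);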
struct phrase final {
// total terms (1 for a single token)
uint8_t size;
// repetition; e.g for [APPLE APPLE APPLE APPLE] the repetition is 4
// This is important; we need to capture e.g [tom tom] (the manufacturer) and [boing boing],
// but we don't want to capture the same phrase/word twice when it is repeated in order.
// This is useful in MatchedIndexDocumentsFilter::consider() implementations for scoring documents based
// on the query structure and the matched terms.
// See Trinity::rewrite_query(), where we consider this when rewriting sequences
uint8_t rep;
// index in the query
uint16_t index;
// See assign_query_indices() for how this is assigned
//
// SUB-EXPRESSION: A query is broken down into 0+ sub-expressions. When you use the OR operator, it creates an overlapping
// sub-query; the left-hand side overlaps the right-hand side, and the next sub-expression for that overlapped union begins
// past the right-hand side sub-query.
// Examples:
// - [lord of the rings]: this is a 4 sub-expressions query (one for each token)
// - [google OR amazon] : this is 1 sub-expression, not 2 (because google overlaps amazon)
// - [google OR amazon jobs]: 2 sub-expressions, one for the overlapping [google OR amazon] and another for [jobs]
// toNextSpan is the offset from a token (or phrase) to the next sub-expression,
// or 0 if there is no next sub-expression.
//
//
// This is how many terms/tokens to advance from this index (i.e query token) to get to the next term in the query, skipping
// all other tokens in the same OR group as this token. That is, this is the offset from index to the next sub-expression
//
// This is useful in MatchedIndexDocumentsFilter::consider()
//
// For example:
// [world of warcraft OR (war craft) mists of pandaria]
// ---------------------------------
// INDEX | TOKEN    | TONEXTSPAN
// ---------------------------------
// 0     | WORLD    | 1
// 1     | OF       | 1
// 2     | WARCRAFT | 2
// 2     | WAR      | 1
// 3     | CRAFT    | 1
// 4     | MISTS    | 1
// 5     | OF       | 1
// 6     | PANDARIA | 0
// ---------------------------------
// If you are going to try to match a sequence for the matched term 'WARCRAFT',
// the only query instance of it is at index 2, and the next term in the query past
// any other 'equivalent' OR groups (in this case there is only one other OR expression matching warcraft, [war craft]) is 2 positions ahead, at index 4 (i.e at MISTS)
//
// The semantics of (index, toNextSpan) are somewhat complicated, but only because they are required to accurately and relatively effortlessly
// capture sequences. A sequence is 2+ consecutive tokens in a query.
//
// IMPORTANT: It can be 0 if there is no adjacent term. (i.e last term in a sequence or in the query). You can now use query::final_index()
// if you want to compute a range for when toNextSpan = 0, e.g range32_t(index, toNextSpan ?: final_index() - index)
//
// Effectively, the offset of the next sub-expression from index, iff there is a next sub-expression (otherwise it's 0)
uint8_t toNextSpan;
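// A sketch of computing the query range for a matched term in a
// MatchedIndexDocumentsFilter::consider() impl. (illustrative; `p` is a const phrase *
// and `q` the parsed Trinity::query -- hypothetical names):
//
//   const uint16_t span = p->toNextSpan ? p->toNextSpan : uint16_t(q.final_index() - p->index);
//   const range_base<uint16_t, uint16_t> r(p->index, span);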
// flags. Usually 0, but e.g if you are rewriting a [wow] to [wow OR "world of warcraft"] you
// may want the "world of warcraft" flags to be 1 (i.e derived). This can be very useful for scoring matches.
// So you can rely on flags for query expansion/rewrite tagging/tracking, and on matched query terms hits to compute a 'relevance' score
// and maybe another 'context' score(based on personalization, popularity, recency, etc) and then fuse them together to come up with the final score.
//
// You can use it for encoding flags, and state or anything else
// This is not in rewrite_ctx because it's not specifically here for rewrites only.
//
// You can also rely on app_phrase_id for encoding flags. app_phrase_id is a far more versatile choice for
// associating context/weight with query tokens runs.
//
// See ast_node::set_alltokens_flags()
query_term_flags_t flags;
// By default, this is set to 0
// If for example you wanted to use 2 sub-queries, and you wanted the document score to be based on whether the first or the second
// sub-query was matched, you could assign app_phrase_id 1 to the first sub-query and app_phrase_id 2 to the second, and check accordingly.
uint16_t app_phrase_id;
// This is the range in the input query string
// This is handy for e.g spell checking runs where you want to highlight corrected tokens/phrases in the input query
// and you'd rather not have to go through hoops to accomplish it
range_base<uint16_t, uint16_t> inputRange;
struct
{
// A range, which represents the logical span in the input query terms list that was expanded/rewritten/captured.
// For example, rewrite_query() for query [pc games] where "pc games" is expanded to cid:806: [ (pc games) OR cid:806 ]
// [cid:806] range would be [0, 2), [pc] would be [0, 1) and [games] would be [1, 2). This would make it easier to determine that
// e.g cid:806 was matched, which overlaps 'pc' and 'games'
range_base<uint16_t, uint8_t> range;
// We encode expansions and contractions here
// if you rewrite(expand) [cod] to [call of duty], i.e from 1 token to 3, then
// this should be equal to 1/3
// Conversely, if we rewrite(contract) [lord of the rings] to [lotr], i.e from 4 tokens to 1, then
// this should be equal to 1/4 -- i.e std::min(low, high)/std::max(low, high)
float translationCoefficient;
// This is 0, except when we expand or contract a sequence and its output is a single token
// e.g [mac book] => [macbook]. A sequence of two tokens into 1
// however, for e.g
// [mac book] => [apple cool laptops], a sequence of two expanded to a sequence of 3, we can't currently treat
// the final sequence as a single entity; however, we rarely if ever need to care about that kind of expansion, and we will
// come up with something by then.
//
// You should probably just use translationCoefficient.
uint8_t srcSeqSize;
} rewrite_ctx;
// Difference between two ranges:
// query range: [index, index + toNextSpan ?: 1)
// rewrite range: rewrite_ctx.range
//
// For e.g query [wow OR (world of warcraft) game]
// where we used Trinity::rewrite_query() but didn't rewrite the query, so that rewrite_ctx is initialized
// we expect to get:
// [wow] index:0 toNextSpan:3 rr:[1, 2)
// [world] index:0 toNextSpan:1 rr:[2, 3)
// [of] index:1 toNextSpan:1 rr:[3, 4)
// [warcraft] index:2 toNextSpan:1 rr:[4, 5)
// [game] index:3 toNextSpan:0 rr:[0, 1)
//
// for the query [world of warcraft game], where we used Trinity::rewrite_query(), and we contracted 'world of warcraft' to 'wow', we expect to get:
// [world] index:0 toNextSpan:1 rr:[0, 1)
// [of] index:1 toNextSpan:1 rr:[1, 2)
// [warcraft] index:2 toNextSpan:1 rr:[2, 3)
// [wow] index:0 toNextSpan:3 rr:[0, 3)
// [game] index:3 toNextSpan:0 rr:[3, 4)
//
// for the query [wow game], where we used Trinity::rewrite_query() to expand 'wow' to 'world of warcraft', we expect to get:
// [wow] index:0 toNextSpan:3 rr:[0, 1)
// [world] index:0 toNextSpan:1 rr:[0, 1)
// [of] index:1 toNextSpan:1 rr:[0, 1)
// [warcraft] index:2 toNextSpan:2 rr:[0, 1)
// [game] index:3 toNextSpan:0 rr:[1, 2)
//
// As you can see:
// - rewrite range is very useful for identifying phrases and tokens that resulted from rewrites
// - rewrite range is always set, even for the last logical term in a sub-expression (for which we set toNextSpan to 0)
// - we can't identify logically equal sub-expressions by relying on rewrite_ctx.range. For the last example,
// where [wow] and [world of warcraft] are logically equal, we could check for equality by checking
// [index, toNextSpan). index == 0 for both sub-expressions, and we can easily calculate the span of
// the second logical sub-expression and thus figure out that they are identical here.
// - if [game] wasn't part of the query, then toNextSpan would have been 0 for the last token of every sub-expression
// in those examples, so we clearly need a way to deal with those situations. You can now use
// query::final_index() to properly and correctly compute a range when toNextSpan == 0
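// A sketch of telling rewrite-derived terms apart in consider() (illustrative; it
// assumes the rewrite pass tagged injected terms via ast_node::set_alltokens_flags(1),
// and `p` is a hypothetical const phrase *):
//
//   const bool derived = (p->flags & 1) || p->rewrite_ctx.translationCoefficient != 1.0f;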
term terms[0];
bool operator==(const phrase &o) const noexcept {
//WAS: if (size == o.size)
// XXX: should we also check if (this->app_phrase_id == o.app_phrase_id) ?
if (size == o.size && flags == o.flags) {
size_t i;
for (i = 0; i != size && terms[i].token == o.terms[i].token; ++i)
continue;
return i == size;
} else
return false;
}
// handy utility method
static auto make(const str8_t *tokens, const size_t n, simple_allocator *const a) {
auto p = (phrase *)a->Alloc(sizeof(phrase) + sizeof(term) * n);
p->flags = 0;
p->rep = 1;
p->inputRange.reset();
p->app_phrase_id = 0;
p->toNextSpan = DefaultToNextSpan;
p->rewrite_ctx.range.reset();
p->rewrite_ctx.srcSeqSize = 1;
p->rewrite_ctx.translationCoefficient = 1.0;
p->size = n;
for (size_t i{0}; i != n; ++i) {
p->terms[i].token.Set(a->CopyOf(tokens[i].data(), tokens[i].size()), tokens[i].size());
}
return p;
}
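// e.g (illustrative; assumes an _s8 literal for str8_t, analogous to the _s32 literal used elsewhere):
//
//   const str8_t tokens[] = {"world"_s8, "of"_s8, "warcraft"_s8};
//   auto p = phrase::make(tokens, 3, &allocator); // `allocator` is a hypothetical simple_allocator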
};
// see query::normalize()
ast_node *normalize_ast(ast_node *);
// This is really just a container for an AST root node and the allocator
// the AST was allocated from.
// It also provides a few useful methods that operate on the root and may make use of the allocator
struct query final {
ast_node * root;
uint16_t final_index_;
simple_allocator allocator{512 * 3};
// parse() will set tokensParser; this may come in handy elsewhere, e.g see rewrite_query() impl.
std::pair<uint32_t, uint8_t> (*tokensParser)(const str32_t, char_t *, const bool);
// if we can't satisfy the allocation, we use malloc() and keep track of that here
std::vector<void *> large_allocs;
// Normalize a query.
// This is invoked when you initially parse the query, but if you
// rewrite it (i.e its AST) by e.g replacing runs or otherwise modifying or deleting ast nodes
// you must normalize it to fix any issues etc
//
// You can also use Trinity::normalize_ast() if you want to do this yourself
// When you Trinity::exec_query(), it will be normalized again just in case
bool normalize();
/*
A query can have an AND token so that we can consume documents from its docslist and check the query against each of them,
but it can also be a simple OR query (e.g [apple OR imac OR macbook]), in which case we don't have a single token
to consume from. Instead, we need to identify the "lead tokens" (in the case of an AND, that's a single token, but in
the case of an OR sequence, that's all the distinct tokens in that OR sequence), then use a merge-scan to get
the lowest next ID from those, execute the query based on that ID, and then advance all such lowest tokens.
It is a bit more complicated than that, but capture_leader() will properly identify the lead nodes
e.g for
- [ipad pro] : the lead token is ipad. So we just scan the ipad documents and advance those
- [apple OR macbook OR iphone NOT cable]: the lead tokens are (apple, macbook, iphone)
- [apple OR (macbook pro)] : lead tokens are [apple, macbook]
* the execution engine uses a specialised such implementation for exec_nodes,
* for simplicity and performance.
*
* See also ast_node::Type::ConstTrueExpr comments
*/
void leader_nodes(std::vector<ast_node *> *const out);
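// e.g (illustrative; `q` is a hypothetical parsed and normalized query):
//
//   std::vector<ast_node *> leaders;
//   q.leader_nodes(&leaders);
//   for (const auto n : leaders) {
//       // each n is a Token/Phrase node whose docslist will be merge-scanned
//   }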
bool parse(const str32_t in,
std::pair<uint32_t, uint8_t> (*tp)(const str32_t, char_t *, const bool) = default_token_parser_impl,
uint32_t parserFlags = 0);
// This is handy. When we copy a query to another query, we want to make sure
// that tokens point into the destination query's allocator, not the source's, because it is possible for
// the source query to go away
static void bind_tokens_to_allocator(ast_node *, simple_allocator *);
query()
: root{nullptr}, final_index_{0}, tokensParser{nullptr} {
}
~query() {
free_large_allocs();
}
void free_large_allocs() {
for (auto p : large_allocs) {
std::free(p);
}
large_allocs.clear();
}
inline operator bool() const noexcept {
return root;
}
// This would be the index of the first new sub-expression of the query, if
// the query had more sub-expressions.
// It is computed by assign_query_indices(), and it's very handy because toNextSpan is set to 0
// for the last tokens of the query's final sub-expressions, and you may need to
// deal with those kinds of ranges in your applications.
auto final_index() const noexcept {
return final_index_;
}
query(const str32_t in,
std::pair<uint32_t, uint8_t> (*tp)(const str32_t, char_t *, const bool) = default_token_parser_impl,
const uint32_t parserFlags = 0)
: tokensParser{tp} {
if (!parse(in, tp, parserFlags)) {
throw Switch::data_error("Failed to parse query");
}
}
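// A minimal end-to-end sketch (illustrative):
//
//   try {
//       Trinity::query q("apple OR imac OR macbook"_s32);
//       // parse() normalizes; call normalize() again only after editing the AST
//       // ... rewrite q here, then hand it over to exec_query()
//   } catch (const Switch::data_error &) {
//       // parse failure
//   }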
query(ast_node *r)
: root{r}, final_index_{0} {
}
explicit query(const query &o, const bool shallow = false) {
tokensParser = o.tokensParser;
root = o.root
? (shallow ? o.root->shallow_copy(&allocator, &large_allocs)
: o.root->copy(&allocator, &large_allocs))
: nullptr;
final_index_ = o.final_index_;
if (root && false == shallow) {
bind_tokens_to_allocator(this->root, &this->allocator);
}
}
query(query &&o) {
root = std::exchange(o.root, nullptr);
final_index_ = o.final_index_;
tokensParser = o.tokensParser;
allocator = std::move(o.allocator);
// take ownership of the out-of-band allocations, so that o's destructor
// won't free memory that root may still reference
large_allocs = std::move(o.large_allocs);
}
query &operator=(const query &o) {
this->allocator.reuse();
free_large_allocs(); // free, not just clear(), to avoid leaking prior out-of-band allocations
root = o.root
? o.root->copy(&this->allocator, &large_allocs)
: nullptr;
final_index_ = o.final_index_;
tokensParser = o.tokensParser;
if (root) {
bind_tokens_to_allocator(this->root, &this->allocator);
}
return *this;
}
// utility method; returns all nodes
static std::vector<ast_node *> &nodes(ast_node *root, std::vector<ast_node *> *const res);
static auto nodes(ast_node *root) {
std::vector<ast_node *> out;
return nodes(root, &out);
}
auto nodes() const {
return nodes(root);
}
// Computes the number of sub-expressions, where sub-expressions are collated using AND operators.
// OR expressions are properly counted as one, e.g
// (usa OR (united states of (america OR americas))) is a single expression
//
// XXX: This is not a particularly optimal implementation
// make sure you have normalize()d the query before invoking this method
// This is equivalent to subexpressions_offsets().size()
size_t subexpressions_count() const noexcept;
// This returns the phrase::index for all query subexpressions
// which can be very beneficial in your MatchedIndexDocumentsFilter::consider() impl.
std::vector<uint16_t> subexpressions_offsets() const noexcept;
// If all you want to do is remove a node, set it to a dummy.
// If all you want to do is replace a single node (i.e not a run), just replace
// that node's value directly.
// This is a utility method for replacing a run (see the process_runs() method) with a new
// expression (node)
//
// This is great for augmenting a query with synonyms e.g
// for query [sofa] you could replace sofa with
// [sofa OR couch OR sofas OR couches OR lounges OR lounge]
// and also great for spell checking queries. You can process_runs() and then spell check every run
// and if you have an alternative spelling for the run, then replace the run with a new ast that contains
// the original and the suggested
// e.g for [world of worrcraft video game]
// [ ((world of worrcraft video game) OR (world of warcraft video game)) ]
//
// XXX: you need to be careful if you are replacing a node's value (e.g *n = *anotherNode)
// when that anotherNode is e.g a binop and either of its (lhs, rhs) references itself.
// In that case, just create another node (e.g use the copy() method) and use that
static void replace_run(ast_node **run, const size_t cnt, ast_node *newExprNode) {
// Just set all nodes _except_ the first to dummy
// and replace value of the first with the newExprNode
for (size_t i{1}; i < cnt; ++i)
run[i]->set_dummy();
*run[0] = *newExprNode;
// That's all there is to it
// Now just make sure you normalize_root()
}
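// A sketch of the synonyms use case described above (illustrative; `q` is a hypothetical
// parsed query, and the replacement expression is whatever your synonyms source produces):
//
//   q.process_runs(true, true, false, [&](auto &run) {
//       ast_parser parser(q.allocator, q.tokensParser);
//       if (auto expr = parser.parse("sofa OR couch OR lounge"_s32))
//           query::replace_run(run.data(), run.size(), expr);
//   });
//   q.normalize(); // required after replacing runs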
// Returns true if can intersect this query
// See intersections.{h,cpp} for comments
bool can_intersect() const;
// Make sure you check the repetition count
// See also ast_node::set_alltokens_flags()
//
// processStrictAND: if true, will also consider runs of STRICT_AND nodes
template <typename L>
void process_runs(const bool includePhrases, const bool processStrictAND, const bool processNOT, L &&cb) {
static thread_local std::vector<std::pair<uint32_t, ast_node *>> unaryNodes, stack;
static thread_local std::vector<ast_node *> run;
uint32_t segments{0};
stack.clear();
unaryNodes.clear();
if (root) {
stack.push_back({0, root});
}
while (stack.size()) {
const auto pair = stack.back();
auto n = pair.second;
const auto seg = pair.first;
stack.pop_back();
switch (n->type) {
case ast_node::Type::Token:
unaryNodes.push_back({seg, n});
break;
case ast_node::Type::Phrase:
if (includePhrases) {
unaryNodes.push_back({seg, n});
}
break;
case ast_node::Type::MatchSome:
for (size_t i{0}; i != n->match_some.size; ++i) {
// we shouldn't be treating whatever's here as a run
// we should treat each as a new segment
stack.push_back({++segments, n->match_some.nodes[i]});
}
break;
case ast_node::Type::BinOp:
if (n->binop.op == Operator::AND) {
stack.push_back({seg, n->binop.lhs});
stack.push_back({seg, n->binop.rhs});
} else if (n->binop.op == Operator::NOT) {
stack.push_back({seg, n->binop.lhs});
if (processNOT) {
++segments;
stack.push_back({segments, n->binop.rhs});
}
} else if (n->binop.op == Operator::OR) {
++segments;
stack.push_back({segments, n->binop.lhs});
++segments;
stack.push_back({segments, n->binop.rhs});
} else if (processStrictAND && n->binop.op == Operator::STRICT_AND) {
stack.push_back({seg, n->binop.lhs});
stack.push_back({seg, n->binop.rhs});
}
break;
case ast_node::Type::UnaryOp:
if (n->unaryop.op != Operator::STRICT_AND || processStrictAND) {
stack.push_back({seg, n->unaryop.expr});
}
break;
case ast_node::Type::Dummy:
case ast_node::Type::ConstFalse:
case ast_node::Type::ConstTrueExpr:
break;
}
}
std::sort(unaryNodes.begin(), unaryNodes.end(), [](const auto &a, const auto &b) noexcept {
return a.first < b.first || (a.first == b.first && a.second->p->index < b.second->p->index);
});
for (const auto *p = unaryNodes.data(), *const e = p + unaryNodes.size(); p != e;) {
const auto segment = p->first;
run.clear();
do {
run.push_back(p->second);
} while (++p != e && p->first == segment);
cb(run);
}
}
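// e.g collecting the distinct token runs of a query (illustrative; `q` is a hypothetical parsed query):
//
//   std::vector<str8_t> tokens;
//   q.process_runs(false, true, false, [&](auto &run) {
//       // includePhrases == false, so every node in `run` is a Token
//       for (const auto n : run)
//           tokens.push_back(n->p->terms[0].token);
//   });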
// If more than maxQueryTokens distinct tokens are in this query, this will
// remove however many are required so that the total number of tokens is <= maxQueryTokens,
// and will return the first node removed (ast_node::Type::Token or ast_node::Type::Phrase); otherwise it will return nullptr
// Try a google search for over 32 tokens
ast_node *trim(const std::size_t maxQueryTokens);
void reset() {
root = nullptr;
final_index_ = 0;
allocator.reuse();
}
};
} // namespace Trinity
void PrintImpl(Buffer &b, const Trinity::ast_node &n);
void PrintImpl(Buffer &b, const Trinity::phrase &);
inline void PrintImpl(Buffer &b, const Trinity::query &q) {
if (q.root)
b.append(*q.root);
else
b.append("<not initialized>"_s32);
}