-
Notifications
You must be signed in to change notification settings - Fork 20
/
docwordspace.h
140 lines (122 loc) · 6.34 KB
/
docwordspace.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#pragma once
#include "common.h"
#include "runtime.h"
#include "common.h"
namespace Trinity {
class DocWordsSpace final {
private:
// Just 4 bytes/position.
// We could have separated docSeq and termID into different arrays(as in, not in the same struct) so that reset()
// would only memset() the docSeq array(2 bytes vs 4 bytes) * maxPos
// but that 'd make access somewhat slower for set() and test() because we 'd get more cache misses so we optimise for it
using seq_t = uint16_t;
struct position final {
exec_term_id_t termID; // See IMPL.md
seq_t docSeq;
};
position *const positions;
const uint32_t maxPos;
seq_t curSeq;
public:
// Allocating max + Trinity::Limits::MaxPhraseSize, because that is the theoritical maximum phrase size
// and if we are going to test starting from maxPos extending to 10 positions ahead, we want to
// make sure we won't read outside positions.
// The extra positions will be always initialized to 0 and we won't need to reset those in reset()
DocWordsSpace(const uint32_t max = Trinity::Limits::MaxPosition)
: positions((position *)calloc(sizeof(position), max + 1 + Trinity::Limits::MaxPhraseSize)), maxPos{max} {
curSeq = 1; // IMPORTANT, start from (1)
EXPECT(max && max <= Trinity::Limits::MaxPosition);
}
~DocWordsSpace() noexcept {
std::free(positions);
}
void reset() {
// In order to avoid resetting/clearing positions[] for every other document
// we track a document-specific identifier in positions[] so if positions[idx].docSeq != curDocSeq
// then whatever is in positions[] is stale and should be considered unset.
//
// In a previous Trinity design we stored the document ID as u32 but that is excessive, we care about cache-misses
// so now we instead use a `uint16_t curSeq` and periodically clear. This is more efficient
if (unlikely(curSeq == std::numeric_limits<seq_t>::max())) {
// we reset every 65k(for u16 seq_t) documents
// this is preferrable to using uint32_t to encode the actual document in position{}
// no need to memset() for (maxPos + 1 + Trinity::Limits::MaxPhraseSize), just upto (maxPos + 1)
memset(positions, 0, sizeof(position) * (maxPos + 1));
curSeq = 1; // important; set to 1 not 0
} else {
++curSeq;
}
}
// XXX: pos must be > 0
#if !defined(TRINITY_VERIFY_HITS)
[[gnu::always_inline]] void set(const exec_term_id_t termID, const tokenpos_t pos) noexcept {
positions[pos] = {termID, curSeq};
}
#else
[[gnu::always_inline]] void set(const exec_term_id_t termID, const tokenpos_t pos) {
EXPECT(pos < maxPos);
positions[pos] = {termID, curSeq};
}
#endif
#if 1
/*
* -O1
movq (%rdi), %rax
movzwl (%rax,%rdx,4), %ecx
cmpw 12(%rdi), %cx
jne .LBB3_1
cmpw %si, 2(%rax,%rdx,4)
sete %al
retq
.LBB3_1:
xorl %eax, %eax
retq
*
*/
// Turns out with -O1 or higher, this is faster than the alternative impl
// based on my folly benchmarks
// XXX: pos must be > 0
inline bool test(const exec_term_id_t termID, const tokenpos_t pos) const noexcept {
return positions[pos].docSeq == curSeq && positions[pos].termID == termID;
}
#else
/*
* -O1
shll $16, %esi
movzwl 12(%rdi), %eax
orl %esi, %eax
movq (%rdi), %rcx
cmpl (%rcx,%rdx,4), %eax
sete %al
retq
*
*/
inline bool test(const exec_term_id_t termID, const uint16_t pos) const noexcept {
// this works thanks on little-endian arhs.
// not sure if this is faster than the older impl.
// in -O1, this saves us 2 instructions and looks faster. Will investigate
//
// TODO: We could in theory use this trick to check for _2_ positions ahead by
// building a u64 and using *(uint64_t *)&positions[pos] which will also
// access the adjacent positions[pos+1].
// e.g test2() method
static_assert(sizeof(termID) == sizeof(uint16_t));
static_assert(sizeof(curSeq) == sizeof(uint16_t));
return ((uint32_t(termID) << 16) | curSeq) == *(uint32_t *)&positions[pos];
}
#endif
// This can facilitate tracking sequences(e.g 2+ qeury terms matches in a document) of a MatchedIndexDocumentsFilter::consider() impl.
inline void unset(const tokenpos_t pos) noexcept {
positions[pos].docSeq = 0;
}
// We can probably just sort all phrase terms by freq asc
// and iterate across all hits, and for each hit, see if it is set
// in the adjacement position for the next phrase term, and the next, and so on, but
// we would need to track the offset(relative index in the phrase)
// This is an example/reference implementation
bool test_phrase(const std::vector<exec_term_id_t> &phraseTerms, const tokenpos_t *phraseFirstTokenPositions, const tokenpos_t phraseFirstTokenPositionsCnt) const;
auto max_pos() const noexcept {
return maxPos;
}
};
} // namespace Trinity