Skip to content

Commit 8c18f78

Browse files
committed
Improve best-case simdhash search performance for matches and failed matches
1 parent 01833b9 commit 8c18f78

File tree

1 file changed

+38
-23
lines changed

1 file changed

+38
-23
lines changed

src/native/containers/dn-simdhash-specialization.h

+38-23
Original file line numberDiff line numberDiff line change
@@ -120,22 +120,35 @@ find_first_matching_suffix_scalar (
120120
// smaller code that seems to be much faster than a chain of
121121
// 'if (...) return' for successful matches, and only slightly slower
122122
// for failed matches
123-
ITER(13);
124-
ITER(12);
125-
ITER(11);
126-
ITER(10);
127-
ITER(9);
128-
ITER(8);
129-
ITER(7);
130-
ITER(6);
123+
// We split the unrolled loop into 2-3 independent chains of selects,
124+
// and early-out if the first or second chain finds a match. This
125+
// improves best-case lookup performance slightly without penalizing
126+
// worst-case performance too much.
131127
ITER(5);
132128
ITER(4);
133129
ITER(3);
134130
ITER(2);
135131
ITER(1);
136132
ITER(0);
137-
#undef ITER
133+
if (result != 32)
134+
return result;
135+
136+
result = 32;
137+
ITER(11);
138+
ITER(10);
139+
ITER(9);
140+
ITER(8);
141+
ITER(7);
142+
ITER(6);
143+
if (result != 32)
144+
return result;
145+
146+
// Most buckets won't be this full due to load factor, and for some
147+
// specializations these slots will *never* be full
148+
ITER(13);
149+
ITER(12);
138150
return result;
151+
#undef ITER
139152
}
140153

141154
static DN_FORCEINLINE(void)
@@ -190,31 +203,33 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
190203
// no good reason.
191204
#define bucket_suffixes (bucket->suffixes)
192205
#endif
193-
uint8_t count = dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_COUNT_SLOT),
194-
overflow_count = dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_CASCADED_SLOT);
206+
// Don't load the cascaded slot early, since we won't need it if we find a match,
207+
// and loading it too early will waste a valuable register or worse, spill to the stack
208+
uint8_t count = dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_COUNT_SLOT);
195209
// We could early-out here when count==0, but it doesn't appear to meaningfully improve
196-
// search performance to do so, and might actually worsen it
210+
// search performance to do so, and might actually worsen it. With an ideal load factor,
211+
// most buckets will not be empty.
197212
#ifdef DN_SIMDHASH_USE_SCALAR_FALLBACK
198213
uint32_t index = find_first_matching_suffix_scalar(search_vector, bucket->suffixes.values);
199214
#else
200215
uint32_t index = find_first_matching_suffix_simd(search_vector, bucket_suffixes);
201216
#endif
202-
#undef bucket_suffixes
203-
for (; index < count; index++) {
204-
// FIXME: Could be profitable to manually hoist the data load outside of the loop,
205-
// if not out of SCAN_BUCKET_INTERNAL entirely. Clang appears to do LICM on it.
206-
// It's better to index bucket->keys each iteration inside the loop than to precompute
207-
// a pointer outside and bump the pointer, because in many cases the bucket will be
208-
// empty, and in many other cases it will have one match. Putting the index inside the
209-
// loop means that for empty/no-match buckets we don't do the index calculation at all.
210-
if (DN_SIMDHASH_KEY_EQUALS(DN_SIMDHASH_GET_DATA(hash), needle, bucket->keys[index]))
211-
return index;
217+
if (index < count) {
218+
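// Intended to make scans slightly faster by not recomputing the key address every iteration. NOTE(review): the comparison below still reads bucket->keys[index], so 'key' is advanced but never dereferenced -- confirm whether it should compare against *key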
219+
DN_SIMDHASH_KEY_T *key = bucket->keys + index;
220+
do {
221+
if (DN_SIMDHASH_KEY_EQUALS(DN_SIMDHASH_GET_DATA(hash), needle, bucket->keys[index]))
222+
return index;
223+
key++;
224+
index++;
225+
} while (index < count);
212226
}
213227

214-
if (overflow_count)
228+
if (dn_simdhash_extract_lane(bucket_suffixes, DN_SIMDHASH_CASCADED_SLOT))
215229
return DN_SIMDHASH_SCAN_BUCKET_OVERFLOWED;
216230
else
217231
return DN_SIMDHASH_SCAN_BUCKET_NO_OVERFLOW;
232+
#undef bucket_suffixes
218233
}
219234

220235
// Helper macros so that we can optimize and change scan logic more easily

0 commit comments

Comments
 (0)