@@ -120,22 +120,35 @@ find_first_matching_suffix_scalar (
120
120
// smaller code that seems to be much faster than a chain of
121
121
// 'if (...) return' for successful matches, and only slightly slower
122
122
// for failed matches
123
- ITER (13 );
124
- ITER (12 );
125
- ITER (11 );
126
- ITER (10 );
127
- ITER (9 );
128
- ITER (8 );
129
- ITER (7 );
130
- ITER (6 );
123
+ // We split the unrolled loop into 2-3 independent chains of selects,
124
+ // and early-out if the first or second ones are successful. This
125
+ // improves best-case lookup performance slightly without penalizing
126
+ // worst-case performance too much.
131
127
ITER (5 );
132
128
ITER (4 );
133
129
ITER (3 );
134
130
ITER (2 );
135
131
ITER (1 );
136
132
ITER (0 );
137
- #undef ITER
133
+ if (result != 32 )
134
+ return result ;
135
+
136
+ result = 32 ;
137
+ ITER (11 );
138
+ ITER (10 );
139
+ ITER (9 );
140
+ ITER (8 );
141
+ ITER (7 );
142
+ ITER (6 );
143
+ if (result != 32 )
144
+ return result ;
145
+
146
+ // Most buckets won't be this full due to load factor, and for some
147
+ // specializations these slots will *never* be full
148
+ ITER (13 );
149
+ ITER (12 );
138
150
return result ;
151
+ #undef ITER
139
152
}
140
153
141
154
static DN_FORCEINLINE (void )
@@ -190,31 +203,33 @@ DN_SIMDHASH_SCAN_BUCKET_INTERNAL (DN_SIMDHASH_T_PTR hash, bucket_t *restrict buc
190
203
// no good reason.
191
204
#define bucket_suffixes (bucket->suffixes)
192
205
#endif
193
- uint8_t count = dn_simdhash_extract_lane (bucket_suffixes , DN_SIMDHASH_COUNT_SLOT ),
194
- overflow_count = dn_simdhash_extract_lane (bucket_suffixes , DN_SIMDHASH_CASCADED_SLOT );
206
+ // Don't load the cascaded slot early, since we won't need it if we find a match,
207
+ // and loading it too early will waste a valuable register or worse, spill to the stack
208
+ uint8_t count = dn_simdhash_extract_lane (bucket_suffixes , DN_SIMDHASH_COUNT_SLOT );
195
209
// We could early-out here when count==0, but it doesn't appear to meaningfully improve
196
- // search performance to do so, and might actually worsen it
210
+ // search performance to do so, and might actually worsen it. With an ideal load factor,
211
+ // most buckets will not be empty.
197
212
#ifdef DN_SIMDHASH_USE_SCALAR_FALLBACK
198
213
uint32_t index = find_first_matching_suffix_scalar (search_vector , bucket -> suffixes .values );
199
214
#else
200
215
uint32_t index = find_first_matching_suffix_simd (search_vector , bucket_suffixes );
201
216
#endif
202
- #undef bucket_suffixes
203
- for (; index < count ; index ++ ) {
204
- // FIXME: Could be profitable to manually hoist the data load outside of the loop,
205
- // if not out of SCAN_BUCKET_INTERNAL entirely. Clang appears to do LICM on it.
206
- // It's better to index bucket->keys each iteration inside the loop than to precompute
207
- // a pointer outside and bump the pointer, because in many cases the bucket will be
208
- // empty, and in many other cases it will have one match. Putting the index inside the
209
- // loop means that for empty/no-match buckets we don't do the index calculation at all.
210
- if (DN_SIMDHASH_KEY_EQUALS (DN_SIMDHASH_GET_DATA (hash ), needle , bucket -> keys [index ]))
211
- return index ;
217
+ if (index < count ) {
218
+ // Make scans slightly faster by not recomputing the key address every iteration:
+ // key always points at bucket->keys[index] and is advanced in lockstep with index,
+ // so the comparison below must go through the pointer rather than re-indexing.
219
+ DN_SIMDHASH_KEY_T * key = bucket -> keys + index ;
220
+ do {
221
+ if (DN_SIMDHASH_KEY_EQUALS (DN_SIMDHASH_GET_DATA (hash ), needle , * key ))
222
+ return index ;
223
+ key ++ ;
224
+ index ++ ;
225
+ } while (index < count );
212
226
}
213
227
214
- if (overflow_count )
228
+ if (dn_simdhash_extract_lane ( bucket_suffixes , DN_SIMDHASH_CASCADED_SLOT ) )
215
229
return DN_SIMDHASH_SCAN_BUCKET_OVERFLOWED ;
216
230
else
217
231
return DN_SIMDHASH_SCAN_BUCKET_NO_OVERFLOW ;
232
+ #undef bucket_suffixes
218
233
}
219
234
220
235
// Helper macros so that we can optimize and change scan logic more easily
0 commit comments