Skip to content

Commit 69d9a08

Browse files
authored
gh-94808: improve comments and coverage of fastsearch.h (GH-96760)
1 parent 4995f5f commit 69d9a08

File tree

3 files changed

+54
-5
lines changed

3 files changed

+54
-5
lines changed

Lib/test/string_tests.py

+48
Original file line number | Diff line number | Diff line change
@@ -341,6 +341,42 @@ def reference_find(p, s):
341341
self.checkequal(reference_find(p, text),
342342
text, 'find', p)
343343

344+
# Exercise find() across a grid of haystack and needle lengths.
# Every haystack and needle has the shape "abcab"*k + "da".  The letter
# 'd' appears only in the trailing "da", so a needle can match only when
# its own "da" lines up with the haystack's, i.e. as a suffix.
def test_find_many_lengths(self):
345+
# Repeat counts 1, 2, 5, 10, 20, 50, ... up to 5 * 10**5.
haystack_repeats = [a * 10**e for e in range(6) for a in (1,2,5)]
346+
# NOTE(review): fixtype is defined outside this view; presumably it
# converts the literal to the string type under test -- confirm.
haystacks = [(n, self.fixtype("abcab"*n + "da")) for n in haystack_repeats]
347+
348+
# Repeat counts 1, 3, 10, 30, ... up to 3 * 10**5.
needle_repeats = [a * 10**e for e in range(6) for a in (1, 3)]
349+
needles = [(m, self.fixtype("abcab"*m + "da")) for m in needle_repeats]
350+
351+
for n, haystack1 in haystacks:
352+
# Dropping the last character removes the only "da", so no needle
# can match haystack2.
haystack2 = haystack1[:-1]
353+
for m, needle in needles:
354+
# Suffix match starts at (5*n + 2) - (5*m + 2) == 5 * (n - m);
# a needle longer than the haystack cannot match at all.
answer1 = 5 * (n - m) if m <= n else -1
355+
self.assertEqual(haystack1.find(needle), answer1, msg=(n,m))
356+
self.assertEqual(haystack2.find(needle), -1, msg=(n,m))
357+
358+
# Check find() and count() on long, highly repetitive inputs.
# NOTE(review): checkequal is defined outside this view; the calls below
# read as checkequal(expected, obj, method_name, *args) -- confirm.
def test_adaptive_find(self):
359+
# This would be very slow for the naive algorithm,
360+
# but str.find() should be O(n + m).
361+
for N in 1000, 10_000, 100_000, 1_000_000:
362+
A, B = 'a' * N, 'b' * N
363+
haystack = A + A + B + A + A
364+
needle = A + B + B + A
365+
# The needle needs a run of 2*N 'b' characters, but the haystack's
# only run of 'b' is N long, so there is no match.
self.checkequal(-1, haystack, 'find', needle)
366+
self.checkequal(0, haystack, 'count', needle)
367+
# With the needle appended, the only run of 2*N 'b' characters is the
# one inside the appended copy, so the single match starts at
# len(haystack).
self.checkequal(len(haystack), haystack + needle, 'find', needle)
368+
self.checkequal(1, haystack + needle, 'count', needle)
369+
370+
# Check find() and count() on periodic needles against a near-miss haystack.
# NOTE(review): checkequal is defined outside this view; the calls below
# read as checkequal(expected, obj, method_name, *args) -- confirm.
def test_find_with_memory(self):
371+
# Test the "Skip with memory" path in the two-way algorithm.
372+
for N in 1000, 3000, 10_000, 30_000:
373+
# The needle is 2*N characters of strictly alternating 'a'/'b'.
needle = 'ab' * N
374+
# Each half is N-1 'ab' pairs plus an extra 'b'.  The extra 'b'
# creates a "bb" that breaks the alternation, so the longest
# alternating run is 2*N - 1 characters: one short of the needle.
haystack = ('ab'*(N-1) + 'b') * 2
375+
self.checkequal(-1, haystack, 'find', needle)
376+
self.checkequal(0, haystack, 'count', needle)
377+
# With the needle appended, the only place it fits is the appended
# copy itself, which starts at len(haystack).
self.checkequal(len(haystack), haystack + needle, 'find', needle)
378+
self.checkequal(1, haystack + needle, 'count', needle)
379+
344380
def test_find_shift_table_overflow(self):
345381
"""When the table of 8-bit shifts overflows."""
346382
N = 2**8 + 100
@@ -715,6 +751,18 @@ def test_replace(self):
715751
self.checkraises(TypeError, 'hello', 'replace', 42, 'h')
716752
self.checkraises(TypeError, 'hello', 'replace', 'h', 42)
717753

754+
# Check replace() with an explicit count on a long, repetitive subject.
# NOTE(review): checkequal is defined outside this view; the calls below
# read as checkequal(expected, obj, method_name, *args) -- confirm.
def test_replace_uses_two_way_maxcount(self):
755+
# Test that maxcount works in _two_way_count in fastsearch.h
756+
A, B = "A"*1000, "B"*1000
757+
AABAA = A + A + B + A + A
758+
ABBA = A + B + B + A
759+
# ABBA needs a run of 2000 'B' characters; AABAA has only 1000, so the
# subject AABAA + ABBA contains ABBA exactly once, at the very end.
# A count of 0 must leave the subject unchanged.
self.checkequal(AABAA + ABBA,
760+
AABAA + ABBA, 'replace', ABBA, "ccc", 0)
761+
# A count of 1 replaces that single occurrence.
self.checkequal(AABAA + "ccc",
762+
AABAA + ABBA, 'replace', ABBA, "ccc", 1)
763+
# A count of 2 exceeds the number of occurrences; the result is the
# same as for a count of 1.
self.checkequal(AABAA + "ccc",
764+
AABAA + ABBA, 'replace', ABBA, "ccc", 2)
765+
718766
@unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
719767
'only applies to 32-bit platforms')
720768
def test_replace_overflow(self):

Objects/stringlib/fastsearch.h

+4 -3
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,8 @@
1818
algorithm, which has worst-case O(n) runtime and best-case O(n/k).
1919
Also compute a table of shifts to achieve O(n/k) in more cases,
2020
and often (data dependent) deduce larger shifts than pure C&P can
21-
deduce. */
21+
deduce. See stringlib_find_two_way_notes.txt in this folder for a
22+
detailed explanation. */
2223

2324
#define FAST_COUNT 0
2425
#define FAST_SEARCH 1
@@ -398,7 +399,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
398399
if (window_last >= haystack_end) {
399400
return -1;
400401
}
401-
LOG("Horspool skip");
402+
LOG("Horspool skip\n");
402403
}
403404
no_shift:
404405
window = window_last - len_needle + 1;
@@ -457,7 +458,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
457458
if (window_last >= haystack_end) {
458459
return -1;
459460
}
460-
LOG("Horspool skip");
461+
LOG("Horspool skip\n");
461462
}
462463
window = window_last - len_needle + 1;
463464
assert((window[len_needle - 1] & TABLE_MASK) ==

Objects/stringlib/stringlib_find_two_way_notes.txt

+2 -2
Original file line number | Diff line number | Diff line change
@@ -239,7 +239,7 @@ We cut as AA + bAAbAAbA, and then the algorithm runs as follows:
239239
~~ AA != bA at the cut
240240
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
241241
AAbAAbAAbA
242-
^^^^X 7-3=4 match, and the 5th misses.
242+
^^^^X 7-3=4 match, and the 5th misses.
243243
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
244244
AAbAAbAAbA
245245
~ A != b at the cut
@@ -395,7 +395,7 @@ of their proof goes something like this (this is far from complete):
395395
needle == (a + w) + (w + b), meaning there's a bad equality
396396
w == w, it's impossible for w + b to be bigger than both
397397
b and w + w + b, so this can't happen. We thus have all of
398-
the ineuqalities with no question marks.
398+
the inequalities with no question marks.
399399
* By maximality, the right part is not a substring of the left
400400
part. Thus, we have all of the inequalities involving no
401401
left-side question marks.

0 commit comments

Comments
 (0)