Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-94808: Improve coverage of fastsearch.h #96760

Merged
merged 1 commit into from
Sep 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions Lib/test/string_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,42 @@ def reference_find(p, s):
self.checkequal(reference_find(p, text),
text, 'find', p)

def test_find_many_lengths(self):
    # Exercise find() over a grid of haystack and needle lengths.
    # Haystack and needle are both "abcab" repeated some number of
    # times followed by "da"; since "da" occurs only at the very end
    # of the haystack, a needle can only match flush against that end.
    def scaled(multipliers):
        # Repeat counts k * 10**e for e in 0..5, grouped by exponent.
        return [k * 10**e for e in range(6) for k in multipliers]

    def build(repeats):
        return self.fixtype("abcab"*repeats + "da")

    haystacks = [(n, build(n)) for n in scaled((1, 2, 5))]
    needles = [(m, build(m)) for m in scaled((1, 3))]

    for n, full in haystacks:
        # Without its last character the haystack no longer ends in
        # "da", so no needle is expected to be found in it.
        truncated = full[:-1]
        for m, needle in needles:
            if m <= n:
                expected = 5 * (n - m)
            else:
                expected = -1
            label = (n, m)
            self.assertEqual(full.find(needle), expected, msg=label)
            self.assertEqual(truncated.find(needle), -1, msg=label)

def test_adaptive_find(self):
    # This would be very slow for the naive algorithm,
    # but str.find() should be O(n + m).
    for size in (1000, 10_000, 100_000, 1_000_000):
        run_a = 'a' * size
        run_b = 'b' * size
        haystack = run_a + run_a + run_b + run_a + run_a
        needle = run_a + run_b + run_b + run_a

        # The haystack holds only a single run of b's, the needle
        # needs two back to back, so there is no match.
        self.checkequal(-1, haystack, 'find', needle)
        self.checkequal(0, haystack, 'count', needle)

        # Appending the needle creates exactly one match, starting
        # where the original haystack ends.
        combined = haystack + needle
        self.checkequal(len(haystack), combined, 'find', needle)
        self.checkequal(1, combined, 'count', needle)

def test_find_with_memory(self):
    # Test the "Skip with memory" path in the two-way algorithm.
    for repeats in (1000, 3000, 10_000, 30_000):
        needle = 'ab' * repeats
        # Each half is one "ab" short of the needle, then a stray 'b'.
        half = 'ab'*(repeats-1) + 'b'
        haystack = half + half

        self.checkequal(-1, haystack, 'find', needle)
        self.checkequal(0, haystack, 'count', needle)

        # With the needle appended, the only match is expected at the
        # end of the original haystack.
        combined = haystack + needle
        self.checkequal(len(haystack), combined, 'find', needle)
        self.checkequal(1, combined, 'count', needle)

def test_find_shift_table_overflow(self):
"""When the table of 8-bit shifts overflows."""
N = 2**8 + 100
Expand Down Expand Up @@ -715,6 +751,18 @@ def test_replace(self):
self.checkraises(TypeError, 'hello', 'replace', 42, 'h')
self.checkraises(TypeError, 'hello', 'replace', 'h', 42)

def test_replace_uses_two_way_maxcount(self):
    # Test that maxcount works in _two_way_count in fastsearch.h
    run_a = "A" * 1000
    run_b = "B" * 1000
    prefix = run_a + run_a + run_b + run_a + run_a
    pattern = run_a + run_b + run_b + run_a
    subject = prefix + pattern
    replaced = prefix + "ccc"

    # The pattern occurs once in the subject: maxcount 0 leaves the
    # string untouched, while 1 and 2 both replace that occurrence.
    cases = (
        (0, subject),
        (1, replaced),
        (2, replaced),
    )
    for maxcount, expected in cases:
        self.checkequal(expected,
                        subject, 'replace', pattern, "ccc", maxcount)

@unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
'only applies to 32-bit platforms')
def test_replace_overflow(self):
Expand Down
7 changes: 4 additions & 3 deletions Objects/stringlib/fastsearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
algorithm, which has worst-case O(n) runtime and best-case O(n/k).
Also compute a table of shifts to achieve O(n/k) in more cases,
and often (data dependent) deduce larger shifts than pure C&P can
deduce. */
deduce. See stringlib_find_two_way_notes.txt in this folder for a
detailed explanation. */

#define FAST_COUNT 0
#define FAST_SEARCH 1
Expand Down Expand Up @@ -398,7 +399,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
if (window_last >= haystack_end) {
return -1;
}
LOG("Horspool skip");
LOG("Horspool skip\n");
}
no_shift:
window = window_last - len_needle + 1;
Expand Down Expand Up @@ -457,7 +458,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
if (window_last >= haystack_end) {
return -1;
}
LOG("Horspool skip");
LOG("Horspool skip\n");
}
window = window_last - len_needle + 1;
assert((window[len_needle - 1] & TABLE_MASK) ==
Expand Down
4 changes: 2 additions & 2 deletions Objects/stringlib/stringlib_find_two_way_notes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ We cut as AA + bAAbAAbA, and then the algorithm runs as follows:
~~ AA != bA at the cut
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
^^^^X 7-3=4 match, and the 5th misses.
^^^^X 7-3=4 match, and the 5th misses.
bbbAbbAAbAAbAAbbbAAbAAbAAbAA
AAbAAbAAbA
~ A != b at the cut
Expand Down Expand Up @@ -395,7 +395,7 @@ of their proof goes something like this (this is far from complete):
needle == (a + w) + (w + b), meaning there's a bad equality
w == w, it's impossible for w + b to be bigger than both
b and w + w + b, so this can't happen. We thus have all of
the ineuqalities with no question marks.
the inequalities with no question marks.
* By maximality, the right part is not a substring of the left
part. Thus, we have all of the inequalities involving no
left-side question marks.
Expand Down