python · sweeneyde · Sep 13, 2022 · Sep 12, 2022
diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py
@@ -341,6 +341,42 @@ def reference_find(p, s):
                 self.checkequal(reference_find(p, text),
                                 text, 'find', p)
 
+    def test_find_many_lengths(self):
+        haystack_repeats = [a * 10**e for e in range(6) for a in (1,2,5)]
+        haystacks = [(n, self.fixtype("abcab"*n + "da")) for n in haystack_repeats]
+
+        needle_repeats = [a * 10**e for e in range(6) for a in (1, 3)]
+        needles = [(m, self.fixtype("abcab"*m + "da")) for m in needle_repeats]
+
+        for n, haystack1 in haystacks:
+            haystack2 = haystack1[:-1]
+            for m, needle in needles:
+                answer1 = 5 * (n - m) if m <= n else -1
+                self.assertEqual(haystack1.find(needle), answer1, msg=(n,m))
+                self.assertEqual(haystack2.find(needle), -1, msg=(n,m))
+
+    def test_adaptive_find(self):
+        # This would be very slow for the naive algorithm,
+        # but str.find() should be O(n + m).
+        for N in 1000, 10_000, 100_000, 1_000_000:
+            A, B = 'a' * N, 'b' * N
+            haystack = A + A + B + A + A
+            needle = A + B + B + A
+            self.checkequal(-1, haystack, 'find', needle)
+            self.checkequal(0, haystack, 'count', needle)
+            self.checkequal(len(haystack), haystack + needle, 'find', needle)
+            self.checkequal(1, haystack + needle, 'count', needle)
+
+    def test_find_with_memory(self):
+        # Test the "Skip with memory" path in the two-way algorithm.
+        for N in 1000, 3000, 10_000, 30_000:
+            needle = 'ab' * N
+            haystack = ('ab'*(N-1) + 'b') * 2
+            self.checkequal(-1, haystack, 'find', needle)
+            self.checkequal(0, haystack, 'count', needle)
+            self.checkequal(len(haystack), haystack + needle, 'find', needle)
+            self.checkequal(1, haystack + needle, 'count', needle)
+
     def test_find_shift_table_overflow(self):
         """When the table of 8-bit shifts overflows."""
         N = 2**8 + 100
@@ -715,6 +751,18 @@ def test_replace(self):
         self.checkraises(TypeError, 'hello', 'replace', 42, 'h')
         self.checkraises(TypeError, 'hello', 'replace', 'h', 42)
 
+    def test_replace_uses_two_way_maxcount(self):
+        # Test that maxcount works in _two_way_count in fastsearch.h
+        A, B = "A"*1000, "B"*1000
+        AABAA = A + A + B + A + A
+        ABBA = A + B + B + A
+        self.checkequal(AABAA + ABBA,
+                        AABAA + ABBA, 'replace', ABBA, "ccc", 0)
+        self.checkequal(AABAA + "ccc",
+                        AABAA + ABBA, 'replace', ABBA, "ccc", 1)
+        self.checkequal(AABAA + "ccc",
+                        AABAA + ABBA, 'replace', ABBA, "ccc", 2)
+
     @unittest.skipIf(sys.maxsize > (1 << 32) or struct.calcsize('P') != 4,
                      'only applies to 32-bit platforms')
     def test_replace_overflow(self):

diff --git a/Objects/stringlib/fastsearch.h b/Objects/stringlib/fastsearch.h
@@ -18,7 +18,8 @@
    algorithm, which has worst-case O(n) runtime and best-case O(n/k).
    Also compute a table of shifts to achieve O(n/k) in more cases,
    and often (data dependent) deduce larger shifts than pure C&P can
-   deduce. */
+   deduce. See stringlib_find_two_way_notes.txt in this folder for a
+   detailed explanation. */
 
 #define FAST_COUNT 0
 #define FAST_SEARCH 1
@@ -398,7 +399,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
                 if (window_last >= haystack_end) {
                     return -1;
                 }
-                LOG("Horspool skip");
+                LOG("Horspool skip\n");
             }
           no_shift:
             window = window_last - len_needle + 1;
@@ -457,7 +458,7 @@ STRINGLIB(_two_way)(const STRINGLIB_CHAR *haystack, Py_ssize_t len_haystack,
                 if (window_last >= haystack_end) {
                     return -1;
                 }
-                LOG("Horspool skip");
+                LOG("Horspool skip\n");
             }
             window = window_last - len_needle + 1;
             assert((window[len_needle - 1] & TABLE_MASK) ==

diff --git a/Objects/stringlib/stringlib_find_two_way_notes.txt b/Objects/stringlib/stringlib_find_two_way_notes.txt
@@ -239,7 +239,7 @@ We cut as AA + bAAbAAbA, and then the algorithm runs as follows:
                 ~~                  AA != bA at the cut
         bbbAbbAAbAAbAAbbbAAbAAbAAbAA
                  AAbAAbAAbA
-                 ^^^^X              7-3=4 match, and the 5th misses.
+                   ^^^^X            7-3=4 match, and the 5th misses.
         bbbAbbAAbAAbAAbbbAAbAAbAAbAA
                   AAbAAbAAbA
                    ~                A != b at the cut
@@ -395,7 +395,7 @@ of their proof goes something like this (this is far from complete):
       needle == (a + w) + (w + b), meaning there's a bad equality
       w == w, it's impossible for w + b to be bigger than both
       b and w + w + b, so this can't happen. We thus have all of
-      the ineuqalities with no question marks.
+      the inequalities with no question marks.
     * By maximality, the right part is not a substring of the left
       part. Thus, we have all of the inequalities involving no
       left-side question marks.