From e4a6160188e56dced283af031d5c84a49914e0b6 Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Thu, 6 Apr 2023 23:04:30 -0500 Subject: [PATCH 1/2] Add failing firstRange(of: string) test --- Tests/RegexTests/AlgorithmsTests.swift | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index d5419fe9c..c0044acc3 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -165,8 +165,12 @@ class AlgorithmTests: XCTestCase { let actualCol: [Range] = input.ranges(of: pattern)[...].map(input.offsets(of:)) XCTAssertEqual(actualCol, expected, file: file, line: line) - let firstRange = input.firstRange(of: pattern).map(input.offsets(of:)) - XCTAssertEqual(firstRange, expected.first, file: file, line: line) + let firstRange = input.firstRange(of: pattern) + XCTAssertEqual(firstRange.map(input.offsets(of:)), expected.first, file: file, line: line) + if let fr = firstRange, !pattern.isEmpty { + let secondRange = input[fr.upperBound...].firstRange(of: pattern).map(input.offsets(of:)) + XCTAssertEqual(secondRange, expected.dropFirst().first, file: file, line: line) + } } expectRanges("", "", [0..<0]) @@ -176,6 +180,8 @@ class AlgorithmTests: XCTestCase { expectRanges("abcde", "bcd", [1..<4]) expectRanges("ababacabababa", "abababa", [6..<13]) expectRanges("ababacabababa", "aba", [0..<3, 6..<9, 10..<13]) + + expectRanges("ADACBADADACBADACB", "ADACB", [0..<5, 7..<12, 12..<17]) } // rdar://105154010 From e637fe3b73d183d5d7d5ec4e32ece36243f0f21a Mon Sep 17 00:00:00 2001 From: Nate Cook Date: Tue, 11 Apr 2023 12:02:57 -0500 Subject: [PATCH 2/2] Remove TwoWaySearcher Calls to `ranges(of:)` and `firstRange(of:)` with a string parameter actually use two different string searching algorithms. `ranges(of:)` uses the "z-searcher" algorithm, while `firstRange(of:)` uses a two-way search. Since it's better to align on a single path for these searches, the z-searcher has lower requirements, and the two-way search implementation has a correctness bug, this change removes the two-way search algorithm and uses z-search for `firstRange(of:)`. The correctness bug in `firstRange(of:)` appears only when searching for the second (or later) occurrence of a substring, which you have to be fairly deliberate about. In the example below, the substring at offsets `7..<12` is missed: let text = "ADACBADADACBADACB" // ===== -----===== let pattern = "ADACB" let firstRange = text.firstRange(of: pattern)! // firstRange ~= 0..<5 let secondRange = text[firstRange.upperBound...].firstRange(of: pattern)! // secondRange ~= 12..<17 This change also removes some unrelated, unused code in Split.swift, in addition to removing an (unused) usage of `TwoWaySearcher`. rdar://92794248 --- .../Algorithms/Algorithms/FirstRange.swift | 7 +- .../Algorithms/Algorithms/Split.swift | 60 ------ .../Algorithms/Searchers/TwoWaySearcher.swift | 197 ------------------ Sources/_StringProcessing/CMakeLists.txt | 1 - Tests/RegexTests/AlgorithmsTests.swift | 5 +- 5 files changed, 5 insertions(+), 265 deletions(-) delete mode 100644 Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift index 484ff4648..d0fb8673d 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift @@ -50,11 +50,8 @@ extension BidirectionalCollection where Element: Comparable { public func firstRange( of other: C ) -> Range? where C.Element == Element { - let searcher = PatternOrEmpty( - searcher: TwoWaySearcher(pattern: Array(other))) - let slice = self[...] - var state = searcher.state(for: slice, in: startIndex..(pattern: Array(other), by: ==) + return searcher.search(self[...], in: startIndex.. Bool, - maxSplits: Int, - omittingEmptySubsequences: Bool - ) -> SplitCollection> { - split(by: PredicateConsumer(predicate: predicate), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) - } -} - -// MARK: Single element algorithms - -extension Collection where Element: Equatable { - func split( - by separator: Element, - maxSplits: Int, - omittingEmptySubsequences: Bool - ) -> SplitCollection> { - split(whereSeparator: { $0 == separator }, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) - } -} - // MARK: Fixed pattern algorithms extension Collection where Element: Equatable { @@ -180,41 +155,6 @@ extension Collection where Element: Equatable { } } -extension BidirectionalCollection where Element: Equatable { - // FIXME -// public func splitFromBack( -// separator: S -// ) -> ReversedSplitCollection> -// where S.Element == Element -// { -// splitFromBack(separator: ZSearcher(pattern: Array(separator), by: ==)) -// } -} - -extension BidirectionalCollection where Element: Comparable { - func split( - by separator: C, - maxSplits: Int, - omittingEmptySubsequences: Bool - ) -> SplitCollection>> - where C.Element == Element - { - split( - by: PatternOrEmpty(searcher: TwoWaySearcher(pattern: Array(separator))), - maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) - } - - // FIXME -// public func splitFromBack( -// separator: S -// ) -> ReversedSplitCollection>> -// where S.Element == Element -// { -// splitFromBack(separator: PatternOrEmpty( -// searcher: TwoWaySearcher(pattern: Array(separator)))) -// } -} - // String split overload breakers // // These are underscored and marked as SPI so that the *actual* public overloads diff --git a/Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift b/Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift deleted file mode 100644 index 2428f89cd..000000000 --- a/Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift +++ /dev/null @@ -1,197 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -extension BidirectionalCollection where Element: Equatable { - fileprivate func _ends(with suffix: C) -> Bool - where C.Element == Element - { - FixedPatternConsumer(pattern: suffix).consumingBack(self[...]) != nil - } - } - -struct TwoWaySearcher - where Searched.Element: Comparable -{ - // TODO: Be generic over the pattern? - let pattern: [Searched.Element] - let criticalIndex: Int - let period: Int - let periodIsExact: Bool - - init?(pattern: [Searched.Element]) { - guard !pattern.isEmpty else { return nil } - - let (criticalIndex, periodOfSecondPart) = pattern._criticalFactorization(<) - let periodIsExact = pattern[criticalIndex...] - .prefix(periodOfSecondPart) - ._ends(with: pattern[.. - ) -> State { - // FIXME: Is this 'limitedBy' requirement a sign of error? - let criticalIndex = searched.index( - range.lowerBound, offsetBy: criticalIndex, limitedBy: range.upperBound) - ?? range.upperBound - return State( - end: range.upperBound, - index: range.lowerBound, - criticalIndex: - criticalIndex, - memory: nil) - } - - func search( - _ searched: Searched, - _ state: inout State - ) -> Range? { - while state.criticalIndex != searched.endIndex { - if let end = _searchRight(searched, &state), - let start = _searchLeft(searched, &state, end) - { - state.index = end - // FIXME: Is this 'limitedBy' requirement a sign of error? - state.criticalIndex = searched.index( - end, offsetBy: criticalIndex, limitedBy: searched.endIndex) - ?? searched.endIndex - state.memory = nil - return start.. Searched.Index? { - let rStart: Int - var rIndex: Searched.Index - - if let memory = state.memory, memory.offset > criticalIndex { - rStart = memory.offset - rIndex = memory.index - } else { - rStart = criticalIndex - rIndex = state.criticalIndex - } - - for i in rStart.. Searched.Index? { - let lStart = min(state.memory?.offset ?? 0, criticalIndex) - var lIndex = state.criticalIndex - - for i in (lStart.. Bool - ) -> (index: Int, periodOfSecondPart: Int) { - let less = _maximalSuffix(isOrderedBefore) - let greater = _maximalSuffix({ isOrderedBefore($1, $0) }) - return less.index > greater.index ? less : greater - } - - func _maximalSuffix( - _ isOrderedBefore: (Element, Element) -> Bool - ) -> (index: Int, periodOfSecondPart: Int) { - var left = 0 - var right = 1 - var offset = 0 - var period = 1 - - while right + offset < count { - let a = self[right + offset] - let b = self[left + offset] - - if isOrderedBefore(a, b) { - // Suffix is smaller, period is entire prefix so far. - right += offset + 1 - offset = 0 - period = right - left - } else if isOrderedBefore(b, a) { - // Suffix is larger, start over from current location. - left = right - right += 1 - offset = 0 - period = 1 - } else { - // Advance through repetition of the current period. - offset += 1 - if offset + 1 == period { - right += offset + 1 - offset = 0 - } else { - offset += 1 - } - } - } - - return (left, period) - } -} diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt index ef4aeb6ef..f7ccd7ab9 100644 --- a/Sources/_StringProcessing/CMakeLists.txt +++ b/Sources/_StringProcessing/CMakeLists.txt @@ -23,7 +23,6 @@ add_library(_StringProcessing Algorithms/Searchers/NaivePatternSearcher.swift Algorithms/Searchers/PatternOrEmpty.swift Algorithms/Searchers/PredicateSearcher.swift - Algorithms/Searchers/TwoWaySearcher.swift Algorithms/Searchers/ZSearcher.swift Engine/Backtracking.swift Engine/Consume.swift diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index c0044acc3..60548a0a2 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -167,8 +167,8 @@ class AlgorithmTests: XCTestCase { let firstRange = input.firstRange(of: pattern) XCTAssertEqual(firstRange.map(input.offsets(of:)), expected.first, file: file, line: line) - if let fr = firstRange, !pattern.isEmpty { - let secondRange = input[fr.upperBound...].firstRange(of: pattern).map(input.offsets(of:)) + if let upperBound = firstRange?.upperBound, !pattern.isEmpty { + let secondRange = input[upperBound...].firstRange(of: pattern).map(input.offsets(of:)) XCTAssertEqual(secondRange, expected.dropFirst().first, file: file, line: line) } } @@ -181,6 +181,7 @@ class AlgorithmTests: XCTestCase { expectRanges("ababacabababa", "abababa", [6..<13]) expectRanges("ababacabababa", "aba", [0..<3, 6..<9, 10..<13]) + // Test for rdar://92794248 expectRanges("ADACBADADACBADACB", "ADACB", [0..<5, 7..<12, 12..<17]) }