diff --git a/Sources/RegexBenchmark/Benchmark.swift b/Sources/RegexBenchmark/Benchmark.swift index 2622639fb..f807a8e55 100644 --- a/Sources/RegexBenchmark/Benchmark.swift +++ b/Sources/RegexBenchmark/Benchmark.swift @@ -71,6 +71,13 @@ struct NSBenchmark: RegexBenchmark { enum NSMatchType { case allMatches case first + + init(_ type: Benchmark.MatchType) { + switch type { + case .whole, .first: self = .first + case .allMatches: self = .allMatches + } + } } func run() { @@ -126,7 +133,7 @@ struct CrossBenchmark { /// The base name of the benchmark var baseName: String - /// The string to compile in differnet engines + /// The string to compile in different engines var regex: String /// The text to search @@ -143,57 +150,32 @@ struct CrossBenchmark { /// Whether or not to do firstMatch as well or just allMatches var includeFirst: Bool = false + /// Whether to also run scalar-semantic mode + var alsoRunScalarSemantic: Bool = true + func register(_ runner: inout BenchmarkRunner) { - let swiftRegex = try! Regex(regex) - let nsRegex: NSRegularExpression if isWhole { - nsRegex = try! NSRegularExpression(pattern: "^" + regex + "$") + runner.registerCrossBenchmark( + nameBase: baseName, + input: input, + pattern: regex, + .whole, + alsoRunScalarSemantic: alsoRunScalarSemantic) } else { - nsRegex = try! 
NSRegularExpression(pattern: regex) - } + runner.registerCrossBenchmark( + nameBase: baseName, + input: input, + pattern: regex, + .allMatches, + alsoRunScalarSemantic: alsoRunScalarSemantic) - if isWhole { - runner.register( - Benchmark( - name: baseName + "Whole", - regex: swiftRegex, - pattern: regex, - type: .whole, - target: input)) - runner.register( - NSBenchmark( - name: baseName + "Whole" + CrossBenchmark.nsSuffix, - regex: nsRegex, - type: .first, - target: input)) - } else { - runner.register( - Benchmark( - name: baseName + "All", - regex: swiftRegex, - pattern: regex, - type: .allMatches, - target: input)) - runner.register( - NSBenchmark( - name: baseName + "All" + CrossBenchmark.nsSuffix, - regex: nsRegex, - type: .allMatches, - target: input)) if includeFirst || runner.includeFirstOverride { - runner.register( - Benchmark( - name: baseName + "First", - regex: swiftRegex, - pattern: regex, - type: .first, - target: input)) - runner.register( - NSBenchmark( - name: baseName + "First" + CrossBenchmark.nsSuffix, - regex: nsRegex, - type: .first, - target: input)) + runner.registerCrossBenchmark( + nameBase: baseName, + input: input, + pattern: regex, + .first, + alsoRunScalarSemantic: alsoRunScalarSemantic) } } } @@ -209,20 +191,16 @@ struct CrossInputListBenchmark { /// The list of strings to search var inputs: [String] + + /// Also run in scalar-semantic mode + var alsoRunScalarSemantic: Bool = true func register(_ runner: inout BenchmarkRunner) { - let swiftRegex = try! 
Regex(regex) - runner.register(InputListBenchmark( + runner.registerCrossBenchmark( name: baseName, - regex: swiftRegex, + inputList: inputs, pattern: regex, - targets: inputs - )) - runner.register(InputListNSBenchmark( - name: baseName + CrossBenchmark.nsSuffix, - regex: regex, - targets: inputs - )) + alsoRunScalarSemantic: alsoRunScalarSemantic) } } diff --git a/Sources/RegexBenchmark/BenchmarkRunner.swift b/Sources/RegexBenchmark/BenchmarkRunner.swift index 641c03224..b067b9679 100644 --- a/Sources/RegexBenchmark/BenchmarkRunner.swift +++ b/Sources/RegexBenchmark/BenchmarkRunner.swift @@ -4,6 +4,16 @@ import Foundation /// The number of times to re-run the benchmark if results are too varying private var rerunCount: Int { 3 } +extension Benchmark.MatchType { + fileprivate var nameSuffix: String { + switch self { + case .whole: return "_Whole" + case .first: return "_First" + case .allMatches: return "_All" + } + } +} + struct BenchmarkRunner { let suiteName: String var suite: [any RegexBenchmark] = [] @@ -16,12 +26,141 @@ struct BenchmarkRunner { // Forcibly include firstMatch benchmarks for all CrossBenchmarks let includeFirstOverride: Bool + + // Register a cross-benchmark + mutating func registerCrossBenchmark( + nameBase: String, + input: String, + pattern: String, + _ type: Benchmark.MatchType, + alsoRunScalarSemantic: Bool = true + ) { + let swiftRegex = try! Regex(pattern) + let nsRegex: NSRegularExpression + if type == .whole { + nsRegex = try! NSRegularExpression(pattern: "^" + pattern + "$") + } else { + nsRegex = try! 
NSRegularExpression(pattern: pattern) + } + let nameSuffix = type.nameSuffix + + register( + Benchmark( + name: nameBase + nameSuffix, + regex: swiftRegex, + pattern: pattern, + type: type, + target: input)) + register( + NSBenchmark( + name: nameBase + nameSuffix + CrossBenchmark.nsSuffix, + regex: nsRegex, + type: .init(type), + target: input)) + + if alsoRunScalarSemantic { + register( + Benchmark( + name: nameBase + nameSuffix + "_Scalar", + regex: swiftRegex.matchingSemantics(.unicodeScalar), + pattern: pattern, + type: type, + target: input)) + register( + NSBenchmark( + name: nameBase + nameSuffix + "_Scalar" + CrossBenchmark.nsSuffix, + regex: nsRegex, + type: .init(type), + target: input)) + } + } + + // Register a cross-benchmark list + mutating func registerCrossBenchmark( + name: String, + inputList: [String], + pattern: String, + alsoRunScalarSemantic: Bool = true + ) { + let swiftRegex = try! Regex(pattern) + register(InputListBenchmark( + name: name, + regex: swiftRegex, + pattern: pattern, + targets: inputList + )) + register(InputListNSBenchmark( + name: name + CrossBenchmark.nsSuffix, + regex: pattern, + targets: inputList + )) + + if alsoRunScalarSemantic { + register(InputListBenchmark( + name: name + "_Scalar", + regex: swiftRegex.matchingSemantics(.unicodeScalar), + pattern: pattern, + targets: inputList + )) + register(InputListNSBenchmark( + name: name + "_Scalar" + CrossBenchmark.nsSuffix, + regex: pattern, + targets: inputList + )) + } + + } + + // Register a swift-only benchmark + mutating func register( + nameBase: String, + input: String, + pattern: String, + _ swiftRegex: Regex, + _ type: Benchmark.MatchType, + alsoRunScalarSemantic: Bool = true + ) { + let nameSuffix = type.nameSuffix + + register( + Benchmark( + name: nameBase + nameSuffix, + regex: swiftRegex, + pattern: pattern, + type: type, + target: input)) + + if alsoRunScalarSemantic { + register( + Benchmark( + name: nameBase + nameSuffix + "_Scalar", + regex: swiftRegex, + 
pattern: pattern, + type: type, + target: input)) + } + } - mutating func register(_ benchmark: some RegexBenchmark) { + private mutating func register(_ benchmark: NSBenchmark) { suite.append(benchmark) } - mutating func register(_ benchmark: some SwiftRegexBenchmark) { + private mutating func register(_ benchmark: Benchmark) { + var benchmark = benchmark + if enableTracing { + benchmark.enableTracing() + } + if enableMetrics { + benchmark.enableMetrics() + } + suite.append(benchmark) + } + + private mutating func register(_ benchmark: InputListNSBenchmark) { + suite.append(benchmark) + } + + private mutating func register(_ benchmark: InputListBenchmark) { var benchmark = benchmark if enableTracing { benchmark.enableTracing() diff --git a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift index 61d7b197f..27b2b07b4 100644 --- a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift +++ b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift @@ -12,53 +12,55 @@ extension BenchmarkRunner { let input = Inputs.graphemeBreakData - register(Benchmark( - name: "BasicCCC", - regex: try! Regex(basic), + // TODO: Which of these can be cross-benchmarks? + + register( + nameBase: "BasicCCC", + input: input, pattern: basic, - type: .allMatches, - target: input)) + try! Regex(basic), + .allMatches) - register(Benchmark( - name: "BasicRangeCCC", - regex: try! Regex(basicRange), + register( + nameBase: "BasicRangeCCC", + input: input, pattern: basicRange, - type: .allMatches, - target: input)) + try! Regex(basicRange), + .allMatches) - register(Benchmark( - name: "CaseInsensitiveCCC", - regex: try! Regex(caseInsensitive), + register( + nameBase: "CaseInsensitiveCCC", + input: input, pattern: caseInsensitive, - type: .allMatches, - target: input)) + try! Regex(caseInsensitive), + .allMatches) - register(Benchmark( - name: "InvertedCCC", - regex: try! 
Regex(inverted), + register( + nameBase: "InvertedCCC", + input: input, pattern: inverted, - type: .allMatches, - target: input)) + try! Regex(inverted), + .allMatches) - register(Benchmark( - name: "SubtractionCCC", - regex: try! Regex(subtraction), + register( + nameBase: "SubtractionCCC", + input: input, pattern: subtraction, - type: .allMatches, - target: input)) + try! Regex(subtraction), + .allMatches) - register(Benchmark( - name: "IntersectionCCC", - regex: try! Regex(intersection), + register( + nameBase: "IntersectionCCC", + input: input, pattern: intersection, - type: .allMatches, - target: input)) + try! Regex(intersection), + .allMatches) - register(Benchmark( - name: "symDiffCCC", - regex: try! Regex(symmetricDifference), + register( + nameBase: "symDiffCCC", + input: input, pattern: symmetricDifference, - type: .allMatches, - target: input)) + try! Regex(symmetricDifference), + .allMatches) } } diff --git a/Sources/RegexBenchmark/Utils/Stats.swift b/Sources/RegexBenchmark/Utils/Stats.swift index 0cc9156a4..175826a0b 100644 --- a/Sources/RegexBenchmark/Utils/Stats.swift +++ b/Sources/RegexBenchmark/Utils/Stats.swift @@ -3,8 +3,8 @@ import Foundation enum Stats {} extension Stats { - // Maximum allowed standard deviation is 5% of the median runtime - static let maxAllowedStdev = 0.05 + // Maximum allowed standard deviation is 7.5% of the median runtime + static let maxAllowedStdev = 0.075 static func tTest(_ a: Measurement, _ b: Measurement) -> Bool { // Student's t-test diff --git a/Sources/_RegexParser/Utility/TypeConstruction.swift b/Sources/_RegexParser/Utility/TypeConstruction.swift index 4d1765e34..54a7c9263 100644 --- a/Sources/_RegexParser/Utility/TypeConstruction.swift +++ b/Sources/_RegexParser/Utility/TypeConstruction.swift @@ -107,15 +107,15 @@ public enum TypeConstruction { var currentElementAddressUnaligned = UnsafeMutableRawPointer(baseAddress) for element in elements { // Open existential on each element type. 
- func initializeElement(_ element: T) { + func initializeElement(_ element: U) { currentElementAddressUnaligned = - currentElementAddressUnaligned.roundedUp(toAlignmentOf: T.self) + currentElementAddressUnaligned.roundedUp(toAlignmentOf: U.self) currentElementAddressUnaligned.bindMemory( - to: T.self, capacity: MemoryLayout.size + to: U.self, capacity: MemoryLayout.size ).initialize(to: element) // Advance to the next element (unaligned). currentElementAddressUnaligned = - currentElementAddressUnaligned.advanced(by: MemoryLayout.size) + currentElementAddressUnaligned.advanced(by: MemoryLayout.size) } _openExistential(element, do: initializeElement) } @@ -175,8 +175,8 @@ extension MemoryLayout { if byteOffset == 0 { return 0 } var currentOffset = 0 for (index, type) in elementTypes.enumerated() { - func sizeAndAlignMask(_: T.Type) -> (Int, Int) { - (MemoryLayout.size, MemoryLayout.alignment - 1) + func sizeAndAlignMask(_: U.Type) -> (Int, Int) { + (MemoryLayout.size, MemoryLayout.alignment - 1) } // The ABI of an offset-based key path only stores the byte offset, so // this doesn't work if there's a 0-sized element, e.g. `Void`, diff --git a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift index 484ff4648..d0fb8673d 100644 --- a/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift +++ b/Sources/_StringProcessing/Algorithms/Algorithms/FirstRange.swift @@ -50,11 +50,8 @@ extension BidirectionalCollection where Element: Comparable { public func firstRange( of other: C ) -> Range? where C.Element == Element { - let searcher = PatternOrEmpty( - searcher: TwoWaySearcher(pattern: Array(other))) - let slice = self[...] - var state = searcher.state(for: slice, in: startIndex..(pattern: Array(other), by: ==) + return searcher.search(self[...], in: startIndex.. 
Bool, - maxSplits: Int, - omittingEmptySubsequences: Bool - ) -> SplitCollection> { - split(by: PredicateConsumer(predicate: predicate), maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) - } -} - -// MARK: Single element algorithms - -extension Collection where Element: Equatable { - func split( - by separator: Element, - maxSplits: Int, - omittingEmptySubsequences: Bool - ) -> SplitCollection> { - split(whereSeparator: { $0 == separator }, maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) - } -} - // MARK: Fixed pattern algorithms extension Collection where Element: Equatable { @@ -180,41 +155,6 @@ extension Collection where Element: Equatable { } } -extension BidirectionalCollection where Element: Equatable { - // FIXME -// public func splitFromBack( -// separator: S -// ) -> ReversedSplitCollection> -// where S.Element == Element -// { -// splitFromBack(separator: ZSearcher(pattern: Array(separator), by: ==)) -// } -} - -extension BidirectionalCollection where Element: Comparable { - func split( - by separator: C, - maxSplits: Int, - omittingEmptySubsequences: Bool - ) -> SplitCollection>> - where C.Element == Element - { - split( - by: PatternOrEmpty(searcher: TwoWaySearcher(pattern: Array(separator))), - maxSplits: maxSplits, omittingEmptySubsequences: omittingEmptySubsequences) - } - - // FIXME -// public func splitFromBack( -// separator: S -// ) -> ReversedSplitCollection>> -// where S.Element == Element -// { -// splitFromBack(separator: PatternOrEmpty( -// searcher: TwoWaySearcher(pattern: Array(separator)))) -// } -} - // String split overload breakers // // These are underscored and marked as SPI so that the *actual* public overloads diff --git a/Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift b/Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift deleted file mode 100644 index 2428f89cd..000000000 --- 
a/Sources/_StringProcessing/Algorithms/Searchers/TwoWaySearcher.swift +++ /dev/null @@ -1,197 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// This source file is part of the Swift.org open source project -// -// Copyright (c) 2021-2022 Apple Inc. and the Swift project authors -// Licensed under Apache License v2.0 with Runtime Library Exception -// -// See https://swift.org/LICENSE.txt for license information -// -//===----------------------------------------------------------------------===// - -extension BidirectionalCollection where Element: Equatable { - fileprivate func _ends(with suffix: C) -> Bool - where C.Element == Element - { - FixedPatternConsumer(pattern: suffix).consumingBack(self[...]) != nil - } - } - -struct TwoWaySearcher - where Searched.Element: Comparable -{ - // TODO: Be generic over the pattern? - let pattern: [Searched.Element] - let criticalIndex: Int - let period: Int - let periodIsExact: Bool - - init?(pattern: [Searched.Element]) { - guard !pattern.isEmpty else { return nil } - - let (criticalIndex, periodOfSecondPart) = pattern._criticalFactorization(<) - let periodIsExact = pattern[criticalIndex...] - .prefix(periodOfSecondPart) - ._ends(with: pattern[.. - ) -> State { - // FIXME: Is this 'limitedBy' requirement a sign of error? - let criticalIndex = searched.index( - range.lowerBound, offsetBy: criticalIndex, limitedBy: range.upperBound) - ?? range.upperBound - return State( - end: range.upperBound, - index: range.lowerBound, - criticalIndex: - criticalIndex, - memory: nil) - } - - func search( - _ searched: Searched, - _ state: inout State - ) -> Range? { - while state.criticalIndex != searched.endIndex { - if let end = _searchRight(searched, &state), - let start = _searchLeft(searched, &state, end) - { - state.index = end - // FIXME: Is this 'limitedBy' requirement a sign of error? 
- state.criticalIndex = searched.index( - end, offsetBy: criticalIndex, limitedBy: searched.endIndex) - ?? searched.endIndex - state.memory = nil - return start.. Searched.Index? { - let rStart: Int - var rIndex: Searched.Index - - if let memory = state.memory, memory.offset > criticalIndex { - rStart = memory.offset - rIndex = memory.index - } else { - rStart = criticalIndex - rIndex = state.criticalIndex - } - - for i in rStart.. Searched.Index? { - let lStart = min(state.memory?.offset ?? 0, criticalIndex) - var lIndex = state.criticalIndex - - for i in (lStart.. Bool - ) -> (index: Int, periodOfSecondPart: Int) { - let less = _maximalSuffix(isOrderedBefore) - let greater = _maximalSuffix({ isOrderedBefore($1, $0) }) - return less.index > greater.index ? less : greater - } - - func _maximalSuffix( - _ isOrderedBefore: (Element, Element) -> Bool - ) -> (index: Int, periodOfSecondPart: Int) { - var left = 0 - var right = 1 - var offset = 0 - var period = 1 - - while right + offset < count { - let a = self[right + offset] - let b = self[left + offset] - - if isOrderedBefore(a, b) { - // Suffix is smaller, period is entire prefix so far. - right += offset + 1 - offset = 0 - period = right - left - } else if isOrderedBefore(b, a) { - // Suffix is larger, start over from current location. - left = right - right += 1 - offset = 0 - period = 1 - } else { - // Advance through repetition of the current period. 
- offset += 1 - if offset + 1 == period { - right += offset + 1 - offset = 0 - } else { - offset += 1 - } - } - } - - return (left, period) - } -} diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d4c91bd63..00ce0d5f6 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -23,7 +23,9 @@ extension Compiler { var hasEmittedFirstMatchableAtom = false private let compileOptions: _CompileOptions - fileprivate var optimizationsEnabled: Bool { !compileOptions.contains(.disableOptimizations) } + fileprivate var optimizationsEnabled: Bool { + !compileOptions.contains(.disableOptimizations) + } init( options: MatchingOptions, @@ -665,10 +667,10 @@ fileprivate extension Compiler.ByteCodeGen { _ minTrips: Int, _ extraTrips: Int? ) -> Bool { + let isScalarSemantics = options.semanticLevel == .unicodeScalar guard optimizationsEnabled && minTrips <= QuantifyPayload.maxStorableTrips && extraTrips ?? 
0 <= QuantifyPayload.maxStorableTrips - && options.semanticLevel == .graphemeCluster && kind != .reluctant else { return false } @@ -678,7 +680,7 @@ fileprivate extension Compiler.ByteCodeGen { guard let bitset = ccc.asAsciiBitset(options) else { return false } - builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips) + builder.buildQuantify(bitset: bitset, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .atom(let atom): switch atom { @@ -687,17 +689,17 @@ fileprivate extension Compiler.ByteCodeGen { guard let val = c._singleScalarAsciiValue else { return false } - builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips) + builder.buildQuantify(asciiChar: val, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .any: builder.buildQuantifyAny( - matchesNewlines: true, kind, minTrips, extraTrips) + matchesNewlines: true, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .anyNonNewline: builder.buildQuantifyAny( - matchesNewlines: false, kind, minTrips, extraTrips) + matchesNewlines: false, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .dot: builder.buildQuantifyAny( - matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips) + matchesNewlines: options.dotMatchesNewline, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics) case .characterClass(let cc): // Custom character class that consumes a single grapheme @@ -706,7 +708,8 @@ fileprivate extension Compiler.ByteCodeGen { model: model, kind, minTrips, - extraTrips) + extraTrips, + isScalarSemantics: isScalarSemantics) default: return false } diff --git a/Sources/_StringProcessing/CMakeLists.txt b/Sources/_StringProcessing/CMakeLists.txt index ef4aeb6ef..f7ccd7ab9 100644 --- a/Sources/_StringProcessing/CMakeLists.txt +++ b/Sources/_StringProcessing/CMakeLists.txt @@ -23,7 +23,6 @@ add_library(_StringProcessing Algorithms/Searchers/NaivePatternSearcher.swift 
Algorithms/Searchers/PatternOrEmpty.swift Algorithms/Searchers/PredicateSearcher.swift - Algorithms/Searchers/TwoWaySearcher.swift Algorithms/Searchers/ZSearcher.swift Engine/Backtracking.swift Engine/Consume.swift diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 3ebb060c9..48470ce91 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -16,6 +16,11 @@ extension Processor { // Quantifiers may store a range of positions to restore to var rangeStart: Position? var rangeEnd: Position? + + // FIXME: refactor, for now this field is only used for quantifier save + // points. We should try to separate out the concerns better. + var isScalarSemantics: Bool + // The end of the call stack, so we can slice it off // when failing inside a call. // @@ -68,7 +73,11 @@ extension Processor { rangeStart = nil rangeEnd = nil } else { - input.formIndex(before: &rangeEnd!) + if isScalarSemantics { + input.unicodeScalars.formIndex(before: &rangeEnd!) + } else { + input.formIndex(before: &rangeEnd!) + } } } } @@ -82,19 +91,23 @@ extension Processor { pos: addressOnly ? 
nil : currentPosition, rangeStart: nil, rangeEnd: nil, + isScalarSemantics: false, // FIXME: refactor away stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, posRegisters: registers.positions) } - func startQuantifierSavePoint() -> SavePoint { + func startQuantifierSavePoint( + isScalarSemantics: Bool + ) -> SavePoint { // Restores to the instruction AFTER the current quantifier instruction SavePoint( pc: controller.pc + 1, pos: nil, rangeStart: nil, rangeEnd: nil, + isScalarSemantics: isScalarSemantics, stackEnd: .init(callStack.count), captureEnds: storedCaptures, intRegisters: registers.ints, diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index f6d5bfcc7..a0e849851 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -370,6 +370,10 @@ extension Instruction.Payload { } } +// TODO: Consider switching all quantification to a quantification +// instruction, where the general path has an instruction list (i.e. a +// slice of a list) + // MARK: Struct definitions struct QuantifyPayload: RawRepresentable { let rawValue: UInt64 @@ -380,9 +384,12 @@ struct QuantifyPayload: RawRepresentable { case builtin = 4 } + // TODO: figure out how to better organize this... 
+ // Future work: optimize this layout -> payload type should be a fast switch // The top 8 bits are reserved for the opcode so we have 56 bits to work with - // b55-b38 - Unused + // b55-b39 - Unused + // b39-b38 - isScalarSemantics // b38-b35 - Payload type (one of 4 types, stored on 3 bits) // b35-b27 - minTrips (8 bit int) // b27-b18 - extraTrips (8 bit value, one bit for nil) @@ -393,6 +400,7 @@ struct QuantifyPayload: RawRepresentable { static var minTripsShift: UInt64 { 27 } static var typeShift: UInt64 { 35 } static var maxStorableTrips: UInt64 { (1 << 8) - 1 } + static var isScalarSemanticsBit: UInt64 { 1 &<< 38 } var quantKindMask: UInt64 { 3 } var extraTripsMask: UInt64 { 0x1FF } @@ -404,7 +412,8 @@ struct QuantifyPayload: RawRepresentable { _ kind: AST.Quantification.Kind, _ minTrips: Int, _ extraTrips: Int?, - _ type: PayloadType + _ type: PayloadType, + isScalarSemantics: Bool ) -> UInt64 { let kindVal: UInt64 switch kind { @@ -415,11 +424,14 @@ struct QuantifyPayload: RawRepresentable { case .possessive: kindVal = 2 } + // TODO: refactor / reimplement let extraTripsVal: UInt64 = extraTrips == nil ? 1 : UInt64(extraTrips!) << 1 - return (kindVal << QuantifyPayload.quantKindShift) + - (extraTripsVal << QuantifyPayload.extraTripsShift) + - (UInt64(minTrips) << QuantifyPayload.minTripsShift) + - (type.rawValue << QuantifyPayload.typeShift) + let scalarSemanticsBit = isScalarSemantics ? Self.isScalarSemanticsBit : 0 + return (kindVal << QuantifyPayload.quantKindShift) | + (extraTripsVal << QuantifyPayload.extraTripsShift) | + (UInt64(minTrips) << QuantifyPayload.minTripsShift) | + (type.rawValue << QuantifyPayload.typeShift) | + scalarSemanticsBit } init(rawValue: UInt64) { @@ -431,46 +443,49 @@ struct QuantifyPayload: RawRepresentable { bitset: AsciiBitsetRegister, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? 
+ _ extraTrips: Int?, + isScalarSemantics: Bool ) { assert(bitset.bits <= _payloadMask) self.rawValue = bitset.bits - + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .bitset, isScalarSemantics: isScalarSemantics) } init( asciiChar: UInt8, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { self.rawValue = UInt64(asciiChar) - + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .asciiChar, isScalarSemantics: isScalarSemantics) } init( matchesNewlines: Bool, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { self.rawValue = (matchesNewlines ? 1 : 0) - + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .any, isScalarSemantics: isScalarSemantics) } init( model: _CharacterClassModel, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { assert(model.cc.rawValue < 0xFF) - assert(model.matchLevel != .unicodeScalar) let packedModel = model.cc.rawValue + (model.isInverted ? 1 << 9 : 0) + (model.isStrictASCII ? 
1 << 10 : 0) self.rawValue = packedModel - + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin) + + QuantifyPayload.packInfoValues(kind, minTrips, extraTrips, .builtin, isScalarSemantics: isScalarSemantics) } var type: PayloadType { @@ -500,6 +515,10 @@ struct QuantifyPayload: RawRepresentable { } } + var isScalarSemantics: Bool { + rawValue & Self.isScalarSemanticsBit != 0 + } + var bitset: AsciiBitsetRegister { TypedInt(self.rawValue & payloadMask) } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 4b623fbda..93801aeec 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -222,44 +222,48 @@ extension MEProgram.Builder { bitset: DSLTree.CustomCharacterClass.AsciiBitset, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { instructions.append(.init( .quantify, - .init(quantify: .init(bitset: makeAsciiBitset(bitset), kind, minTrips, extraTrips)))) + .init(quantify: .init(bitset: makeAsciiBitset(bitset), kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics)))) } mutating func buildQuantify( asciiChar: UInt8, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { instructions.append(.init( .quantify, - .init(quantify: .init(asciiChar: asciiChar, kind, minTrips, extraTrips)))) + .init(quantify: .init(asciiChar: asciiChar, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics)))) } mutating func buildQuantifyAny( matchesNewlines: Bool, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? 
+ _ extraTrips: Int?, + isScalarSemantics: Bool ) { instructions.append(.init( .quantify, - .init(quantify: .init(matchesNewlines: matchesNewlines, kind, minTrips, extraTrips)))) + .init(quantify: .init(matchesNewlines: matchesNewlines, kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics)))) } mutating func buildQuantify( model: _CharacterClassModel, _ kind: AST.Quantification.Kind, _ minTrips: Int, - _ extraTrips: Int? + _ extraTrips: Int?, + isScalarSemantics: Bool ) { instructions.append(.init( .quantify, - .init(quantify: .init(model: model,kind, minTrips, extraTrips)))) + .init(quantify: .init(model: model,kind, minTrips, extraTrips, isScalarSemantics: isScalarSemantics)))) } mutating func buildAccept() { diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift index a3d864165..5e446b472 100644 --- a/Sources/_StringProcessing/Engine/MEBuiltins.swift +++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift @@ -15,7 +15,7 @@ extension Processor { isStrictASCII: Bool, isScalarSemantics: Bool ) -> Bool { - guard let next = input._matchBuiltinCC( + guard let next = input.matchBuiltinCC( cc, at: currentPosition, isInverted: isInverted, @@ -30,6 +30,7 @@ extension Processor { } func isAtStartOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.lowerBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -40,6 +41,7 @@ extension Processor { } func isAtEndOfLine(_ payload: AssertionPayload) -> Bool { + // TODO: needs benchmark coverage if currentPosition == subjectBounds.upperBound { return true } switch payload.semanticLevel { case .graphemeCluster: @@ -50,6 +52,8 @@ extension Processor { } mutating func builtinAssert(by payload: AssertionPayload) throws -> Bool { + // TODO: needs benchmark coverage + // Future work: Optimize layout and dispatch switch payload.kind { case .startOfSubject: return currentPosition 
== subjectBounds.lowerBound @@ -59,10 +63,10 @@ extension Processor { switch payload.semanticLevel { case .graphemeCluster: return input.index(after: currentPosition) == subjectBounds.upperBound - && input[currentPosition].isNewline + && input[currentPosition].isNewline case .unicodeScalar: return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound - && input.unicodeScalars[currentPosition].isNewline + && input.unicodeScalars[currentPosition].isNewline } case .endOfSubject: return currentPosition == subjectBounds.upperBound @@ -115,12 +119,75 @@ extension Processor { } } -// MARK: Built-in character class matching +// MARK: Matching `.` +extension String { + // TODO: Should the below have a `limitedBy` parameter? + + func matchAnyNonNewline( + at currentPosition: String.Index, + isScalarSemantics: Bool + ) -> String.Index? { + guard currentPosition < endIndex else { + return nil + } + if case .definite(let result) = _quickMatchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics + ) { + assert(result == _thoroughMatchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics)) + return result + } + return _thoroughMatchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics) + } + + @inline(__always) + private func _quickMatchAnyNonNewline( + at currentPosition: String.Index, + isScalarSemantics: Bool + ) -> QuickResult { + assert(currentPosition < endIndex) + guard let (asciiValue, next, isCRLF) = _quickASCIICharacter( + at: currentPosition + ) else { + return .unknown + } + switch asciiValue { + case (._lineFeed)...(._carriageReturn): + return .definite(nil) + default: + assert(!isCRLF) + return .definite(next) + } + } + + @inline(never) + private func _thoroughMatchAnyNonNewline( + at currentPosition: String.Index, + isScalarSemantics: Bool + ) -> String.Index? 
{ + assert(currentPosition < endIndex) + if isScalarSemantics { + let scalar = unicodeScalars[currentPosition] + guard !scalar.isNewline else { return nil } + return unicodeScalars.index(after: currentPosition) + } + + let char = self[currentPosition] + guard !char.isNewline else { return nil } + return index(after: currentPosition) + } +} +// MARK: - Built-in character class matching extension String { + // TODO: Should the below have a `limitedBy` parameter? // Mentioned in ProgrammersManual.md, update docs if redesigned - func _matchBuiltinCC( + func matchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, @@ -155,7 +222,7 @@ extension String { // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(__always) - func _quickMatchBuiltinCC( + private func _quickMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, @@ -173,7 +240,7 @@ extension String { // Mentioned in ProgrammersManual.md, update docs if redesigned @inline(never) - func _thoroughMatchBuiltinCC( + private func _thoroughMatchBuiltinCC( _ cc: _CharacterClassModel.Representation, at currentPosition: String.Index, isInverted: Bool, diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift index fa68b8b76..1ff734ccd 100644 --- a/Sources/_StringProcessing/Engine/MEQuantify.swift +++ b/Sources/_StringProcessing/Engine/MEQuantify.swift @@ -1,26 +1,45 @@ extension Processor { func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? { - var next: Input.Index? 
+ let isScalarSemantics = payload.isScalarSemantics + switch payload.type { case .bitset: - next = _doMatchBitset(registers[payload.bitset]) + return input.matchBitset( + registers[payload.bitset], + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics) case .asciiChar: - next = _doMatchScalar( - UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true) + return input.matchScalar( + UnicodeScalar.init(_value: UInt32(payload.asciiChar)), + at: currentPosition, + limitedBy: end, + boundaryCheck: !isScalarSemantics, + isCaseInsensitive: false) case .builtin: + // FIXME: bounds check? endIndex or end? + // We only emit .quantify if it consumes a single character - next = input._matchBuiltinCC( + return input.matchBuiltinCC( payload.builtin, at: currentPosition, isInverted: payload.builtinIsInverted, isStrictASCII: payload.builtinIsStrict, - isScalarSemantics: false) + isScalarSemantics: isScalarSemantics) case .any: - let matched = currentPosition != input.endIndex - && (!input[currentPosition].isNewline || payload.anyMatchesNewline) - next = matched ? input.index(after: currentPosition) : nil + // FIXME: endIndex or end? 
+ guard currentPosition < input.endIndex else { return nil } + + if payload.anyMatchesNewline { + if isScalarSemantics { + return input.unicodeScalars.index(after: currentPosition) + } + return input.index(after: currentPosition) + } + + return input.matchAnyNonNewline( + at: currentPosition, isScalarSemantics: isScalarSemantics) } - return next } /// Generic quantify instruction interpreter @@ -29,8 +48,10 @@ extension Processor { mutating func runQuantify(_ payload: QuantifyPayload) -> Bool { var trips = 0 var extraTrips = payload.extraTrips - var savePoint = startQuantifierSavePoint() - + var savePoint = startQuantifierSavePoint( + isScalarSemantics: payload.isScalarSemantics + ) + while true { if trips >= payload.minTrips { if extraTrips == 0 { break } @@ -40,7 +61,14 @@ extension Processor { } } let next = _doQuantifyMatch(payload) - guard let idx = next else { break } + guard let idx = next else { + if !savePoint.rangeIsEmpty { + // The last save point has saved the current, non-matching position, + // so it's unneeded. 
+ savePoint.shrinkRange(input) + } + break + } currentPosition = idx trips += 1 } @@ -50,12 +78,8 @@ extension Processor { return false } - if payload.quantKind == .eager && !savePoint.rangeIsEmpty { - // The last save point has saved the current position, so it's unneeded - savePoint.shrinkRange(input) - if !savePoint.rangeIsEmpty { - savePoints.append(savePoint) - } + if !savePoint.rangeIsEmpty { + savePoints.append(savePoint) } return true } @@ -65,7 +89,9 @@ extension Processor { assert(payload.quantKind == .eager && payload.minTrips == 0 && payload.extraTrips == nil) - var savePoint = startQuantifierSavePoint() + var savePoint = startQuantifierSavePoint( + isScalarSemantics: payload.isScalarSemantics + ) while true { savePoint.updateRange(newEnd: currentPosition) @@ -87,7 +113,9 @@ extension Processor { assert(payload.quantKind == .eager && payload.minTrips == 1 && payload.extraTrips == nil) - var savePoint = startQuantifierSavePoint() + var savePoint = startQuantifierSavePoint( + isScalarSemantics: payload.isScalarSemantics + ) while true { let next = _doQuantifyMatch(payload) guard let idx = next else { break } diff --git a/Sources/_StringProcessing/Engine/Metrics.swift b/Sources/_StringProcessing/Engine/Metrics.swift index 753c3c3d1..372a7e1b4 100644 --- a/Sources/_StringProcessing/Engine/Metrics.swift +++ b/Sources/_StringProcessing/Engine/Metrics.swift @@ -1,13 +1,71 @@ extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED struct ProcessorMetrics { var instructionCounts: [Instruction.OpCode: Int] = [:] var backtracks: Int = 0 var resets: Int = 0 + var cycleCount: Int = 0 + + var isTracingEnabled: Bool = false + var shouldMeasureMetrics: Bool = false + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { + self.isTracingEnabled = isTracingEnabled + self.shouldMeasureMetrics = shouldMeasureMetrics + } } - +#else + struct ProcessorMetrics { + var isTracingEnabled: Bool { false } + var shouldMeasureMetrics: Bool { false } + var cycleCount: Int { 0 
} + + init(isTracingEnabled: Bool, shouldMeasureMetrics: Bool) { } + } +#endif +} + +extension Processor { + + mutating func startCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + if metrics.cycleCount == 0 { + trace() + measureMetrics() + } +#endif + } + + mutating func endCycleMetrics() { +#if PROCESSOR_MEASUREMENTS_ENABLED + metrics.cycleCount += 1 + trace() + measureMetrics() + _checkInvariants() +#endif + } +} + +extension Processor.ProcessorMetrics { + + mutating func addReset() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.resets += 1 +#endif + } + + mutating func addBacktrack() { +#if PROCESSOR_MEASUREMENTS_ENABLED + self.backtracks += 1 +#endif + } +} + +extension Processor { +#if PROCESSOR_MEASUREMENTS_ENABLED func printMetrics() { print("===") - print("Total cycle count: \(cycleCount)") + print("Total cycle count: \(metrics.cycleCount)") print("Backtracks: \(metrics.backtracks)") print("Resets: \(metrics.resets)") print("Instructions:") @@ -21,7 +79,7 @@ extension Processor { } mutating func measure() { - let (opcode, _) = fetch().destructure + let (opcode, _) = fetch() if metrics.instructionCounts.keys.contains(opcode) { metrics.instructionCounts[opcode]! += 1 } else { @@ -30,8 +88,9 @@ extension Processor { } mutating func measureMetrics() { - if shouldMeasureMetrics { + if metrics.shouldMeasureMetrics { measure() } } +#endif } diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index 25cad5c8c..0350a37db 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -35,7 +35,7 @@ struct Processor { /// of the search. `input` can be a "supersequence" of the subject, while /// `input[subjectBounds]` is the logical entity that is being searched. let input: Input - + /// The bounds of the logical subject in `input`. 
/// /// `subjectBounds` represents the bounds of the string or substring that a @@ -46,7 +46,7 @@ struct Processor { /// `subjectBounds` is always equal to or a subrange of /// `input.startIndex.. - + /// The bounds within the subject for an individual search. /// /// `searchBounds` is equal to `subjectBounds` in some cases, but can be a @@ -62,7 +62,7 @@ struct Processor { let instructions: InstructionList // MARK: Resettable state - + /// The current search position while processing. /// /// `currentPosition` must always be in the range `subjectBounds` or equal @@ -81,16 +81,12 @@ struct Processor { var wordIndexCache: Set? = nil var wordIndexMaxIndex: String.Index? = nil - + var state: State = .inProgress var failureReason: Error? = nil - // MARK: Metrics, debugging, etc. - var cycleCount = 0 - var isTracingEnabled: Bool - let shouldMeasureMetrics: Bool - var metrics: ProcessorMetrics = ProcessorMetrics() + var metrics: ProcessorMetrics } extension Processor { @@ -116,14 +112,17 @@ extension Processor { self.subjectBounds = subjectBounds self.searchBounds = searchBounds self.matchMode = matchMode - self.isTracingEnabled = isTracingEnabled - self.shouldMeasureMetrics = shouldMeasureMetrics + + self.metrics = ProcessorMetrics( + isTracingEnabled: isTracingEnabled, + shouldMeasureMetrics: shouldMeasureMetrics) + self.currentPosition = searchBounds.lowerBound // Initialize registers with end of search bounds self.registers = Registers(program, searchBounds.upperBound) self.storedCaptures = Array( - repeating: .init(), count: program.registerInfo.captures) + repeating: .init(), count: program.registerInfo.captures) _checkInvariants() } @@ -144,8 +143,8 @@ extension Processor { self.state = .inProgress self.failureReason = nil - - if shouldMeasureMetrics { metrics.resets += 1 } + + metrics.addReset() _checkInvariants() } @@ -160,23 +159,22 @@ extension Processor { } extension Processor { + func fetch() -> (Instruction.OpCode, Instruction.Payload) { + 
instructions[controller.pc].destructure + } + var slice: Input.SubSequence { // TODO: Should we whole-scale switch to slices, or // does that depend on options for some anchors? input[searchBounds] } - // Advance in our input, without any checks or failure signalling - mutating func _uncheckedForcedConsumeOne() { - assert(currentPosition != end) - input.formIndex(after: &currentPosition) - } - // Advance in our input // // Returns whether the advance succeeded. On failure, our // save point was restored mutating func consume(_ n: Distance) -> Bool { + // TODO: needs benchmark coverage guard let idx = input.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -186,9 +184,10 @@ extension Processor { currentPosition = idx return true } - + // Advances in unicode scalar view mutating func consumeScalar(_ n: Distance) -> Bool { + // TODO: needs benchmark coverage guard let idx = input.unicodeScalars.index( currentPosition, offsetBy: n.rawValue, limitedBy: end ) else { @@ -220,29 +219,29 @@ extension Processor { func load() -> Element? { currentPosition < end ? input[currentPosition] : nil } - func load(count: Int) -> Input.SubSequence? { - let slice = self.slice[currentPosition...].prefix(count) - guard slice.count == count else { return nil } - return slice - } + + // MARK: Match functions + // + // TODO: refactor these such that `cycle()` calls the corresponding String + // method directly, and all the step, signalFailure, and + // currentPosition logic is collected into a single place inside + // cycle(). // Match against the current input element. Returns whether // it succeeded vs signaling an error. 
- mutating func match(_ e: Element) -> Bool { - guard let cur = load(), cur == e else { - signalFailure() - return false - } - _uncheckedForcedConsumeOne() - return true - } - - mutating func matchCaseInsensitive(_ e: Element) -> Bool { - guard let cur = load(), cur.lowercased() == e.lowercased() else { + mutating func match( + _ e: Element, isCaseInsensitive: Bool + ) -> Bool { + guard let next = input.match( + e, + at: currentPosition, + limitedBy: end, + isCaseInsensitive: isCaseInsensitive + ) else { signalFailure() return false } - _uncheckedForcedConsumeOne() + currentPosition = next return true } @@ -250,81 +249,54 @@ extension Processor { // it succeeded vs signaling an error. mutating func matchSeq( _ seq: Substring, - isScalarMode: Bool + isScalarSemantics: Bool ) -> Bool { - if isScalarMode { - for s in seq.unicodeScalars { - guard matchScalar(s, boundaryCheck: false) else { return false } - } - return true - } - - for e in seq { - guard match(e) else { return false } - } - return true - } - - func loadScalar() -> Unicode.Scalar? { - currentPosition < end ? input.unicodeScalars[currentPosition] : nil - } - - func _doMatchScalar(_ s: Unicode.Scalar, _ boundaryCheck: Bool) -> Input.Index? 
{ - if s == loadScalar(), - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) { - return idx - } else { - return nil - } - } - - mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { - guard let next = _doMatchScalar(s, boundaryCheck) else { + guard let next = input.matchSeq( + seq, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics + ) else { signalFailure() return false } + currentPosition = next return true } - mutating func matchScalarCaseInsensitive( + mutating func matchScalar( _ s: Unicode.Scalar, - boundaryCheck: Bool + boundaryCheck: Bool, + isCaseInsensitive: Bool ) -> Bool { - guard let curScalar = loadScalar(), - s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, - let idx = input.unicodeScalars.index( - currentPosition, - offsetBy: 1, - limitedBy: end), - (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) - else { + guard let next = input.matchScalar( + s, + at: currentPosition, + limitedBy: end, + boundaryCheck: boundaryCheck, + isCaseInsensitive: isCaseInsensitive + ) else { signalFailure() return false } - currentPosition = idx + currentPosition = next return true } - func _doMatchBitset(_ bitset: DSLTree.CustomCharacterClass.AsciiBitset) -> Input.Index? 
{ - if let cur = load(), bitset.matches(char: cur) { - return input.index(after: currentPosition) - } else { - return nil - } - } - // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset mutating func matchBitset( - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + isScalarSemantics: Bool ) -> Bool { - guard let next = _doMatchBitset(bitset) else { + guard let next = input.matchBitset( + bitset, + at: currentPosition, + limitedBy: end, + isScalarSemantics: isScalarSemantics + ) else { signalFailure() return false } @@ -332,37 +304,18 @@ extension Processor { return true } - // Equivalent of matchBitset but emitted when in unicode scalar semantic mode - mutating func matchBitsetScalar( - _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + // Matches the next character/scalar if it is not a newline + mutating func matchAnyNonNewline( + isScalarSemantics: Bool ) -> Bool { - guard let curScalar = loadScalar(), - bitset.matches(scalar: curScalar), - let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { - signalFailure() - return false - } - currentPosition = idx - return true - } - - // Matches the next character if it is not a newline - mutating func matchAnyNonNewline() -> Bool { - guard let c = load(), !c.isNewline else { - signalFailure() - return false - } - _uncheckedForcedConsumeOne() - return true - } - - // Matches the next scalar if it is not a newline - mutating func matchAnyNonNewlineScalar() -> Bool { - guard let s = loadScalar(), !s.isNewline else { + guard let next = input.matchAnyNonNewline( + at: currentPosition, + isScalarSemantics: isScalarSemantics + ) else { signalFailure() return false } - input.unicodeScalars.formIndex(after: &currentPosition) + currentPosition = next return true } @@ -401,8 +354,8 @@ extension Processor { 
storedCaptures = capEnds registers.ints = intRegisters registers.positions = posRegisters - - if shouldMeasureMetrics { metrics.backtracks += 1 } + + metrics.addBacktrack() } mutating func abort(_ e: Error? = nil) { @@ -436,25 +389,15 @@ extension Processor { // TODO: What should we do here? fatalError("Invalid code: Tried to clear save points when empty") } - + mutating func cycle() { _checkInvariants() assert(state == .inProgress) -#if PROCESSOR_MEASUREMENTS_ENABLED - if cycleCount == 0 { - trace() - measureMetrics() - } - defer { - cycleCount += 1 - trace() - measureMetrics() - _checkInvariants() - } -#endif + startCycleMetrics() + defer { endCycleMetrics() } - let (opcode, payload) = fetch().destructure + let (opcode, payload) = fetch() switch opcode { case .invalid: fatalError("Invalid program") @@ -535,50 +478,30 @@ extension Processor { } } case .matchAnyNonNewline: - if payload.isScalar { - if matchAnyNonNewlineScalar() { - controller.step() - } - } else { - if matchAnyNonNewline() { - controller.step() - } + if matchAnyNonNewline(isScalarSemantics: payload.isScalar) { + controller.step() } case .match: let (isCaseInsensitive, reg) = payload.elementPayload - if isCaseInsensitive { - if matchCaseInsensitive(registers[reg]) { - controller.step() - } - } else { - if match(registers[reg]) { - controller.step() - } + if match(registers[reg], isCaseInsensitive: isCaseInsensitive) { + controller.step() } case .matchScalar: let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload - if caseInsensitive { - if matchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) { - controller.step() - } - } else { - if matchScalar(scalar, boundaryCheck: boundaryCheck) { - controller.step() - } + if matchScalar( + scalar, + boundaryCheck: boundaryCheck, + isCaseInsensitive: caseInsensitive + ) { + controller.step() } case .matchBitset: let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] - if isScalar { - if matchBitsetScalar(bitset) { - 
controller.step() - } - } else { - if matchBitset(bitset) { - controller.step() - } + if matchBitset(bitset, isScalarSemantics: isScalar) { + controller.step() } case .matchBuiltin: @@ -669,7 +592,7 @@ extension Processor { signalFailure() return } - if matchSeq(input[range], isScalarMode: isScalarMode) { + if matchSeq(input[range], isScalarSemantics: isScalarMode) { controller.step() } @@ -714,3 +637,122 @@ extension Processor { } } } + +// MARK: String matchers +// +// TODO: Refactor into separate file, formalize patterns + +extension String { + + func match( + _ char: Character, + at pos: Index, + limitedBy end: String.Index, + isCaseInsensitive: Bool + ) -> Index? { + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + guard pos < end else { return nil } + + if isCaseInsensitive { + guard self[pos].lowercased() == char.lowercased() else { return nil } + } else { + guard self[pos] == char else { return nil } + } + + let idx = index(after: pos) + guard idx <= end else { return nil } + + return idx + } + + func matchSeq( + _ seq: Substring, + at pos: Index, + limitedBy end: Index, + isScalarSemantics: Bool + ) -> Index? { + // TODO: This can be greatly sped up with string internals + // TODO: This is also very much quick-check-able + assert(end <= endIndex) + + var cur = pos + + if isScalarSemantics { + for e in seq.unicodeScalars { + guard cur < end, unicodeScalars[cur] == e else { return nil } + self.unicodeScalars.formIndex(after: &cur) + } + } else { + for e in seq { + guard cur < end, self[cur] == e else { return nil } + self.formIndex(after: &cur) + } + } + + guard cur <= end else { return nil } + return cur + } + + func matchScalar( + _ scalar: Unicode.Scalar, + at pos: Index, + limitedBy end: String.Index, + boundaryCheck: Bool, + isCaseInsensitive: Bool + ) -> Index? 
{ + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals + assert(end <= endIndex) + + guard pos < end else { return nil } + let curScalar = unicodeScalars[pos] + + if isCaseInsensitive { + guard curScalar.properties.lowercaseMapping == scalar.properties.lowercaseMapping + else { + return nil + } + } else { + guard curScalar == scalar else { return nil } + } + + let idx = unicodeScalars.index(after: pos) + guard idx <= end else { return nil } + + if boundaryCheck && !isOnGraphemeClusterBoundary(idx) { + return nil + } + + return idx + } + + func matchBitset( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset, + at pos: Index, + limitedBy end: Index, + isScalarSemantics: Bool + ) -> Index? { + // TODO: extremely quick-check-able + // TODO: can be sped up with string internals + assert(end <= endIndex) + + guard pos < end else { return nil } + + let idx: String.Index + if isScalarSemantics { + guard bitset.matches(unicodeScalars[pos]) else { return nil } + idx = unicodeScalars.index(after: pos) + } else { + guard bitset.matches(self[pos]) else { return nil } + idx = index(after: pos) + } + + guard idx <= end else { return nil } + return idx + } + + +} diff --git a/Sources/_StringProcessing/Engine/Tracing.swift b/Sources/_StringProcessing/Engine/Tracing.swift index 725319b00..b0ce67555 100644 --- a/Sources/_StringProcessing/Engine/Tracing.swift +++ b/Sources/_StringProcessing/Engine/Tracing.swift @@ -9,7 +9,12 @@ // //===----------------------------------------------------------------------===// + +// TODO: Remove this protocol (and/or reuse it for something like a FastProcessor) extension Processor: TracedProcessor { + var cycleCount: Int { metrics.cycleCount } + var isTracingEnabled: Bool { metrics.isTracingEnabled } + var isFailState: Bool { state == .fail } var isAcceptState: Bool { state == .accept } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 253858d1f..0453fcd80 100644 
--- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -31,7 +31,7 @@ struct Executor { subjectBounds: subjectBounds, searchBounds: searchBounds) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif var low = searchBounds.lowerBound let high = searchBounds.upperBound @@ -60,7 +60,7 @@ struct Executor { var cpu = engine.makeProcessor( input: input, bounds: subjectBounds, matchMode: mode) #if PROCESSOR_MEASUREMENTS_ENABLED - defer { if cpu.shouldMeasureMetrics { cpu.printMetrics() } } + defer { if cpu.metrics.shouldMeasureMetrics { cpu.printMetrics() } } #endif return try _match(input, from: subjectBounds.lowerBound, using: &cpu) } diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index 243c1ba01..57bf06dae 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -283,7 +283,7 @@ extension Regex where Output == AnyRegexOutput { /// /// - Parameter regex: A regular expression to convert to use a dynamic /// capture list. - public init(_ regex: Regex) { + public init(_ regex: Regex) { self.init(node: regex.root) } } @@ -299,7 +299,7 @@ extension Regex.Match where Output == AnyRegexOutput { /// /// - Parameter match: A regular expression match to convert to a match with /// type-erased captures. 
- public init(_ match: Regex.Match) { + public init(_ match: Regex.Match) { self.init( anyRegexOutput: match.anyRegexOutput, range: match.range diff --git a/Sources/_StringProcessing/Unicode/ASCII.swift b/Sources/_StringProcessing/Unicode/ASCII.swift index 5150e18cc..de13e340a 100644 --- a/Sources/_StringProcessing/Unicode/ASCII.swift +++ b/Sources/_StringProcessing/Unicode/ASCII.swift @@ -9,26 +9,25 @@ // //===----------------------------------------------------------------------===// -private var _lineFeed: UInt8 { 0x0A } -private var _carriageReturn: UInt8 { 0x0D } -private var _lineTab: UInt8 { 0x0B } -private var _formFeed: UInt8 { 0x0C } -private var _space: UInt8 { 0x20 } -private var _tab: UInt8 { 0x09 } +extension UInt8 { + static var _lineFeed: UInt8 { 0x0A } + static var _carriageReturn: UInt8 { 0x0D } + static var _lineTab: UInt8 { 0x0B } + static var _formFeed: UInt8 { 0x0C } + static var _space: UInt8 { 0x20 } + static var _tab: UInt8 { 0x09 } + + static var _underscore: UInt8 { 0x5F } +} private var _0: UInt8 { 0x30 } private var _9: UInt8 { 0x39 } -private func _isASCIINumber(_ x: UInt8) -> Bool { - return (_0..._9).contains(x) -} private var _a: UInt8 { 0x61 } private var _z: UInt8 { 0x7A } private var _A: UInt8 { 0x41 } private var _Z: UInt8 { 0x5A } -private var _underscore: UInt8 { 0x5F } - extension UInt8 { var _isASCII: Bool { self < 0x80 } @@ -43,14 +42,14 @@ extension UInt8 { /// Assuming we're ASCII, whether we match `\h` var _asciiIsHorizontalWhitespace: Bool { assert(_isASCII) - return self == _space || self == _tab + return self == ._space || self == ._tab } /// Assuming we're ASCII, whether we match `\v` var _asciiIsVerticalWhitespace: Bool { assert(_isASCII) switch self { - case _lineFeed, _carriageReturn, _lineTab, _formFeed: + case ._lineFeed, ._carriageReturn, ._lineTab, ._formFeed: return true default: return false @@ -61,7 +60,7 @@ extension UInt8 { var _asciiIsWhitespace: Bool { assert(_isASCII) switch self { - case _space, _tab, 
_lineFeed, _lineTab, _formFeed, _carriageReturn: + case ._space, ._tab, ._lineFeed, ._lineTab, ._formFeed, ._carriageReturn: return true default: return false @@ -77,11 +76,13 @@ extension UInt8 { /// Assuming we're ASCII, whether we match `\w` var _asciiIsWord: Bool { assert(_isASCII) - return _asciiIsDigit || _asciiIsLetter || self == _underscore + return _asciiIsDigit || _asciiIsLetter || self == ._underscore } } extension String { + /// TODO: better to take isScalarSemantics parameter, we can return more results + /// and we can give the right `next` index, not requiring the caller to re-adjust it /// TODO: detailed description of nuanced semantics func _quickASCIICharacter( at idx: Index @@ -107,7 +108,7 @@ extension String { guard tail._isSub300StartingByte else { return nil } // Handle CR-LF: - if base == _carriageReturn && tail == _lineFeed { + if base == ._carriageReturn && tail == ._lineFeed { utf8.formIndex(after: &next) guard next == endIndex || utf8[next]._isSub300StartingByte else { return nil @@ -165,5 +166,6 @@ extension String { return (next, asciiValue._asciiIsWord) } } + } diff --git a/Sources/_StringProcessing/Unicode/WordBreaking.swift b/Sources/_StringProcessing/Unicode/WordBreaking.swift index 50da079f6..f1f9573c1 100644 --- a/Sources/_StringProcessing/Unicode/WordBreaking.swift +++ b/Sources/_StringProcessing/Unicode/WordBreaking.swift @@ -12,6 +12,7 @@ @_spi(_Unicode) import Swift +// TODO: Sink onto String extension Processor { func atSimpleBoundary( _ usesAsciiWord: Bool, @@ -20,9 +21,11 @@ extension Processor { func matchesWord(at i: Input.Index) -> Bool { switch semanticLevel { case .graphemeCluster: + // TODO: needs benchmark coverage let c = input[i] return c.isWordCharacter && (c.isASCII || !usesAsciiWord) case .unicodeScalar: + // TODO: needs benchmark coverage let c = input.unicodeScalars[i] return (c.properties.isAlphabetic || c == "_") && (c.isASCII || !usesAsciiWord) } @@ -51,6 +54,7 @@ extension String { using cache: inout 
Set?, _ maxIndex: inout String.Index? ) -> Bool { + // TODO: needs benchmark coverage guard i != startIndex, i != endIndex else { return true } diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift index ad3159820..e063447a0 100644 --- a/Sources/_StringProcessing/Utility/AsciiBitset.swift +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -57,7 +57,7 @@ extension DSLTree.CustomCharacterClass { } } - internal func matches(char: Character) -> Bool { + internal func matches(_ char: Character) -> Bool { let matched: Bool if let val = char._singleScalarAsciiValue { matched = matches(val) @@ -71,7 +71,7 @@ extension DSLTree.CustomCharacterClass { return matched } - internal func matches(scalar: Unicode.Scalar) -> Bool { + internal func matches(_ scalar: Unicode.Scalar) -> Bool { let matched: Bool if scalar.isASCII { let val = UInt8(ascii: scalar) diff --git a/Sources/_StringProcessing/Utility/Protocols.swift b/Sources/_StringProcessing/Utility/Protocols.swift index 7542a17dd..24ffbcf70 100644 --- a/Sources/_StringProcessing/Utility/Protocols.swift +++ b/Sources/_StringProcessing/Utility/Protocols.swift @@ -44,13 +44,3 @@ protocol ProcessorProtocol { var registers: Registers { get } } -extension ProcessorProtocol { - func fetch() -> Instruction { - instructions[currentPC] - } - - var callStack: Array { [] } -// var savePoints: Array { [] } - var registers: Array { [] } - -} diff --git a/Sources/_StringProcessing/Utility/Traced.swift b/Sources/_StringProcessing/Utility/Traced.swift index 112a601b1..198564fe1 100644 --- a/Sources/_StringProcessing/Utility/Traced.swift +++ b/Sources/_StringProcessing/Utility/Traced.swift @@ -13,7 +13,7 @@ // TODO: Place shared formatting and trace infrastructure here protocol Traced { - var isTracingEnabled: Bool { get set } + var isTracingEnabled: Bool { get } } protocol TracedProcessor: ProcessorProtocol, Traced { diff --git 
a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index cdee66ddb..c053b31e4 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -79,7 +79,7 @@ struct _CharacterClassModel: Hashable { let isScalarSemantics = matchLevel == .unicodeScalar - return input._matchBuiltinCC( + return input.matchBuiltinCC( cc, at: currentPosition, isInverted: isInverted, diff --git a/Tests/RegexBuilderTests/AlgorithmsTests.swift b/Tests/RegexBuilderTests/AlgorithmsTests.swift index 115941070..b85395d19 100644 --- a/Tests/RegexBuilderTests/AlgorithmsTests.swift +++ b/Tests/RegexBuilderTests/AlgorithmsTests.swift @@ -16,7 +16,7 @@ import RegexBuilder @available(SwiftStdlib 5.7, *) class RegexConsumerTests: XCTestCase { func testMatches() { - let regex = Capture(OneOrMore(.digit)) { 2 * Int($0)! } + let regex = Capture<(Substring, Int)>(OneOrMore(.digit)) { 2 * Int($0)! } let str = "foo 160 bar 99 baz" XCTAssertEqual(str.matches(of: regex).map(\.output.1), [320, 198]) } diff --git a/Tests/RegexBuilderTests/CustomTests.swift b/Tests/RegexBuilderTests/CustomTests.swift index 26746d613..848ef4626 100644 --- a/Tests/RegexBuilderTests/CustomTests.swift +++ b/Tests/RegexBuilderTests/CustomTests.swift @@ -64,7 +64,7 @@ private struct IntParser: CustomConsumingRegexComponent { guard index != bounds.upperBound else { return nil } let r = Regex { - Capture(OneOrMore(.digit)) { Int($0) } + Capture<(Substring, Int?)>(OneOrMore(.digit)) { Int($0) } } guard let match = input[index..(Repeat(.digit, count: 2)) { Int($0) } } } diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index 5e85ad26c..19ac675dc 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1091,7 +1091,7 @@ class RegexDSLTests: XCTestCase { OneOrMore("a") Capture { TryCapture("b", transform: { Int($0) 
}) - ZeroOrMore( + ZeroOrMore<(Substring, Double?)>( TryCapture("c", transform: { Double($0) }) ) Optionally("e") @@ -1542,12 +1542,12 @@ class RegexDSLTests: XCTestCase { in bounds: Range ) throws -> (upperBound: String.Index, output: SemanticVersion)? { let regex = Regex { - TryCapture(OneOrMore(.digit)) { Int($0) } + TryCapture<(Substring, Int)>(OneOrMore(.digit)) { Int($0) } "." - TryCapture(OneOrMore(.digit)) { Int($0) } + TryCapture<(Substring, Int)>(OneOrMore(.digit)) { Int($0) } Optionally { "." - TryCapture(OneOrMore(.digit)) { Int($0) } + TryCapture<(Substring, Int)>(OneOrMore(.digit)) { Int($0) } } Optionally { "-" @@ -1876,7 +1876,7 @@ extension RegexDSLTests { ":" regexWithTooManyCaptures ":" - TryCapture(OneOrMore(.word)) { Int($0) } + TryCapture<(Substring, Int)>(OneOrMore(.word)) { Int($0) } #/:(\d+):/# } XCTAssert(type(of: dslWithTooManyCaptures).self diff --git a/Tests/RegexTests/AlgorithmsTests.swift b/Tests/RegexTests/AlgorithmsTests.swift index d5419fe9c..60548a0a2 100644 --- a/Tests/RegexTests/AlgorithmsTests.swift +++ b/Tests/RegexTests/AlgorithmsTests.swift @@ -165,8 +165,12 @@ class AlgorithmTests: XCTestCase { let actualCol: [Range] = input.ranges(of: pattern)[...].map(input.offsets(of:)) XCTAssertEqual(actualCol, expected, file: file, line: line) - let firstRange = input.firstRange(of: pattern).map(input.offsets(of:)) - XCTAssertEqual(firstRange, expected.first, file: file, line: line) + let firstRange = input.firstRange(of: pattern) + XCTAssertEqual(firstRange.map(input.offsets(of:)), expected.first, file: file, line: line) + if let upperBound = firstRange?.upperBound, !pattern.isEmpty { + let secondRange = input[upperBound...].firstRange(of: pattern).map(input.offsets(of:)) + XCTAssertEqual(secondRange, expected.dropFirst().first, file: file, line: line) + } } expectRanges("", "", [0..<0]) @@ -176,6 +180,9 @@ class AlgorithmTests: XCTestCase { expectRanges("abcde", "bcd", [1..<4]) expectRanges("ababacabababa", "abababa", [6..<13]) 
expectRanges("ababacabababa", "aba", [0..<3, 6..<9, 10..<13])
+
+    // Test for rdar://92794248
+    expectRanges("ADACBADADACBADACB", "ADACB", [0..<5, 7..<12, 12..<17])
   }
 
   // rdar://105154010
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index a1bf0e76f..3fc547e34 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -620,6 +620,50 @@ extension RegexTests {
     // TODO: After captures, easier to test these
   }
 
+  func testQuantificationScalarSemantics() {
+    // TODO: We want more thorough testing here, including "a{n,m}", "a?", etc.
+    // Most patterns run twice: at the helper's default semantic level, then with .unicodeScalar. Every input contains U+0301.
+    firstMatchTest("a*", input: "aaa\u{301}", match: "aa")
+    firstMatchTest("a*", input: "aaa\u{301}", match: "aaa", semanticLevel: .unicodeScalar)
+    firstMatchTest("a+", input: "aaa\u{301}", match: "aa")
+    firstMatchTest("a+", input: "aaa\u{301}", match: "aaa", semanticLevel: .unicodeScalar)
+    firstMatchTest("a?", input: "a\u{301}", match: "")
+    firstMatchTest("a?", input: "a\u{301}", match: "a", semanticLevel: .unicodeScalar)
+    // Custom character class: expected matches mirror the literal "a" cases above.
+    firstMatchTest("[ab]*", input: "abab\u{301}", match: "aba")
+    firstMatchTest("[ab]*", input: "abab\u{301}", match: "abab", semanticLevel: .unicodeScalar)
+    firstMatchTest("[ab]+", input: "abab\u{301}", match: "aba")
+    firstMatchTest("[ab]+", input: "abab\u{301}", match: "abab", semanticLevel: .unicodeScalar)
+    firstMatchTest("[ab]?", input: "b\u{301}", match: "")
+    firstMatchTest("[ab]?", input: "b\u{301}", match: "b", semanticLevel: .unicodeScalar)
+    // \s: the default-level match includes the U+0301 after the space; the scalar-level match is the space alone.
+    firstMatchTest(#"\s*"#, input: " \u{301}", match: " \u{301}")
+    firstMatchTest(#"\s*"#, input: " \u{301}", match: " ", semanticLevel: .unicodeScalar)
+    firstMatchTest(#"\s+"#, input: " \u{301}", match: " \u{301}")
+    firstMatchTest(#"\s+"#, input: " \u{301}", match: " ", semanticLevel: .unicodeScalar)
+    firstMatchTest(#"\s?"#, input: " \u{301}", match: " \u{301}")
+    firstMatchTest(#"\s?"#, input: " \u{301}", match: " ", semanticLevel: .unicodeScalar)
+    // "." quantifiers followed by a literal "a"; at the default level the "a" before U+0301 does not end the match.
+    firstMatchTest(#".*?a"#, input: "xxa\u{301}xaZ", match: "xxa\u{301}xa")
+    firstMatchTest(#".*?a"#, input: "xxa\u{301}xaZ", match: "xxa", semanticLevel: .unicodeScalar)
+    firstMatchTest(#".+?a"#, input: "xxa\u{301}xaZ", match: "xxa\u{301}xa")
+    firstMatchTest(#".+?a"#, input: "xxa\u{301}xaZ", match: "xxa", semanticLevel: .unicodeScalar)
+    firstMatchTest(#".?a"#, input: "e\u{301}aZ", match: "e\u{301}a")
+    firstMatchTest(#".?a"#, input: "e\u{301}aZ", match: "\u{301}a", semanticLevel: .unicodeScalar)
+    // Patterns that spell out U+0301 themselves.
+    firstMatchTest(#".+\u{301}"#, input: "aa\u{301}Z", match: nil)
+    firstMatchTest(#".+\u{301}"#, input: "aa\u{301}Z", match: "aa\u{301}", semanticLevel: .unicodeScalar)
+    firstMatchTest(#".*\u{301}"#, input: "\u{301}Z", match: "\u{301}")
+    firstMatchTest(#".*\u{301}"#, input: "\u{301}Z", match: "\u{301}", semanticLevel: .unicodeScalar)
+    // Optional "." around an explicit U+0301; the input also carries U+0302.
+    firstMatchTest(#".?\u{301}"#, input: "aa\u{302}\u{301}Z", match: nil)
+    firstMatchTest(#".?\u{301}.?Z"#, input: "aa\u{302}\u{301}Z", match: "\u{302}\u{301}Z", semanticLevel: .unicodeScalar)
+    firstMatchTest(#".?.?\u{301}.?Z"#, input: "aa\u{302}\u{301}Z", match: "a\u{302}\u{301}Z", semanticLevel: .unicodeScalar)
+
+
+    // TODO: other test cases?
+  }
+
   func testMatchCharacterClasses() {
     // Must have new stdlib for character class ranges and word boundaries.
guard ensureNewStdlib() else { return }
@@ -1891,6 +1935,11 @@ extension RegexTests {
   func testSingleLineMode() {
     firstMatchTest(#".+"#, input: "a\nb", match: "a")
     firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb")
+
+    // We recognize LF, line tab, FF, and CR as newlines by default, so "." skips U+000A...U+000D and the first match starts at "b"
+    firstMatchTest(#"."#, input: "\u{A}\u{B}\u{C}\u{D}\nb", match: "b")
+    firstMatchTest(#".+"#, input: "\u{A}\u{B}\u{C}\u{D}\nbb", match: "bb")
+
+
   }
 
   func testMatchNewlines() {
@@ -2574,4 +2623,35 @@ extension RegexTests {
   func testFuzzerArtifacts() throws {
     expectCompletion(regex: #"(b?)\1*"#, in: "a")
   }
+
+  func testIssue640() throws {
+    // Original report from https://github.com/apple/swift-experimental-string-processing/issues/640
+    let original = try Regex("[1-9][0-9]{0,2}(?:,?[0-9]{3})*")
+    XCTAssertNotNil("36,769".wholeMatch(of: original))
+    XCTAssertNotNil("36769".wholeMatch(of: original))
+
+    // Simplified case: a bounded optional run of "a" followed by one required "a"
+    let simplified = try Regex("a{0,2}a")
+    XCTAssertNotNil("aaa".wholeMatch(of: simplified))
+    // Sweep the upper bound: the eager and reluctant forms must whole-match every run of 1...(max + 1) "a"s.
+    for max in 1...8 {
+      let patternEager = "a{0,\(max)}a"
+      let regexEager = try Regex(patternEager)
+      let patternReluctant = "a{0,\(max)}?a"
+      let regexReluctant = try Regex(patternReluctant)
+      for length in 1...(max + 1) {
+        let str = String(repeating: "a", count: length)
+        if str.wholeMatch(of: regexEager) == nil {
+          XCTFail("Didn't match '\(patternEager)' in '\(str)' (\(max),\(length)).")
+        }
+        if str.wholeMatch(of: regexReluctant) == nil {
+          XCTFail("Didn't match '\(patternReluctant)' in '\(str)' (\(max),\(length)).")
+        }
+      }
+      // The possessive form is checked only against the longest input (max + 1 "a"s).
+      let possessiveRegex = try Regex("a{0,\(max)}+a")
+      let str = String(repeating: "a", count: max + 1)
+      XCTAssertNotNil(str.wholeMatch(of: possessiveRegex))
+    }
+  }
 }