Review notes on this revision of the patch

* The flattened patch text is restored to one diff line per source line.
  Indentation and the position of blank context lines were reconstructed
  from the hunk line counts; confirm against the tree before applying.
* BenchmarkRunner.register(nameBase:input:pattern:_:_:alsoRunScalarSemantic:):
  the "_Scalar" variant now registers
  swiftRegex.matchingSemantics(.unicodeScalar), in line with both
  registerCrossBenchmark overloads. Previously it re-registered the
  unmodified regex, so the "_Scalar" entry duplicated the base benchmark.
* Same function: the parameter type is spelled Regex<AnyRegexOutput>, which
  is what `try! Regex(pattern)` produces at every call site in this patch.
* Hunk line counts are unchanged. The `index` lines are carried over as-is,
  so the post-image hash recorded for BenchmarkRunner.swift does not
  reflect the corrected content.

diff --git a/Sources/RegexBenchmark/Benchmark.swift b/Sources/RegexBenchmark/Benchmark.swift
index 2622639fb..f807a8e55 100644
--- a/Sources/RegexBenchmark/Benchmark.swift
+++ b/Sources/RegexBenchmark/Benchmark.swift
@@ -71,6 +71,13 @@ struct NSBenchmark: RegexBenchmark {
   enum NSMatchType {
     case allMatches
     case first
+
+    init(_ type: Benchmark.MatchType) {
+      switch type {
+      case .whole, .first: self = .first
+      case .allMatches: self = .allMatches
+      }
+    }
   }
 
   func run() {
@@ -126,7 +133,7 @@ struct CrossBenchmark {
   /// The base name of the benchmark
   var baseName: String
 
-  /// The string to compile in differnet engines
+  /// The string to compile in different engines
   var regex: String
 
   /// The text to search
@@ -143,57 +150,32 @@ struct CrossBenchmark {
   /// Whether or not to do firstMatch as well or just allMatches
   var includeFirst: Bool = false
 
+  /// Whether to also run scalar-semantic mode
+  var alsoRunScalarSemantic: Bool = true
+
   func register(_ runner: inout BenchmarkRunner) {
-    let swiftRegex = try! Regex(regex)
-    let nsRegex: NSRegularExpression
     if isWhole {
-      nsRegex = try! NSRegularExpression(pattern: "^" + regex + "$")
+      runner.registerCrossBenchmark(
+        nameBase: baseName,
+        input: input,
+        pattern: regex,
+        .whole,
+        alsoRunScalarSemantic: alsoRunScalarSemantic)
     } else {
-      nsRegex = try! NSRegularExpression(pattern: regex)
-    }
+      runner.registerCrossBenchmark(
+        nameBase: baseName,
+        input: input,
+        pattern: regex,
+        .allMatches,
+        alsoRunScalarSemantic: alsoRunScalarSemantic)
 
-    if isWhole {
-      runner.register(
-        Benchmark(
-          name: baseName + "Whole",
-          regex: swiftRegex,
-          pattern: regex,
-          type: .whole,
-          target: input))
-      runner.register(
-        NSBenchmark(
-          name: baseName + "Whole" + CrossBenchmark.nsSuffix,
-          regex: nsRegex,
-          type: .first,
-          target: input))
-    } else {
-      runner.register(
-        Benchmark(
-          name: baseName + "All",
-          regex: swiftRegex,
-          pattern: regex,
-          type: .allMatches,
-          target: input))
-      runner.register(
-        NSBenchmark(
-          name: baseName + "All" + CrossBenchmark.nsSuffix,
-          regex: nsRegex,
-          type: .allMatches,
-          target: input))
       if includeFirst || runner.includeFirstOverride {
-        runner.register(
-          Benchmark(
-            name: baseName + "First",
-            regex: swiftRegex,
-            pattern: regex,
-            type: .first,
-            target: input))
-        runner.register(
-          NSBenchmark(
-            name: baseName + "First" + CrossBenchmark.nsSuffix,
-            regex: nsRegex,
-            type: .first,
-            target: input))
+        runner.registerCrossBenchmark(
+          nameBase: baseName,
+          input: input,
+          pattern: regex,
+          .first,
+          alsoRunScalarSemantic: alsoRunScalarSemantic)
       }
     }
   }
@@ -209,20 +191,16 @@ struct CrossInputListBenchmark {
 
   /// The list of strings to search
   var inputs: [String]
+
+  /// Also run in scalar-semantic mode
+  var alsoRunScalarSemantic: Bool = true
 
   func register(_ runner: inout BenchmarkRunner) {
-    let swiftRegex = try! Regex(regex)
-    runner.register(InputListBenchmark(
+    runner.registerCrossBenchmark(
       name: baseName,
-      regex: swiftRegex,
+      inputList: inputs,
       pattern: regex,
-      targets: inputs
-    ))
-    runner.register(InputListNSBenchmark(
-      name: baseName + CrossBenchmark.nsSuffix,
-      regex: regex,
-      targets: inputs
-    ))
+      alsoRunScalarSemantic: alsoRunScalarSemantic)
   }
 }
 
diff --git a/Sources/RegexBenchmark/BenchmarkRunner.swift b/Sources/RegexBenchmark/BenchmarkRunner.swift
index 641c03224..b067b9679 100644
--- a/Sources/RegexBenchmark/BenchmarkRunner.swift
+++ b/Sources/RegexBenchmark/BenchmarkRunner.swift
@@ -4,6 +4,16 @@ import Foundation
 /// The number of times to re-run the benchmark if results are too varying
 private var rerunCount: Int { 3 }
 
+extension Benchmark.MatchType {
+  fileprivate var nameSuffix: String {
+    switch self {
+    case .whole: return "_Whole"
+    case .first: return "_First"
+    case .allMatches: return "_All"
+    }
+  }
+}
+
 struct BenchmarkRunner {
   let suiteName: String
   var suite: [any RegexBenchmark] = []
@@ -16,12 +26,141 @@ struct BenchmarkRunner {
 
   // Forcibly include firstMatch benchmarks for all CrossBenchmarks
   let includeFirstOverride: Bool
+
+  // Register a cross-benchmark
+  mutating func registerCrossBenchmark(
+    nameBase: String,
+    input: String,
+    pattern: String,
+    _ type: Benchmark.MatchType,
+    alsoRunScalarSemantic: Bool = true
+  ) {
+    let swiftRegex = try! Regex(pattern)
+    let nsRegex: NSRegularExpression
+    if type == .whole {
+      nsRegex = try! NSRegularExpression(pattern: "^" + pattern + "$")
+    } else {
+      nsRegex = try! NSRegularExpression(pattern: pattern)
+    }
+    let nameSuffix = type.nameSuffix
+
+    register(
+      Benchmark(
+        name: nameBase + nameSuffix,
+        regex: swiftRegex,
+        pattern: pattern,
+        type: type,
+        target: input))
+    register(
+      NSBenchmark(
+        name: nameBase + nameSuffix + CrossBenchmark.nsSuffix,
+        regex: nsRegex,
+        type: .init(type),
+        target: input))
+
+    if alsoRunScalarSemantic {
+      register(
+        Benchmark(
+          name: nameBase + nameSuffix + "_Scalar",
+          regex: swiftRegex.matchingSemantics(.unicodeScalar),
+          pattern: pattern,
+          type: type,
+          target: input))
+      register(
+        NSBenchmark(
+          name: nameBase + nameSuffix + "_Scalar" + CrossBenchmark.nsSuffix,
+          regex: nsRegex,
+          type: .init(type),
+          target: input))
+    }
+  }
+
+  // Register a cross-benchmark list
+  mutating func registerCrossBenchmark(
+    name: String,
+    inputList: [String],
+    pattern: String,
+    alsoRunScalarSemantic: Bool = true
+  ) {
+    let swiftRegex = try! Regex(pattern)
+    register(InputListBenchmark(
+      name: name,
+      regex: swiftRegex,
+      pattern: pattern,
+      targets: inputList
+    ))
+    register(InputListNSBenchmark(
+      name: name + CrossBenchmark.nsSuffix,
+      regex: pattern,
+      targets: inputList
+    ))
+
+    if alsoRunScalarSemantic {
+      register(InputListBenchmark(
+        name: name + "_Scalar",
+        regex: swiftRegex.matchingSemantics(.unicodeScalar),
+        pattern: pattern,
+        targets: inputList
+      ))
+      register(InputListNSBenchmark(
+        name: name + "_Scalar" + CrossBenchmark.nsSuffix,
+        regex: pattern,
+        targets: inputList
+      ))
+    }
+
+  }
+
+  // Register a swift-only benchmark
+  mutating func register(
+    nameBase: String,
+    input: String,
+    pattern: String,
+    _ swiftRegex: Regex<AnyRegexOutput>,
+    _ type: Benchmark.MatchType,
+    alsoRunScalarSemantic: Bool = true
+  ) {
+    let nameSuffix = type.nameSuffix
+
+    register(
+      Benchmark(
+        name: nameBase + nameSuffix,
+        regex: swiftRegex,
+        pattern: pattern,
+        type: type,
+        target: input))
+
+    if alsoRunScalarSemantic {
+      register(
+        Benchmark(
+          name: nameBase + nameSuffix + "_Scalar",
+          regex: swiftRegex.matchingSemantics(.unicodeScalar),
+          pattern: pattern,
+          type: type,
+          target: input))
+    }
+  }
 
-  mutating func register(_ benchmark: some RegexBenchmark) {
+  private mutating func register(_ benchmark: NSBenchmark) {
     suite.append(benchmark)
   }
 
-  mutating func register(_ benchmark: some SwiftRegexBenchmark) {
+  private mutating func register(_ benchmark: Benchmark) {
+    var benchmark = benchmark
+    if enableTracing {
+      benchmark.enableTracing()
+    }
+    if enableMetrics {
+      benchmark.enableMetrics()
+    }
+    suite.append(benchmark)
+  }
+
+  private mutating func register(_ benchmark: InputListNSBenchmark) {
+    suite.append(benchmark)
+  }
+
+  private mutating func register(_ benchmark: InputListBenchmark) {
     var benchmark = benchmark
     if enableTracing {
       benchmark.enableTracing()
diff --git a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift
index 61d7b197f..27b2b07b4 100644
--- a/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift
+++ b/Sources/RegexBenchmark/Suite/CustomCharacterClasses.swift
@@ -12,53 +12,55 @@ extension BenchmarkRunner {
 
     let input = Inputs.graphemeBreakData
 
-    register(Benchmark(
-      name: "BasicCCC",
-      regex: try! Regex(basic),
+    // TODO: Which of these can be cross-benchmarks?
+
+    register(
+      nameBase: "BasicCCC",
+      input: input,
       pattern: basic,
-      type: .allMatches,
-      target: input))
+      try! Regex(basic),
+      .allMatches)
 
-    register(Benchmark(
-      name: "BasicRangeCCC",
-      regex: try! Regex(basicRange),
+    register(
+      nameBase: "BasicRangeCCC",
+      input: input,
       pattern: basicRange,
-      type: .allMatches,
-      target: input))
+      try! Regex(basicRange),
+      .allMatches)
 
-    register(Benchmark(
-      name: "CaseInsensitiveCCC",
-      regex: try! Regex(caseInsensitive),
+    register(
+      nameBase: "CaseInsensitiveCCC",
+      input: input,
       pattern: caseInsensitive,
-      type: .allMatches,
-      target: input))
+      try! Regex(caseInsensitive),
+      .allMatches)
 
-    register(Benchmark(
-      name: "InvertedCCC",
-      regex: try! Regex(inverted),
+    register(
+      nameBase: "InvertedCCC",
+      input: input,
       pattern: inverted,
-      type: .allMatches,
-      target: input))
+      try! Regex(inverted),
+      .allMatches)
 
-    register(Benchmark(
-      name: "SubtractionCCC",
-      regex: try! Regex(subtraction),
+    register(
+      nameBase: "SubtractionCCC",
+      input: input,
       pattern: subtraction,
-      type: .allMatches,
-      target: input))
+      try! Regex(subtraction),
+      .allMatches)
 
-    register(Benchmark(
-      name: "IntersectionCCC",
-      regex: try! Regex(intersection),
+    register(
+      nameBase: "IntersectionCCC",
+      input: input,
       pattern: intersection,
-      type: .allMatches,
-      target: input))
+      try! Regex(intersection),
+      .allMatches)
 
-    register(Benchmark(
-      name: "symDiffCCC",
-      regex: try! Regex(symmetricDifference),
+    register(
+      nameBase: "symDiffCCC",
+      input: input,
       pattern: symmetricDifference,
-      type: .allMatches,
-      target: input))
+      try! Regex(symmetricDifference),
+      .allMatches)
   }
 }
diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
index b50d1c213..d8c8c347b 100644
--- a/Sources/_StringProcessing/Engine/MEBuiltins.swift
+++ b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -63,10 +63,10 @@ extension Processor {
       switch payload.semanticLevel {
       case .graphemeCluster:
         return input.index(after: currentPosition) == subjectBounds.upperBound
-         && input[currentPosition].isNewline
+          && input[currentPosition].isNewline
       case .unicodeScalar:
         return input.unicodeScalars.index(after: currentPosition) == subjectBounds.upperBound
-         && input.unicodeScalars[currentPosition].isNewline
+          && input.unicodeScalars[currentPosition].isNewline
       }
 
     case .endOfSubject: return currentPosition == subjectBounds.upperBound
@@ -121,6 +121,7 @@ extension Processor {
 
 // MARK: Matching `.`
 extension String {
+  // TODO: Should the below have a `limitedBy` parameter?
 
   func _matchAnyNonNewline(
     at currentPosition: String.Index,
@@ -155,11 +156,11 @@ extension String {
       return .unknown
     }
     switch asciiValue {
-      case ._lineFeed, ._carriageReturn:
-        return .definite(nil)
-      default:
-        assert(!isCRLF)
-        return .definite(next)
+    case (._lineFeed)...(._carriageReturn):
+      return .definite(nil)
+    default:
+      assert(!isCRLF)
+      return .definite(next)
     }
   }
 
@@ -183,6 +184,7 @@ extension String {
 
 // MARK: - Built-in character class matching
 extension String {
+  // TODO: Should the below have a `limitedBy` parameter?
 
   // Mentioned in ProgrammersManual.md, update docs if redesigned
   func _matchBuiltinCC(
diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift
index 81b80b00e..873627567 100644
--- a/Sources/_StringProcessing/Engine/MEQuantify.swift
+++ b/Sources/_StringProcessing/Engine/MEQuantify.swift
@@ -1,31 +1,37 @@
 extension Processor {
   func _doQuantifyMatch(_ payload: QuantifyPayload) -> Input.Index? {
-    var next: Input.Index?
+    // FIXME: is the below updated for scalar semantics?
     switch payload.type {
     case .bitset:
-      next = input.matchBitset(
+      return input.matchBitset(
         registers[payload.bitset], at: currentPosition, limitedBy: end)
     case .asciiChar:
-      next = input.matchScalar(
+      return input.matchScalar(
         UnicodeScalar.init(_value: UInt32(payload.asciiChar)),
         at: currentPosition,
         limitedBy: end,
         boundaryCheck: true)
     case .builtin:
+      // FIXME: bounds check? endIndex or end?
+
       // We only emit .quantify if it consumes a single character
-      next = input._matchBuiltinCC(
+      return input._matchBuiltinCC(
         payload.builtin,
         at: currentPosition,
         isInverted: payload.builtinIsInverted,
         isStrictASCII: payload.builtinIsStrict,
         isScalarSemantics: false)
     case .any:
-      // TODO: call out to existing code with quick check
-      let matched = currentPosition != input.endIndex
-        && (!input[currentPosition].isNewline || payload.anyMatchesNewline)
-      next = matched ? input.index(after: currentPosition) : nil
+      // FIXME: endIndex or end?
+      guard currentPosition < input.endIndex else { return nil }
+
+      if payload.anyMatchesNewline {
+        return input.index(after: currentPosition)
+      }
+
+      return input._matchAnyNonNewline(
+        at: currentPosition, isScalarSemantics: false)
     }
-    return next
   }
 
   /// Generic quantify instruction interpreter
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
index e8e41a114..a6c9babbe 100644
--- a/Tests/RegexTests/MatchTests.swift
+++ b/Tests/RegexTests/MatchTests.swift
@@ -1891,6 +1891,11 @@ extension RegexTests {
   func testSingleLineMode() {
     firstMatchTest(#".+"#, input: "a\nb", match: "a")
     firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb")
+
+    // We recognize LF, line tab, FF, and CR as newlines by default
+    firstMatchTest(#"."#, input: "\u{A}\u{B}\u{C}\u{D}\nb", match: "b")
+    firstMatchTest(#".+"#, input: "\u{A}\u{B}\u{C}\u{D}\nbb", match: "bb")
+
   }
 
   func testMatchNewlines() {