diff --git a/Package.swift b/Package.swift index abc895813..c1e9bff37 100644 --- a/Package.swift +++ b/Package.swift @@ -75,15 +75,17 @@ let package = Package( name: "RegexBuilder", dependencies: ["_StringProcessing", "_RegexParser"], swiftSettings: publicStdlibSettings), + .target(name: "TestSupport", + swiftSettings: [availabilityDefinition]), .testTarget( name: "RegexTests", - dependencies: ["_StringProcessing"], + dependencies: ["_StringProcessing", "TestSupport"], swiftSettings: [ .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]), ]), .testTarget( name: "RegexBuilderTests", - dependencies: ["_StringProcessing", "RegexBuilder"], + dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"], swiftSettings: [ .unsafeFlags(["-Xfrontend", "-disable-availability-checking"]) ]), diff --git a/Sources/RegexBuilder/Anchor.swift b/Sources/RegexBuilder/Anchor.swift index 31a3e8a0d..cf1931577 100644 --- a/Sources/RegexBuilder/Anchor.swift +++ b/Sources/RegexBuilder/Anchor.swift @@ -37,16 +37,30 @@ public struct Anchor { @available(SwiftStdlib 5.7, *) extension Anchor: RegexComponent { - var baseAssertion: DSLTree._AST.AssertionKind { + var baseAssertion: DSLTree.Atom.Assertion { switch kind { - case .startOfSubject: return .startOfSubject(isInverted) - case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline(isInverted) - case .endOfSubject: return .endOfSubject(isInverted) - case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject(isInverted) - case .textSegmentBoundary: return .textSegmentBoundary(isInverted) - case .startOfLine: return .startOfLine(isInverted) - case .endOfLine: return .endOfLine(isInverted) - case .wordBoundary: return .wordBoundary(isInverted) + case .startOfSubject: + // FIXME: Inverted? + return .startOfSubject + case .endOfSubjectBeforeNewline: + // FIXME: Inverted? + return .endOfSubjectBeforeNewline + case .endOfSubject: + // FIXME: Inverted? 
+ return .endOfSubject + case .firstMatchingPositionInSubject: + // FIXME: Inverted? + return .firstMatchingPositionInSubject + case .textSegmentBoundary: + return isInverted ? .notTextSegment : .textSegment + case .startOfLine: + // FIXME: Inverted? + return .startOfLine + case .endOfLine: + // FIXME: Inverted? + return .endOfLine + case .wordBoundary: + return isInverted ? .notWordBoundary : .wordBoundary } } @@ -104,6 +118,12 @@ extension Anchor { /// /// This anchor is equivalent to `^` in regex syntax when the `m` option /// has been enabled or `anchorsMatchLineEndings(true)` has been called. + /// + /// For example, the following regexes are all equivalent: + /// + /// - `Regex { Anchor.startOfLine }` + /// - `/(?m)^/` or `/(?m:^)/` + /// - `/^/.anchorsMatchLineEndings(true)` public static var startOfLine: Anchor { Anchor(kind: .startOfLine) } @@ -113,6 +133,12 @@ extension Anchor { /// /// This anchor is equivalent to `$` in regex syntax when the `m` option /// has been enabled or `anchorsMatchLineEndings(true)` has been called. 
+ /// + /// For example, the following regexes are all equivalent: + /// + /// - `Regex { Anchor.endOfLine }` + /// - `/(?m)$/` or `/(?m:$)/` + /// - `/$/.anchorsMatchLineEndings(true)` public static var endOfLine: Anchor { Anchor(kind: .endOfLine) } diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift index a6d18b2cf..ea52c28f3 100644 --- a/Sources/RegexBuilder/CharacterClass.swift +++ b/Sources/RegexBuilder/CharacterClass.swift @@ -20,11 +20,8 @@ public struct CharacterClass { self.ccc = ccc } - init(unconverted model: _CharacterClassModel) { - guard let ccc = model.makeDSLTreeCharacterClass() else { - fatalError("Unsupported character class") - } - self.ccc = ccc + init(unconverted atom: DSLTree._AST.Atom) { + self.ccc = .init(members: [.atom(.unconverted(atom))]) } } @@ -48,16 +45,20 @@ extension RegexComponent where Self == CharacterClass { .init(DSLTree.CustomCharacterClass(members: [.atom(.any)])) } + public static var anyNonNewline: CharacterClass { + .init(DSLTree.CustomCharacterClass(members: [.atom(.anyNonNewline)])) + } + public static var anyGraphemeCluster: CharacterClass { - .init(unconverted: .anyGrapheme) + .init(unconverted: ._anyGrapheme) } public static var whitespace: CharacterClass { - .init(unconverted: .whitespace) + .init(unconverted: ._whitespace) } public static var digit: CharacterClass { - .init(unconverted: .digit) + .init(unconverted: ._digit) } public static var hexDigit: CharacterClass { @@ -69,19 +70,19 @@ extension RegexComponent where Self == CharacterClass { } public static var horizontalWhitespace: CharacterClass { - .init(unconverted: .horizontalWhitespace) + .init(unconverted: ._horizontalWhitespace) } public static var newlineSequence: CharacterClass { - .init(unconverted: .newlineSequence) + .init(unconverted: ._newlineSequence) } public static var verticalWhitespace: CharacterClass { - .init(unconverted: .verticalWhitespace) + .init(unconverted: ._verticalWhitespace) } public 
static var word: CharacterClass { - .init(unconverted: .word) + .init(unconverted: ._word) } } diff --git a/Sources/TestSupport/TestSupport.swift b/Sources/TestSupport/TestSupport.swift new file mode 100644 index 000000000..b60adb63f --- /dev/null +++ b/Sources/TestSupport/TestSupport.swift @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +import XCTest + +// We need to split this out of the test files, as it needs to be compiled +// *without* `-disable-availability-checking` to ensure the #available check is +// not compiled into a no-op. + +#if os(Linux) +public func XCTExpectFailure( + _ message: String? = nil, body: () throws -> Void +) rethrows {} +#endif + +/// Guards certain tests to make sure we have a new stdlib available. +public func ensureNewStdlib( + file: StaticString = #file, line: UInt = #line +) -> Bool { + guard #available(SwiftStdlib 5.7, *) else { + XCTExpectFailure { XCTFail("Unsupported stdlib", file: file, line: line) } + return false + } + return true +} diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index f1419ad78..8706327f7 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -60,13 +60,13 @@ extension AST { case namedCharacter(String) /// . 
- case any + case dot /// ^ - case startOfLine + case caretAnchor /// $ - case endOfLine + case dollarAnchor // References case backreference(Reference) @@ -104,9 +104,9 @@ extension AST.Atom { case .callout(let v): return v case .backtrackingDirective(let v): return v case .changeMatchingOptions(let v): return v - case .any: return nil - case .startOfLine: return nil - case .endOfLine: return nil + case .dot: return nil + case .caretAnchor: return nil + case .dollarAnchor: return nil case .invalid: return nil } } @@ -511,67 +511,6 @@ extension AST.Atom.CharacterProperty { } } -extension AST.Atom { - /// Anchors and other built-in zero-width assertions. - public enum AssertionKind: String, Hashable { - /// \A - case startOfSubject = #"\A"# - - /// \Z - case endOfSubjectBeforeNewline = #"\Z"# - - /// \z - case endOfSubject = #"\z"# - - /// \K - case resetStartOfMatch = #"\K"# - - /// \G - case firstMatchingPositionInSubject = #"\G"# - - /// \y - case textSegment = #"\y"# - - /// \Y - case notTextSegment = #"\Y"# - - /// ^ - case startOfLine = #"^"# - - /// $ - case endOfLine = #"$"# - - /// \b (from outside a custom character class) - case wordBoundary = #"\b"# - - /// \B - case notWordBoundary = #"\B"# - - } - - public var assertionKind: AssertionKind? 
{ - switch kind { - case .startOfLine: return .startOfLine - case .endOfLine: return .endOfLine - - case .escaped(.wordBoundary): return .wordBoundary - case .escaped(.notWordBoundary): return .notWordBoundary - case .escaped(.startOfSubject): return .startOfSubject - case .escaped(.endOfSubject): return .endOfSubject - case .escaped(.textSegment): return .textSegment - case .escaped(.notTextSegment): return .notTextSegment - case .escaped(.endOfSubjectBeforeNewline): - return .endOfSubjectBeforeNewline - case .escaped(.firstMatchingPositionInSubject): - return .firstMatchingPositionInSubject - - case .escaped(.resetStartOfMatch): return .resetStartOfMatch - - default: return nil - } - } -} - extension AST.Atom { public enum Callout: Hashable { /// A PCRE callout written `(?C...)` @@ -806,9 +745,9 @@ extension AST.Atom { // the AST? Or defer for the matching engine? return nil - case .scalarSequence, .property, .any, .startOfLine, .endOfLine, - .backreference, .subpattern, .callout, .backtrackingDirective, - .changeMatchingOptions, .invalid: + case .scalarSequence, .property, .dot, .caretAnchor, + .dollarAnchor, .backreference, .subpattern, .callout, + .backtrackingDirective, .changeMatchingOptions, .invalid: return nil } } @@ -816,8 +755,10 @@ extension AST.Atom { /// Whether this atom is valid as the operand of a custom character class /// range. public var isValidCharacterClassRangeBound: Bool { - // If we have a literal character value for this, it can be used as a bound. - if literalCharacterValue != nil { return true } + if let c = literalCharacterValue { + // We only match character range bounds that are single scalar NFC. 
+ return c.hasExactlyOneScalar && c.isNFC + } switch kind { // \cx, \C-x, \M-x, \M-\C-x, \N{...} case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter: @@ -858,7 +799,7 @@ extension AST.Atom { case .keyboardMetaControl(let x): return "\\M-\\C-\(x)" - case .property, .escaped, .any, .startOfLine, .endOfLine, + case .property, .escaped, .dot, .caretAnchor, .dollarAnchor, .backreference, .subpattern, .namedCharacter, .callout, .backtrackingDirective, .changeMatchingOptions, .invalid: return nil @@ -874,7 +815,7 @@ extension AST.Atom { // TODO: Are callouts quantifiable? case .escaped(let esc): return esc.isQuantifiable - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: return false default: return true diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift index 2168dbb03..a830a18b7 100644 --- a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift +++ b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift @@ -480,35 +480,37 @@ extension Parser { /// mutating func lexQuantifier( ) -> (Located, Located, [AST.Trivia])? { - var trivia: [AST.Trivia] = [] + tryEating { p in + var trivia: [AST.Trivia] = [] - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let amt: Located? = recordLoc { p in - if p.tryEat("*") { return .zeroOrMore } - if p.tryEat("+") { return .oneOrMore } - if p.tryEat("?") { return .zeroOrOne } + let amt: Located? 
= p.recordLoc { p in + if p.tryEat("*") { return .zeroOrMore } + if p.tryEat("+") { return .oneOrMore } + if p.tryEat("?") { return .zeroOrOne } - return p.tryEating { p in - guard p.tryEat("{"), - let range = p.lexRange(trivia: &trivia), - p.tryEat("}") - else { return nil } - return range.value + return p.tryEating { p in + guard p.tryEat("{"), + let range = p.lexRange(trivia: &trivia), + p.tryEat("}") + else { return nil } + return range.value + } } - } - guard let amt = amt else { return nil } + guard let amt = amt else { return nil } - // PCRE allows non-semantic whitespace here in extended syntax mode. - if let t = lexNonSemanticWhitespace() { trivia.append(t) } + // PCRE allows non-semantic whitespace here in extended syntax mode. + if let t = p.lexNonSemanticWhitespace() { trivia.append(t) } - let kind: Located = recordLoc { p in - if p.tryEat("?") { return .reluctant } - if p.tryEat("+") { return .possessive } - return .eager - } + let kind: Located = p.recordLoc { p in + if p.tryEat("?") { return .reluctant } + if p.tryEat("+") { return .possessive } + return .eager + } - return (amt, kind, trivia) + return (amt, kind, trivia) + } } /// Try to consume a range, returning `nil` if unsuccessful. @@ -2073,9 +2075,9 @@ extension Parser { p.unreachable("Should have lexed a group or group-like atom") // (sometimes) special metacharacters - case ".": return customCC ? .char(".") : .any - case "^": return customCC ? .char("^") : .startOfLine - case "$": return customCC ? .char("$") : .endOfLine + case ".": return customCC ? .char(".") : .dot + case "^": return customCC ? .char("^") : .caretAnchor + case "$": return customCC ? 
.char("$") : .dollarAnchor // Escaped case "\\": return p.expectEscaped().value diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 0aeee282d..ea541fba7 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -221,7 +221,7 @@ extension RegexValidator { ) { switch esc { case .resetStartOfMatch, .singleDataUnit, .trueAnychar, - // '\N' needs to be emitted using 'emitAny'. + // '\N' needs to be emitted using 'emitDot'. .notNewline: error(.unsupported("'\\\(esc.character)'"), at: loc) @@ -288,7 +288,7 @@ extension RegexValidator { at: atom.location) } - case .char, .scalar, .startOfLine, .endOfLine, .any: + case .char, .scalar, .caretAnchor, .dollarAnchor, .dot: break case .invalid: diff --git a/Sources/_RegexParser/Regex/Printing/DumpAST.swift b/Sources/_RegexParser/Regex/Printing/DumpAST.swift index 48a2512cf..cf5a56721 100644 --- a/Sources/_RegexParser/Regex/Printing/DumpAST.swift +++ b/Sources/_RegexParser/Regex/Printing/DumpAST.swift @@ -153,9 +153,9 @@ extension AST.Atom { case .keyboardControl, .keyboardMeta, .keyboardMetaControl: fatalError("TODO") - case .any: return "." - case .startOfLine: return "^" - case .endOfLine: return "$" + case .dot: return "." 
+ case .caretAnchor: return "^" + case .dollarAnchor: return "$" case .backreference(let r), .subpattern(let r): return "\(r._dumpBase)" diff --git a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift index 0e7cfb1d3..6b8c8ab93 100644 --- a/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift +++ b/Sources/_RegexParser/Regex/Printing/PrintAsCanonical.swift @@ -237,9 +237,6 @@ extension AST.Atom.Number { extension AST.Atom { var _canonicalBase: String { - if let anchor = self.assertionKind { - return anchor.rawValue - } if let lit = self.literalStringValue { // FIXME: We may have to re-introduce escapes // For example, `\.` will come back as "." instead @@ -248,6 +245,10 @@ extension AST.Atom { return lit } switch self.kind { + case .caretAnchor: + return "^" + case .dollarAnchor: + return "$" case .escaped(let e): return "\\\(e.character)" case .backreference(let br): diff --git a/Sources/_RegexParser/Utility/Misc.swift b/Sources/_RegexParser/Utility/Misc.swift index d37dfbd4a..70dc7a7d5 100644 --- a/Sources/_RegexParser/Utility/Misc.swift +++ b/Sources/_RegexParser/Utility/Misc.swift @@ -19,6 +19,21 @@ extension Substring { var string: String { String(self) } } +extension Character { + /// Whether this character is made up of exactly one Unicode scalar value. + public var hasExactlyOneScalar: Bool { + let scalars = unicodeScalars + return scalars.index(after: scalars.startIndex) == scalars.endIndex + } + + /// Whether the given character is in NFC form. 
+ internal var isNFC: Bool { + if isASCII { return true } + let str = String(self) + return str._nfcCodeUnits.elementsEqual(str.utf8) + } +} + extension CustomStringConvertible { @_alwaysEmitIntoClient public var halfWidthCornerQuoted: String { diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index d18d50aa0..e8c92f2b5 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -58,14 +58,24 @@ fileprivate extension Compiler.ByteCodeGen { case .any: emitAny() + case .anyNonNewline: + emitAnyNonNewline() + + case .dot: + emitDot() + case let .char(c): - try emitCharacter(c) + emitCharacter(c) case let .scalar(s): - try emitScalar(s) + if options.semanticLevel == .graphemeCluster { + emitCharacter(Character(s)) + } else { + emitMatchScalar(s) + } case let .assertion(kind): - try emitAssertion(kind.ast) + try emitAssertion(kind) case let .backreference(ref): try emitBackreference(ref.ast) @@ -88,6 +98,34 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitQuotedLiteral(_ s: String) { + guard options.semanticLevel == .graphemeCluster else { + for char in s { + for scalar in char.unicodeScalars { + emitMatchScalar(scalar) + } + } + return + } + + // Fast path for eliding boundary checks for an all ascii quoted literal + if optimizationsEnabled && s.allSatisfy(\.isASCII) { + let lastIdx = s.unicodeScalars.indices.last! 
+ for idx in s.unicodeScalars.indices { + let boundaryCheck = idx == lastIdx + let scalar = s.unicodeScalars[idx] + if options.isCaseInsensitive && scalar.properties.isCased { + builder.buildMatchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) + } else { + builder.buildMatchScalar(scalar, boundaryCheck: boundaryCheck) + } + } + return + } + + for c in s { emitCharacter(c) } + } + mutating func emitBackreference( _ ref: AST.Reference ) throws { @@ -110,8 +148,34 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitStartOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.lowerBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[input.index(before: pos)].isNewline + case .unicodeScalar: + return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline + } + } + } + + mutating func emitEndOfLine() { + builder.buildAssert { [semanticLevel = options.semanticLevel] + (_, _, input, pos, subjectBounds) in + if pos == subjectBounds.upperBound { return true } + switch semanticLevel { + case .graphemeCluster: + return input[pos].isNewline + case .unicodeScalar: + return input.unicodeScalars[pos].isNewline + } + } + } + mutating func emitAssertion( - _ kind: AST.Atom.AssertionKind + _ kind: DSLTree.Atom.Assertion ) throws { // FIXME: Depends on API model we have... We may want to // think through some of these with API interactions in mind @@ -168,43 +232,23 @@ fileprivate extension Compiler.ByteCodeGen { } case .startOfLine: - // FIXME: Anchor.startOfLine must always use this first branch - // The behavior of `^` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.startOfLine` anchor should always match the start - // of a line. Right now we don't distinguish between those anchors. 
+ emitStartOfLine() + + case .endOfLine: + emitEndOfLine() + + case .caretAnchor: if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.lowerBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[input.index(before: pos)].isNewline - case .unicodeScalar: - return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline - } - } + emitStartOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.lowerBound } } - - case .endOfLine: - // FIXME: Anchor.endOfLine must always use this first branch - // The behavior of `$` should depend on `anchorsMatchNewlines`, but - // the DSL-based `.endOfLine` anchor should always match the end - // of a line. Right now we don't distinguish between those anchors. + + case .dollarAnchor: if options.anchorsMatchNewlines { - builder.buildAssert { [semanticLevel = options.semanticLevel] - (_, _, input, pos, subjectBounds) in - if pos == subjectBounds.upperBound { return true } - switch semanticLevel { - case .graphemeCluster: - return input[pos].isNewline - case .unicodeScalar: - return input.unicodeScalars[pos].isNewline - } - } + emitEndOfLine() } else { builder.buildAssert { (_, _, input, pos, subjectBounds) in pos == subjectBounds.upperBound @@ -245,60 +289,70 @@ fileprivate extension Compiler.ByteCodeGen { } } - mutating func emitScalar(_ s: UnicodeScalar) throws { - // TODO: Native instruction buildMatchScalar(s) - if options.isCaseInsensitive { - // TODO: e.g. 
buildCaseInsensitiveMatchScalar(s) - builder.buildConsume(by: consumeScalar { - $0.properties.lowercaseMapping == s.properties.lowercaseMapping - }) + mutating func emitMatchScalar(_ s: UnicodeScalar) { + assert(options.semanticLevel == .unicodeScalar) + if options.isCaseInsensitive && s.properties.isCased { + builder.buildMatchScalarCaseInsensitive(s, boundaryCheck: false) } else { - builder.buildConsume(by: consumeScalar { - $0 == s - }) + builder.buildMatchScalar(s, boundaryCheck: false) } } - mutating func emitCharacter(_ c: Character) throws { - // Unicode scalar matches the specific scalars that comprise a character + mutating func emitCharacter(_ c: Character) { + // Unicode scalar mode matches the specific scalars that comprise a character if options.semanticLevel == .unicodeScalar { for scalar in c.unicodeScalars { - try emitScalar(scalar) + emitMatchScalar(scalar) } return } if options.isCaseInsensitive && c.isCased { - // TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true) - builder.buildConsume { input, bounds in - let inputChar = input[bounds.lowerBound].lowercased() - let matchChar = c.lowercased() - return inputChar == matchChar - ? input.index(after: bounds.lowerBound) - : nil + if optimizationsEnabled && c.isASCII { + // c.isCased ensures that c is not CR-LF, + // so we know that c is a single scalar + assert(c.unicodeScalars.count == 1) + builder.buildMatchScalarCaseInsensitive( + c.unicodeScalars.last!, + boundaryCheck: true) + } else { + builder.buildMatch(c, isCaseInsensitive: true) } - } else { - builder.buildMatch(c) + return + } + + if optimizationsEnabled && c.isASCII { + let lastIdx = c.unicodeScalars.indices.last! 
+ for idx in c.unicodeScalars.indices { + builder.buildMatchScalar(c.unicodeScalars[idx], boundaryCheck: idx == lastIdx) + } + return } + + builder.buildMatch(c, isCaseInsensitive: false) } mutating func emitAny() { - switch (options.semanticLevel, options.dotMatchesNewline) { - case (.graphemeCluster, true): + switch options.semanticLevel { + case .graphemeCluster: builder.buildAdvance(1) - case (.graphemeCluster, false): + case .unicodeScalar: + // TODO: builder.buildAdvanceUnicodeScalar(1) builder.buildConsume { input, bounds in - input[bounds.lowerBound].isNewline - ? nil - : input.index(after: bounds.lowerBound) + input.unicodeScalars.index(after: bounds.lowerBound) } + } + } - case (.unicodeScalar, true): - // TODO: builder.buildAdvanceUnicodeScalar(1) + mutating func emitAnyNonNewline() { + switch options.semanticLevel { + case .graphemeCluster: builder.buildConsume { input, bounds in - input.unicodeScalars.index(after: bounds.lowerBound) + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) } - case (.unicodeScalar, false): + case .unicodeScalar: builder.buildConsume { input, bounds in input[bounds.lowerBound].isNewline ? nil @@ -307,6 +361,14 @@ fileprivate extension Compiler.ByteCodeGen { } } + mutating func emitDot() { + if options.dotMatchesNewline { + emitAny() + } else { + emitAnyNonNewline() + } + } + mutating func emitAlternation( _ children: [DSLTree.Node] ) throws { @@ -543,7 +605,12 @@ fileprivate extension Compiler.ByteCodeGen { decrement %minTrips and fallthrough loop-body: + : + mov currentPosition %pos evaluate the subexpression + : + if %pos is currentPosition: + goto exit goto min-trip-count control block exit-policy control block: @@ -646,7 +713,28 @@ fileprivate extension Compiler.ByteCodeGen { // // branch min-trip-count builder.label(loopBody) + + // if we aren't sure if the child node will have forward progress and + // we have an unbounded quantification + let startPosition: PositionRegister? 
+ let emitPositionChecking = + (!optimizationsEnabled || !child.guaranteesForwardProgress) && + extraTrips == nil + + if emitPositionChecking { + startPosition = builder.makePositionRegister() + builder.buildMoveCurrentPosition(into: startPosition!) + } else { + startPosition = nil + } try emitNode(child) + if emitPositionChecking { + // in all quantifier cases, no matter what minTrips or extraTrips is, + // if we have a successful non-advancing match, branch to exit because it + // can match an arbitrary number of times + builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!) + } + if minTrips <= 1 { // fallthrough } else { @@ -687,21 +775,183 @@ fileprivate extension Compiler.ByteCodeGen { builder.label(exit) } + /// Coalesce any adjacent scalar members in a custom character class together. + /// This is required in order to produce correct grapheme matching behavior. + func coalescingCustomCharacterClassMembers( + _ members: [DSLTree.CustomCharacterClass.Member] + ) -> [DSLTree.CustomCharacterClass.Member] { + struct Accumulator { + /// A series of range operands. For example, in `[ab-cde-fg]`, this will + /// contain the strings `["ab", "cde", "fg"]`. From there, the resulting + /// ranges will be created. + private var rangeOperands: [String] = [""] + + /// The current range operand. + private var current: String { + _read { yield rangeOperands[rangeOperands.count - 1] } + _modify { yield &rangeOperands[rangeOperands.count - 1] } + } + + /// Try to accumulate a character class member, returning `true` if + /// successful, `false` otherwise. 
+ mutating func tryAccumulate( + _ member: DSLTree.CustomCharacterClass.Member + ) -> Bool { + switch member { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + current.append(c) + return true + case .quotedLiteral(let str): + current += str + return true + case let .range(lhs, rhs): + guard let lhs = lhs.literalCharacterValue, + let rhs = rhs.literalCharacterValue + else { return false } + current.append(lhs) + rangeOperands.append(String(rhs)) + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !current.isEmpty + default: + return false + } + } + + func finish() -> [DSLTree.CustomCharacterClass.Member] { + if rangeOperands.count == 1 { + // If we didn't have any additional range operands, this isn't a + // range, we can just form a standard quoted literal. + return [.quotedLiteral(current)] + } + var members = [DSLTree.CustomCharacterClass.Member]() + + // We have other range operands, splice them together. For N operands + // we have N - 1 ranges. + for (i, lhs) in rangeOperands.dropLast().enumerated() { + let rhs = rangeOperands[i + 1] + + // If this is the first operand we only need to drop the last + // character for its quoted members, otherwise this is both an LHS + // and RHS of a range, and as such needs both sides trimmed. + let leading = i == 0 ? lhs.dropLast() : lhs.dropFirst().dropLast() + if !leading.isEmpty { + members.append(.quotedLiteral(String(leading))) + } + members.append(.range(.char(lhs.last!), .char(rhs.first!))) + } + // We've handled everything except the quoted portion of the last + // operand, add it now. + let trailing = rangeOperands.last!.dropFirst() + if !trailing.isEmpty { + members.append(.quotedLiteral(String(trailing))) + } + return members + } + } + return members + .map { m -> DSLTree.CustomCharacterClass.Member in + // First we need to recursively coalesce any child character classes. 
+ switch m { + case .custom(let ccc): + return .custom(coalescingCustomCharacterClass(ccc)) + case .intersection(let lhs, let rhs): + return .intersection( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .subtraction(let lhs, let rhs): + return .subtraction( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .symmetricDifference(let lhs, let rhs): + return .symmetricDifference( + coalescingCustomCharacterClass(lhs), + coalescingCustomCharacterClass(rhs)) + case .atom, .range, .quotedLiteral, .trivia: + return m + } + } + .coalescing(with: Accumulator(), into: { $0.finish() }) { accum, member in + accum.tryAccumulate(member) + } + } + + func coalescingCustomCharacterClass( + _ ccc: DSLTree.CustomCharacterClass + ) -> DSLTree.CustomCharacterClass { + // This only needs to be done in grapheme semantic mode. In scalar semantic + // mode, we don't want to coalesce any scalars into a grapheme. This + // means that e.g `[e\u{301}-\u{302}]` remains a range between U+301 and + // U+302. + guard options.semanticLevel == .graphemeCluster else { return ccc } + + let members = coalescingCustomCharacterClassMembers(ccc.members) + return .init(members: members, isInverted: ccc.isInverted) + } + mutating func emitCustomCharacterClass( _ ccc: DSLTree.CustomCharacterClass ) throws { + // Before emitting a custom character class in grapheme semantic mode, we + // need to coalesce together any adjacent characters and scalars, over which + // we can perform grapheme breaking. This includes e.g range bounds for + // `[e\u{301}-\u{302}]`. 
+ let ccc = coalescingCustomCharacterClass(ccc) if let asciiBitset = ccc.asAsciiBitset(options), - options.semanticLevel == .graphemeCluster, optimizationsEnabled { - // future work: add a bit to .matchBitset to consume either a character - // or a scalar so we can have this optimization in scalar mode - builder.buildMatchAsciiBitset(asciiBitset) + if options.semanticLevel == .unicodeScalar { + builder.buildScalarMatchAsciiBitset(asciiBitset) + } else { + builder.buildMatchAsciiBitset(asciiBitset) + } } else { let consumer = try ccc.generateConsumer(options) builder.buildConsume(by: consumer) } } + mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { + // Before emitting a concatenation, we need to flatten out any nested + // concatenations, and coalesce any adjacent characters and scalars, forming + // quoted literals of their contents, over which we can perform grapheme + // breaking. + func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { + switch node { + case .concatenation(let ch): + return ch.flatMap(flatten) + case .convertedRegexLiteral(let n, _): + return flatten(n) + default: + return [node] + } + } + let children = children + .flatMap(flatten) + .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in + switch node { + case .atom(let a): + guard let c = a.literalCharacterValue else { return false } + str.append(c) + return true + case .quotedLiteral(let q): + str += q + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + return !str.isEmpty + default: + return false + } + } + for child in children { + try emitConcatenationComponent(child) + } + } + @discardableResult mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? 
{ switch node { @@ -710,9 +960,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitAlternation(children) case let .concatenation(children): - for child in children { - try emitConcatenationComponent(child) - } + try emitConcatenation(children) case let .capture(name, refId, child, transform): options.beginScope() @@ -758,9 +1006,9 @@ fileprivate extension Compiler.ByteCodeGen { try emitQuantification(amt.ast, kind, child) case let .customCharacterClass(ccc): - if ccc.containsAny { + if ccc.containsDot { if !ccc.isInverted { - emitAny() + emitDot() } else { throw Unsupported("Inverted any") } @@ -772,45 +1020,7 @@ fileprivate extension Compiler.ByteCodeGen { try emitAtom(a) case let .quotedLiteral(s): - if options.semanticLevel == .graphemeCluster { - if options.isCaseInsensitive { - // TODO: buildCaseInsensitiveMatchSequence(c) or alternative - builder.buildConsume { input, bounds in - var iterator = s.makeIterator() - var currentIndex = bounds.lowerBound - while let ch = iterator.next() { - guard currentIndex < bounds.upperBound, - ch.lowercased() == input[currentIndex].lowercased() - else { return nil } - input.formIndex(after: &currentIndex) - } - return currentIndex - } - } else { - builder.buildMatchSequence(s) - } - } else { - builder.buildConsume { - [caseInsensitive = options.isCaseInsensitive] input, bounds in - // TODO: Case folding - var iterator = s.unicodeScalars.makeIterator() - var currentIndex = bounds.lowerBound - while let scalar = iterator.next() { - guard currentIndex < bounds.upperBound else { return nil } - if caseInsensitive { - if scalar.properties.lowercaseMapping != input.unicodeScalars[currentIndex].properties.lowercaseMapping { - return nil - } - } else { - if scalar != input.unicodeScalars[currentIndex] { - return nil - } - } - input.unicodeScalars.formIndex(after: &currentIndex) - } - return currentIndex - } - } + emitQuotedLiteral(s) case let .convertedRegexLiteral(n, _): return try emitNode(n) @@ -832,3 +1042,42 @@ fileprivate extension 
Compiler.ByteCodeGen { return nil } } + +extension DSLTree.Node { + var guaranteesForwardProgress: Bool { + switch self { + case .orderedChoice(let children): + return children.allSatisfy { $0.guaranteesForwardProgress } + case .concatenation(let children): + return children.contains(where: { $0.guaranteesForwardProgress }) + case .capture(_, _, let node, _): + return node.guaranteesForwardProgress + case .nonCapturingGroup(let kind, let child): + switch kind.ast { + case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind: + return false + default: return child.guaranteesForwardProgress + } + case .atom(let atom): + switch atom { + case .changeMatchingOptions, .assertion: return false + default: return true + } + case .trivia, .empty: + return false + case .quotedLiteral(let string): + return !string.isEmpty + case .convertedRegexLiteral(let node, _): + return node.guaranteesForwardProgress + case .consumer, .matcher: + // Allow zero width consumers and matchers + return false + case .customCharacterClass: + return true + case .quantification(let amount, _, let child): + let (atLeast, _) = amount.ast.bounds + return atLeast ?? 0 > 0 && child.guaranteesForwardProgress + default: return false + } + } +} diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 530126a32..b8daa8b21 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -42,19 +42,43 @@ class Compiler { } } +/// Hashable wrapper for `Any.Type`. +struct AnyHashableType: CustomStringConvertible, Hashable { + var ty: Any.Type + init(_ ty: Any.Type) { + self.ty = ty + } + var description: String { "\(ty)" } + + static func == (lhs: Self, rhs: Self) -> Bool { + lhs.ty == rhs.ty + } + func hash(into hasher: inout Hasher) { + hasher.combine(ObjectIdentifier(ty)) + } +} + // An error produced when compiling a regular expression. 
-enum RegexCompilationError: Error, CustomStringConvertible { +enum RegexCompilationError: Error, Hashable, CustomStringConvertible { // TODO: Source location? case uncapturedReference + case incorrectOutputType(incorrect: AnyHashableType, correct: AnyHashableType) + case invalidCharacterClassRangeOperand(Character) + + static func incorrectOutputType( + incorrect: Any.Type, correct: Any.Type + ) -> Self { + .incorrectOutputType(incorrect: .init(incorrect), correct: .init(correct)) + } - case incorrectOutputType(incorrect: Any.Type, correct: Any.Type) - var description: String { switch self { case .uncapturedReference: return "Found a reference used before it captured any match." case .incorrectOutputType(let incorrect, let correct): return "Cast to incorrect type 'Regex<\(incorrect)>', expected 'Regex<\(correct)>'" + case .invalidCharacterClassRangeOperand(let c): + return "'\(c)' is an invalid bound for character class range" } } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index dbb324b67..083781120 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -11,6 +11,13 @@ @_implementationOnly import _RegexParser +extension Character { + var _singleScalarAsciiValue: UInt8? { + guard self != "\r\n" else { return nil } + return asciiValue + } +} + extension DSLTree.Node { /// Attempt to generate a consumer from this AST node /// @@ -53,11 +60,50 @@ extension DSLTree._AST.Atom { } } +extension Character { + func generateConsumer( + _ opts: MatchingOptions + ) throws -> MEProgram.ConsumeFunction { + let isCaseInsensitive = opts.isCaseInsensitive + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + let low = bounds.lowerBound + if isCaseInsensitive && isCased { + return input[low].lowercased() == lowercased() + ? input.index(after: low) + : nil + } else { + return input[low] == self + ? 
input.index(after: low) + : nil + } + } + case .unicodeScalar: + // TODO: This should only be reachable from character class emission, can + // we guarantee that? Otherwise we'd want a different matching behavior. + let consumers = unicodeScalars.map { s in consumeScalar { + isCaseInsensitive + ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping + : $0 == s + }} + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx + } + } + return nil + } + } + } +} + extension DSLTree.Atom { var singleScalarASCIIValue: UInt8? { switch self { - case let .char(c) where c != "\r\n": - return c.asciiValue + case let .char(c): + return c._singleScalarAsciiValue case let .scalar(s) where s.isASCII: return UInt8(ascii: s) case let .unconverted(atom): @@ -72,44 +118,15 @@ extension DSLTree.Atom { func generateConsumer( _ opts: MatchingOptions ) throws -> MEProgram.ConsumeFunction? { - let isCaseInsensitive = opts.isCaseInsensitive - switch self { case let .char(c): - if opts.semanticLevel == .graphemeCluster { - return { input, bounds in - let low = bounds.lowerBound - if isCaseInsensitive && c.isCased { - return input[low].lowercased() == c.lowercased() - ? input.index(after: low) - : nil - } else { - return input[low] == c - ? input.index(after: low) - : nil - } - } - } else { - let consumers = c.unicodeScalars.map { s in consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - }} - return { input, bounds in - for fn in consumers { - if let idx = fn(input, bounds) { - return idx - } - } - return nil - } - } + return try c.generateConsumer(opts) + case let .scalar(s): - return consumeScalar { - isCaseInsensitive - ? $0.properties.lowercaseMapping == s.properties.lowercaseMapping - : $0 == s - } + // A scalar always matches the same as a single scalar character. 
This + // means it must match a whole grapheme in grapheme semantic mode, but + // can match a single scalar in scalar semantic mode. + return try Character(s).generateConsumer(opts) case .any: // FIXME: Should this be a total ordering? @@ -123,6 +140,25 @@ extension DSLTree.Atom { } } + case .anyNonNewline: + switch opts.semanticLevel { + case .graphemeCluster: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.index(after: bounds.lowerBound) + } + case .unicodeScalar: + return { input, bounds in + input[bounds.lowerBound].isNewline + ? nil + : input.unicodeScalars.index(after: bounds.lowerBound) + } + } + + case .dot: + throw Unreachable(".atom(.dot) should be handled by emitDot") + case .assertion: // TODO: We could handle, should this be total? return nil @@ -211,16 +247,20 @@ extension AST.Atom { var singleScalar: UnicodeScalar? { switch kind { case .scalar(let s): return s.value + case .escaped(let e): + guard let s = e.scalarValue else { return nil } + return s default: return nil } } var singleScalarASCIIValue: UInt8? { + if let s = singleScalar, s.isASCII { + return UInt8(ascii: s) + } switch kind { - case let .char(c) where c != "\r\n": - return c.asciiValue - case let .scalar(s) where s.value.isASCII: - return UInt8(ascii: s.value) + case let .char(c): + return c._singleScalarAsciiValue default: return nil } @@ -264,12 +304,12 @@ extension AST.Atom { case let .namedCharacter(name): return consumeName(name, opts: opts) - case .any: + case .dot: assertionFailure( "Should have been handled by tree conversion") - fatalError(".atom(.any) is handled in emitAny") + fatalError(".atom(.dot) is handled in emitDot") - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: // handled in emitAssertion return nil @@ -287,24 +327,25 @@ extension DSLTree.CustomCharacterClass.Member { _ opts: MatchingOptions, _ isInverted: Bool ) -> DSLTree.CustomCharacterClass.AsciiBitset? 
{ + typealias Bitset = DSLTree.CustomCharacterClass.AsciiBitset switch self { case let .atom(a): if let val = a.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - val, - isInverted, - opts.isCaseInsensitive - ) + return Bitset(val, isInverted, opts.isCaseInsensitive) } case let .range(low, high): - if let lowVal = low.singleScalarASCIIValue, let highVal = high.singleScalarASCIIValue { - return DSLTree.CustomCharacterClass.AsciiBitset( - low: lowVal, - high: highVal, - isInverted: isInverted, - isCaseInsensitive: opts.isCaseInsensitive - ) + if let lowVal = low.singleScalarASCIIValue, + let highVal = high.singleScalarASCIIValue { + return Bitset(low: lowVal, high: highVal, isInverted: isInverted, + isCaseInsensitive: opts.isCaseInsensitive) } + case .quotedLiteral(let str): + var bitset = Bitset(isInverted: isInverted) + for c in str { + guard let ascii = c._singleScalarAsciiValue else { return nil } + bitset = bitset.union(Bitset(ascii, isInverted, opts.isCaseInsensitive)) + } + return bitset default: return nil } @@ -321,38 +362,68 @@ extension DSLTree.CustomCharacterClass.Member { } return c case let .range(low, high): - // TODO: - guard let lhs = low.literalCharacterValue else { + guard let lhsChar = low.literalCharacterValue else { throw Unsupported("\(low) in range") } - guard let rhs = high.literalCharacterValue else { + guard let rhsChar = high.literalCharacterValue else { throw Unsupported("\(high) in range") } - if opts.isCaseInsensitive { - let lhsLower = lhs.lowercased() - let rhsLower = rhs.lowercased() - guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) { - // TODO: semantic level - return input.index(after: curIdx) - } - return nil + // We must have NFC single scalar bounds. 
+ guard let lhs = lhsChar.singleScalar, lhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(lhsChar) + } + guard let rhs = rhsChar.singleScalar, rhs.isNFC else { + throw RegexCompilationError.invalidCharacterClassRangeOperand(rhsChar) + } + guard lhs <= rhs else { + throw Unsupported("Invalid range \(low)-\(high)") + } + + let isCaseInsensitive = opts.isCaseInsensitive + let isCharacterSemantic = opts.semanticLevel == .graphemeCluster + + return { input, bounds in + let curIdx = bounds.lowerBound + let nextIndex = isCharacterSemantic + ? input.index(after: curIdx) + : input.unicodeScalars.index(after: curIdx) + + // Under grapheme semantics, we compare based on single NFC scalars. If + // such a character is not single scalar under NFC, the match fails. In + // scalar semantics, we compare the exact scalar value to the NFC + // bounds. + let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar + : input.unicodeScalars[curIdx] + guard let scalar = scalar else { return nil } + let scalarRange = lhs ... rhs + if scalarRange.contains(scalar) { + return nextIndex } - } else { - guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") } - return { input, bounds in - // TODO: check for out of bounds? - let curIdx = bounds.lowerBound - if (lhs...rhs).contains(input[curIdx]) { - // TODO: semantic level - return input.index(after: curIdx) + + // Check for case insensitive matches. + func matchesCased( + _ cased: (UnicodeScalar.Properties) -> String + ) -> Bool { + let casedStr = cased(scalar.properties) + // In character semantic mode, we need to map to NFC. In scalar + // semantics, we should have an exact scalar. + let mapped = isCharacterSemantic ? 
casedStr.singleNFCScalar + : casedStr.singleScalar + guard let mapped = mapped else { return false } + return scalarRange.contains(mapped) + } + if isCaseInsensitive { + if scalar.properties.changesWhenLowercased, + matchesCased(\.lowercaseMapping) { + return nextIndex + } + if scalar.properties.changesWhenUppercased, + matchesCased(\.uppercaseMapping) { + return nextIndex } - return nil } + return nil } case let .custom(ccc): @@ -394,21 +465,17 @@ extension DSLTree.CustomCharacterClass.Member { } return rhs(input, bounds) } - case .quotedLiteral(let s): - if opts.isCaseInsensitive { - return { input, bounds in - guard s.lowercased()._contains(input[bounds.lowerBound].lowercased()) else { - return nil - } - return input.index(after: bounds.lowerBound) - } - } else { - return { input, bounds in - guard s.contains(input[bounds.lowerBound]) else { - return nil + case .quotedLiteral(let str): + let consumers = try str.map { + try $0.generateConsumer(opts) + } + return { input, bounds in + for fn in consumers { + if let idx = fn(input, bounds) { + return idx } - return input.index(after: bounds.lowerBound) } + return nil } case .trivia: // TODO: Should probably strip this earlier... diff --git a/Sources/_StringProcessing/Engine/Backtracking.swift b/Sources/_StringProcessing/Engine/Backtracking.swift index 8fcdf9312..355702ac1 100644 --- a/Sources/_StringProcessing/Engine/Backtracking.swift +++ b/Sources/_StringProcessing/Engine/Backtracking.swift @@ -32,15 +32,18 @@ extension Processor { // The int registers store values that can be relevant to // backtracking, such as the number of trips in a quantification. 
var intRegisters: [Int] + // Same with position registers + var posRegisters: [Input.Index] var destructure: ( pc: InstructionAddress, pos: Position?, stackEnd: CallStackAddress, captureEnds: [_StoredCapture], - intRegisters: [Int] + intRegisters: [Int], + PositionRegister: [Input.Index] ) { - (pc, pos, stackEnd, captureEnds, intRegisters) + (pc, pos, stackEnd, captureEnds, intRegisters, posRegisters) } } @@ -53,7 +56,8 @@ extension Processor { pos: addressOnly ? nil : currentPosition, stackEnd: .init(callStack.count), captureEnds: storedCaptures, - intRegisters: registers.ints) + intRegisters: registers.ints, + posRegisters: registers.positions) } } diff --git a/Sources/_StringProcessing/Engine/InstPayload.swift b/Sources/_StringProcessing/Engine/InstPayload.swift index c614e10fd..42fb86913 100644 --- a/Sources/_StringProcessing/Engine/InstPayload.swift +++ b/Sources/_StringProcessing/Engine/InstPayload.swift @@ -147,6 +147,26 @@ extension Instruction.Payload { var string: StringRegister { interpret() } + + init(scalar: Unicode.Scalar) { + self.init(UInt64(scalar.value)) + } + var scalar: Unicode.Scalar { + return Unicode.Scalar(_value: UInt32(self.rawValue)) + } + + init(scalar: Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) { + let raw = UInt64(scalar.value) + + (caseInsensitive ? 1 << 55: 0) + + (boundaryCheck ? 
1 << 54 : 0) + self.init(raw) + } + var scalarPayload: (Unicode.Scalar, caseInsensitive: Bool, boundaryCheck: Bool) { + let caseInsensitive = (self.rawValue >> 55) & 1 == 1 + let boundaryCheck = (self.rawValue >> 54) & 1 == 1 + let scalar = Unicode.Scalar(_value: UInt32(self.rawValue & 0xFFFF_FFFF)) + return (scalar, caseInsensitive: caseInsensitive, boundaryCheck: boundaryCheck) + } init(sequence: SequenceRegister) { self.init(sequence) @@ -190,18 +210,20 @@ extension Instruction.Payload { interpret() } - init(element: ElementRegister) { - self.init(element) + init(element: ElementRegister, isCaseInsensitive: Bool) { + self.init(isCaseInsensitive ? 1 : 0, element) } - var element: ElementRegister { - interpret() + var elementPayload: (isCaseInsensitive: Bool, ElementRegister) { + let pair: (UInt64, ElementRegister) = interpretPair() + return (isCaseInsensitive: pair.0 == 1, pair.1) } - init(bitset: AsciiBitsetRegister) { - self.init(bitset) + init(bitset: AsciiBitsetRegister, isScalar: Bool) { + self.init(isScalar ? 
1 : 0, bitset) } - var bitset: AsciiBitsetRegister { - interpret() + var bitsetPayload: (isScalar: Bool, AsciiBitsetRegister) { + let pair: (UInt64, AsciiBitsetRegister) = interpretPair() + return (isScalar: pair.0 == 1, pair.1) } init(consumer: ConsumeFunctionRegister) { @@ -284,10 +306,10 @@ extension Instruction.Payload { interpretPair() } - init(pos: PositionRegister, pos2: PositionRegister) { - self.init(pos, pos2) + init(addr: InstructionAddress, position: PositionRegister) { + self.init(addr, position) } - var pairedPosPos: (PositionRegister, PositionRegister) { + var pairedAddrPos: (InstructionAddress, PositionRegister) { interpretPair() } diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 4e715ad9d..8e1a1f294 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -37,6 +37,14 @@ extension Instruction { /// case moveImmediate + /// Move the current position into a register + /// + /// moveCurrentPosition(into: PositionRegister) + /// + /// Operands: + /// - Position register to move into + case moveCurrentPosition + // MARK: General Purpose: Control flow /// Branch to a new instruction @@ -57,6 +65,16 @@ extension Instruction { /// case condBranchZeroElseDecrement + /// Conditionally branch if the current position is the same as the register + /// + /// condBranch( + /// to: InstAddr, ifSamePositionAs: PositionRegister) + /// + /// Operands: + /// - Instruction address to branch to, if the position in the register is the same as currentPosition + /// - Position register to check against + case condBranchSamePosition + // TODO: Function calls // MARK: - Matching @@ -72,20 +90,27 @@ extension Instruction { /// Composite assert-advance else restore. /// - /// match(_: EltReg) + /// match(_: EltReg, isCaseInsensitive: Bool) /// - /// Operand: Element register to compare against. 
+ /// Operands: + /// - Element register to compare against. + /// - Boolean for if we should match in a case insensitive way case match - /// Match against a sequence of elements + /// Match against a scalar and possibly perform a boundary check or match in a case insensitive way /// - /// matchSequence(_: SeqReg) + /// matchScalar(_: Unicode.Scalar, isCaseInsensitive: Bool, boundaryCheck: Bool) /// - /// Operand: Sequence register to compare against. - case matchSequence + /// Operands: Scalar value to match against and booleans + case matchScalar - /// Match against a set of valid ascii values stored in a bitset - /// Operand: Ascii bitset register containing the bitset + /// Match a character or a scalar against a set of valid ascii values stored in a bitset + /// + /// matchBitset(_: AsciiBitsetRegister, isScalar: Bool) + /// + /// Operand: + /// - Ascii bitset register containing the bitset + /// - Boolean for if we should match by scalar value case matchBitset /// TODO: builtin assertions and anchors @@ -306,7 +331,7 @@ extension Instruction { var elementRegister: ElementRegister? { switch opcode { case .match: - return payload.element + return payload.elementPayload.1 default: return nil } } diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 676b21473..0b9a91726 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -32,6 +32,7 @@ extension MEProgram { var nextIntRegister = IntRegister(0) var nextCaptureRegister = CaptureRegister(0) var nextValueRegister = ValueRegister(0) + var nextPositionRegister = PositionRegister(0) // Special addresses or instructions var failAddressToken: AddressToken? 
= nil @@ -105,6 +106,14 @@ extension MEProgram.Builder { fixup(to: t) } + mutating func buildCondBranch( + to t: AddressToken, + ifSamePositionAs r: PositionRegister + ) { + instructions.append(.init(.condBranchSamePosition, .init(position: r))) + fixup(to: t) + } + mutating func buildSave(_ t: AddressToken) { instructions.append(.init(.save)) fixup(to: t) @@ -135,24 +144,32 @@ extension MEProgram.Builder { instructions.append(.init(.advance, .init(distance: n))) } - mutating func buildMatch(_ e: Character) { + mutating func buildMatch(_ e: Character, isCaseInsensitive: Bool) { instructions.append(.init( - .match, .init(element: elements.store(e)))) + .match, .init(element: elements.store(e), isCaseInsensitive: isCaseInsensitive))) } - mutating func buildMatchSequence( - _ s: S - ) where S.Element == Character { - instructions.append(.init( - .matchSequence, - .init(sequence: sequences.store(.init(s))))) + mutating func buildMatchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) { + instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: false, boundaryCheck: boundaryCheck))) + } + + mutating func buildMatchScalarCaseInsensitive(_ s: Unicode.Scalar, boundaryCheck: Bool) { + instructions.append(.init(.matchScalar, .init(scalar: s, caseInsensitive: true, boundaryCheck: boundaryCheck))) } + mutating func buildMatchAsciiBitset( _ b: DSLTree.CustomCharacterClass.AsciiBitset ) { instructions.append(.init( - .matchBitset, .init(bitset: makeAsciiBitset(b)))) + .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: false))) + } + + mutating func buildScalarMatchAsciiBitset( + _ b: DSLTree.CustomCharacterClass.AsciiBitset + ) { + instructions.append(.init( + .matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: true))) } mutating func buildConsume( @@ -211,6 +228,10 @@ extension MEProgram.Builder { .init(value: value, capture: capture))) } + mutating func buildMoveCurrentPosition(into r: PositionRegister) { + 
instructions.append(.init(.moveCurrentPosition, .init(position: r))) + } + mutating func buildBackreference( _ cap: CaptureRegister ) { @@ -257,7 +278,8 @@ extension MEProgram.Builder { switch inst.opcode { case .condBranchZeroElseDecrement: payload = .init(addr: addr, int: inst.payload.int) - + case .condBranchSamePosition: + payload = .init(addr: addr, position: inst.payload.position) case .branch, .save, .saveAddress, .clearThrough: payload = .init(addr: addr) @@ -281,6 +303,7 @@ extension MEProgram.Builder { regInfo.sequences = sequences.count regInfo.ints = nextIntRegister.rawValue regInfo.values = nextValueRegister.rawValue + regInfo.positions = nextPositionRegister.rawValue regInfo.bitsets = asciiBitsets.count regInfo.consumeFunctions = consumeFunctions.count regInfo.assertionFunctions = assertionFunctions.count @@ -421,6 +444,12 @@ extension MEProgram.Builder { return r } + mutating func makePositionRegister() -> PositionRegister { + let r = nextPositionRegister + defer { nextPositionRegister.rawValue += 1 } + return r + } + // TODO: A register-mapping helper struct, which could release // registers without monotonicity required diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index f7b3a65a2..2be918294 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -219,6 +219,15 @@ extension Processor { return true } + mutating func matchCaseInsensitive(_ e: Element) -> Bool { + guard let cur = load(), cur.lowercased() == e.lowercased() else { + signalFailure() + return false + } + _uncheckedForcedConsumeOne() + return true + } + // Match against the current input prefix. Returns whether // it succeeded vs signaling an error. mutating func matchSeq( @@ -230,6 +239,44 @@ extension Processor { return true } + func loadScalar() -> Unicode.Scalar? { + currentPosition < end ? 
input.unicodeScalars[currentPosition] : nil + } + + mutating func matchScalar(_ s: Unicode.Scalar, boundaryCheck: Bool) -> Bool { + guard s == loadScalar(), + let idx = input.unicodeScalars.index( + currentPosition, + offsetBy: 1, + limitedBy: end), + (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) + else { + signalFailure() + return false + } + currentPosition = idx + return true + } + + mutating func matchScalarCaseInsensitive( + _ s: Unicode.Scalar, + boundaryCheck: Bool + ) -> Bool { + guard let curScalar = loadScalar(), + s.properties.lowercaseMapping == curScalar.properties.lowercaseMapping, + let idx = input.unicodeScalars.index( + currentPosition, + offsetBy: 1, + limitedBy: end), + (!boundaryCheck || input.isOnGraphemeClusterBoundary(idx)) + else { + signalFailure() + return false + } + currentPosition = idx + return true + } + // If we have a bitset we know that the CharacterClass only matches against // ascii characters, so check if the current input element is ascii then // check if it is set in the bitset @@ -244,8 +291,22 @@ extension Processor { return true } + // Equivalent of matchBitset but emitted when in unicode scalar semantic mode + mutating func matchBitsetScalar( + _ bitset: DSLTree.CustomCharacterClass.AsciiBitset + ) -> Bool { + guard let curScalar = loadScalar(), + bitset.matches(scalar: curScalar), + let idx = input.unicodeScalars.index(currentPosition, offsetBy: 1, limitedBy: end) else { + signalFailure() + return false + } + currentPosition = idx + return true + } + mutating func signalFailure() { - guard let (pc, pos, stackEnd, capEnds, intRegisters) = + guard let (pc, pos, stackEnd, capEnds, intRegisters, posRegisters) = savePoints.popLast()?.destructure else { state = .fail @@ -259,6 +320,7 @@ extension Processor { callStack.removeLast(callStack.count - stackEnd.rawValue) storedCaptures = capEnds registers.ints = intRegisters + registers.positions = posRegisters } mutating func abort(_ e: Error? 
= nil) { @@ -315,7 +377,10 @@ extension Processor { registers[reg] = int controller.step() - + case .moveCurrentPosition: + let reg = payload.position + registers[reg] = currentPosition + controller.step() case .branch: controller.pc = payload.addr @@ -327,7 +392,13 @@ extension Processor { registers[int] -= 1 controller.step() } - + case .condBranchSamePosition: + let (addr, pos) = payload.pairedAddrPos + if registers[pos] == currentPosition { + controller.pc = addr + } else { + controller.step() + } case .save: let resumeAddr = payload.addr let sp = makeSavePoint(resumeAddr) @@ -369,23 +440,40 @@ extension Processor { } case .match: - let reg = payload.element - if match(registers[reg]) { - controller.step() + let (isCaseInsensitive, reg) = payload.elementPayload + if isCaseInsensitive { + if matchCaseInsensitive(registers[reg]) { + controller.step() + } + } else { + if match(registers[reg]) { + controller.step() + } } - case .matchSequence: - let reg = payload.sequence - let seq = registers[reg] - if matchSeq(seq) { - controller.step() + case .matchScalar: + let (scalar, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if matchScalarCaseInsensitive(scalar, boundaryCheck: boundaryCheck) { + controller.step() + } + } else { + if matchScalar(scalar, boundaryCheck: boundaryCheck) { + controller.step() + } } case .matchBitset: - let reg = payload.bitset + let (isScalar, reg) = payload.bitsetPayload let bitset = registers[reg] - if matchBitset(bitset) { - controller.step() + if isScalar { + if matchBitsetScalar(bitset) { + controller.step() + } + } else { + if matchBitset(bitset) { + controller.step() + } } case .consumeBy: diff --git a/Sources/_StringProcessing/Engine/Registers.swift b/Sources/_StringProcessing/Engine/Registers.swift index c76413383..e5d33af8b 100644 --- a/Sources/_StringProcessing/Engine/Registers.swift +++ b/Sources/_StringProcessing/Engine/Registers.swift @@ -47,6 +47,8 @@ extension Processor { var ints: [Int] var 
values: [Any] + + var positions: [Input.Index] } } @@ -66,6 +68,12 @@ extension Processor.Registers { values[i.rawValue] = newValue } } + subscript(_ i: PositionRegister) -> Input.Index { + get { positions[i.rawValue] } + set { + positions[i.rawValue] = newValue + } + } subscript(_ i: ElementRegister) -> Input.Element { elements[i.rawValue] } @@ -89,6 +97,8 @@ extension Processor.Registers { } extension Processor.Registers { + static let sentinelIndex = "".startIndex + init( _ program: MEProgram, _ sentinel: String.Index @@ -120,11 +130,15 @@ extension Processor.Registers { self.values = Array( repeating: SentinelValue(), count: info.values) + self.positions = Array( + repeating: Processor.Registers.sentinelIndex, + count: info.positions) } mutating func reset(sentinel: Input.Index) { self.ints._setAll(to: 0) self.values._setAll(to: SentinelValue()) + self.positions._setAll(to: Processor.Registers.sentinelIndex) } } diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 4237eda33..c1753c49d 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -70,16 +70,9 @@ extension PrettyPrinter { for namedCapture in namedCaptures { print("let \(namedCapture) = Reference(Substring.self)") } - - switch node { - case .concatenation(_): - printAsPattern(convertedFromAST: node) - case .convertedRegexLiteral(.concatenation(_), _): - printAsPattern(convertedFromAST: node) - default: - printBlock("Regex") { printer in - printer.printAsPattern(convertedFromAST: node) - } + + printBlock("Regex") { printer in + printer.printAsPattern(convertedFromAST: node, isTopLevel: true) } } @@ -89,7 +82,7 @@ extension PrettyPrinter { // to have a non-backing-off pretty-printer that this // can defer to. 
private mutating func printAsPattern( - convertedFromAST node: DSLTree.Node + convertedFromAST node: DSLTree.Node, isTopLevel: Bool = false ) { if patternBackoff(DSLTree._Tree(node)) { printBackoff(node) @@ -106,11 +99,7 @@ extension PrettyPrinter { } case let .concatenation(c): - printBlock("Regex") { printer in - c.forEach { - printer.printAsPattern(convertedFromAST: $0) - } - } + printConcatenationAsPattern(c, isTopLevel: isTopLevel) case let .nonCapturingGroup(kind, child): switch kind.ast { @@ -263,7 +252,7 @@ extension PrettyPrinter { // check above, so it should work out. Need a // cleaner way to do this. This means the argument // label is a lie. - printAsPattern(convertedFromAST: n) + printAsPattern(convertedFromAST: n, isTopLevel: isTopLevel) case let .customCharacterClass(ccc): printAsPattern(ccc) @@ -279,6 +268,64 @@ extension PrettyPrinter { print("/* TODO: absent function */") } } + + enum NodeToPrint { + case dslNode(DSLTree.Node) + case stringLiteral(String) + } + + mutating func printAsPattern(_ node: NodeToPrint) { + switch node { + case .dslNode(let n): + printAsPattern(convertedFromAST: n) + case .stringLiteral(let str): + print(str) + } + } + + mutating func printConcatenationAsPattern( + _ nodes: [DSLTree.Node], isTopLevel: Bool + ) { + // We need to coalesce any adjacent character and scalar elements into a + // string literal, preserving scalar syntax. + let nodes = nodes + .map { NodeToPrint.dslNode($0.lookingThroughConvertedLiteral) } + .coalescing( + with: StringLiteralBuilder(), into: { .stringLiteral($0.result) } + ) { literal, node in + guard case .dslNode(let node) = node else { return false } + switch node { + case let .atom(.char(c)): + literal.append(c) + return true + case let .atom(.scalar(s)): + literal.append(unescaped: s._dslBase) + return true + case .quotedLiteral(let q): + literal.append(q) + return true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. 
+ return !literal.isEmpty + default: + return false + } + } + if isTopLevel || nodes.count == 1 { + // If we're at the top level, or we coalesced everything into a single + // element, we don't need to print a surrounding Regex { ... }. + for n in nodes { + printAsPattern(n) + } + return + } + printBlock("Regex") { printer in + for n in nodes { + printer.printAsPattern(n) + } + } + } mutating func printAsPattern( _ ccc: DSLTree.CustomCharacterClass, @@ -315,8 +362,7 @@ extension PrettyPrinter { return } - var charMembers = "" - + var charMembers = StringLiteralBuilder() // This iterates through all of the character class members collecting all // of the members who can be stuffed into a singular '.anyOf(...)' vs. @@ -340,14 +386,9 @@ extension PrettyPrinter { switch a { case let .char(c): charMembers.append(c) - - if c == "\\" { - charMembers.append(c) - } - return false case let .scalar(s): - charMembers += "\\u{\(String(s.value, radix: 16, uppercase: true))}" + charMembers.append(unescaped: s._dslBase) return false case .unconverted(_): return true @@ -356,7 +397,7 @@ extension PrettyPrinter { } case let .quotedLiteral(s): - charMembers += s + charMembers.append(s) return false case .trivia(_): @@ -370,7 +411,7 @@ extension PrettyPrinter { // Also in the same vein, if we have a few atom members but no // nonAtomMembers, then we can emit a single .anyOf(...) for them. 
if !charMembers.isEmpty, nonCharMembers.isEmpty { - let anyOf = ".anyOf(\(charMembers._quoted))" + let anyOf = ".anyOf(\(charMembers))" indent() @@ -393,7 +434,7 @@ extension PrettyPrinter { printer.indent() if !charMembers.isEmpty { - printer.output(".anyOf(\(charMembers._quoted))") + printer.output(".anyOf(\(charMembers))") if nonCharMembers.count > 0 { printer.output(",") @@ -454,9 +495,9 @@ extension PrettyPrinter { case let .scalar(s): if wrap { - output("One(.anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\"))") + output("One(.anyOf(\(s._dslBase._bareQuoted)))") } else { - output(".anyOf(\"\\u{\(String(s.value, radix: 16, uppercase: true))}\")") + output(".anyOf(\(s._dslBase._bareQuoted))") } case let .unconverted(a): @@ -617,13 +658,46 @@ extension PrettyPrinter { } extension String { - // TODO: Escaping? + fileprivate var _escaped: String { + _replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#) + } + fileprivate var _quoted: String { - "\"\(self._replacing(#"\"#, with: #"\\"#)._replacing(#"""#, with: #"\""#))\"" + _escaped._bareQuoted + } + + fileprivate var _bareQuoted: String { + #""\#(self)""# + } +} + +extension UnicodeScalar { + var _dslBase: String { "\\u{\(String(value, radix: 16, uppercase: true))}" } +} + +/// A helper for building string literals, which handles escaping the contents +/// appended. +fileprivate struct StringLiteralBuilder { + private var contents = "" + + var result: String { contents._bareQuoted } + var isEmpty: Bool { contents.isEmpty } + + mutating func append(_ str: String) { + contents += str._escaped + } + mutating func append(_ c: Character) { + contents += String(c)._escaped } + mutating func append(unescaped str: String) { + contents += str + } +} +extension StringLiteralBuilder: CustomStringConvertible { + var description: String { result } } -extension AST.Atom.AssertionKind { +extension DSLTree.Atom.Assertion { // TODO: Some way to integrate this with conversion... 
var _patternBase: String { switch self { @@ -631,6 +705,12 @@ extension AST.Atom.AssertionKind { return "Anchor.startOfLine" case .endOfLine: return "Anchor.endOfLine" + case .caretAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/^/" + case .dollarAnchor: + // The DSL doesn't have an equivalent to this, so print as regex. + return "/$/" case .wordBoundary: return "Anchor.wordBoundary" case .notWordBoundary: @@ -809,7 +889,7 @@ extension AST.Atom { /// /// TODO: Some way to integrate this with conversion... var _patternBase: (String, canBeWrapped: Bool) { - if let anchor = self.assertionKind { + if let anchor = self.dslAssertionKind { return (anchor._patternBase, false) } @@ -821,19 +901,15 @@ extension AST.Atom { } var _dslBase: (String, canBeWrapped: Bool) { - func scalarLiteral(_ s: UnicodeScalar) -> String { - let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}" - } switch kind { case let .char(c): return (String(c), false) case let .scalar(s): - return (scalarLiteral(s.value), false) + return (s.value._dslBase, false) case let .scalarSequence(seq): - return (seq.scalarValues.map(scalarLiteral).joined(), false) + return (seq.scalarValues.map(\._dslBase).joined(), false) case let .property(p): return (p._dslBase, true) @@ -895,10 +971,11 @@ extension AST.Atom { case .namedCharacter: return (" /* TODO: named character */", false) - case .any: - return (".any", true) + case .dot: + // The DSL does not have an equivalent to '.', print as a regex. + return ("/./", false) - case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: @@ -950,10 +1027,10 @@ extension AST.Atom { case .namedCharacter(let n): return "\\N{\(n)}" - case .any: + case .dot: return "." 
- case .startOfLine, .endOfLine: + case .caretAnchor, .dollarAnchor: fatalError("unreachable") case .backreference: @@ -1101,14 +1178,21 @@ extension DSLTree.Atom { switch self { case .any: return (".any", true) + + case .anyNonNewline: + return (".anyNonNewline", true) + + case .dot: + // The DSL does not have an equivalent to '.', print as a regex. + return ("/./", false) case let .char(c): return (String(c)._quoted, false) case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return ("\\u{\(hex)}"._quoted, false) - + return ("\\u{\(hex)}"._bareQuoted, false) + case let .unconverted(a): if a.ast.isUnprintableAtom { return ("#/\(a.ast._regexBase)/#", false) @@ -1117,7 +1201,7 @@ extension DSLTree.Atom { } case .assertion(let a): - return (a.ast._patternBase, false) + return (a._patternBase, false) case .backreference(_): return ("/* TOOD: backreferences */", false) @@ -1142,6 +1226,12 @@ extension DSLTree.Atom { var _regexBase: String { switch self { case .any: + return "(?s:.)" + + case .anyNonNewline: + return "(?-s:.)" + + case .dot: return "." case let .char(c): @@ -1149,7 +1239,7 @@ extension DSLTree.Atom { case let .scalar(s): let hex = String(s.value, radix: 16, uppercase: true) - return "\\u{\(hex)}"._quoted + return "\\u{\(hex)}"._bareQuoted case let .unconverted(a): return a.ast._regexBase diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 320d10897..4eb7bc42c 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -43,61 +43,7 @@ extension AST.Node { return .orderedChoice(children) case let .concatenation(v): - // Coalesce adjacent children who can produce a - // string literal representation - let astChildren = v.children - func coalesce( - _ idx: Array.Index - ) -> (Array.Index, String)? 
{ - var result = "" - var idx = idx - while idx < astChildren.endIndex { - guard let atom: AST.Atom = astChildren[idx].as() else { break } - - // TODO: For printing, nice to coalesce - // scalars literals too. We likely need a different - // approach even before we have a better IR. - if let char = atom.singleCharacter { - result.append(char) - } else if let scalar = atom.singleScalar { - result.append(Character(scalar)) - } else if case .scalarSequence(let seq) = atom.kind { - result += seq.scalarValues.map(Character.init) - } else { - break - } - - astChildren.formIndex(after: &idx) - } - return result.isEmpty ? nil : (idx, result) - } - - // No need to nest single children concatenations - if astChildren.count == 1 { - return astChildren.first!.dslTreeNode - } - - // Check for a single child post-coalescing - if let (idx, str) = coalesce(astChildren.startIndex), - idx == astChildren.endIndex - { - return .quotedLiteral(str) - } - - // Coalesce adjacent string children - var curIdx = astChildren.startIndex - var children = Array() - while curIdx < astChildren.endIndex { - if let (nextIdx, str) = coalesce(curIdx) { - // TODO: Track source info... - children.append(.quotedLiteral(str)) - curIdx = nextIdx - } else { - children.append(astChildren[curIdx].dslTreeNode) - astChildren.formIndex(after: &curIdx) - } - } - return .concatenation(children) + return .concatenation(v.children.map(\.dslTreeNode)) case let .group(v): let child = v.child.dslTreeNode @@ -135,10 +81,9 @@ extension AST.Node { case let .atom(v): switch v.kind { case .scalarSequence(let seq): - // Scalar sequences are splatted into concatenated scalars, which - // becomes a quoted literal. Sequences nested in concatenations have - // already been coalesced, this just handles the lone atom case. - return .quotedLiteral(String(seq.scalarValues.map(Character.init))) + // The DSL doesn't have an equivalent node for scalar sequences. Splat + // them into a concatenation of scalars. 
+ return .concatenation(seq.scalarValues.map { .atom(.scalar($0)) }) default: return .atom(v.dslTreeAtom) } @@ -208,16 +153,44 @@ extension AST.CustomCharacterClass { } } +extension AST.Atom.EscapedBuiltin { + var dslAssertionKind: DSLTree.Atom.Assertion? { + switch self { + case .wordBoundary: return .wordBoundary + case .notWordBoundary: return .notWordBoundary + case .startOfSubject: return .startOfSubject + case .endOfSubject: return .endOfSubject + case .textSegment: return .textSegment + case .notTextSegment: return .notTextSegment + case .endOfSubjectBeforeNewline: return .endOfSubjectBeforeNewline + case .firstMatchingPositionInSubject: return .firstMatchingPositionInSubject + case .resetStartOfMatch: return .resetStartOfMatch + default: return nil + } + } +} + +extension AST.Atom { + var dslAssertionKind: DSLTree.Atom.Assertion? { + switch kind { + case .caretAnchor: return .caretAnchor + case .dollarAnchor: return .dollarAnchor + case .escaped(let b): return b.dslAssertionKind + default: return nil + } + } +} + extension AST.Atom { var dslTreeAtom: DSLTree.Atom { - if let kind = assertionKind { - return .assertion(.init(ast: kind)) + if let kind = dslAssertionKind { + return .assertion(kind) } switch self.kind { case let .char(c): return .char(c) - case let .scalar(s): return .char(Character(s.value)) - case .any: return .any + case let .scalar(s): return .scalar(s.value) + case .dot: return .dot case let .backreference(r): return .backreference(.init(ast: r)) case let .changeMatchingOptions(seq): return .changeMatchingOptions(.init(ast: seq)) diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 740bdcb8d..520f4991a 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -117,11 +117,11 @@ extension DSLTree { var members: [Member] var isInverted: Bool - var containsAny: Bool { + var containsDot: Bool { members.contains { member in switch 
member { - case .atom(.any): return true - case .custom(let ccc): return ccc.containsAny + case .atom(.dot): return true + case .custom(let ccc): return ccc.containsDot default: return false } @@ -159,95 +159,25 @@ extension DSLTree { indirect case subtraction(CustomCharacterClass, CustomCharacterClass) indirect case symmetricDifference(CustomCharacterClass, CustomCharacterClass) } - - internal struct AsciiBitset { - let isInverted: Bool - var a: UInt64 = 0 - var b: UInt64 = 0 - - init(isInverted: Bool) { - self.isInverted = isInverted - } - - init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { - self.isInverted = isInverted - add(val, isCaseInsensitive) - } - - init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { - self.isInverted = isInverted - for val in low...high { - add(val, isCaseInsensitive) - } - } - - internal init( - a: UInt64, - b: UInt64, - isInverted: Bool - ) { - self.isInverted = isInverted - self.a = a - self.b = b - } - - internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { - setBit(val) - if isCaseInsensitive { - switch val { - case 64...90: setBit(val + 32) - case 97...122: setBit(val - 32) - default: break - } - } - } - - internal mutating func setBit(_ val: UInt8) { - if val < 64 { - a = a | 1 << val - } else { - b = b | 1 << (val - 64) - } - } - - internal func matches(char: Character) -> Bool { - let ret: Bool - if let val = char.asciiValue { - if val < 64 { - ret = (a >> val) & 1 == 1 - } else { - ret = (b >> (val - 64)) & 1 == 1 - } - } else { - ret = false - } - - if isInverted { - return !ret - } - - return ret - } - - /// Joins another bitset from a Member of the same CustomCharacterClass - internal func union(_ other: AsciiBitset) -> AsciiBitset { - precondition(self.isInverted == other.isInverted) - return AsciiBitset( - a: self.a | other.a, - b: self.b | other.b, - isInverted: self.isInverted - ) - } - } } @_spi(RegexBuilder) public enum Atom { case char(Character) case 
scalar(Unicode.Scalar) + + /// Any character, including newlines. case any - case assertion(_AST.AssertionKind) + /// Any character, excluding newlines. This differs from '.', as it is not + /// affected by single line mode. + case anyNonNewline + + /// The DSL representation of '.' in a regex literal. This does not match + /// newlines unless single line mode is enabled. + case dot + + case assertion(Assertion) case backreference(_AST.Reference) case symbolicReference(ReferenceID) @@ -257,6 +187,52 @@ extension DSLTree { } } +extension DSLTree.Atom { + @_spi(RegexBuilder) + public enum Assertion: Hashable { + /// \A + case startOfSubject + + /// \Z + case endOfSubjectBeforeNewline + + /// \z + case endOfSubject + + /// \K + case resetStartOfMatch + + /// \G + case firstMatchingPositionInSubject + + /// \y + case textSegment + + /// \Y + case notTextSegment + + /// The DSL's Anchor.startOfLine, which matches the start of a line + /// even if `anchorsMatchNewlines` is false. + case startOfLine + + /// The DSL's Anchor.endOfLine, which matches the end of a line + /// even if `anchorsMatchNewlines` is false. + case endOfLine + + /// ^ + case caretAnchor + + /// $ + case dollarAnchor + + /// \b (from outside a custom character class) + case wordBoundary + + /// \B + case notWordBoundary + } +} + extension Unicode.GeneralCategory { var extendedGeneralCategory: Unicode.ExtendedGeneralCategory? { switch self { @@ -358,6 +334,14 @@ extension DSLTree.Node { default: return nil } } + + /// If this node is for a converted literal, look through it. 
+ var lookingThroughConvertedLiteral: Self { + switch self { + case let .convertedRegexLiteral(n, _): return n + default: return self + } + } } extension DSLTree.Atom { @@ -773,40 +757,6 @@ extension DSLTree { internal var ast: AST.AbsentFunction } - @_spi(RegexBuilder) - public struct AssertionKind { - internal var ast: AST.Atom.AssertionKind - - public static func startOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .startOfSubject) - } - public static func endOfSubjectBeforeNewline(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubjectBeforeNewline) - } - public static func endOfSubject(_ inverted: Bool = false) -> Self { - .init(ast: .endOfSubject) - } - public static func firstMatchingPositionInSubject(_ inverted: Bool = false) -> Self { - .init(ast: .firstMatchingPositionInSubject) - } - public static func textSegmentBoundary(_ inverted: Bool = false) -> Self { - inverted - ? .init(ast: .notTextSegment) - : .init(ast: .textSegment) - } - public static func startOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .startOfLine) - } - public static func endOfLine(_ inverted: Bool = false) -> Self { - .init(ast: .endOfLine) - } - public static func wordBoundary(_ inverted: Bool = false) -> Self { - inverted - ? .init(ast: .notWordBoundary) - : .init(ast: .wordBoundary) - } - } - @_spi(RegexBuilder) public struct Reference { internal var ast: AST.Reference @@ -820,6 +770,31 @@ extension DSLTree { @_spi(RegexBuilder) public struct Atom { internal var ast: AST.Atom + + // FIXME: The below APIs should be removed once the DSL tree has been + // migrated to use proper DSL atoms for them. 
+ + public static var _anyGrapheme: Self { + .init(ast: .init(.escaped(.graphemeCluster), .fake)) + } + public static var _whitespace: Self { + .init(ast: .init(.escaped(.whitespace), .fake)) + } + public static var _digit: Self { + .init(ast: .init(.escaped(.decimalDigit), .fake)) + } + public static var _horizontalWhitespace: Self { + .init(ast: .init(.escaped(.horizontalWhitespace), .fake)) + } + public static var _newlineSequence: Self { + .init(ast: .init(.escaped(.newlineSequence), .fake)) + } + public static var _verticalWhitespace: Self { + .init(ast: .init(.escaped(.verticalTab), .fake)) + } + public static var _word: Self { + .init(ast: .init(.escaped(.wordCharacter), .fake)) + } } } } @@ -832,7 +807,8 @@ extension DSLTree.Atom { switch self { case .changeMatchingOptions, .assertion: return false - case .char, .scalar, .any, .backreference, .symbolicReference, .unconverted: + case .char, .scalar, .any, .anyNonNewline, .dot, .backreference, + .symbolicReference, .unconverted: return true } } diff --git a/Sources/_StringProcessing/Regex/Options.swift b/Sources/_StringProcessing/Regex/Options.swift index 24d5c422e..88d2dbf5d 100644 --- a/Sources/_StringProcessing/Regex/Options.swift +++ b/Sources/_StringProcessing/Regex/Options.swift @@ -12,7 +12,7 @@ @_implementationOnly import _RegexParser @available(SwiftStdlib 5.7, *) -extension RegexComponent { +extension Regex { /// Returns a regular expression that ignores case when matching. /// /// - Parameter ignoresCase: A Boolean value indicating whether to ignore case. @@ -65,7 +65,7 @@ extension RegexComponent { /// - Parameter wordBoundaryKind: The algorithm to use for determining word boundaries. /// - Returns: The modified regular expression. 
public func wordBoundaryKind(_ wordBoundaryKind: RegexWordBoundaryKind) -> Regex { - wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .unicodeLevel2) + wrapInOption(.unicodeWordBoundaries, addingIf: wordBoundaryKind == .default) } /// Returns a regular expression where the start and end of input @@ -83,8 +83,8 @@ extension RegexComponent { /// /// This method corresponds to applying the `m` option in regex syntax. For /// this behavior in the `RegexBuilder` syntax, see - /// ``Anchor.startOfLine``, ``Anchor.endOfLine``, ``Anchor.startOfInput``, - /// and ``Anchor.endOfInput``. + /// ``Anchor.startOfLine``, ``Anchor.endOfLine``, ``Anchor.startOfSubject``, + /// and ``Anchor.endOfSubject``. /// /// - Parameter matchLineEndings: A Boolean value indicating whether `^` and /// `$` should match the start and end of lines, respectively. @@ -205,7 +205,7 @@ public struct RegexWordBoundaryKind: Hashable { /// that match `/\w\W/` or `/\W\w/`, or between the start or end of the input /// and a `\w` character. Word boundaries therefore depend on the option- /// defined behavior of `\w`. - public static var unicodeLevel1: Self { + public static var simple: Self { .init(base: .unicodeLevel1) } @@ -215,7 +215,7 @@ public struct RegexWordBoundaryKind: Hashable { /// Default word boundaries use a Unicode algorithm that handles some cases /// better than simple word boundaries, such as words with internal /// punctuation, changes in script, and Emoji. - public static var unicodeLevel2: Self { + public static var `default`: Self { .init(base: .unicodeLevel2) } } diff --git a/Sources/_StringProcessing/Unicode/CharacterProps.swift b/Sources/_StringProcessing/Unicode/CharacterProps.swift index 80f6819a6..e0be4e386 100644 --- a/Sources/_StringProcessing/Unicode/CharacterProps.swift +++ b/Sources/_StringProcessing/Unicode/CharacterProps.swift @@ -11,10 +11,3 @@ // TODO - -extension Character { - /// Whether this character is made up of exactly one Unicode scalar value. 
- var hasExactlyOneScalar: Bool { - unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex - } -} diff --git a/Sources/_StringProcessing/Unicode/NFC.swift b/Sources/_StringProcessing/Unicode/NFC.swift new file mode 100644 index 000000000..5c2c4aa48 --- /dev/null +++ b/Sources/_StringProcessing/Unicode/NFC.swift @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +@_spi(_Unicode) +import Swift + +extension UnicodeScalar { + /// Checks whether the scalar is in NFC form. + var isNFC: Bool { Character(self).singleNFCScalar == self } +} + +extension Character { + /// If the given character consists of a single NFC scalar, returns it. If + /// there are multiple NFC scalars, returns `nil`. + var singleNFCScalar: UnicodeScalar? { + // SwiftStdlib is always >= 5.7 for a shipped StringProcessing. + guard #available(SwiftStdlib 5.7, *) else { return nil } + var nfcIter = String(self)._nfc.makeIterator() + guard let scalar = nfcIter.next(), nfcIter.next() == nil else { return nil } + return scalar + } + + /// If the given character contains a single scalar, returns it. If none or + /// multiple scalars are present, returns `nil`. + var singleScalar: UnicodeScalar? { + hasExactlyOneScalar ? unicodeScalars.first! : nil + } +} + +extension String { + /// If the given string consists of a single NFC scalar, returns it. If none + /// or multiple NFC scalars are present, returns `nil`. + var singleNFCScalar: UnicodeScalar? 
{ + guard !isEmpty && index(after: startIndex) == endIndex else { return nil } + return first!.singleNFCScalar + } + + /// If the given string contains a single scalar, returns it. If none or + /// multiple scalars are present, returns `nil`. + var singleScalar: UnicodeScalar? { + let scalars = unicodeScalars + guard !scalars.isEmpty && + scalars.index(after: scalars.startIndex) == scalars.endIndex + else { return nil } + return scalars.first! + } +} diff --git a/Sources/_StringProcessing/Utility/AsciiBitset.swift b/Sources/_StringProcessing/Utility/AsciiBitset.swift new file mode 100644 index 000000000..ad3159820 --- /dev/null +++ b/Sources/_StringProcessing/Utility/AsciiBitset.swift @@ -0,0 +1,99 @@ +extension DSLTree.CustomCharacterClass { + internal struct AsciiBitset { + let isInverted: Bool + var a: UInt64 = 0 + var b: UInt64 = 0 + + init(isInverted: Bool) { + self.isInverted = isInverted + } + + init(_ val: UInt8, _ isInverted: Bool, _ isCaseInsensitive: Bool) { + self.isInverted = isInverted + add(val, isCaseInsensitive) + } + + init(low: UInt8, high: UInt8, isInverted: Bool, isCaseInsensitive: Bool) { + self.isInverted = isInverted + for val in low...high { + add(val, isCaseInsensitive) + } + } + + internal init( + a: UInt64, + b: UInt64, + isInverted: Bool + ) { + self.isInverted = isInverted + self.a = a + self.b = b + } + + internal mutating func add(_ val: UInt8, _ isCaseInsensitive: Bool) { + setBit(val) + if isCaseInsensitive { + switch val { + case 64...90: setBit(val + 32) + case 97...122: setBit(val - 32) + default: break + } + } + } + + internal mutating func setBit(_ val: UInt8) { + if val < 64 { + a = a | 1 << val + } else { + b = b | 1 << (val - 64) + } + } + + private func matches(_ val: UInt8) -> Bool { + if val < 64 { + return (a >> val) & 1 == 1 + } else { + return (b >> (val - 64)) & 1 == 1 + } + } + + internal func matches(char: Character) -> Bool { + let matched: Bool + if let val = char._singleScalarAsciiValue { + matched = 
matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + internal func matches(scalar: Unicode.Scalar) -> Bool { + let matched: Bool + if scalar.isASCII { + let val = UInt8(ascii: scalar) + matched = matches(val) + } else { + matched = false + } + + if isInverted { + return !matched + } + return matched + } + + /// Joins another bitset from a Member of the same CustomCharacterClass + internal func union(_ other: AsciiBitset) -> AsciiBitset { + precondition(self.isInverted == other.isInverted) + return AsciiBitset( + a: self.a | other.a, + b: self.b | other.b, + isInverted: self.isInverted + ) + } + } +} diff --git a/Sources/_StringProcessing/Utility/Misc.swift b/Sources/_StringProcessing/Utility/Misc.swift new file mode 100644 index 000000000..8a9cbe325 --- /dev/null +++ b/Sources/_StringProcessing/Utility/Misc.swift @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// This source file is part of the Swift.org open source project +// +// Copyright (c) 2022 Apple Inc. and the Swift project authors +// Licensed under Apache License v2.0 with Runtime Library Exception +// +// See https://swift.org/LICENSE.txt for license information +// +//===----------------------------------------------------------------------===// + +extension Array { + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into elements of the array by `finish`. The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. + func coalescing( + with initialAccumulator: T, into finish: (T) -> Self, + accumulate: (inout T, Element) -> Bool + ) -> Self { + var didAccumulate = false + var accumulator = initialAccumulator + + var result = Self() + for elt in self { + if accumulate(&accumulator, elt) { + // The element has been coalesced into accumulator, there is nothing + // else to do. 
+ didAccumulate = true + continue + } + if didAccumulate { + // We have a leftover accumulator, which needs to be finished before we + // can append the next element. + result += finish(accumulator) + accumulator = initialAccumulator + didAccumulate = false + } + result.append(elt) + } + // Handle a leftover accumulation. + if didAccumulate { + result += finish(accumulator) + } + return result + } + + /// Coalesce adjacent elements using a given accumulator. The accumulator is + /// transformed into an element of the array by `finish`. The `accumulate` + /// function should return `true` if the accumulator has coalesced the + /// element, `false` otherwise. + func coalescing( + with initialAccumulator: T, into finish: (T) -> Element, + accumulate: (inout T, Element) -> Bool + ) -> Self { + coalescing( + with: initialAccumulator, into: { [finish($0) ]}, accumulate: accumulate) + } +} diff --git a/Sources/_StringProcessing/Utility/RegexFactory.swift b/Sources/_StringProcessing/Utility/RegexFactory.swift index 693b04966..31245c0f7 100644 --- a/Sources/_StringProcessing/Utility/RegexFactory.swift +++ b/Sources/_StringProcessing/Utility/RegexFactory.swift @@ -40,7 +40,7 @@ public struct _RegexFactory { @_spi(RegexBuilder) @available(SwiftStdlib 5.7, *) public func assertion( - _ kind: DSLTree._AST.AssertionKind + _ kind: DSLTree.Atom.Assertion ) -> Regex { .init(node: .atom(.assertion(kind))) } diff --git a/Sources/_StringProcessing/_CharacterClassModel.swift b/Sources/_StringProcessing/_CharacterClassModel.swift index db2088782..9f515f220 100644 --- a/Sources/_StringProcessing/_CharacterClassModel.swift +++ b/Sources/_StringProcessing/_CharacterClassModel.swift @@ -15,8 +15,7 @@ // an AST, but this isn't a natural thing to produce in the context // of parsing or to store in an AST -@_spi(RegexBuilder) -public struct _CharacterClassModel: Hashable { +struct _CharacterClassModel: Hashable { /// The actual character class to match. 
var cc: Representation @@ -28,7 +27,7 @@ public struct _CharacterClassModel: Hashable { var isInverted: Bool = false // TODO: Split out builtin character classes into their own type? - public enum Representation: Hashable { + enum Representation: Hashable { /// Any character case any /// Any grapheme cluster @@ -50,74 +49,6 @@ public struct _CharacterClassModel: Hashable { case whitespace /// Character.isLetter or Character.isDigit or Character == "_" case word - /// One of the custom character set. - case custom([CharacterSetComponent]) - } - - public enum SetOperator: Hashable { - case subtraction - case intersection - case symmetricDifference - } - - /// A binary set operation that forms a character class component. - public struct SetOperation: Hashable { - var lhs: CharacterSetComponent - var op: SetOperator - var rhs: CharacterSetComponent - - func matches(_ c: Character, with options: MatchingOptions) -> Bool { - switch op { - case .intersection: - return lhs.matches(c, with: options) && rhs.matches(c, with: options) - case .subtraction: - return lhs.matches(c, with: options) && !rhs.matches(c, with: options) - case .symmetricDifference: - return lhs.matches(c, with: options) != rhs.matches(c, with: options) - } - } - } - - public enum CharacterSetComponent: Hashable { - case character(Character) - case range(ClosedRange) - - /// A nested character class. - case characterClass(_CharacterClassModel) - - /// A binary set operation of character class components. 
- indirect case setOperation(SetOperation) - - public static func setOperation( - lhs: CharacterSetComponent, op: SetOperator, rhs: CharacterSetComponent - ) -> CharacterSetComponent { - .setOperation(.init(lhs: lhs, op: op, rhs: rhs)) - } - - func matches(_ character: Character, with options: MatchingOptions) -> Bool { - switch self { - case .character(let c): - if options.isCaseInsensitive { - return c.lowercased() == character.lowercased() - } else { - return c == character - } - case .range(let range): - if options.isCaseInsensitive { - let newLower = range.lowerBound.lowercased() - let newUpper = range.upperBound.lowercased() - // FIXME: Is failing this possible? Is this the right behavior if so? - guard newLower <= newUpper else { return false } - return (newLower...newUpper).contains(character.lowercased()) - } else { - return range.contains(character) - } - case .characterClass(let custom): - let str = String(character) - return custom.matches(in: str, at: str.startIndex, with: options) != nil - case .setOperation(let op): return op.matches(character, with: options) - } - } } enum MatchLevel: Hashable { @@ -153,7 +84,7 @@ public struct _CharacterClassModel: Hashable { } /// Inverts a character class. 
- public var inverted: Self { + var inverted: Self { return withInversion(true) } @@ -188,8 +119,6 @@ public struct _CharacterClassModel: Hashable { matched = c.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = c.isWordCharacter && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(c, with: options) } } if isInverted { matched.toggle() @@ -222,8 +151,6 @@ public struct _CharacterClassModel: Hashable { matched = c.properties.isWhitespace && (c.isASCII || !options.usesASCIISpaces) case .word: matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !options.usesASCIIWord) - case .custom(let set): - matched = set.any { $0.matches(Character(c), with: options) } } if isInverted { matched.toggle() @@ -233,80 +160,50 @@ public struct _CharacterClassModel: Hashable { } } -@available(SwiftStdlib 5.7, *) -extension _CharacterClassModel: RegexComponent { - public typealias RegexOutput = Substring - - public var regex: Regex { - guard let ast = self.makeAST() else { - fatalError("FIXME: extended AST?") - } - return Regex(ast: ast) - } -} - -@_spi(RegexBuilder) extension _CharacterClassModel { - public static var any: _CharacterClassModel { + static var any: _CharacterClassModel { .init(cc: .any, matchLevel: .graphemeCluster) } - public static var anyGrapheme: _CharacterClassModel { + static var anyGrapheme: _CharacterClassModel { .init(cc: .anyGrapheme, matchLevel: .graphemeCluster) } - public static var anyUnicodeScalar: _CharacterClassModel { + static var anyUnicodeScalar: _CharacterClassModel { .init(cc: .any, matchLevel: .unicodeScalar) } - public static var whitespace: _CharacterClassModel { + static var whitespace: _CharacterClassModel { .init(cc: .whitespace, matchLevel: .graphemeCluster) } - public static var digit: _CharacterClassModel { + static var digit: _CharacterClassModel { .init(cc: .digit, matchLevel: .graphemeCluster) } - public static var hexDigit: _CharacterClassModel { + 
static var hexDigit: _CharacterClassModel { .init(cc: .hexDigit, matchLevel: .graphemeCluster) } - public static var horizontalWhitespace: _CharacterClassModel { + static var horizontalWhitespace: _CharacterClassModel { .init(cc: .horizontalWhitespace, matchLevel: .graphemeCluster) } - public static var newlineSequence: _CharacterClassModel { + static var newlineSequence: _CharacterClassModel { .init(cc: .newlineSequence, matchLevel: .graphemeCluster) } - public static var verticalWhitespace: _CharacterClassModel { + static var verticalWhitespace: _CharacterClassModel { .init(cc: .verticalWhitespace, matchLevel: .graphemeCluster) } - public static var word: _CharacterClassModel { + static var word: _CharacterClassModel { .init(cc: .word, matchLevel: .graphemeCluster) } - - public static func custom( - _ components: [_CharacterClassModel.CharacterSetComponent] - ) -> _CharacterClassModel { - .init(cc: .custom(components), matchLevel: .graphemeCluster) - } -} - -extension _CharacterClassModel.CharacterSetComponent: CustomStringConvertible { - public var description: String { - switch self { - case .range(let range): return "" - case .character(let character): return "" - case .characterClass(let custom): return "\(custom)" - case .setOperation(let op): return "<\(op.lhs) \(op.op) \(op.rhs)>" - } - } } extension _CharacterClassModel.Representation: CustomStringConvertible { - public var description: String { + var description: String { switch self { case .any: return "" case .anyGrapheme: return "" @@ -318,95 +215,16 @@ extension _CharacterClassModel.Representation: CustomStringConvertible { case .verticalWhitespace: return "vertical whitespace" case .whitespace: return "" case .word: return "" - case .custom(let set): return "" } } } extension _CharacterClassModel: CustomStringConvertible { - public var description: String { + var description: String { return "\(isInverted ? 
"not " : "")\(cc)" } } -extension _CharacterClassModel { - public func makeDSLTreeCharacterClass() -> DSLTree.CustomCharacterClass? { - // FIXME: Implement in DSLTree instead of wrapping an AST atom - switch makeAST() { - case .atom(let atom): - return .init(members: [.atom(.unconverted(.init(ast: atom)))]) - default: - return nil - } - } - - internal func makeAST() -> AST.Node? { - let inv = isInverted - - func esc(_ b: AST.Atom.EscapedBuiltin) -> AST.Node { - escaped(b) - } - - switch cc { - case .any: return atom(.any) - - case .digit: - return esc(inv ? .notDecimalDigit : .decimalDigit) - - case .horizontalWhitespace: - return esc( - inv ? .notHorizontalWhitespace : .horizontalWhitespace) - - // FIXME: newline sequence is not same as \n - case .newlineSequence: - return esc(inv ? .notNewline : .newline) - - case .whitespace: - return esc(inv ? .notWhitespace : .whitespace) - - case .verticalWhitespace: - return esc(inv ? .notVerticalTab : .verticalTab) - - case .word: - return esc(inv ? .notWordCharacter : .wordCharacter) - - case .anyGrapheme: - return esc(.graphemeCluster) - - case .hexDigit: - let members: [AST.CustomCharacterClass.Member] = [ - range_m(.char("a"), .char("f")), - range_m(.char("A"), .char("F")), - range_m(.char("0"), .char("9")), - ] - let ccc = AST.CustomCharacterClass( - .init(faking: inv ? .inverted : .normal), - members, - .fake) - - return .customCharacterClass(ccc) - - default: return nil - } - } -} - -extension DSLTree.Node { - var characterClass: _CharacterClassModel? { - switch self { - case let .customCharacterClass(ccc): - return ccc.modelCharacterClass - case let .atom(a): - return a.characterClass - case .characterPredicate: - // FIXME: Do we make one from this? 
- return nil - default: - return nil - } - } -} - extension _CharacterClassModel { func withMatchLevel( _ level: _CharacterClassModel.MatchLevel @@ -417,17 +235,6 @@ extension _CharacterClassModel { } } -extension DSLTree.Atom { - var characterClass: _CharacterClassModel? { - switch self { - case let .unconverted(a): - return a.ast.characterClass - - default: return nil - } - } -} - extension AST.Atom { var characterClass: _CharacterClassModel? { switch kind { @@ -438,8 +245,8 @@ extension AST.Atom { // this? Or does grapheme-semantic mode complicate that? return nil - case .any: - // `.any` is handled in the matching engine by Compiler.emitAny() and in + case .dot: + // `.dot` is handled in the matching engine by Compiler.emitDot() and in // the legacy compiler by the `.any` instruction, which can provide lower // level instructions than the CharacterClass-generated consumer closure // @@ -468,7 +275,7 @@ extension AST.Atom.EscapedBuiltin { // FIXME: This is more like '.' than inverted '\R', as it is affected // by e.g (*CR). We should therefore really be emitting it through - // emitAny(). For now we treat it as semantically invalid. + // emitDot(). For now we treat it as semantically invalid. case .notNewline: return .newlineSequence.inverted case .whitespace: return .whitespace @@ -489,81 +296,6 @@ extension AST.Atom.EscapedBuiltin { } } -extension DSLTree.CustomCharacterClass { - // TODO: Refactor a bit, and... can we drop this type? - var modelCharacterClass: _CharacterClassModel? 
{ - var result = - Array<_CharacterClassModel.CharacterSetComponent>() - for m in members { - switch m { - case let .atom(a): - if let cc = a.characterClass { - result.append(.characterClass(cc)) - } else if let c = a.literalCharacterValue { - result.append(.character(c)) - } else { - return nil - } - case let .range(low, high): - guard let lhs = low.literalCharacterValue, - let rhs = high.literalCharacterValue - else { - return nil - } - result.append(.range(lhs...rhs)) - - case let .custom(ccc): - guard let cc = ccc.modelCharacterClass else { - return nil - } - result.append(.characterClass(cc)) - - case let .intersection(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .intersection, - rhs: .characterClass(rhs))) - - case let .subtraction(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .subtraction, - rhs: .characterClass(rhs))) - - case let .symmetricDifference(lhs, rhs): - guard let lhs = lhs.modelCharacterClass, - let rhs = rhs.modelCharacterClass - else { - return nil - } - result.append(.setOperation( - lhs: .characterClass(lhs), - op: .symmetricDifference, - rhs: .characterClass(rhs))) - - case let .quotedLiteral(s): - // Decompose quoted literal into literal characters. - result += s.map { .character($0) } - - case .trivia: - break - } - } - let cc = _CharacterClassModel.custom(result) - return isInverted ? 
cc.inverted : cc - } -} - extension _CharacterClassModel { // FIXME: Calling on inverted sets wont be the same as the // inverse of a boundary if at the start or end of the diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index b67c6c242..e25f2df05 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -12,6 +12,7 @@ import XCTest import _StringProcessing import RegexBuilder +import TestSupport class RegexDSLTests: XCTestCase { func _testDSLCaptures( @@ -69,7 +70,13 @@ class RegexDSLTests: XCTestCase { XCTAssertTrue(match.output == substringMatch.output) } + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + func testCharacterClasses() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + try _testDSLCaptures( ("a c", ("a c", " ", "c")), matchType: (Substring, Substring, Substring).self, ==) @@ -110,9 +117,143 @@ class RegexDSLTests: XCTestCase { CharacterClass.whitespace.inverted } } + + // `.newlineSequence` and `.verticalWhitespace` match the same set of + // newlines in grapheme semantic mode, and scalar mode when applied with + // OneOrMore. + for cc in [CharacterClass.newlineSequence, .verticalWhitespace] { + for mode in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], allNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", allNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode) + } + + // Try with ASCII-only whitespace. 
+ try _testDSLCaptures( + ("\n", ("\n", "\n")), + ("\r", ("\r", "\r")), + ("\r\n", ("\r\n", "\r\n")), + (allNewlines, (allNewlines[...], asciiNewlines[...])), + ("abc\ndef", ("abc\ndef", "\n")), + ("abc\n\r\ndef", ("abc\n\r\ndef", "\n\r\n")), + ("abc\(allNewlines)def", ("abc\(allNewlines)def", asciiNewlines[...])), + ("abc", nil), + matchType: (Substring, Substring).self, ==) + { + Regex { + ZeroOrMore { + cc.inverted + } + Capture { + OneOrMore(cc) + } + ZeroOrMore { + cc.inverted + } + }.matchingSemantics(mode).asciiOnlyWhitespace() + } + } + } + + // `.newlineSequence` in scalar mode may match a single `\r\n`. + // `.verticalWhitespace` may not. + for asciiOnly in [true, false] { + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", "\r\n"), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.newlineSequence.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", "\r"), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + try _testDSLCaptures( + ("\r", nil), + ("\r\n", nil), + matchType: Substring.self, ==) + { + Regex { + CharacterClass.verticalWhitespace.inverted + "\n" + }.matchingSemantics(.unicodeScalar).asciiOnlyWhitespace(asciiOnly) + } + } + + // Make sure horizontal whitespace does not match newlines or other + // vertical whitespace. 
+ try _testDSLCaptures( + (" \u{A0} \u{9} \t ", " \u{A0} \u{9} \t "), + (" \n", nil), + (" \r", nil), + (" \r\n", nil), + (" \u{2028}", nil), + matchType: Substring.self, ==) + { + OneOrMore(.horizontalWhitespace) + } + + // Horizontal whitespace in ASCII mode. + try _testDSLCaptures( + (" \u{9} \t ", " \u{9} \t "), + ("\u{A0}", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.horizontalWhitespace) + }.asciiOnlyWhitespace() + } } func testCharacterClassOperations() throws { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + try _testDSLCaptures( ("bcdefn1a", "bcdefn1a"), ("nbcdef1a", nil), // fails symmetric difference lookahead @@ -133,6 +274,105 @@ class RegexDSLTests: XCTestCase { } } + func testAny() throws { + // .any matches newlines regardless of matching options. + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.any) + }.dotMatchesNewlines(dotMatchesNewline) + } + } + + // `.anyGraphemeCluster` is the same as `.any` in grapheme mode. + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + try _testDSLCaptures( + ("a", "a"), + ("\r\n", "\r\n"), + ("e\u{301}", "e\u{301}"), + ("e\u{301}f", nil), + ("e\u{303}\u{301}\u{302}", "e\u{303}\u{301}\u{302}"), + matchType: Substring.self, ==) + { + Regex { + One(.anyGraphemeCluster) + }.matchingSemantics(mode) + } + + // Like `.any` it also always matches newlines. + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abc\(allNewlines)def", "abc\(allNewlines)def"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyGraphemeCluster) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } + } + + func testAnyNonNewline() throws { + // `.anyNonNewline` is `.` without single-line mode. 
+ for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + for dotMatchesNewline in [true, false] { + try _testDSLCaptures( + ("abcdef", "abcdef"), + ("abcdef\n", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abcdef", nil), + ("abcdef\n", nil), + ("\r\n", "\r\n"), + ("\r", "\r"), + ("\n", "\n"), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(.anyNonNewline.inverted) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + + try _testDSLCaptures( + ("abc", "abc"), + ("abcd", nil), + ("\r\n", nil), + ("\r", nil), + ("\n", nil), + matchType: Substring.self, ==) + { + Regex { + OneOrMore(CharacterClass.anyNonNewline.intersection(.anyOf("\n\rabc"))) + }.matchingSemantics(mode).dotMatchesNewlines(dotMatchesNewline) + } + } + } + + try _testDSLCaptures( + ("\r\n", "\r\n"), matchType: Substring.self, ==) { + CharacterClass.anyNonNewline.inverted + } + try _testDSLCaptures( + ("\r\n", nil), matchType: Substring.self, ==) { + Regex { + CharacterClass.anyNonNewline.inverted + }.matchingSemantics(.unicodeScalar) + } + } + func testMatchResultDotZeroWithoutCapture() throws { let match = try XCTUnwrap("aaa".wholeMatch { OneOrMore { "a" } }) XCTAssertEqual(match.0, "aaa") @@ -234,8 +474,10 @@ class RegexDSLTests: XCTestCase { ("abcabc", "abcabc"), ("abcABCaBc", "abcABCaBc"), matchType: Substring.self, ==) { - OneOrMore { - "abc" + Regex { + OneOrMore { + "abc" + } }.ignoresCase(true) } @@ -247,8 +489,10 @@ class RegexDSLTests: XCTestCase { ("abcabc", "abcabc"), ("abcABCaBc", "abcABCaBc"), matchType: Substring.self, ==) { - OneOrMore { - "abc" + Regex { + OneOrMore { + "abc" + } } .ignoresCase(true) .ignoresCase(false) @@ -264,9 +508,13 @@ class RegexDSLTests: XCTestCase { ("abcabc", "abcabc"), ("abcdeABCdeaBcde", "abcdeABCdeaBcde"), matchType: Substring.self, ==) 
{ - OneOrMore { - "abc".ignoresCase(true) - Optionally("de") + Regex { + OneOrMore { + Regex { + "abc" + }.ignoresCase(true) + Optionally("de") + } } .ignoresCase(false) } @@ -303,11 +551,13 @@ class RegexDSLTests: XCTestCase { "stop" " " - Capture { - OneOrMore(.word) - Anchor.wordBoundary - } - .wordBoundaryKind(.unicodeLevel1) + Regex { + Capture { + OneOrMore(.word) + Anchor.wordBoundary + } + }.wordBoundaryKind(.simple) + OneOrMore(.any, .reluctant) "stop" } @@ -317,15 +567,17 @@ class RegexDSLTests: XCTestCase { matchType: (Substring, Substring, Substring).self, ==) { Capture { // Reluctant behavior due to option - OneOrMore(.anyOf("abcd")) - .repetitionBehavior(.reluctant) + Regex { + OneOrMore(.anyOf("abcd")) + }.repetitionBehavior(.reluctant) } ZeroOrMore("a"..."z") Capture { // Eager behavior due to explicit parameter, despite option - OneOrMore(.digit, .eager) - .repetitionBehavior(.reluctant) + Regex { + OneOrMore(.digit, .eager) + }.repetitionBehavior(.reluctant) } ZeroOrMore(.digit) } @@ -334,16 +586,20 @@ class RegexDSLTests: XCTestCase { ("abcdefg", ("abcdefg", "abcdefg")), ("abcdéfg", ("abcdéfg", "abcd")), matchType: (Substring, Substring).self, ==) { - Capture { - OneOrMore(.word) - } - .asciiOnlyWordCharacters() + Regex { + Capture { + OneOrMore(.word) + } + }.asciiOnlyWordCharacters() ZeroOrMore(.any) } } func testQuantificationBehavior() throws { + // Must have new stdlib for character class ranges.
+ guard ensureNewStdlib() else { return } + // Eager by default try _testDSLCaptures( ("abc1def2", ("abc1def2", "2")), @@ -368,8 +624,10 @@ class RegexDSLTests: XCTestCase { ("abc1def2", ("abc1def2", "1")), matchType: (Substring, Substring).self, ==) { - OneOrMore(.reluctant) { - One(.word) + Regex { + OneOrMore(.reluctant) { + One(.word) + } }.repetitionBehavior(.possessive) Capture(.digit) ZeroOrMore(.any) @@ -421,8 +679,9 @@ class RegexDSLTests: XCTestCase { { Regex { Capture { - OneOrMore("a") - .repetitionBehavior(.eager) + Regex { + OneOrMore("a") + }.repetitionBehavior(.eager) } OneOrMore("a") }.repetitionBehavior(.possessive) @@ -434,8 +693,9 @@ class RegexDSLTests: XCTestCase { { Regex { Capture { - OneOrMore("a") - .repetitionBehavior(.reluctant) + Regex { + OneOrMore("a") + }.repetitionBehavior(.reluctant) } OneOrMore("a") }.repetitionBehavior(.possessive) @@ -674,19 +934,40 @@ class RegexDSLTests: XCTestCase { Anchor.endOfSubject }.anchorsMatchLineEndings() } - - // FIXME: Anchor.start/endOfLine needs to always match line endings, - // even when the `anchorsMatchLineEndings()` option is turned off. + try _testDSLCaptures( - ("\naaa", "aaa"), - ("aaa\n", "aaa"), - ("\naaa\n", "aaa"), - matchType: Substring.self, ==, xfail: true) + ("\naaa", "\naaa"), + ("aaa\n", "aaa\n"), + ("\naaa\n", "\naaa\n"), + matchType: Substring.self, ==) { Regex { + Optionally { "\n" } Anchor.startOfLine Repeat("a", count: 3) Anchor.endOfLine + Optionally { "\n" } + } + } + + // startOfLine/endOfLine apply regardless of mode. 
+ for matchLineEndings in [true, false] { + for mode in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + let r = Regex { + Anchor.startOfLine + Repeat("a", count: 3) + Anchor.endOfLine + }.anchorsMatchLineEndings(matchLineEndings).matchingSemantics(mode) + + XCTAssertNotNil(try r.firstMatch(in: "\naaa")) + XCTAssertNotNil(try r.firstMatch(in: "aaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\naaa\r\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\n")) + XCTAssertNotNil(try r.firstMatch(in: "\r\naaa\r\n")) + + XCTAssertNil(try r.firstMatch(in: "\nbaaa\n")) + XCTAssertNil(try r.firstMatch(in: "\naaab\n")) } } } @@ -1120,6 +1401,121 @@ class RegexDSLTests: XCTestCase { } } + func testScalarMatching() throws { + // RegexBuilder provides a RegexComponent conformance for UnicodeScalar. In + // grapheme cluster mode, it should only match entire graphemes. It may + // match a single scalar of a grapheme cluster in scalar semantic mode. 
+ XCTAssertNotNil("a".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNil("a\u{301}".firstMatch(of: "a" as UnicodeScalar)) + XCTAssertNotNil("a\u{301}".firstMatch( + of: ("a" as UnicodeScalar).regex.matchingSemantics(.unicodeScalar))) + + let r1 = Regex { + "a" as UnicodeScalar + } + XCTAssertNil(try r1.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r1.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r2 = Regex { + CharacterClass.anyOf(["a" as UnicodeScalar, "👍"]) + } + XCTAssertNil(try r2.firstMatch(in: "a\u{301}")) + XCTAssertNotNil( + try r2.matchingSemantics(.unicodeScalar).firstMatch(in: "a\u{301}") + ) + + let r3 = Regex { + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👧" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👦" as UnicodeScalar + } + XCTAssertNotNil(try r3.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r3.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + let r4 = Regex { "é" as UnicodeScalar } + XCTAssertNotNil( + try r4.firstMatch(in: "e\u{301}") + ) + XCTAssertNotNil( + try r4.firstMatch(in: "é") + ) + + let r5 = Regex { + "e" + "\u{301}" as UnicodeScalar + } + XCTAssertNotNil(try r5.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r5.firstMatch(in: "é")) + + let r6 = Regex { + "abcde" + "\u{301}" + } + XCTAssertNotNil(try r6.firstMatch(in: "abcde\u{301}")) + XCTAssertNotNil(try r6.firstMatch(in: "abcdé")) + + let r7 = Regex { + "e" as Character + "\u{301}" as Character + } + XCTAssertNotNil(try r7.firstMatch(in: "e\u{301}")) + XCTAssertNotNil(try r7.firstMatch(in: "é")) + + // You can't match a partial grapheme in grapheme semantic mode.
+ let r8 = Regex { + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👨" as UnicodeScalar + "\u{200D}" as UnicodeScalar + "👧" as UnicodeScalar + } + XCTAssertNil(try r8.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNil(try r8.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r8.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNil(try r8.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + // Scalar coalescing occurs across nested concatenations and literals. + let r9 = Regex { + Regex { + try! Regex(#"👨"#) + "\u{200D}" as UnicodeScalar + Regex { + "👨" as UnicodeScalar + } + } + Regex { + Regex { + "\u{200D}" as UnicodeScalar + "👧" + } + try! Regex(#"\u{200D}👦"#) + } + } + XCTAssertNotNil(try r9.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r9.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r9.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + + let r10 = Regex { + "👨" as UnicodeScalar + try!
Regex(#"\u{200D 1F468 200D 1F467}"#) + "\u{200D}" as UnicodeScalar + "👦" as UnicodeScalar + } + XCTAssertNotNil(try r10.firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r10.wholeMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).firstMatch(in: "👨‍👨‍👧‍👦")) + XCTAssertNotNil(try r10.matchingSemantics(.unicodeScalar).wholeMatch(in: "👨‍👨‍👧‍👦")) + } + struct SemanticVersion: Equatable { var major: Int var minor: Int diff --git a/Tests/RegexTests/CompileTests.swift b/Tests/RegexTests/CompileTests.swift index 4e64f7335..27f8d79cb 100644 --- a/Tests/RegexTests/CompileTests.swift +++ b/Tests/RegexTests/CompileTests.swift @@ -11,9 +11,135 @@ @testable import _RegexParser @testable import _StringProcessing +import TestSupport import XCTest +enum DecodedInstr { + case invalid + case moveImmediate + case moveCurrentPosition + case branch + case condBranchZeroElseDecrement + case condBranchSamePosition + case save + case saveAddress + case splitSaving + case clear + case clearThrough + case accept + case fail + case advance + case match + case matchCaseInsensitive + case matchScalar + case matchScalarCaseInsensitiveUnchecked + case matchScalarCaseInsensitive + case matchScalarUnchecked + case matchBitsetScalar + case matchBitset + case consumeBy + case assertBy + case matchBy + case backreference + case beginCapture + case endCapture + case transformCapture + case captureValue + case builtinAssertion + case builtinCharacterClass +} + +extension DecodedInstr { + /// Decode the given instruction by looking at the opcode and payload, expanding out certain instructions + /// like matchScalar and match into their variants + /// + /// Must stay in sync with Processor.cycle + static func decode(_ instruction: Instruction) -> DecodedInstr { + let (opcode, payload) = instruction.destructure + + switch opcode { + case .invalid: + fatalError("Invalid program") + case .moveImmediate: +
return .moveImmediate + case .moveCurrentPosition: + return .moveCurrentPosition + case .branch: + return .branch + case .condBranchZeroElseDecrement: + return .condBranchZeroElseDecrement + case .condBranchSamePosition: + return .condBranchSamePosition + case .save: + return .save + case .saveAddress: + return .saveAddress + case .splitSaving: + return .splitSaving + case .clear: + return .clear + case .clearThrough: + return .clearThrough + case .accept: + return .accept + case .fail: + return .fail + case .advance: + return .advance + case .match: + let (isCaseInsensitive, _) = payload.elementPayload + if isCaseInsensitive { + return .matchCaseInsensitive + } else { + return .match + } + case .matchScalar: + let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload + if caseInsensitive { + if boundaryCheck { + return .matchScalarCaseInsensitive + } else { + return .matchScalarCaseInsensitiveUnchecked + } + } else { + if boundaryCheck { + return .matchScalar + } else { + return .matchScalarUnchecked + } + } + case .matchBitset: + let (isScalar, _) = payload.bitsetPayload + if isScalar { + return .matchBitsetScalar + } else { + return .matchBitset + } + case .consumeBy: + return .consumeBy + case .assertBy: + return .assertBy + case .matchBy: + return .matchBy + case .backreference: + return .backreference + case .beginCapture: + return .beginCapture + case .endCapture: + return .endCapture + case .transformCapture: + return .transformCapture + case .captureValue: + return .captureValue + case .builtinAssertion: + return .builtinAssertion + case .builtinCharacterClass: + return .builtinCharacterClass +} + } +} + extension RegexTests { private func testCompilationEquivalence( @@ -43,6 +169,45 @@ extension RegexTests { } } + private func testCompileError( + _ regex: String, _ error: RegexCompilationError, + file: StaticString = #file, line: UInt = #line + ) { + do { + _ = try _compileRegex(regex) + XCTFail("Expected compile error", file: file, line: line) + }
catch let err as RegexCompilationError { + XCTAssertEqual(err, error, file: file, line: line) + } catch { + XCTFail("Unknown compile error", file: file, line: line) + } + } + + func testInvalidScalarCoalescing() throws { + guard ensureNewStdlib() else { return } + + // Non-single-scalar bounds. + testCompileError( + #"[a\u{302}-✅]"#, .invalidCharacterClassRangeOperand("a\u{302}")) + testCompileError( + #"[e\u{301}-\u{302}]"#, .invalidCharacterClassRangeOperand("e\u{301}")) + testCompileError( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + .invalidCharacterClassRangeOperand("\u{73}\u{323}\u{307}")) + testCompileError( + #"[a\u{315}\u{301}-\u{302}]"#, + .invalidCharacterClassRangeOperand("a\u{315}\u{301}") + ) + testCompileError( + #"[a-z1e\u{301}-\u{302}\u{E1}3-59]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + testCompileError( + #"[[e\u{301}-\u{302}]&&e\u{303}]"#, + .invalidCharacterClassRangeOperand("e\u{301}") + ) + } + func testCompileQuantification() throws { // NOTE: While we might change how we compile @@ -147,16 +312,24 @@ extension RegexTests { for regex: String, syntax: SyntaxOptions = .traditional, semanticLevel: RegexSemanticLevel? = nil, - contains targets: Set, + contains targets: Set = [], + doesNotContain invalid: Set = [], file: StaticString = #file, line: UInt = #line ) { do { let prog = try _compileRegex(regex, syntax, semanticLevel) - var found: Set = [] + var found: Set = [] for inst in prog.engine.instructions { - if targets.contains(inst.opcode) { - found.insert(inst.opcode) + let decoded = DecodedInstr.decode(inst) + found.insert(decoded) + + if invalid.contains(decoded) { + XCTFail( + "Compiled regex '\(regex)' contains incorrect opcode \(decoded)", + file: file, + line: line) + return } } @@ -174,38 +347,139 @@ extension RegexTests { } } - private func expectProgram( - for regex: String, - syntax: SyntaxOptions = .traditional, - semanticLevel: RegexSemanticLevel?
= nil, - doesNotContain targets: Set, - file: StaticString = #file, - line: UInt = #line - ) { - do { - let prog = try _compileRegex(regex, syntax, semanticLevel) - for inst in prog.engine.instructions { - if targets.contains(inst.opcode) { - XCTFail( - "Compiled regex '\(regex)' contains incorrect opcode \(inst.opcode)", - file: file, - line: line) - return - } - } - } catch { - XCTFail( - "Failed to compile regex '\(regex)': \(error)", - file: file, - line: line) - } + func testBitsetCompile() { + expectProgram( + for: "[abc]", + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: "[abc]", + semanticLevel: .unicodeScalar, + contains: [.matchBitsetScalar], + doesNotContain: [.matchBitset, .consumeBy]) + expectProgram( + for: #"[\Qab\Ec]"#, + contains: [.matchBitset], + doesNotContain: [.consumeBy, .matchBitsetScalar]) + expectProgram( + for: #"[\Qab\Ec]"#, + semanticLevel: .unicodeScalar, + contains: [.matchBitsetScalar], + doesNotContain: [.matchBitset, .consumeBy]) } - func testBitsetCompile() { - expectProgram(for: "[abc]", contains: [.matchBitset]) - expectProgram(for: "[abc]", doesNotContain: [.consumeBy]) + func testScalarOptimizeCompilation() { + // all ascii quoted literal -> elide boundary checks + expectProgram( + for: "abcd", + contains: [.matchScalar, .matchScalarUnchecked], + doesNotContain: [.match, .consumeBy]) + // ascii character -> matchScalar with boundary check + expectProgram( + for: "a", + contains: [.matchScalar], + doesNotContain: [.match, .consumeBy, .matchScalarUnchecked]) + // quoted literal is not all ascii -> match scalar when possible, always do boundary checks + expectProgram( + for: "aaa\u{301}", + contains: [.match, .matchScalar], + doesNotContain: [.consumeBy, .matchScalarUnchecked]) + // scalar mode -> always emit match scalar without boundary checks + expectProgram( + for: "abcd", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, 
.consumeBy, .matchScalar]) + expectProgram( + for: "a", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + expectProgram( + for: "aaa\u{301}", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.match, .consumeBy, .matchScalar]) + } + + func testCaseInsensitivityCompilation() { + // quoted literal is all ascii -> match scalar case insensitive and skip + // boundary checks + expectProgram( + for: "(?i)abcd", + contains: [.matchScalarCaseInsensitiveUnchecked, .matchScalarCaseInsensitive], + doesNotContain: [.match, .matchCaseInsensitive, .matchScalar, .matchScalarUnchecked]) + // quoted literal is all non-cased ascii -> emit match scalar instructions + expectProgram( + for: "(?i)&&&&", + contains: [.matchScalar, .matchScalarUnchecked], + doesNotContain: [.match, .matchCaseInsensitive, + .matchScalarCaseInsensitive, .matchScalarCaseInsensitiveUnchecked]) + // quoted literal is not all ascii -> match scalar case insensitive when + // possible, match character case insensitive when needed, always perform + // boundary check + expectProgram( + for: "(?i)abcd\u{301}", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match, .matchScalar]) + // same as before but contains ascii non cased characters -> emit matchScalar for them + expectProgram( + for: "(?i)abcd\u{301};.'!", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match]) + // contains non-ascii non-cased characters -> emit match + expectProgram( + for: "(?i)abcd\u{301};.'!💖", + contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar, .match], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked]) + + // scalar mode -> emit unchecked scalar match only, emit case insensitive + // only if the scalar is cased +
expectProgram( + for: "(?i);.'!💖", + semanticLevel: .unicodeScalar, + contains: [.matchScalarUnchecked], + doesNotContain: [.matchScalarCaseInsensitiveUnchecked]) + expectProgram( + for: "(?i)abcdé", + semanticLevel: .unicodeScalar, + contains: [.matchScalarCaseInsensitiveUnchecked], + doesNotContain: [.matchScalarUnchecked]) + } + + func testQuantificationForwardProgressCompile() { + // Unbounded quantification + non forward progressing inner nodes + // Expect to emit the position checking instructions + expectProgram(for: #"(?:(?=a)){1,}"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\b)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|)+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)(?i-i:))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i))+"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) - expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, doesNotContain: [.matchBitset]) - expectProgram(for: "[abc]", semanticLevel: .unicodeScalar, contains: [.consumeBy]) + // Bounded quantification, don't emit position checking + expectProgram(for: #"(?:(?=a)){1,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\b)?"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:|){,4}"#,
doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?#comment)(?i-i:)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(?:\w|(?i)){,4}"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + + // Inner node is a quantification that does not guarantee forward progress + expectProgram(for: #"(a*)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a?)*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a{,5})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((\b){,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((\b){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"((|){1,4})*"#, contains: [.moveCurrentPosition, .condBranchSamePosition]) + // Inner node is a quantification that guarantees forward progress + expectProgram(for: #"(a+)*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) + expectProgram(for: #"(a{1,})*"#, doesNotContain: [.moveCurrentPosition, .condBranchSamePosition]) } } diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift index d375065ab..8e01582a9 100644 --- a/Tests/RegexTests/MatchTests.swift +++ b/Tests/RegexTests/MatchTests.swift @@ -12,6 +12,7 @@ import XCTest @testable import _RegexParser @testable import _StringProcessing +import TestSupport struct MatchError: Error { var message: String @@ -24,24 +25,35 @@ func _firstMatch( _ regexStr: String, input: String, validateOptimizations: Bool, 
+ semanticLevel: RegexSemanticLevel = .graphemeCluster, syntax: SyntaxOptions = .traditional -) throws -> (String, [String?]) { - var regex = try Regex(regexStr, syntax: syntax) - guard let result = try regex.firstMatch(in: input) else { - throw MatchError("match not found for \(regexStr) in \(input)") - } - let caps = result.output.slices(from: input) - +) throws -> (String, [String?])? { + var regex = try Regex(regexStr, syntax: syntax).matchingSemantics(semanticLevel) + let result = try regex.firstMatch(in: input) + if validateOptimizations { regex._setCompilerOptionsForTesting(.disableOptimizations) - guard let unoptResult = try regex.firstMatch(in: input) else { + let unoptResult = try regex.firstMatch(in: input) + if result != nil && unoptResult == nil { throw MatchError("match not found for unoptimized \(regexStr) in \(input)") } - XCTAssertEqual( - String(input[result.range]), - String(input[unoptResult.range]), - "Unoptimized regex returned a different result") + if result == nil && unoptResult != nil { + throw MatchError("match not found in optimized \(regexStr) in \(input)") + } + if let result = result, let unoptResult = unoptResult { + let optMatch = String(input[result.range]) + let unoptMatch = String(input[unoptResult.range]) + if optMatch != unoptMatch { + throw MatchError(""" + + Unoptimized regex returned: '\(unoptMatch)' + Optimized regex returned: '\(optMatch)' + """) + } + } } + guard let result = result else { return nil } + let caps = result.output.slices(from: input) return (String(input[result.range]), caps.map { $0.map(String.init) }) } @@ -54,6 +66,7 @@ func flatCaptureTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -63,6 +76,7 @@ func flatCaptureTest( regex, input: test, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, syntax: syntax ) else { if expect == nil { 
@@ -113,6 +127,7 @@ func matchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #file, line: UInt = #line ) { @@ -126,6 +141,7 @@ func matchTest( dumpAST: dumpAST, xfail: xfail, validateOptimizations: validateOptimizations, + semanticLevel: semanticLevel, file: file, line: line) } @@ -143,25 +159,25 @@ func firstMatchTest( dumpAST: Bool = false, xfail: Bool = false, validateOptimizations: Bool = true, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { do { - let (found, _) = try _firstMatch( + let found = try _firstMatch( regex, input: input, validateOptimizations: validateOptimizations, - syntax: syntax) + semanticLevel: semanticLevel, + syntax: syntax)?.0 if xfail { XCTAssertNotEqual(found, match, file: file, line: line) } else { - XCTAssertEqual(found, match, file: file, line: line) + XCTAssertEqual(found, match, "Incorrect match", file: file, line: line) } } catch { - // FIXME: This allows non-matches to succeed even when xfail'd - // When xfail == true, this should report failure for match == nil - if !xfail && match != nil { + if !xfail { XCTFail("\(error)", file: file, line: line) } return @@ -175,6 +191,7 @@ func firstMatchTests( enableTracing: Bool = false, dumpAST: Bool = false, xfail: Bool = false, + semanticLevel: RegexSemanticLevel = .graphemeCluster, file: StaticString = #filePath, line: UInt = #line ) { @@ -187,6 +204,7 @@ func firstMatchTests( enableTracing: enableTracing, dumpAST: dumpAST, xfail: xfail, + semanticLevel: semanticLevel, file: file, line: line) } @@ -296,6 +314,55 @@ extension RegexTests { match: "\u{006f}\u{031b}\u{0323}" ) + // e + combining accents + firstMatchTest( + #"e\u{301 302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{315 35C 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: 
"e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\u{302 303}"#, + input: "e\u{301}\u{302}\u{303}", + match: "e\u{301}\u{302}\u{303}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"e\u{35C}\u{315 301}"#, + input: "e\u{315}\u{301}\u{35C}", + match: "e\u{315}\u{301}\u{35C}" + ) + firstMatchTest( + #"e\u{301}\de\u{302}"#, + input: "e\u{301}0e\u{302}", + match: "e\u{301}0e\u{302}" + ) + firstMatchTest( + #"(?x) e \u{35C} \u{315}(?#hello)\u{301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + firstMatchTest( + #"(?x) e \u{35C} \u{315 301}"#, + input: "e\u{301}\u{315}\u{35C}", + match: "e\u{301}\u{315}\u{35C}" + ) + + // We don't coalesce across groups. + firstMatchTests( + #"e\u{301}(?:\u{315}\u{35C})?"#, + ("e\u{301}", "e\u{301}"), + ("e\u{301}\u{315}\u{35C}", nil) + ) + // Escape sequences that represent scalar values. firstMatchTest(#"\a[\b]\e\f\n\r\t"#, input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", @@ -304,8 +371,6 @@ extension RegexTests { input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t", match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t") - firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n") - // MARK: Quotes firstMatchTest( @@ -421,8 +486,7 @@ extension RegexTests { "a++a", ("babc", nil), ("baaabc", nil), - ("bb", nil), - xfail: true) + ("bb", nil)) firstMatchTests( "a+?a", ("babc", nil), @@ -498,23 +562,19 @@ extension RegexTests { ("baabc", nil), ("bb", nil)) - // XFAIL'd versions of the above firstMatchTests( "a{2,4}+a", - ("baaabc", nil), - xfail: true) + ("baaabc", nil)) firstMatchTests( "a{,4}+a", ("babc", nil), ("baabc", nil), - ("baaabc", nil), - xfail: true) + ("baaabc", nil)) firstMatchTests( "a{2,}+a", ("baaabc", nil), ("baaaaabc", nil), - ("baaaaaaaabc", nil), - xfail: true) + ("baaaaaaaabc", nil)) // XFAIL'd possessive tests firstMatchTests( @@ -561,6 +621,9 @@ extension RegexTests { } func testMatchCharacterClasses() { + // Must have new stdlib 
for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Character classes firstMatchTest(#"abc\d"#, input: "xyzabc123", match: "abc1") @@ -596,6 +659,12 @@ extension RegexTests { ("A", true), ("a", false)) + matchTest(#"(?i)[a]"#, + ("πŸ’Ώ", false), + ("a\u{301}", false), + ("A", true), + ("a", true)) + matchTest("[a]", ("a\u{301}", false)) @@ -610,14 +679,12 @@ extension RegexTests { // interpreted as matching the scalars "\r" or "\n". // It does not fully match the character "\r\n" because the character class // in scalar mode will only match one scalar - do { - let regex = try Regex("[\r\n]").matchingSemantics(.unicodeScalar) - XCTAssertEqual("\r", try regex.wholeMatch(in: "\r")?.0) - XCTAssertEqual("\n", try regex.wholeMatch(in: "\n")?.0) - XCTAssertEqual(nil, try regex.wholeMatch(in: "\r\n")?.0) - } catch { - XCTFail("\(error)", file: #filePath, line: #line) - } + matchTest( + "^[\r\n]$", + ("\r", true), + ("\n", true), + ("\r\n", false), + semanticLevel: .unicodeScalar) matchTest("[^\r\n]", ("\r\n", false), @@ -625,7 +692,385 @@ extension RegexTests { ("\r", true)) matchTest("[\n\r]", ("\n", true), - ("\r", true)) + ("\r", true), + ("\r\n", false)) + + matchTest( + #"[a]\u0301"#, + ("a\u{301}", false), + semanticLevel: .graphemeCluster) + matchTest( + #"[a]\u0301"#, + ("a\u{301}", true), + semanticLevel: .unicodeScalar) + + let allNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n\u{85}\u{2028}\u{2029}" + let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n" + + for level in [RegexSemanticLevel.graphemeCluster, .unicodeScalar] { + firstMatchTest( + #"\R+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + firstMatchTest( + #"\v+"#, + input: "abc\(allNewlines)def", match: allNewlines, + semanticLevel: level + ) + } + + // In scalar mode, \R can match \r\n, \v cannot. 
+ firstMatchTest( + #"\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + firstMatchTest( + #"\v\v"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"[^\v]"#, input: "\r\n", match: nil, semanticLevel: .unicodeScalar) + + // ASCII-only spaces. + firstMatchTest(#"(?S)\R+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest(#"(?S)\v+"#, input: allNewlines, match: asciiNewlines) + firstMatchTest( + #"(?S)\R"#, input: "\r\n", match: "\r\n", semanticLevel: .unicodeScalar) + firstMatchTest( + #"(?S)\v"#, input: "\r\n", match: "\r", semanticLevel: .unicodeScalar) + + matchTest( + #"[a]\u0301"#, + ("a\u{301}", false), + semanticLevel: .graphemeCluster) + matchTest( + #"[a]\u0301"#, + ("a\u{301}", true), + semanticLevel: .unicodeScalar) + + // Scalar matching in quoted sequences. + firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", nil), + ("\u{C9}", nil) + ) + firstMatchTests( + "[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", nil), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "\u{301}"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", nil), + ("E", nil), + ("\u{301}", nil), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("E\u{301}", "E\u{301}"), + ("\u{C9}", "\u{C9}") + ) + firstMatchTests( + "(?i)[\\Qe\u{301}\\E]", + ("e", "e"), + ("E", "E"), + ("\u{301}", "\u{301}"), + (eDecomposed, "e"), + (eComposed, nil), + ("E\u{301}", "E"), + ("\u{C9}", nil), + semanticLevel: .unicodeScalar + ) + + // Scalar coalescing. 
+ firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[e\u{301}]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("e", nil), + ("\u{301}", nil) + ) + firstMatchTests( + #"[[[e\u{301}]]]"#, + (eDecomposed, "e"), + (eComposed, nil), + ("e", "e"), + ("\u{301}", "\u{301}"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[πŸ‘¨\u{200D}πŸ‘©\u{200D}πŸ‘§\u{200D}πŸ‘¦]"#, + ("πŸ‘¨", nil), + ("πŸ‘©", nil), + ("πŸ‘§", nil), + ("πŸ‘¦", nil), + ("\u{200D}", nil), + ("πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦", "πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦") + ) + firstMatchTests( + #"[πŸ‘¨\u{200D}πŸ‘©\u{200D}πŸ‘§\u{200D}πŸ‘¦]"#, + ("πŸ‘¨", "πŸ‘¨"), + ("πŸ‘©", "πŸ‘©"), + ("πŸ‘§", "πŸ‘§"), + ("πŸ‘¦", "πŸ‘¦"), + ("\u{200D}", "\u{200D}"), + ("πŸ‘¨β€πŸ‘©β€πŸ‘§β€πŸ‘¦", "πŸ‘¨"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + firstMatchTests( + #"(?x) [ e \u{315} \u{301} \u{35C} ]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{301}", nil), + ("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"), + ("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"), + ("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}") + ) + + // We don't coalesce across character classes. 
+ firstMatchTests( + #"e[\u{315}\u{301}\u{35C}]"#, + ("e", nil), + ("e\u{315}", nil), + ("e\u{315}\u{301}", nil), + ("e\u{301}\u{315}\u{35C}", nil) + ) + firstMatchTests( + #"[e[\u{301}]]"#, + ("e", "e"), + ("\u{301}", "\u{301}"), + ("e\u{301}", nil) + ) + + firstMatchTests( + #"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#, + ("a", "a"), + ("a\u{301}", "a\u{301}"), + ("\u{E1}", "\u{E1}"), + ("\u{E2}", nil), + ("z", "z"), + ("e", "e"), + (eDecomposed, eDecomposed), + (eComposed, eComposed), + ("\u{302}", "\u{302}"), + ("1", "1"), + ("2", nil), + ("3", "3"), + ("4", "4"), + ("5", "5"), + ("6", nil), + ("7", nil), + ("8", nil), + ("9", "9") + ) + firstMatchTests( + #"[ab-df-hik-lm]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", "d"), + ("e", nil), + ("f", "f"), + ("g", "g"), + ("h", "h"), + ("i", "i"), + ("j", nil), + ("k", "k"), + ("l", "l"), + ("m", "m") + ) + firstMatchTests( + #"[a-ce-fh-j]"#, + ("a", "a"), + ("b", "b"), + ("c", "c"), + ("d", nil), + ("e", "e"), + ("f", "f"), + ("g", nil), + ("h", "h"), + ("i", "i"), + ("j", "j") + ) + + + // These can't compile in grapheme semantic mode, but make sure they work in + // scalar semantic mode. 
+ firstMatchTests( + #"[a\u{315}\u{301}-\u{302}]"#, + ("a", "a"), + ("\u{315}", "\u{315}"), + ("\u{301}", "\u{301}"), + ("\u{302}", "\u{302}"), + ("\u{303}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[\u{73}\u{323}\u{307}-\u{1E00}]"#, + ("\u{73}", "\u{73}"), + ("\u{323}", "\u{323}"), + ("\u{307}", "\u{307}"), + ("\u{400}", "\u{400}"), + ("\u{500}", "\u{500}"), + ("\u{1E00}", "\u{1E00}"), + ("\u{1E01}", nil), + ("\u{1E69}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[a\u{302}-βœ…]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", "\u{302}"), + ("\u{2705}", "\u{2705}"), + ("βœ…", "βœ…"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[a\u{302}-βœ…]"#, + ("a", "a"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "A"), + ("E\u{301}", nil), + ("a\u{301}", "a"), + ("\u{E1}", nil), + ("a\u{302}", "a"), + ("\u{E2}", nil), + ("\u{E3}", nil), + ("\u{EF}", nil), + ("e\u{301}", nil), + ("e\u{302}", "\u{302}"), + ("\u{2705}", "\u{2705}"), + ("βœ…", "βœ…"), + ("\u{376}", "\u{376}"), + ("\u{850}", "\u{850}"), + ("a\u{302}\u{315}", "a"), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + ("E\u{301}", "\u{301}"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + firstMatchTests( + #"(?i)[e\u{301}-\u{302}]"#, + ("a", nil), + ("e", "e"), + ("\u{302}", "\u{302}"), + ("A\u{302}", "\u{302}"), + 
("E\u{301}", "E"), + ("\u{C8}", nil), + ("\u{C9}", nil), + ("\u{CA}", nil), + ("\u{CB}", nil), + ("a\u{301}", "\u{301}"), + ("a\u{302}", "\u{302}"), + ("e\u{301}", "e"), + ("e\u{302}", "e"), + ("\u{E1}", nil), + ("\u{E2}", nil), + ("\u{E9}", nil), + ("\u{EA}", nil), + ("\u{EF}", nil), + semanticLevel: .unicodeScalar + ) + + // Set operation scalar coalescing. + firstMatchTests( + #"[e\u{301}&&e\u{301}e\u{302}]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", "e\u{301}"), + ("e\u{302}", nil)) + firstMatchTests( + #"[e\u{301}~~[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("e\u{301}", nil), + ("e\u{302}", "e\u{302}")) + firstMatchTests( + #"[e\u{301}[e\u{303}]--[[e\u{301}]e\u{302}]]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) + + firstMatchTests( + #"(?x) [ e \u{301} [ e \u{303} ] -- [ [ e \u{301} ] e \u{302} ] ]"#, + ("e", nil), + ("\u{301}", nil), + ("\u{302}", nil), + ("\u{303}", nil), + ("e\u{301}", nil), + ("e\u{302}", nil), + ("e\u{303}", "e\u{303}")) firstMatchTest("[-]", input: "123-abcxyz", match: "-") @@ -709,6 +1154,15 @@ extension RegexTests { } firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}") + firstMatchTest(#"[12]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[1-2]"#, input: "1️⃣", match: nil) + firstMatchTest(#"[\d]"#, input: "1️⃣", match: "1️⃣") + firstMatchTest(#"(?P)[\d]"#, input: "1️⃣", match: nil) + firstMatchTest("[0-2&&1-3]", input: "1️⃣", match: nil) + firstMatchTest("[1-2e\u{301}]", input: "1️⃣", match: nil) + + firstMatchTest(#"[\u{3A9}-\u{3A9}]"#, input: "\u{3A9}", match: "\u{3A9}") + // Currently not supported in the matching engine. 
for c: UnicodeScalar in ["a", "b", "c"] { firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)", @@ -762,6 +1216,35 @@ extension RegexTests { firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: "abc", syntax: .experimental) firstMatchTest(#"["abc"]+"#, input: #""abc""#, match: #""abc""#) + + for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + // Case sensitivity and ranges. + for ch in "abcD" { + firstMatchTest("[a-cD]", input: String(ch), match: String(ch)) + } + for ch in "ABCd" { + firstMatchTest("[a-cD]", input: String(ch), match: nil) + } + for ch in "abcABCdD" { + let input = String(ch) + firstMatchTest( + "(?i)[a-cd]", input: input, match: input, semanticLevel: semantics) + firstMatchTest( + "(?i)[A-CD]", input: input, match: input, semanticLevel: semantics) + } + for ch in "XYZ[\\]^_`abcd" { + let input = String(ch) + firstMatchTest( + "[X-cd]", input: input, match: input, semanticLevel: semantics) + } + for ch in "XYZ[\\]^_`abcxyzABCdD" { + let input = String(ch) + firstMatchTest( + "(?i)[X-cd]", input: input, match: input, semanticLevel: semantics) + firstMatchTest( + "(?i)[X-cD]", input: input, match: input, semanticLevel: semantics) + } + } } func testCharacterProperties() { @@ -974,6 +1457,9 @@ extension RegexTests { } func testMatchAnchors() throws { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // MARK: Anchors firstMatchTests( #"^\d+"#, @@ -1022,8 +1508,6 @@ extension RegexTests { (" 123\n456\n", nil), ("123 456", "456")) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) firstMatchTests( #"\d+\b"#, ("123", "123"), @@ -1041,7 +1525,6 @@ extension RegexTests { ("123", "23"), (" 123", "23"), ("123 456", "23")) -#endif // TODO: \G and \K do { @@ -1054,8 +1537,8 @@ extension RegexTests { // TODO: Oniguruma \y and \Y firstMatchTests( #"\u{65}"#, // Scalar 'e' is present in both - ("Cafe\u{301}", nil), // but scalar mode requires boundary at end of match - xfail: true) + ("Cafe\u{301}", nil)) // but scalar mode requires boundary at end of match + firstMatchTests( #"\u{65}"#, // Scalar 'e' is present in both ("Sol Cafe", "e")) // standalone is okay @@ -1072,9 +1555,10 @@ extension RegexTests { ("Sol Cafe", nil), xfail: true) } - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) func testLevel2WordBoundaries() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Level 2 Word Boundaries firstMatchTest(#"\b😊\b"#, input: "πŸ”₯πŸ˜ŠπŸ‘", match: "😊") firstMatchTest(#"\bπŸ‘¨πŸ½\b"#, input: "πŸ‘©πŸ»πŸ‘ΆπŸΏπŸ‘¨πŸ½πŸ§‘πŸΎπŸ‘©πŸΌ", match: "πŸ‘¨πŸ½") @@ -1090,9 +1574,11 @@ extension RegexTests { firstMatchTest(#"can\B\'\Bt"#, input: "I can't do that.", match: "can't") firstMatchTest(#"\bΓ·\b"#, input: "3 Γ· 3 = 1", match: "Γ·") } -#endif - + func testMatchGroups() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // MARK: Groups // Named captures @@ -1316,6 +1802,9 @@ extension RegexTests { } func testMatchExamples() { + // Must have new stdlib for character class ranges and word boundaries. 
+ guard ensureNewStdlib() else { return } + // Backreferences matchTest( #"(sens|respons)e and \1ibility"#, @@ -1365,8 +1854,6 @@ extension RegexTests { xfail: true ) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) // HTML tags matchTest( #"<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>.*?"#, @@ -1384,7 +1871,6 @@ extension RegexTests { ("pass me the the kettle", ["the"]), ("this doesn't have any", nil) ) -#endif // Floats flatCaptureTest( @@ -1400,8 +1886,79 @@ extension RegexTests { firstMatchTest(#".+"#, input: "a\nb", match: "a") firstMatchTest(#"(?s:.+)"#, input: "a\nb", match: "a\nb") } + + func testMatchNewlines() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + + for semantics in [RegexSemanticLevel.unicodeScalar, .graphemeCluster] { + firstMatchTest( + #"\r\n"#, input: "\r\n", match: "\r\n", + semanticLevel: semantics + ) + firstMatchTest( + #"\r\n"#, input: "\n", match: nil, semanticLevel: semantics) + firstMatchTest( + #"\r\n"#, input: "\r", match: nil, semanticLevel: semantics) + + // \r\n is not treated as ASCII. + firstMatchTest( + #"^\p{ASCII}$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^\r$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\r]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^\n$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\n]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + firstMatchTest( + #"^[\u{0}-\u{7F}]$"#, input: "\r\n", match: nil, + semanticLevel: semantics + ) + + let scalarSemantics = semantics == .unicodeScalar + firstMatchTest( + #"\p{ASCII}"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"\r"#, input: "\r\n", match: scalarSemantics ? 
"\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\r]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"\n"#, input: "\r\n", match: scalarSemantics ? "\n" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\n]"#, input: "\r\n", match: scalarSemantics ? "\n" : nil, + semanticLevel: semantics + ) + firstMatchTest( + #"[\u{0}-\u{7F}]"#, input: "\r\n", match: scalarSemantics ? "\r" : nil, + semanticLevel: semantics + ) + } + } func testCaseSensitivity() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + matchTest( #"c..e"#, ("cafe", true), @@ -1464,6 +2021,9 @@ extension RegexTests { } func testASCIIClasses() { + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + // 'D' ASCII-only digits matchTest( #"\d+"#, @@ -1492,8 +2052,6 @@ extension RegexTests { ("aeiou", true), ("Γ₯e\u{301}ïôú", false)) - // FIXME: Keep this until _wordIndex and friends are -#if os(Linux) matchTest( #"abcd\b.+"#, ("abcd ef", true), @@ -1509,7 +2067,6 @@ extension RegexTests { ("abcd ef", true), ("abcdef", false), ("abcdΓ©f", false)) -#endif // 'S' ASCII-only spaces matchTest( @@ -1635,6 +2192,9 @@ extension RegexTests { var eComposed: String { "Γ©" } var eDecomposed: String { "e\u{301}" } + var eComposedUpper: String { "Γ‰" } + var eDecomposedUpper: String { "E\u{301}" } + func testIndividualScalars() { // Expectation: A standalone Unicode scalar value in a regex literal // can match either that specific scalar value or participate in matching @@ -1647,19 +2207,15 @@ extension RegexTests { firstMatchTest(#"\u{65 301}$"#, input: eComposed, match: eComposed) // FIXME: Implicit \y at end of match - firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil, - xfail: true) + firstMatchTest(#"\u{65}"#, input: eDecomposed, match: nil) firstMatchTest(#"\u{65}$"#, input: eDecomposed, match: 
nil) - // FIXME: \y is unsupported - firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil, - xfail: true) + firstMatchTest(#"\u{65}\y"#, input: eDecomposed, match: nil) // FIXME: Unicode scalars are only matched at the start of a grapheme cluster firstMatchTest(#"\u{301}"#, input: eDecomposed, match: "\u{301}", xfail: true) - // FIXME: \y is unsupported - firstMatchTest(#"\y\u{301}"#, input: eDecomposed, match: nil, - xfail: true) + + firstMatchTest(#"\y\u{301}"#, input: eDecomposed, match: nil) } func testCanonicalEquivalence() throws { @@ -1681,6 +2237,16 @@ extension RegexTests { #"e$"#, (eComposed, false), (eDecomposed, false)) + + matchTest( + #"\u{65 301}"#, + (eComposed, true), + (eDecomposed, true)) + + matchTest( + #"(?x) \u{65} \u{301}"#, + (eComposed, true), + (eDecomposed, true)) } func testCanonicalEquivalenceCharacterClass() throws { @@ -1717,41 +2283,70 @@ extension RegexTests { // \s firstMatchTest(#"\s"#, input: " ", match: " ") // FIXME: \s shouldn't match a number composed with a non-number character - firstMatchTest(#"\s\u{305}"#, input: " ", match: nil, - xfail: true) + firstMatchTest(#"\s\u{305}"#, input: " ", match: nil) // \p{Whitespace} firstMatchTest(#"\s"#, input: " ", match: " ") - // FIXME: \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character - firstMatchTest(#"\s\u{305}"#, input: " ", match: nil, - xfail: true) + // \p{Whitespace} shouldn't match whitespace composed with a non-whitespace character + firstMatchTest(#"\s\u{305}"#, input: " ", match: nil) } func testCanonicalEquivalenceCustomCharacterClass() throws { - // Expectation: Concatenations with custom character classes should be able - // to match within a grapheme cluster. That is, a regex should be able to - // match the scalar values that comprise a grapheme cluster in separate, - // or repeated, custom character classes. - + // Expectation: Custom character class matches do not cross grapheme + // character boundaries by default. 
When matching with Unicode scalar + // semantics, grapheme cluster boundaries are ignored, so matching + // sequences of custom character classes can succeed. + + // Must have new stdlib for character class ranges and word boundaries. + guard ensureNewStdlib() else { return } + matchTest( #"[Ñéíóú]$"#, (eComposed, true), (eDecomposed, true)) - // FIXME: Custom char classes don't use canonical equivalence with composed characters - firstMatchTest(#"e[\u{301}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eComposed, match: eComposed, - xfail: true) + for input in [eDecomposed, eComposed] { + // Unicode scalar semantics means that only the decomposed version can + // match here. + let match = input.unicodeScalars.count == 2 ? input : nil + firstMatchTest( + #"e[\u{301}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"e[\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[e][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[e-e][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + firstMatchTest( + #"[a-z][\u{300}-\u{320}]$"#, input: input, match: match, + semanticLevel: .unicodeScalar) + } + for input in [eComposed, eDecomposed] { + // Grapheme cluster semantics means that we can't match the 'e' separately + // from the accent. + firstMatchTest(#"e[\u{301}]$"#, input: input, match: nil) + firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[e][\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[e-e][\u{300}-\u{320}]$"#, input: input, match: nil) + firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: input, match: nil) + + // A range that covers Γ© (U+E9). Inputs are mapped to NFC, so match. 
+ firstMatchTest(#"[\u{E8}-\u{EA}]"#, input: input, match: input) + } - // FIXME: Custom char classes don't match decomposed characters - firstMatchTest(#"e[\u{301}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"e[\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) - firstMatchTest(#"[a-z][\u{300}-\u{320}]$"#, input: eDecomposed, match: eDecomposed, - xfail: true) + // A range that covers Γ‰ (U+C9). Inputs are mapped to NFC, so match. + for input in [eComposedUpper, eDecomposedUpper] { + firstMatchTest(#"[\u{C8}-\u{CA}]"#, input: input, match: input) + firstMatchTest(#"[\u{C9}-\u{C9}]"#, input: input, match: input) + } + // Case insensitive matching of Γ‰ (U+C9). + for input in [eComposed, eDecomposed, eComposedUpper, eDecomposedUpper] { + firstMatchTest(#"(?i)[\u{C8}-\u{CA}]"#, input: input, match: input) + firstMatchTest(#"(?i)[\u{C9}-\u{C9}]"#, input: input, match: input) + } let flag = "πŸ‡°πŸ‡·" firstMatchTest(#"πŸ‡°πŸ‡·"#, input: flag, match: flag) @@ -1760,27 +2355,33 @@ extension RegexTests { firstMatchTest(#"\u{1F1F0 1F1F7}"#, input: flag, match: flag) // First Unicode scalar followed by CCC of regional indicators - firstMatchTest(#"\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: flag, - xfail: true) - - // FIXME: CCC of Regional Indicator doesn't match with both parts of a flag character + firstMatchTest( + #"^\u{1F1F0}[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) + // A CCC of regional indicators followed by the second Unicode scalar + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) // A CCC of regional indicators x 2 - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]{2}"#, input: flag, match: flag, - xfail: true) + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]{2}$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) + // A CCC of N regional indicators + firstMatchTest( + 
#"^[\u{1F1E6}-\u{1F1FF}]+$"#, input: flag, match: flag, + semanticLevel: .unicodeScalar + ) - // FIXME: A single CCC of regional indicators matches the whole flag character - // A CCC of regional indicators followed by the second Unicode scalar - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]\u{1F1F7}"#, input: flag, match: flag, - xfail: true) // A single CCC of regional indicators - firstMatchTest(#"[\u{1F1E6}-\u{1F1FF}]"#, input: flag, match: nil, - xfail: true) - - // A single CCC of actual flag emojis / combined regional indicators - firstMatchTest(#"[πŸ‡¦πŸ‡«-πŸ‡ΏπŸ‡Ό]"#, input: flag, match: flag) - // This succeeds (correctly) because \u{1F1F0} is lexicographically - // within the CCC range - firstMatchTest(#"[πŸ‡¦πŸ‡«-πŸ‡ΏπŸ‡Ό]"#, input: "\u{1F1F0}abc", match: "\u{1F1F0}") + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil) + firstMatchTest( + #"^[\u{1F1E6}-\u{1F1FF}]$"#, input: flag, match: nil, + semanticLevel: .unicodeScalar + ) } func testAnyChar() throws { @@ -1853,6 +2454,19 @@ extension RegexTests { // TODO: Add test for grapheme boundaries at start/end of match + // Testing the matchScalar optimization for ascii quoted literals and characters + func testScalarOptimization() throws { + // check that we are correctly doing the boundary check after matchScalar + firstMatchTest("a", input: "a\u{301}", match: nil) + firstMatchTest("aa", input: "aa\u{301}", match: nil) + + firstMatchTest("a", input: "a\u{301}", match: "a", semanticLevel: .unicodeScalar) + firstMatchTest("aa", input: "aa\u{301}", match: "aa", semanticLevel: .unicodeScalar) + + // case insensitive tests + firstMatchTest(#"(?i)abc\u{301}d"#, input: "AbC\u{301}d", match: "AbC\u{301}d", semanticLevel: .unicodeScalar) + } + func testCase() { let regex = try! 
Regex(#".\N{SPARKLING HEART}."#) let input = "🧟‍♀️💖🧠 or 🧠💖☕️" @@ -1893,5 +2507,31 @@ extension RegexTests { XCTAssertEqual(matches.count, 3) } } -} + func expectCompletion(regex: String, in target: String) { + let expectation = XCTestExpectation(description: "Run the given regex to completion") + Task.init { + let r = try! Regex(regex) + let val = target.matches(of: r).isEmpty + expectation.fulfill() + return val + } + wait(for: [expectation], timeout: 3.0) + } + + func testQuantificationForwardProgress() { + expectCompletion(regex: #"(?:(?=a)){1,}"#, in: "aa") + expectCompletion(regex: #"(?:\b)+"#, in: "aa") + expectCompletion(regex: #"(?:(?#comment))+"#, in: "aa") + expectCompletion(regex: #"(?:|)+"#, in: "aa") + expectCompletion(regex: #"(?:\w|)+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?i-i:))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?#comment))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?#comment)(?i-i:))+"#, in: "aa") + expectCompletion(regex: #"(?:\w|(?i))+"#, in: "aa") + expectCompletion(regex: #"(a*)*"#, in: "aa") + expectCompletion(regex: #"(a?)*"#, in: "aa") + expectCompletion(regex: #"(a{,4})*"#, in: "aa") + expectCompletion(regex: #"((|)+)*"#, in: "aa") + } +} diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 3c43f27af..84ce361f3 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -359,14 +359,14 @@ extension RegexTests { parseTest( "(.)*(.*)", concat( - zeroOrMore(of: capture(atom(.any))), - capture(zeroOrMore(of: atom(.any)))), + zeroOrMore(of: capture(atom(.dot))), + capture(zeroOrMore(of: atom(.dot)))), captures: [.opt, .cap]) parseTest( "((.))*((.)?)", concat( - zeroOrMore(of: capture(capture(atom(.any)))), - capture(zeroOrOne(of: capture(atom(.any))))), + zeroOrMore(of: capture(capture(atom(.dot)))), + capture(zeroOrOne(of: capture(atom(.dot))))), captures: [.opt, .opt, .cap, .opt]) parseTest( #"abc\d"#, @@ -374,10 +374,21 @@
extension RegexTests { // MARK: Allowed combining characters - parseTest("e\u{301}", "e\u{301}") parseTest("1\u{358}", "1\u{358}") parseTest(#"\ \#u{361}"#, " \u{361}") + parseTest("e\u{301}", "e\u{301}") + parseTest("[e\u{301}]", charClass("e\u{301}")) + parseTest("\u{E9}", "e\u{301}") + parseTest("[\u{E9}]", charClass("e\u{301}")) + + parseTest( + "\\e\u{301}", "e\u{301}", throwsError: .invalidEscape("e\u{301}")) + parseTest( + "[\\e\u{301}]", charClass("e\u{301}"), + throwsError: .invalidEscape("e\u{301}") + ) + // MARK: Alternations parseTest( @@ -479,7 +490,7 @@ extension RegexTests { parseTest(#"abc\d"#, concat("a", "b", "c", escaped(.decimalDigit))) - // FIXME: '\N' should be emitted through 'emitAny', not through the + // FIXME: '\N' should be emitted through 'emitDot', not through the // _CharacterClassModel model. parseTest(#"\N"#, escaped(.notNewline), unsupported: true) @@ -2885,11 +2896,41 @@ extension RegexTests { diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence")) diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence")) + diagnosticTest(#"|([🇦🇫-🇿🇼])?"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"|([👨‍👩‍👦-👩‍👩‍👧‍👧])?"#, .invalidCharacterClassRangeOperand) + + // Not single-scalar NFC. + diagnosticTest("[e\u{301}-e\u{302}]", .invalidCharacterClassRangeOperand) + + // These scalar values expand under NFC.
+ let nfcExpandingScalars: [UInt32] = [ + 0x344, 0x958, 0x959, 0x95A, 0x95B, 0x95C, 0x95D, 0x95E, 0x95F, 0x9DC, + 0x9DD, 0x9DF, 0xA33, 0xA36, 0xA59, 0xA5A, 0xA5B, 0xA5E, 0xB5C, 0xB5D, + 0xF43, 0xF4D, 0xF52, 0xF57, 0xF5C, 0xF69, 0xF73, 0xF75, 0xF76, 0xF78, + 0xF81, 0xF93, 0xF9D, 0xFA2, 0xFA7, 0xFAC, 0xFB9, 0x2ADC, 0xFB1D, 0xFB1F, + 0xFB2A, 0xFB2B, 0xFB2C, 0xFB2D, 0xFB2E, 0xFB2F, 0xFB30, 0xFB31, 0xFB32, + 0xFB33, 0xFB34, 0xFB35, 0xFB36, 0xFB38, 0xFB39, 0xFB3A, 0xFB3B, 0xFB3C, + 0xFB3E, 0xFB40, 0xFB41, 0xFB43, 0xFB44, 0xFB46, 0xFB47, 0xFB48, 0xFB49, + 0xFB4A, 0xFB4B, 0xFB4C, 0xFB4D, 0xFB4E, 0x1D15E, 0x1D15F, 0x1D160, + 0x1D161, 0x1D162, 0x1D163, 0x1D164, 0x1D1BB, 0x1D1BC, 0x1D1BD, 0x1D1BE, + 0x1D1BF, 0x1D1C0 + ] + for scalar in nfcExpandingScalars { + let hex = String(scalar, radix: 16) + diagnosticTest( + #"[\u{\#(hex)}-\u{\#(hex)}]"#, .invalidCharacterClassRangeOperand) + } + + // The NFC form of U+2126 is U+3A9. + diagnosticTest(#"[\u{2126}-\u{2126}]"#, .invalidCharacterClassRangeOperand) + diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A")) diagnosticTest(#"[c-b]"#, .invalidCharacterRange(from: "c", to: "b")) diagnosticTest(#"[\u{66}-\u{65}]"#, .invalidCharacterRange(from: "\u{66}", to: "\u{65}")) + diagnosticTest(#"[e\u{301}-e\u{302}]"#, .invalidCharacterRange(from: "\u{301}", to: "e")) + diagnosticTest("(?x)[(?#)]", .expected("]")) diagnosticTest("(?x)[(?#abc)]", .expected("]")) diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 97ba3e333..e925d255c 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -68,7 +68,38 @@ extension RenderDSLTests { } """) } - + + func testDot() throws { + try testConversion(#".+"#, #""" + Regex { + OneOrMore { + /./ + } + } + """#) + try testConversion(#"a.c"#, #""" + Regex { + "a" + /./ + "c" + } + """#) + } + + func testAnchor() throws { + try 
testConversion(#"^(?:a|b|c)$"#, #""" + Regex { + /^/ + ChoiceOf { + "a" + "b" + "c" + } + /$/ + } + """#) + } + func testOptions() throws { try XCTExpectFailure("Options like '(?i)' aren't converted") { try testConversion(#"(?i)abc"#, """ @@ -117,4 +148,95 @@ extension RenderDSLTests { } """#) } + + func testScalar() throws { + try testConversion(#"\u{B4}"#, #""" + Regex { + "\u{B4}" + } + """#) + try testConversion(#"\u{301}"#, #""" + Regex { + "\u{301}" + } + """#) + try testConversion(#"[\u{301}]"#, #""" + Regex { + One(.anyOf("\u{301}")) + } + """#) + try testConversion(#"[abc\u{301}]"#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + + try testConversion(#"a\u{301}"#, #""" + Regex { + "a\u{301}" + } + """#) + + try testConversion(#"(?x) a \u{301}"#, #""" + Regex { + "a\u{301}" + } + """#) + + try testConversion(#"(?x) [ a b c \u{301} ] "#, #""" + Regex { + One(.anyOf("abc\u{301}")) + } + """#) + + try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #""" + Regex { + "👨\u{200D}👨\u{200D}👧\u{200D}👦" + } + """#) + + try testConversion(#"(👨\u{200D}👨)\u{200D}👧\u{200D}👦"#, #""" + Regex { + Capture { + "👨\u{200D}👨" + } + "\u{200D}👧\u{200D}👦" + } + """#) + + // We preserve the structure of non-capturing groups. + try testConversion(#"abcd(?:e\u{301}\d)"#, #""" + Regex { + "abcd" + Regex { + "e\u{301}" + One(.digit) + } + } + """#) + + try testConversion(#"\u{A B C}"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + } + """#) + + // TODO: We might want to consider preserving scalar sequences in the DSL, + // and allowing them to merge with other concatenations. + try testConversion(#"\u{A B C}\u{d}efg"#, #""" + Regex { + "\u{A}\u{B}\u{C}" + "\u{D}efg" + } + """#) + + // FIXME: We don't actually have a way of specifying in the DSL that we + // shouldn't join these together, should we print them as regex instead?
+ try testConversion(#"a(?:\u{301})"#, #""" + Regex { + "a" + "\u{301}" + } + """#) + } } diff --git a/Tests/RegexTests/UTS18Tests.swift b/Tests/RegexTests/UTS18Tests.swift index fa8a1729d..11479bfb6 100644 --- a/Tests/RegexTests/UTS18Tests.swift +++ b/Tests/RegexTests/UTS18Tests.swift @@ -21,6 +21,7 @@ import XCTest @testable // for internal `matches(of:)` import _StringProcessing +import TestSupport extension UnicodeScalar { var value4Digits: String { @@ -222,7 +223,7 @@ extension UTS18Tests { // - Nonspacing marks are never divided from their base characters, and // otherwise ignored in locating boundaries. func testSimpleWordBoundaries() { - let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.unicodeLevel1) + let simpleWordRegex = regex(#".+?\b"#).wordBoundaryKind(.simple) expectFirstMatch(input, simpleWordRegex, input[pos: ..<11]) expectFirstMatch("don't", simpleWordRegex, "don") expectFirstMatch("Cafe\u{301}", simpleWordRegex, "Café") @@ -316,6 +317,9 @@ extension UTS18Tests { // surrogate followed by a trailing surrogate shall be handled as a single // code point in matching. func testSupplementaryCodePoints() { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + XCTAssertTrue("👍".contains(regex(#"\u{1F44D}"#))) XCTAssertTrue("👍".contains(regex(#"[\u{1F440}-\u{1F44F}]"#))) XCTAssertTrue("👍👎".contains(regex(#"^[\u{1F440}-\u{1F44F}]+$"#))) @@ -388,6 +392,9 @@ extension UTS18Tests { } func testCharacterClassesWithStrings() { + // Must have new stdlib for character class ranges. + guard ensureNewStdlib() else { return } + let regex = regex(#"[a-z🧐🇧🇪🇧🇫🇧🇬]"#) XCTAssertEqual("🧐", "🧐".wholeMatch(of: regex)?.0) XCTAssertEqual("🇧🇫", "🇧🇫".wholeMatch(of: regex)?.0)